Optimization of determinant calculation function - c++

While searching for the best algorithm, I found there is a trade-off between implementation complexity and a large constant factor on the one hand, and runtime complexity on the other. I chose an LU-decomposition-based algorithm because it is quite simple to implement and has good enough performance.
#include <valarray>
#include <vector>
#include <utility>
#include <cmath>
#include <cstddef>
#include <cassert>
/// Determinant calculator for small dense square matrices, based on an
/// in-place LU decomposition with partial (row) pivoting.
///
/// @tparam value_type  scalar type; must support +, -, *, /, unary minus,
///                     operator< and an `abs` overload (found via ADL or std::abs).
template< typename value_type >
struct math
{

    using size_type = std::size_t;

    /// Number of rows (and columns) of the stored matrix.
    size_type const dimension_;
    /// Pivot threshold: a pivot whose magnitude is not greater than eps is
    /// treated as zero. Held BY REFERENCE - the referenced object must outlive
    /// this instance (do not pass a temporary to the constructor).
    value_type const & eps;
    value_type const zero = value_type(0);
    value_type const one = value_type(1);

private :

    using vector = std::valarray< value_type >;
    using matrix = std::vector< vector >;

    /// Working storage, dimension_ x dimension_, allocated once in the constructor.
    matrix matrix_;

public :

    /// @param _dimension  matrix size; must be greater than 1
    /// @param _eps        non-negative pivot threshold (see `eps`)
    math(size_type const _dimension,
         value_type const & _eps)
        : dimension_(_dimension)
        , eps(_eps)
        , matrix_(dimension_, vector(dimension_))
    {
        assert(1 < dimension_);
        assert(!(eps < zero));
    }

    /// Copies the elements of a row-iterable container of row-iterable
    /// containers into the preallocated storage. The source must not have
    /// more than dimension_ rows or columns.
    template< typename rhs = matrix >
    math &
    operator = (rhs const & _matrix)
    {
        size_type r = 0;
        for (auto const & row_ : _matrix) {
            assert(r < dimension_);
            vector & dst_ = matrix_[r];
            size_type c = 0;
            for (auto const & v : row_) {
                assert(c < dimension_);
                dst_[c] = v;
                ++c;
            }
            ++r;
        }
        return *this;
    }

    /// Computes the determinant of the leading _dimension x _dimension block
    /// of _matrix.
    ///
    /// The block is overwritten: on return it holds the multipliers of the
    /// unit lower triangular factor below the diagonal and the upper
    /// triangular factor on and above it (rows may have been swapped).
    ///
    /// @return the determinant, or `zero` as soon as the best available pivot
    ///         has a magnitude not greater than eps (matrix treated as singular).
    value_type
    det(matrix & _matrix,
        size_type const _dimension)
    { // calculates lower unit triangular matrix and upper triangular
        assert(0 < _dimension);
        assert(!(_matrix.size() < _dimension));
        using std::abs;
        value_type det_ = one;
        for (size_type i = 0; i < _dimension; ++i) {
            // Partial pivoting: pick the row with the largest magnitude in column i.
            size_type pivot = i;
            value_type max_ = abs(_matrix[i][i]);
            for (size_type p = 1 + i; p < _dimension; ++p) {
                value_type y_ = abs(_matrix[p][i]);
                if (max_ < y_) {
                    max_ = std::move(y_);
                    pivot = p;
                }
            }
            if (!(eps < max_)) { // regular?
                return zero; // singular
            }
            vector & ri_ = _matrix[i];
            if (pivot != i) {
                ri_.swap(_matrix[pivot]);
                det_ = -det_; // each permutation flips sign of det
            }
            value_type const & dia_ = ri_[i];
            det_ *= dia_; // det is multiple of diagonal elements
            // Fused elimination step: scale the multiplier and subtract the
            // scaled pivot row in a single pass over each remaining row. No
            // scratch matrix is needed and the inner loop is a plain
            // "y -= a * x" over contiguous storage.
            for (size_type a = 1 + i; a < _dimension; ++a) {
                vector & ra_ = _matrix[a];
                value_type const ai_ = ra_[i] / dia_;
                ra_[i] = ai_; // keep the L multiplier in place
                for (size_type b = 1 + i; b < _dimension; ++b) {
                    ra_[b] -= ai_ * ri_[b];
                }
            }
        }
        return det_;
    }

    /// Determinant of the leading _dimension x _dimension block of the stored
    /// matrix. NOTE: the stored matrix is decomposed in place, so it must be
    /// re-assigned before det() is called again.
    value_type
    det(size_type const _dimension)
    {
        assert(!(dimension_ < _dimension));
        return det(matrix_, _dimension);
    }

    /// Determinant of the whole stored matrix (same in-place caveat as above).
    value_type
    det()
    {
        return det(dimension_);
    }

};
// main.cpp
#include <iostream>
#include <cstdlib>
using value_type = double;
value_type const eps = std::numeric_limits< value_type >::epsilon();
std::size_t const dimension_ = 3;
math< value_type > m(dimension_, eps);
m = { // example from https://en.wikipedia.org/wiki/Determinant#Laplace.27s_formula_and_the_adjugate_matrix
{-2.0, 2.0, -3.0},
{-1.0, 1.0, 3.0},
{ 2.0, 0.0, -1.0}
std::cout << m.det() << std::endl; // 18
The det() function is the hottest function in the algorithm that uses it. I am sure det() is not as fast as it could be, because runtime performance comparisons (using google-pprof) against a reference implementation of the whole algorithm show a disproportionate amount of time spent in det().
How to improve performance of det() function? What are evident optimizations to apply immediately? Should I change the indexing and memory access order or something else? Container types? Prefetching?
Typical value of dimension_ is in the range of 3 to 10 (but can be 100, if value_type is mpfr or something else).

Isn't your (snippet from det())
for (size_type a = 1 + i; a < _dimension; ++a) {
vector & a_ = minor_[a - 1];
value_type const & ai_ = _matrix[a][i];
for (size_type b = 1 + i; b < _dimension; ++b) {
a_[b - 1] = ai_ * ri_[b];
for (size_type a = 1 + i; a < _dimension; ++a) {
vector const & a_ = minor_[a - 1];
vector & ra_ = _matrix[a];
for (size_type b = 1 + i; b < _dimension; ++b) {
ra_[b] -= a_[b - 1];
doing the same as
for (size_type a = 1 + i; a < _dimension; ++a) {
vector & ra_ = _matrix[a];
value_type ai_ = ra_[i];
for (size_type b = 1 + i; b < _dimension; ++b) {
ra_[b] -= ai_ * ri_[b];
without any need for minor_? Moreover, now the inner loop can easily be vectorised.


How to optimize my C++ OpenMp Matrix Multiplication code

I have written a C++ OpenMp Matrix Multiplication code that multiplies two 1000x1000 matrices.
So far I have gotten a 0.700 sec execution time using OpenMP, but I want to see if there are other ways I can make it faster using OpenMP.
I appreciate any advice or tips you can give me.
Here is my code:
#include <iostream>
#include <time.h>
#include <omp.h>
using namespace std;
void Multiply()
//initialize matrices with random numbers
int aMatrix[1000][1000], i, j;
for( i = 0; i < 1000; ++i)
{for( j = 0; j < 1000; ++j)
{aMatrix[i][j] = rand();}
int bMatrix[1000][1000], i1, j2;
for( i1 = 0; i1 < 1000; ++i1)
{for( j2 = 0; j2 < 1000; ++j2)
{bMatrix[i1][j2] = rand();}
//Result Matrix
int product[1000][1000] = {0};
for (int row = 0; row < 1000; row++) {
for (int col = 0; col < 1000; col++) {
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 0; inner < 1000; inner++) {
product[row][col] += aMatrix[row][inner] * bMatrix[inner][col];
int main() {
time_t begin, end;
time_t elapsed = end - begin;
cout << ("Time measured: %ld seconds.\n", elapsed);
return 0;
Following things can be done for speedup:
Using OpenMP for parallelizing external loop, like you did (and like I also did in my following code). Or alternatively using std::async for multi-threading like it was used in another answer.
Transpose B matrix, this will help to increase L1 cache hits, because you will read from sequential memory each B column (or row in transposed variant).
Use vectorized SIMD instructions, this will allow to do several multiplications (and additions) within one CPU cycle. Often compilers do auto-vectorization of your loops well enough through SIMD instructions without your help, but I did it explicitly in my code.
Run several independent SIMD instructions within loop. This will help to occupy whole CPU pipeline of SIMD. I did so in my code by using four SIMD registers r0, r1, r2, r3. In compilers this is usually called loop unrolling.
Align your matrix starting address on 64-bytes boundary. This will help SIMD instructions to do fast aligned read/write.
Align starting address of each matrix row on 64-bytes boundary. I did this in my code by padding each row with zeros till multiple of 64-bytes. This also helps SIMD instructions to do fast aligned read/write.
In my following code I did all 1. - 6. steps above. Memory 64-bytes alignment I did through implementing AlignmentAllocator that was used in std::vector. Also I did time measurements for float/double/int.
On my old 4-core laptop I got following time measurements for the case of 1000x1000 matrix multiplying by 1000x1000:
float: time 0.1569 sec
double: time 0.3168 sec
int: time 0.1565 sec
To compare my hardware capabilities I did measurements of another answer of #doug for the case of int:
Threads w transpose 0.2164 secs.
As one can see, my solution is 1.4x faster than the other answer, I guess due to the 64-byte memory alignment and maybe due to using explicit SIMD (instead of relying on compiler auto-vectorization of a loop).
To compile my program, don't forget to add -fopenmp -lgomp options (for OpenMP support) and -march=native -O3 -std=c++20 options (for SIMD support, optimizations and standard) if you're compiling under GCC/CLang, while MSVC I guess adds OpenMP automatically and doesn't need any special options (use /O2 /GL /std:c++latest for optimizations and standard in MSVC).
In my code I only implemented SSE2/SSE4/AVX/AVX2 instructions for SIMD, if you have more powerful machine you may tell me and I implement also FMA/AVX-512, they will give even twice more speed boost.
My Mul() function is quite generic, it is templated, and you just pass pointers to matrices and row/col count, so your matrices may be stored on calling side in any way (through std::vector or std::array or plain 2D array).
At start of Run() function you may change number of rows and columns if you need a bigger test. Notice that all my functions support any rows and columns, you may even multiply matrix of size 1234x2345 by 2345x3456.
Try it online!
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <iomanip>
#include <vector>
#include <memory>
#include <string>
#include <immintrin.h>
#define USE_OPENMP 1
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#if defined(_MSC_VER)
#define IS_MSVC 1
#define IS_MSVC 0
#include <omp.h>
template <typename T, std::size_t N>
class AlignmentAllocator {
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef T * pointer;
typedef const T * const_pointer;
typedef T & reference;
typedef const T & const_reference;
inline AlignmentAllocator() throw() {}
template <typename T2> inline AlignmentAllocator(const AlignmentAllocator<T2, N> &) throw() {}
inline ~AlignmentAllocator() throw() {}
inline pointer adress(reference r) { return &r; }
inline const_pointer adress(const_reference r) const { return &r; }
inline pointer allocate(size_type n);
inline void deallocate(pointer p, size_type);
inline void construct(pointer p, const value_type & wert);
inline void destroy(pointer p) { p->~value_type(); }
inline size_type max_size() const throw() { return size_type(-1) / sizeof(value_type); }
template <typename T2> struct rebind { typedef AlignmentAllocator<T2, N> other; };
bool operator!=(const AlignmentAllocator<T, N> & other) const { return !(*this == other); }
bool operator==(const AlignmentAllocator<T, N> & other) const { return true; }
template <typename T, std::size_t N>
inline typename AlignmentAllocator<T, N>::pointer AlignmentAllocator<T, N>::allocate(size_type n) {
auto p = (pointer)_aligned_malloc(n * sizeof(value_type), N);
auto p = (pointer)std::aligned_alloc(N, n * sizeof(value_type));
return p;
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::deallocate(pointer p, size_type) {
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::construct(pointer p, const value_type & wert) {
new (p) value_type(wert);
template <typename T>
using AlignedVector = std::vector<T, AlignmentAllocator<T, 64>>;
template <typename T>
struct RegT;
#ifdef __AVX__
template <> struct RegT<float> { static size_t constexpr bisize = 256; using type = __m256; static type zero() { return _mm256_setzero_ps(); } };
template <> struct RegT<double> { static size_t constexpr bisize = 256; using type = __m256d; static type zero() { return _mm256_setzero_pd(); } };
inline void MulAddReg(float const * a, float const * b, __m256 & c) {
c = _mm256_add_ps(c, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
inline void MulAddReg(double const * a, double const * b, __m256d & c) {
c = _mm256_add_pd(c, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
inline void StoreReg(float * dst, __m256 const & src) { _mm256_store_ps(dst, src); }
inline void StoreReg(double * dst, __m256d const & src) { _mm256_store_pd(dst, src); }
#else // SSE2
template <> struct RegT<float> { static size_t constexpr bisize = 128; using type = __m128; static type zero() { return _mm_setzero_ps(); } };
template <> struct RegT<double> { static size_t constexpr bisize = 128; using type = __m128d; static type zero() { return _mm_setzero_pd(); } };
inline void MulAddReg(float const * a, float const * b, __m128 & c) {
c = _mm_add_ps(c, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
inline void MulAddReg(double const * a, double const * b, __m128d & c) {
c = _mm_add_pd(c, _mm_mul_pd(_mm_load_pd(a), _mm_load_pd(b)));
inline void StoreReg(float * dst, __m128 const & src) { _mm_store_ps(dst, src); }
inline void StoreReg(double * dst, __m128d const & src) { _mm_store_pd(dst, src); }
#ifdef __AVX2__
template <> struct RegT<int32_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };
//template <> struct RegT<int64_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };
inline void MulAddReg(int32_t const * a, int32_t const * b, __m256i & c) {
c = _mm256_add_epi32(c, _mm256_mullo_epi32(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
//inline void MulAddReg(int64_t const * a, int64_t const * b, __m256i & c) {
// c = _mm256_add_epi64(c, _mm256_mullo_epi64(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
inline void StoreReg(int32_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
//inline void StoreReg(int64_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
#else // SSE2
template <> struct RegT<int32_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };
//template <> struct RegT<int64_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };
inline void MulAddReg(int32_t const * a, int32_t const * b, __m128i & c) {
c = _mm_add_epi32(c, _mm_mullo_epi32(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
//inline void MulAddReg(int64_t const * a, int64_t const * b, __m128i & c) {
// c = _mm_add_epi64(c, _mm_mullo_epi64(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
inline void StoreReg(int32_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
//inline void StoreReg(int64_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
// Computes C = A0 * B0 for row-major matrices.
//   A0: A_rows x A_cols, B0: B_rows x B_cols (A_cols must equal B_rows),
//   C : A_rows x B_cols, written row-major with row stride B_cols.
// A is copied and B is transposed into 64-byte-aligned buffers whose rows are
// padded up to a multiple of `block`, so every SIMD load in the inner loop is
// an aligned load.
template <typename T>
void Mul(T const * A0, size_t A_rows, size_t A_cols, T const * B0, size_t B_rows, size_t B_cols, T * C) {
    // reg_cnt: elements of T per SIMD register; block: elements consumed per
    // unrolled step (four independent accumulator registers).
    size_t constexpr reg_cnt = RegT<T>::bisize / 8 / sizeof(T), block = 4 * reg_cnt;
    ASSERT(A_cols == B_rows);
    size_t const A_cols_aligned = (A_cols + block - 1) / block * block, B_rows_aligned = (B_rows + block - 1) / block * block;
    // Copy aligned A
    AlignedVector<T> Av(A_rows * A_cols_aligned);
    for (size_t i = 0; i < A_rows; ++i)
        std::memcpy(&Av[i * A_cols_aligned], &A0[i * A_cols], sizeof(Av[0]) * A_cols);
    T const * A = Av.data();
    // Transpose B
    AlignedVector<T> Bv(B_cols * B_rows_aligned);
    for (size_t j = 0; j < B_cols; ++j)
        for (size_t i = 0; i < B_rows; ++i)
            Bv[j * B_rows_aligned + i] = B0[i * B_cols + j];
    T const * Bt = Bv.data();
    ASSERT(uintptr_t(A) % 64 == 0 && uintptr_t(Bt) % 64 == 0);
    ASSERT(uintptr_t(&A[A_cols_aligned]) % 64 == 0 && uintptr_t(&Bt[B_rows_aligned]) % 64 == 0);
    // Multiply
    #pragma omp parallel for
    for (size_t i = 0; i < A_rows; ++i) {
        // Aligned Reg storage
        AlignedVector<T> Regs(block);
        for (size_t j = 0; j < B_cols; ++j) {
            T const * Arow = &A[i * A_cols_aligned + 0], * Btrow = &Bt[j * B_rows_aligned + 0];
            using Reg = typename RegT<T>::type;
            Reg r0 = RegT<T>::zero(), r1 = RegT<T>::zero(), r2 = RegT<T>::zero(), r3 = RegT<T>::zero();
            // Largest multiple of `block` not exceeding A_cols; the rest is a scalar tail.
            size_t const k_hi = A_cols - A_cols % block;
            for (size_t k = 0; k < k_hi; k += block) {
                MulAddReg(&Arow[k + reg_cnt * 0], &Btrow[k + reg_cnt * 0], r0);
                MulAddReg(&Arow[k + reg_cnt * 1], &Btrow[k + reg_cnt * 1], r1);
                MulAddReg(&Arow[k + reg_cnt * 2], &Btrow[k + reg_cnt * 2], r2);
                MulAddReg(&Arow[k + reg_cnt * 3], &Btrow[k + reg_cnt * 3], r3);
            }
            // Spill the four accumulators and reduce them horizontally.
            StoreReg(&Regs[reg_cnt * 0], r0);
            StoreReg(&Regs[reg_cnt * 1], r1);
            StoreReg(&Regs[reg_cnt * 2], r2);
            StoreReg(&Regs[reg_cnt * 3], r3);
            T sum1 = 0, sum2 = 0;
            for (size_t k = 0; k < Regs.size(); ++k)
                sum1 += Regs[k];
            for (size_t k = k_hi; k < A_cols; ++k)
                sum2 += Arow[k] * Btrow[k];
            // C has B_cols columns, so its row stride is B_cols (not A_rows).
            C[i * B_cols + j] = sum2 + sum1;
        }
    }
}
#include <random>
#include <thread>
#include <chrono>
#include <type_traits>
// Spot-checks C against a naive reference product of A (A_rows x A_cols) and
// B (B_rows x B_cols). Only the top-left (A_rows / 16) x (B_cols / 16) corner
// is verified to keep the check cheap. Fails through ASSERT_MSG when an
// element differs from the reference by more than eps * A_cols.
template <typename T>
void Test(T const * A, size_t A_rows, size_t A_cols, T const * B, size_t B_rows, size_t B_cols, T const * C, T eps) {
    for (size_t i = 0; i < A_rows / 16; ++i)
        for (size_t j = 0; j < B_cols / 16; ++j) {
            T sum = 0;
            for (size_t k = 0; k < A_cols; ++k)
                sum += A[i * A_cols + k] * B[k * B_cols + j];
            // C is A_rows x B_cols, row-major: its row stride is B_cols.
            T const got = C[i * B_cols + j];
            ASSERT_MSG(std::abs(got - sum) <= eps * A_cols, "i " + std::to_string(i) + " j " + std::to_string(j) +
                " C " + std::to_string(got) + " ref " + std::to_string(sum));
        }
}
double Time() {
static auto const gtb = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::high_resolution_clock::now() - gtb).count();
template <typename T>
void Run() {
size_t constexpr A_rows = 1000, A_cols = 1000, B_rows = 1000, B_cols = 1000;
std::string const tname = std::is_same_v<T, float> ? "float" : std::is_same_v<T, double> ?
"double" : std::is_same_v<T, int32_t> ? "int" : "<unknown>";
bool const is_int = tname == "int";
std::mt19937_64 rng{123};
std::vector<T> A(A_rows * A_cols), B(B_rows * B_cols), C(A_rows * B_cols);
for (size_t i = 0; i < A.size(); ++i)
A[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
for (size_t i = 0; i < B.size(); ++i)
B[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
auto tim = Time();
Mul(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0]);
tim = Time() - tim;
std::cout << std::setw(6) << tname << ": time " << std::fixed << std::setprecision(4) << tim << " sec" << std::endl;
Test(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0], tname == "float" ? T(1e-7) : tname == "double" ? T(1e-15) : T(0));
int main() {
try {
return 0;
} catch (std::exception const & ex) {
std::cout << "Exception: " << ex.what() << std::endl;
return -1;
float: time 0.1569 sec
double: time 0.3168 sec
int: time 0.1565 sec
Here's straight c++ code that runs in .08s with ints and .14s with floats or doubles. My system is 10 years old with relatively slow memory. Good at the time but now is now.
I agree with #VictorEijkhout that the best results would be with tuned code. There has been huge amounts of work optimizing those.
#include <vector>
#include <array>
#include <random>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <thread>
#include <future>
#include <chrono>
struct Timer {
std::chrono::system_clock::time_point snapTime;
Timer() { snapTime = std::chrono::system_clock::now(); }
operator double() { return std::chrono::duration<double>(std::chrono::system_clock::now() - snapTime).count(); }
using DataType = int;
using std::array, std::vector;
constexpr int N = 1000, THREADS = 12;
static auto launchType = std::launch::async;
using Matrix = vector<array<DataType, N>>;
Matrix create_matrix() { return Matrix(N); };
Matrix product(Matrix const& v0, Matrix const& v1, double& time)
Matrix ret = create_matrix();
Matrix v2 = create_matrix();
Timer timer;
for (size_t r = 0; r < N; r++) // transpose first
for (size_t c = 0; c < N; c++)
v2[c][r] = v1[r][c];
// lambda to process sets of rows in separate threads
auto do_row_set = [&v0, &v2, &ret](size_t start, size_t last) {
for (size_t row = start; row < last; row++)
for (size_t col = 0; col < N; col++)
DataType tmp{}; // separate tmp variable significantly improves optimization
for (size_t col_t = 0; col_t < N; col_t++)
tmp += v0[row][col_t] * v2[col][col_t];
ret[row][col] = tmp;
vector<size_t> seq;
const size_t NN = N / THREADS;
// make a sequence of NN+1 rows from start to end
for (size_t thread_n = 0; thread_n < N; thread_n += NN)
vector<std::future<void>> results; results.reserve(THREADS);
for (size_t i = 0; i < THREADS; i++)
results.emplace_back(std::async(launchType, do_row_set, seq[i], seq[i + 1]));
for (auto& x : results)
time = timer;
return ret;
bool operator==(Matrix const& v0, Matrix const& v1)
for (size_t r = 0; r < N; r++)
for (size_t c = 0; c < N; c++)
if (v0[r][c] != v1[r][c])
return false;
return true;
int main()
auto fill = [](Matrix& v) {
std::mt19937_64 r(1);
std::normal_distribution dist(1.);
for (size_t row = 0; row < N; row++)
for (size_t col = 0; col < N; col++)
v[row][col] = DataType(dist(r));
Matrix m1 = create_matrix(), m2 = create_matrix(), m3 = create_matrix();
fill(m1); fill(m2);
auto process_test = [&m1, &m2](Matrix& out) {
const int rpt_count = 4;
double sum = 0;
for (int i = 0; i < rpt_count; i++)
double time;
out = product(m1, m2, time);
sum += time / rpt_count;
return sum;
std::cout << std::fixed << std::setprecision(4);
double time{};
time = process_test(m3);
std::cout << "Threads w transpose " << time << " secs.\n";

BigInt multiplication and to_string implementation outputs too many zeros

I created the following for multiplying two big integers stored with base 1,000,000,000 as a vector<int32_t>:
#include <iostream>
#include <vector>
#include <cmath>
#include <limits>
#include <algorithm>
/// Returns 10^n for n >= 0 and 0 for negative n.
template<typename T>
constexpr T power_of_10(T n)
{
    return n < 0 ? 0 : n == 0 ? 1 : (n == 1 ? 10 : 10 * power_of_10(n - 1));
}

/// Largest power of ten whose digit count is guaranteed to fit in T
/// (1'000'000'000 for a 32-bit unsigned type).
template<typename T>
constexpr T base_value = power_of_10<T>(std::numeric_limits<T>::digits10);

/// Largest value of a single limb in base `base_value<T>`.
template<typename T>
constexpr T max_value = base_value<T> - 1;

/// Arbitrary-precision non-negative integer stored as base-10^9 limbs.
///
/// Limbs are kept least-significant first; the limb vector is never empty and
/// has no superfluous high-order zero limbs (zero is the single limb 0).
class BigInt {
    static constexpr std::uint32_t base = base_value<std::uint32_t>;
    static constexpr std::uint32_t max_digits = std::numeric_limits<std::uint32_t>::digits10;

    std::vector<std::uint64_t> digits;

    /// Drops high-order zero limbs so that the most significant limb is
    /// non-zero (or the value is exactly the single limb 0).
    void trim()
    {
        while (digits.size() > 1 && digits.back() == 0)
        {
            digits.pop_back();
        }
    }

public:
    BigInt(const char* value) : BigInt(std::string(value))
    {
    }

    /// Parses a decimal string. The string is expected to contain decimal
    /// digits only; an empty string yields zero. Chunk conversion is done by
    /// std::stoull, which throws std::invalid_argument on a chunk that does
    /// not start with a number.
    BigInt(const std::string& value)
    {
        // Limb boundaries must be aligned to the LEAST significant end of the
        // number, so walk the string from the back in groups of max_digits.
        // (Chunking from the front gives the short, partial group to the low
        // limb and shifts every other digit by the wrong power of ten.)
        for (std::size_t end = value.size(); end > 0; )
        {
            const std::size_t begin = end > max_digits ? end - max_digits : 0;
            digits.push_back(std::stoull(value.substr(begin, end - begin)));
            end = begin;
        }
        if (digits.empty())
        {
            digits.push_back(0);
        }
        trim();
    }

    /// Multiplies this number by `other` in place (schoolbook algorithm) and
    /// returns *this. Safe to call with `other` referring to *this.
    BigInt& multiply(const BigInt& other)
    {
        std::vector<std::uint64_t> product(digits.size() + other.digits.size(), 0);
        for (std::size_t i = 0; i < other.digits.size(); ++i)
        {
            std::uint64_t carry = 0;
            for (std::size_t j = 0; j < digits.size(); ++j)
            {
                // Each term is below base^2 + 2 * base, which fits in 64 bits.
                const std::uint64_t total = product[i + j] + (other.digits[i] * digits[j]) + carry;
                carry = total / base;
                product[i + j] = total % base;
            }
            // The slot just past this row has not been written yet.
            product[i + digits.size()] += carry;
        }
        digits.swap(product);
        trim();
        return *this;
    }

    /// Returns the decimal representation without leading zeros.
    std::string to_string() const
    {
        // The most significant limb is printed as-is; every lower limb is
        // left-padded with zeros to exactly max_digits characters.
        std::string result = std::to_string(digits.back());
        for (std::size_t i = digits.size() - 1; i-- > 0; )
        {
            const std::string group = std::to_string(digits[i]);
            if (group.size() < max_digits)
            {
                result.append(max_digits - group.size(), '0');
            }
            result += group;
        }
        return result;
    }
};
int main(int argc, const char * argv[])
BigInt a = "5000000000";
BigInt b = "5000000000";
return 0;
When I print the result of the multiplication, I am getting 5,000,000,000 * 5,000,000,000 = 250,000,000,000,000,000,000,000,000,000,000,000 which has way too many zeroes!
It should have 18 zeroes, but mine has 34.
I believe my multiplication algorithm is correct and my to_string is incorrect because 500 * 500 prints correctly as 250,000.
Any ideas what is wrong?
The problem comes from this line:
product[digits.size() + 1] = static_cast<T>(carry);
The index digits.size() + 1 is incorrect. It should be digits.size() + j.

Runtime error caused by a custom function

I encountered some runtime errors when I was trying to finish an A* algorithm for the TSP problem. The program never reaches the main function. Here is my code; it is long.
#include <cmath>
#include <iostream>
#include <memory>
#include <queue>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
constexpr const size_t R = 200; // Max Vertex number
pair<double, double> coordinate[ R ]{{1, 1}, {2, 2}, {1, 2}, {0, 0}};
// Index Min Priority Queue for double
//
// Binary min-heap of double keys, each associated with an item index in
// [0, maxN). Heap positions are 1-based; position 0 is unused, which lets
// qp[i] == 0 mean "item i is not in the queue".
class IndexMinPQ {
    const std::size_t maxN;          // capacity; valid item indices are 0 .. maxN-1
    std::size_t n;                   // number of items currently in the queue
    std::vector<std::size_t> pq;     // heap position (1..n) -> item index
    std::vector<std::size_t> qp;     // item index -> heap position, 0 if absent
    std::vector<double> elem;        // item index -> key

    // Swaps two heap positions and keeps the inverse map in sync.
    void exch(std::size_t i, std::size_t j) {
        std::swap(pq[ i ], pq[ j ]);
        qp[ pq[ i ] ] = i;
        qp[ pq[ j ] ] = j;
    }

    // Moves the entry at heap position k up until its parent is not larger.
    void swim(std::size_t k) {
        while (k > 1 && elem[ pq[ k / 2 ] ] > elem[ pq[ k ] ]) {
            exch(k, k / 2);
            k = k / 2;
        }
    }

    // Moves the entry at heap position k down until no child is smaller.
    void sink(std::size_t k) {
        while (2 * k <= n) {
            std::size_t j = 2 * k;
            if (j < n && elem[ pq[ j ] ] > elem[ pq[ j + 1 ] ]) ++j;
            if (elem[ pq[ k ] ] <= elem[ pq[ j ] ]) break;
            exch(k, j);
            k = j;
        }
    }

    inline void validateIndex(std::size_t i) const {
        if (i >= maxN) throw std::invalid_argument("index >= capacity: " + std::to_string(i));
    }

public:
    // Creates an empty queue able to hold item indices 0 .. maxN-1.
    IndexMinPQ(std::size_t maxN)
        : maxN(maxN), n(0), pq(maxN + 1, 0), qp(maxN + 1, 0), elem(maxN + 1, 0.0) {}

    // qp and elem are indexed by ITEM index, not by heap position, so a copy
    // has to carry all maxN + 1 slots; the vectors' own copy does exactly that.
    IndexMinPQ(const IndexMinPQ& orig) = default;
    IndexMinPQ& operator=(const IndexMinPQ&) = delete;
    ~IndexMinPQ() = default;

    // Adds item i with key val.
    // Throws std::invalid_argument if i is out of range or already present,
    // std::overflow_error if the queue is full.
    void insert(std::size_t i, double val) {
        validateIndex(i);
        if (n == maxN) throw std::overflow_error("priority queue is full");
        if (contains(i)) throw std::invalid_argument("index is already in the priority queue");
        ++n;
        qp[ i ] = n;
        elem[ i ] = val;
        pq[ n ] = i;
        swim(n);
    }

    // True if item i is currently in the queue; out-of-range indices are
    // reported as absent.
    bool contains(std::size_t i) const {
        return i < maxN && qp[ i ] != 0;
    }

    // Removes item i; does nothing if it is not in the queue.
    void delElem(std::size_t i) {
        if (!contains(i)) {
            // throw invalid_argument("index is not in the priority queue");
            return;
        }
        const std::size_t index = qp[ i ];
        exch(index, n--);   // move i to the last slot and shrink the heap
        swim(index);        // restore heap order for the entry that replaced it
        sink(index);
        elem[ i ] = 0.0;
        qp[ i ] = 0;
    }

    std::size_t size() const { return n; }

    // Smallest key in the queue; throws std::underflow_error when empty.
    double minElem() const {
        if (n == 0) throw std::underflow_error("priority queue is empty");
        return elem[ pq[ 1 ] ];
    }
};
// Weighted Edge
//
// Immutable undirected edge between vertices v and w, carrying a weight and
// an edge number.
class Edge {
    const std::size_t no;   // edge number
    const std::size_t v;    // one endpoint
    const std::size_t w;    // the other endpoint
    const double wei;       // edge weight

public:
    Edge(std::size_t No, std::size_t v, std::size_t w, double weight)
        : no(No), v(v), w(w), wei(weight) {}

    double weight() const { return this->wei; }
    std::size_t No() const { return no; }

    // Returns one endpoint of the edge.
    std::size_t either() const { return v; }

    // Returns the endpoint opposite to `vertex`.
    // Takes std::size_t (the type the endpoints are stored as) so that large
    // vertex ids are not narrowed and no signed/unsigned comparison occurs.
    // Throws std::invalid_argument if `vertex` is not an endpoint of this edge.
    std::size_t other(std::size_t vertex) const {
        if (vertex == v)
            return w;
        else if (vertex == w)
            return v;
        else
            throw std::invalid_argument("Inconsistent edge");
    }
};
class Graph {
const size_t V; // number of Vertexs
const size_t E; // number of Edges=V*(V-1)/2
Edge* edges; // All Edges
vector<size_t>* Adj; // adjcent table
Graph(size_t V) : V(V), E(V * (V - 1) / 2), Adj(new vector<size_t>[ V ]) {
allocator<Edge> alloc; // detach memory allocating and item constructing
edges = alloc.allocate(E);
size_t cnt = 0;
for (size_t i = 0; i < V - 1; ++i)
for (size_t j = i + 1; j < V; ++j) {
double dx = coordinate[ i ].first - coordinate[ j ].first,
dy = coordinate[ i ].second - coordinate[ j ].second;
// Euclid distance between two Vertexs
alloc.construct(edges + cnt, cnt, i, j, sqrt(dx * dx + dy * dy));
// construct adjcent table
for (size_t i = 0; i < E; ++i) {
size_t v = edges[ i ].either(), w = edges[ i ].other(v);
Adj[ v ].push_back(i);
Adj[ w ].push_back(i);
~Graph() {
delete[] Adj;
allocator<Edge> alloc;
for (size_t i = 0; i < E; ++i)
alloc.destroy(edges + i);
alloc.deallocate(edges, E);
inline size_t sizeV() const { return V; }
inline size_t sizeE() const { return E; }
inline const Edge& getEdge(size_t e) const {
if (e >= E)
throw invalid_argument("index >= edges: V * (V - 1) / 2 =" + to_string(E));
return edges[ e ];
inline const vector<size_t>& adj(size_t v) const {
if (v >= V) throw invalid_argument("index >= vetexs: " + to_string(V));
return Adj[ v ];
struct Aux {
const Graph& G;
vector<size_t> path;
bool* marked;
IndexMinPQ pq;
double dist;
double evaluate;
Aux(const Graph& G)
: G(G), marked(new bool[ G.sizeV() ]), pq(G.sizeE()), dist(0), evaluate(0) {
for (size_t i = 1; i < G.sizeV(); ++i)
marked[ i ] = false;
marked[ 0 ] = true;
for (size_t i = G.sizeV() - 1; i < G.sizeE(); ++i)
pq.insert(i, G.getEdge(i).weight());
~Aux() { delete[] marked; }
Aux(const Aux&) = delete;
Aux& operator=(const Aux&) = delete;
Aux(const Aux& orgi, const Edge& e)
: G(orgi.G), marked(new bool[ G.sizeV() ]), pq(orgi.pq), dist(orgi.dist) {
copy(orgi.marked, orgi.marked + G.sizeV(), marked);
size_t v = path.back(), w = e.other(v);
if (marked[ w ]) throw invalid_argument("already in path");
dist += e.weight();
evaluate = dist + (G.sizeV() - path.size() + 1) * pq.minElem();
for (const size_t& e : G.adj(w))
vector<size_t> AStar(const Graph& G) {
auto cmp = [](Aux* lhs, Aux* rhs) -> bool { return lhs->evaluate > rhs->evaluate; };
priority_queue<Aux*, vector<Aux*>, decltype(cmp)> pq(cmp);
Aux* t = new Aux(G);
while (true) {
Aux* a = pq.top();
if (a->path.size() == G.sizeV()) return a->path;
size_t v = a->path.back();
for (size_t ind : G.adj(v)) {
const Edge& e = G.getEdge(ind);
size_t w = e.other(v);
if (!a->marked[ w ]) {
Aux* t2 = new Aux(*a, e);
delete a;
// throw runtime_error("impossible to reach");
return vector<size_t>();
int main() {
// AStar(Graph(4));
return 0;
The compiler didn't report anything. However,
the program never reaches the main function; the console window just flashes and closes. I tried to set a breakpoint, but it was not hit. This is what gdb printed:
(gdb) r
Starting program: D:\cs\c++\exercise\source3.exe
[New Thread 8152.0x1aec]
[New Thread 8152.0x3254]
[New Thread 8152.0x1740]
[New Thread 8152.0x288]
Mingw-w64 runtime failure:
Unknown pseudo relocation protocol version 256.
[Thread 8152.0x3254 exited with code 3]
[Thread 8152.0x1740 exited with code 3]
[Thread 8152.0x288 exited with code 3]
[Inferior 1 (process 8152) exited with code 03]
However, after I commented the function AStar(line 191-213), the problem disappeared. What is the problem?
You call back() on an empty vector. But please use a (good) debugger before posting here! GDB is good but not really easy to use...
size_t v = a->path.back();

Find max position in a vector of vector of vector

I have a vector of vector of vector
std::vector<std::vector<std::vector<double>>> mountain_table
and I would like to find the coordinates i, j, k of this vector for which it is the highest. I know that I should use max_element but I don't know how to use it in a 3d vector.
How should I get those coordinates?
I'd suggest linearizing your data so that you can use standard algorithms. The idea is to provide a couple of functions to get a linear index from 3D coordinates and vice versa:
/// Minimal dense 3D array stored in one contiguous std::vector, so that
/// standard algorithms can run over all elements and the resulting iterator
/// can be mapped back to (x, y, z) coordinates.
///
/// Layout is row-major: z varies fastest, then y, then x.
template<class T>
class Matrix3D // minimal
{
public:
    using value_type = T;
    using iterator = typename std::vector<value_type>::iterator;

private:
    std::vector<value_type> _data;
    std::size_t _sizex, _sizey, _sizez;

    /// Linear offset of element (x, y, z).
    std::size_t index_from_coords(std::size_t x, std::size_t y, std::size_t z) const
    {
        return (x * _sizey + y) * _sizez + z;
    }

    /// Inverse of index_from_coords. Returns (0, 0, 0) when the array has no
    /// elements, to avoid dividing by zero.
    std::tuple<std::size_t, std::size_t, std::size_t> coords_from_index(std::size_t index) const
    {
        const std::size_t plane = _sizey * _sizez; // elements per x-slice
        if (plane == 0)
        {
            return std::make_tuple(std::size_t(0), std::size_t(0), std::size_t(0));
        }
        const std::size_t x = index / plane;
        index = index % plane;
        const std::size_t y = index / _sizez;
        const std::size_t z = index % _sizez;
        return std::make_tuple(x, y, z);
    }

public:
    /// Creates a sizex * sizey * sizez array of value-initialized elements.
    Matrix3D(std::size_t sizex, std::size_t sizey, std::size_t sizez)
        : _data(sizex * sizey * sizez), _sizex(sizex), _sizey(sizey), _sizez(sizez) {}

    /// Unchecked element access.
    T& operator()(std::size_t x, std::size_t y, std::size_t z)
    {
        return _data[index_from_coords(x, y, z)];
    }

    const T& operator()(std::size_t x, std::size_t y, std::size_t z) const
    {
        return _data[index_from_coords(x, y, z)];
    }

    /// Maps an iterator obtained from begin()/end() (or an algorithm run over
    /// that range) back to the (x, y, z) coordinates of the element.
    std::tuple<std::size_t, std::size_t, std::size_t> coords(iterator it)
    {
        const std::size_t index = static_cast<std::size_t>(std::distance(_data.begin(), it));
        return coords_from_index(index);
    }

    // Call the container's members directly: inside the class an unqualified
    // begin(_data) would find these member functions, not std::begin.
    iterator begin() { return _data.begin(); }
    iterator end() { return _data.end(); }
};
Matrix3D<double> m(3, 3, 3);
auto it = std::max_element(m.begin(), m.end()); // or min, or whatever from http://en.cppreference.com/w/cpp/header/algorithm
auto coords = m.coords(it);
std::cout << "x=" << coords.get<0>() << ... << "\n";
This is untested and incomplete code to give you a kickstart into better data design. i'd be happy to answer further questions about this idea in the comment below ;)
Here is how I would do it, by looping over the matrix, checking for highest values, and recording its indexes.
// Visit every cell of mountain_table, remembering the largest value seen so
// far together with the indexes at which it was found. Rows may have
// different lengths because each level is bounded by its own size().
size_t highestI = 0;
size_t highestJ = 0;
size_t highestK = 0;
double highestValue = -std::numeric_limits<double>::infinity(); // Default value (Include <limits>)
for (size_t i = 0; i < mountain_table.size(); ++i)
{
    for (size_t j = 0; j < mountain_table[i].size(); ++j)
    {
        for (size_t k = 0; k < mountain_table[i][j].size(); ++k)
        {
            if (mountain_table[i][j][k] > highestValue)
            {
                highestValue = mountain_table[i][j][k]; // Highest
                // value needed to figure out highest indexes
                // Stores the current highest indexes; these must stay inside
                // the `if`, otherwise they would be overwritten by every cell.
                highestI = i;
                highestJ = j;
                highestK = k;
            }
        }
    }
}
This may not be the most efficient algorithm, but it gets the job done in an understandable way.
Since the max_element function is pretty short and easy to implement, I would suggest to write something similar yourself to fit your exact scenario.
// For types like this I would suggest using a type alias
using Vector3d = std::vector<std::vector<std::vector<double>>>;

/// Returns the indexes {i, j, k} of the largest element of `vector`.
///
/// Inner vectors may be empty or of different lengths. When several elements
/// share the maximum, the first one in iteration order is reported. If the
/// container holds no elements at all, {0, 0, 0} is returned.
std::array<size_t, 3> max_element(const Vector3d& vector) {
    std::array<size_t, 3> indexes = {{0, 0, 0}};
    bool found = false;      // becomes true once the first element was seen
    double biggest = 0.0;    // only meaningful while `found` is true
    for (size_t i = 0; i < vector.size(); ++i) {
        for (size_t j = 0; j < vector[i].size(); ++j) {
            for (size_t k = 0; k < vector[i][j].size(); ++k) {
                const double value = vector[i][j][k];
                if (!found || value > biggest) {
                    found = true;
                    biggest = value;
                    indexes = {{i, j, k}};
                }
            }
        }
    }
    return indexes;
}
One other suggestion I could give you is to write your custom class Vector3d, with convenient functions like operator()(int x, int y, int z) etc. and save the data internally in simple vector<double> of size width * height * depth.
// rv receives the {i, j, k} indexes of the largest value in mountain_table.
// Precondition: mountain_table[0][0][0] exists, since it seeds max_value.
std::size_t rv[3] = {0};
std::size_t i = 0;
double max_value = mountain_table[0][0][0];
for (const auto& x : mountain_table) {
    std::size_t j = 0;
    for (const auto& y : x) {
        // std::max_element returns y.end() for an empty row; never dereference that.
        auto it = std::max_element(y.begin(), y.end());
        if (it != y.end() && *it > max_value) {
            rv[0] = i; rv[1] = j; rv[2] = it - y.begin();
            max_value = *it;
        }
        ++j;   // keep the index counters in step with the range-for loops
    }
    ++i;
}
I do not think you can use std::max_element for such data. You can use std::accumulate():
// One alias per nesting level: row of values, plane of rows, stack of planes.
using dvect   = std::vector<double>;
using ddvect  = std::vector<dvect>;
using dddvect = std::vector<ddvect>;

// Deliberately jagged sample data: rows differ in length and some are empty.
dddvect mx = {
    { {  1.0,  2.0,  3.0 }, { -1.0,  3.0 },      { 8.0, -2.0, 3.0 } },
    { {},                   { -1.0, 25.0, 3.0 }, { 7.0,  3.0, 3.0 } },
    { { -1.0, -2.0, -3.0 }, {},                  { 33.0 }           }
};
/// Running maximum: a value together with the (i, j, k) position it came from.
/// A default-constructed object holds -infinity, so any finite value beats it.
struct max_value {
    size_t i = 0;
    size_t j = 0;
    size_t k = 0;
    double value = -std::numeric_limits<double>::infinity();
    max_value() = default;
    max_value( size_t i, size_t j, size_t k, double v ) : i( i ), j( j ), k( k ), value( v ) {}

    /// Returns whichever operand holds the larger value; on a tie the
    /// right-hand operand `v` is returned.
    max_value operator<<( const max_value &v ) const
    {
        return value > v.value ? *this : v;
    }
};
// Three nested folds, one per nesting level. Each level recovers its own index
// from the address of the element it was handed (elements of a std::vector are
// contiguous) and threads the running maximum through to the level below.
auto max = std::accumulate( mx.begin(), mx.end(), max_value{},
    [&mx]( const max_value &best, const ddvect &plane ) {
        const auto i = std::distance( &*mx.cbegin(), &plane );
        return std::accumulate( plane.begin(), plane.end(), best,
            [i, &plane]( const max_value &best, const dvect &row ) {
                const auto j = std::distance( &*plane.cbegin(), &row );
                return std::accumulate( row.begin(), row.end(), best,
                    [i, j, &row]( const max_value &best, const double &cell ) {
                        const auto k = std::distance( &*row.cbegin(), &cell );
                        return best << max_value( i, j, k, cell );
                    } );
            } );
    } );
live example. The code could be simplified if C++14 or later is allowed, but I am not sure it would be worth the effort; reorganizing the data would most probably work better (you would be able to use std::max_element() on a single vector, for example). On the other hand, this layout supports jagged matrices, as shown in the example (sub-arrays of different sizes).
You should use a "for" loop, because what you have is a vector of vectors of vectors rather than a true 3D array.
// Skeleton only: walk every innermost row; the per-row search is left to the reader.
for (size_t i = 0; i < mountain_table.size(); ++i) {
    for (size_t j = 0; j < mountain_table[i].size(); ++j) {
        // find max element index k here and check if it is maximum.
        // If yes save i, j, k and update max val
    }
}

Storing a Big Number in a Variable and Looping

How can I store a big number in a variable and use it in a for loop?
I have a very big number, 75472202764752234070123900087933251, and I need to loop from 0 to this number!
Is it even possible to do this? how much time will it take to end?
EDIT: i am trying to solve a hard problem by brute force. its a combination problem.the bruteforcing cases may reach 470C450.
so i guess i should use a different algorithm...
This might take
0.23 x 10^23 years if C++ processed 100,000 loops per second :|
It looks that this number fits into 128 bit. So you could use a modern system and a modern compiler that implements such numbers. This would e.g be the case for a 64bit linux system with gcc as a compiler. This has something like __uint128_t that you could use.
Obviously you can't use such a variable as a for-loop variable; others have given you the calculations. But you could use it to store some of your calculations.
Well, you would need an implementation that can handle at least a subset of the initialization, boolean, and arithmetic functions on very large integers. Something like: https://mattmccutchen.net/bigint/.
For something that would give a bit better performance than a general large integer math library, you could use specialized operations specifically to allow use of a large integer as a counter. For an example of this, see dewtell's updated answer to this question.
As for it being possible for you to loop from 0 to that number: well, yes, it is possible to write the code for it with one of the above solutions, but I think the answer is no, you personally will not be able to do it because you will not be alive to see it finish.
[edit: Yes, I would definitely recommend you find a different algorithm. :D]
If you need to loop a certain number of times, and that number is greater than 2^64, just use while(1) because your computer will break before it counts up to 2^64 anyway.
There's no need for a complete bignum package - if all you need is a loop counter, here's a simple byte counter that uses an array of bytes as a counter. It stops when the byte array wraps around to all zeros again. If you wanted to count to some other value than 2^(bytesUsed*CHAR_BIT), you could just compute the two's complement value of the negative of the number of iterations you wanted, and let it count up to 0, keeping in mind that bytes[0] is the low-order byte (or use the positive value and count down instead of up).
#include <stdio.h>
#define MAXBYTES 20
/* Simple byte counter - note it uses argc as # of bytes to use for convenience */
/*
 * Treats bytes[0..bytesUsed-1] as one little-endian counter (bytes[0] is the
 * low-order byte) and increments it until it wraps around to all zeros, i.e.
 * 2^(bytesUsed*CHAR_BIT) times. `counter` starts at (unsigned long)-1 and is
 * bumped once per iteration, so for small sizes it ends at iterations - 1;
 * for larger sizes it simply wraps.
 */
int main(int argc, char **argv) {
    unsigned char bytes[MAXBYTES];
    const int bytesUsed = argc < MAXBYTES? argc : MAXBYTES;
    int i;
    unsigned long counter = (unsigned long)-1; /* to give loop something to do */
    (void)argv; /* only the argument count is used */
    for (i = 0; i < bytesUsed; i++) bytes[i] = 0; /* Initialize bytes */
    do {
        /* Increment with carry: move to the next byte only when this one wrapped to 0. */
        for (i = 0; i < bytesUsed && !++bytes[i]; i++) ; /* NULL BODY - this is the byte counter */
        counter++;
    } while (i < bytesUsed); /* i == bytesUsed means every byte wrapped: done */
    printf("With %d bytes used, final counter value = %lu\n", bytesUsed, counter);
    return 0;
}
Run times for the first 4 values (under Cygwin, on a Lenovo T61):
$ time ./bytecounter
With 1 bytes used, final counter value = 255
real 0m0.078s
user 0m0.031s
sys 0m0.046s
$ time ./bytecounter a
With 2 bytes used, final counter value = 65535
real 0m0.063s
user 0m0.031s
sys 0m0.031s
$ time ./bytecounter a a
With 3 bytes used, final counter value = 16777215
real 0m0.125s
user 0m0.015s
sys 0m0.046s
$ time ./bytecounter a a a
With 4 bytes used, final counter value = 4294967295
real 0m6.578s
user 0m0.015s
sys 0m0.047s
At this rate, five bytes should take around half an hour, and six bytes should take the better part of a week. Of course the counter value will be inaccurate for those - it's mostly just there to verify the number of iterations for the smaller byte values and give the loop something to do.
Edit: And here's the time for five bytes, around half an hour as I predicted:
$ time ./bytecounter a a a a
With 5 bytes used, final counter value = 4294967295
real 27m22.184s
user 0m0.015s
sys 0m0.062s
Ok, here's code to take an arbitrary decimal number passed as the first arg and count down from it to zero. I set it up to allow the counter to use different size elements (just change the typedef for COUNTER_BASE), but it turns out that bytes are actually somewhat faster than either short or long on my system.
#include <stdio.h>
#include <limits.h> // defines CHAR_BIT
#include <ctype.h>
#include <vector>
using std::vector;

// Element type of the loop counter; change this alias to try wider elements.
using COUNTER_BASE = unsigned char;
// The counter itself, least-significant element first.
using COUNTER = vector<COUNTER_BASE>;
// A number stored one byte per element, least-significant byte first.
using BYTEVEC = vector<unsigned char>;

// Every bit above the low byte: `x & byteMask` is the carry out of a byte,
// `x & ~byteMask` is the byte itself.
const unsigned long byteMask = ~0ul << CHAR_BIT;
const size_t MAXBYTES = 20;
/// Multiplies the little-endian byte number in `val` by 10 in place, appending
/// one extra byte when the product no longer fits.
void mult10(BYTEVEC &val) {
    // Multiply value by 10
    unsigned int carry = 0;
    for (size_t i = 0; i < val.size(); i++) {
        unsigned long value = val[i]*10ul+carry;
        carry = (value & byteMask) >> CHAR_BIT;                    // overflow out of this byte
        val[i] = static_cast<unsigned char>(value & ~byteMask);    // low byte stays in place
    }
    // The carry out of the top byte is always smaller than 10, so it fits in one byte.
    if (carry > 0) val.push_back(static_cast<unsigned char>(carry));
}
/// Adds the decimal digit character `digit` ('0'..'9') to the little-endian
/// byte number in `val`, propagating the carry and growing `val` if needed.
void addDigit(BYTEVEC &val, const char digit) {
    // Add digit to the number in BYTEVEC.
    unsigned int carry = digit - '0'; // Assumes ASCII char set
    // Stop as soon as there is nothing left to carry into the next byte.
    for (size_t i = 0; i < val.size() && carry; i++) {
        unsigned long value = static_cast<unsigned long>(val[i])+carry;
        carry = (value & byteMask) >> CHAR_BIT;
        val[i] = static_cast<unsigned char>(value & ~byteMask);
    }
    if (carry > 0) val.push_back(static_cast<unsigned char>(carry));
}
/// Parses the decimal digits of `str` into a little-endian byte number.
/// A null `str` or a string without digits yields the single byte 0.
BYTEVEC Cstr2Bytevec(const char *str) {
    // Turn a C-style string into a BYTEVEC. Only the digits in str apply,
    // so that one can use commas, underscores, or other non-digits to separate
    // digit groups.
    BYTEVEC result;
    result.reserve(MAXBYTES);
    result.push_back(0); // start from the value 0 so the result is never empty
    if (str == NULL) return result;
    for (; *str; ++str) {
        // isdigit() requires a value representable as unsigned char (or EOF);
        // a plain char may be negative.
        if (isdigit(static_cast<unsigned char>(*str))) {
            mult10(result);          // shift the digits read so far one decimal place
            addDigit(result, *str);  // then add the new lowest digit
        }
    }
    return result;
}
/// Replaces the contents of `ctr` with the number held in `val`, packing
/// sizeof(COUNTER_BASE) consecutive bytes of `val` into each counter element
/// (low byte first). An empty `val` produces a single zero element.
void packCounter(COUNTER &ctr, const BYTEVEC &val) {
    // Pack the bytes from val into the (possibly larger) datatype of COUNTER
    ctr.clear();
    COUNTER_BASE value = 0;
    for (size_t i = 0; i < val.size(); i++) {
        const size_t pos = i%sizeof(COUNTER_BASE); // position of this byte in the value
        if (i > 0 && pos == 0) {
            ctr.push_back(value); // previous element is complete: store it
            value = val[i];       // and start the next one with this byte
        } else {
            value |= static_cast<COUNTER_BASE>(val[i]) << pos*CHAR_BIT;
        }
    }
    ctr.push_back(value); // store the last (possibly partial) element
}
/// Decrements the multi-element counter `ctr` by one (element 0 is the least
/// significant). Returns true if the value before the decrement was non-zero;
/// returns false when it was all zeros, in which case every element has
/// wrapped around to its maximum.
inline bool decrementAndTest(COUNTER &ctr) {
    // decrement value in ctr and return true if old value was not all zeros
    size_t i = 0;
    // An element that was 0 wraps and borrows from the next one; stop at the
    // first element that was non-zero before its decrement.
    while (i < ctr.size() && !(ctr[i]--)) ++i;
    return i < ctr.size();
}
/// Same as decrementAndTest(), but operating on a raw array of `size` elements.
/// `ctr` may be null only when `size` is 0.
inline bool decrementAndTest2(COUNTER_BASE *ctr, const size_t size) {
    // decrement value in ctr and return true if old value was not all zeros
    size_t i = 0;
    // Borrow propagates through every element that was 0 before the decrement.
    while (i < size && !(ctr[i]--)) ++i;
    return i < size;
}
/* Vector counter - uses first arg (if supplied) as the count */
int main(int argc, const char *argv[]) {
BYTEVEC limit = Cstr2Bytevec(argc > 1? argv[1] : "0");
packCounter(ctr, limit);
COUNTER_BASE *ctr_vals = ctr.size() > 0 ? &ctr[0] : NULL;
size_t ctr_size = ctr.size();
unsigned long ul_counter = 0ul; /* to give loop something to do */
while(decrementAndTest2(ctr_vals, ctr_size)) {
printf("With %d bytes used, final ul_counter value = %lu\n", limit.size(), ul_counter);
return 0;
Examples of use:
$ time ./bigcounter 5
With 1 bytes used, final ul_counter value = 5
real 0m0.094s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 5,000
With 2 bytes used, final ul_counter value = 5000
real 0m0.062s
user 0m0.015s
sys 0m0.062s
$ time ./bigcounter 5,000,000
With 3 bytes used, final ul_counter value = 5000000
real 0m0.093s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 1,000,000,000
With 4 bytes used, final ul_counter value = 1000000000
real 0m2.688s
user 0m0.015s
sys 0m0.015s
$ time ./bigcounter 2,000,000,000
With 4 bytes used, final ul_counter value = 2000000000
real 0m5.125s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 3,000,000,000
With 4 bytes used, final ul_counter value = 3000000000
real 0m7.485s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 4,000,000,000
With 4 bytes used, final ul_counter value = 4000000000
real 0m9.875s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 5,000,000,000
With 5 bytes used, final ul_counter value = 705032704
real 0m12.594s
user 0m0.046s
sys 0m0.015s
$ time ./bigcounter 6,000,000,000
With 5 bytes used, final ul_counter value = 1705032704
real 0m14.813s
user 0m0.015s
sys 0m0.062s
Unwrapping the counter vector into C-style data structures (i.e., using decrementAndTest2 instead of decrementAndTest) sped things up by around 20-25%, but the code is still about twice as slow as my previous C program for similar-sized examples (around 4 billion). This is with MS Visual C++ 6.0 as the compiler in release mode, optimizing for speed, on a 2GHz dual-core system, for both programs. Inlining the decrementAndTest2 function definitely makes a big difference (around 12 sec. vs. 30 for the 5 billion loop), but I'll have to see whether physically inlining the code as I did in the C program can get similar performance.
The bigint variable in the main function below can store very large values, even 100 factorial.
#include <iostream>
#include <cstdio>
#include <vector>
#include <cstring>
#include <string>
#include <map>
#include <functional>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <stack>
#include <queue>
#include <deque>
#include <limits>
#include <cmath>
#include <numeric>
#include <set>
using namespace std;
//template for BIGINIT
// base and base_digits must be consistent
const int base = 10;
const int base_digits = 1;

/// Arbitrary-precision signed integer.
///
/// The magnitude is stored in `a`, least-significant limb first, each limb in
/// [0, base) where base == 10^base_digits. `sign` is +1 or -1. A value of zero
/// is normally an empty `a` with sign +1, but unary minus can produce a
/// "negative zero" that the addition/subtraction code relies on.
struct bigint {
    std::vector<int> a;
    int sign;

    bigint() :
        sign(1) {
    }

    bigint(long long v) {
        *this = v;
    }

    /// Parses an optionally signed decimal string; see read().
    bigint(const std::string &s) {
        read(s);
    }

    void operator=(const bigint &v) {
        sign = v.sign;
        a = v.a;
    }

    /// Assigns a built-in integer. NOTE: v == LLONG_MIN is not supported
    /// because it cannot be negated.
    void operator=(long long v) {
        sign = 1;
        a.clear(); // drop the digits of any previous value before appending
        if (v < 0)
            sign = -1, v = -v;
        for (; v > 0; v = v / base)
            a.push_back(static_cast<int>(v % base));
    }

    bigint operator+(const bigint &v) const {
        if (sign == v.sign) {
            bigint res = v;

            for (int i = 0, carry = 0; i < static_cast<int>(std::max(a.size(), v.a.size())) || carry; ++i) {
                if (i == static_cast<int>(res.a.size()))
                    res.a.push_back(0);
                res.a[i] += carry + (i < static_cast<int>(a.size()) ? a[i] : 0);
                carry = res.a[i] >= base;
                if (carry)
                    res.a[i] -= base;
            }
            return res;
        }
        // Different signs: x + y == x - (-y).
        return *this - (-v);
    }

    bigint operator-(const bigint &v) const {
        if (sign == v.sign) {
            if (abs() >= v.abs()) {
                bigint res = *this;
                for (int i = 0, carry = 0; i < static_cast<int>(v.a.size()) || carry; ++i) {
                    res.a[i] -= carry + (i < static_cast<int>(v.a.size()) ? v.a[i] : 0);
                    carry = res.a[i] < 0;
                    if (carry)
                        res.a[i] += base;
                }
                res.trim();
                return res;
            }
            // |this| < |v|: compute the opposite difference and flip it.
            return -(v - *this);
        }
        // Different signs: x - y == x + (-y).
        return *this + (-v);
    }

    void operator*=(int v) {
        if (v < 0)
            sign = -sign, v = -v;
        for (int i = 0, carry = 0; i < static_cast<int>(a.size()) || carry; ++i) {
            if (i == static_cast<int>(a.size()))
                a.push_back(0);
            long long cur = a[i] * static_cast<long long>(v) + carry;
            carry = static_cast<int>(cur / base);
            a[i] = static_cast<int>(cur % base);
            //asm("divl %%ecx" : "=a"(carry), "=d"(a[i]) : "A"(cur), "c"(base));
        }
        trim();
    }

    bigint operator*(int v) const {
        bigint res = *this;
        res *= v;
        return res;
    }

    /// Long division: returns {quotient, remainder} of a1 / b1, truncating
    /// toward zero; the remainder takes the sign of a1.
    /// Precondition: b1 is non-zero and trimmed (b1.a is not empty).
    friend std::pair<bigint, bigint> divmod(const bigint &a1, const bigint &b1) {
        // Scale both operands so the divisor's top limb is at least base/2;
        // this keeps the per-limb quotient estimate below close to the truth.
        int norm = base / (b1.a.back() + 1);
        bigint a = a1.abs() * norm;
        bigint b = b1.abs() * norm;
        bigint q, r;
        q.a.resize(a.a.size());

        for (int i = static_cast<int>(a.a.size()) - 1; i >= 0; i--) {
            r *= base;
            r += a.a[i];
            int s1 = r.a.size() <= b.a.size() ? 0 : r.a[b.a.size()];
            int s2 = r.a.size() <= b.a.size() - 1 ? 0 : r.a[b.a.size() - 1];
            int d = static_cast<int>((static_cast<long long>(base) * s1 + s2) / b.a.back());
            r -= b * d;
            // The estimate can only be too large; correct it downwards.
            while (r < 0)
                r += b, --d;
            q.a[i] = d;
        }

        q.sign = a1.sign * b1.sign;
        r.sign = a1.sign;
        q.trim();
        r.trim();
        return std::make_pair(q, r / norm);
    }

    bigint operator/(const bigint &v) const {
        return divmod(*this, v).first;
    }

    bigint operator%(const bigint &v) const {
        return divmod(*this, v).second;
    }

    void operator/=(int v) {
        if (v < 0)
            sign = -sign, v = -v;
        for (int i = static_cast<int>(a.size()) - 1, rem = 0; i >= 0; --i) {
            long long cur = a[i] + rem * static_cast<long long>(base);
            a[i] = static_cast<int>(cur / v);
            rem = static_cast<int>(cur % v);
        }
        trim();
    }

    bigint operator/(int v) const {
        bigint res = *this;
        res /= v;
        return res;
    }

    int operator%(int v) const {
        if (v < 0)
            v = -v;
        int m = 0;
        for (int i = static_cast<int>(a.size()) - 1; i >= 0; --i)
            m = static_cast<int>((a[i] + m * static_cast<long long>(base)) % v);
        return m * sign;
    }

    void operator+=(const bigint &v) {
        *this = *this + v;
    }
    void operator-=(const bigint &v) {
        *this = *this - v;
    }
    void operator*=(const bigint &v) {
        *this = *this * v;
    }
    void operator/=(const bigint &v) {
        *this = *this / v;
    }

    /// Strict ordering; both operands are expected to be trimmed.
    bool operator<(const bigint &v) const {
        if (sign != v.sign)
            return sign < v.sign;
        // Same sign: more limbs means larger magnitude, which is the smaller
        // number when both are negative. Compared without unsigned arithmetic.
        if (a.size() != v.a.size())
            return sign > 0 ? a.size() < v.a.size() : a.size() > v.a.size();
        for (int i = static_cast<int>(a.size()) - 1; i >= 0; i--)
            if (a[i] != v.a[i])
                return a[i] * sign < v.a[i] * sign;
        return false;
    }

    bool operator>(const bigint &v) const {
        return v < *this;
    }
    bool operator<=(const bigint &v) const {
        return !(v < *this);
    }
    bool operator>=(const bigint &v) const {
        return !(*this < v);
    }
    bool operator==(const bigint &v) const {
        return !(*this < v) && !(v < *this);
    }
    bool operator!=(const bigint &v) const {
        return *this < v || v < *this;
    }

    /// Removes leading zero limbs and normalizes the sign of zero to +1.
    void trim() {
        while (!a.empty() && !a.back())
            a.pop_back();
        if (a.empty())
            sign = 1;
    }

    bool isZero() const {
        return a.empty() || (a.size() == 1 && !a[0]);
    }

    /// Negation. Deliberately flips the sign even for zero: operator+ and
    /// operator- bounce between each other via -v and depend on that flip.
    bigint operator-() const {
        bigint res = *this;
        res.sign = -sign;
        return res;
    }

    bigint abs() const {
        bigint res = *this;
        res.sign *= res.sign; // (+1)^2 == (-1)^2 == +1
        return res;
    }

    /// Converts to long long; the result is meaningless if the value does not fit.
    long long longValue() const {
        long long res = 0;
        for (int i = static_cast<int>(a.size()) - 1; i >= 0; i--)
            res = res * base + a[i];
        return res * sign;
    }

    friend bigint gcd(const bigint &a, const bigint &b) {
        return b.isZero() ? a : gcd(b, a % b);
    }
    friend bigint lcm(const bigint &a, const bigint &b) {
        return a / gcd(a, b) * b;
    }

    /// Replaces the value with the decimal number in `s`: any run of leading
    /// '+'/'-' signs followed by digits. Characters are not validated.
    void read(const std::string &s) {
        sign = 1;
        a.clear();
        int pos = 0;
        while (pos < static_cast<int>(s.size()) && (s[pos] == '-' || s[pos] == '+')) {
            if (s[pos] == '-')
                sign = -sign;
            ++pos;
        }
        // Consume base_digits characters per limb, starting from the end of the string.
        for (int i = static_cast<int>(s.size()) - 1; i >= pos; i -= base_digits) {
            int x = 0;
            for (int j = std::max(pos, i - base_digits + 1); j <= i; j++)
                x = x * 10 + s[j] - '0';
            a.push_back(x);
        }
        trim();
    }

    friend std::istream& operator>>(std::istream &stream, bigint &v) {
        std::string s;
        stream >> s;
        v.read(s);
        return stream;
    }

    friend std::ostream& operator<<(std::ostream &stream, const bigint &v) {
        if (v.sign == -1)
            stream << '-';
        stream << (v.a.empty() ? 0 : v.a.back());
        // Every limb below the top one is zero-padded to base_digits characters.
        for (int i = static_cast<int>(v.a.size()) - 2; i >= 0; --i)
            stream << std::setw(base_digits) << std::setfill('0') << v.a[i];
        return stream;
    }

    /// Re-groups little-endian limbs of `old_digits` decimal digits each into
    /// limbs of `new_digits` decimal digits each; leading zero limbs are dropped.
    static std::vector<int> convert_base(const std::vector<int> &a, int old_digits, int new_digits) {
        std::vector<long long> p(std::max(old_digits, new_digits) + 1);
        p[0] = 1;
        for (int i = 1; i < static_cast<int>(p.size()); i++)
            p[i] = p[i - 1] * 10;
        std::vector<int> res;
        long long cur = 0;
        int cur_digits = 0;
        for (int i = 0; i < static_cast<int>(a.size()); i++) {
            cur += a[i] * p[cur_digits];
            cur_digits += old_digits;
            while (cur_digits >= new_digits) {
                res.push_back(static_cast<int>(cur % p[new_digits]));
                cur /= p[new_digits];
                cur_digits -= new_digits;
            }
        }
        res.push_back(static_cast<int>(cur));
        while (!res.empty() && !res.back())
            res.pop_back();
        return res;
    }

    typedef std::vector<long long> vll;

    /// Karatsuba product of two equally long limb vectors whose length is a
    /// power of two (or at most 32). Returns 2*n limbs with carries NOT yet
    /// propagated.
    static vll karatsubaMultiply(const vll &a, const vll &b) {
        int n = static_cast<int>(a.size());
        vll res(n + n);
        if (n <= 32) {
            // Small inputs: plain schoolbook multiplication.
            for (int i = 0; i < n; i++)
                for (int j = 0; j < n; j++)
                    res[i + j] += a[i] * b[j];
            return res;
        }

        int k = n >> 1;
        vll a1(a.begin(), a.begin() + k);
        vll a2(a.begin() + k, a.end());
        vll b1(b.begin(), b.begin() + k);
        vll b2(b.begin() + k, b.end());

        vll a1b1 = karatsubaMultiply(a1, b1);
        vll a2b2 = karatsubaMultiply(a2, b2);

        for (int i = 0; i < k; i++)
            a2[i] += a1[i];
        for (int i = 0; i < k; i++)
            b2[i] += b1[i];

        // (a1 + a2) * (b1 + b2) - a1*b1 - a2*b2 == a1*b2 + a2*b1, the middle term.
        vll r = karatsubaMultiply(a2, b2);
        for (int i = 0; i < static_cast<int>(a1b1.size()); i++)
            r[i] -= a1b1[i];
        for (int i = 0; i < static_cast<int>(a2b2.size()); i++)
            r[i] -= a2b2[i];

        for (int i = 0; i < static_cast<int>(r.size()); i++)
            res[i + k] += r[i];
        for (int i = 0; i < static_cast<int>(a1b1.size()); i++)
            res[i] += a1b1[i];
        for (int i = 0; i < static_cast<int>(a2b2.size()); i++)
            res[i + n] += a2b2[i];
        return res;
    }

    bigint operator*(const bigint &v) const {
        // Work in base 10^6 so that limb products summed by Karatsuba fit in long long.
        std::vector<int> a6 = convert_base(this->a, base_digits, 6);
        std::vector<int> b6 = convert_base(v.a, base_digits, 6);
        vll a(a6.begin(), a6.end());
        vll b(b6.begin(), b6.end());
        // Pad both operands to the same power-of-two length.
        while (a.size() < b.size())
            a.push_back(0);
        while (b.size() < a.size())
            b.push_back(0);
        while (a.size() & (a.size() - 1))
            a.push_back(0), b.push_back(0);
        vll c = karatsubaMultiply(a, b);
        bigint res;
        res.sign = sign * v.sign;
        for (int i = 0, carry = 0; i < static_cast<int>(c.size()); i++) {
            long long cur = c[i] + carry;
            res.a.push_back(static_cast<int>(cur % 1000000));
            carry = static_cast<int>(cur / 1000000);
        }
        res.a = convert_base(res.a, 6, base_digits);
        res.trim();
        return res;
    }
};
//use : bigint var;
//template for biginit over
int main()
bigint var=10909000890789;
return 0;