Comparing two vector<bool> with SSE

Comparing two vector<bool> with SSE - c++

I have two vector<bool> A and B.
I want to compare them and count the number of elements that are equal:
For example:
A = {0,1,0,1}
B = {0,0,1,1}
Result will be equal to 2.
I can use _mm_cmpeq_epi8 but it is only compare 16 elements (i.e. I should convert 0 and 1 to char and then do the comparison).
Is it possible to compare 128 elements each time with SSE (or SIMD instructions)?

If you can either assume that vector<bool> is using contiguous byte-sized elements for storage, or if you can consider using something like vector<uint8_t> instead, then this example should give you a good starting point:
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size()); // vectors must be same size
const size_t n = vec1.size();
const size_t max_block_size = 255 * 16; // max block size before possible overflow
__m128i vcount = _mm_setzero_si128();
size_t i, count = 0;
for (i = 0; i + 16 <= n; ) // for each block
{
size_t m = std::min(n, i + max_block_size);
for ( ; i + 16 <= m; i += 16) // for each vector in block
{
__m128i v1 = _mm_loadu_si128((__m128i *)&vec1[i]);
__m128i v2 = _mm_loadu_si128((__m128i *)&vec2[i]);
__m128i vcmp = _mm_cmpeq_epi8(v1, v2);
vcount = _mm_sub_epi8(vcount, vcmp);
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
vcount = _mm_setzero_si128(); // update count from current block
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
for ( ; i < n; ++i) // deal with any remaining partial vector
{
count += (vec1[i] == vec2[i]);
}
return count;
}
Note that this is using vector<uint8_t>. If you really have to use vector<bool> and can guarantee that the elements will always be contiguous and byte-sized then you'll just need to coerce the vector<bool> into a const uint8_t * or similar somehow.
Test harness:
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <emmintrin.h> // SSE2
using std::vector;
static size_t count_equal_ref(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size());
const size_t n = vec1.size();
size_t i, count = 0;
for (i = 0 ; i < n; ++i)
{
count += (vec1[i] == vec2[i]);
}
return count;
}
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size()); // vectors must be same size
const size_t n = vec1.size();
const size_t max_block_size = 255 * 16; // max block size before possible overflow
__m128i vcount = _mm_setzero_si128();
size_t i, count = 0;
for (i = 0; i + 16 <= n; ) // for each block
{
size_t m = std::min(n, i + max_block_size);
for ( ; i + 16 <= m; i += 16) // for each vector in block
{
__m128i v1 = _mm_loadu_si128((__m128i *)&vec1[i]);
__m128i v2 = _mm_loadu_si128((__m128i *)&vec2[i]);
__m128i vcmp = _mm_cmpeq_epi8(v1, v2);
vcount = _mm_sub_epi8(vcount, vcmp);
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
vcount = _mm_setzero_si128(); // update count from current block
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
for ( ; i < n; ++i) // deal with any remaining partial vector
{
count += (vec1[i] == vec2[i]);
}
return count;
}
int main(int argc, char * argv[])
{
size_t n = 100;
if (argc > 1)
{
n = atoi(argv[1]);
}
vector<uint8_t> vec1(n);
vector<uint8_t> vec2(n);
srand((unsigned int)time(NULL));
for (size_t i = 0; i < n; ++i)
{
vec1[i] = rand() & 1;
vec2[i] = rand() & 1;
}
size_t n_ref = count_equal_ref(vec1, vec2);
size_t n_test = count_equal(vec1, vec2);
if (n_ref == n_test)
{
std::cout << "PASS" << std::endl;
}
else
{
std::cout << "FAIL: n_ref = " << n_ref << ", n_test = " << n_test << std::endl;
}
return 0;
}
Compile and run:
$ g++ -Wall -msse3 -O3 test.cpp && ./a.out
PASS

std::vector<bool> is a specialization of std::vector for the type bool. Although not specified by the C++ standard, in most implementations std::vector<bool> is made space efficient such that each of its element is a single bit instead of a bool.
The behaviour of std::vector<bool> is similar to its primarily template counterpart, except that:
std::vector<bool> does not necessarily store its element contiguously .
In order to expose its elements (i.e., the individual bits) std::vector<bool> uses a proxy class (i.e., std::vector<bool>::reference). Objects of class std::vector<bool>::reference are returned by std::vector<bool> subscript operator (i.e., operator[]) by value.
Accordingly, I don't think it's portable to use _mm_cmpeq_epi8 like functions since storage of a std::vector<bool> is implementation defined (i.e., not guaranteed contiguous).
An alternative but portable way is to use regular STL facilities like the example below:
std::vector<bool> A = {0,1,0,1};
std::vector<bool> B = {0,0,1,1};
std::vector<bool> C(A.size());
std::transform(A.begin(), A.end(), B.begin(), C.begin(), [](bool const &a, bool const &b) { return a == b;});
std::cout << std::count(C.begin(), C.end(), true) << std::endl;
Live Demo

Related

Find largest element in matrix and its column and row indexes using SSE and AVX

I need to find the largest element in 1d matrix and its column and row indexes.
I use 1d matrix, so just finding the max element's index is needed first and then it is easy to get row and column.
My problem is that I cannot get that index.
I have a working function that finds largest element and uses SSE, here it is:
float find_largest_element_in_matrix_SSE(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m128 max_el = _mm_loadu_ps(m);
__m128 curr;
for (i = 4; i < dims * dims; i += 4)
{
curr = _mm_loadu_ps(m + i);
max_el = _mm_max_ps(max_el, curr);
}
__declspec(align(16))float max_v[4] = { 0 };
_mm_store_ps(max_v, max_el);
return max(max(max(max_v[0], max_v[1]), max_v[2]), max_v[3]);
}
and also I have a non-working function that uses AVX:
float find_largest_element_in_matrix_AVX(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m256 max_el = _mm256_loadu_ps(m);
__m256 curr;
for (i = 8; i < dims * dims; i += 8)
{
curr = _mm256_loadu_ps(m + i);
max_el = _mm256_max_ps(max_el, curr);
}
__declspec(align(32))float max_v[8] = { 0 };
_mm256_store_ps(max_v, max_el);
__m256 y = _mm256_permute2f128_ps(max_el, max_el, 1);
__m256 m1 = _mm256_max_ps(max_el, y);m1[1] = max(max_el[1], max_el[3])
__m256 m2 = _mm256_permute_ps(m1, 5);
__m256 m_res = _mm256_max_ps(m1, m2);
return m[0];
}
Could anyone help me with actually finding the index of the max element and make my AVX version work?

Here's a working SSE (SSE 4) implementation that returns the max val and corresponding index, along with a scalar reference implementation and test harness:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h> // SSE 4.1
float find_largest_element_in_matrix_ref(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
int i;
*maxIndex = 0;
for (i = 1; i < dims * dims; ++i)
{
if (m[i] > maxVal)
{
maxVal = m[i];
*maxIndex = i;
}
}
return maxVal;
}
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
float aMaxVal[4];
int32_t aMaxIndex[4];
int i;
*maxIndex = 0;
const __m128i vIndexInc = _mm_set1_epi32(4);
__m128i vMaxIndex = _mm_setr_epi32(0, 1, 2, 3);
__m128i vIndex = vMaxIndex;
__m128 vMaxVal = _mm_loadu_ps(m);
for (i = 4; i < dims * dims; i += 4)
{
__m128 v = _mm_loadu_ps(&m[i]);
__m128 vcmp = _mm_cmpgt_ps(v, vMaxVal);
vIndex = _mm_add_epi32(vIndex, vIndexInc);
vMaxVal = _mm_max_ps(vMaxVal, v);
vMaxIndex = _mm_blendv_epi8(vMaxIndex, vIndex, _mm_castps_si128(vcmp));
}
_mm_storeu_ps(aMaxVal, vMaxVal);
_mm_storeu_si128((__m128i *)aMaxIndex, vMaxIndex);
maxVal = aMaxVal[0];
*maxIndex = aMaxIndex[0];
for (i = 1; i < 4; ++i)
{
if (aMaxVal[i] > maxVal)
{
maxVal = aMaxVal[i];
*maxIndex = aMaxIndex[i];
}
}
return maxVal;
}
int main()
{
const int dims = 1024;
float m[dims * dims];
float maxVal_ref, maxVal_SSE;
int maxIndex_ref = -1, maxIndex_SSE = -1;
int i;
srand(time(NULL));
for (i = 0; i < dims * dims; ++i)
{
m[i] = (float)rand() / RAND_MAX;
}
maxVal_ref = find_largest_element_in_matrix_ref(m, dims, &maxIndex_ref);
maxVal_SSE = find_largest_element_in_matrix_SSE(m, dims, &maxIndex_SSE);
if (maxVal_ref == maxVal_SSE && maxIndex_ref == maxIndex_SSE)
{
printf("PASS: maxVal = %f, maxIndex = %d\n",
maxVal_ref, maxIndex_ref);
}
else
{
printf("FAIL: maxVal_ref = %f, maxVal_SSE = %f, maxIndex_ref = %d, maxIndex_SSE = %d\n",
maxVal_ref, maxVal_SSE, maxIndex_ref, maxIndex_SSE);
}
return 0;
}
Compile and run:
$ gcc -Wall -msse4 Yakovenko.c && ./a.out
PASS: maxVal = 0.999999, maxIndex = 120409
Obviously you can get the row and column indices if needed:
int rowIndex = maxIndex / dims;
int colIndex = maxIndex % dims;
From here it should be fairly straightforward to write an AVX2 implementation.

One approach would be to calculate maximum in the first pass, and find the index by linear search in the second pass. Here is a sample implementation in SSE2:
#define anybit __builtin_ctz //or lookup table with 16 entries...
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex) {
//first pass: calculate maximum as usual
__m128 vMaxVal = _mm_loadu_ps(m);
for (int i = 4; i < dims * dims; i += 4)
vMaxVal = _mm_max_ps(vMaxVal, _mm_loadu_ps(&m[i]));
//perform in-register reduction
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
//second pass: search for maximal value
for (int i = 0; i < dims * dims; i += 4) {
__m128 vIsMax = _mm_cmpeq_ps(vMaxVal, _mm_loadu_ps(&m[i]));
if (int mask = _mm_movemask_ps(vIsMax)) {
*maxIndex = i + anybit(mask);
return _mm_cvtss_f32(vMaxVal);
}
}
}
Note that the branch in the second loop should be almost perfectly predicted unless your input data is very small.
The solution suffers from several problems, notably:
It may work incorrectly in presence of weird floating point values, e.g. with NaNs.
If your matrix does not fit into CPU cache, then the code would read the matrix twice from the main memory, so it would be two times slower than the single-pass approach. This can be solved for large matrices by block-wise processing.
In the first loop each iteration depends on the previous one (vMaxVal is both modified and read) so it would be slowed down by latency of _mm_max_ps. Perhaps it would be great to unroll the first loop a bit (2x or 4x), while having 4 independent registers for vMaxVal (actually, the second loop would also benefit from unrolling).
Porting to AVX should be pretty straight-forward, except for the in-register reduction:
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_permute2f128_ps(vMaxVal, vMaxVal, 1));

yet another approach:
void find_largest_element_in_matrix_SSE(float * matrix, size_t n, int * row, int * column, float * v){
__m128 indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 update = _mm_setr_ps(4, 4, 4, 4);
__m128 max_indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 max = _mm_load_ps(matrix);
for (int i = 4; i < n * n; i+=4){
indecies = _mm_add_ps(indecies, update);
__m128 pm2 = _mm_load_ps(&matrix[i]);
__m128 mask = _mm_cmpge_ps(max, pm2);
max = _mm_max_ps(max, pm2);
max_indecies = _mm_or_ps(_mm_and_ps(max_indecies, mask), _mm_andnot_ps(mask, indecies));
}
__declspec (align(16)) int max_ind[4];
__m128i maxi = _mm_cvtps_epi32(max_indecies);
_mm_store_si128((__m128i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 4; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
void find_largest_element_in_matrix_AVX(float * matrix, size_t n, int * row, int * column, float * v){
__m256 indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 update = _mm256_setr_ps(8, 8, 8, 8, 8, 8, 8, 8);
__m256 max_indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 max = _mm256_load_ps(matrix);
for (int i = 8; i < n * n; i += 8){
indecies = _mm256_add_ps(indecies, update);
__m256 pm2 = _mm256_load_ps(&matrix[i]);
__m256 mask = _mm256_cmp_ps(max, pm2, _CMP_GE_OQ);
max = _mm256_max_ps(max, pm2);
max_indecies = _mm256_or_ps(_mm256_and_ps(max_indecies, mask), _mm256_andnot_ps(mask, indecies));
}
__declspec (align(32)) int max_ind[8];
__m256i maxi = _mm256_cvtps_epi32(max_indecies);
_mm256_store_si256((__m256i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 8; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}

Using intrinsics to find next non-zero in an array

I have an int array[10000] and I want to iterate from a certain position to find the next non-zero index. Currently I use a basic while loop:
while(array[i] == 0){
pos++;
}
etc
I know with intrinsics I could test 4 integers for zero at a time, but is there a way to return something indicating the vector index of the "first" non-zero?

It's fairly simple to do this, but throughput improvement may not be great, since you will probably be limited by memory bandwidth (unless your array is already cached):
int index = -1;
for (i = 0; i < n; i += 4)
{
__m128i v = _mm_load_si128(&A[i]);
__m128i vcmp = _mm_cmpeq_epi32(v, _mm_setzero_si128());
int mask = _mm_movemask_epi8(vcmp);
if (mask != 0xffff)
{
break;
}
}
if (i < n)
{
for (j = i; j < i + 4; ++j)
{
if (A[j] != 0)
{
index = j;
break;
}
}
}
This assumes that the array A is 16 byte aligned, its size, n, is a multiple of 4, and that the ints are 32 bits.
Loop unrolling by a factor of 2 may help, particularly if your input data is large and/or sparse, e.g.
int index = -1;
for (i = 0; i < n; i += 8)
{
__m128i v0 = _mm_load_si128(&A[i]);
__m128i v1 = _mm_load_si128(&A[i + 4]);
__m128i vcmp0 = _mm_cmpeq_epi32(v0, _mm_setzero_si128());
__m128i vcmp1 = _mm_cmpeq_epi32(v1, _mm_setzero_si128());
int mask0 = _mm_movemask_epi8(vcmp0);
int mask1 = _mm_movemask_epi8(vcmp1);
if ((mask0 | mask1) != 0xffff)
{
break;
}
}
if (i < n)
{
for (j = i; j < i + 8; ++j)
{
if (A[j] != 0)
{
index = j;
break;
}
}
}
If you have AVX2 (Haswell and later) then you can process 8 ints at a time rather than 4.

Implementing a lookup array for an array of N integers of limited range

I have an arr[N], and need to implement a lookup array for all possible value sets arr can take, e.g. 2^N possible values for the simplest case where the array is bool arr[N].
This can be done by defining an N-dimensional boolean lookup array. For instance, for N=4 and arr being boolean, it would be bool lookup[2][2][2][2]. lookup can then store and retrieve any possible values of arr by lookup[arr[0]][arr[1]][arr[2]][arr[3]].
This is awkward to write and perhaps performance inefficient as well, because N varies, so the actual implementation will have to use a for loop for storage and retrieval. This is a problem since lookups are a very common operation, and making them as fast as possible is the whole point of this exercise.
Are there any other ways to implement this idea? I would be interested in a solution for a boolean arr, perhaps using some kind of bit representation, as well as a more general solution where the range of values in arr is wider than just 2.

For the case of boolean values, you should indeed use bits: Any N that is tractable must fit N^2 entries into memory, which is in the order of 2^32 through 2^38 on modern machines, so you cannot reach N = 64 anyway.
That said, you can use the least significant bits for each array entry, and simply allocate a store of 2^N value. The bit representations of your array can then simply serve as indices into this store.
Something like this:
uint64_t compressArray(long length, bool* array) {
uint64_t result = 0;
for(long i = length; i--; ) result = (result << 1) | (array[i] ? 1 : 0);
return result;
}
...
int* store = malloc(sizeof(*store) * (1 << N));
bool* array = ...;
// Now you can access the ints in store like this:
store[compressArray(N, array)] = 3;

If I understood you correctly...
For boolean type it is enough if you use index of element in one-dimensional array to represent your bits.
For example for arr[4]
ar[0] = 0, ar[1] = 0, ar[2] = 0, ar[3] = 0 0000
ar[0] = 1, ar[1] = 0, ar[2] = 0, ar[3] = 0 0001
ar[0] = 0, ar[1] = 1, ar[2] = 0, ar[3] = 0 0010
ar[0] = 1, ar[1] = 1, ar[2] = 0, ar[3] = 0 0011
etc...
For arr containing values from 1 to 3, for instance, you can utilize two bits per one value.

Following may help:
So, from your example ValueRange = 2 as bool can take 2 values;
Size = N
#include <cassert>
#include <cstddef>
#include <vector>
template<typename T, int ValueRange, int Size>
class MultiArray
{
static_assert(ValueRange > 1, "Need at least 2 or more values");
static_assert(Size > 0, "Size should be strictly positive");
public:
MultiArray() : values(computeTotalSize())
{
assert(!values.empty());
}
const T& get(const std::vector<size_t>& indexes) const
{
return values[computeIndex(indexes)];
}
T& get(const std::vector<size_t>& indexes)
{
return values[computeIndex(indexes)];
}
size_t computeIndex(const std::vector<size_t>& indexes) const
{
assert(indexes.size() == Size);
size_t index = 0;
size_t mul = 1;
for (size_t i = 0; i != Size; ++i) {
assert(indexes[i] < ValueRange);
index += indexes[i] * mul;
mul *= ValueRange;
}
assert(index < values.size());
return index;
}
std::vector<size_t> computeIndexes(size_t index) const
{
assert(index < values.size());
std::vector<size_t> res(Size);
size_t mul = values.size();
for (size_t i = Size; i != 0; --i) {
mul /= ValueRange;
res[i - 1] = index / mul;
assert(res[i - 1] < ValueRange);
index -= res[i - 1] * mul;
}
return res;
}
private:
size_t computeTotalSize() const
{
size_t totalSize = 1;
for (int i = 0; i != Size; ++i) {
totalSize *= ValueRange;
}
return totalSize;
}
private:
std::vector<T> values;
};
So use it like this:
int main()
{
MultiArray<int, 2, 4> m;
m.get({0, 0, 1, 0}) = 42;
m.get({1, 1, 0, 0}) = 42;
// Just for test purpose:
for (size_t i = 0; i != 16; ++i) {
assert(m.computeIndex(m.computeIndexes(i)) == i);
}
return 0;
}

How should I improve the performance of this C++ code?

The following code operates on two std::vectors v1 and v2, each containing multiple 128-element vectors. Loops through the outer vectors (using i1 and i2) contain an inner loop, designed to limit the combinations of i1 and i2 for which further complex processing is performed. Around 99.9% of the combinations are filtered out.
Unfortunately the filtering loop is a major bottleneck in my program - profiling shows that 26% of the entire run time is spent on the line if(a[k] + b[k] > LIMIT).
const vector<vector<uint16_t>> & v1 = ...
const vector<vector<uint16_t>> & v2 = ...
for(size_t i1 = 0; i1 < v1.size(); ++i1) { //v1.size() and v2.size() about 20000
for(size_t i2 = 0; i2 < v2.size(); ++i2) {
const vector<uint16_t> & a = v1[i1];
const vector<uint16_t> & b = v2[i2];
bool good = true;
for(std::size_t k = 0; k < 128; ++k) {
if(a[k] + b[k] > LIMIT) { //LIMIT is a const uint16_t: approx 16000
good = false;
break;
}
}
if(!good) continue;
// Further processing involving i1 and i2
}
}
I think the performance of this code could be improved by increasing memory locality, plus perhaps vectorizing. Any suggestions on how to do this, or on other improvements that could be made?

You could apply SIMD to the inner loop:
bool good = true;
for(std::size_t k = 0; k < 128; ++k) {
if(a[k] + b[k] > LIMIT) { //LIMIT is a const uint16_t: approx 16000
good = false;
break;
}
as follows:
#include <emmintrin.h> // SSE2 intrinsics
#include <limits.h> // SHRT_MIN
// ...
// some useful constants - declare these somewhere before the outermost loop
const __m128i vLIMIT = _mm_set1_epi16(LIMIT + SHRT_MIN); // signed version of LIMIT
const __m128i vOFFSET = _mm_set1_epi16(SHRT_MIN); // offset for uint16_t -> int16_t conversion
// ...
bool good = true;
for(std::size_t k = 0; k < 128; k += 8) {
__m128i v, va, vb; // iterate through a, b, 8 elements at a time
int mask;
va = _mm_loadu_si128(&a[k]); // get 8 elements from a[k], b[k]
vb = _mm_loadu_si128(&b[k]);
v = _mm_add_epi16(va, vb); // add a and b vectors
v = _mm_add_epi16(v, vOFFSET); // subtract 32768 to make signed
v = _mm_cmpgt_epi16(v, vLIMIT); // compare against LIMIT
mask = _mm_maskmove_epi8(v); // get comparison results as 16 bit mask
if (mask != 0) { // if any value exceeded limit
good = false; // clear good flag and exit loop
break;
}
Warning: untested code - may need debugging, but the general approach should be sound.

You've got the most efficient access pattern for v1, but you are sequentially scanning through all of v2 for each iteration of the outer loop. This is very inefficient, because v2 access will continually cause (L2 and probably also L3) cache misses.
A better access pattern is to increase the loop nesting, so that outer loops stride through v1 and v2, and inner loops process elements within a subsegment of both v1 and v2 that's small enough to fit in cache.
Basically, instead of
for(size_t i1 = 0; i1 < v1.size(); ++i1) { //v1.size() and v2.size() about 20000
for(size_t i2 = 0; i2 < v2.size(); ++i2) {
Do
for(size_t i2a = 0; i2a < v2.size(); i2a += 32) {
for(size_t i1 = 0; i1 < v1.size(); ++i1) {
for(size_t i2 = i2a; i2 < v2.size() && i2 < i2a + 32; ++i2) {
Or
size_t i2a = 0;
// handle complete blocks
for(; i2a < v2.size() - 31; i2a += 32) {
for(size_t i1 = 0; i1 < v1.size(); ++i1) {
for(size_t i2 = i2a; i2 < i2a + 32; ++i2) {
}
}
}
// handle leftover partial block
for(size_t i1 = 0; i1 < v1.size(); ++i1) {
for(size_t i2 = i2a; i2 < v2.size(); ++i2) {
}
}
This way, a chunk of 32 * 128 * sizeof (uint16_t) bytes, or 8kB, will be loaded from v2 into cache, and then reused 20,000 times.
This improvement is orthogonal to SIMD (SSE) vectorization. It will interact with thread-based parallelism, but probably in a good way.

First, One simple optimization can be this, but the compiler could do this by itself so i'm not sure how much it could improve:
for(std::size_t k = 0; k < 128 && good; ++k)
{
good = a[k] + b[k] <= LIMIT;
}
Second, i think it could better to keep the good result in a second vector because any
processing involved with i1 and i2 could break the CPU cache.
Third, and this could be major optimization, i think you can rewrite the second for loop as this:
for(size_t i2 = i1; i2 < v2.size(); ++i2) since you are using + operations for a and b vectors which is commutative so the result of i1 and i2 will be the same as i2 and i1.
For this you need to have the v1 and v2 the same size. If the size is different you need to write the iteration in a different way.
Fort, As far as i can see you are processing two matrices, it would be better to keep a vector of elements rather than a vector of vectors.
Hope this helps.
Razvan.

A few suggestions:
As suggested in the comments, replace the inner 128 element vector with an array for better memory locality.
This code looks highly parallelizable, have you tried that? You could split the combinations for filtering across all available cores and then rebalance the collected work and split the processing across all cores.
I implemented a version using arrays for the inner 128 elements, PPL for parallelization (requires VS 2012 or higher) and a bit of SSE code for the filtering and got a pretty significant speedup. Depending on what exactly the 'further processing' involves there may be benefits to structuring things slightly differently (in this example I don't rebalance the work after filtering for example).
Update: I implemented the cache blocking suggested by Ben Voigt and got a bit more of a speed up.
#include <vector>
#include <array>
#include <random>
#include <limits>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <chrono>
#include <iterator>
#include <ppl.h>
#include <immintrin.h>
using namespace std;
using namespace concurrency;
namespace {
const int outerVecSize = 20000;
const int innerVecSize = 128;
const int LIMIT = 16000;
auto engine = default_random_engine();
};
typedef vector<uint16_t> InnerVec;
typedef array<uint16_t, innerVecSize> InnerArr;
template <typename Cont> void randomFill(Cont& c) {
// We want approx 0.1% to pass filter, the mean and standard deviation are chosen to get close to that
static auto dist = normal_distribution<>(LIMIT / 4.0, LIMIT / 4.6);
generate(begin(c), end(c), [] {
auto clamp = [](double x, double minimum, double maximum) { return min(max(minimum, x), maximum); };
return static_cast<uint16_t>(clamp(dist(engine), 0.0, numeric_limits<uint16_t>::max()));
});
}
void resizeInner(InnerVec& v) { v.resize(innerVecSize); }
void resizeInner(InnerArr& a) {}
template <typename Inner> Inner generateRandomInner() {
auto inner = Inner();
resizeInner(inner);
randomFill(inner);
return inner;
}
template <typename Inner> vector<Inner> generateRandomInput() {
auto outer = vector<Inner>(outerVecSize);
generate(begin(outer), end(outer), generateRandomInner<Inner>);
return outer;
}
void Report(const chrono::high_resolution_clock::duration elapsed, size_t in1Size, size_t in2Size,
const int passedFilter, const uint32_t specialValue) {
cout << passedFilter << "/" << in1Size* in2Size << " ("
<< 100.0 * (double(passedFilter) / double(in1Size * in2Size)) << "%) passed filter\n";
cout << specialValue << "\n";
cout << "Elapsed time = " << chrono::duration_cast<chrono::milliseconds>(elapsed).count() << "ms" << endl;
}
void TestOriginalVersion() {
cout << __FUNCTION__ << endl;
engine.seed();
const auto v1 = generateRandomInput<InnerVec>();
const auto v2 = generateRandomInput<InnerVec>();
int passedFilter = 0;
uint32_t specialValue = 0;
auto startTime = chrono::high_resolution_clock::now();
for (size_t i1 = 0; i1 < v1.size(); ++i1) { // v1.size() and v2.size() about 20000
for (size_t i2 = 0; i2 < v2.size(); ++i2) {
const vector<uint16_t>& a = v1[i1];
const vector<uint16_t>& b = v2[i2];
bool good = true;
for (std::size_t k = 0; k < 128; ++k) {
if (static_cast<int>(a[k]) + static_cast<int>(b[k])
> LIMIT) { // LIMIT is a const uint16_t: approx 16000
good = false;
break;
}
}
if (!good) continue;
// Further processing involving i1 and i2
++passedFilter;
specialValue += inner_product(begin(a), end(a), begin(b), 0);
}
}
auto endTime = chrono::high_resolution_clock::now();
Report(endTime - startTime, v1.size(), v2.size(), passedFilter, specialValue);
}
bool needsProcessing(const InnerArr& a, const InnerArr& b) {
static_assert(sizeof(a) == sizeof(b) && (sizeof(a) % 16) == 0, "Array size must be multiple of 16 bytes.");
static const __m128i mmLimit = _mm_set1_epi16(LIMIT);
static const __m128i mmLimitPlus1 = _mm_set1_epi16(LIMIT + 1);
static const __m128i mmOnes = _mm_set1_epi16(-1);
auto to_m128i = [](const uint16_t* p) { return reinterpret_cast<const __m128i*>(p); };
return equal(to_m128i(a.data()), to_m128i(a.data() + a.size()), to_m128i(b.data()), [&](const __m128i& a, const __m128i& b) {
// avoid overflow due to signed compare by clamping sum to LIMIT + 1
const __m128i clampSum = _mm_min_epu16(_mm_adds_epu16(a, b), mmLimitPlus1);
return _mm_test_all_zeros(_mm_cmpgt_epi16(clampSum, mmLimit), mmOnes);
});
}
void TestArrayParallelVersion() {
cout << __FUNCTION__ << endl;
engine.seed();
const auto v1 = generateRandomInput<InnerArr>();
const auto v2 = generateRandomInput<InnerArr>();
combinable<int> passedFilterCombinable;
combinable<uint32_t> specialValueCombinable;
auto startTime = chrono::high_resolution_clock::now();
const size_t blockSize = 64;
parallel_for(0u, v1.size(), blockSize, [&](size_t i) {
for (const auto& b : v2) {
const auto blockBegin = begin(v1) + i;
const auto blockEnd = begin(v1) + min(v1.size(), i + blockSize);
for (auto it = blockBegin; it != blockEnd; ++it) {
const InnerArr& a = *it;
if (!needsProcessing(a, b))
continue;
// Further processing involving a and b
++passedFilterCombinable.local();
specialValueCombinable.local() += inner_product(begin(a), end(a), begin(b), 0);
}
}
});
auto passedFilter = passedFilterCombinable.combine(plus<int>());
auto specialValue = specialValueCombinable.combine(plus<uint32_t>());
auto endTime = chrono::high_resolution_clock::now();
Report(endTime - startTime, v1.size(), v2.size(), passedFilter, specialValue);
}
int main() {
TestOriginalVersion();
TestArrayParallelVersion();
}
On my 8 core system I see a pretty good speedup, your results will vary depending on how many cores you have etc.
TestOriginalVersion
441579/400000000 (0.110395%) passed filter
2447300015
Elapsed time = 12525ms
TestArrayParallelVersion
441579/400000000 (0.110395%) passed filter
2447300015
Elapsed time = 657ms

Storing a Big Number in a Variable and Looping

How can i store a big number in a variable and use a for loop?
I have a very big number 75472202764752234070123900087933251 and i need to loop from 0 to this number!
Is it even possible to do this? how much time will it take to end?
EDIT: i am trying to solve a hard problem by brute force. its a combination problem.the bruteforcing cases may reach 470C450.
so i guess i should use a different algorithm...

This might take
0.23 x 10^23 years if C++ processed 100,000 loops per second :|
http://www.wolframalpha.com/input/?i=75472202764752234070123900087933251%2F%28100000*1*3600*24*365%29

It looks that this number fits into 128 bit. So you could use a modern system and a modern compiler that implements such numbers. This would e.g be the case for a 64bit linux system with gcc as a compiler. This has something like __uint128_t that you could use.
Obviously you can't use such a variable as a for-loop variable, others have give you the calculations. But you could use it to store some of your calculations.

Well, you would need an implementation that can handle at least a subset of the initialization, boolean, and arithmetic functions on very large integers. Something like: https://mattmccutchen.net/bigint/.
For something that would give a bit better performance than a general large integer math library, you could use specialized operations specifically to allow use of a large integer as a counter. For an example of this, see dewtell's updated answer to this question.
As for it being possible for you to loop from 0 to that number: well, yes, it is possible to write the code for it with one of the above solutions, but I think the answer is no, you personally will not be able to do it because you will not be alive to see it finish.
[edit: Yes, I would definitely recommend you find a different algorithm. :D]

If you need to loop a certain number of times, and that number is greater than 2^64, just use while(1) because your computer will break before it counts up to 2^64 anyway.

There's no need for a complete bignum package - if all you need is a loop counter, here's a simple byte counter that uses an array of bytes as a counter. It stops when the byte array wraps around to all zeros again. If you wanted to count to some other value than 2^(bytesUsed*CHAR_BITS), you could just compute the two's complement value of the negative of the number of iterations you wanted, and let it count up to 0, keeping in mind that bytes[0] is the low-order byte (or use the positive value and count down instead of up).
#include <stdio.h>
#define MAXBYTES 20
/* Simple byte counter - note it uses argc as # of bytes to use for convenience */
int main(int argc, char **argv) {
unsigned char bytes[MAXBYTES];
const int bytesUsed = argc < MAXBYTES? argc : MAXBYTES;
int i;
unsigned long counter = (unsigned long)-1; /* to give loop something to do */
for (i = 0; i < bytesUsed; i++) bytes[i] = 0; /* Initialize bytes */
do {
for (i = 0; i < bytesUsed && !++bytes[i]; i++) ; /* NULL BODY - this is the byte counter */
counter++;
} while (i < bytesUsed);
printf("With %d bytes used, final counter value = %lu\n", bytesUsed, counter);
}
Run times for the first 4 values (under Cygwin, on a Lenovo T61):
$ time ./bytecounter
With 1 bytes used, final counter value = 255
real 0m0.078s
user 0m0.031s
sys 0m0.046s
$ time ./bytecounter a
With 2 bytes used, final counter value = 65535
real 0m0.063s
user 0m0.031s
sys 0m0.031s
$ time ./bytecounter a a
With 3 bytes used, final counter value = 16777215
real 0m0.125s
user 0m0.015s
sys 0m0.046s
$ time ./bytecounter a a a
With 4 bytes used, final counter value = 4294967295
real 0m6.578s
user 0m0.015s
sys 0m0.047s
At this rate, five bytes should take around half an hour, and six bytes should take the better part of a week. Of course the counter value will be inaccurate for those - it's mostly just there to verify the number of iterations for the smaller byte values and give the loop something to do.
Edit: And here's the time for five bytes, around half an hour as I predicted:
$ time ./bytecounter a a a a
With 5 bytes used, final counter value = 4294967295
real 27m22.184s
user 0m0.015s
sys 0m0.062s

Ok, here's code to take an arbitrary decimal number passed as the first arg and count down from it to zero. I set it up to allow the counter to use different size elements (just change the typedef for COUNTER_BASE), but it turns out that bytes are actually somewhat faster than either short or long on my system.
#include <stdio.h>
#include <limits.h> // defines CHAR_BIT
#include <ctype.h>
#include <vector>
using std::vector;
typedef unsigned char COUNTER_BASE;
typedef vector<COUNTER_BASE> COUNTER;
typedef vector<unsigned char> BYTEVEC;
const unsigned long byteMask = (~0ul) << CHAR_BIT;
const size_t MAXBYTES=20;
void mult10(BYTEVEC &val) {
// Multiply value by 10
unsigned int carry = 0;
int i;
for (i = 0; i < val.size(); i++) {
unsigned long value = val[i]*10ul+carry;
carry = (value & byteMask) >> CHAR_BIT;
val[i] = value & ~byteMask;
}
if (carry > 0) val.push_back(carry);
}
void addDigit(BYTEVEC &val, const char digit) {
// Add digit to the number in BYTEVEC.
unsigned int carry = digit - '0'; // Assumes ASCII char set
int i;
for (i = 0; i < val.size() && carry; i++) {
unsigned long value = static_cast<unsigned long>(val[i])+carry;
carry = (value & byteMask) >> CHAR_BIT;
val[i] = value & ~byteMask;
}
if (carry > 0) val.push_back(carry);
}
BYTEVEC Cstr2Bytevec(const char *str) {
// Turn a C-style string into a BYTEVEC. Only the digits in str apply,
// so that one can use commas, underscores, or other non-digits to separate
// digit groups.
BYTEVEC result;
result.reserve(MAXBYTES);
result[0]=0;
unsigned char *res=&result[0]; // For debugging
while (*str) {
if (isdigit(static_cast<int>(*str))) {
mult10(result);
addDigit(result, *str);
}
str++;
}
return result;
}
void packCounter(COUNTER &ctr, const BYTEVEC &val) {
// Pack the bytes from val into the (possibly larger) datatype of COUNTER
int i;
ctr.erase(ctr.begin(), ctr.end());
COUNTER_BASE value = 0;
for (i = 0; i < val.size(); i++) {
int pos = i%sizeof(COUNTER_BASE); // position of this byte in the value
if (i > 0 && pos == 0) {
ctr.push_back(value);
value = val[i];
} else {
value |= static_cast<COUNTER_BASE>(val[i]) << pos*CHAR_BIT;
}
}
ctr.push_back(value);
}
inline bool decrementAndTest(COUNTER &ctr) {
// decrement value in ctr and return true if old value was not all zeros
int i;
for (i = 0; i < ctr.size() && !(ctr[i]--); i++) ; // EMPTY BODY
return i < ctr.size();
}
inline bool decrementAndTest2(COUNTER_BASE *ctr, const size_t size) {
// decrement value in ctr and return true if old value was not all zeros
int i;
for (i = 0; i < size && !(ctr[i]--); i++) ; // EMPTY BODY
return i < size;
}
/* Vector counter - uses first arg (if supplied) as the count */
int main(int argc, const char *argv[]) {
BYTEVEC limit = Cstr2Bytevec(argc > 1? argv[1] : "0");
COUNTER ctr;
packCounter(ctr, limit);
COUNTER_BASE *ctr_vals = ctr.size() > 0 ? &ctr[0] : NULL;
size_t ctr_size = ctr.size();
unsigned long ul_counter = 0ul; /* to give loop something to do */
while(decrementAndTest2(ctr_vals, ctr_size)) {
ul_counter++;
};
printf("With %d bytes used, final ul_counter value = %lu\n", limit.size(), ul_counter);
return 0;
}
Examples of use:
$ time ./bigcounter 5
With 1 bytes used, final ul_counter value = 5
real 0m0.094s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 5,000
With 2 bytes used, final ul_counter value = 5000
real 0m0.062s
user 0m0.015s
sys 0m0.062s
$ time ./bigcounter 5,000,000
With 3 bytes used, final ul_counter value = 5000000
real 0m0.093s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 1,000,000,000
With 4 bytes used, final ul_counter value = 1000000000
real 0m2.688s
user 0m0.015s
sys 0m0.015s
$ time ./bigcounter 2,000,000,000
With 4 bytes used, final ul_counter value = 2000000000
real 0m5.125s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 3,000,000,000
With 4 bytes used, final ul_counter value = 3000000000
real 0m7.485s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 4,000,000,000
With 4 bytes used, final ul_counter value = 4000000000
real 0m9.875s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 5,000,000,000
With 5 bytes used, final ul_counter value = 705032704
real 0m12.594s
user 0m0.046s
sys 0m0.015s
$ time ./bigcounter 6,000,000,000
With 5 bytes used, final ul_counter value = 1705032704
real 0m14.813s
user 0m0.015s
sys 0m0.062s
Unwrapping the counter vector into C-style data structures (i.e., using decrementAndTest2 instead of decrementAndTest) sped things up by around 20-25%, but the code is still about twice as slow as my previous C program for similar-sized examples (around 4 billion). This is with MS Visual C++ 6.0 as the compiler in release mode, optimizing for speed, on a 2GHz dual-core system, for both programs. Inlining the decrementAndTest2 function definitely makes a big difference (around 12 sec. vs. 30 for the 5 billion loop), but I'll have to see whether physically inlining the code as I did in the C program can get similar performance.

the variable in main function can Store even 100 factorial
#include <iostream>
#include <cstdio>
#include <vector>
#include <cstring>
#include <string>
#include <map>
#include <functional>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <stack>
#include <queue>
#include <deque>
#include <limits>
#include <cmath>
#include <numeric>
#include <set>
using namespace std;
//template for BIGINIT
// base and base_digits must be consistent
const int base = 10;
const int base_digits = 1;
struct bigint {
vector<int> a;
int sign;
bigint() :
sign(1) {
}
bigint(long long v) {
*this = v;
}
bigint(const string &s) {
read(s);
}
void operator=(const bigint &v) {
sign = v.sign;
a = v.a;
}
void operator=(long long v) {
sign = 1;
if (v < 0)
sign = -1, v = -v;
for (; v > 0; v = v / base)
a.push_back(v % base);
}
bigint operator+(const bigint &v) const {
if (sign == v.sign) {
bigint res = v;
for (int i = 0, carry = 0; i < (int) max(a.size(), v.a.size()) || carry; ++i) {
if (i == (int) res.a.size())
res.a.push_back(0);
res.a[i] += carry + (i < (int) a.size() ? a[i] : 0);
carry = res.a[i] >= base;
if (carry)
res.a[i] -= base;
}
return res;
}
return *this - (-v);
}
bigint operator-(const bigint &v) const {
if (sign == v.sign) {
if (abs() >= v.abs()) {
bigint res = *this;
for (int i = 0, carry = 0; i < (int) v.a.size() || carry; ++i) {
res.a[i] -= carry + (i < (int) v.a.size() ? v.a[i] : 0);
carry = res.a[i] < 0;
if (carry)
res.a[i] += base;
}
res.trim();
return res;
}
return -(v - *this);
}
return *this + (-v);
}
void operator*=(int v) {
if (v < 0)
sign = -sign, v = -v;
for (int i = 0, carry = 0; i < (int) a.size() || carry; ++i) {
if (i == (int) a.size())
a.push_back(0);
long long cur = a[i] * (long long) v + carry;
carry = (int) (cur / base);
a[i] = (int) (cur % base);
//asm("divl %%ecx" : "=a"(carry), "=d"(a[i]) : "A"(cur), "c"(base));
}
trim();
}
bigint operator*(int v) const {
bigint res = *this;
res *= v;
return res;
}
friend pair<bigint, bigint> divmod(const bigint &a1, const bigint &b1) {
int norm = base / (b1.a.back() + 1);
bigint a = a1.abs() * norm;
bigint b = b1.abs() * norm;
bigint q, r;
q.a.resize(a.a.size());
for (int i = a.a.size() - 1; i >= 0; i--) {
r *= base;
r += a.a[i];
int s1 = r.a.size() <= b.a.size() ? 0 : r.a[b.a.size()];
int s2 = r.a.size() <= b.a.size() - 1 ? 0 : r.a[b.a.size() - 1];
int d = ((long long) base * s1 + s2) / b.a.back();
r -= b * d;
while (r < 0)
r += b, --d;
q.a[i] = d;
}
q.sign = a1.sign * b1.sign;
r.sign = a1.sign;
q.trim();
r.trim();
return make_pair(q, r / norm);
}
bigint operator/(const bigint &v) const {
return divmod(*this, v).first;
}
bigint operator%(const bigint &v) const {
return divmod(*this, v).second;
}
void operator/=(int v) {
if (v < 0)
sign = -sign, v = -v;
for (int i = (int) a.size() - 1, rem = 0; i >= 0; --i) {
long long cur = a[i] + rem * (long long) base;
a[i] = (int) (cur / v);
rem = (int) (cur % v);
}
trim();
}
bigint operator/(int v) const {
bigint res = *this;
res /= v;
return res;
}
int operator%(int v) const {
if (v < 0)
v = -v;
int m = 0;
for (int i = a.size() - 1; i >= 0; --i)
m = (a[i] + m * (long long) base) % v;
return m * sign;
}
void operator+=(const bigint &v) {
*this = *this + v;
}
void operator-=(const bigint &v) {
*this = *this - v;
}
void operator*=(const bigint &v) {
*this = *this * v;
}
void operator/=(const bigint &v) {
*this = *this / v;
}
bool operator<(const bigint &v) const {
if (sign != v.sign)
return sign < v.sign;
if (a.size() != v.a.size())
return a.size() * sign < v.a.size() * v.sign;
for (int i = a.size() - 1; i >= 0; i--)
if (a[i] != v.a[i])
return a[i] * sign < v.a[i] * sign;
return false;
}
bool operator>(const bigint &v) const {
return v < *this;
}
bool operator<=(const bigint &v) const {
return !(v < *this);
}
bool operator>=(const bigint &v) const {
return !(*this < v);
}
bool operator==(const bigint &v) const {
return !(*this < v) && !(v < *this);
}
bool operator!=(const bigint &v) const {
return *this < v || v < *this;
}
void trim() {
while (!a.empty() && !a.back())
a.pop_back();
if (a.empty())
sign = 1;
}
bool isZero() const {
return a.empty() || (a.size() == 1 && !a[0]);
}
bigint operator-() const {
bigint res = *this;
res.sign = -sign;
return res;
}
bigint abs() const {
bigint res = *this;
res.sign *= res.sign;
return res;
}
long long longValue() const {
long long res = 0;
for (int i = a.size() - 1; i >= 0; i--)
res = res * base + a[i];
return res * sign;
}
friend bigint gcd(const bigint &a, const bigint &b) {
return b.isZero() ? a : gcd(b, a % b);
}
friend bigint lcm(const bigint &a, const bigint &b) {
return a / gcd(a, b) * b;
}
void read(const string &s) {
sign = 1;
a.clear();
int pos = 0;
while (pos < (int) s.size() && (s[pos] == '-' || s[pos] == '+')) {
if (s[pos] == '-')
sign = -sign;
++pos;
}
for (int i = s.size() - 1; i >= pos; i -= base_digits) {
int x = 0;
for (int j = max(pos, i - base_digits + 1); j <= i; j++)
x = x * 10 + s[j] - '0';
a.push_back(x);
}
trim();
}
friend istream& operator>>(istream &stream, bigint &v) {
string s;
stream >> s;
v.read(s);
return stream;
}
friend ostream& operator<<(ostream &stream, const bigint &v) {
if (v.sign == -1)
stream << '-';
stream << (v.a.empty() ? 0 : v.a.back());
for (int i = (int) v.a.size() - 2; i >= 0; --i)
stream << setw(base_digits) << setfill('0') << v.a[i];
return stream;
}
static vector<int> convert_base(const vector<int> &a, int old_digits, int new_digits) {
vector<long long> p(max(old_digits, new_digits) + 1);
p[0] = 1;
for (int i = 1; i < (int) p.size(); i++)
p[i] = p[i - 1] * 10;
vector<int> res;
long long cur = 0;
int cur_digits = 0;
for (int i = 0; i < (int) a.size(); i++) {
cur += a[i] * p[cur_digits];
cur_digits += old_digits;
while (cur_digits >= new_digits) {
res.push_back(int(cur % p[new_digits]));
cur /= p[new_digits];
cur_digits -= new_digits;
}
}
res.push_back((int) cur);
while (!res.empty() && !res.back())
res.pop_back();
return res;
}
typedef vector<long long> vll;
static vll karatsubaMultiply(const vll &a, const vll &b) {
int n = a.size();
vll res(n + n);
if (n <= 32) {
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
res[i + j] += a[i] * b[j];
return res;
}
int k = n >> 1;
vll a1(a.begin(), a.begin() + k);
vll a2(a.begin() + k, a.end());
vll b1(b.begin(), b.begin() + k);
vll b2(b.begin() + k, b.end());
vll a1b1 = karatsubaMultiply(a1, b1);
vll a2b2 = karatsubaMultiply(a2, b2);
for (int i = 0; i < k; i++)
a2[i] += a1[i];
for (int i = 0; i < k; i++)
b2[i] += b1[i];
vll r = karatsubaMultiply(a2, b2);
for (int i = 0; i < (int) a1b1.size(); i++)
r[i] -= a1b1[i];
for (int i = 0; i < (int) a2b2.size(); i++)
r[i] -= a2b2[i];
for (int i = 0; i < (int) r.size(); i++)
res[i + k] += r[i];
for (int i = 0; i < (int) a1b1.size(); i++)
res[i] += a1b1[i];
for (int i = 0; i < (int) a2b2.size(); i++)
res[i + n] += a2b2[i];
return res;
}
bigint operator*(const bigint &v) const {
vector<int> a6 = convert_base(this->a, base_digits, 6);
vector<int> b6 = convert_base(v.a, base_digits, 6);
vll a(a6.begin(), a6.end());
vll b(b6.begin(), b6.end());
while (a.size() < b.size())
a.push_back(0);
while (b.size() < a.size())
b.push_back(0);
while (a.size() & (a.size() - 1))
a.push_back(0), b.push_back(0);
vll c = karatsubaMultiply(a, b);
bigint res;
res.sign = sign * v.sign;
for (int i = 0, carry = 0; i < (int) c.size(); i++) {
long long cur = c[i] + carry;
res.a.push_back((int) (cur % 1000000));
carry = (int) (cur / 1000000);
}
res.a = convert_base(res.a, 6, base_digits);
res.trim();
return res;
}
};
//use : bigint var;
//template for biginit over
int main()
{
bigint var=10909000890789;
cout<<var;
return 0;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Comparing two vector<bool> with SSE - c++

Related

Find largest element in matrix and its column and row indexes using SSE and AVX

Using intrinsics to find next non-zero in an array

Implementing a lookup array for an array of N integers of limited range

How should I improve the performance of this C++ code?

Storing a Big Number in a Variable and Looping

Categories

Resources