Drastic difference in GCC and Clang code performance

Drastic difference in GCC and Clang code performance - c++

community. I have this piece of code that finds closest pair of points in Euclidean 3D space. This question is neither about the algorithm nor its implementation or whatever. The problem is that it runs significantly slower when compiled with GCC rather than Clang. Most confusingly, it has comparable execution time on random samples and like 100 times slower on some specific one.
I suspect there can be a bug in GCC as I cannot think of any other option.
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <cmath>
#include <vector>
#include <set>
#include <map>
#include <unordered_set>
#include <unordered_map>
#include <queue>
#include <ctime>
#include <fstream>
#include <cassert>
#include <complex>
#include <string>
#include <cstring>
#include <chrono>
#include <random>
#include <queue>
static std::mt19937 mmtw(std::chrono::steady_clock::now().time_since_epoch().count());
int64_t rng(int64_t x, int64_t y) {
static std::uniform_int_distribution<int64_t> d;
return d(mmtw) % (y - x + 1) + x;
}
constexpr static int MAXN = 1e5 + 10;
void solve(std::istream &in, std::ostream &out);
void generate(std::ostream &out) {
constexpr int N = 1e5;
out << N << '\n';
int MIN = -1e6;
int MAX = 1e6;
for (int i = 0; i < N; ++i) {
out << 0 << ' ';
out << i << ' ';
out << (i + 1) * int(1e4) << '\n';
}
}
int main() {
freopen("input.txt", "r", stdin);
std::ios_base::sync_with_stdio(false);
std::cin.tie(nullptr);
std::cout.tie(nullptr);
std::cerr.tie(nullptr);
std::ofstream fout("input.txt");
generate(fout);
fout.close();
solve(std::cin, std::cout);
return 0;
}
struct point_t {
int32_t x, y, z;
int id;
point_t() = default;
point_t(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {}
point_t operator +(const point_t &rhs) const {
return point_t(x + rhs.x, y + rhs.y, z + rhs.z);
}
point_t operator -(const point_t &rhs) const {
return point_t(x - rhs.x, y - rhs.y, z - rhs.z);
}
int64_t abs2() const {
return 1LL * x * x + 1LL * y * y + 1LL * z * z;
}
};
std::istream &operator >>(std::istream &in, point_t &pt) {
return in >> pt.x >> pt.y >> pt.z;
}
inline bool cmp_x(const point_t &lhs, const point_t &rhs) {
return lhs.x < rhs.x;
}
inline bool cmp_y(const point_t &lhs, const point_t &rhs) {
return lhs.y < rhs.y;
}
inline bool cmp_z(const point_t &lhs, const point_t &rhs) {
return lhs.z < rhs.z;
}
struct pair_t {
int64_t distance_sq;
point_t a {}, b {};
pair_t() : distance_sq(std::numeric_limits<int64_t>::max()) {};
pair_t(const point_t &a, const point_t &b) : distance_sq((a - b).abs2()), a(a), b(b) {}
bool operator<(const pair_t &rhs) const {
return distance_sq < rhs.distance_sq;
}
};
template <typename T> inline T sqr(T arg) { return arg * arg; }
point_t pts[MAXN];
static pair_t ans = pair_t();
void recur_2D(point_t pts[], int size, int64_t threshold_sq) {
if (size <= 3) {
for (int i = 0; i < size; ++i) {
for (int j = i + 1; j < size; ++j) {
ans = std::min(ans, pair_t(pts[i], pts[j]));
}
}
std::sort(pts, pts + size, cmp_y);
return;
}
int mid = size / 2;
int midx = pts[mid].x;
recur_2D(pts, mid, threshold_sq);
recur_2D(pts + mid, size - mid, threshold_sq);
static point_t buffer[MAXN];
std::merge(pts, pts + mid, pts + mid, pts + size, buffer, cmp_y);
std::copy(buffer, buffer + size, pts);
int buff_sz = 0;
for (int i = 0; i < size; ++i) {
if (sqr(pts[i].x - midx) >= threshold_sq) {
continue;
}
int64_t x_sqr = sqr(pts[i].x - midx);
for (int j = buff_sz - 1; j >= 0; --j) {
if (sqr(pts[i].y - buffer[j].y) + x_sqr >= threshold_sq) {
break;
}
ans = std::min(ans, pair_t(pts[i], buffer[j]));
}
buffer[buff_sz++] = pts[i];
}
}
void recur_3D(point_t pts[], int size) {
if (size <= 3) {
for (int i = 0; i < size; ++i) {
for (int j = i + 1; j < size; ++j) {
ans = std::min(ans, pair_t(pts[i], pts[j]));
}
}
std::sort(pts, pts + size, cmp_x);
return;
}
int mid = size / 2;
int midz = pts[mid].z;
recur_3D(pts, mid);
recur_3D(pts + mid, size - mid);
static point_t buffer[MAXN];
std::merge(pts, pts + mid, pts + mid, pts + size, buffer, cmp_x);
std::copy(buffer, buffer + size, pts);
int buff_sz = 0;
for (int i = 0; i < size; ++i) {
if (sqr(pts[i].z - midz) >= ans.distance_sq) {
continue;
}
buffer[buff_sz++] = pts[i];
}
recur_2D(buffer, buff_sz, ans.distance_sq);
}
void solve(std::istream &in, std::ostream &out) {
clock_t start = clock();
int num_of_points;
in >> num_of_points;
for (int i = 0; i < num_of_points; ++i) {
in >> pts[i];
pts[i].id = i + 1;
}
std::sort(pts, pts + num_of_points, cmp_z);
recur_3D(pts, num_of_points);
out << ans.distance_sq << '\n';
out << 1.0 * (clock() - start) / CLOCKS_PER_SEC << " s.\n";
}
Link to this code: https://code.re/2yfPzjkD
It generates the sample that makes the code very slow and then measures algorithm execution time.
I compile with
g++ -DLOCAL -std=c++1z -O3 -Wno-everything main.cpp
and with
clang++ -DLOCAL -std=c++1z -O3 -Wno-everything main.cpp
and run
./main while having input.txt in the same directory.
The Clang-copiled binary runs in 0.053798 s. while the GCC one in 12.4276 s. . These numbers are from program's output, you can see that function solve.
I have also verified the difference on https://wandbox.org/ on different compiler versions.
https://wandbox.org/permlink/YFEEWSKyos2dQf32 -- clang
https://wandbox.org/permlink/XctarNHvd3I1B0x8 -- gcc
Note, I compressed input and thus had to change reading in solve a bit.
On my local machine I have these compilers.
clang++ --version
clang version 7.0.0 (tags/RELEASE_700/final)
g++ --version
g++ (GCC) 8.2.1 20180831
It feels like I run GCC code without compiler optimizations. What can be the reason?
UPD.
Also, there is a version that calls std::sort only once in the very beginning.
https://wandbox.org/permlink/i9Kd3GdewxSRwXsM
I have also tried to compile Clang with -stdlib=libstdc++, shuffling data and think that different implementations of std::sort are not cause.

This is simply undefined behavior. Your code has undefined behavior due to signed integer overflow at:
template <typename T> inline T sqr(T arg) { return arg * arg; }
You can replace that with:
template <typename T>
inline T sqr(T arg)
{
assert(double(arg)*arg <= std::numeric_limits<T>::max());
assert(double(arg)*arg >= std::numeric_limits<T>::min());
return arg * arg;
}
and catch the error in a debugger. It fails with arg=-60000 called from recur_3D on the line:
if (sqr(pts[i].z - midz) >= ans.distance_sq) {
this happens with pts[i] = {x = 0, y = 0, z = 10000, id = 1} and midz=70000.
Because this is undefiend behavior, all bets are off. Different compiles utilize the assumption that "undefined behavior never happens" in different ways. This is why clang and gcc perform differently, and it is pure "luck".
Consider using UndefinedBehaviorSanitizer to catch these errors. I don't have it on my clang installation, but clang++ -fsanitize=signed-integer-overflow should do the trick.
Fixing this function gives a comparable speed for both clang and gcc.

Related

Question about Dynamic memory allocation in C++ for 3D array

I want to write a fuction to creat a 3D Matrix,but I have a broblem when I try to display it,Please help me,Thanks.
My teacher gave me his codes.His codes can create a 2D array which has Dynamic memory, I want to change his codes to creat a 3D matrix. When I tried to display the Matrix,I realize the array I created is a 2D array,And Because I just begin to use C++,I can't find my mistake.
/*<Array3D.h>*/
#pragma once
#ifndef _ARRAY_3D_H_
#define _ARRAY_3D_H_
//-----------------------------------------
#include <cassert>
#include <vector>
using namespace std;
template<typename T>
class Array3D
{
public:
typedef Array3D<T> _Myt;
Array3D()
: m_nRows(0)
, m_nCols(0)
, m_nDepths(0)
{}
Array3D(size_t r, size_t c,size_t d)
{
Resize(r, c, d);
}
//Allocating memory size
void Resize(size_t r, size_t c,size_t d)
{
if (r == m_nRows && c == m_nCols && d==m_nDepths) { return; }
bool bValid = r > 0 && c > 0 && d>0;
if (!bValid) return;
m_nRows = r;
m_nCols = c;
m_nDepths = d;
m_v.resize(m_nRows * m_nCols * m_nDepths);
}
const T* operator[](size_t r) const
{
assert(r >= 0 && r < m_nRows);
return &(*(m_v.begin() + (r * m_nCols * m_nDepths)));
}
T* operator[](size_t r)
{
assert(r >= 0 && r < m_nRows);
return &(*(m_v.begin() + (r * m_nCols * m_nDepths)));
}
//Gets the start position of a one-dimensional array
const T* GetRawPointer() const { return &(m_v[0]); }
T* GetRawPointer() { return &(m_v[0]); }
long Rows() const
{
return static_cast<long>(m_nRows);
}
long Cols() const
{
return static_cast<long>(m_nCols);
}
long Depths() const
{
return static_cast<long>(m_nDepths);
}
long TotalSize() const
{
return static_cast<long>(m_nRows * m_nCols * m_nDepths);
}
void ClearUp()
{
m_nRows = 0;
m_nCols = 0;
m_nDepths = 0;
m_v.clear();
}
bool IsEmpty() const { return m_v.empty(); }
protected:
vector<T> m_v;// Internally a one-dimensional is used
size_t m_nRows;
size_t m_nCols;
size_t m_nDepths;
};
<Matrix3D.h>
#pragma once
#include "Array3D.h"
class Matrix3D
{
public:
Matrix3D(int r = 0, int c = 0, int d = 0)
{
setSize(r, c, d);
}
void setSize(int r, int c, int d) { m_data3D.Resize(r, c, d); }
void clear() { m_data3D.ClearUp(); }
int rows() const { return m_data3D.Rows(); }
int cols() const { return m_data3D.Cols(); }
int Depths() const { return m_data3D.Depths(); }
void display() const;
//Operator Overloading
float* operator[](int rIndex) { return m_data3D[rIndex]; }
const float* operator[](int rIndex) const { return m_data3D[rIndex]; }
private:
Array3D<float> m_data3D;
};
#include "Matrix3D.h"
#include <iostream>
using namespace std;
void Matrix3D::display() const
{
if (m_data3D.IsEmpty())
{
cout << "empty matrix" << endl;
return;
}
cout << "------------------------" << endl;
const int rows = this->rows();
const int cols = this->cols();
const int Depths = this->Depths();
cout << rows << "x" << cols << "x" << Depths << endl;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
for(int k = 0 ;k < Depths; k++)
{
cout << m_data3D[i][j][k] << ' '; /*This section will pose an error "E0142"*/
}
cout << endl;
}
}
}

I am going to guess that your teachers class was Array2D and that you mostly just added m_nDepths to it to turn it into Array3D. If this assumption is not right, then my answer here is most likely going to be completely wrong.
But if I am right, then his operator[] looked something like:
T* operator[](size_t r)
{
assert(r >= 0 && r < m_nRows);
return &(*(m_v.begin() + (r * m_nCols)));
}
Which means that when you then do m_data2D[i][j], what happens is that the first [i] applies to the Array2D. This, as shown above, returns a pointer (in your case likely a float*). The [j] is then applied to this pointer, which results in some pointer arithmetic that results in a float (for a float *x; x[y] means *(x+y)).
In other words, your teacher stores his 2D array line by line in a 1D array, and when you [] into it you get a pointer to the right line that you can then further [] into.
This is all fine.
The problem is when you add a third dimension to this and try the same approach: The first [] still returns a float* (but this time to the correct 2D matrix), the second [] returns a float (the first element in the correct line of the 2D matrix), and the third [] tries to apply to a float - which it cannot, and you get the error.
There are two ways to fix this:
Change the return type of Array3D::operator[] to return some kind of 2D array type whose operator[] works like the original Array2D.
Abandon the operator[] approach and replace it with a member function that takes 3 arguments and returns the correct element right away. I think this is what I would prefer myself.
For the second option, something like (not tested, rather large chance I messed up the order of the arguments):
T element(size_t x, size_t y, size_t z) const
{
assert(x >= 0 && x < m_nDepths);
assert(y >= 0 && y < m_nCols);
assert(z >= 0 && z < m_nRows);
return *(m_v.begin() + (x * m_nCols * m_nDepths +
y * m_nCols +
z));
}
T& element(size_t x, size_t y, size_t z)
{
assert(x >= 0 && x < m_nDepths);
assert(y >= 0 && y < m_nCols);
assert(z >= 0 && z < m_nRows);
return *(m_v.begin() + (x * m_nCols * m_nDepths +
y * m_nCols +
z));
}
Which turns cout << m_data3D[i][j][k] << ' '; into cout << m_data3D.element(i,j,k) << ' ';

Finding the total area and cover area of multiple polygon

I'm trying to solve this problem using the slicing technic.
But I just passed the first two test cases in this gym (Problem A)
2017-2018 ACM-ICPC East Central North America Regional Contest (ECNA 2017)
The whole step in my code looks like this:
calculate the total area using vector cross when inputting the polygon information.
find all the endpoints and intersection points, and record x value of them.
for each slice (sort x list and for each interval) find the lines from every polygon which crosses this interval and store this smaller segment.
sort segments.
for each trapezoid or triangle, calculate the area if this area is covered by some polygon.
sum them all to find the cover area.
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
template<typename T> using p = pair<T, T>;
template<typename T> using vec = vector<T>;
template<typename T> using deq = deque<T>;
// #define dbg
#define yccc ios_base::sync_with_stdio(false), cin.tie(0)
#define al(a) a.begin(),a.end()
#define F first
#define S second
#define eb emplace_back
const int INF = 0x3f3f3f3f;
const ll llINF = 1e18;
const int MOD = 1e9+7;
const double eps = 1e-9;
const double PI = acos(-1);
double fcmp(double a, double b = 0, double eps = 1e-9) {
if (fabs(a-b) < eps) return 0;
return a - b;
}
template<typename T1, typename T2>
istream& operator>>(istream& in, pair<T1, T2>& a) { in >> a.F >> a.S; return in; }
template<typename T1, typename T2>
ostream& operator<<(ostream& out, pair<T1, T2> a) { out << a.F << ' ' << a.S; return out; }
struct Point {
double x, y;
Point() {}
Point(double x, double y) : x(x), y(y) {}
Point operator+(const Point& src) {
return Point(src.x + x, src.y + y);
}
Point operator-(const Point& src) {
return Point(x - src.x, y - src.y);
}
Point operator*(double src) {
return Point(x * src, y * src);
}
//* vector cross.
double operator^(Point src) {
return x * src.y - y * src.x;
}
};
struct Line {
Point sp, ep;
Line() {}
Line(Point sp, Point ep) : sp(sp), ep(ep) {}
//* check if two segment intersect.
bool is_intersect(Line src) {
if (fcmp(ori(src.sp) * ori(src.ep)) < 0 and fcmp(src.ori(sp) * src.ori(ep)) < 0) {
double t = ((src.ep - src.sp) ^ (sp - src.sp)) / ((ep - sp) ^ (src.ep - src.sp));
return fcmp(t) >= 0 and fcmp(t, 1) <= 0;
}
return false;
}
//* if two segment intersect, find the intersection point.
Point intersect(Line src) {
double t = ((src.ep - src.sp) ^ (sp - src.sp)) / ((ep - sp) ^ (src.ep - src.sp));
return sp + (ep - sp) * t;
}
double ori(Point src) {
return (ep - sp) ^ (src - sp);
}
bool operator<(Line src) const {
if (fcmp(sp.y, src.sp.y) != 0)
return sp.y < src.sp.y;
return ep.y < src.ep.y;
}
};
int next(int i, int n) {
return (i + 1) % n;
}
int main()
{
yccc;
int n;
cin >> n;
//* the point list and the line list of polygons.
vec<vec<Point>> p_list(n);
vec<vec<Line>> l_list(n);
//* x's set for all endpoints and intersection points
vec<double> x_list;
//* initializing point list and line list
double total = 0, cover = 0;
for (int i = 0; i < n; i++) {
int m;
cin >> m;
p_list[i].resize(m);
for (auto &k : p_list[i]) {
cin >> k.x >> k.y;
x_list.eb(k.x);
}
l_list[i].resize(m);
for (int k = 0; k < m; k++) {
l_list[i][k].sp = p_list[i][k];
l_list[i][k].ep = p_list[i][next(k, m)];
}
//* calculate the polygon area.
double tmp = 0;
for (int k = 0; k < m; k++) {
tmp += l_list[i][k].sp.x * l_list[i][k].ep.y - l_list[i][k].sp.y * l_list[i][k].ep.x;
}
total += abs(tmp);
}
//* find all intersection points
for (int i = 0; i < n; i++)
for (int k = i+1; k < n; k++) {
for (auto li : l_list[i])
for (auto lk : l_list[k])
if (li.is_intersect(lk)) {
x_list.eb(li.intersect(lk).x);
}
}
sort(al(x_list));
auto same = [](double a, double b) -> bool {
return fcmp(a, b) == 0;
};
x_list.resize(unique(al(x_list), same) - x_list.begin());
//* for each slicing, calculate the cover area.
for (int i = 0; i < x_list.size() - 1; i++) {
vec<pair<Line, int>> seg;
for (int k = 0; k < n; k++) {
for (auto line : l_list[k]) {
//* check if line crosses this slicing
if (fcmp(x_list[i], min(line.sp.x, line.ep.x)) >= 0 and fcmp(max(line.sp.x, line.ep.x), x_list[i+1]) >= 0) {
Point sub = line.ep - line.sp;
double t_sp = (x_list[i] - line.sp.x) / sub.x, t_ep = (x_list[i+1] - line.sp.x) / sub.x;
seg.eb(Line(Point(x_list[i], line.sp.y + sub.y * t_sp), Point(x_list[i+1], line.sp.y + sub.y * t_ep)), k);
}
}
}
//* sort this slicing
sort(al(seg));
deq<bool> inside(n);
int count = 0;
Line prev;
// cout << x_list[i] << ' ' << x_list[i+1] << endl;
//* calculate cover area in this slicing.
for (int k = 0; k < seg.size(); k++) {
if (count)
cover += ((seg[k].F.sp.y - prev.sp.y) + (seg[k].F.ep.y - prev.ep.y)) * (x_list[i+1] - x_list[i]);
prev = seg[k].F;
// cout << seg[k].S << ": (" << seg[k].F.sp.x << ", " << seg[k].F.sp.y << "), (" << seg[k].F.ep.x << ", " << seg[k].F.ep.y << ")" << endl;
inside[k] = !inside[k];
count += (inside[k] ? 1 : -1);
}
}
//* answer
cout << total / 2 << ' ' << cover / 2;
}
I can't not figure out which part I made a mistake :(

How to optimize my C++ OpenMp Matrix Multiplication code

I have written a C++ OpenMp Matrix Multiplication code that multiplies two 1000x1000 matrices.
So far I have gotten a 0.700 sec execution time using OpenMp but I want to see if there is other ways I can make it faster using OpenMp?
I appreciate any advice or tips you can give me.
Here is my code:
#include <iostream>
#include <time.h>
#include <omp.h>
using namespace std;
void Multiply()
{
//initialize matrices with random numbers
int aMatrix[1000][1000], i, j;
for( i = 0; i < 1000; ++i)
{for( j = 0; j < 1000; ++j)
{aMatrix[i][j] = rand();}
}
int bMatrix[1000][1000], i1, j2;
for( i1 = 0; i1 < 1000; ++i1)
{for( j2 = 0; j2 < 1000; ++j2)
{bMatrix[i1][j2] = rand();}
}
//Result Matrix
int product[1000][1000] = {0};
for (int row = 0; row < 1000; row++) {
for (int col = 0; col < 1000; col++) {
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 0; inner < 1000; inner++) {
product[row][col] += aMatrix[row][inner] * bMatrix[inner][col];
}
}
}
}
int main() {
time_t begin, end;
time(&begin);
Multiply();
time(&end);
time_t elapsed = end - begin;
cout << ("Time measured: %ld seconds.\n", elapsed);
return 0;
}

Following things can be done for speedup:
Using OpenMP for parallelizing external loop, like you did (and like I also did in my following code). Or alternatively using std::async for multi-threading like it was used in another answer.
Transpose B matrix, this will help to increase L1 cache hits, because you will read from sequential memory each B column (or row in transposed variant).
Use vectorized SIMD instructions, this will allow to do several multiplications (and additions) within one CPU cycle. Often compilers do auto-vectorization of your loops well enough through SIMD instructions without your help, but I did it explicitly in my code.
Run several independent SIMD instructions within loop. This will help to occupy whole CPU pipeline of SIMD. I did so in my code by using four SIMD registers r0, r1, r2, r3. In compilers this is usually called loop unrolling.
Align your matrix starting address on 64-bytes boundary. This will help SIMD instructions to do fast aligned read/write.
Align starting address of each matrix row on 64-bytes boundary. I did this in my code by padding each row with zeros till multiple of 64-bytes. This also helps SIMD instructions to do fast aligned read/write.
In my following code I did all 1. - 6. steps above. Memory 64-bytes alignment I did through implementing AlignmentAllocator that was used in std::vector. Also I did time measurements for float/double/int.
On my old 4-core laptop I got following time measurements for the case of 1000x1000 matrix multiplying by 1000x1000:
float: time 0.1569 sec
double: time 0.3168 sec
int: time 0.1565 sec
To compare my hardware capabilities I did measurements of another answer of #doug for the case of int:
Threads w transpose 0.2164 secs.
As one can see my solution is 1.4x times faster that the other answer, I guess due to memory 64-bytes alignment and maybe due to using explicit SIMD (instead of relying on compiler auto-vectorization of a loop).
To compile my program, don't forget to add -fopenmp -lgomp options (for OpenMP support) and -march=native -O3 -std=c++20 options (for SIMD support, optimizations and standard) if you're compiling under GCC/CLang, while MSVC I guess adds OpenMP automatically and doesn't need any special options (use /O2 /GL /std:c++latest for optimizations and standard in MSVC).
In my code I only implemented SSE2/SSE4/AVX/AVX2 instructions for SIMD, if you have more powerful machine you may tell me and I implement also FMA/AVX-512, they will give even twice more speed boost.
My Mul() function is quite generic, it is templated, and you just pass pointers to matrices and row/col count, so your matrices may be stored on calling side in any way (through std::vector or std::array or plain 2D array).
At start of Run() function you may change number of rows and columns if you need a bigger test. Notice that all my functions support any rows and columns, you may even multiply matrix of size 1234x2345 by 2345x3456.
Try it online!
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <iomanip>
#include <vector>
#include <memory>
#include <string>
#include <immintrin.h>
#define USE_OPENMP 1
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#if defined(_MSC_VER)
#define IS_MSVC 1
#else
#define IS_MSVC 0
#endif
#if USE_OPENMP
#include <omp.h>
#endif
template <typename T, std::size_t N>
class AlignmentAllocator {
public:
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef T * pointer;
typedef const T * const_pointer;
typedef T & reference;
typedef const T & const_reference;
public:
inline AlignmentAllocator() throw() {}
template <typename T2> inline AlignmentAllocator(const AlignmentAllocator<T2, N> &) throw() {}
inline ~AlignmentAllocator() throw() {}
inline pointer adress(reference r) { return &r; }
inline const_pointer adress(const_reference r) const { return &r; }
inline pointer allocate(size_type n);
inline void deallocate(pointer p, size_type);
inline void construct(pointer p, const value_type & wert);
inline void destroy(pointer p) { p->~value_type(); }
inline size_type max_size() const throw() { return size_type(-1) / sizeof(value_type); }
template <typename T2> struct rebind { typedef AlignmentAllocator<T2, N> other; };
bool operator!=(const AlignmentAllocator<T, N> & other) const { return !(*this == other); }
bool operator==(const AlignmentAllocator<T, N> & other) const { return true; }
};
template <typename T, std::size_t N>
inline typename AlignmentAllocator<T, N>::pointer AlignmentAllocator<T, N>::allocate(size_type n) {
#if IS_MSVC
auto p = (pointer)_aligned_malloc(n * sizeof(value_type), N);
#else
auto p = (pointer)std::aligned_alloc(N, n * sizeof(value_type));
#endif
ASSERT(p);
return p;
}
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::deallocate(pointer p, size_type) {
#if IS_MSVC
_aligned_free(p);
#else
std::free(p);
#endif
}
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::construct(pointer p, const value_type & wert) {
new (p) value_type(wert);
}
template <typename T>
using AlignedVector = std::vector<T, AlignmentAllocator<T, 64>>;
template <typename T>
struct RegT;
#ifdef __AVX__
template <> struct RegT<float> { static size_t constexpr bisize = 256; using type = __m256; static type zero() { return _mm256_setzero_ps(); } };
template <> struct RegT<double> { static size_t constexpr bisize = 256; using type = __m256d; static type zero() { return _mm256_setzero_pd(); } };
inline void MulAddReg(float const * a, float const * b, __m256 & c) {
c = _mm256_add_ps(c, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void MulAddReg(double const * a, double const * b, __m256d & c) {
c = _mm256_add_pd(c, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void StoreReg(float * dst, __m256 const & src) { _mm256_store_ps(dst, src); }
inline void StoreReg(double * dst, __m256d const & src) { _mm256_store_pd(dst, src); }
#else // SSE2
template <> struct RegT<float> { static size_t constexpr bisize = 128; using type = __m128; static type zero() { return _mm_setzero_ps(); } };
template <> struct RegT<double> { static size_t constexpr bisize = 128; using type = __m128d; static type zero() { return _mm_setzero_pd(); } };
inline void MulAddReg(float const * a, float const * b, __m128 & c) {
c = _mm_add_ps(c, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
}
inline void MulAddReg(double const * a, double const * b, __m128d & c) {
c = _mm_add_pd(c, _mm_mul_pd(_mm_load_pd(a), _mm_load_pd(b)));
}
inline void StoreReg(float * dst, __m128 const & src) { _mm_store_ps(dst, src); }
inline void StoreReg(double * dst, __m128d const & src) { _mm_store_pd(dst, src); }
#endif
#ifdef __AVX2__
template <> struct RegT<int32_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };
//template <> struct RegT<int64_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };
inline void MulAddReg(int32_t const * a, int32_t const * b, __m256i & c) {
c = _mm256_add_epi32(c, _mm256_mullo_epi32(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
}
//inline void MulAddReg(int64_t const * a, int64_t const * b, __m256i & c) {
// c = _mm256_add_epi64(c, _mm256_mullo_epi64(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
//}
inline void StoreReg(int32_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
//inline void StoreReg(int64_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
#else // SSE2
template <> struct RegT<int32_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };
//template <> struct RegT<int64_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };
inline void MulAddReg(int32_t const * a, int32_t const * b, __m128i & c) {
c = _mm_add_epi32(c, _mm_mullo_epi32(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
}
//inline void MulAddReg(int64_t const * a, int64_t const * b, __m128i & c) {
// c = _mm_add_epi64(c, _mm_mullo_epi64(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
//}
inline void StoreReg(int32_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
//inline void StoreReg(int64_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
#endif
template <typename T>
void Mul(T const * A0, size_t A_rows, size_t A_cols, T const * B0, size_t B_rows, size_t B_cols, T * C) {
size_t constexpr reg_cnt = RegT<T>::bisize / 8 / sizeof(T), block = 4 * reg_cnt;
ASSERT(A_cols == B_rows);
size_t const A_cols_aligned = (A_cols + block - 1) / block * block, B_rows_aligned = (B_rows + block - 1) / block * block;
// Copy aligned A
AlignedVector<T> Av(A_rows * A_cols_aligned);
for (size_t i = 0; i < A_rows; ++i)
std::memcpy(&Av[i * A_cols_aligned], &A0[i * A_cols], sizeof(Av[0]) * A_cols);
T const * A = Av.data();
// Transpose B
AlignedVector<T> Bv(B_cols * B_rows_aligned);
for (size_t j = 0; j < B_cols; ++j)
for (size_t i = 0; i < B_rows; ++i)
Bv[j * B_rows_aligned + i] = B0[i * B_cols + j];
T const * Bt = Bv.data();
ASSERT(uintptr_t(A) % 64 == 0 && uintptr_t(Bt) % 64 == 0);
ASSERT(uintptr_t(&A[A_cols_aligned]) % 64 == 0 && uintptr_t(&Bt[B_rows_aligned]) % 64 == 0);
// Multiply
#pragma omp parallel for
for (size_t i = 0; i < A_rows; ++i) {
// Aligned Reg storage
AlignedVector<T> Regs(block);
for (size_t j = 0; j < B_cols; ++j) {
T const * Arow = &A[i * A_cols_aligned + 0], * Btrow = &Bt[j * B_rows_aligned + 0];
using Reg = typename RegT<T>::type;
Reg r0 = RegT<T>::zero(), r1 = RegT<T>::zero(), r2 = RegT<T>::zero(), r3 = RegT<T>::zero();
size_t const k_hi = A_cols - A_cols % block;
for (size_t k = 0; k < k_hi; k += block) {
MulAddReg(&Arow[k + reg_cnt * 0], &Btrow[k + reg_cnt * 0], r0);
MulAddReg(&Arow[k + reg_cnt * 1], &Btrow[k + reg_cnt * 1], r1);
MulAddReg(&Arow[k + reg_cnt * 2], &Btrow[k + reg_cnt * 2], r2);
MulAddReg(&Arow[k + reg_cnt * 3], &Btrow[k + reg_cnt * 3], r3);
}
StoreReg(&Regs[reg_cnt * 0], r0);
StoreReg(&Regs[reg_cnt * 1], r1);
StoreReg(&Regs[reg_cnt * 2], r2);
StoreReg(&Regs[reg_cnt * 3], r3);
T sum1 = 0, sum2 = 0, sum3 = 0;
for (size_t k = 0; k < Regs.size(); ++k)
sum1 += Regs[k];
//for (size_t k = 0; k < A_cols - A_cols % block; ++k) sum3 += Arow[k] * Btrow[k];
for (size_t k = k_hi; k < A_cols; ++k)
sum2 += Arow[k] * Btrow[k];
C[i * A_rows + j] = sum2 + sum1;
}
}
}
#include <random>
#include <thread>
#include <chrono>
#include <type_traits>
template <typename T>
void Test(T const * A, size_t A_rows, size_t A_cols, T const * B, size_t B_rows, size_t B_cols, T const * C, T eps) {
for (size_t i = 0; i < A_rows / 16; ++i)
for (size_t j = 0; j < B_cols / 16; ++j) {
T sum = 0;
for (size_t k = 0; k < A_cols; ++k)
sum += A[i * A_cols + k] * B[k * B_cols + j];
ASSERT_MSG(std::abs(C[i * A_rows + j] - sum) <= eps * A_cols, "i " + std::to_string(i) + " j " + std::to_string(j) +
" C " + std::to_string(C[i * A_rows + j]) + " ref " + std::to_string(sum));
}
}
double Time() {
static auto const gtb = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::high_resolution_clock::now() - gtb).count();
}
template <typename T>
void Run() {
size_t constexpr A_rows = 1000, A_cols = 1000, B_rows = 1000, B_cols = 1000;
std::string const tname = std::is_same_v<T, float> ? "float" : std::is_same_v<T, double> ?
"double" : std::is_same_v<T, int32_t> ? "int" : "<unknown>";
bool const is_int = tname == "int";
std::mt19937_64 rng{123};
std::vector<T> A(A_rows * A_cols), B(B_rows * B_cols), C(A_rows * B_cols);
for (size_t i = 0; i < A.size(); ++i)
A[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
for (size_t i = 0; i < B.size(); ++i)
B[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
auto tim = Time();
Mul(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0]);
tim = Time() - tim;
std::cout << std::setw(6) << tname << ": time " << std::fixed << std::setprecision(4) << tim << " sec" << std::endl;
Test(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0], tname == "float" ? T(1e-7) : tname == "double" ? T(1e-15) : T(0));
}
int main() {
try {
#if USE_OPENMP
omp_set_num_threads(std::thread::hardware_concurrency());
#endif
Run<float>();
Run<double>();
Run<int32_t>();
return 0;
} catch (std::exception const & ex) {
std::cout << "Exception: " << ex.what() << std::endl;
return -1;
}
}
Output:
float: time 0.1569 sec
double: time 0.3168 sec
int: time 0.1565 sec

Here's straight c++ code that runs in .08s with ints and .14s with floats or doubles. My system is 10 years old with relatively slow memory. Good at the time but now is now.
I agree with #VictorEijkhout that the best results would be with tuned code. There has been huge amounts of work optimizing those.
#include <vector>
#include <array>
#include <random>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <thread>
#include <future>
#include <chrono>
struct Timer {
std::chrono::system_clock::time_point snapTime;
Timer() { snapTime = std::chrono::system_clock::now(); }
operator double() { return std::chrono::duration<double>(std::chrono::system_clock::now() - snapTime).count(); }
};
using DataType = int;
using std::array, std::vector;
constexpr int N = 1000, THREADS = 12;
static auto launchType = std::launch::async;
using Matrix = vector<array<DataType, N>>;
Matrix create_matrix() { return Matrix(N); };
Matrix product(Matrix const& v0, Matrix const& v1, double& time)
{
Matrix ret = create_matrix();
Matrix v2 = create_matrix();
Timer timer;
for (size_t r = 0; r < N; r++) // transpose first
for (size_t c = 0; c < N; c++)
v2[c][r] = v1[r][c];
// lambda to process sets of rows in separate threads
auto do_row_set = [&v0, &v2, &ret](size_t start, size_t last) {
for (size_t row = start; row < last; row++)
for (size_t col = 0; col < N; col++)
{
DataType tmp{}; // separate tmp variable significantly improves optimization
for (size_t col_t = 0; col_t < N; col_t++)
tmp += v0[row][col_t] * v2[col][col_t];
ret[row][col] = tmp;
}
};
vector<size_t> seq;
const size_t NN = N / THREADS;
// make a sequence of NN+1 rows from start to end
for (size_t thread_n = 0; thread_n < N; thread_n += NN)
seq.push_back(thread_n);
seq.push_back(N);
vector<std::future<void>> results; results.reserve(THREADS);
for (size_t i = 0; i < THREADS; i++)
results.emplace_back(std::async(launchType, do_row_set, seq[i], seq[i + 1]));
for (auto& x : results)
x.get();
time = timer;
return ret;
}
bool operator==(Matrix const& v0, Matrix const& v1)
{
for (size_t r = 0; r < N; r++)
for (size_t c = 0; c < N; c++)
if (v0[r][c] != v1[r][c])
return false;
return true;
}
int main()
{
auto fill = [](Matrix& v) {
std::mt19937_64 r(1);
std::normal_distribution dist(1.);
for (size_t row = 0; row < N; row++)
for (size_t col = 0; col < N; col++)
v[row][col] = DataType(dist(r));
};
Matrix m1 = create_matrix(), m2 = create_matrix(), m3 = create_matrix();
fill(m1); fill(m2);
auto process_test = [&m1, &m2](Matrix& out) {
const int rpt_count = 4;
double sum = 0;
for (int i = 0; i < rpt_count; i++)
{
double time;
out = product(m1, m2, time);
sum += time / rpt_count;
}
return sum;
};
std::cout << std::fixed << std::setprecision(4);
double time{};
time = process_test(m3);
std::cout << "Threads w transpose " << time << " secs.\n";
}

BigInt multiplication and to_string implementation outputs too many zeros

I created the following for multiplying two big integers stored with base 1,000,000,000 as a vector<int32_t>:
#include <iostream>
#include <vector>
#include <cmath>
#include <limits>
#include <algorithm>
template<typename T>
constexpr T power_of_10(T n)
{
return n < 0 ? 0 : n == 0 ? 1 : (n == 1 ? 10 : 10 * power_of_10(n - 1));
}
template<typename T>
constexpr T base_value = power_of_10<T>(std::numeric_limits<T>::digits10);
template<typename T>
constexpr T max_value = base_value<T> - 1;
class BigInt {
private:
static constexpr const std::uint32_t base = base_value<std::uint32_t>;
static constexpr const std::uint32_t max_digits = std::numeric_limits<std::uint32_t>::digits10;
std::vector<std::uint64_t> digits;
public:
BigInt(const char* value) : BigInt(std::string(value))
{
}
BigInt(const std::string& value)
{
constexpr const int stride = std::numeric_limits<std::uint32_t>::digits10;
const std::size_t size = value.size() / stride;
for (std::size_t i = 0; i < size; ++i)
{
auto it = value.begin();
auto jt = value.begin();
std::advance(it, i * stride);
std::advance(jt, (i * stride) + stride);
digits.push_back(std::stoull(std::string(it, jt)));
}
if (value.size() % stride)
{
auto remainder = std::string(value.begin() + size * stride, value.end());
digits.push_back(std::stoull(remainder));
}
std::reverse(digits.begin(), digits.end());
}
BigInt& multiply(const BigInt& other)
{
std::vector<std::uint64_t> product = std::vector<std::uint64_t>(digits.size() + other.digits.size(), 0);
for (std::size_t i = 0; i < other.digits.size(); ++i)
{
std::uint64_t carry = 0, total = 0;
for (std::size_t j = 0; j < digits.size(); ++j)
{
total = product.at(i + j) + (other.digits[i] * digits[j]) + carry;
carry = total / base;
total %= base;
product.at(i + j) = total;
}
if (carry)
{
product[i + digits.size()] = carry;
}
}
digits = product;
return *this;
}
std::string to_string() {
std::string result = std::to_string(digits[digits.size() - 1]);
//
// for (std::int64_t i = digits.size() - 2; i >= 0; --i)
// {
// std::string group = std::to_string(digits[i]);
// while (group.size() < max_digits) {
// group = '0' + group;
// }
// result += group;
// }
for (std::int64_t i = digits.size() - 2; i >= 0; --i)
{
std::uint64_t value = digits[i];
std::uint32_t divisor = base;
while(divisor)
{
if (divisor != base)
{
result += (value / divisor) + '0';
}
value %= divisor;
divisor /= 10;
}
}
return result;
}
};
int main(int argc, const char * argv[])
{
BigInt a = "5000000000";
BigInt b = "5000000000";
std::cout<<a.multiply(b).to_string()<<"\n";
std::cout<<"25000000000000000000"<<"\n";
return 0;
}
When I print the result of the multiplication, I am getting 5,000,000,000 * 5,000,000,000 = 250,000,000,000,000,000,000,000,000,000,000,000 which has way too many zeroes!
It should have 18 zeroes, but mine has 34.
I believe my multiplication algorithm is correct and my to_string is incorrect because 500 * 500 prints correctly as 25,000.
Any ideas what is wrong?

The problem comes from this line:
product[digits.size() + 1] = static_cast<T>(carry);
The index digits.size() + 1 is incorrect. It should be digits.size() + j.

Storing a Big Number in a Variable and Looping

How can i store a big number in a variable and use a for loop?
I have a very big number 75472202764752234070123900087933251 and i need to loop from 0 to this number!
Is it even possible to do this? how much time will it take to end?
EDIT: i am trying to solve a hard problem by brute force. its a combination problem.the bruteforcing cases may reach 470C450.
so i guess i should use a different algorithm...

This might take
0.23 x 10^23 years if C++ processed 100,000 loops per second :|
http://www.wolframalpha.com/input/?i=75472202764752234070123900087933251%2F%28100000*1*3600*24*365%29

It looks that this number fits into 128 bit. So you could use a modern system and a modern compiler that implements such numbers. This would e.g be the case for a 64bit linux system with gcc as a compiler. This has something like __uint128_t that you could use.
Obviously you can't use such a variable as a for-loop variable, others have give you the calculations. But you could use it to store some of your calculations.

Well, you would need an implementation that can handle at least a subset of the initialization, boolean, and arithmetic functions on very large integers. Something like: https://mattmccutchen.net/bigint/.
For something that would give a bit better performance than a general large integer math library, you could use specialized operations specifically to allow use of a large integer as a counter. For an example of this, see dewtell's updated answer to this question.
As for it being possible for you to loop from 0 to that number: well, yes, it is possible to write the code for it with one of the above solutions, but I think the answer is no, you personally will not be able to do it because you will not be alive to see it finish.
[edit: Yes, I would definitely recommend you find a different algorithm. :D]

If you need to loop a certain number of times, and that number is greater than 2^64, just use while(1) because your computer will break before it counts up to 2^64 anyway.

There's no need for a complete bignum package - if all you need is a loop counter, here's a simple byte counter that uses an array of bytes as a counter. It stops when the byte array wraps around to all zeros again. If you wanted to count to some other value than 2^(bytesUsed*CHAR_BITS), you could just compute the two's complement value of the negative of the number of iterations you wanted, and let it count up to 0, keeping in mind that bytes[0] is the low-order byte (or use the positive value and count down instead of up).
#include <stdio.h>
#define MAXBYTES 20
/* Simple byte counter - note it uses argc as # of bytes to use for convenience */
int main(int argc, char **argv) {
unsigned char bytes[MAXBYTES];
const int bytesUsed = argc < MAXBYTES? argc : MAXBYTES;
int i;
unsigned long counter = (unsigned long)-1; /* to give loop something to do */
for (i = 0; i < bytesUsed; i++) bytes[i] = 0; /* Initialize bytes */
do {
for (i = 0; i < bytesUsed && !++bytes[i]; i++) ; /* NULL BODY - this is the byte counter */
counter++;
} while (i < bytesUsed);
printf("With %d bytes used, final counter value = %lu\n", bytesUsed, counter);
}
Run times for the first 4 values (under Cygwin, on a Lenovo T61):
$ time ./bytecounter
With 1 bytes used, final counter value = 255
real 0m0.078s
user 0m0.031s
sys 0m0.046s
$ time ./bytecounter a
With 2 bytes used, final counter value = 65535
real 0m0.063s
user 0m0.031s
sys 0m0.031s
$ time ./bytecounter a a
With 3 bytes used, final counter value = 16777215
real 0m0.125s
user 0m0.015s
sys 0m0.046s
$ time ./bytecounter a a a
With 4 bytes used, final counter value = 4294967295
real 0m6.578s
user 0m0.015s
sys 0m0.047s
At this rate, five bytes should take around half an hour, and six bytes should take the better part of a week. Of course the counter value will be inaccurate for those - it's mostly just there to verify the number of iterations for the smaller byte values and give the loop something to do.
Edit: And here's the time for five bytes, around half an hour as I predicted:
$ time ./bytecounter a a a a
With 5 bytes used, final counter value = 4294967295
real 27m22.184s
user 0m0.015s
sys 0m0.062s

Ok, here's code to take an arbitrary decimal number passed as the first arg and count down from it to zero. I set it up to allow the counter to use different size elements (just change the typedef for COUNTER_BASE), but it turns out that bytes are actually somewhat faster than either short or long on my system.
#include <stdio.h>
#include <limits.h> // defines CHAR_BIT
#include <ctype.h>
#include <vector>
using std::vector;
typedef unsigned char COUNTER_BASE;
typedef vector<COUNTER_BASE> COUNTER;
typedef vector<unsigned char> BYTEVEC;
const unsigned long byteMask = (~0ul) << CHAR_BIT;
const size_t MAXBYTES=20;
void mult10(BYTEVEC &val) {
// Multiply value by 10
unsigned int carry = 0;
int i;
for (i = 0; i < val.size(); i++) {
unsigned long value = val[i]*10ul+carry;
carry = (value & byteMask) >> CHAR_BIT;
val[i] = value & ~byteMask;
}
if (carry > 0) val.push_back(carry);
}
void addDigit(BYTEVEC &val, const char digit) {
// Add digit to the number in BYTEVEC.
unsigned int carry = digit - '0'; // Assumes ASCII char set
int i;
for (i = 0; i < val.size() && carry; i++) {
unsigned long value = static_cast<unsigned long>(val[i])+carry;
carry = (value & byteMask) >> CHAR_BIT;
val[i] = value & ~byteMask;
}
if (carry > 0) val.push_back(carry);
}
BYTEVEC Cstr2Bytevec(const char *str) {
// Turn a C-style string into a BYTEVEC. Only the digits in str apply,
// so that one can use commas, underscores, or other non-digits to separate
// digit groups.
BYTEVEC result;
result.reserve(MAXBYTES);
result[0]=0;
unsigned char *res=&result[0]; // For debugging
while (*str) {
if (isdigit(static_cast<int>(*str))) {
mult10(result);
addDigit(result, *str);
}
str++;
}
return result;
}
void packCounter(COUNTER &ctr, const BYTEVEC &val) {
// Pack the bytes from val into the (possibly larger) datatype of COUNTER
int i;
ctr.erase(ctr.begin(), ctr.end());
COUNTER_BASE value = 0;
for (i = 0; i < val.size(); i++) {
int pos = i%sizeof(COUNTER_BASE); // position of this byte in the value
if (i > 0 && pos == 0) {
ctr.push_back(value);
value = val[i];
} else {
value |= static_cast<COUNTER_BASE>(val[i]) << pos*CHAR_BIT;
}
}
ctr.push_back(value);
}
inline bool decrementAndTest(COUNTER &ctr) {
// decrement value in ctr and return true if old value was not all zeros
int i;
for (i = 0; i < ctr.size() && !(ctr[i]--); i++) ; // EMPTY BODY
return i < ctr.size();
}
inline bool decrementAndTest2(COUNTER_BASE *ctr, const size_t size) {
// decrement value in ctr and return true if old value was not all zeros
int i;
for (i = 0; i < size && !(ctr[i]--); i++) ; // EMPTY BODY
return i < size;
}
/* Vector counter - uses first arg (if supplied) as the count */
int main(int argc, const char *argv[]) {
BYTEVEC limit = Cstr2Bytevec(argc > 1? argv[1] : "0");
COUNTER ctr;
packCounter(ctr, limit);
COUNTER_BASE *ctr_vals = ctr.size() > 0 ? &ctr[0] : NULL;
size_t ctr_size = ctr.size();
unsigned long ul_counter = 0ul; /* to give loop something to do */
while(decrementAndTest2(ctr_vals, ctr_size)) {
ul_counter++;
};
printf("With %d bytes used, final ul_counter value = %lu\n", limit.size(), ul_counter);
return 0;
}
Examples of use:
$ time ./bigcounter 5
With 1 bytes used, final ul_counter value = 5
real 0m0.094s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 5,000
With 2 bytes used, final ul_counter value = 5000
real 0m0.062s
user 0m0.015s
sys 0m0.062s
$ time ./bigcounter 5,000,000
With 3 bytes used, final ul_counter value = 5000000
real 0m0.093s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 1,000,000,000
With 4 bytes used, final ul_counter value = 1000000000
real 0m2.688s
user 0m0.015s
sys 0m0.015s
$ time ./bigcounter 2,000,000,000
With 4 bytes used, final ul_counter value = 2000000000
real 0m5.125s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 3,000,000,000
With 4 bytes used, final ul_counter value = 3000000000
real 0m7.485s
user 0m0.031s
sys 0m0.047s
$ time ./bigcounter 4,000,000,000
With 4 bytes used, final ul_counter value = 4000000000
real 0m9.875s
user 0m0.015s
sys 0m0.046s
$ time ./bigcounter 5,000,000,000
With 5 bytes used, final ul_counter value = 705032704
real 0m12.594s
user 0m0.046s
sys 0m0.015s
$ time ./bigcounter 6,000,000,000
With 5 bytes used, final ul_counter value = 1705032704
real 0m14.813s
user 0m0.015s
sys 0m0.062s
Unwrapping the counter vector into C-style data structures (i.e., using decrementAndTest2 instead of decrementAndTest) sped things up by around 20-25%, but the code is still about twice as slow as my previous C program for similar-sized examples (around 4 billion). This is with MS Visual C++ 6.0 as the compiler in release mode, optimizing for speed, on a 2GHz dual-core system, for both programs. Inlining the decrementAndTest2 function definitely makes a big difference (around 12 sec. vs. 30 for the 5 billion loop), but I'll have to see whether physically inlining the code as I did in the C program can get similar performance.

the variable in main function can Store even 100 factorial
#include <iostream>
#include <cstdio>
#include <vector>
#include <cstring>
#include <string>
#include <map>
#include <functional>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <stack>
#include <queue>
#include <deque>
#include <limits>
#include <cmath>
#include <numeric>
#include <set>
using namespace std;
//template for BIGINIT
// base and base_digits must be consistent
const int base = 10;
const int base_digits = 1;
struct bigint {
vector<int> a;
int sign;
bigint() :
sign(1) {
}
bigint(long long v) {
*this = v;
}
bigint(const string &s) {
read(s);
}
void operator=(const bigint &v) {
sign = v.sign;
a = v.a;
}
void operator=(long long v) {
sign = 1;
if (v < 0)
sign = -1, v = -v;
for (; v > 0; v = v / base)
a.push_back(v % base);
}
bigint operator+(const bigint &v) const {
if (sign == v.sign) {
bigint res = v;
for (int i = 0, carry = 0; i < (int) max(a.size(), v.a.size()) || carry; ++i) {
if (i == (int) res.a.size())
res.a.push_back(0);
res.a[i] += carry + (i < (int) a.size() ? a[i] : 0);
carry = res.a[i] >= base;
if (carry)
res.a[i] -= base;
}
return res;
}
return *this - (-v);
}
bigint operator-(const bigint &v) const {
if (sign == v.sign) {
if (abs() >= v.abs()) {
bigint res = *this;
for (int i = 0, carry = 0; i < (int) v.a.size() || carry; ++i) {
res.a[i] -= carry + (i < (int) v.a.size() ? v.a[i] : 0);
carry = res.a[i] < 0;
if (carry)
res.a[i] += base;
}
res.trim();
return res;
}
return -(v - *this);
}
return *this + (-v);
}
void operator*=(int v) {
if (v < 0)
sign = -sign, v = -v;
for (int i = 0, carry = 0; i < (int) a.size() || carry; ++i) {
if (i == (int) a.size())
a.push_back(0);
long long cur = a[i] * (long long) v + carry;
carry = (int) (cur / base);
a[i] = (int) (cur % base);
//asm("divl %%ecx" : "=a"(carry), "=d"(a[i]) : "A"(cur), "c"(base));
}
trim();
}
bigint operator*(int v) const {
bigint res = *this;
res *= v;
return res;
}
friend pair<bigint, bigint> divmod(const bigint &a1, const bigint &b1) {
int norm = base / (b1.a.back() + 1);
bigint a = a1.abs() * norm;
bigint b = b1.abs() * norm;
bigint q, r;
q.a.resize(a.a.size());
for (int i = a.a.size() - 1; i >= 0; i--) {
r *= base;
r += a.a[i];
int s1 = r.a.size() <= b.a.size() ? 0 : r.a[b.a.size()];
int s2 = r.a.size() <= b.a.size() - 1 ? 0 : r.a[b.a.size() - 1];
int d = ((long long) base * s1 + s2) / b.a.back();
r -= b * d;
while (r < 0)
r += b, --d;
q.a[i] = d;
}
q.sign = a1.sign * b1.sign;
r.sign = a1.sign;
q.trim();
r.trim();
return make_pair(q, r / norm);
}
bigint operator/(const bigint &v) const {
return divmod(*this, v).first;
}
bigint operator%(const bigint &v) const {
return divmod(*this, v).second;
}
void operator/=(int v) {
if (v < 0)
sign = -sign, v = -v;
for (int i = (int) a.size() - 1, rem = 0; i >= 0; --i) {
long long cur = a[i] + rem * (long long) base;
a[i] = (int) (cur / v);
rem = (int) (cur % v);
}
trim();
}
bigint operator/(int v) const {
bigint res = *this;
res /= v;
return res;
}
int operator%(int v) const {
if (v < 0)
v = -v;
int m = 0;
for (int i = a.size() - 1; i >= 0; --i)
m = (a[i] + m * (long long) base) % v;
return m * sign;
}
void operator+=(const bigint &v) {
*this = *this + v;
}
void operator-=(const bigint &v) {
*this = *this - v;
}
void operator*=(const bigint &v) {
*this = *this * v;
}
void operator/=(const bigint &v) {
*this = *this / v;
}
bool operator<(const bigint &v) const {
if (sign != v.sign)
return sign < v.sign;
if (a.size() != v.a.size())
return a.size() * sign < v.a.size() * v.sign;
for (int i = a.size() - 1; i >= 0; i--)
if (a[i] != v.a[i])
return a[i] * sign < v.a[i] * sign;
return false;
}
bool operator>(const bigint &v) const {
return v < *this;
}
bool operator<=(const bigint &v) const {
return !(v < *this);
}
bool operator>=(const bigint &v) const {
return !(*this < v);
}
bool operator==(const bigint &v) const {
return !(*this < v) && !(v < *this);
}
bool operator!=(const bigint &v) const {
return *this < v || v < *this;
}
void trim() {
while (!a.empty() && !a.back())
a.pop_back();
if (a.empty())
sign = 1;
}
bool isZero() const {
return a.empty() || (a.size() == 1 && !a[0]);
}
bigint operator-() const {
bigint res = *this;
res.sign = -sign;
return res;
}
bigint abs() const {
bigint res = *this;
res.sign *= res.sign;
return res;
}
long long longValue() const {
long long res = 0;
for (int i = a.size() - 1; i >= 0; i--)
res = res * base + a[i];
return res * sign;
}
friend bigint gcd(const bigint &a, const bigint &b) {
return b.isZero() ? a : gcd(b, a % b);
}
friend bigint lcm(const bigint &a, const bigint &b) {
return a / gcd(a, b) * b;
}
void read(const string &s) {
sign = 1;
a.clear();
int pos = 0;
while (pos < (int) s.size() && (s[pos] == '-' || s[pos] == '+')) {
if (s[pos] == '-')
sign = -sign;
++pos;
}
for (int i = s.size() - 1; i >= pos; i -= base_digits) {
int x = 0;
for (int j = max(pos, i - base_digits + 1); j <= i; j++)
x = x * 10 + s[j] - '0';
a.push_back(x);
}
trim();
}
friend istream& operator>>(istream &stream, bigint &v) {
string s;
stream >> s;
v.read(s);
return stream;
}
friend ostream& operator<<(ostream &stream, const bigint &v) {
if (v.sign == -1)
stream << '-';
stream << (v.a.empty() ? 0 : v.a.back());
for (int i = (int) v.a.size() - 2; i >= 0; --i)
stream << setw(base_digits) << setfill('0') << v.a[i];
return stream;
}
static vector<int> convert_base(const vector<int> &a, int old_digits, int new_digits) {
vector<long long> p(max(old_digits, new_digits) + 1);
p[0] = 1;
for (int i = 1; i < (int) p.size(); i++)
p[i] = p[i - 1] * 10;
vector<int> res;
long long cur = 0;
int cur_digits = 0;
for (int i = 0; i < (int) a.size(); i++) {
cur += a[i] * p[cur_digits];
cur_digits += old_digits;
while (cur_digits >= new_digits) {
res.push_back(int(cur % p[new_digits]));
cur /= p[new_digits];
cur_digits -= new_digits;
}
}
res.push_back((int) cur);
while (!res.empty() && !res.back())
res.pop_back();
return res;
}
typedef vector<long long> vll;
static vll karatsubaMultiply(const vll &a, const vll &b) {
int n = a.size();
vll res(n + n);
if (n <= 32) {
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
res[i + j] += a[i] * b[j];
return res;
}
int k = n >> 1;
vll a1(a.begin(), a.begin() + k);
vll a2(a.begin() + k, a.end());
vll b1(b.begin(), b.begin() + k);
vll b2(b.begin() + k, b.end());
vll a1b1 = karatsubaMultiply(a1, b1);
vll a2b2 = karatsubaMultiply(a2, b2);
for (int i = 0; i < k; i++)
a2[i] += a1[i];
for (int i = 0; i < k; i++)
b2[i] += b1[i];
vll r = karatsubaMultiply(a2, b2);
for (int i = 0; i < (int) a1b1.size(); i++)
r[i] -= a1b1[i];
for (int i = 0; i < (int) a2b2.size(); i++)
r[i] -= a2b2[i];
for (int i = 0; i < (int) r.size(); i++)
res[i + k] += r[i];
for (int i = 0; i < (int) a1b1.size(); i++)
res[i] += a1b1[i];
for (int i = 0; i < (int) a2b2.size(); i++)
res[i + n] += a2b2[i];
return res;
}
bigint operator*(const bigint &v) const {
vector<int> a6 = convert_base(this->a, base_digits, 6);
vector<int> b6 = convert_base(v.a, base_digits, 6);
vll a(a6.begin(), a6.end());
vll b(b6.begin(), b6.end());
while (a.size() < b.size())
a.push_back(0);
while (b.size() < a.size())
b.push_back(0);
while (a.size() & (a.size() - 1))
a.push_back(0), b.push_back(0);
vll c = karatsubaMultiply(a, b);
bigint res;
res.sign = sign * v.sign;
for (int i = 0, carry = 0; i < (int) c.size(); i++) {
long long cur = c[i] + carry;
res.a.push_back((int) (cur % 1000000));
carry = (int) (cur / 1000000);
}
res.a = convert_base(res.a, 6, base_digits);
res.trim();
return res;
}
};
//use : bigint var;
//template for biginit over
int main()
{
bigint var=10909000890789;
cout<<var;
return 0;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Drastic difference in GCC and Clang code performance - c++

Related

Question about Dynamic memory allocation in C++ for 3D array

Finding the total area and cover area of multiple polygon

How to optimize my C++ OpenMp Matrix Multiplication code

BigInt multiplication and to_string implementation outputs too many zeros

Storing a Big Number in a Variable and Looping

Categories

Resources