I am tasked with implementing, from scratch, addition and subtraction of signed integers of arbitrary size. Such an integer is stored in an array of 64-bit unsigned integers. The least significant bit of the array's first element is the least significant bit of the whole integer. An array of size d represents a signed integer of at most 64 * d - 1 bits. The format of the integer must not be changed.
I came up with the following:
// Add huge integers: z[] = x[] + y[]
template<typename ing>
inline void addHint(uint64_t *z, uint64_t *x, uint64_t *y, ing d)
{
    uint64_t carry = 0;
    for (ing i = 0; i < d; ++i)
    {
        uint64_t zi = x[i] + y[i];
        // uint64_t carryNew = zi < x[i] or zi < y[i];
        uint64_t carryNew = zi < x[i]; // zi < y[i] is unnecessary.
        z[i] = zi + carry;
        carry = carryNew or z[i] < zi;
    }
}
// Subtract huge integers: x[] = z[] - y[]
template<typename ing>
inline void subHint(uint64_t *x, uint64_t *z, uint64_t *y, ing d)
{
    uint64_t carry = 0;
    for (ing i = 0; i < d; ++i)
    {
        uint64_t xi = z[i] - y[i];
        // uint64_t carryNew = z[i] < y[i];
        uint64_t carryNew = z[i] < xi; // Somehow x86-64 g++ 8.3 -O2 emits fewer assembly lines than the line above, according to godbolt.org.
        x[i] = xi - carry;
        carry = carryNew or xi < x[i];
    }
}
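For reference, a minimal sanity check (values chosen by me to exercise carry and borrow propagation across limbs; not part of the routines themselves):
#include <cstdint>
#include <cassert>

int main()
{
    uint64_t x[2] = { ~0ull, 0 }; // 2^64 - 1
    uint64_t y[2] = { 1, 0 };     // 1
    uint64_t z[2], w[2];
    addHint(z, x, y, 2);          // (2^64 - 1) + 1 = 2^64: carry must ripple into the second limb
    assert(z[0] == 0 && z[1] == 1);
    subHint(w, z, y, 2);          // 2^64 - 1 again: borrow must ripple back
    assert(w[0] == ~0ull && w[1] == 0);
    return 0;
}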
My team is unsatisfied with the speed. Can it be improved?
Thanks!
mp_limb_t mpn_add_n (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, mp_size_t n)
and
mp_limb_t mpn_sub_n (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, mp_size_t n)
in GMP are what you are looking for.
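A minimal usage sketch (my example; it assumes GMP is installed, mp_limb_t is 64 bits on the target, and you link with -lgmp). Note that the mpn functions use the same least-significant-limb-first layout as your arrays:
#include <gmp.h>
#include <cstdio>

int main()
{
    mp_limb_t x[2] = { ~(mp_limb_t)0, 0 }; // 2^64 - 1, low limb first
    mp_limb_t y[2] = { 1, 0 };             // 1
    mp_limb_t z[2];
    mp_limb_t carry = mpn_add_n(z, x, y, 2); // returns the carry out of the top limb
    std::printf("z = { %llu, %llu }, carry = %llu\n",
                (unsigned long long)z[0], (unsigned long long)z[1],
                (unsigned long long)carry);
    return 0;
}
Since addition and subtraction are performed modulo 2^(64*d), the low d limbs of the result are the same for unsigned and two's-complement signed operands, so your fixed format does not need to change.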
Regarding the code shown below:
#include <cmath>
int main()
{
    const int n = 10000;
    const int K = 10;
    double* matrix = new double[n * n](); // value-initialized to zero, since it is accumulated with +=
    for(int k = 0; k < K; ++k) {
        for(int j = 0; j < n; ++j) {
            for(int i = 0; i < n; ++i) {
                double ai = (double)i/double(n);
                double aj = (double)j/double(n);
                matrix[i * n + j] += pow(n, (double)k / K) / exp((double)k / K) * pow(sin(ai),2) * pow(sin(aj),2);
            }
        }
    }
    delete[] matrix;
}
Are the lines
double ai = (double)i/double(n);
double aj = (double)j/double(n);
written this way because we want floating-point division as opposed to integer division?
In addition, why are the operands cast the way they are, i.e. (double)i/double(n) instead of double(i)/double(n)?
Yes. If i and n are two integers as follows:
int i = ...;
int n = ...;
double ai = i/n;
then this is an integer division. Say i = 5 and n = 9: even though ai is a double, it will end up as 0, because i/n is evaluated in integer arithmetic before the assignment.
You can cast either i or n to tell the compiler that you want a floating-point division.
Yes. Actually, it would be enough to cast just one of the operands, but some programmers prefer to cast all the operands for consistency and clarity. As for (double)i versus double(i): for built-in types the C-style cast and the function-style cast mean exactly the same thing, so mixing the two spellings is purely a matter of style.
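For illustration (my example), all of the following produce the same value:
int i = 5, n = 9;
double a1 = (double)i / (double)n; // C-style casts on both operands
double a2 = double(i) / double(n); // function-style casts, identical meaning
double a3 = (double)i / n;         // casting one operand is enough; n is converted implicitly
// a1 == a2 == a3 == 0.5555...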
I have a problem with the NIST/Diehard Binary Matrix test. It's about dividing a binary sequence into 32x32 matrices and calculating their ranks. After calculating the ranks I need to compute a chi^2 value and then a p-value (which must be between 0 and 1). I'm getting an extremely small p-value even for a random sequence.
I've hardcoded some small examples and got the p-value right for those, so I think my problem is in reading the binary sequence file and extracting bits from it.
This is reading from a file and converting it to a bit sequence:
ifstream fin("seq1.bin", ios::binary);
fin.seekg(0, ios::end);
int n = fin.tellg();
char *buf = new char[n];
fin.seekg(0, ios::beg);
fin.read(buf, n);
n *= 8;
bool *s = new bool[n];
for (int i = 0; i < n / 8; i++) {
    for (int j = 7; j >= 0; j--) {
        s[i * 8 + 7 - j] = (bool)((buf[i] >> j) & 1);
    }
}
Then I form my matrices and calculate their ranks:
int *ranks = new int[N];
for (int i = 0; i < N; i++) {
    bool *arr = new bool[m * q];
    copy(s + i * m * q, s + i * m * q + m * q, arr);
    ranks[i] = binary_rank(arr, m, q);
    delete[] arr; // assuming binary_rank does not keep the pointer
}
Counting occurrences in ranks:
int count_occurrences(int arr[], int n, int x) {
    int result = 0;
    for (int i = 0; i < n; i++)
        if (x == arr[i])
            result++;
    return result;
}
Calculating chi^2 and the p-value:
double calculate_xi(int fm, int fm_1, int remaining, int N) {
    double N1 = 0.2888 * N; // expected count of full-rank matrices
    double N2 = 0.5776 * N; // expected count of rank M-1 matrices
    double N3 = 0.1336 * N; // expected count of all lower ranks
    double x1 = (fm - N1) * (fm - N1) / N1;
    double x2 = (fm_1 - N2) * (fm_1 - N2) / N2;
    double x3 = (remaining - N3) * (remaining - N3) / N3;
    return x1 + x2 + x3;
}
double calculate_pvalue(double xi2) {
    return exp(-(xi2 / 2));
}
I expect a p-value between 0 and 1, but I'm getting 0 every time. It's caused by the extremely big chi^2 value, and I can't find what I've done wrong. Could you please help me get things right?
For this part:
for (int i = 0; i < n / 8; i++) {
    for (int j = 7; j >= 0; j--) {
        s[i * 8 + 7 - j] = (bool)((buf[i] >> j) & 1);
    }
}
when you fill the s array, it looks like you reverse the bit order inside each byte: since the shift starts at 7, the most significant bit of buf[i] is stored at position 0 of that byte's slice of s, so the bits inside each byte end up in the opposite order. It is easy to verify with a debugger, though, as it is not obvious from the code. Thanks.
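If the test actually expects the opposite (least-significant-bit-first) order, a minimal fix could look like this (my sketch; which convention is correct depends on the test specification):
for (int i = 0; i < n / 8; i++) {
    for (int j = 0; j < 8; j++) {
        s[i * 8 + j] = (bool)((buf[i] >> j) & 1); // bit 0 (LSB) of each byte comes first
    }
}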
I have a function:
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    for (size_t i = 0; i < size; ++i)
        c[i] = (a[i] * b[i]) % p;
}
This function performs many modular multiplications on arrays of integers.
All integers are positive.
And I need to improve its performance.
I thought about SSE and AVX, but they don't have an operation to vectorize modular multiplication.
Or maybe I'm wrong?
Does anybody know of a way to solve this problem?
First, I want to note that the modulo operation can be expressed using floating-point numbers:
d % p = d - int(float(d)/float(p))*p.
Although the right-hand side involves more operations than the left, it is preferable because it can be vectorized with SSE/AVX.
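As a scalar sketch of this identity (my example; note that float has a 24-bit mantissa, so for values above about 2^24 the quotient may round incorrectly):
int modViaFloat(int d, int p)
{
    // The (int) cast truncates toward zero, matching C's integer-division semantics.
    return d - (int)((float)d / (float)p) * p;
}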
An implementation with SSE4.1 for 32x32 => 32-bit integer multiplication. Note that the conversion from floating point back to integer is done with round-to-nearest; use truncation toward zero (_mm_cvttps_epi32) if you want semantics like C float-to-integer conversion.
#include <smmintrin.h> // SSE4.1

void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    __m128 _k = _mm_set1_ps(1.0f / p);
    __m128i _p = _mm_set1_epi32(p);
    for (size_t i = 0; i < size; i += 4) // assumes size is a multiple of 4
    {
        __m128i _a = _mm_loadu_si128((__m128i*)(a + i));
        __m128i _b = _mm_loadu_si128((__m128i*)(b + i));
        __m128i _d = _mm_mullo_epi32(_a, _b);
        __m128i _e = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p))
        __m128i _c = _mm_sub_epi32(_d, _mm_mullo_epi32(_e, _p));
        _mm_storeu_si128((__m128i*)(c + i), _c);
    }
}
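Note that this sketch assumes size is a multiple of 4; leftover elements need a scalar tail (my addition, not in the original answer):
for (size_t i = size / 4 * 4; i < size; ++i)
    c[i] = (a[i] * b[i]) % p;
The same applies to the AVX2 version below with multiples of 8.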
An implementation with AVX2 (the 256-bit integer operations such as _mm256_mullo_epi32 require AVX2, not just AVX):
#include <immintrin.h> // AVX2

void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    __m256 _k = _mm256_set1_ps(1.0f / p);
    __m256i _p = _mm256_set1_epi32(p);
    for (size_t i = 0; i < size; i += 8) // assumes size is a multiple of 8
    {
        __m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
        __m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
        __m256i _d = _mm256_mullo_epi32(_a, _b);
        __m256i _e = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p))
        __m256i _c = _mm256_sub_epi32(_d, _mm256_mullo_epi32(_e, _p));
        _mm256_storeu_si256((__m256i*)(c + i), _c);
    }
}
Actually, there is an intrinsic that performs this operation:
_mm256_irem_epi32
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_irem_epi32
Note that it is an SVML function provided by the Intel compiler, not a single hardware instruction.
I am not familiar with x86_64 intrinsics; I'd like to implement the following operation using 256-bit vector registers.
I was using _mm256_maddubs_epi16(a, b); however, it seems that this instruction has an overflow issue, since the sum of two adjacent char*char products can exceed the 16-bit maximum value (the result saturates). I also have trouble understanding _mm256_unpackhi_epi32 and related instructions.
Can anyone elaborate and point me in the right direction? Thank you!
int sumup_char_arrays(char *A, char *B, int size) {
    assert(size % 32 == 0);
    int sum = 0;
    for (int i = 0; i < size; i++) {
        sum += A[i] * B[i];
    }
    return sum;
}
I've figured out a solution; any ideas to improve it, especially the final reduction stage?
#include <immintrin.h>
#include <assert.h>

int sumup_char_arrays(char *A, char *B, int size) {
    assert(size % 32 == 0);
    int sum = 0;
    __m256i sum_tmp = _mm256_setzero_si256(); // the accumulator must start at zero
    for (int i = 0; i < size; i += 32) {
        // Sign-extend 16 chars at a time to 16-bit lanes (note the + i, which was missing).
        __m256i ma_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(A + i)));
        __m256i ma_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(A + i + 16)));
        __m256i mb_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(B + i)));
        __m256i mb_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(B + i + 16)));
        __m256i mc = _mm256_madd_epi16(ma_l, mb_l); // pairwise 16x16 products summed into 32-bit lanes
        mc = _mm256_add_epi32(mc, _mm256_madd_epi16(ma_h, mb_h));
        sum_tmp = _mm256_add_epi32(mc, sum_tmp);
        //sum += A[i]*B[i];
    }
    // Horizontal reduction of the eight 32-bit partial sums.
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_permute2x128_si256(sum_tmp, sum_tmp, 0x81));
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_srli_si256(sum_tmp, 8));
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_srli_si256(sum_tmp, 4));
    sum = _mm256_extract_epi32(sum_tmp, 0);
    return sum;
}
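One possible improvement for the final stage (my suggestion, not part of the original post): fold the 256-bit vector to 128 bits first, then keep the rest of the reduction in cheap 128-bit shuffles:
static inline int hsum_epi32(__m256i v)
{
    __m128i lo = _mm256_castsi256_si128(v);           // lower 128 bits
    __m128i hi = _mm256_extracti128_si256(v, 1);      // upper 128 bits
    __m128i s = _mm_add_epi32(lo, hi);                // four 32-bit partial sums
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E)); // swap 64-bit halves and add
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xB1)); // swap adjacent 32-bit lanes and add
    return _mm_cvtsi128_si32(s);                      // extract the final sum
}
With this helper, the last four lines of the function become sum = hsum_epi32(sum_tmp);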
I need to compare a large number of similar images of small size (up to 200x200).
So I am trying to implement the SSIM algorithm (structural similarity, see https://en.wikipedia.org/wiki/Structural_similarity ).
SSIM requires calculating the covariance of two 8-bit gray images.
A trivial implementation looks like:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    float sum = 0;
    for (size_t i = 0; i < size; ++i)
        sum += (x[i] - averageX) * (y[i] - averageY);
    return sum / size;
}
But it has poor performance.
So I hope to improve it using SIMD or CUDA (I heard that this can be done).
Unfortunately, I have no experience with either.
What would that look like, and where should I start?
I have another nice solution!
First, let me mention some mathematical identities:
averageX = Sum(x[i])/size;
averageY = Sum(y[i])/size;
And therefore:
Sum((x[i] - averageX)*(y[i] - averageY))/size
  = Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size - Sum(averageX*y[i])/size + Sum(averageX*averageY)/size
  = Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size - averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size
  = Sum(x[i]*y[i])/size - averageY*averageX - averageX*averageY + averageX*averageY
  = Sum(x[i]*y[i])/size - averageX*averageY;
This allows us to modify the algorithm:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    uint32_t sum = 0; // If the images are bigger than 256x256, use uint64_t instead.
    for (size_t i = 0; i < size; ++i)
        sum += x[i] * y[i];
    return (float)sum / size - averageX * averageY; // cast before dividing to avoid integer division
}
And only then can we use SIMD (I used SSE2):
#include <emmintrin.h>

inline __m128i SigmaXY(__m128i x, __m128i y)
{
    // Zero-extend bytes to 16 bits and multiply-accumulate pairs into 32-bit sums.
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(x, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
    return _mm_add_epi32(lo, hi);
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    uint32_t sum = 0;
    size_t i = 0, alignedSize = size/16*16;
    if (size >= 16)
    {
        __m128i sums = _mm_setzero_si128();
        for (; i < alignedSize; i += 16)
        {
            __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
            __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
            sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
        }
        uint32_t _sums[4];
        _mm_storeu_si128((__m128i*)_sums, sums);
        sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
    }
    for (; i < size; ++i)
        sum += x[i] * y[i];
    return (float)sum / size - averageX * averageY;
}
There is a SIMD implementation of the algorithm (I used SSE4.1):
#include <smmintrin.h>

template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, const __m128 & averageX, const __m128 & averageY)
{
    // Take 4 bytes starting at 'shift', zero-extend to 32 bits, and convert to float.
    __m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
    __m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
    return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY));
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    float sum = 0;
    size_t i = 0, alignedSize = size/16*16;
    if (size >= 16)
    {
        __m128 sums = _mm_setzero_ps();
        __m128 avgX = _mm_set1_ps(averageX);
        __m128 avgY = _mm_set1_ps(averageY);
        for (; i < alignedSize; i += 16)
        {
            __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
            __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
            sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY));
        }
        float _sums[4];
        _mm_storeu_ps(_sums, sums);
        sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
    }
    for (; i < size; ++i)
        sum += (x[i] - averageX) * (y[i] - averageY);
    return sum / size;
}
I hope it will be useful for you.