Simple SSE loop slower than non-SSE version - c++

I am trying to compare SSE float[4] addition to standard float[4] addition. As a demo I compute the sum of the summed components, with and without SSE:
#include <iostream>
#include <vector>
struct Point4
data[0] = 0;
data[1] = 0;
data[2] = 0;
data[3] = 0;
float data[4];
void Standard()
Point4 a;[0] = 1.0f;[1] = 2.0f;[2] = 3.0f;[3] = 4.0f;
Point4 b;[0] = 1.0f;[1] = 6.0f;[2] = 3.0f;[3] = 5.0f;
float total = 0.0f;
for(unsigned int i = 0; i < 1e9; ++i)
for(unsigned int component = 0; component < 4; ++component)
total +=[component] +[component];
std::cout << "total: " << total << std::endl;
void Vectorized()
typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
v4sf a;
float* aPointer = (float*)&a;
aPointer[0] = 1.0f; aPointer[1] = 2.0f; aPointer[2] = 3.0f; aPointer[3] = 4.0f;
v4sf b;
float* bPointer = (float*)&b;
bPointer[0] = 1.0f; bPointer[1] = 6.0f; bPointer[2] = 3.0f; bPointer[3] = 5.0f;
v4sf result;
float* resultPointer = (float*)&result;
resultPointer[0] = 0.0f;
resultPointer[1] = 0.0f;
resultPointer[2] = 0.0f;
resultPointer[3] = 0.0f;
for(unsigned int i = 0; i < 1e9; ++i)
result += a + b; // Vectorized operation
// Sum the components of the result (this is done with the "total += " in the Standard() loop
float total = 0.0f;
for(unsigned int component = 0; component < 4; ++component)
total += resultPointer[component];
std::cout << "total: " << total << std::endl;
int main()
// Standard();
return 0;
However, the code seems to be faster (~.2 seconds) with the standard method than with the vectorized (~.4 seconds) method. Is it because of the for loop to sum the v4sf values? Is there a better operation I can use to time the difference between these two techniques and still compare the output to make sure there were no differences between the two?

The reason your version is slower with SSE is that you have to unpack from an SSE register to a scalar register 4 times every iteration, which costs more than what you gain from the vectorized addition. Look at the disassembly and you should get a clearer picture.
I think what you want to do is the following (which is faster with SSE):
for(unsigned int i = 0; i < 1e6; ++i)
result += a + b; // Vectorized operation
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
total += resultPointer[component];
Also the following might be even faster:
for(unsigned int i = 0; i < 1e6/4; ++i)
result0 += a + b; // Vectorized operation
result1 += a + b; // Vectorized operation
result2 += a + b; // Vectorized operation
result3 += a + b; // Vectorized operation
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
total += resultPointer0[component];
total += resultPointer1[component];
total += resultPointer2[component];
total += resultPointer3[component];


Compiling c++ OpenACC parallel CPU code using GCC (G++)

When trying to compile OpenACC code with GCC-9.3.0 (g++) configured with --enable-languages=c,c++,lto --disable-multilib the following code does not use multiple cores, whereas if the same code is compiled with the pgc++ compiler it does use multiple cores.
g++ compilation: g++ -lgomp -Ofast -o jsolve -fopenacc jsolvec.cpp
pgc++ compilation: pgc++ -o jsolvec.exe jsolvec.cpp -fast -Minfo=opt -ta=multicore
Code from OpenACC Tutorial1/solver
// Jacobi iterative method for solving a system of linear equations
// This is guaranteed to converge if the matrix is diagonally dominant,
// so we artificially force the matrix to be diagonally dominant.
// See
// We solve for vector x in Ax = b
// Rewrite the matrix A as a
// lower triangular (L),
// upper triangular (U),
// and diagonal matrix (D).
// Ax = (L + D + U)x = b
// rearrange to get: Dx = b - (L+U)x --> x = (b-(L+U)x)/D
// we can do this iteratively: x_new = (b-(L+U)x_old)/D
// build with TYPE=double (default) or TYPE=float
// build with TOLERANCE=0.001 (default) or TOLERANCE= any other value
// three arguments:
// vector size
// maximum iteration count
// frequency of printing the residual (every n-th iteration)
#include <cmath>
#include <omp.h>
#include <cstdlib>
#include <iostream>
#include <iomanip>
using std::cout;
#ifndef TYPE
#define TYPE double
#define TOLERANCE 0.001
init_simple_diag_dom(int nsize, TYPE* A)
int i, j;
// In a diagonally-dominant matrix, the diagonal element
// is greater than the sum of the other elements in the row.
// Scale the matrix so the sum of the row elements is close to one.
for (i = 0; i < nsize; ++i) {
TYPE sum;
sum = (TYPE)0;
for (j = 0; j < nsize; ++j) {
x = (rand() % 23) / (TYPE)1000;
A[i*nsize + j] = x;
sum += x;
// Fill diagonal element with the sum
A[i*nsize + i] += sum;
// scale the row so the final matrix is almost an identity matrix
for (j = 0; j < nsize; j++)
A[i*nsize + j] /= sum;
} // init_simple_diag_dom
main(int argc, char **argv)
int nsize; // A[nsize][nsize]
int i, j, iters, max_iters, riter;
double start_time, elapsed_time;
TYPE residual, err, chksum;
TYPE *A, *b, *x1, *x2, *xnew, *xold, *xtmp;
// set matrix dimensions and allocate memory for matrices
nsize = 0;
if (argc > 1)
nsize = atoi(argv[1]);
if (nsize <= 0)
nsize = 1000;
max_iters = 0;
if (argc > 2)
max_iters = atoi(argv[2]);
if (max_iters <= 0)
max_iters = 5000;
riter = 0;
if (argc > 3)
riter = atoi(argv[3]);
if (riter <= 0)
riter = 200;
cout << "nsize = " << nsize << ", max_iters = " << max_iters << "\n";
A = new TYPE[nsize*nsize];
b = new TYPE[nsize];
x1 = new TYPE[nsize];
x2 = new TYPE[nsize];
// generate a diagonally dominant matrix
init_simple_diag_dom(nsize, A);
// zero the x vectors, random values to the b vector
for (i = 0; i < nsize; i++) {
x1[i] = (TYPE)0.0;
x2[i] = (TYPE)0.0;
b[i] = (TYPE)(rand() % 51) / 100.0;
start_time = omp_get_wtime();
// jacobi iterative solver
residual = TOLERANCE + 1.0;
iters = 0;
xnew = x1; // swap these pointers in each iteration
xold = x2;
while ((residual > TOLERANCE) && (iters < max_iters)) {
// swap input and output vectors
xtmp = xnew;
xnew = xold;
xold = xtmp;
#pragma acc parallel loop
for (i = 0; i < nsize; ++i) {
TYPE rsum = (TYPE)0;
#pragma acc loop reduction(+:rsum)
for (j = 0; j < nsize; ++j) {
if (i != j) rsum += A[i*nsize + j] * xold[j];
xnew[i] = (b[i] - rsum) / A[i*nsize + i];
// test convergence, sqrt(sum((xnew-xold)**2))
residual = 0.0;
#pragma acc parallel loop reduction(+:residual)
for (i = 0; i < nsize; i++) {
TYPE dif;
dif = xnew[i] - xold[i];
residual += dif * dif;
residual = sqrt((double)residual);
if (iters % riter == 0 ) cout << "Iteration " << iters << ", residual is " << residual << "\n";
elapsed_time = omp_get_wtime() - start_time;
cout << "\nConverged after " << iters << " iterations and " << elapsed_time << " seconds, residual is " << residual << "\n";
// test answer by multiplying my computed value of x by
// the input A matrix and comparing the result with the
// input b vector.
err = (TYPE)0.0;
chksum = (TYPE)0.0;
for (i = 0; i < nsize; i++) {
TYPE tmp;
xold[i] = (TYPE)0.0;
for (j = 0; j < nsize; j++)
xold[i] += A[i*nsize + j] * xnew[j];
tmp = xold[i] - b[i];
chksum += xnew[i];
err += tmp * tmp;
err = sqrt((double)err);
cout << "Solution error is " << err << "\n";
if (err > TOLERANCE)
cout << "****** Final Solution Out of Tolerance ******\n" << err << " > " << TOLERANCE << "\n";
// A, b, x1 and x2 were allocated with new TYPE[...], so they must be
// released with the array form delete[]; plain delete on them is undefined behaviour.
delete[] A;
delete[] b;
delete[] x1;
delete[] x2;
return 0;
GCC does not yet support using OpenACC to schedule parallel loops onto multicore CPUs. Using OpenMP works for that, of course, and you can have code with mixed OpenACC directives (for GPU offloading, as already present in your code) and OpenMP directives (for CPU parallelization, not yet present in your code), so that the respective mechanism is used depending on whether you compile with -fopenacc or -fopenmp.
As PGI's compiler shows, this can certainly be supported, and we'll certainly be able to implement it in GCC too, but that work has not yet been scheduled or funded.

Gradient descent converging towards the wrong value

I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far :
#include <iostream>
double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46 };
double m, p;
int n = sizeof(X)/sizeof(X[0]);
int main(void) {
double alpha = 0.00004; // 0.00007;
m = (Y[1] - Y[0]) / (X[1] - X[0]);
p = Y[0] - m * X[0];
for (int i = 1; i <= 8; i++) {
return 0;
// Returns the halved mean squared error of the line y = m * x + p over the
// n samples stored in the globals X and Y:
//   sum((Y[i] - m * X[i] - p)^2) / 2 / n
double Loss_function(void) {
double res = 0;
double tmp;
for (int i = 0; i < n; i++) {
// residual of sample i under the current m and p
tmp = Y[i] - m * X[i] - p;
res += tmp * tmp;
return res / 2.0 / (double)n;
// Performs one update of the globals p and m.
// pg accumulates the residuals (Y[i] - m * X[i] - p) and mg accumulates the
// same residuals weighted by X[i], over all n samples; p and m are then
// moved by alpha times the per-sample average of the respective sum.
// alpha: step size applied to both updates.
void gradientStep(double alpha) {
double pg = 0, mg = 0;
for (int i = 0; i < n; i++) {
pg += Y[i] - m * X[i] - p;
mg += X[i] * (Y[i] - m * X[i] - p);
p += alpha * pg / n;
m += alpha * mg / n;
This code converges towards m = 2.79822, p = -382.666, and an error of 102.88. But if I use my calculator to find out the correct linear regression model, I find that the correct values of m and p should respectively be 1.601 and -191.1.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm
This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach.
The following code does not work properly (and I have no plans to work on it further), but it should put anyone who is confronted with the same problem on the right track:
#include <vector>
#include <iostream>
typedef std::vector<double> vect;
std::vector<double> y, omega(2, 0), omega2(2, 0);;
std::vector<std::vector<double>> X;
int n = 10;
// Entry point: sets the step size alpha and loops 8 times (i = 1..8).
// The data initialization and the loop body are only indicated by the
// placeholder comments below; they are not shown in this excerpt.
int main(void) {
/* Initialize X so that each member contains (1, x_i) */
/* Initialize y so that each member contains y_i */
double alpha = 0.00001;
for (int i = 1; i <= 8; i++) {
return 0;
// Evaluates the linear model for one sample: the dot product of the global
// weight vector `omega` with the feature vector `x`.
// x: feature vector; it is indexed up to omega.size() - 1, so it must hold
//    at least as many elements as omega.
// Returns sum(omega[i] * x[i]).
double f_function(const std::vector<double> &x) {
    // Start from zero: the accumulator was previously left uninitialized,
    // which made the returned sum indeterminate.
    double c = 0.0;
    for (unsigned int i = 0; i < omega.size(); i++) {
        c += omega[i] * x[i];
    }
    return c;
}
// Updates the weights from the n samples in the globals X and y.
// For every sample i and every weight index j it subtracts
//   alpha / n * (f_function(X[i]) - y[i]) * X[i][j]
// from omega2[j]; omega2 is copied into omega.
// alpha: step size.
void gradientStep(double alpha) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < X[0].size(); j++) {
omega2[j] -= alpha/(double)n * (f_function(X[i]) - y[i]) * X[i][j];
omega = omega2;
// Prints the current weights in omega followed by the error
//   sum((y[i] - f_function(X[i]))^2) * .5 / n
// computed over the n samples.
void display(void) {
double res = 0, tmp = 0;
for (int i = 0; i < n; i++) {
tmp = y[i] - f_function(X[i]);
res += tmp * tmp; // Loss function
std::cout << "omega = ";
for (unsigned int i = 0; i < omega.size(); i++) {
std::cout << "[" << omega[i] << "] ";
std::cout << "\tError : " << res * .5/(double)n << std::endl;

Issue with a DCT implementation

I have to implement a DCT algorithm in C++, here is my present code :
// dct: computes the discrete cosine transform of an 8x8 block.
// For every output coefficient (u, v) it sums fxy * cos1 * cos2 over all
// input positions (x, y) and scales that sum by the factors cu and cv.
// oBlock: input block; its rows/cols drive every loop bound.
// Returns a new cv::Mat_<Tout> that is always constructed as 8x8.
template<typename Tin=uchar,typename Tout=float>
inline cv::Mat_<Tout> dct(const cv::Mat_<Tin>& oBlock) {
int indexNumber;
float pi = 3.14159265359;
float fcoscos, fxy, cos1, cos2, forCos1, forCos2;
cv::Mat_<Tout> resultBloc(8, 8);
for (int u = 0; u < oBlock.rows; u++){
for (int v = 0; v < oBlock.cols; v++){
float cu=0, cv=0, Result=0;
// compute c(u): sqrt(1/rows) for u == 0, sqrt(2/rows) otherwise
if (u == 0){
cu = (float)sqrt((float)1 / (float)oBlock.rows);
else {
cu = (float)sqrt((float)2 / (float)oBlock.rows);
// compute c(v): sqrt(1/cols) for v == 0, sqrt(2/cols) otherwise
if (v == 0){
cv = (float)sqrt((float)1 / (float)oBlock.cols);
else {
cv = (float)sqrt((float)2 / (float)oBlock.cols);
float sums = 0;
for (int x = 0; x < oBlock.rows; x++){
for (int y = 0; y < oBlock.cols; y++){
// NOTE(review): the flat index uses oBlock.rows as the row stride; for a
// row-major layout the stride would be oBlock.cols (the two agree only
// when rows == cols) - confirm against the storage layout.
indexNumber = x * oBlock.rows + y;
fxy = (int)[indexNumber];
// cosine arguments: pi * (2x + 1) * u / (2 * rows) and pi * (2y + 1) * v / (2 * cols)
forCos1 = (pi*((2 * x) + 1)*u) / (2 * oBlock.rows);
forCos2 = (pi*((2 * y) + 1)*v) / (2 * oBlock.cols);
cos1 = cos(forCos1);
cos2 = cos(forCos2);
fcoscos = fxy * cos1 * cos2;
sums += fcoscos;
// compute the final coefficient for (u, v)
Result = sums*cu*cv;
// NOTE(review): same row stride question as for the input index above.
indexNumber = u * oBlock.rows + v;[indexNumber] = Result;
return resultBloc;
I compared the result with the cv DCT algorithm as follow :
cv::Mat_<float> tempImage(8,8);
for (int i = 0; i < vecImageCut[0].cols*vecImageCut[0].rows; i++){[i] = (int)vecImageCut[0].data[i];
cv::Mat_<float> dctCV;
cv::dct(tempImage, dctCV);
for (int i = 0; i < blocksAfterDCT[0].cols*blocksAfterDCT[0].rows; i++){
std::cerr << "Difference DCT for pixel " << i << " : " <<[i] - blocksAfterDCT[0].data[i] << std::endl;
The results of my DCT and the cv DCT are very different, so I assume my DCT algorithm is wrong, but I have searched for hours and can't find my mistake. Can anyone tell me where I did something wrong?
Your index calculations are wrong. In indexNumber = x * oBlock.rows + y;, since x is counting rows it needs to be multiplied by the number of columns:
indexNumber = x * oBlock.cols + y;
The same for indexNumber = u * oBlock.rows + v;
indexNumber = u * oBlock.cols + v;

Seeking knowledge on array of arrays memory performance

Context: Multichannel real time digital audio processing.
Access pattern: "Column-major", like so:
for (int sample = 0; sample < size; ++sample)
for (int channel = 0; channel < size; ++channel)
auto data = arr[channel][sample];
// do some computations
I'm seeking advice on how to make life easier for the CPU and memory, in general. I realize interleaving the data would be better, but it's not possible.
My theory is, that as long as you sequentially access memory for a while, the CPU will prefetch it - will this hold for N (channel) buffers? What about size of the buffers, any "breaking points"?
Will it be very beneficial to have the channels in contiguous memory (increasing locality), or does that only hold for very small buffers (like, size of cache lines)? We could be talking buffersizes > 100 kb apart.
I guess there would also be a point where the time of the computational part makes memory optimizations negligible - ?
Is this a case, where manual prefetching makes sense?
I could test/profile my own system, but I only have that one system, so any design choices I make may only positively affect that particular system. Any knowledge on these matters is appreciated: links, literature, platform-specific knowledge, etc.
Let me know if the question is too vague, I primarily thought it would be nice to have some wiki-ish experience / info on this area.
I created a program that tests the three cases I mentioned (distant, adjacent and contiguous, listed in supposedly increasing order of performance) on small and big data sets. Maybe people will run it and report anomalies.
#include <iostream>
#include <chrono>
#include <algorithm>
const int b = 196000;
const int s = 64 / sizeof(float);
const int extra_it = 16;
float sbuf1[s];
float bbuf1[b];
int main()
float sbuf2[s];
float bbuf2[b];
float * sbuf3 = new float[s];
float * bbuf3 = new float[b];
float * sbuf4 = new float[s * 3];
float * bbuf4 = new float[b * 3];
float use = 0;
while (1)
using namespace std;
int c;
bool sorb;
cout << "small or big test (0/1)? ";
if (!(cin >> sorb))
return -1;
cout << endl << "test distant buffers (0), contiguous access (1) or adjecant access (2)? ";
if (!(cin >> c))
return -1;
auto t = std::chrono::high_resolution_clock::now();
if (c == 0)
// "worst case scenario", 3 distant buffers constantly touched
if (sorb)
for (int k = 0; k < b * extra_it; ++k)
for (int i = 0; i < s; ++i)
sbuf1[i] = k; // static memory
sbuf2[i] = k; // stack memory
sbuf3[i] = k; // heap memory
for (int k = 0; k < s * extra_it; ++k)
for (int i = 0; i < b; ++i)
bbuf1[i] = k; // static memory
bbuf2[i] = k; // stack memory
bbuf3[i] = k; // heap memory
else if (c == 1)
// "best case scenario", only contiguous memory touched, interleaved
if (sorb)
for (int k = 0; k < b * extra_it; ++k)
for (int i = 0; i < s * 3; i += 3)
sbuf4[i] = k;
sbuf4[i + 1] = k;
sbuf4[i + 2] = k;
for (int k = 0; k < s * extra_it; ++k)
for (int i = 0; i < b * 3; i += 3)
bbuf4[i] = k;
bbuf4[i + 1] = k;
bbuf4[i + 2] = k;
else if (c == 2)
// "compromise", adjecant memory buffers touched
if (sorb)
auto b1 = sbuf4;
auto b2 = sbuf4 + s;
auto b3 = sbuf4 + s * 2;
for (int k = 0; k < b * extra_it; ++k)
for (int i = 0; i < s; ++i)
b1[i] = k;
b2[i] = k;
b3[i] = k;
auto b1 = bbuf4;
auto b2 = bbuf4 + b;
auto b3 = bbuf4 + b * 2;
for (int k = 0; k < s * extra_it; ++k)
for (int i = 0; i < b; ++i)
b1[i] = k;
b2[i] = k;
b3[i] = k;
cout << chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count() << " ms" << endl;
// basically just touching the buffers, avoiding clever optimizations
use += std::accumulate(sbuf1, sbuf1 + s, 0);
use += std::accumulate(sbuf2, sbuf2 + s, 0);
use += std::accumulate(sbuf3, sbuf3 + s, 0);
use += std::accumulate(sbuf4, sbuf4 + s * 3, 0);
use -= std::accumulate(bbuf1, bbuf1 + b, 0);
use -= std::accumulate(bbuf2, bbuf2 + b, 0);
use -= std::accumulate(bbuf3, bbuf3 + b, 0);
use -= std::accumulate(bbuf4, bbuf4 + b * 3, 0);
std::cout << use;
On my Intel i7-3740qm, surprisingly, distant buffers consistently outperform the more locality-friendly tests. It is close, however.

SSE addition producing garbage

I am trying to compare SSE float[4] addition to standard float[4] addition. I tried this:
#include <iostream>
#include <vector>
struct Point4
data[0] = 0;
data[1] = 0;
data[2] = 0;
data[3] = 0;
float data[4];
static float SumOfDifferences(const Point4& a, const Point4& b)
// This function only returns the sum of the sum of the components
float sumValues = 0.0f;
for(unsigned int i = 0; i < 4; ++i)
sumValues +=[i] +[i];
return sumValues;
void Standard()
Point4 a;[0] = 1;[1] = 2;[2] = 3;[3] = 4;
Point4 b;[0] = 1;[1] = 6;[2] = 3;[3] = 5;
float total = 0.0f;
for(unsigned int i = 0; i < 1e6; ++i)
total += SumOfDifferences(a, b);
std::cout << "total: " << total << std::endl;
void Vectorized()
typedef int v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
v4sf a;
float* aPointer = (float*)&a;
aPointer[0] = 1; aPointer[1] = 2; aPointer[2] = 3; aPointer[3] = 4;
v4sf b;
float* bPointer = (float*)&b;
bPointer[0] = 1; bPointer[1] = 2; bPointer[2] = 3; bPointer[3] = 4;
float total = 0.0f;
v4sf result;
float* resultPointer = (float*)&result;
for(unsigned int i = 0; i < 1e6; ++i)
result = a + b; // Vectorized operation
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
total += resultPointer[component];
std::cout << "total: " << total << std::endl;
int main()
// Standard();
return 0;
but the output is 'inf' for the Vectorized() function. When I stepped through with a debugger, the values of 'result' seemed to be garbage (I'd expect them to be (0, 4, 0, 1)). Where am I going wrong here?
Try typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
I get 2e+07 as the result.