Why is c++ foreach slower than naive single thread loop? - c++

My code is like this:
// Benchmark 1: build a 256-bin histogram of all channel values with cv::Mat::forEach,
// repeated 100 times. Note that the timed region also contains the cv::imread call
// and the construction of `table` on every iteration.
auto t1 = std::chrono::steady_clock::now();
for (int t{0}; t < 100; ++t) {
vector<int> table(256, 0);
Mat im2 = cv::imread(impth, cv::ImreadModes::IMREAD_COLOR);
// NOTE(review): cv::Mat::forEach is documented as running the functor in parallel.
// If so, the unsynchronized ++table[...] on the shared, by-reference `table` is a
// data race and the counts may be wrong — confirm against the OpenCV docs.
im2.forEach<cv::Vec3b>([&table](cv::Vec3b &pix, const int* pos) {
for (int i{0}; i < 3; ++i) ++table[pix[i]];
});
}
auto t2 = std::chrono::steady_clock::now();
cout << "time is: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << endl;
// Benchmark 2: the same histogram with a plain row/column loop over raw row pointers.
// As above, image loading and the vector allocation are inside the timed region.
auto t3 = std::chrono::steady_clock::now();
for (int t{0}; t < 100; ++t) {
vector<int> table(256, 0);
Mat im2 = cv::imread(impth, cv::ImreadModes::IMREAD_COLOR);
for (int r{0}; r < im2.rows; ++r) {
// Pointer to the first byte of row r; advanced by 3 bytes per pixel below.
auto ptr = im2.ptr<uint8_t>(r);
for (int c{0}; c < im2.cols; ++c) {
for (int i{0}; i < 3; ++i) ++table[ptr[i]];
ptr += 3;
}
}
}
auto t4 = std::chrono::steady_clock::now();
cout << "time is: " << std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3).count() << endl;
Intuitively, I feel that forEach should be faster since it uses a multi-threaded mechanism to do the work, but it turns out that the forEach method took 14759 ms while the naive loop took only 6791 ms. What makes the forEach method slower, and how can I make it faster?

Related

SSE slower than standard logic [duplicate]

This question already has answers here:
SSE intrinsics without compiler optimization
(2 answers)
Idiomatic way of performance evaluation?
(1 answer)
Closed 3 days ago.
Add two arrays element by element, once with standard logic and once with SSE:
// Number of floats per array: 2^20.
int count = std::pow(2, 20);
// NOTE(review): alignas(16) here applies to the pointer variables themselves, not to
// the memory returned by new[]. The arrays are only as aligned as the allocator makes
// them, while _mm_load_ps/_mm_store_ps below expect 16-byte aligned addresses — confirm.
// The three arrays are never deleted in this snippet.
alignas(16) float* fm1 = new float[count];
alignas(16) float* fm2 = new float[count];
alignas(16) float* res = new float[count];
// Fill both inputs with 0, 1, 2, ...
for (int i = 0; i < count; ++i) {
fm1[i] = static_cast<float>(i);
fm2[i] = static_cast<float>(i);
}
// Scalar version: 1000 passes of res = fm1 + fm2.
// `res` is never read afterwards in this snippet, so an optimizing build may be free
// to drop or transform this loop; timings are only meaningful with that in mind.
{
auto start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < 1000; ++j) {
for (int i = 0; i < count; ++i) {
res[i] = fm1[i] + fm2[i];
}
}
auto diff = std::chrono::high_resolution_clock::now() - start;
std::cout << "execute time duration = " << std::chrono::duration<double, std::milli>(diff).count() << " milliseconds" << std::endl;
}
// SSE version: the same 1000 passes, four floats per iteration.
{
// The vector loop steps by 4 and has no scalar tail, so count must be a multiple of 4.
assert(count % 4 == 0);
auto start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < 1000; ++j) {
for (int i = 0; i < count; i += 4) {
__m128 a = _mm_load_ps(&fm1[i]);
__m128 b = _mm_load_ps(&fm2[i]);
__m128 r = _mm_add_ps(a, b);
_mm_store_ps(&res[i], r);
}
}
auto diff = std::chrono::high_resolution_clock::now() - start;
std::cout << "execute time duration = " << std::chrono::duration<double, std::milli>(diff).count() << " milliseconds" << std::endl;
}
result
execute time duration = 1692.19 milliseconds
execute time duration = 2339.49 milliseconds
laptop configuration
11th Gen Intel(R) Core(TM) i7-11370H @ 3.30GHz 3.30 GHz
16,0 ГБ
I expected the SSE logic to be at least 3 times faster, but it is slower.

How can this for loop be optimized to run faster without parallelizing or SSE?

I am trying to optimize a piece of code without resorting to parallelizing / SSE.
Current critical code runs in about 20ms on my PC with O2. That seems quite a bit even for ~17mil iterations.
The particular piece that is too slow is as follows:
// Histogram each dimension d: scale every value by binSteps, truncate to int,
// and increment the matching bin. Both bins[d] and floodVals[d] are re-indexed
// on every inner iteration.
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
bins[d][(int) (floodVals[d][i] * binSteps)]++;
}
}
Update: Changing to iterators reduced the run-time to 17ms.
// Same binning as the indexed version, but the inner loop walks floodVals[d]
// with an iterator instead of indexing by i. bins[d] is still looked up on
// every inner iteration.
for (int d = 0; d < numDims; d++)
{
std::vector<float>::iterator floodIt;
for (floodIt = floodVals[d].begin(); floodIt < floodVals[d].end(); floodIt++)
{
bins[d][(int) (*floodIt * binSteps)]++;
}
}
The full dummy code is here:
#include <vector>
#include <random>
#include <iostream>
#include <chrono>
// Self-contained reproduction: fills numDims x numNodes random floats, then times
// the binning loop and prints the elapsed time in milliseconds.
int main()
{
// Initialize random normalized input [0, 1)
std::random_device rd;
std::mt19937 gen(rd());
// Upper bound is 0.99999 rather than 1 so that value * binSteps stays below binSteps.
std::uniform_real_distribution<float> dist(0, 0.99999);
// Initialize dimensions
const int numDims = 130;
const int numNodes = 130000;
const int binSteps = 30;
// Make dummy data
std::vector<std::vector<float>> floodVals(numDims, std::vector<float>(numNodes));
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
floodVals[d][i] = dist(gen);
}
}
// Initialize binning
std::vector<std::vector<int>> bins(numDims, std::vector<int>(binSteps, 0));
// Time critical section of code
auto start = std::chrono::high_resolution_clock::now();
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
// Scale to [0, binSteps), truncate to an integer bin index, and count it.
bins[d][(int) (floodVals[d][i] * binSteps)]++;
}
}
auto finish = std::chrono::high_resolution_clock::now();
// duration<double> is in seconds; multiplied by 1000 below to print milliseconds.
// `bins` is not read after the timed loop in this program.
std::chrono::duration<double> elapsed = finish - start;
std::cout << "Elapsed: " << elapsed.count() * 1000 << " ms" << std::endl;
return 0;
}
Try eliminating indexing on d in the inner loop, since it is constant in the inner loop anyway. This was roughly 2x faster for me.
// Binning with the per-dimension lookups hoisted out of the inner loop:
// bins[d] and floodVals[d] are resolved once per d into raw pointers, so the
// inner loop only indexes plain arrays.
for (int d = 0; d < numDims; d++)
{
int* const bins_d = &bins[d][0];
float* const floodVals_d = &floodVals[d][0];
for (int i = 0; i < numNodes; i++)
{
bins_d[(int) (floodVals_d[i] * binSteps)]++;
}
}

Why is my code blocking while it is writing data from a matrix of doubles into a file in C++?

I'm trying to write the numbers of my matrix of doubles into a file, but my matrix is 15000x15000, and when my code starts writing the numbers into the file, it blocks.
Why is my code blocking there?
I interrupted the execution, and when I opened the file it had been written to, so I think the function works, but it never finishes because my matrix is too big.
This is my code:
// Allocates and returns an N x N matrix as an array of N row pointers.
// Contents: 1.0 on the diagonal, +50*(i+1)*(j+1)/(N+N) below the diagonal and
// -50*(i+1)*(j+1)/(N+N) above it. Nothing in this function frees the rows or
// the row-pointer array, so the caller receives ownership of the raw memory.
double **generarMatriz()
{
    double **matriz = new double *[N];
    for (int fila = 0; fila < N; fila++)
    {
        matriz[fila] = new double[N];
        for (int col = 0; col < N; col++)
        {
            if (fila == col)
            {
                matriz[fila][col] = 1.0;
                continue;
            }
            // Lower triangle (fila > col) is positive, upper triangle is negative.
            const double coef = (fila > col) ? (double)50 : (double)-50;
            matriz[fila][col] = coef * (fila + 1) * (col + 1) / (N + N);
        }
    }
    return matriz;
}
// Loads the matrix from nombreFicheroMatriz if that file can be opened; otherwise
// generates it with generarMatriz() and writes it to that file.
//
// M                   - out: set to a newly allocated N x N matrix (rows allocated
//                       with new[]); nothing here frees it.
// nombreFicheroMatriz - path of the matrix text file.
//
// Side effects: sets msGenerarMatriz to the time spent reading or generating the
// matrix, and msGuardadoMatriz to the time spent in the whole load-or-create step.
// Values are written with the stream's default formatting, separated by spaces,
// one matrix row per line.
void crearFicheros(double **&M, string nombreFicheroMatriz)
{
    ifstream infile(nombreFicheroMatriz);
    auto t3 = high_resolution_clock::now();
    if (infile)
    {
        // The file already exists: read from the stream that is already open
        // instead of closing it and opening the same file a second time.
        printf("Fichero ya existente");
        M = new double *[N];
        for (int i = 0; i < N; i++)
            M[i] = new double[N];
        auto t1 = high_resolution_clock::now();
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
            {
                infile >> M[i][j];
            }
        }
        auto t2 = high_resolution_clock::now();
        msGenerarMatriz = t2 - t1;
    }
    else
    {
        ofstream outfile(nombreFicheroMatriz);
        auto t1 = high_resolution_clock::now();
        M = generarMatriz();
        auto t2 = high_resolution_clock::now();
        msGenerarMatriz = t2 - t1;
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
            {
                outfile << M[i][j] << ' ';
            }
            // '\n' rather than endl: endl would force a flush to disk after
            // every row, which is pure overhead for a file this large.
            outfile << '\n';
        }
        // Close explicitly so the final flush is included in the timing below.
        outfile.close();
    }
    auto t4 = high_resolution_clock::now();
    msGuardadoMatriz = t4 - t3;
}
void iterar()
{
double **M;
int m;
string nombreFicheroTexto, nombreFicheroMatriz;
auto t5 = high_resolution_clock::now();
pedirIteraciones(m, nombreFicheroMatriz, nombreFicheroTexto);
crearFicheros(M, nombreFicheroMatriz);
auto t7 = high_resolution_clock::now();
iterativo(M, nombreFicheroMatriz, nombreFicheroTexto, m);
auto t8 = high_resolution_clock::now();
auto t6 = high_resolution_clock::now();
tiempoGlobal = t6 - t5;
tiempoEjecucion = t8 - t7;
filetext << "Tiempo de ejecución de la matriz: " << tiempoEjecucion.count() << "ms" << endl;
filetext << "Tiempo global: " << tiempoGlobal.count() << "ms" << endl;
filetext.close();
}

Measuring time with chrono changes after printing

I want to measure the execution time of a program in ns in C++. For that purpose I am using the chrono library.
// Times a 200x200 fill of a stack array with steady_clock and prints the result
// in nanoseconds.
int main() {
const int ROWS = 200;
const int COLS = 200;
double input[ROWS][COLS];
int i,j;
auto start = std::chrono::steady_clock::now();
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
input[i][j] = i + j;
}
auto end = std::chrono::steady_clock::now();
// NOTE(review): `input` is never read after the loop in this version, so an
// optimizing compiler is presumably free to discard the stores (and the loop);
// the measured time may not reflect the fill at all — confirm with the build flags used.
auto res=std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
std::cout << "Elapsed time in nanoseconds : "
<< res
<< " ns" << std::endl;
return 0;
}
I measured the time and it executed in 90 ns. However, when I add printing afterwards, the time changes.
// Same timed 200x200 fill as the previous version, but the array is printed
// after the measurement, so every stored value is actually used.
int main() {
const int ROWS = 200;
const int COLS = 200;
double input[ROWS][COLS];
int i,j;
auto start = std::chrono::steady_clock::now();
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
input[i][j] = i + j;
}
auto end = std::chrono::steady_clock::now();
auto res=std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
std::cout << "Elapsed time in nanoseconds : "
<< res
<< " ns" << std::endl;
// The printing happens after `end` was taken, so it is outside the timed region;
// its only effect on the measurement is that the stores above can no longer be
// treated as dead.
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
std::cout<<input[i][j];
}
return 0;
}
The time changes to 89700 ns. What could be the problem? I only want to measure the execution time of the for loop.

Optimizing a parallel_for implementation

I had some code which used Microsoft's PPL to do parallel_for loops, and then I had to move it to Linux and Mac, which made me write my own version. It does what it should, and it does it with decent performance, but it is still some 20% slower than an otherwise identical PPL parallel_for loop.
I should perhaps mention that commonly 10 thousand to 100 thousand iterations are being executed, but each iteration is only a couple of square roots and multiplications. It has to run very fast, though, since it is for an interactive application.
Still new to C++ 11 so I would love it if someone more experienced could take a look at my implementation and give some feedback as to why it is not all the way there and what could be improved.
/// Calls userLambda(i) once for every i in [start, end), splitting the range into
/// contiguous blocks that each run in their own std::async(std::launch::async) task.
///
/// @tparam THREADS_PER_CORE  multiplier applied to std::thread::hardware_concurrency()
///                           to get the maximum number of tasks.
/// @param start       first index (inclusive).
/// @param end         one past the last index; nothing is called when end <= start.
/// @param userLambda  callable invoked once per index. Different indices may be
///                    processed concurrently, so it must be safe to call that way.
///
/// Returns only after every task has been waited on with get().
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Empty or reversed range: end - start would wrap around as an unsigned value.
    if (end <= start)
        return;
    const size_t total = end - start;

    // hardware_concurrency() may return 0 when the value cannot be determined;
    // fall back to one task instead of dividing by zero below.
    size_t threadCount = std::thread::hardware_concurrency() * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;
    // Never create more tasks than there are indices.
    if (threadCount > total)
        threadCount = total;

    // Block size is total / threadCount rounded up, kept in size_t so large
    // ranges are not truncated.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    size_t blockStart = start;
    while (blockStart < end)
    {
        // Clamp the last block to the end of the range without overflowing.
        const size_t remaining = end - blockStart;
        const size_t blockEnd = blockStart + (remaining < blockSize ? remaining : blockSize);
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
            {
                userLambda(i);
            }
        }));
        blockStart = blockEnd;
    }

    for (std::future<void> &f : futures)
        f.get();
}
The full test code is below.
#include "stdafx.h" //nothing in there in this test
#include <ppl.h>
#include <chrono>
#include <iostream>
#include <vector>
#include <future>
/// Calls userLambda(i) once for every i in [start, end), splitting the range into
/// contiguous blocks that each run in their own std::async(std::launch::async) task.
///
/// @tparam THREADS_PER_CORE  multiplier applied to std::thread::hardware_concurrency()
///                           to get the maximum number of tasks.
/// @param start       first index (inclusive).
/// @param end         one past the last index; nothing is called when end <= start.
/// @param userLambda  callable invoked once per index. Different indices may be
///                    processed concurrently, so it must be safe to call that way.
///
/// Returns only after every task has been waited on with get().
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Empty or reversed range: end - start would wrap around as an unsigned value.
    if (end <= start)
        return;
    const size_t total = end - start;

    // hardware_concurrency() may return 0 when the value cannot be determined;
    // fall back to one task instead of dividing by zero below.
    size_t threadCount = std::thread::hardware_concurrency() * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;
    // Never create more tasks than there are indices.
    if (threadCount > total)
        threadCount = total;

    // Block size is total / threadCount rounded up, kept in size_t so large
    // ranges are not truncated.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    size_t blockStart = start;
    while (blockStart < end)
    {
        // Clamp the last block to the end of the range without overflowing.
        const size_t remaining = end - blockStart;
        const size_t blockEnd = blockStart + (remaining < blockSize ? remaining : blockSize);
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
            {
                userLambda(i);
            }
        }));
        blockStart = blockEnd;
    }

    for (std::future<void> &f : futures)
        f.get();
}
// Benchmark driver: runs the same 1000 x 1,000,000 accumulation serially, with
// Concurrency::parallel_for (PPL), and with parallel_forMine, printing a duration
// for each, then cross-checks the three result vectors.
// The printed numbers are count() / 1000 of high_resolution_clock's native duration,
// so their unit depends on that clock's tick period.
int main()
{
//serial execution
std::vector<double> valueSerial(1000);
auto startSerial = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000; i++)
for (int j = 0; j < 1000000; j++)
// NOTE(review): abs/sqrt/cos/sin are called unqualified here; which overloads are
// picked (in particular integer vs. floating-point abs) depends on the headers in
// scope — confirm the floating-point versions are used.
valueSerial[i] += sqrt(abs(cos(sin(sqrt(i)))));
auto durationSerial = (std::chrono::high_resolution_clock::now() - startSerial).count() / 1000;
std::cout << durationSerial << " Serial" << std::endl;
//PPL parallel for
std::vector<double> valueParallelForPPL(1000);
auto startParallelForPPL = std::chrono::high_resolution_clock::now();
// Each invocation only writes element i of the vector, so iterations do not
// touch each other's output slot.
Concurrency::parallel_for(size_t(0), size_t(1000), [&](size_t i)
{
for (int j = 0; j < 1000000; j++)
valueParallelForPPL[i] += sqrt(abs(cos(sin(sqrt(i)))));
});
auto durationParallelForPPL = (std::chrono::high_resolution_clock::now() - startParallelForPPL).count() / 1000;
std::cout << durationParallelForPPL << " PPL parallel for"<<std::endl;
//my parallel for
std::vector<double> valueParallelFor(1000);
auto startParallelFor = std::chrono::high_resolution_clock::now();
// The lambda is passed through parallel_forMine's std::function parameter.
parallel_forMine(0, 1000, [&](size_t i)
{
for (int j = 0; j < 1000000; j++)
valueParallelFor[i] += sqrt(abs(cos(sin(sqrt(i)))));
});
auto durationParallelFor = (std::chrono::high_resolution_clock::now() - startParallelFor).count() / 1000;
std::cout << durationParallelFor << " My parallel for"<<std::endl;
//only really to make sure the compiler doesn't optimize everything away
// Exact != comparison of doubles; the serial loop indexes with int i while the
// parallel lambdas receive size_t i, so sqrt(i) is called with different argument types.
for (int i = 0; i < valueSerial.size();i++)
if (valueSerial[i] != valueParallelFor[i] || valueParallelFor[i]!= valueParallelForPPL[i])
std::cout << "error";
// Wait for a key press before exiting.
std::cin.get();
return 0;
}