Optimizing a parallel_for implementation - c++

I had some code which used Microsofts PPL to do parallel_for loops, and then I had to move that to Linux and Mac which made me make my own version. It does what is should and it does it with decent performance, but it is still some 20% slower than an otherwise identical PPL parallel_for loop.
I should perhaps mention that commonly 10 thousand to 100 thousand iterations are being executed, but each iteration is only a couple of square roots and multiplications. It has to run very fast, though, since it is for an interactive application.
Still new to C++ 11 so I would love it if someone more experienced could take a look at my implementation and give some feedback as to why it is not all the way there and what could be improved.
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
int threadCount = std::thread::hardware_concurrency()*THREADS_PER_CORE;
int blockSize = (end - start) / threadCount;
if (blockSize*threadCount < end - start)
blockSize++;
std::vector<std::future<void>> futures;
int blockStart = start;
int blockEnd = blockStart + blockSize;
if (blockEnd > end) blockEnd = end;
for (int threadIndex = 0; threadIndex < threadCount; threadIndex++)
{
futures.push_back(std::move(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
{
for (size_t i = blockStart; i < blockEnd; ++i)
{
userLambda(i);
}
})));
blockStart += blockSize;
blockEnd = blockStart + blockSize;
if (blockStart >= end) break;
if (blockEnd > end) blockEnd = end;
}
for (std::future<void> &f: futures)
f.get();
}
The full test code is below.
#include "stdafx.h" //nothing in there in this test
#include <ppl.h>
#include <chrono>
#include <iostream>
#include <vector>
#include <future>
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
int threadCount = std::thread::hardware_concurrency()*THREADS_PER_CORE;
int blockSize = (end - start) / threadCount;
if (blockSize*threadCount < end - start)
blockSize++;
std::vector<std::future<void>> futures;
int blockStart = start;
int blockEnd = blockStart + blockSize;
if (blockEnd > end) blockEnd = end;
for (int threadIndex = 0; threadIndex < threadCount; threadIndex++)
{
futures.push_back(std::move(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
{
for (size_t i = blockStart; i < blockEnd; ++i)
{
userLambda(i);
}
})));
blockStart += blockSize;
blockEnd = blockStart + blockSize;
if (blockStart >= end) break;
if (blockEnd > end) blockEnd = end;
}
for (std::future<void> &f: futures)
f.get();
}
int main()
{
//serial execution
std::vector<double> valueSerial(1000);
auto startSerial = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000; i++)
for (int j = 0; j < 1000000; j++)
valueSerial[i] += sqrt(abs(cos(sin(sqrt(i)))));
auto durationSerial = (std::chrono::high_resolution_clock::now() - startSerial).count() / 1000;
std::cout << durationSerial << " Serial" << std::endl;
//PPL parallel for
std::vector<double> valueParallelForPPL(1000);
auto startParallelForPPL = std::chrono::high_resolution_clock::now();
Concurrency::parallel_for(size_t(0), size_t(1000), [&](size_t i)
{
for (int j = 0; j < 1000000; j++)
valueParallelForPPL[i] += sqrt(abs(cos(sin(sqrt(i)))));
});
auto durationParallelForPPL = (std::chrono::high_resolution_clock::now() - startParallelForPPL).count() / 1000;
std::cout << durationParallelForPPL << " PPL parallel for"<<std::endl;
//my parallel for
std::vector<double> valueParallelFor(1000);
auto startParallelFor = std::chrono::high_resolution_clock::now();
parallel_forMine(0, 1000, [&](size_t i)
{
for (int j = 0; j < 1000000; j++)
valueParallelFor[i] += sqrt(abs(cos(sin(sqrt(i)))));
});
auto durationParallelFor = (std::chrono::high_resolution_clock::now() - startParallelFor).count() / 1000;
std::cout << durationParallelFor << " My parallel for"<<std::endl;
//only really to make sure the compiler doesn't optimize everything away
for (int i = 0; i < valueSerial.size();i++)
if (valueSerial[i] != valueParallelFor[i] || valueParallelFor[i]!= valueParallelForPPL[i])
std::cout << "error";
std::cin.get();
return 0;
}

Related

How can this for loop be optimized to run faster without parallelizing or SSE?

I am trying to optimize a piece of code without resorting to parallelizing / SSE.
Current critical code runs in about 20ms on my PC with O2. That seems quite a bit even for ~17mil iterations.
The particular piece that is too slow is as follows:
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
bins[d][(int) (floodVals[d][i] * binSteps)]++;
}
}
Update: Changing to iterators reduced the run-time to 17ms.
for (int d = 0; d < numDims; d++)
{
std::vector<float>::iterator floodIt;
for (floodIt = floodVals[d].begin(); floodIt < floodVals[d].end(); floodIt++)
{
bins[d][(int) (*floodIt * binSteps)]++;
}
}
The full dummy code is here:
#include <vector>
#include <random>
#include <iostream>
#include <chrono>
int main()
{
// Initialize random normalized input [0, 1)
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dist(0, 0.99999);
// Initialize dimensions
const int numDims = 130;
const int numNodes = 130000;
const int binSteps = 30;
// Make dummy data
std::vector<std::vector<float>> floodVals(numDims, std::vector<float>(numNodes));
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
floodVals[d][i] = dist(gen);
}
}
// Initialize binning
std::vector<std::vector<int>> bins(numDims, std::vector<int>(binSteps, 0));
// Time critical section of code
auto start = std::chrono::high_resolution_clock::now();
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
bins[d][(int) (floodVals[d][i] * binSteps)]++;
}
}
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
std::cout << "Elapsed: " << elapsed.count() * 1000 << " ms" << std::endl;
return 0;
}
Try eliminating indexing on d in the inner loop, since it is constant in the inner loop anyway. This was roughly 2x faster for me.
for (int d = 0; d < numDims; d++)
{
int* const bins_d = &bins[d][0];
float* const floodVals_d = &floodVals[d][0];
for (int i = 0; i < numNodes; i++)
{
bins_d[(int) (floodVals_d[i] * binSteps)]++;
}
}

Why is c++ foreach slower than naive single thread loop?

My code is like this:
auto t1 = std::chrono::steady_clock::now();
for (int t{0}; t < 100; ++t) {
vector<int> table(256, 0);
Mat im2 = cv::imread(impth, cv::ImreadModes::IMREAD_COLOR);
im2.forEach<cv::Vec3b>([&table](cv::Vec3b &pix, const int* pos) {
for (int i{0}; i < 3; ++i) ++table[pix[i]];
});
}
auto t2 = std::chrono::steady_clock::now();
cout << "time is: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << endl;
auto t3 = std::chrono::steady_clock::now();
for (int t{0}; t < 100; ++t) {
vector<int> table(256, 0);
Mat im2 = cv::imread(impth, cv::ImreadModes::IMREAD_COLOR);
for (int r{0}; r < im2.rows; ++r) {
auto ptr = im2.ptr<uint8_t>(r);
for (int c{0}; c < im2.cols; ++c) {
for (int i{0}; i < 3; ++i) ++table[ptr[i]];
ptr += 3;
}
}
}
auto t4 = std::chrono::steady_clock::now();
cout << "time is: " << std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3).count() << endl;
Intuitively, I feel that foreach should work faster since it used multi-thread mechanism to do the work, but the result turns out that the foreach methods took 14759ms while the naive loop method took only 6791ms. What is the cause of this slower foreach method, and how could make it faster ?

Measuring time with chrono changes after printing

I want to measure the execution time of a program in ns in C++. For that purpose I am using the chrono library.
int main() {
const int ROWS = 200;
const int COLS = 200;
double input[ROWS][COLS];
int i,j;
auto start = std::chrono::steady_clock::now();
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
input[i][j] = i + j;
}
auto end = std::chrono::steady_clock::now();
auto res=std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
std::cout << "Elapsed time in nanoseconds : "
<< res
<< " ns" << std::endl;
return 0;
}
I measured the time and it executed in 90 ns . However when I add a printing afterwards the time changes.
int main() {
const int ROWS = 200;
const int COLS = 200;
double input[ROWS][COLS];
int i,j;
auto start = std::chrono::steady_clock::now();
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
input[i][j] = i + j;
}
auto end = std::chrono::steady_clock::now();
auto res=std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
std::cout << "Elapsed time in nanoseconds : "
<< res
<< " ns" << std::endl;
for (i = 0; i < ROWS; i++) {
for (j = 0; j < COLS; j++)
std::cout<<input[i][j];
}
return 0;
}
The time changes to 89700 ns. What could be the problem. I only want to measure the execution time of the for.

Very slow mutex in LLVM/OpenMP

I wrote code to test the performance of openmp on win (Win7 x64, Corei7 3.4HGz) and on Mac (10.12.3 Core i7 2.7 HGz).
In xcode I made a console application setting the compiled default. I use LLVM 3.7 and OpenMP 5 (in opm.h i searched define KMP_VERSION_MAJOR=5, define KMP_VERSION_MINOR=0 and KMP_VERSION_BUILD = 20150701, libiopm5) on macos 10.12.3 (CPU - Corei7 2700GHz)
For win I use VS2010 Sp1. Additional I set c/C++ -> Optimization -> Optimization = Maximize Speed (O2), c/C++ -> Optimization ->Favor Soze Or Speed = Favor Fast code (Ot).
If I run the application in a single thread, the time difference corresponds to the frequency ratio of processors (approximately). But if you run 4 threads, the difference becomes tangible: win program be faster then mac program in ~70 times.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>
static double ActionWithNumber(double number)
{
double sum = 0.0f;
for (std::uint32_t i = 0; i < 50; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
return sum;
}
static double TestOpenMP(void)
{
const std::uint32_t len = 4000000;
double *a;
double *b;
double *c;
double sum = 0.0;
std::mutex _mutex;
a = new double[len];
b = new double[len];
c = new double[len];
for (std::uint32_t i = 0; i < len; i++)
{
c[i] = 0.0;
a[i] = sin((double)i);
b[i] = cos((double)i);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
double k = 2.0;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
c[i] = k*a[i] + b[i] + k;
if (c[i] > 0.0)
{
c[i] += ActionWithNumber(c[i]);
}
else
{
c[i] -= ActionWithNumber(c[i]);
}
std::lock_guard<std::mutex> scoped(_mutex);
sum += c[i];
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
double sum2 = 0.0;
for (std::uint32_t i = 0; i < len; i++)
{
sum2 += c[i];
c[i] /= sum2;
}
if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
delete[] a;
delete[] b;
delete[] c;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const std::uint32_t steps = 5;
for (std::uint32_t i = 0; i < steps; i++)
{
sum += TestOpenMP();
}
sum /= (double)steps;
std::cout << "Elapsed time = " << sum;
return 0;
}
I specifically use a mutex here to compare the performance of openmp on the "mac" and "win". On the "Win" function returns the time of 0.39 seconds. On the "Mac" function returns the time of 25 seconds, i.e. 70 times slower.
What is the cause of this difference?
First of all, thank for edit my post (i use translater to write text).
In the real app, I update the values in a huge matrix (20000х20000) in random order. Each thread determines the new value and writes it in a particular cell. I create a mutex for each row, since in most cases different threads write to different rows. But apparently in cases when 2 threads write in one row and there is a long lock. At the moment I can't divide the rows in different threads, since the order of records is determined by the FEM elements.
So just to put a critical section in there comes out, as it will block writes to the entire matrix.
I wrote code like in real application.
static double ActionWithNumber(double number)
{
const unsigned int steps = 5000;
double sum = 0.0f;
for (u32 i = 0; i < steps; i++)
{
double coeff = sqrt(pow(abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
sum /= (double)steps;
return sum;
}
static double RealAppTest(void)
{
const unsigned int elementsNum = 10000;
double* matrix;
unsigned int* elements;
boost::mutex* mutexes;
elements = new unsigned int[elementsNum*3];
matrix = new double[elementsNum*elementsNum];
mutexes = new boost::mutex[elementsNum];
for (unsigned int i = 0; i < elementsNum; i++)
for (unsigned int j = 0; j < elementsNum; j++)
matrix[i*elementsNum + j] = (double)(rand() % 100);
for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
{
elements[3*i] = rand()%(elementsNum-1);
elements[3*i+1] = rand()%(elementsNum-1);
elements[3*i+2] = rand()%(elementsNum-1);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
boost::lock_guard<boost::mutex> lockup(mutexes[i]);
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
}
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
delete[] elements;
delete[] matrix;
delete[] mutexes;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const u32 steps = 5;
for (u32 i = 0; i < steps; i++)
{
sum += RealAppTest();
}
sum /= (double)steps;
std::cout<<"Elapsed time = " << sum;
return 0;
}
You're combining two different sets of threading/synchronization primitives - OpenMP, which is built into the compiler and has a runtime system, and manually creating a posix mutex with std::mutex. It's probably not surprising that there's some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there's no interactions between higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
#pragma omp critical
sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
omp_set_lock(&sumlock);
sum += c[i];
omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
Updated: There's no problem with using an array of openmp locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++)
omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
omp_set_lock(&(sumlocks[i]));
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
omp_unset_lock(&(sumlocks[i]));
}
}
for (unsigned idx=0; idx<elementsNum; idx++)
omp_destroy_lock(&(sumlocks[idx]));

Passing pointer as argument with std::thread function C++11

I wanted to pass a pointer to the thread function, but it gives back
error:
attempt to use a deleted function
__invoke(_VSTD::move(_VSTD::get<0>(__t)), _VSTD::move(_VSTD::get<_In...
Code fragment in main
for (int i = 0; i < threadCount; ++i) {
ptrTabThreads = new std::thread(checkMin, ptrTab[i]);
ptrTabThreads->join();
++ptrTabThreads;
}
And code for checkMin function
void checkMin(int* tab) {
int sizeOfTable = 0;
if (tab == ptrTab[threadCount-1])
sizeOfTable = partSize + additionalNumbers;
else
sizeOfTable = partSize;
mt.lock();
for (int i = 0; i < sizeOfTable; ++i) {
if (tab[i] < minValue) {
minValue = tab[i];
}
}
mt.unlock();
}
Where ptrTab is an array of pointers:
int* ptrTab[threadCount];
Full code is:
#include <iostream>
#include <thread>
#include <condition_variable>
#include <stdlib.h>
#include <climits>
#define threadCount 10
#define numbersCount 75
std::mutex mt;
int minValue = INT32_MAX;
int partSize, additionalNumbers;
int* ptrTab[threadCount];
void checkMin(int value);
void printTab(int *tab);
int main() {
int tab[numbersCount];
srand(time(NULL));
for (int i = 0; i < numbersCount; ++i) {
tab[i] = rand() % 1000;
std::cout << " " << tab[i];
}
partSize = numbersCount / threadCount;
additionalNumbers = numbersCount % threadCount;
for (int i = 0; i < threadCount-1; ++i) {
int *newTab = new int[partSize];
ptrTab[i] = newTab;
}
int *newTab = new int[partSize+additionalNumbers];
ptrTab[threadCount-1] = newTab;
int copiedElements = 0;
for (int i = 0; i < threadCount-1; ++i) {
int *tmpTab = ptrTab[i];
for (int j = 0; j < partSize; j++) {
tmpTab[j] = tab[copiedElements];
copiedElements++;
}
}
int *tmpTab = ptrTab[threadCount-1];
int elementsLeft = numbersCount-copiedElements;
for (int i = 0; i < elementsLeft; ++i) {
tmpTab[i] = tab[copiedElements];
copiedElements++;
}
/*for (int i = 0; i < threadCount; ++i) {
printTab(ptrTab[i]);
}*/
//----------------------
std::thread tabThreads[threadCount];
std::thread *ptrTabThreads = tabThreads;
for (int i = 0; i < threadCount; ++i) {
ptrTabThreads = new std::thread(checkMin, ptrTab[i]);
ptrTabThreads->join();
++ptrTabThreads;
}
std::cout << "\n\n" << minValue << "\n\n";
//for check
std::cout << "for check: minimal value is ";
int min = INT32_MAX;
for (int i = 0; i < numbersCount; ++i) {
if (tab[i] < min) {
min = tab[i];
}
}
std::cout << min << "\n\n";
}
void checkMin(int* tab) {
int sizeOfTable = 0;
if (tab == ptrTab[threadCount-1])
sizeOfTable = partSize + additionalNumbers;
else
sizeOfTable = partSize;
mt.lock();
for (int i = 0; i < sizeOfTable; ++i) {
if (tab[i] < minValue) {
minValue = tab[i];
}
}
mt.unlock();
}
void printTab(int *tab) {
for (int i = 0; i < 10; ++i) {
std::cout << tab[i] << " ";
}
std::cout << "\n\n";
}
Thank you for all your advices.
The immediate problem which triggers compilation error is right here:
void checkMin(int value);
This is the prototype of your function, and it is incorrect - it should be
void checkMin(int* value); //<-- not the pointer.
But this is not the only one! Your code makes no sense. Look at this fragment:
std::thread tabThreads[threadCount];
std::thread *ptrTabThreads = tabThreads;
for (int i = 0; i < threadCount; ++i) {
ptrTabThreads = new std::thread(checkMin, ptrTab[i]);
ptrTabThreads->join();
++ptrTabThreads;
}
What's the purpose of all this jumping with pointers? You also have a leak in your code, since you are modifying the pointer you obtained from new before deleteing it. Why not use following simple code?
std::array<std::thread, threadCount> tabThreads;
for (int i = 0; i < threadCount; ++i) {
tabThreads[i] = std::thread(checkMin, ptrTab[i]);
tabThreads[i].join();
}
This still serves no pratical purpose (application remains effectively single-threaded, since you join your thread right after creating it), but at least, the code is correct. To really do some fancy multithreading, you need your loop to look like following:
for (int i = 0; i < threadCount; ++i)
tabThreads[i] = std::thread(checkMin, ptrTab[i]);
for (std::thread& t : tabThreads) // so-called range-for loop. Nice thing!
t.join();
This will paralellize stuff!