I am learning parallel processing using Pthreads. I have a quad core processor. Unfortunately, the parallelized portion of the following code is running roughly 5X slower than the non-parallelized code. What am I doing wrong here? Thanks in advance for the help.
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000
struct params {
int * arr;
int sum;
};
/* The worker function for the pthreads */
void * myFun (void * x){
int i;
struct params * b = (struct params *) x;
for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
b->sum += b->arr[i];
}
return NULL;
}
/* unparallelized summing function*/
int arrSum(int * arr, int size){
int sum = 0;
for (int i = 0; i != size; ++i){
sum += arr[i];
}
return sum;
}
int main(int argc, char * argv[]){
clock_t begin, end;
double runTime;
int rc, i;
int sum1 = 0, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memory\n");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i != NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
begin = clock();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Parallelized code run time: %f\n", runTime);
/* run the unparallelized code */
begin = clock();
sum2 = arrSum(myArr, SIZE);
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Unparallelized code run time: %f\n", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d \n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}
You're missing one important aspect of parallel programming.
The worker threads need to be created once per process, not for every task.
Creating and destroying threads takes time.
The solution is to use a thread pool and send tasks to the pool.
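For illustration only, here is a minimal sketch of that idea with raw pthreads: a fixed set of workers is created once and pulls task indices from a shared counter until the work runs out. All names and sizes here are made up for the example; a real pool would keep the workers alive with a condition variable and feed them new tasks instead of retiring them.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NWORKERS 4
#define NTASKS   16                  /* more tasks than workers, to show reuse */
#define CHUNK    1000000             /* elements per task */

static int  *g_arr;                  /* shared input array */
static long  g_partial[NTASKS];      /* one result slot per task: no sharing */
static int   g_next_task = 0;        /* next unclaimed task index */
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *unused)
{
    (void)unused;
    for (;;) {
        /* grab the next task, or quit if none are left */
        pthread_mutex_lock(&g_lock);
        int t = (g_next_task < NTASKS) ? g_next_task++ : -1;
        pthread_mutex_unlock(&g_lock);
        if (t < 0)
            return NULL;

        long sum = 0;
        for (long i = (long)t * CHUNK; i < (long)(t + 1) * CHUNK; ++i)
            sum += g_arr[i];
        g_partial[t] = sum;          /* each task writes only its own slot */
    }
}

int main(void)
{
    g_arr = (int *) malloc(sizeof(int) * NTASKS * CHUNK);
    for (long i = 0; i < (long)NTASKS * CHUNK; ++i)
        g_arr[i] = 1;

    pthread_t threads[NWORKERS];
    for (int w = 0; w < NWORKERS; ++w)           /* workers created once... */
        pthread_create(&threads[w], NULL, worker, NULL);
    for (int w = 0; w < NWORKERS; ++w)           /* ...and joined once */
        pthread_join(threads[w], NULL);

    long total = 0;
    for (int t = 0; t < NTASKS; ++t)
        total += g_partial[t];
    printf("total = %ld\n", total);              /* expect 16000000 */
    free(g_arr);
    return 0;
}
Compile with something like gcc -O2 pool.c -lpthread.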
My suggestion, though, is to use OpenMP, which simplifies this task considerably and works with many compilers.
Example:
int sum = 0;
#pragma omp parallel for shared(sum)
for (int i = 0; i < SIZE; ++i)
{
#pragma omp atomic
sum += myArr[i];
}
To make this work faster, do some loop unrolling, e.g. accumulate the sum of 8 numbers in a single loop iteration so that the atomic update happens once per 8 elements rather than once per element, as sketched below.
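For illustration only, a minimal sketch of that unrolling (my own example, meant as a drop-in replacement for the loop above; it reuses myArr and SIZE from the question and assumes SIZE is a multiple of 8):
int sum = 0;
#pragma omp parallel for shared(sum)
for (int i = 0; i < SIZE; i += 8)
{
    int partial = 0;
    for (int u = 0; u < 8; ++u)
        partial += myArr[i + u];
    #pragma omp atomic
    sum += partial;
}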
The main problem is that you're using clock(), which does not return the wall time but the cumulative CPU time of all threads. This is one of the most common mistakes under the OpenMP tag on SO (and if the frequently-asked listing on SO were useful, it would show this).
The simplest way to get the wall time is to use a function from OpenMP: omp_get_wtime(). This works on Linux and Windows with GCC, ICC, and MSVC (and I assume Clang, which now supports OpenMP 3.1).
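To see the difference concretely, here is a small self-contained sketch of my own (not part of the original answer), assuming a POSIX system where clock() measures process CPU time: with four busy threads, clock() reports roughly four times what omp_get_wtime() reports.
#include <omp.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    clock_t c0 = clock();
    double  w0 = omp_get_wtime();

    #pragma omp parallel
    {
        /* burn some CPU on every thread; the result itself is irrelevant */
        volatile double x = 0.0;     /* volatile so the loop isn't optimized away */
        for (long i = 0; i < 200000000L; ++i)
            x = x + 1e-9 * (double)i;
    }

    printf("clock():         %.3f s (CPU time, summed over threads)\n",
           (double)(clock() - c0) / CLOCKS_PER_SEC);
    printf("omp_get_wtime(): %.3f s (wall time)\n", omp_get_wtime() - w0);
    return 0;
}
Compile with something like gcc -O2 -fopenmp; without -fopenmp the pragma is ignored and the two numbers roughly agree.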
When I use this with your code I get on my four core/eight hyper-thread i7 IVB system:
Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000
Some other comments: your scheduling is error-prone. You set the array for each thread to
p.arr = myArr + i*(int)(SIZE/NTHREADS);
and then have each thread run over SIZE/NTHREADS elements. This can give wrong results due to integer truncation for some values of SIZE and NTHREADS.
You should have each thread run over
int start = ithread*SIZE/NTHREADS;
int finish = (ithread+1)*SIZE/NTHREADS;
And then have each thread point to the beginning of the array and do
int sum = 0;
for (i = start; i < finish; ++i){
sum += b->arr[i];
}
This is essentially what OpenMP's schedule(static) does. In fact you can get the same effect as the pthreads code using OpenMP by doing
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
sum += arr[i];
}
Here is the code I used
//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>
#define NTHREADS 4
#define SIZE NTHREADS*100000000
struct params {
int * arr;
int sum;
};
/* The worker function for the pthreads */
void * myFun (void * x){
int i;
struct params * b = (struct params *) x;
int sum = 0;
for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
sum += b->arr[i];
}
b->sum = sum;
return NULL;
}
/* unparallelized summing function*/
int arrSum(int * arr, int size){
int sum = 0;
for (int i = 0; i < size; ++i){
sum += arr[i];
}
return sum;
}
int main(int argc, char * argv[]) {
double runTime;
int rc, i;
int sum1 = 0, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memory\n");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i < NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
runTime = -omp_get_wtime();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
runTime += omp_get_wtime();
printf("Parallelized code run time: %f\n", runTime);
/* run the unparallelized code */
runTime = -omp_get_wtime();
sum2 = arrSum(myArr, SIZE);
runTime += omp_get_wtime();
printf("Unparallelized code run time: %f\n", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d \n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}
Related
I am trying to add cache-line padding to avoid the false sharing problem, but I can't see a big difference in speedup. With padding it's only 1.2x faster. I am running the code without padding and the version with padding with n = 700 million for testing. Should I get more speedup than 1.2x? Maybe I have missed something in my padding implementation? I am adding 15 ints of padding because I am assuming that the counters don't have to be allocated at the start of a cache line. Any tips appreciated.
Here is my code:
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {
const int paddingAmount = cachelinesize / sizeof(int);
const int kPadded = k + (paddingAmount - 1);
printf("/n%d", kPadded);
int counters[nproc][kPadded] = {}; // all zeros
#pragma omp parallel
{
int *thcounters = counters[omp_get_thread_num()];
#pragma omp for
for (int i = 0; i < n; ++i)
++thcounters[in[i]];
#pragma omp single
{
int tmp, sum = 0;
for (int j = 0; j < k; ++j)
for (int i = 0; i < nproc; ++i) {
tmp = counters[i][j];
counters[i][j] = sum;
sum += tmp;
}
}
#pragma omp for
for (int i = 0; i < n; ++i)
out[thcounters[in[i]]++] = in[i];
}
}
#define k 1000
int main(int argc, char *argv[]) {
//init input
int n = argc>1 && atoi(argv[1])>0 ? atoi(argv[1]) : 0;
int* in = (int*)malloc(sizeof(int)*n);
int* out = (int*)malloc(sizeof(int)*n);
for (int i = 0; i < n; ++i)
in[i] = rand()%k;
printf("n = %d\n", n);
//print some parameters
printf("nproc = %d\n", nproc);
printf("cachelinesize = %d byte\n", cachelinesize);
printf("k = %d\n", k);
double tp2 = omp_get_wtime();
par_countingsort2<k>(out, in, n);
tp2 = omp_get_wtime() - tp2;
printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n)?'y':'n');
//free mem
free(in);
free(out);
return EXIT_SUCCESS;
}
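For comparison, a common alternative to manual index padding (my own sketch, not from the post) is to give each thread's counter block its own cache-line-aligned struct via alignas, so every block starts on its own line and occupies whole lines. This assumes a 64-byte cache line and uses made-up sizes:
#include <cstdio>
#include <omp.h>

constexpr int kCacheLine  = 64;     // assumed cache-line size
constexpr int kBuckets    = 1000;   // plays the role of k
constexpr int kMaxThreads = 64;     // sketch assumes at most this many threads

// One thread's counter block, aligned and padded to whole cache lines,
// so two threads never write to the same line.
struct alignas(kCacheLine) PaddedCounters {
    int c[kBuckets];
};

int main() {
    static PaddedCounters counters[kMaxThreads] = {};   // zero-initialized

    const int n = 1000000;                    // stand-in input size
    #pragma omp parallel
    {
        int *mine = counters[omp_get_thread_num()].c;
        #pragma omp for
        for (int i = 0; i < n; ++i)
            ++mine[i % kBuckets];             // stand-in for ++thcounters[in[i]]
    }

    long total = 0;
    for (int t = 0; t < kMaxThreads; ++t)
        for (int j = 0; j < kBuckets; ++j)
            total += counters[t].c[j];
    std::printf("total = %ld (expected %d)\n", total, n);
    return 0;
}
Compile with something like g++ -O2 -std=c++11 -fopenmp.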
How do I get better optimization for this piece of code using OpenMP?
The number of threads is 6, but I can't get better performance.
I have tried different scheduling options, but I can't get it optimized any further.
Is there a way of getting a better result?
int lenght = 40000;
int idx;
long *result = new long[ size ];
#pragma omp parallel for private(idx) schedule(dynamic)
for ( int i = 0; i < lenght; i++ ) {
for ( int j = 0; j < i; j++ ) {
idx = (int)( someCalculations( i, j ) );
#pragma omp atomic
result[ idx ] += 1;
}
}
This piece of code does reduce the calculation time, but I still need a better result.
Thanks in advance.
Since OpenMP 4.0 you can write your own reduction.
The idea is:
in the for loop, you tell the compiler to reduce the array you modify in each iteration;
since OpenMP doesn't know how to reduce such an array, you must write your own adder my_add, which simply sums two arrays;
you tell OpenMP how to use it in your reduction declaration (myred).
#include <stdio.h>
#include <stdlib.h>
#define LEN 40000
int someCalculations(int i, int j)
{
return i * j % 40000 ;
}
/* simple adder: sum x+y into x, then free y */
long *my_add(long * x, long *y)
{
int i;
#pragma omp parallel for private(i)
for (i = 0; i < LEN; ++i)
{
x[i] += y[i];
}
free(y);
return x;
}
/* reduction declaration:
name
type
operation to be performed
initializer */
#pragma omp declare reduction(myred: long*:omp_out=my_add(omp_out,omp_in))\
initializer(omp_priv=calloc(LEN, sizeof(long)))
int main(void)
{
int i, j;
long *result = calloc(LEN, sizeof *result);
// tell omp how to use it
#pragma omp parallel for reduction(myred:result) private (i, j)
for (i = 0; i < LEN; i++) {
for (j = 0; j < i; j++) {
int idx = someCalculations(i, j);
result[idx] += 1;
}
}
// simple display, I store it in a file and compare
// result files with/without openmp to be sure it's correct...
for (i = 0; i < LEN; ++i) {
printf("%ld\n", result[i]);
}
return 0;
}
Without -fopenmp: real 0m3.727s
With -fopenmp: real 0m0.835s
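As a side note (my addition, not part of the answer above): compilers that implement OpenMP 4.5, such as GCC 6 or newer, can also reduce over a C array section directly, which avoids the declare reduction boilerplate. A minimal sketch:
#include <stdio.h>
#include <stdlib.h>

#define LEN 40000

int someCalculations(int i, int j)
{
    return i * j % LEN;
}

int main(void)
{
    long *result = (long *)calloc(LEN, sizeof *result);

    /* OpenMP 4.5 array-section reduction: each thread gets a private,
       zero-initialized copy of result[0:LEN], combined at the end. */
    #pragma omp parallel for reduction(+:result[:LEN]) schedule(dynamic)
    for (int i = 0; i < LEN; i++)
        for (int j = 0; j < i; j++)
            result[someCalculations(i, j)] += 1;

    printf("%ld\n", result[0]);   /* print something so the result is used */
    free(result);
    return 0;
}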
Hello, I'm having a hard time with this program. I'm supposed to go through the whole data vector sequentially and sum up each of the vectors inside it in parallel using OpenMP (and store the sum in solution[i]). But the program gets stuck for some reason. The input vectors that I'm given aren't many, but each is very large (around 2.5 million ints). Any idea what I am doing wrong?
Here is the code (PS: ignore the unused minVectorSize parameter):
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
#pragma omp parallel
{
unsigned long sum = 0;
int thread = omp_get_thread_num();
int threadnum = omp_get_num_threads();
int begin = thread * data[i].size() / threadnum;
int end = ((thread + 1) * data[i].size() / threadnum) - 1;
for (int j = begin; j <= end; j++) {
sum += data[i][j];
}
#pragma omp critical
{
solution[i] += sum;
}
}
}
}
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
unsigned long sum = 0;
int begin = 0;
int end = data[i].size();
#pragma omp parallel for reduction(+:sum)
for (int j = begin; j < end; j++) {
sum += data[i][j];
}
solution[i] += sum;
}
}
Something like this should be more elegant and work better. Could you compile it and comment on whether it works for you or not?
I wrote code to test the performance of OpenMP on Windows (Win7 x64, Core i7 3.4 GHz) and on a Mac (10.12.3, Core i7 2.7 GHz).
In Xcode I made a console application with the default compiler settings. I use LLVM 3.7 and OpenMP 5 (in omp.h I found KMP_VERSION_MAJOR=5, KMP_VERSION_MINOR=0 and KMP_VERSION_BUILD=20150701, libiomp5) on macOS 10.12.3 (CPU: Core i7 2.7 GHz).
For Windows I use VS2010 SP1. Additionally I set C/C++ -> Optimization -> Optimization = Maximize Speed (/O2) and C/C++ -> Optimization -> Favor Size Or Speed = Favor Fast Code (/Ot).
If I run the application in a single thread, the time difference corresponds (approximately) to the frequency ratio of the processors. But if I run 4 threads, the difference becomes tangible: the Windows program is about 70 times faster than the Mac program.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>
static double ActionWithNumber(double number)
{
double sum = 0.0f;
for (std::uint32_t i = 0; i < 50; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
return sum;
}
static double TestOpenMP(void)
{
const std::uint32_t len = 4000000;
double *a;
double *b;
double *c;
double sum = 0.0;
std::mutex _mutex;
a = new double[len];
b = new double[len];
c = new double[len];
for (std::uint32_t i = 0; i < len; i++)
{
c[i] = 0.0;
a[i] = sin((double)i);
b[i] = cos((double)i);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
double k = 2.0;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
c[i] = k*a[i] + b[i] + k;
if (c[i] > 0.0)
{
c[i] += ActionWithNumber(c[i]);
}
else
{
c[i] -= ActionWithNumber(c[i]);
}
std::lock_guard<std::mutex> scoped(_mutex);
sum += c[i];
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
double sum2 = 0.0;
for (std::uint32_t i = 0; i < len; i++)
{
sum2 += c[i];
c[i] /= sum2;
}
if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
delete[] a;
delete[] b;
delete[] c;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const std::uint32_t steps = 5;
for (std::uint32_t i = 0; i < steps; i++)
{
sum += TestOpenMP();
}
sum /= (double)steps;
std::cout << "Elapsed time = " << sum;
return 0;
}
I specifically use a mutex here to compare the performance of OpenMP on the Mac and on Windows. On Windows the function returns a time of 0.39 seconds. On the Mac the function returns a time of 25 seconds, i.e. about 70 times slower.
What is the cause of this difference?
First of all, thanks for editing my post (I use a translator to write the text).
In the real app, I update the values of a huge matrix (20000x20000) in random order. Each thread determines a new value and writes it to a particular cell. I create a mutex for each row, since in most cases different threads write to different rows, but apparently when two threads write to the same row there is a long lock. At the moment I can't divide the rows among the threads, since the order of the writes is determined by the FEM elements.
So simply putting a critical section in there is out of the question, as it would block writes to the entire matrix.
I wrote code like in the real application:
static double ActionWithNumber(double number)
{
const unsigned int steps = 5000;
double sum = 0.0f;
for (u32 i = 0; i < steps; i++)
{
double coeff = sqrt(pow(abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
sum /= (double)steps;
return sum;
}
static double RealAppTest(void)
{
const unsigned int elementsNum = 10000;
double* matrix;
unsigned int* elements;
boost::mutex* mutexes;
elements = new unsigned int[elementsNum*3];
matrix = new double[elementsNum*elementsNum];
mutexes = new boost::mutex[elementsNum];
for (unsigned int i = 0; i < elementsNum; i++)
for (unsigned int j = 0; j < elementsNum; j++)
matrix[i*elementsNum + j] = (double)(rand() % 100);
for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
{
elements[3*i] = rand()%(elementsNum-1);
elements[3*i+1] = rand()%(elementsNum-1);
elements[3*i+2] = rand()%(elementsNum-1);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
boost::lock_guard<boost::mutex> lockup(mutexes[i]);
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
}
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
delete[] elements;
delete[] matrix;
delete[] mutexes;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const u32 steps = 5;
for (u32 i = 0; i < steps; i++)
{
sum += RealAppTest();
}
sum /= (double)steps;
std::cout<<"Elapsed time = " << sum;
return 0;
}
You're combining two different sets of threading/synchronization primitives: OpenMP, which is built into the compiler and has a runtime system, and a manually created POSIX mutex via std::mutex. It's probably not surprising that there are some interoperability hiccups with some compiler/OS combinations.
My guess is that in the slow case, the OpenMP runtime is going overboard to make sure there are no interactions between the higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
#pragma omp critical
sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
omp_set_lock(&sumlock);
sum += c[i];
omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
Updated: There's no problem with using an array of openmp locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++)
omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
omp_set_lock(&(sumlocks[i]));
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
omp_unset_lock(&(sumlocks[i]));
}
}
for (unsigned idx=0; idx<elementsNum; idx++)
omp_destroy_lock(&(sumlocks[idx]));
I want to calculate the access time for these two orderings: row major and column major.
As we know, C/C++ is row major, so when we process the first way (row major) we should be faster.
But look at this code in C++:
#include <iostream>
#include <time.h>
#include <cstdio>
clock_t RowMajor()
{
char* buf =new char [20000,20000];
clock_t start = clock();
for (int i = 0; i < 20000; i++)
for (int j = 0; j <20000; j++)
{
++buf[i,j];
}
clock_t elapsed = clock() - start;
delete [] buf;
return elapsed ;
}
clock_t ColumnMajor()
{
char* buf =new char[20000,20000];
clock_t start = clock();
for (int i = 0; i < 20000; i++)
for (int j = 0; j < 20000; j++)
{
++buf[j,i];
}
clock_t elapsed = clock() - start;
delete [] buf;
return elapsed ;
}
int main()
{
std::cout << "Process Started." << std::endl;
printf( "ColumnMajor took %lu microSeconds. \n", ColumnMajor()*1000000/ (CLOCKS_PER_SEC) );
printf( "RowMajor took %lu microSeconds. \n", RowMajor() *1000000/ (CLOCKS_PER_SEC) );
std::cout << "done" << std::endl; return 0;
}
But whenever I run this code I get different answers: sometimes the RowMajor time is greater than the ColumnMajor time, and sometimes it's the opposite.
Any help is appreciated.
In C++ the comma operator can't be used to create or access a matrix. To make a matrix you need to keep track of the width and height and allocate all the memory as a single array. Basically you create a buffer with a number of elements equal to the number of elements in the matrix, and you get each element by indexing with x + y * width.
clock_t RowMajor()
{
int width = 20000;
int height = 20000;
char* buf = new char[width * height];
clock_t start = clock();
for (int j = 0; j < height; j++)
for (int i = 0; i < width; i++)
{
++buf[i + width * j];
}
clock_t elapsed = clock() - start;
delete[] buf;
return elapsed;
}
for ColumnMajor the buf needs to be accessed with buf[j * width + i];
An alternative way to create a matrix (from comments, thanks to James Kanze) is to create the buffer like so: char (*buf)[20000] = new char[20000][20000]. In this case, the buffer is accessed as buf[i][j].
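A minimal sketch of that variant (my own illustration, timed the same way as the other versions):
#include <ctime>
#include <cstdio>

clock_t RowMajor2D()
{
    const int dim = 20000;
    // One contiguous block, but typed so buf[i][j] does the i*dim + j arithmetic for us.
    char (*buf)[dim] = new char[dim][dim]();   // value-initialized to zero

    clock_t start = clock();
    for (int i = 0; i < dim; i++)              // row index
        for (int j = 0; j < dim; j++)          // column index: contiguous in memory
            ++buf[i][j];
    clock_t elapsed = clock() - start;

    delete[] buf;
    return elapsed;
}

int main()
{
    std::printf("RowMajor2D took %ld ticks.\n", (long)RowMajor2D());
    return 0;
}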
The safest way to do this is to use std::vector or array, and avoid using new/delete. Use std::vector to prevent buffer write overflows:
clock_t RowMajor()
{
int width = 20000;
int height = 20000;
std::vector<char> buf;
buf.resize(width * height);
clock_t start = clock();
for (int j = 0; j <height; j++)
for (int i = 0; i <width; i++)
{
++buf[i + j * width];
}
clock_t elapsed = clock() - start;
return elapsed;
}
Thanks to Raxvan, this is the final code; it works fine so far:
#include <iostream>
#include <time.h>
#include <cstdio>
#include <windows.h>
int calc = 0;
clock_t RowMajor()
{
int width = 20000;
int height = 20000;
char* buf = new char[width * height];
clock_t start = clock();
for (int j = 0; j < height; j++)
for (int i = 0; i < width; i++)
{
++buf[i + width * j];
}
clock_t elapsed = clock() - start;
delete[] buf;
return elapsed;
}
clock_t ColumnMajor()
{
int width = 20000;
int height = 20000;
char* buf = new char[width * height];
clock_t start = clock();
for (int j = 0; j < height; j++)
for (int i = 0; i < width; i++)
{
++buf[j + width * i];
}
clock_t elapsed = clock() - start;
delete[] buf;
return elapsed;
}
int main()
{
std::cout << "Process Started." << std::endl;
calc = ColumnMajor() / CLOCKS_PER_SEC;
printf( "ColumnMajor took %d seconds. \n", calc );
calc = RowMajor() / CLOCKS_PER_SEC;
printf( "RowMajor took %d seconds. \n", calc );
std::cout << "done" << std::endl; return 0;
}