Hello I'm having a hard time with this program, I'm supposed to go trough whole data vector sequentially and sum up each one of the vectors in there in parallel using openmp(and store the sum in solution[i]). But the program gets stuck for some reason. The input vectors that I'm given aren't many but are very large (like 2.5m ints each). Any idea what am I doing wrong?
Here is the code, ps: igone the unused minVectorSize parameter:
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
#pragma omp parallel
{
unsigned long sum = 0;
int thread = omp_get_thread_num();
int threadnum = omp_get_num_threads();
int begin = thread * data[i].size() / threadnum;
int end = ((thread + 1) * data[i].size() / threadnum) - 1;
for (int j = begin; j <= end; j++) {
sum += data[i][j];
}
#pragma omp critical
{
solution[i] += sum;
}
}
}
}
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
unsigned long sum = 0;
int begin = 0;
int end = data[i].size();
#omp parallel for reduction(+:sum)
for (int j = begin; j < end; j++) {
sum += data[i][j];
}
solution[i] += sum;
}
}
Something like this should be more elegant and work better, Could you compile and comment if it works for you or doesnt
Related
I am trying to add cache-line padding to avoid false sharing problem but I cant see a big difference in speedup. With padding its only 1.2 x faster. I am running the code without padding and the one with padding n = 700 milion times for testing. Should I get more speedup than 1.2 times? Maybe I have missed something with my padding implementation? I am adding 15 ints padding because I am assuming that counters doesnt have to be allocated at the start of a cache-line. Any tips appreciated.
Here is my code:
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {
const int paddingAmount = cachelinesize / sizeof(int);
const int kPadded = k + (paddingAmount - 1);
printf("/n%d", kPadded);
int counters[nproc][kPadded] = {}; // all zeros
#pragma omp parallel
{
int *thcounters = counters[omp_get_thread_num()];
#pragma omp for
for (int i = 0; i < n; ++i)
++thcounters[in[i]];
#pragma omp single
{
int tmp, sum = 0;
for (int j = 0; j < k; ++j)
for (int i = 0; i < nproc; ++i) {
tmp = counters[i][j];
counters[i][j] = sum;
sum += tmp;
}
}
#pragma omp for
for (int i = 0; i < n; ++i)
out[thcounters[in[i]]++] = in[i];
}
}
#define k 1000
int main(int argc, char *argv[]) {
//init input
int n = argc>1 && atoi(argv[1])>0 ? atoi(argv[1]) : 0;
int* in = (int*)malloc(sizeof(int)*n);
int* out = (int*)malloc(sizeof(int)*n);;
for (int i = 0; i < n; ++i)
in[i] = rand()%k;
printf("n = %d\n", n);
//print some parameters
printf("nproc = %d\n", nproc);
printf("cachelinesize = %d byte\n", cachelinesize);
printf("k = %d\n", k);
double tp2 = omp_get_wtime();
par_countingsort2<k>(out, in, n);
tp2 = omp_get_wtime() - tp2;
printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n)?'y':'n');
//free mem
free(in);
free(out);
return EXIT_SUCCESS;
}
The goal is to add as much OpenMP to the following Cholesky factor function to increase parallelization. So far, I only have one #pragma omp parallel for implemented correctly. vector<vector<double>> represents a 2-D matrix. I've already tried adding #pragma omp parallel for for
for (int i = 0; i < n; ++i), for (int k = 0; k < i; ++k), and for (int j = 0; j < k; ++j) but the parallelization goes wrong. makeMatrix(n, n) initializes a vector<vector<double>> of all zeroes of size nxn.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
int n = input.size();
vector<vector<double>> result = makeMatrix(n, n);
for (int i = 0; i < n; ++i)
{
for (int k = 0; k < i; ++k)
{
double value = input[i][k];
for (int j = 0; j < k; ++j)
{
value -= result[i][j] * result[k][j];
}
result[i][k] = value / result[k][k];
}
double value = input[i][i];
#pragma omp parallel for
for (int j = 0; j < i; ++j)
{
value -= result[i][j] * result[i][j];
}
result[i][i] = std::sqrt(value);
}
return result;
}
I don't think you can parallelize much more than this with this algorithm, as the ith iteration of the outer loop depends on the results of the i - 1th iteration and the kth iteration of the inner loop depends on the results of the k - 1th iteration.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
int n = input.size();
vector<vector<double>> result = makeMatrix(n, n);
for (int i = 0; i < n; ++i)
{
for (int k = 0; k < i; ++k)
{
double value = input[i][k];
// reduction(-: value) does the same
// (private instances of value are initialized to zero and
// added to the initial instance of value when the threads are joining
#pragma omp parallel for reduction(+: value)
for (int j = 0; j < k; ++j)
{
value -= result[i][j] * result[k][j];
}
result[i][k] = value / result[k][k];
}
double value = input[i][i];
#pragma omp parallel for reduction(+: value)
for (int j = 0; j < i; ++j)
{
value -= result[i][j] * result[i][j];
}
result[i][i] = std::sqrt(value);
}
return result;
}
I wrote code to test the performance of openmp on win (Win7 x64, Corei7 3.4HGz) and on Mac (10.12.3 Core i7 2.7 HGz).
In xcode I made a console application setting the compiled default. I use LLVM 3.7 and OpenMP 5 (in opm.h i searched define KMP_VERSION_MAJOR=5, define KMP_VERSION_MINOR=0 and KMP_VERSION_BUILD = 20150701, libiopm5) on macos 10.12.3 (CPU - Corei7 2700GHz)
For win I use VS2010 Sp1. Additional I set c/C++ -> Optimization -> Optimization = Maximize Speed (O2), c/C++ -> Optimization ->Favor Soze Or Speed = Favor Fast code (Ot).
If I run the application in a single thread, the time difference corresponds to the frequency ratio of processors (approximately). But if you run 4 threads, the difference becomes tangible: win program be faster then mac program in ~70 times.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>
static double ActionWithNumber(double number)
{
double sum = 0.0f;
for (std::uint32_t i = 0; i < 50; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
return sum;
}
static double TestOpenMP(void)
{
const std::uint32_t len = 4000000;
double *a;
double *b;
double *c;
double sum = 0.0;
std::mutex _mutex;
a = new double[len];
b = new double[len];
c = new double[len];
for (std::uint32_t i = 0; i < len; i++)
{
c[i] = 0.0;
a[i] = sin((double)i);
b[i] = cos((double)i);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
double k = 2.0;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
c[i] = k*a[i] + b[i] + k;
if (c[i] > 0.0)
{
c[i] += ActionWithNumber(c[i]);
}
else
{
c[i] -= ActionWithNumber(c[i]);
}
std::lock_guard<std::mutex> scoped(_mutex);
sum += c[i];
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
double sum2 = 0.0;
for (std::uint32_t i = 0; i < len; i++)
{
sum2 += c[i];
c[i] /= sum2;
}
if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
delete[] a;
delete[] b;
delete[] c;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const std::uint32_t steps = 5;
for (std::uint32_t i = 0; i < steps; i++)
{
sum += TestOpenMP();
}
sum /= (double)steps;
std::cout << "Elapsed time = " << sum;
return 0;
}
I specifically use a mutex here to compare the performance of openmp on the "mac" and "win". On the "Win" function returns the time of 0.39 seconds. On the "Mac" function returns the time of 25 seconds, i.e. 70 times slower.
What is the cause of this difference?
First of all, thank for edit my post (i use translater to write text).
In the real app, I update the values in a huge matrix (20000х20000) in random order. Each thread determines the new value and writes it in a particular cell. I create a mutex for each row, since in most cases different threads write to different rows. But apparently in cases when 2 threads write in one row and there is a long lock. At the moment I can't divide the rows in different threads, since the order of records is determined by the FEM elements.
So just to put a critical section in there comes out, as it will block writes to the entire matrix.
I wrote code like in real application.
static double ActionWithNumber(double number)
{
const unsigned int steps = 5000;
double sum = 0.0f;
for (u32 i = 0; i < steps; i++)
{
double coeff = sqrt(pow(abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
sum /= (double)steps;
return sum;
}
static double RealAppTest(void)
{
const unsigned int elementsNum = 10000;
double* matrix;
unsigned int* elements;
boost::mutex* mutexes;
elements = new unsigned int[elementsNum*3];
matrix = new double[elementsNum*elementsNum];
mutexes = new boost::mutex[elementsNum];
for (unsigned int i = 0; i < elementsNum; i++)
for (unsigned int j = 0; j < elementsNum; j++)
matrix[i*elementsNum + j] = (double)(rand() % 100);
for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
{
elements[3*i] = rand()%(elementsNum-1);
elements[3*i+1] = rand()%(elementsNum-1);
elements[3*i+2] = rand()%(elementsNum-1);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
boost::lock_guard<boost::mutex> lockup(mutexes[i]);
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
}
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
delete[] elements;
delete[] matrix;
delete[] mutexes;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const u32 steps = 5;
for (u32 i = 0; i < steps; i++)
{
sum += RealAppTest();
}
sum /= (double)steps;
std::cout<<"Elapsed time = " << sum;
return 0;
}
You're combining two different sets of threading/synchronization primitives - OpenMP, which is built into the compiler and has a runtime system, and manually creating a posix mutex with std::mutex. It's probably not surprising that there's some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there's no interactions between higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
#pragma omp critical
sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
omp_set_lock(&sumlock);
sum += c[i];
omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
Updated: There's no problem with using an array of openmp locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++)
omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
omp_set_lock(&(sumlocks[i]));
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
omp_unset_lock(&(sumlocks[i]));
}
}
for (unsigned idx=0; idx<elementsNum; idx++)
omp_destroy_lock(&(sumlocks[idx]));
I need help with this parallel counting sort. I got a segmentation fault. Gdb says the source of segmentation fault is at this line: c[i] = 0;
What could possibly gone wrong, and how to fix it? Thanks.
void radix_sort::sort_array(int array[], int n)
{
std::hash<int> hash;
std::size_t m = n / nthreads;
std::vector <int> a(n);
a.insert(a.end(), &array[0], &array[n]);
std::vector<int>::iterator begin = a.begin();
std::vector<int>::iterator end = a.end();
int max = *std::max_element(a.begin(), a.end());
//int min = *std::min_element(a.begin(), a.end());
//int x = max - min + 1;
int *split_positions = new int [nthreads+1];
for(std::size_t i=0; i<a.size(); i=i+m){
if(a.begin()+i+m <= a.end()){
split_positions[i] = *a.begin()+i;
split_positions[i+1] = *a.begin()+i+m;
}
else {
split_positions[i] = *a.begin()+i;
split_positions[i+1] = *a.end();
}
}
// create one counter array for each thread
int **thread_counters = new int* [nthreads];
for (int i = 0; i < nthreads; i++)
thread_counters[i] = new int[m];
// count occurences
#pragma omp parallel num_threads(_nthreads)
{
int thread_id = omp_get_thread_num();
int *&c = thread_counters[thread_id];
// reset counters
for (int i = 0; i <= max; i++)
c[i] = 0;
// count occurences
for (int i = split_positions[thread_id]; i < split_positions[thread_id + 1]; i++)
{
c[hash(begin[i])]++;
}
}
// Compute global prefix sums / ranks from local ones. We *could*
// make this parallel, too, but there are only num_threads * (max_key + 1)
// entries in total.
for (int i = 0, sum = 0; i <= max; i++)
{
for (int j = 0; j < nthreads; j++)
{
int t = thread_counters[j][i];
thread_counters[j][i] = sum;
sum += t;
}
}
int *buffer = new int[n]; // backbuffer, copied back to input later
// write sorted result to backbuffer
#pragma omp parallel num_threads(_nthreads)
{
int thread_id = omp_get_thread_num();
int *&c = thread_counters[thread_id];
for (int i = split_positions[thread_id]; i < split_positions[thread_id + 1]; i++)
{
buffer[c[hash(begin[i])]++] = begin[i];
}
}
// write result from buffer back into input
std::copy(buffer, buffer + n, array);
// cleanup
delete [] buffer;
for (int i = 0; i < nthreads; i++)
delete [] thread_counters[i];
delete [] thread_counters;
delete [] split_positions;
}
I am using two different versions of reduction in openmp and I get totally different results. Which one of the following is wrong?
omp_set_num_threads(t);
long long unsigned int d = 0;
#pragma omp parallel for default(none) shared(some_stuff) reduction(+:d)
for (int i=start; i< n; i++)
{
d += calc(i,some_stuff);
}
cout << d << endl;
and the second version is this:
omp_set_num_threads(t);
//reduction array
long long unsigned int* d = new long long unsigned int[t];
for(int i = 0; i < t; i++)
d[i] = 0;
#pragma omp parallel for default(none) shared(somestuff, d)
for (int i=start; i< n; i++)
{
long long unsigned dd = calc(i, somestuff);
d[omp_get_thread_num()] += dd;
}
long long unsigned int res = 0;
for(int i = 0; i < omp_get_num_threads(); i++){
res += d[i];
}
delete[] d;
cout << res << endl;
The second code is wrong. omp_get_num_threads() returns 1 when called outside a parallel region and therefore your code does not reduce all values into the final result. Since you explicitly fix the number of threads to be t, you should instead use:
for(int i = 0; i < t; i++){
res += d[i];
}
Alternatively, you could use omp_get_max_threads().