Speedup when avoiding false sharing problem? - c++

I am trying to add cache-line padding to avoid false sharing problem but I cant see a big difference in speedup. With padding its only 1.2 x faster. I am running the code without padding and the one with padding n = 700 milion times for testing. Should I get more speedup than 1.2 times? Maybe I have missed something with my padding implementation? I am adding 15 ints padding because I am assuming that counters doesnt have to be allocated at the start of a cache-line. Any tips appreciated.
Here is my code:
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {
const int paddingAmount = cachelinesize / sizeof(int);
const int kPadded = k + (paddingAmount - 1);
printf("/n%d", kPadded);
int counters[nproc][kPadded] = {}; // all zeros
#pragma omp parallel
{
int *thcounters = counters[omp_get_thread_num()];
#pragma omp for
for (int i = 0; i < n; ++i)
++thcounters[in[i]];
#pragma omp single
{
int tmp, sum = 0;
for (int j = 0; j < k; ++j)
for (int i = 0; i < nproc; ++i) {
tmp = counters[i][j];
counters[i][j] = sum;
sum += tmp;
}
}
#pragma omp for
for (int i = 0; i < n; ++i)
out[thcounters[in[i]]++] = in[i];
}
}
#define k 1000
int main(int argc, char *argv[]) {
//init input
int n = argc>1 && atoi(argv[1])>0 ? atoi(argv[1]) : 0;
int* in = (int*)malloc(sizeof(int)*n);
int* out = (int*)malloc(sizeof(int)*n);;
for (int i = 0; i < n; ++i)
in[i] = rand()%k;
printf("n = %d\n", n);
//print some parameters
printf("nproc = %d\n", nproc);
printf("cachelinesize = %d byte\n", cachelinesize);
printf("k = %d\n", k);
double tp2 = omp_get_wtime();
par_countingsort2<k>(out, in, n);
tp2 = omp_get_wtime() - tp2;
printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n)?'y':'n');
//free mem
free(in);
free(out);
return EXIT_SUCCESS;
}

Related

Multithread calculate mean and std does not improve efficiency

I am a novice in the field of C++ multithread programming and I try to use multithread to compute the mean and standard deviation of my data in parallel to reduce the cost of time. My function of calculation of mean and standard deviation is as the following.
void cal_mean_std(float* data, float* mean, float* sd, int N, int start_index, int span_cols)
{
int value;
for(int j = start_index; j < start_index + span_cols; j++){
mean[j] = 0;
sd[j] = 0;
for (int i = 0; i < N; i++) {
value = data[j * N + i];
mean[j] += value;
sd[j] += value * value;
}
mean[j] = mean[j] / N;
sd[j] = sqrt(sd[j] / N - mean[j] * mean[j]);
}
}
I specify the start index and calculation spans of each thread and I activate my thread_pool as the following.
x.mean = new float[x.M];
x.sd = new float[x.M];
std::vector<std::thread> thread_pool;
int h = 4;
thread_pool.reserve(h);
int SNIPs = static_cast<int>(x.M / h + 1);
int SNIPs_final = x.M - (h - 1) * SNIPs;
for (int i = 0; i < h - 1; i++)
{
thread_pool.push_back(std::thread(std::bind(cal_mean_std, x.data, x.mean, x.sd,
x.N, i*SNIPs, SNIPs)));
}
thread_pool.push_back(std::thread(std::bind(cal_mean_std, x.data, x.mean, x.sd,
x.N, (h-1)*SNIPs, SNIPs_final)));
for (int i = 0; i < h; i++)
thread_pool.at(i).join();
where the x.M is the total number of cols of my data. However, I found that implement in this way did not improve the program efficiency. I am not sure what the problem is.
Actually, we can simulate data to do the computation. My data size is 5k x 300k. The sequential calculation by using for loop all over the data one thread takes 15 seconds. My multithreading version sometimes takes 16 seconds.
The simulation code is as the following and I find that when I use h = 1, the program takes 6s to finish. However, when I use h = 4, the program takes 14s to finish.
#include <thread>
#include <vector>
#include <stdlib.h>
#include <vector>
#include <stdio.h>
#include <iostream>
#include <math.h>
void gen_matrix(int N, int P, float* data){
for (int i = 0; i < N * P; i++)
{
data[i] = rand() % 10;
}
}
void cal_mean_std(float* data, float* mean, float* sd, int N, int start_index, int span_cols)
{
int value;
for(int j = start_index; j < start_index + span_cols; j++){
mean[j] = 0;
sd[j] = 0;
for (int i = 0; i < N; i++) {
value = data[j * N + i];
mean[j] += value;
sd[j] += value * value;
}
mean[j] = mean[j] / N;
sd[j] = sqrt(sd[j] / N - mean[j] * mean[j]);
}
}
int main()
{
int N = 5000;
int P = 300000;
float* data = new float[N*P];
gen_matrix(N, P, data);
float* mean = new float[P];
float* std = new float[P];
std::vector<std::thread> thread_pool;
clock_t t1;
t1 = clock();
int h = 1;
thread_pool.reserve(h);
int SNIPs = static_cast<int>(P / h + 1);
int SNIPs_final = P - (h - 1) * SNIPs;
for (int i = 0; i < h - 1; i++)
{
thread_pool.push_back(std::thread(std::bind(cal_mean_std, data, mean, std,
N, i*SNIPs, SNIPs)));
}
thread_pool.push_back(std::thread(std::bind(cal_mean_std, data, mean, std,
N, (h-1)*SNIPs, SNIPs_final)));
for (int i = 0; i < h; i++)
thread_pool.at(i).join();
std::cout <<"Time for the cal mean and std is " << (clock() - t1) * 1.0/CLOCKS_PER_SEC << std::endl;
return 0;
}
Thank you, everyone. Finally, I found what the problem is with my code. The timer clock_t computes the CPU consumption time instead of wall time.

c++ threading in openmp

Hello I'm having a hard time with this program, I'm supposed to go trough whole data vector sequentially and sum up each one of the vectors in there in parallel using openmp(and store the sum in solution[i]). But the program gets stuck for some reason. The input vectors that I'm given aren't many but are very large (like 2.5m ints each). Any idea what am I doing wrong?
Here is the code, ps: igone the unused minVectorSize parameter:
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
#pragma omp parallel
{
unsigned long sum = 0;
int thread = omp_get_thread_num();
int threadnum = omp_get_num_threads();
int begin = thread * data[i].size() / threadnum;
int end = ((thread + 1) * data[i].size() / threadnum) - 1;
for (int j = begin; j <= end; j++) {
sum += data[i][j];
}
#pragma omp critical
{
solution[i] += sum;
}
}
}
}
void sumsOfVectors_omp_per_vector(const vector<vector<int8_t>> &data, vector<long> &solution, unsigned long minVectorSize) {
unsigned long vectorNum = data.size();
for (int i = 0; i < vectorNum; i++) {
unsigned long sum = 0;
int begin = 0;
int end = data[i].size();
#omp parallel for reduction(+:sum)
for (int j = begin; j < end; j++) {
sum += data[i][j];
}
solution[i] += sum;
}
}
Something like this should be more elegant and work better, Could you compile and comment if it works for you or doesnt

Very slow mutex in LLVM/OpenMP

I wrote code to test the performance of openmp on win (Win7 x64, Corei7 3.4HGz) and on Mac (10.12.3 Core i7 2.7 HGz).
In xcode I made a console application setting the compiled default. I use LLVM 3.7 and OpenMP 5 (in opm.h i searched define KMP_VERSION_MAJOR=5, define KMP_VERSION_MINOR=0 and KMP_VERSION_BUILD = 20150701, libiopm5) on macos 10.12.3 (CPU - Corei7 2700GHz)
For win I use VS2010 Sp1. Additional I set c/C++ -> Optimization -> Optimization = Maximize Speed (O2), c/C++ -> Optimization ->Favor Soze Or Speed = Favor Fast code (Ot).
If I run the application in a single thread, the time difference corresponds to the frequency ratio of processors (approximately). But if you run 4 threads, the difference becomes tangible: win program be faster then mac program in ~70 times.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>
static double ActionWithNumber(double number)
{
double sum = 0.0f;
for (std::uint32_t i = 0; i < 50; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
return sum;
}
static double TestOpenMP(void)
{
const std::uint32_t len = 4000000;
double *a;
double *b;
double *c;
double sum = 0.0;
std::mutex _mutex;
a = new double[len];
b = new double[len];
c = new double[len];
for (std::uint32_t i = 0; i < len; i++)
{
c[i] = 0.0;
a[i] = sin((double)i);
b[i] = cos((double)i);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
double k = 2.0;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
c[i] = k*a[i] + b[i] + k;
if (c[i] > 0.0)
{
c[i] += ActionWithNumber(c[i]);
}
else
{
c[i] -= ActionWithNumber(c[i]);
}
std::lock_guard<std::mutex> scoped(_mutex);
sum += c[i];
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
double sum2 = 0.0;
for (std::uint32_t i = 0; i < len; i++)
{
sum2 += c[i];
c[i] /= sum2;
}
if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
delete[] a;
delete[] b;
delete[] c;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const std::uint32_t steps = 5;
for (std::uint32_t i = 0; i < steps; i++)
{
sum += TestOpenMP();
}
sum /= (double)steps;
std::cout << "Elapsed time = " << sum;
return 0;
}
I specifically use a mutex here to compare the performance of openmp on the "mac" and "win". On the "Win" function returns the time of 0.39 seconds. On the "Mac" function returns the time of 25 seconds, i.e. 70 times slower.
What is the cause of this difference?
First of all, thank for edit my post (i use translater to write text).
In the real app, I update the values in a huge matrix (20000х20000) in random order. Each thread determines the new value and writes it in a particular cell. I create a mutex for each row, since in most cases different threads write to different rows. But apparently in cases when 2 threads write in one row and there is a long lock. At the moment I can't divide the rows in different threads, since the order of records is determined by the FEM elements.
So just to put a critical section in there comes out, as it will block writes to the entire matrix.
I wrote code like in real application.
static double ActionWithNumber(double number)
{
const unsigned int steps = 5000;
double sum = 0.0f;
for (u32 i = 0; i < steps; i++)
{
double coeff = sqrt(pow(abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
sum /= (double)steps;
return sum;
}
static double RealAppTest(void)
{
const unsigned int elementsNum = 10000;
double* matrix;
unsigned int* elements;
boost::mutex* mutexes;
elements = new unsigned int[elementsNum*3];
matrix = new double[elementsNum*elementsNum];
mutexes = new boost::mutex[elementsNum];
for (unsigned int i = 0; i < elementsNum; i++)
for (unsigned int j = 0; j < elementsNum; j++)
matrix[i*elementsNum + j] = (double)(rand() % 100);
for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
{
elements[3*i] = rand()%(elementsNum-1);
elements[3*i+1] = rand()%(elementsNum-1);
elements[3*i+2] = rand()%(elementsNum-1);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
boost::lock_guard<boost::mutex> lockup(mutexes[i]);
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
}
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
delete[] elements;
delete[] matrix;
delete[] mutexes;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const u32 steps = 5;
for (u32 i = 0; i < steps; i++)
{
sum += RealAppTest();
}
sum /= (double)steps;
std::cout<<"Elapsed time = " << sum;
return 0;
}
You're combining two different sets of threading/synchronization primitives - OpenMP, which is built into the compiler and has a runtime system, and manually creating a posix mutex with std::mutex. It's probably not surprising that there's some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there's no interactions between higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
#pragma omp critical
sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
omp_set_lock(&sumlock);
sum += c[i];
omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
Updated: There's no problem with using an array of openmp locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++)
omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
omp_set_lock(&(sumlocks[i]));
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
omp_unset_lock(&(sumlocks[i]));
}
}
for (unsigned idx=0; idx<elementsNum; idx++)
omp_destroy_lock(&(sumlocks[idx]));

Segmentation Fault Counting Sort C++11 OpenMP

I need help with this parallel counting sort. I got a segmentation fault. Gdb says the source of segmentation fault is at this line: c[i] = 0;
What could possibly gone wrong, and how to fix it? Thanks.
void radix_sort::sort_array(int array[], int n)
{
std::hash<int> hash;
std::size_t m = n / nthreads;
std::vector <int> a(n);
a.insert(a.end(), &array[0], &array[n]);
std::vector<int>::iterator begin = a.begin();
std::vector<int>::iterator end = a.end();
int max = *std::max_element(a.begin(), a.end());
//int min = *std::min_element(a.begin(), a.end());
//int x = max - min + 1;
int *split_positions = new int [nthreads+1];
for(std::size_t i=0; i<a.size(); i=i+m){
if(a.begin()+i+m <= a.end()){
split_positions[i] = *a.begin()+i;
split_positions[i+1] = *a.begin()+i+m;
}
else {
split_positions[i] = *a.begin()+i;
split_positions[i+1] = *a.end();
}
}
// create one counter array for each thread
int **thread_counters = new int* [nthreads];
for (int i = 0; i < nthreads; i++)
thread_counters[i] = new int[m];
// count occurences
#pragma omp parallel num_threads(_nthreads)
{
int thread_id = omp_get_thread_num();
int *&c = thread_counters[thread_id];
// reset counters
for (int i = 0; i <= max; i++)
c[i] = 0;
// count occurences
for (int i = split_positions[thread_id]; i < split_positions[thread_id + 1]; i++)
{
c[hash(begin[i])]++;
}
}
// Compute global prefix sums / ranks from local ones. We *could*
// make this parallel, too, but there are only num_threads * (max_key + 1)
// entries in total.
for (int i = 0, sum = 0; i <= max; i++)
{
for (int j = 0; j < nthreads; j++)
{
int t = thread_counters[j][i];
thread_counters[j][i] = sum;
sum += t;
}
}
int *buffer = new int[n]; // backbuffer, copied back to input later
// write sorted result to backbuffer
#pragma omp parallel num_threads(_nthreads)
{
int thread_id = omp_get_thread_num();
int *&c = thread_counters[thread_id];
for (int i = split_positions[thread_id]; i < split_positions[thread_id + 1]; i++)
{
buffer[c[hash(begin[i])]++] = begin[i];
}
}
// write result from buffer back into input
std::copy(buffer, buffer + n, array);
// cleanup
delete [] buffer;
for (int i = 0; i < nthreads; i++)
delete [] thread_counters[i];
delete [] thread_counters;
delete [] split_positions;
}

POSIX Threads not producing speed up in C

I am learning parallel processing using Pthreads. I have a quad core processor. Unfortunately, the parallelized portion of the following code is running roughly 5X slower than the non-parallelized code. What am I doing wrong here? Thanks in advance for the help.
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000
struct params {
int * arr;
int sum;
};
/* The worker function for the pthreads */
void * myFun (void * x){
int i;
struct params * b = (struct params *) x;
for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
b->sum += b->arr[i];
}
return NULL;
}
/* unparallelized summing function*/
int arrSum(int * arr, int size){
int sum = 0;
for (int i = 0; i != size; ++i){
sum += arr[i];
}
return sum;
}
int main(int argc, char * argv[]){
clock_t begin, end;
double runTime;
int rc, i;
int sum1, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memory\n");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i != NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
begin = clock();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Parallelized code run time: %f\n", runTime);
/* run the unparallelized code */
begin = clock();
sum2 = arrSum(myArr, SIZE);
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Unparallelized code run time: %f\n", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d \n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}
You're missing one important aspect of parallel programming.
The worker threads need to be created once per process, not for every task.
Creating and destroying threads takes time.
The solution is to use a thread pool and send tasks to the pool.
My suggestion is to use OpenMP which simplifies this task considerably and works with many compilers.
Example:
int sum = 0
#pragma omp for shared(sum)
for(int i=0; i<SIZE; ++i)
{
#pragma omp atomic
sum += myArr[i]
}
To make this work faster, do some loop unrolling - e.g. calculate the sum of 8 number in a single for loop scope.
The main problem is that you're using clock() which does not return the wall time but the cumulative CPU time. This is the most common mistake with the OpenMP tag with SO (and if the frequency listing was useful on SO it should show this).
The simplest way to get the wall time is to use a function from OpenMP: omp_get_wtime(). This works Linux and Windows with GCC, ICC, and MSVC (and I assume Clang which now supports OpenMP 3.1).
When I use this with your code I get on my four core/eight hyper-thread i7 IVB system:
Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000
Some other comments. Your scheduling is error prone. You set the array for each thread to
p.arr = myArr + i*(int)(SIZE/NTHREADS);
And then have each thread run over (SIZE/NTHREADS). This can give wrong results to to rounding errors for some values of SIZE and NTHREADS.
You should have each thread run over
int start = ithread*SIZE/NTHREADS;
int finish = (ithreads+1)*SIZE/NTHREADS;
And then have each thread point to the beginning of the array and do
int sum = 0;
for (i = start; i < finish; ++i){
sum += b->arr[i];
}
This is essentially what OpenMP's schedule(static) does. In fact you can get the same effect of pthreads using OpenMP by doing
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
sum += arr[i];
}
Here is the code I used
//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>
#define NTHREADS 4
#define SIZE NTHREADS*100000000
struct params {
int * arr;
int sum;
};
/* The worker function for the pthreads */
void * myFun (void * x){
int i;
struct params * b = (struct params *) x;
int sum = 0;
for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
sum += b->arr[i];
}
b->sum = sum;
return NULL;
}
/* unparallelized summing function*/
int arrSum(int * arr, int size){
int sum = 0;
for (int i = 0; i < size; ++i){
sum += arr[i];
}
return sum;
}
int main(int argc, char * argv[]) {
double runTime;
int rc, i;
int sum1, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memory\n");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i < NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
runTime = -omp_get_wtime();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
runTime += omp_get_wtime();
printf("Parallelized code run time: %f\n", runTime);
/* run the unparallelized code */
runTime = -omp_get_wtime();
sum2 = arrSum(myArr, SIZE);
runTime += omp_get_wtime();
printf("Unparallelized code run time: %f\n", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d \n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}