I have a short piece of code which does the following:
Accesses an array of structs (vec1) sequentially
Accesses an int array (hashT) randomly, according to the value read from the struct array
Modifies the values of the struct array using the value read from the int array.
I ran the program on 8 physical CPUs on the same NUMA node, but with a hashT of size 5M (i.e. 5M ints in it) I only got a 7.2x speedup. I varied the size of hashT and saw that when it is small I get a 7.9x speedup, but as it grows larger, at some point (around 3M ints in the array), the speedup starts to drop. I have 200GB of RAM, so memory capacity should not be a problem. I also measured the memory bandwidth and I am nowhere near half of the maximum available bandwidth. So I thought it might be due to cache contention: I have a 20MB L3 cache which is shared by all threads. But I measured the L2 and L3 cache misses and found that they are almost the same no matter how many threads I use (although increasing the size of hashT does cause more cache misses). So I don't see any reason why the speedup drops. Can anybody give me an idea?
#include <iostream>
#include <vector>
#include <chrono>
#include <cstdlib> // atoi, rand
#include <omp.h>
#include <stdio.h>
using namespace std;

typedef std::chrono::milliseconds ms;

struct Element{
    int a, b, c;
    Element() : a(0), b(1), c(2) {}
    Element(int aa, int bb, int cc) : a(aa), b(bb), c(cc) {}
};

int main(int argc, char *argv[]){
    int HASHT_SIZE = atoi(argv[1]);
    int RID_SIZE = 5000000;
    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    //initialize vec1: 40 sequentially accessed arrays of structs
    vector<vector<Element> > vec1(40);
    for(size_t i = 0; i < vec1.size(); i++){
        vec1[i].reserve(RID_SIZE);
        for(size_t j = 0; j < vec1[i].capacity(); j++){
            vec1[i].push_back(Element(rand() % HASHT_SIZE, 0, 0));
        }
    }

    //initialize hashT
    vector<int> hashT;
    hashT.reserve(HASHT_SIZE);
    for(int i = 0; i < HASHT_SIZE; i++){
        hashT.push_back(rand() % 2);
    }

    start = std::chrono::high_resolution_clock::now();
    //test program
    #pragma omp parallel for schedule(dynamic)
    for(int j = 0; j < 40; j++)
    {
        for(int i = 0; i < RID_SIZE; i++){
            int nid = hashT[vec1[j][i].a];
            for(int k = 0; k < 10; k++){
                vec1[j][i].b += nid / (k+1);
                vec1[j][i].c += nid / (k+1);
            }
        }
    }
    end = std::chrono::high_resolution_clock::now();

    //print a few results so the compiler cannot optimize the loop away
    for(int i = 0; i < 40; i++){
        for(int j = 0; j < 10; j++){
            cout << vec1[i][j].a << vec1[i][j].b << vec1[i][j].c;
        }
    }
    cout << endl;
    cout << "Time used: " << std::chrono::duration_cast<ms>(end - start).count() << endl;
    return 0;
}
Related
I was timing sorting algorithms by giving them millions of numbers to sort, but my code exits without showing any output once the input size goes above 500,000. Is there any way I can solve this?
int size;
cout << "Enter size of the array: " << endl;
cin >> size;
int a[size];
for (int i = 0; i < size; i++)
{
    a[i] = rand() % size;
}
int temp = 0;
double cl = clock();
for (int i = 0; i < size; i++)
{
    for (int j = i + 1; j < size; j++)
    {
        if (a[j] < a[i])
        {
            temp = a[i];
            a[i] = a[j];
            a[j] = temp;
        }
    }
}
double final = clock() - cl;
cout << final / (double)CLOCKS_PER_SEC;
}
Your code crashes on an input of 500'000 because of a stack overflow: you are allocating an array on the stack that is far too big:
int a[size];
The stack is usually a few megabytes at most.
Also, a variable-length array on the stack like this is a compiler extension that not all compilers support; normally the size must be a compile-time constant.
To avoid the stack overflow, you can use std::vector, which can hold any size that fits in free memory:
std::vector<int> a(size);
(also #include <vector>). Or you may use a dynamically allocated array through the new operator:
int * a = new int[size];
In that case, don't forget to do delete[] a; at the end of the program.
Also note that an input of 500'000 takes a very long time with your O(n²) bubble sort; an input 10 times smaller, 50'000, already takes around 10 seconds on my machine.
Full working code using std::vector plus code formatting:
#include <iostream>
#include <vector>
#include <cstdlib> // rand
#include <ctime>   // clock, CLOCKS_PER_SEC
using namespace std;
int main() {
    int size;
    cout << "Enter size of the array: " << endl;
    cin >> size;
    std::vector<int> a(size);
    for (int i = 0; i < size; i++) {
        a[i] = rand() % size;
    }
    int temp = 0;
    double cl = clock();
    for (int i = 0; i < size; i++) {
        for (int j = i + 1; j < size; j++) {
            if (a[j] < a[i]) {
                temp = a[i];
                a[i] = a[j];
                a[j] = temp;
            }
        }
    }
    double final = clock() - cl;
    cout << final / (double)CLOCKS_PER_SEC;
}
I am trying to add cache-line padding to avoid the false sharing problem, but I can't see a big difference in speedup: with padding it is only 1.2x faster. I am running the code without padding and the one with padding with n = 700 million elements for testing. Should I get more speedup than 1.2x? Maybe I have missed something in my padding implementation? I am adding 15 ints of padding because I am assuming the counters don't have to be allocated at the start of a cache line. Any tips appreciated.
Here is my code:
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {
    // pad each thread's row of counters by one cache line worth of ints
    const int paddingAmount = cachelinesize / sizeof(int);
    const int kPadded = k + (paddingAmount - 1);
    printf("\n%d", kPadded);
    int counters[nproc][kPadded] = {}; // all zeros
    #pragma omp parallel
    {
        int *thcounters = counters[omp_get_thread_num()];
        #pragma omp for
        for (int i = 0; i < n; ++i)
            ++thcounters[in[i]];
        #pragma omp single
        {
            int tmp, sum = 0;
            for (int j = 0; j < k; ++j)
                for (int i = 0; i < nproc; ++i) {
                    tmp = counters[i][j];
                    counters[i][j] = sum;
                    sum += tmp;
                }
        }
        #pragma omp for
        for (int i = 0; i < n; ++i)
            out[thcounters[in[i]]++] = in[i];
    }
}
#define k 1000

int main(int argc, char *argv[]) {
    //init input
    int n = argc > 1 && atoi(argv[1]) > 0 ? atoi(argv[1]) : 0;
    int* in = (int*)malloc(sizeof(int) * n);
    int* out = (int*)malloc(sizeof(int) * n);
    for (int i = 0; i < n; ++i)
        in[i] = rand() % k;
    printf("n = %d\n", n);
    //print some parameters
    printf("nproc = %d\n", nproc);
    printf("cachelinesize = %d byte\n", cachelinesize);
    printf("k = %d\n", k);
    double tp2 = omp_get_wtime();
    par_countingsort2<k>(out, in, n);
    tp2 = omp_get_wtime() - tp2;
    // tp (the par1 timing) and checkreset() are defined elsewhere in the full program
    printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n) ? 'y' : 'n');
    //free mem
    free(in);
    free(out);
    return EXIT_SUCCESS;
}
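As a side note on the padding itself: adding 15 ints to each row only helps if a row happens to start near a cache-line boundary. One way to guarantee that is to align each thread's counters explicitly. A minimal sketch, assuming a 64-byte cache line (the struct PaddedCounters and the fixed thread count are illustrative, not from the code above):
#include <cstdio>

constexpr int kCachelineSize = 64; // assumed cache-line size in bytes
constexpr int kBuckets = 1000;     // same role as the template parameter k

// Each thread's counters start at a cache-line boundary and occupy a
// whole number of cache lines, so two threads never share a line.
struct alignas(kCachelineSize) PaddedCounters {
    int counts[kBuckets];
};

int main() {
    static PaddedCounters counters[8] = {}; // one row per thread, all zeros
    printf("sizeof(PaddedCounters) = %zu\n", sizeof(PaddedCounters)); // multiple of 64
    printf("row 0 at %p, row 1 at %p\n", (void*)&counters[0], (void*)&counters[1]);
    return 0;
}
Because sizeof(PaddedCounters) is rounded up to a multiple of its 64-byte alignment, adjacent rows can never share a cache line, regardless of where the array starts.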
I have several matrices that I want to multiply in C++ in a way that allows vectorization. However, the following code results in a large execution time, ~858146125 ns. How do I modify the code so the matrix multiplication is vectorized and the execution time drops to around 100 ns?
I am compiling with the -O3 flag.
const int ROWS = 1000;
const int COLS = 1000;
const int ROWS1 = 1000;
const int COLS1 = 1000;
const int l = 1000;

double random_matrix[ROWS][COLS];
double random_matrix1[ROWS1][COLS1];
double mult[l][l];

int i;
int j;
/* generate number: */
for (i = 0; i < ROWS; i++) {
    for (j = 0; j < COLS; j++)
        random_matrix[i][j] = i + j;
}
for (i = 0; i < ROWS1; i++) {
    for (j = 0; j < COLS1; j++)
        random_matrix1[i][j] = i + j;
}

auto start = std::chrono::steady_clock::now();
for (size_t row = 0; row < ROWS; ++row) {
    for (size_t tmp = 0; tmp < COLS1; ++tmp) {
        mult[row][tmp] = random_matrix[row][0] * random_matrix1[0][tmp];
        for (size_t col = 1; col < COLS; ++col) {
            mult[row][tmp] += random_matrix[row][col] * random_matrix1[col][tmp];
        }
    }
}
auto end = std::chrono::steady_clock::now();

std::cout << "Elapsed time in nanoseconds : "
          << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()
          << " ns" << std::endl;
std::cout << "\n";
for (i = 0; i < ROWS; i++)
{
    for (j = 0; j < COLS1; j++)
        std::cout << mult[i][j] << std::endl; //display table
    std::cout << "\n";
}
I'm afraid you'll never get to 100 ns total execution time with these matrix sizes, with vectorization or without. Multiplying two 1000 x 1000 matrices takes on the order of 1000^3 = 1,000,000,000 multiply-adds, i.e. one billion operations; even at an optimistic sustained rate of 10^10 multiply-adds per second, that is still ~0.1 s, six orders of magnitude away from your 100 ns target.
Secondly, if performance matters so much to you, you should NOT write your own code for these low-level mathematical primitives. There are optimized C++ libraries that will perform these operations for you, such as Eigen or a BLAS implementation (Intel MKL is one package that implements BLAS).
By using one of these packages, you not only get much better performance, but also avoid the potential pitfalls or bugs that you would likely have otherwise.
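To illustrate, here is a minimal sketch of the same multiplication using Eigen (the matrix sizes and the i + j initialization are taken from the question; absolute timings will of course depend on your machine and compiler flags):
#include <Eigen/Dense>
#include <chrono>
#include <iostream>

int main() {
    const int n = 1000;
    // Initialize A(i,j) = B(i,j) = i + j, as in the question's loops.
    Eigen::MatrixXd A(n, n), B(n, n);
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            A(i, j) = B(i, j) = i + j;

    auto start = std::chrono::steady_clock::now();
    Eigen::MatrixXd C = A * B; // vectorized, cache-blocked multiply
    auto end = std::chrono::steady_clock::now();

    std::cout << "Elapsed: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << " ms, C(0,0) = " << C(0, 0) << std::endl;
    return 0;
}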
I wrote code to test the performance of OpenMP on Windows (Win7 x64, Core i7 3.4 GHz) and on Mac (macOS 10.12.3, Core i7 2.7 GHz).
In Xcode I made a console application with the default build settings. I use LLVM 3.7 and OpenMP 5 (in omp.h I found the defines KMP_VERSION_MAJOR=5, KMP_VERSION_MINOR=0 and KMP_VERSION_BUILD=20150701, libiomp5).
For Windows I use VS2010 SP1. Additionally, I set C/C++ -> Optimization -> Optimization = Maximize Speed (/O2) and C/C++ -> Optimization -> Favor Size Or Speed = Favor Fast Code (/Ot).
If I run the application in a single thread, the time difference roughly corresponds to the frequency ratio of the processors. But if I run 4 threads, the difference becomes dramatic: the Windows program is ~70 times faster than the Mac program.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>

static double ActionWithNumber(double number)
{
    double sum = 0.0;
    for (std::uint32_t i = 0; i < 50; i++)
    {
        double coeff = sqrt(pow(std::abs(number), 0.1));
        double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
        sum += sqrt(res);
    }
    return sum;
}

static double TestOpenMP(void)
{
    const std::uint32_t len = 4000000;
    double *a;
    double *b;
    double *c;
    double sum = 0.0;
    std::mutex _mutex;

    a = new double[len];
    b = new double[len];
    c = new double[len];

    for (std::uint32_t i = 0; i < len; i++)
    {
        c[i] = 0.0;
        a[i] = sin((double)i);
        b[i] = cos((double)i);
    }

    boost::chrono::time_point<boost::chrono::system_clock> start, end;
    start = boost::chrono::system_clock::now();

    double k = 2.0;
    omp_set_num_threads(4);
    #pragma omp parallel for
    for (int i = 0; i < len; i++)
    {
        c[i] = k*a[i] + b[i] + k;
        if (c[i] > 0.0)
        {
            c[i] += ActionWithNumber(c[i]);
        }
        else
        {
            c[i] -= ActionWithNumber(c[i]);
        }
        std::lock_guard<std::mutex> scoped(_mutex);
        sum += c[i];
    }
    end = boost::chrono::system_clock::now();
    boost::chrono::duration<double> elapsed_time = end - start;

    double sum2 = 0.0;
    for (std::uint32_t i = 0; i < len; i++)
    {
        sum2 += c[i];
        c[i] /= sum2;
    }
    if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");

    delete[] a;
    delete[] b;
    delete[] c;
    return elapsed_time.count();
}

int main()
{
    double sum = 0.0;
    const std::uint32_t steps = 5;
    for (std::uint32_t i = 0; i < steps; i++)
    {
        sum += TestOpenMP();
    }
    sum /= (double)steps;
    std::cout << "Elapsed time = " << sum;
    return 0;
}
I specifically use a mutex here to compare the performance of OpenMP on the Mac and on Windows. On Windows the function returns a time of 0.39 seconds; on the Mac it returns 25 seconds, i.e. 70 times slower.
What is the cause of this difference?
First of all, thanks for editing my post (I use a translator to write the text).
In the real app, I update the values in a huge matrix (20000×20000) in random order. Each thread determines the new value and writes it into a particular cell. I create a mutex for each row, since in most cases different threads write to different rows, but apparently when two threads write to the same row there is a long lock. At the moment I can't partition the rows across threads, since the order of writes is determined by the FEM elements.
So simply putting a critical section in there is out, as it would block writes to the entire matrix.
I wrote code like in the real application:
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <omp.h>
#include <boost/thread.hpp>
#include <boost/chrono/chrono.hpp>

typedef unsigned int u32;

static double ActionWithNumber(double number)
{
    const unsigned int steps = 5000;
    double sum = 0.0;
    for (u32 i = 0; i < steps; i++)
    {
        double coeff = sqrt(pow(std::abs(number), 0.1));
        double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
        sum += sqrt(res);
    }
    sum /= (double)steps;
    return sum;
}

static double RealAppTest(void)
{
    const unsigned int elementsNum = 10000;
    double* matrix;
    unsigned int* elements;
    boost::mutex* mutexes;

    elements = new unsigned int[elementsNum*3];
    matrix = new double[elementsNum*elementsNum];
    mutexes = new boost::mutex[elementsNum];

    for (unsigned int i = 0; i < elementsNum; i++)
        for (unsigned int j = 0; j < elementsNum; j++)
            matrix[i*elementsNum + j] = (double)(rand() % 100);

    for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
    {
        elements[3*i] = rand()%(elementsNum-1);
        elements[3*i+1] = rand()%(elementsNum-1);
        elements[3*i+2] = rand()%(elementsNum-1);
    }

    boost::chrono::time_point<boost::chrono::system_clock> start, end;
    start = boost::chrono::system_clock::now();

    omp_set_num_threads(4);
    #pragma omp parallel for
    for (int i = 0; i < elementsNum; i++)
    {
        unsigned int* elems = &elements[3*i];
        for (unsigned int j = 0; j < 3; j++)
        {
            //in here set mutex for row with index = elems[j];
            boost::lock_guard<boost::mutex> lockup(mutexes[i]);
            double res = 0.0;
            for (unsigned int k = 0; k < 3; k++)
            {
                res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
            }
            for (unsigned int k = 0; k < 3; k++)
            {
                matrix[elems[j]*elementsNum + elems[k]] = res;
            }
        }
    }
    end = boost::chrono::system_clock::now();
    boost::chrono::duration<double> elapsed_time = end - start;

    delete[] elements;
    delete[] matrix;
    delete[] mutexes;
    return elapsed_time.count();
}

int main()
{
    double sum = 0.0;
    const u32 steps = 5;
    for (u32 i = 0; i < steps; i++)
    {
        sum += RealAppTest();
    }
    sum /= (double)steps;
    std::cout << "Elapsed time = " << sum;
    return 0;
}
You're combining two different sets of threading/synchronization primitives: OpenMP, which is built into the compiler and has a runtime system, and a manually created POSIX mutex via std::mutex. It's probably not surprising that there are some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there are no interactions between higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    #pragma omp critical
    sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    omp_set_lock(&sumlock);
    sum += c[i];
    omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
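As an aside (not part of the measurements above): for a plain running sum like the one in this test, OpenMP's reduction clause avoids per-iteration locking entirely; each thread accumulates a private partial sum, and the runtime combines them at the end. A minimal sketch of the same loop:
double sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < len; i++)
{
    //... same per-element work as above ...
    sum += c[i]; // accumulates into a thread-private copy; copies are merged after the loop
}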
Updated: There's no problem with using an array of OpenMP locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx = 0; idx < elementsNum; idx++)
    omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
    unsigned int* elems = &elements[3*i];
    for (unsigned int j = 0; j < 3; j++)
    {
        //in here set mutex for row with index = elems[j];
        double res = 0.0;
        for (unsigned int k = 0; k < 3; k++)
        {
            res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
        }
        omp_set_lock(&(sumlocks[i]));
        for (unsigned int k = 0; k < 3; k++)
        {
            matrix[elems[j]*elementsNum + elems[k]] = res;
        }
        omp_unset_lock(&(sumlocks[i]));
    }
}
for (unsigned idx = 0; idx < elementsNum; idx++)
    omp_destroy_lock(&(sumlocks[idx]));
I just started to use OpenMP to do parallel computing in C++. The program has bad parallel performance. Since I don't know many multi-threading profiling tools (unlike plain gprof for single-threaded code), I wrote a sample program to test the performance.
I have a 2D matrix (N * N), with each element a 3D vector (x, y, z). I simply do a double for loop to set each value in the matrix:
for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
        vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
    }
}
where VECTOR3D is a simple class that has x, y, z attributes:
class VECTOR3D {
    double x, y, z; // component along each axis
};
On the other hand, I can also use a (N * N * 3) 3D array to do this:
for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
        arrayHeap[i][j][0] = (1.0*i*i);
        arrayHeap[i][j][1] = (1.0*j*j);
        arrayHeap[i][j][2] = (1.0*i*j);
    }
}
From the memory aspect, there are also several different choices, such as using raw pointers with manual allocation and deallocation:
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
    arrayHeap[i] = new double* [N];
    for(int j = 0; j < N; ++j) {
        arrayHeap[i][j] = new double[3];
    }
}
or simply use std::vector:
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
I also considered manually allocating continuous memory for the arrays, as is done in the LAMMPS (molecular simulation package) source code.
So my results for N=10000 are listed here:
For single thread:
OMP_NUM_THREADS=1 ./a.out
Allocating memory for array on heap...
======= Array on heap Results =======
Finished within time (total): 0.720385 seconds
Finished within time (real): 0.720463 seconds
Deallocating memory for array on heap...
Allocating memory for array continuous...
======= Array continuous Results =======
Finished within time (total): 0.819733 seconds
Finished within time (real): 0.819774 seconds
Deallocating memory for array continuous...
Allocating memory for vector on heap...
======= Vector on heap Results =======
Finished within time (total): 3.08715 seconds
Finished within time (real): 3.08725 seconds
Deallocating memory for vector on heap...
Allocating memory for vector on stack...
======= Vector on stack Results =======
Finished within time (total): 1.49888 seconds
Finished within time (real): 1.49895 seconds
For multiple threads (threads=4):
OMP_NUM_THREADS=4 ./a.out
Allocating memory for array on heap...
======= Array on heap Results =======
Finished within time (total): 2.29184 seconds
Finished within time (real): 0.577807 seconds
Deallocating memory for array on heap...
Allocating memory for array continuous...
======= Array continuous Results =======
Finished within time (total): 1.79501 seconds
Finished within time (real): 0.454139 seconds
Deallocating memory for array continuous...
Allocating memory for vector on heap...
======= Vector on heap Results =======
Finished within time (total): 6.80917 seconds
Finished within time (real): 1.92541 seconds
Deallocating memory for vector on heap...
Allocating memory for vector on stack...
======= Vector on stack Results =======
Finished within time (total): 1.64086 seconds
Finished within time (real): 0.411 seconds
The overall parallel efficiency is not good. Unexpectedly, the fancy continuous memory allocation is not helpful?! Why is this happening? It seems std::vector is good enough for this case?
Could someone explain the results for me? I also need suggestions on how to improve the code.
Thanks very much!!!
All the source code is attached below.
(Please go directly to main; there are several functions for manually managing memory at the beginning.)
#include <iostream>
#include <omp.h>
#include <vector>
#include <stdlib.h>
#include <cstdio>  // printf
#include <ctime>   // clock, CLOCKS_PER_SEC
#include <cinttypes>
#include "vector3d.h"

typedef int64_t bigint;

void *smalloc(bigint nbytes, const char *name)
{
    if (nbytes == 0) return NULL;
    void *ptr = malloc(nbytes);
    return ptr;
}

template <typename TYPE>
TYPE ***create(TYPE ***&array, int n1, int n2, int n3, const char *name)
{
    bigint nbytes = ((bigint) sizeof(TYPE)) * n1*n2*n3;
    TYPE *data = (TYPE *) smalloc(nbytes,name);
    nbytes = ((bigint) sizeof(TYPE *)) * n1*n2;
    TYPE **plane = (TYPE **) smalloc(nbytes,name);
    nbytes = ((bigint) sizeof(TYPE **)) * n1;
    array = (TYPE ***) smalloc(nbytes,name);

    int i,j;
    bigint m;
    bigint n = 0;
    for (i = 0; i < n1; i++) {
        m = ((bigint) i) * n2;
        array[i] = &plane[m];
        for (j = 0; j < n2; j++) {
            plane[m+j] = &data[n];
            n += n3;
        }
    }
    return array;
}

template <typename TYPE>
TYPE ***create3d_offset(TYPE ***&array, int n1lo, int n1hi,
                        int n2, int n3, const char *name)
{
    int n1 = n1hi - n1lo + 1;
    create(array,n1,n2,n3,name);
    array -= n1lo;
    return array;
}

void sfree(void *ptr) {
    if (ptr == NULL) return;
    free(ptr);
}

template <typename TYPE>
void destroy(TYPE ***&array)
{
    if (array == NULL) return;
    sfree(array[0][0]);
    sfree(array[0]);
    sfree(array);
    array = NULL;
}

template <typename TYPE>
void destroy3d_offset(TYPE ***&array, int offset)
{
    if (array == NULL) return;
    sfree(&array[offset][0][0]);
    sfree(&array[offset][0]);
    sfree(&array[offset]);
    array = NULL;
}

////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////

int main() {
    using namespace std;

    const int N = 10000;

    ///////////////////////////////////////
    double sum = 0.0;
    clock_t t;
    double startTime, stopTime, secsElapsed;

    printf("\n\nAllocating memory for array on heap...\n");
    double ***arrayHeap;
    arrayHeap = new double** [N];
    for(int i = 0; i < N; ++i) {
        arrayHeap[i] = new double* [N];
        for(int j = 0; j < N; ++j) {
            arrayHeap[i][j] = new double[3];
        }
    }

    printf("======= Array on heap Results =======\n");
    sum = 0.0;
    t = clock();
    startTime = omp_get_wtime();
    #pragma omp parallel
    {
        //#pragma omp for schedule(dynamic)
        //#pragma omp for collapse(2)
        #pragma omp for
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                arrayHeap[i][j][0] = (1.0*i*i);
                arrayHeap[i][j][1] = (1.0*j*j);
                arrayHeap[i][j][2] = (1.0*i*j);
            }
        }
    }
    t = clock() - t;
    cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
    stopTime = omp_get_wtime();
    secsElapsed = stopTime - startTime;
    cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;

    printf("Deallocating memory for array on heap...\n");
    for(int i = 0; i < N; ++i) {
        for(int j = 0; j < N; ++j) {
            delete [] arrayHeap[i][j];
        }
        delete [] arrayHeap[i];
    }
    delete [] arrayHeap;

    ///////////////////////////////////////
    printf("\n\nAllocating memory for array continuous...\n");
    double ***array_continuous;
    create3d_offset(array_continuous, 0, N, N, 3, "array");

    printf("======= Array continuous Results =======\n");
    sum = 0.0;
    t = clock();
    startTime = omp_get_wtime();
    #pragma omp parallel
    {
        //#pragma omp for schedule(dynamic)
        //#pragma omp for collapse(2)
        #pragma omp for
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                array_continuous[i][j][0] = (1.0*i*i);
                array_continuous[i][j][1] = (1.0*j*j);
                array_continuous[i][j][2] = (1.0*i*j);
            }
        }
    }
    t = clock() - t;
    cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
    stopTime = omp_get_wtime();
    secsElapsed = stopTime - startTime;
    cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;

    printf("Deallocating memory for array continuous...\n");
    destroy3d_offset(array_continuous, 0);

    ///////////////////////////////////////
    printf("\n\nAllocating memory for vector on heap...\n");
    VECTOR3D ***vectorHeap;
    vectorHeap = new VECTOR3D**[N];
    for(int i = 0; i < N; ++i) {
        vectorHeap[i] = new VECTOR3D* [N];
        for(int j = 0; j < N; ++j) {
            vectorHeap[i][j] = new VECTOR3D(0,0,0);
        }
    }

    printf("======= Vector on heap Results =======\n");
    sum = 0.0;
    t = clock();
    startTime = omp_get_wtime();
    #pragma omp parallel
    {
        //#pragma omp for schedule(dynamic)
        //#pragma omp for collapse(2)
        #pragma omp for
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                vectorHeap[i][j] = new VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
            }
        }
    }
    t = clock() - t;
    cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
    stopTime = omp_get_wtime();
    secsElapsed = stopTime - startTime;
    cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;

    printf("Deallocating memory for vector on heap...\n");
    for(int i = 0; i < N; ++i) {
        for(int j = 0; j < N; ++j) {
            delete vectorHeap[i][j]; // allocated with new, not new[], so plain delete
        }
        delete [] vectorHeap[i];
    }
    delete [] vectorHeap;

    /////////////////////////////////////////////////
    printf("\n\nAllocating memory for vector on stack...\n");
    vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));

    printf("======= Vector on stack Results =======\n");
    sum = 0.0;
    t = clock();
    startTime = omp_get_wtime();
    #pragma omp parallel
    {
        //#pragma omp for schedule(dynamic)
        //#pragma omp for collapse(2)
        #pragma omp for
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
            }
        }
    }
    t = clock() - t;
    cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
    stopTime = omp_get_wtime();
    secsElapsed = stopTime - startTime;
    cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;

    /////////////////////////////////
    return 0;
}
And the VECTOR3D class:
#ifndef _VECTOR3D_H
#define _VECTOR3D_H

#include <iostream>
#include <cmath>
#include <iomanip>
#include <limits>

class VECTOR3D {
public:
    double x, y, z; // component along each axis (cartesian)

    VECTOR3D(double xx = 0.0, double yy = 0.0, double zz = 0.0) : x(xx), y(yy), z(zz) // make a 3d vector
    {
    }
};

#endif
General Misconception
Your trivial loop is not compute bound, but entirely memory bound: you access each element only once. No re-use means that you cannot use caches efficiently, so you cannot expect a speedup equal to the number of threads/cores used. The actual speedup depends on the specific system (its memory bandwidth).
Indirection
All of your data structures, including the fancy continuous-memory one, perform many indirections on each data access. This is not strictly necessary. To gain the full advantage of continuous memory, you should simply lay out your 2D array flat:
#include <cstddef>
#include <vector>

template<class T>
class Array2d
{
public:
    Array2d(size_t rows, size_t columns) : rows_(rows), columns_(columns), data_(rows_ * columns_) {}

    T& operator()(size_t r, size_t c)
    {
        return data_[r * columns_ + c];
    }

    const T& operator()(size_t r, size_t c) const
    {
        return data_[r * columns_ + c];
    }

private:
    size_t rows_;
    size_t columns_;
    std::vector<T> data_;
};
Note: You could also make a fancy operator[] that returns a proxy object providing another operator[] if you really must retain the [i][j] indexing.
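For instance, here is a minimal sketch of that proxy idea (the class name Array2dIndexed, the nested Row proxy, and the main below are illustrative, not part of the answer above):
#include <cstddef>
#include <vector>

template<class T>
class Array2dIndexed
{
public:
    // Proxy returned by operator[]; its own operator[] finishes the flat index.
    class Row
    {
    public:
        explicit Row(T* row) : row_(row) {}
        T& operator[](std::size_t c) { return row_[c]; }
    private:
        T* row_;
    };

    Array2dIndexed(std::size_t rows, std::size_t columns)
        : rows_(rows), columns_(columns), data_(rows_ * columns_) {}

    Row operator[](std::size_t r) { return Row(&data_[r * columns_]); }

private:
    std::size_t rows_;
    std::size_t columns_;
    std::vector<T> data_;
};

int main()
{
    Array2dIndexed<double> a(4, 4);
    a[2][3] = 1.5; // same flat access as the operator() version
    return a[2][3] == 1.5 ? 0 : 1;
}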
Clarification
If you are limited by memory bandwidth and N is large enough, there will be no noticeable performance difference between indirection and a flat layout. For N = 10000, the loop writes 10000² elements of 24 bytes each, about 2.4 GB, so at a typical bandwidth of 10-20 GB/s the store traffic alone accounts for a large fraction of the measured times.
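For completeness, a sketch of how the question's filling loop could look on top of the flat Array2d above (assuming the VECTOR3D class from the question; the OpenMP structure mirrors the original loops):
#include <omp.h>

// Fill an N x N Array2d of VECTOR3D in parallel: one flat allocation,
// no pointer chasing inside the loop.
void fill(Array2d<VECTOR3D>& m, int N)
{
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            m(i, j) = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
        }
    }
}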