I would like to ask for a complete example of CUDA code, one that includes everything someone might want to include, so that it can be referenced by people trying to write such code, such as myself.
My main concern is whether or not it is possible to process multiple for loops at the same time on different threads within the same block. For a clear example, this is the difference between running a total of 2016 threads, divided into blocks of 32, on case 3 of the example code below, and running 1024 threads on each for loop; in theory, with the code we have, we could run even fewer, shaving off another two blocks by running the for loops of the other cases under the same block. Otherwise, separate cases would primarily be used for processing separate tasks such as a single for loop. Currently it appears that the CUDA code simply knows when to run in parallel.
// note: this is rarely mentioned, but you can seemingly process if statements in parallel by block; I'd say that is the primary purpose of using more blocks instead of increasing the thread count per block in the kernel call, other than the need for multiple SMs (Streaming Multiprocessors), which are capped at 2048 threads (which I assume is also the cap for a block) //
If we have the following code, with its for loops and if statements, what would the CUDA code that optimizes parallelization look like?
public void main(string[] args) {
doMath(3); // we want to process each statement in parallel. For this we use different blocks.
}
void doMath(int question) {
int[] x = new int{0,1,2,3,4,5,6,7,8,9};
int[] y = new int{0,1,2,3,4,5,6,7,8,10};
int[] z = new int{0,1,2,3,4,5,6,7,8,11};
int[] w = new int{0,1,2,3,4,5,6,7,8,12};
int[] q = new int[1000];
int[] r = new int[1000];
int[] v = new int[1000];
int[] t = new int[1000];
switch(question) {
case 1:
for (int a = 0; a < x.length; a++) {
for (int b = 0; b < y.length; b++) {
for (int c = 0; c < z.length; c++) {
q[(a*100)+(b*10)+(c)] = x[a] + y[b] + z[c];
}
}
}
break;
case 2:
for (int a = 0; a < x.length; a++) {
for (int b = 0; b < y.length; b++) {
for (int c = 0; c < w.length; c++) {
r[(a*100)+(b*10)+(c)] = x[a] + y[b] + w[c];
}
}
}
break;
case 3:
for (int a = 0; a < x.length; a++) {
for (int b = 0; b < z.length; b++) {
for (int c = 0; c < w.length; c++) {
v[(a*100)+(b*10)+(c)] = x[a] + z[b] + w[c];
}
}
}
for (int a = 0; a < x.length; a++) {
for (int b = 0; b < y.length; b++) {
for (int c = 0; c < w.length; c++) {
t[(a*100)+(b*10)+(c)] = x[a] + y[b] + w[c];
}
}
}
break;
}
}
From the samples I have seen, the CUDA code would be as follows:
// 3 blocks for 3 switch cases; the third case requires 2000 threads to be done in perfect parallel, while the first two only require 1000. Blocks operate in multiples of 32 threads. The trick is to take the greatest common denominator of all cases (or if/else statements, as the... case... may be) and assign the required number of blocks to each case. (In this example we would need 127 blocks of 32 threads: 1024 * 2 + 2048 - 32.) //
// side note: each Streaming Multiprocessor (SM) can only support 2048 threads, so roughly 2048 / (# of threads per block) blocks can be resident on one SM at a time //
int main(int argc, char *argv[]) {
int *x, *y, *z, *w, *q, *r, *t;
cudaMallocManaged(&x, 10*sizeof(int));
cudaMallocManaged(&y, 10*sizeof(int));
cudaMallocManaged(&z, 10*sizeof(int));
cudaMallocManaged(&w, 10*sizeof(int));
cudaMallocManaged(&q, 1000*sizeof(int));
cudaMallocManaged(&r, 1000*sizeof(int));
cudaMallocManaged(&t, 1000*sizeof(int));
// x, y, z and w would then be filled with the same ten values as in the CPU
// version above; q, r and t each hold 1000 results.
doMath<<<127,32>>>(3, x, y, z, w, q, r, t);
cudaDeviceSynchronize();
cudaFree(x);
cudaFree(y);
cudaFree(z);
cudaFree(w);
cudaFree(q);
cudaFree(r);
cudaFree(t);
}
__global__
void doMath(int question, int *x, int *y, int *z, int *w, int *q, int *r, int *t) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
switch(question) {
case 1:
for (int a = index; a < 10; a += stride) {
for (int b = index; b < 10; b += stride) {
for (int c = index; c < 10; c += stride) {
q[(a*100)+(b*10)+(c)] = x[a] + y[b] + z[c];
}
}
}
break;
case 2:
for (int a = index; a < 10; a += stride) {
for (int b = index; b < 10; b += stride) {
for (int c = index; c < 10; c += stride) {
r[(a*100)+(b*10)+(c)] = x[a] + y[b] + w[c];
}
}
}
break;
case 3:
for (int a = index; a < 10; a += stride) {
for (int b = index; b < 10; b += stride) {
for (int c = index; c < 10; c += stride) {
q[(a*100)+(b*10)+(c)] = x[a] + y[b] + w[c];
}
}
}
for (int a = index; a < 10; a += stride) {
for (int b = index; b < 10; b += stride) {
for (int c = index; c < 10; c += stride) {
t[(a*100)+(b*10)+(c)] = x[a] + y[b] + w[c];
}
}
}
break;
}
}
In CUDA every thread runs your kernel. If you want the threads to do different things you have to branch depending (in some way) on threadIdx and/or blockIdx.
You did this by calculating index. Every thread in your kernel has a different index. Now you have to map your indices onto the work the kernel should do, i.e. you have to map every index to one or multiple triplets of (a, b, c).
Your current mapping is something like:
index -> (index+i*stride,index+j*stride,index+k*stride)
I do not believe this was your intent.
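To make that concrete, here is one possible sketch of such a mapping for the 10 x 10 x 10 shape used in the question: each thread walks the 1000 output cells of case 1 with a grid-stride loop and decodes every flat index back into a unique (a, b, c) triplet, so no two threads write the same element. This is only an illustration of the mapping idea under those assumptions, not a drop-in replacement for the kernel above.
__global__
void case1(const int *x, const int *y, const int *z, int *q) {
    int stride = blockDim.x * gridDim.x;
    // grid-stride loop over the 1000 results; idx plays the role of (a*100 + b*10 + c)
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < 1000; idx += stride) {
        int a = idx / 100;        // 0..9
        int b = (idx / 10) % 10;  // 0..9
        int c = idx % 10;         // 0..9
        q[idx] = x[a] + y[b] + z[c];  // each thread writes a distinct q[idx]
    }
}
A launch such as case1<<<4, 256>>>(x, y, z, q); would then cover all 1000 cells, and one such kernel (or one branch selected by a kernel argument) per case keeps every thread doing useful work.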
Related
I am trying to port this to C. Since there are no vectors in C, I used a normal array, but I don't know how I'm going to deal with the range-based loop on line 18.
for (int u : d[i]) if (dfs(rev[u])) {
par[i] = u;
rev[u] = i;
return true;
}
Complete code:
#include <iostream>
#include <vector>
#include <string>
#include <sstream>
#include <cstring> // for memset
using namespace std;
const int Maxn = 200;
vector<int> d[Maxn];
int par[Maxn];
int rev[Maxn];
bool vs[Maxn];
bool dfs(int i) {
if (i < 0) return true;
if (vs[i]) return false;
vs[i] = true;
for (int u : d[i]) if (dfs(rev[u])) {
par[i] = u;
rev[u] = i;
return true;
}
return false;
}
int main() {
ios_base::sync_with_stdio(false);
int n;
cin >> n;
string s;
getline(cin, s);
for (int i = 0; i < n; i++) {
getline(cin, s);
stringstream ss(s);
vector<int> mk(n, 1);
mk[i] = 0;
int x;
while (ss >> x)
mk[x] = 0;
for (int x = 0; x < n; x++)
if (mk[x])
d[i].push_back(x);
}
memset(par, -1, sizeof par);
memset(rev, -1, sizeof rev);
for (bool ok = true; ok; ) {
ok = false;
memset(vs, 0, sizeof vs);
for (int i = 0; i < n; i++)
if (par[i] < 0) {
ok |= dfs(i);
}
}
int ans = 0;
for (int i = 0; i < n; i++)
ans += (par[i] < 0);
cout << ans;
}
In C there is no std::vector; the closest would be an array.
int array[] = { 1, 3, 5, 7, 9 };
for(int i = 0; i < sizeof array / sizeof *array; ++i)
printf("array[%d] = %d\n", i, array[i]);
If you get a pointer to an array of int, then you have to pass the length of
the array as well, because sizeof arr / sizeof *arr only works with actual arrays.
void foo(int *array, size_t len)
{
    for(size_t i = 0; i < len; ++i)
        printf("array[%zu] = %d\n", i, array[i]);
}
void bar(void)
{
int array[] = { 1, 3, 5, 7, 9 };
foo(array, sizeof array / sizeof *array);
}
edit 2
I noticed that you've posted your code and that d is declared as vector<int> d[Maxn];. Also taking into consideration your recent comment:
So this is an array of vectors. Do you have any idea how I can work with arrays, taking that into consideration, in C?
There are a couple of ways to convert the array of vectors to C, but this depends
on your needs. If, for example, you know that all vectors are going to have the
same size (for example int vectsize = 100), then you can create a two-dimensional
array with those sizes [1]:
int Maxn = 200;
int vectsize = 100;
int d[Maxn][vectsize];
memset(d, 0, sizeof d); // initialize all elements with 0
// filling the data
for(int i = 0; i < Maxn; ++i)
{
for(int j = 0; j < vectsize; ++j)
d[i][j] = get_value_for(i, j);
}
Then the range-based loop is very easy:
// assuming that the variables i, par, rev are valid, i between 0 and Maxn-1
for(int j = 0; j < vectsize; ++j)
{
int u = d[i][j];
if (dfs(rev[u])) {
par[i] = u;
rev[u] = i;
return true;
}
}
It gets a little more complicated if you only know one dimension, for example
every vector in the array could have a different size.
d[0].size() --> 10
d[1].size() --> 1
d[2].size() --> 3
...
Then you can create an array of pointers to int, but you would have to keep
another array of ints with the length of every d[i] vector.
int Maxn = 200;
int *d[Maxn]; // array of Maxn pointers to int
int vectsize[Maxn];
// initializing with 0
memset(d, 0, sizeof d);
memset(vectsize, 0, sizeof vectsize);
// filling the data
for(int i = 0; i < Maxn; ++i)
{
vectsize[i] = get_length_for(i);
d[i] = malloc(vectsize[i] * sizeof *d[i]);
if(d[i] == NULL)
// error handling
for(int j = 0; j < vectsize[i]; ++j)
d[i][j] = get_value_for(i, j);
}
Note that I'm using get_length_for() and get_value_for() here (and in the last example) as placeholders [2].
Now your range-based loop would look like this:
// assuming that the variables i, par, rev are valid, i between 0 and Maxn-1
for(int j = 0; j < vectsize[i]; ++j)
{
int u = d[i][j];
if (dfs(rev[u])) {
par[i] = u;
rev[u] = i;
return true;
}
}
At some point however you would have to free the memory:
for(int i = 0; i < Maxn; ++i)
free(d[i]);
The third option would be using a double pointer and malloc/realloc
to allocate the memory. This is the most general solution, but you have to
take care of memory management yourself, and that can sometimes be difficult, especially when you
haven't programmed in C much. But when both dimensions are unknown, this is the way to go:
int Maxn = get_some_maxn_value();
int **d, *vectsize;
d = malloc(Maxn * sizeof *d);
if(d == NULL)
// error handling
vectsize = malloc(Maxn * sizeof *vectsize);
if(vectsize == NULL)
// error handling,
// if you exit the function, don't forget
// to do free(d) first as part of the
// error handling
// initialize all elements with 0
memset(d, 0, Maxn * sizeof *d);
memset(vectsize, 0, Maxn * sizeof *vectsize);
// filling the data (the same as above)
for(int i = 0; i < Maxn; ++i)
{
vectsize[i] = get_length_for(i);
d[i] = malloc(vectsize[i] * sizeof *d[i]);
if(d[i] == NULL)
// error handling
for(int j = 0; j < vectsize[i]; ++j)
d[i][j] = get_value_for(i, j);
}
In this case the range-based loop would look exactly the same as for the array of pointers.
Freeing the memory is a little bit different though:
for(int i = 0; i < Maxn; ++i)
free(d[i]);
free(d);
free(vectsize);
Like I said earlier, which one of these three methods to use depends on how
the original C++ code fills the values, how long the vectors are, etc. Judging
from the C++ code you posted, you read an integer from the user and store it
in n. Then you read more values from the user and push them into the vector
d[i] for every i between 0 and n-1. It seems that all vectors have at
most length n, but because of
if (mk[x])
d[i].push_back(x);
they could also have fewer than n elements. That's why I think the third
solution is preferable here.
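As a concrete illustration of that third option applied to the posted code, here is a rough sketch of how d[i] could be filled, assuming the mk array for row i has already been computed exactly as in the C++ version and that each row needs at most n entries (the malloc cast is only there so the fragment also compiles as C++; d and vectsize are the arrays allocated in the third option above):
d[i] = (int *)malloc(n * sizeof *d[i]);   /* room for at most n entries */
if (d[i] == NULL)
    ; /* error handling */
vectsize[i] = 0;
for (int x = 0; x < n; x++)
    if (mk[x])
        d[i][vectsize[i]++] = x;          /* replaces d[i].push_back(x) */
The dfs loop then iterates j from 0 to vectsize[i] - 1, exactly as in the examples above.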
Annotations
[1] Prior to C99, variable length arrays (VLAs) were not supported, so if you had the
dimension in a variable, you had to use malloc to allocate enough memory.
C99 supports VLAs, but I'm not quite sure how well supported they are and/or
whether your compiler supports them.
I personally don't use them in my code at all, so I really don't know. I compiled these examples with GNU
GCC 6.4.0 (on Linux) and they worked fine.
The first two options use VLAs; if your compiler doesn't support them, then
you have to use the third option.
For more information about VLAs:
malloced array VS. variable-length-array
What's the difference between a VLA and dynamic memory allocation via malloc?
Variable length array
GCC manual: 6.19 Arrays of Variable Length (in case you use GCC)
[2] How you really get these values depends on the original C++ code.
So far I've only looked very briefly over your C++ code. Using the values from
my example get_length_for(0) would return 10, get_length_for(1) would return 1,
get_length_for(2) would return 3, etc.
Assuming d[i] is a vector, this is a similar loop:
for (size_t s = 0; s < d[i].size(); s++)
{
int u = d[i][s];
if (dfs(rev[u]))
{
par[i] = u;
rev[u] = i;
return true;
}
}
My code has a 4D matrix in it for some math problem solving:
int**** Sads = new int***[inputImage->HeightLines];
for (size_t i = 0; i < inputImage->HeightLines; i++)
{
Sads[i] = new int**[inputImage->WidthColumns];
for (size_t j = 0; j < inputImage->WidthColumns; j++)
{
Sads[i][j] = new int*[W_SIZE];
for (size_t k = 0; k < W_SIZE; k++)
{
Sads[i][j][k] = new int[W_SIZE];
}
}
}
//do something with Sads...
for (int i = 0; i < inputImage->HeightLines; i++)
{
int*** tempI = Sads[i];
for (int j = 0; j < inputImage->WidthColumns; j++)
{
int** tempJ = tempI[j];
for (int k = 0; k < W_SIZE; k++)
{
delete[] tempJ[k];
}
delete[] Sads[i][j];
}
delete[] Sads[i];
}
delete[] Sads;
The sizes are very large (WidthColumns = 2018, HeightLines = 1332, W_SIZE = 7); the memory allocation is very fast, but the memory deallocation (delete) is very slow.
Is there a way to optimize it?
I tried OpenMP, but it throws unrelated errors about missing DLLs which are actually there... if I remove the #pragma omp parallel for, everything works fine, but slowly...
Using a pointer to a pointer to... is a bad idea, because it will fragment your data a lot.
I would create a class to manage the index transformation and use a 1D array; it's a bit more complicated, but it will be faster.
Anyway, a trick: nothing prevents you from building your int**** with pointers into a zone of memory that isn't sparse (a 1D array you preallocated) and then using it as a 4D array.
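A rough sketch of that trick for the dimensions in the question, where H, W and S stand in for inputImage->HeightLines, inputImage->WidthColumns and W_SIZE (this only illustrates the idea, it is not drop-in code): all the int data lives in one contiguous block, the pointer layers merely index into it, and the whole structure is released with four delete[] calls instead of millions.
#include <cstddef>
const std::size_t H = 1332, W = 2018, S = 7;   // HeightLines, WidthColumns, W_SIZE

int *data     = new int[H * W * S * S];        // one contiguous data block
int **level3  = new int*[H * W * S];           // pointers into data
int ***level2 = new int**[H * W];              // pointers into level3
int ****Sads  = new int***[H];                 // pointers into level2, used as Sads[i][j][k][l]

for (std::size_t i = 0; i < H; ++i) {
    Sads[i] = level2 + i * W;
    for (std::size_t j = 0; j < W; ++j) {
        Sads[i][j] = level3 + (i * W + j) * S;
        for (std::size_t k = 0; k < S; ++k)
            Sads[i][j][k] = data + ((i * W + j) * S + k) * S;
    }
}

// ... use Sads[i][j][k][l] exactly as before ...

// deallocation is now four deletes instead of one per innermost array
delete[] Sads;
delete[] level2;
delete[] level3;
delete[] data;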
I'd probably be inclined to use a std::vector. Now memory allocation is taken care of for me (in one allocation/deallocation) and I get free copy/move semantics.
All I have to do is provide the offset calculations:
#include <vector>
#include <cstddef>
struct vector4
{
vector4(std::size_t lines, std::size_t columns)
: lines_(lines), columns_(columns)
, storage_(totalSize())
{}
auto totalSize() const -> std::size_t
{
return lines_ * columns_ * w_size * w_size;
}
int* at(std::size_t a)
{
return storage_.data() + (a * columns_ * w_size * w_size);
}
int* at(std::size_t a, std::size_t b)
{
return at(a) + (b * w_size * w_size);
}
int* at(std::size_t a, std::size_t b, std::size_t c)
{
return at(a, b) + (c * w_size);
}
int& at(std::size_t a, std::size_t b, std::size_t c, std::size_t d)
{
return *(at(a, b, c) + d);
}
private:
std::size_t lines_, columns_;
static constexpr std::size_t w_size = 32; // ?
std::vector<int> storage_;
};
int main()
{
auto v = vector4(20, 20);
v.at(3, 2, 5, 1) = 6;
// other things
// now let it go out of scope
}
The correct way to create, use, and delete a fixed-size 4D array is this, letting the close of the enclosing block release the automatic variable.
{
const int H = 10;
const int I = 10;
const int J = 10;
const int K = 10;
int h = 0;
int i = 0;
int j = 0;
int k = 0;
int fourDimArray [H][I][J][K];
fourDimArray[h][i][j][k] = 0;
}
If you need to allocate dynamically, then either use the STL's list or vector class, or use something like this, with perhaps an inline method to calculate the index into the 1D array from the 4D indices if you need blazing speed (a possible helper is sketched after the snippet).
int * fourDimArrayAsOneDim = new int[H*I*J*K];
fourDimArrayAsOneDim[indexFromIndices(h, i, j, k)] = 0;
delete [] fourDimArrayAsOneDim;
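indexFromIndices is not defined above; a minimal sketch of what it could look like, assuming a row-major layout over the same H, I, J, K extents used for the allocation:
// Hypothetical helper: flattens (h, i, j, k) into an offset in the 1D array,
// assuming row-major order and extents H, I, J, K.
inline int indexFromIndices(int h, int i, int j, int k)
{
    return ((h * I + i) * J + j) * K + k;
}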
I don't know how I can parallelize these loops, because I have a lot of dependent variables and I am very confused.
Can you help and guide me?
The first one is:
for (int a = 0; a < sigmaLen; ++a) {
int f = freq[a];
if (f >= sumFreqLB)
if (updateRemainingDistances(s, a, pos))
if (prunePassed(pos + 1)) {
lmer[pos] = a;
enumerateStrings(pos + 1, sumFreqLB - f);
}
}
The second one is:
void preprocessLowerBounds() {
int i = stackSz - 1;
int pairOffset = (i * (i - 1)) >> 1;
for (int k = L; k; --k) {
int *dsn = dist[k] + pairOffset;
int *ds = dist[k - 1] + pairOffset;
int *s = colS[k - 1];
char ci = s[i];
for (int j = 0; j < i; ++j) {
char cj = s[j];
*ds++ = (*dsn++) + (ci != cj);
}
}
}
And another one is:
void enumerateSubStrings(int rowNumber, int remainQTolerance) {
int nItems = rowSize[rowNumber][stackSz];
if (shouldGenerateNeighborhood(rowNumber, nItems)) {
bruteForceIt(rowNumber, nItems);
} else {
indexType *row = rowItem[rowNumber];
for (int j = 0; j < nItems; ++j) {
indexType ind = row[j];
addString(lmers + ind);
preprocessLowerBounds();
uint threshold = maxLB[stackSz] - addMaxFreq();
if (hasSolution(0, threshold)) {
if (getValid<hasPreprocessedPairs, useQ>(rowNumber + 1,
(stackSz <= 2 ? n : smallN), threshold + LminusD,
ind, remainQTolerance)) {
enumerateSubStrings<hasPreprocessedPairs, useQ>(
rowNumber + 1, remainQTolerance);
}
}
removeLastString();
}
}
}
void addString(const char *t) {
int *mf = colMf[stackSz + 1];
for (int j = 0; j < L; ++j) {
int c = t[j];
colS[j][stackSz] = c;
mf[j] = colMaxFreq[j] + (colMaxFreq[j] == colFreq[j][c]++);
}
colMaxFreq = mf;
++stackSz;
}
void preprocessLowerBounds() {
int i = stackSz - 1;
int pairOffset = (i * (i - 1)) >> 1;
for (int k = L; k; --k) {
int *dsn = dist[k] + pairOffset;
int *ds = dist[k - 1] + pairOffset;
int *s = colS[k - 1];
char ci = s[i];
for (int j = 0; j < i; ++j) {
char cj = s[j];
*ds++ = (*dsn++) + (ci != cj);
}
}
}
void removeLastString() {
--stackSz;
for (int j = 0; j < L; ++j)
--colFreq[j][colS[j][stackSz]];
colMaxFreq = colMf[stackSz];
}
OK. For OpenMP to parallelize a loop, you basically follow two rules: first, never write to the same memory location from different threads; and second, never depend on reading a memory area that may be modified by another thread. Now, in the first loop you only change the lmer variable, and the other operations read variables that I assume are not being changed at the same time by another part of your code, so the first loop would be as follows:
#pragma omp parallel for firstprivate(s, pos) // According to my intuition these variables are global or belong to a class; firstprivate gives each thread its own copy initialized from the original. sumFreqLB and freq are not listed because they are only read.
for (int a = 0; a < sigmaLen; ++a) {
int f = freq[a];
if (f >= sumFreqLB)
if (updateRemainingDistances(s, a, pos))
if (prunePassed(pos + 1)) {
#pragma omp critical // only one thread at a time may enter, otherwise you will get a data race at runtime
{
lmer[pos] = a;
}
enumerateStrings(pos + 1, sumFreqLB - f);
}
}
In the second loop I could not quite understand how you're using the for, but you should have no problem there, because you only read shared data and only modify thread-local variables.
You must make sure that the functions updateRemainingDistances, prunePassed and enumerateStrings do not use static or global variables internally.
In the following function you mostly use read operations, which can be done from multiple threads (as long as no other thread is modifying those variables), and you write to distinct memory positions, so you just need to change the shape of the for loop so that OpenMP can recognize it:
void preprocessLowerBounds() {
    int i = stackSz - 1;
    int pairOffset = (i * (i - 1)) >> 1;
    #pragma omp parallel for
    for (int var = 0; var < L; var++) {
        int newK = L - var; // covers the same k values as the original loop (L down to 1)
        int *dsn = dist[newK] + pairOffset;
        int *ds = dist[newK - 1] + pairOffset;
        int *s = colS[newK - 1];
        char ci = s[i];
        for (int j = 0; j < i; ++j) {
            char cj = s[j];
            *ds++ = (*dsn++) + (ci != cj);
        }
    }
}
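A note on why the loop shape matters at all: OpenMP can only parallelize loops in its canonical form, and the original condition in for (int k = L; k; --k) is not a relational test. Assuming the k iterations are independent, as argued above, an alternative sketch is to keep the descending loop and simply spell the condition out:
#pragma omp parallel for
for (int k = L; k >= 1; --k) {
    int *dsn = dist[k] + pairOffset;
    int *ds = dist[k - 1] + pairOffset;
    int *s = colS[k - 1];
    char ci = s[i];
    for (int j = 0; j < i; ++j) {
        char cj = s[j];
        *ds++ = (*dsn++) + (ci != cj);
    }
}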
In the last function you call many functions whose source code I do not know, so I cannot tell whether they are parallelizable. As an illustration, the examples below show what is safe and what is not safe to call from a parallel loop:
#include <vector>
std::vector<int> myVector;
void notParalelizable_1(int i){
myVector.push_back(i);
}
void notParalelizable_2(int i){
static int A=0;
A=A+i;
}
int varGlobal=0;
void notParalelizable_3(int i){
varGlobal=varGlobal+i;
}
void oneFunctionParalelizable(int i)
{
int B=i;
}
int main()
{
#pragma omp parallel for
for(int i=0;i<10;i++)
{
notParalelizable_1(i);//Error: myVector is modified simultaneously from multiple threads. The values will not be stored in ascending order, since the vector is being accessed by several threads at once; with more complex functions this can produce wrong results or even run-time errors.
}
#pragma omp parallel for
for(int i=0;i<10;i++)
{
notParalelizable_2(i);//Error because A is modified simultaneously from multiple threads
}
#pragma omp parallel for
for(int i=0;i<10;i++)
{
notParalelizable_3(i);//Error because varGlobal is modified simultaneously from multiple threads
}
#pragma omp parallel for
for(int i=0;i<10;i++)
{
oneFunctionParalelizable(i);//no problem
}
//The following code is correct
int *vector=new int[10];
#pragma omp parallel for
for(int i=0;i<10;i++)
{
vector[i]=i;//No problem, because each thread writes to a different memory position
}
//The following code is wrong
int k=2;
#pragma omp parallel for
for(int i=0;i<10;i++)
{
k=k+i; //The result of the k variable at the end will be wrong as it is modified from different threads
}
return 0;
}
Background: I am working on developing several different controllers (over 10 or so) for a piece of hardware, which involves running the code in hard real time under RTAI Linux. I have implemented a class for the hardware, with each controller as a separate member function of the class. I'm looking to pass the desired trajectory for the respective control variable to each of these control functions based on which controller is chosen. In addition, since there are several parameters for each controller and I want to be able to switch controllers quickly without having to navigate through the entire code changing parameters, I would like to define all the control variables in one place and set them based on which controller I choose to run. Here is a minimum working example of what I am looking for.
I am looking to define variables based on whether a condition is true or not, as follows, in C++:
int foo()
{
int i=0;
if(i==0)
{
int a=0;
float b=1;
double c=10;
}
else if (i == 1)
{
int e=0;
float f=1;
double g=10;
}
// Memory locked for hard real-time execution
// execute in hard real-time from here
while(some condition)
{
// 100's of lines of code
if(i==0)
{
a=a+1;
b=b*2;
c=c*4;
// 100's of lines of code
}
else if (i == 1)
{
e=e*e*e;
f=f*3;
g=g*10;
// 100's of lines of code
}
// 100's of lines of code
}
// stop execution in hard real-time
}
The above code does not compile, because the scope of the variables defined in the if blocks is limited to the respective if block. Could anyone suggest a better way of handling this issue? What is the best practice in this context in C++?
In your case, you may simply use:
int foo()
{
int i = 0;
if (i == 0) {
int a = 0;
float b = 1;
double c = 10;
for(int j = 1; j < 10; j++) {
a = a + 1;
b = b * 2;
c = c * 4;
}
} else if (i == 1) {
int e = 0;
float f = 1;
double g = 10;
for(int j = 1; j < 10; j++) {
e = e * e * e;
f = f * 3;
g = g * 10;
}
}
}
or even better, create sub-functions
void foo0()
{
int a = 0;
float b = 1;
double c = 10;
for(int j = 1; j < 10; j++) {
a = a + 1;
b = b * 2;
c = c * 4;
}
}
void foo1()
{
//.. stuff with e, f, g
}
int foo()
{
int i = 0;
if (i == 0) {
foo0();
} else if (i == 1) {
foo1();
}
}
I have a two-dimensional array that I've allocated dynamically using new.
The problem is that I want to allocate the memory as one contiguous block instead of in separate pieces, to increase processing speed.
Does anyone know if it's possible to do this with new, or do I have to use malloc?
Here's my code:
A = new double*[m];
for (int i=0;i<m;i++)
{
A[i]= new double[n];
}
This code causes a segmentation fault
phi = new double**[xlength];
phi[0] = new double*[xlength*ylength];
phi[0][0] = new double[xlength*ylength*tlength];
for (int i=0;i<xlength;i++)
{
for (int j=0;j<ylength;j++)
{
phi[i][j] = phi[0][0] + (ylength*i+j)*tlength;
}
phi[i] = phi[0] + ylength*i;
}
You can allocate one big block and use it appropriately, something like this:
double* A = new double[m*n];
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
A[i*n+j] = <my_value>;
}
}
Instead of using new, you can use malloc - there is not much difference, except that memory from new[] must be released with delete[], and memory from malloc() must be released with free().
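For completeness, a minimal sketch of the malloc version of the flat allocation above, with m and n as in the surrounding example (the cast is required in C++ but not in C):
#include <cstdlib>   // std::malloc, std::free

double *A = (double *)std::malloc(m * n * sizeof(double));
if (A == NULL) { /* handle allocation failure */ }
// use A[i*n + j] exactly as above
std::free(A);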
UPDATE1:
You can create "true" 2d array as follows:
double** A = new double*[m];
double* B = new double[m*n];
for (int i=0; i<m; i++) {
A[i] = B + n*i;
}
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
A[i][j] = <my_value>;
}
}
Just be sure to release both A and B in the end.
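In code, that cleanup is simply the following (B is the data block, A is the array of row pointers; the order does not matter here, since nothing is dereferenced during deletion):
delete[] A;   // the array of row pointers
delete[] B;   // the contiguous data block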
UPDATE2:
By popular request, this is how you can create "true" 3-dimensional array (with dimensions m x n x o):
double*** A = new double**[m];
double** B = new double*[m*n];
double* C = new double[m*n*o];
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
B[n*i+j] = C + (n*i+j)*o;
}
A[i] = B + n*i;
}
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
for (int k=0; k<o; k++) {
A[i][j][k] = <my_value>;
}
}
}
This uses 2 relatively small "index" arrays A and B, and data array C. As usual, all three should be released after use.
Extending this for more dimensions is left as an exercise for the reader.
There is nothing you can do with malloc that you can't do with new (though the converse doesn't hold). However if you've already allocated the memory in separate blocks, you will have to allocate new (contiguous) memory in order to get a connected block (with either malloc or new). The code you show allocates m non-contiguous n-sized blocks. To get an array with contiguous memory from this, you would need
int MN = m*n;
double *B = new double[MN];
for (int i=0; i<MN; ++i)
    B[i] = A[i/n][i%n];
OK, if the task is to keep a single block of memory but preserve the [][] way of addressing it, I'd try a few tricks with classes. The first one is an inner proxy:
class CoordProxy
{
private:
int coordX;
int arrayWidth;
int * dataArray;
public:
CoordProxy(int * newArray, int newArrayWidth, int newCoordX)
{
coordX = newCoordX;
arrayWidth = newArrayWidth;
dataArray = newArray;
}
int & operator [](int newCoordY)
{
return (dataArray[newCoordY * arrayWidth + coordX]);
}
};
class CoordsWrapper
{
private:
int * dataArray;
int width;
int height;
public:
CoordsWrapper(int * newArray, int newWidth, int newHeight)
{
dataArray = newArray;
width = newWidth;
height = newHeight;
}
CoordProxy operator[] (int coordX)
{
return CoordProxy(dataArray, width, coordX);
}
};
int main(int argc, char * argv[])
{
int * a = new int[4 * 4];
ZeroMemory(a, 4 * 4 * sizeof(int));
CoordsWrapper w(a, 4, 4);
w[0][0] = 10;
w[0][1] = 20;
w[3][3] = 30;
std::for_each(&a[0], &a[4 * 4], [](int x) { printf("%d ", x); });
delete[] a;
}
Note that this is not time-efficient, but it is extremely memory-efficient: it uses only 4 ints and 2 pointers more than the original class.
There's an even nicer and much faster solution, but you would have to give up the [][] notation in favor of (x, y) notation:
class CoordsWrapper2
{
private:
int * data;
int width;
int height;
public:
CoordsWrapper2(int * newData, int newWidth, int newHeight)
{
data = newData;
width = newWidth;
height = newHeight;
}
inline int & Data(int x, int y)
{
return data[y * width + x];
}
};
int main(int argc, char * argv[])
{
int * a = new int[4 * 4];
ZeroMemory(a, 4 * 4 * sizeof(int));
CoordsWrapper2 w(a, 4, 4);
w.Data(0, 0) = 10;
w.Data(0, 1) = 20;
w.Data(3, 3) = 30;
std::for_each(&a[0], &a[4 * 4], [](int x) { printf("%d ", x); });
delete[] a;
}
Note the inline keyword. It suggests that the compiler replace the method call with the actual code, which makes it a little faster. This solution is even more memory-efficient, and either a tiny bit less or equally time-efficient compared with classic indexing.