My computer has 32 GB of RAM available. I want to define a 1500*1500*500 array. How should I define a dynamic array?
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime_api.h>
void main(void) {
#define NX 1501
#define NY 1501
#define NZ 501
int i, j, k, l, m, dt, nstop;
double comp;
dt = 5;
nstop = 5
static double ex[NX][NY][NZ] = { 0. }, ey[NX][NY][NZ] = { 0. }, ez[NX][NY][NZ] = { 0. };
static double hx[NX][NY][NZ] = { 1. }, hy[NX][NY][NZ] = { 0. }, hz[NX][NY][NZ] = { 1. };
static double t, comp;
FILE *file;
file = fopen("point A hm=0.csv", "w"); /* Output data file name */
t = 0.;
for (l = 0; l < nstop; l++) {
for (i = 0; i < NX - 1; i++) {
for (j = 1; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
ex[i][j][k] = 2 * ey[i][j][k]
+ 3 * (hz[i][j][k] - hx[i][j - 1][k])
- 5 * (hy[i][j][k] - 2 * hz[i][j][k - 1]);
}
}
}
comp = ((double)(l + 1) / nstop) * 100.;
printf("Computation: %4.3f %% completed \r", comp);
fprintf(file, "%e, %e \n", t * 1e6, -ex[1200][950][20] + ex[1170][950][20]) / 2.);
t = t + dt;
}
fclose(file);
}
There must be an error in your problem statement:
the formula to compute ex[i][j][k] only depends on values from the other arrays with the same i index for the first dimension. Since you only output the value of (-ex[1200][950][20] + ex[1170][950][20]) / 2., you only need to compute the values for i=1200 and i=1170 and there is no need to allocate so much memory.
furthermore, the computed values in ex are the same for all values of l. No need to recompute at each iteration.
finally, given the initialization of the arrays, all values of ex with a first index other than 0 are zero, so the output is trivial to compute: 0.0.
More seriously, if the initial values are small integers, the results seem to require only 32-bit integer arithmetic, which would reduce the memory requirements by 50%. Yet this would still exceed the maximum size for statically allocated objects on your system. You should allocate these 3D matrices dynamically this way:
double (*ex)[NY][NZ] = calloc(NX, sizeof(*ex));
Assuming your code is more complex than the sample posted, which incidentally contains a few typos that prevent compilation, here is what the modified code would look like:
#include <stdio.h>
#include <stdlib.h>
int main(void) {
#define NX 1501
#define NY 1501
#define NZ 501
int i, j, k, l, dt, nstop;
double comp;
dt = 5;
nstop = 5;
double (*ex)[NY][NZ] = calloc(NX, sizeof(*ex));
if (ex == NULL) { fprintf(stderr, "allocation failed for ex\n"); exit(1); }
double (*ey)[NY][NZ] = calloc(NX, sizeof(*ey));
if (ey == NULL) { fprintf(stderr, "allocation failed for ey\n"); exit(1); }
double (*ez)[NY][NZ] = calloc(NX, sizeof(*ez));
if (ez == NULL) { fprintf(stderr, "allocation failed for ez\n"); exit(1); }
double (*hx)[NY][NZ] = calloc(NX, sizeof(*hx));
if (hx == NULL) { fprintf(stderr, "allocation failed for hx\n"); exit(1); }
double (*hy)[NY][NZ] = calloc(NX, sizeof(*hy));
if (hy == NULL) { fprintf(stderr, "allocation failed for hy\n"); exit(1); }
double (*hz)[NY][NZ] = calloc(NX, sizeof(*hz));
if (hz == NULL) { fprintf(stderr, "allocation failed for hz\n"); exit(1); }
hx[0][0][0] = 1.;
hz[0][0][0] = 1.;
// probably many more initializations missing
double t;
FILE *file;
file = fopen("point A hm=0.csv", "w"); /* Output data file name */
if (file == NULL) { fprintf(stderr, "cannot create output file\n"); exit(1); }
t = 0.;
for (l = 0; l < nstop; l++) {
for (i = 0; i < NX - 1; i++) {
for (j = 1; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
ex[i][j][k] = 2 * ey[i][j][k]
+ 3 * (hz[i][j][k] - hx[i][j - 1][k])
- 5 * (hy[i][j][k] - 2 * hz[i][j][k - 1]);
}
}
}
comp = ((double)(l + 1) / nstop) * 100.;
printf("Computation: %4.3f %% completed \r", comp);
fprintf(file, "%e, %e \n", t * 1e6, (-ex[1200][950][20] + ex[1170][950][20]) / 2.);
t = t + dt;
}
fclose(file);
free(ex);
free(ey);
free(ez);
free(hx);
free(hy);
free(hz);
return 0;
}
There are several options. If you need to allocate the entire memory structure at once, you probably want to allocate for a pointer-to-pointer-to-array int[500] (int (**)[500]) rather than allocating for a pointer-to-pointer-to-pointer int (int ***), though both are technically correct.
(note: I used int in the example, so just change the type of a to double to satisfy your needs)
To approach the allocation for a pointer-to-pointer-to-array int[500], start with your pointer and allocate 1500 pointers, e.g.
#define Z 500
#define X 1500
#define Y X
int main (void) {
int (**a)[Z] = NULL; /* pointer to pointer to array of int[500] */
if (!(a = malloc (X * sizeof *a))) { /* allocate X pointers to (*)[Z] */
perror ("malloc-X (**)[Z]");
return 1;
}
At this point you have 1500 pointers-to-array-of int[500]. You can loop over each allocated pointer above, allocating 1500 * sizeof (int[500]) and assigning the starting address of each allocated block to one of the pointers, e.g.
for (int i = 0; i < X; i++) /* for each pointer */
if (!(a[i] = malloc (Y * sizeof **a))) { /* alloc Y * sizeof int[Z] */
perror ("malloc-YZ (*)[Z]");
return 1;
}
Now you can address each integer in your allocation as a[x][y][z]. Then to free the allocated memory, you just free() in the reverse order you allocated, e.g.
for (int i = 0; i < X; i++)
free (a[i]); /* free allocated blocks */
free (a); /* free pointers */
A short example that exercises this and writes a value to each index could be:
#include <stdio.h>
#include <stdlib.h>
#define Z 500
#define X 1500
#define Y X
int main (void) {
int (**a)[Z] = NULL; /* pointer to pointer to array of int[500] */
if (!(a = malloc (X * sizeof *a))) { /* allocate X pointers to (*)[Z] */
perror ("malloc-X (**)[Z]");
return 1;
}
puts ("pointers allocated");
for (int i = 0; i < X; i++) /* for each pointer */
if (!(a[i] = malloc (Y * sizeof **a))) { /* alloc Y * sizeof int[Z] */
perror ("malloc-YZ (*)[Z]");
return 1;
}
puts ("all allocated");
for (int i = 0; i < X; i++) /* set mem to prevent optimize out */
for (int j = 0; j < Y; j++)
for (int k = 0; k < Z; k++)
a[i][j][k] = i * j * k;
puts ("freeing memory");
for (int i = 0; i < X; i++)
free (a[i]); /* free allocated blocks */
free (a); /* free pointers */
}
Example Use/Output -- Timed Run
$ time ./bin/malloc_1500x1500x500
pointers allocated
all allocated
freeing memory
real 0m1.481s
user 0m0.649s
sys 0m0.832s
Memory Use/Error Check
That's 4.5 GB of memory allocated and used (warning: you will swap on 8 GB or less, depending on what else you have running, if you run valgrind)
$ valgrind ./bin/malloc_1500x1500x500
==7750== Memcheck, a memory error detector
==7750== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==7750== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==7750== Command: ./bin/malloc_1500x1500x500
==7750==
pointers allocated
all allocated
freeing memory
==7750==
==7750== HEAP SUMMARY:
==7750== in use at exit: 0 bytes in 0 blocks
==7750== total heap usage: 1,502 allocs, 1,502 frees, 4,500,013,024 bytes allocated
==7750==
==7750== All heap blocks were freed -- no leaks are possible
==7750==
==7750== For counts of detected and suppressed errors, rerun with: -v
==7750== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
Look things over and let me know if you have questions.
In C (as your code seems to be), you can for example use a triple pointer and malloc():
#define NX 1501
#define NY 1501
#define NZ 501
double*** p_a = malloc(sizeof(double**) * NX);
for (int i = 0; i < NX; i++)
{
    p_a[i] = malloc(sizeof(double*) * NY);
for (int j = 0; j < NY; j++)
p_a[i][j] = malloc(sizeof(double) * NZ);
}
A more efficient way would be to use a single pointer and use the size of each dimension in call to malloc() at once:
double* p_a = malloc(sizeof(*p_a) * (NX * NY * NZ));
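With a single flat allocation you have to compute the index yourself; here is a minimal sketch of the usual row-major mapping (the IDX3 helper name is just for illustration, not part of any library):
#include <stdlib.h>

#define NX 1501
#define NY 1501
#define NZ 501
/* illustrative helper: maps (i, j, k) to a flat row-major offset */
#define IDX3(i, j, k) (((size_t)(i) * NY + (j)) * NZ + (k))

int main(void) {
    double *p_a = malloc(sizeof(*p_a) * ((size_t)NX * NY * NZ));
    if (p_a == NULL)
        return 1;
    p_a[IDX3(1200, 950, 20)] = 1.0;   /* instead of p_a[1200][950][20] */
    free(p_a);
    return 0;
}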
In C++, the most common and efficient way is to use a std::vector for dynamically allocating an array:
#define NX 1501
#define NY 1501
#define NZ 501
std::vector<std::vector<std::vector<double>>> a(NX, std::vector<std::vector<double>>(NY, std::vector<double>(NZ)));
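A minimal sketch of how that would be used (illustrative only; note that running it really allocates several gigabytes):
#include <vector>

int main() {
    const int NX = 1501, NY = 1501, NZ = 501;
    std::vector<std::vector<std::vector<double>>> a(
        NX, std::vector<std::vector<double>>(NY, std::vector<double>(NZ)));
    a[1200][950][20] = 1.0;   // elements are accessed like a built-in 3D array
}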
Note that the size of a double is 8 bytes on most modern platforms. That means that to achieve what you want, you need at least 8 * 1500 * 1500 * 500 = 9,000,000,000 bytes, i.e. about 8.4 GiB, to allocate each 3D array. You define six of them, so roughly 50 GiB would be required just to allocate those arrays, which your system cannot provide since, as you said, it has 32 GB available.
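For reference, a hedged sketch of that size calculation (the byte counts exceed the range of a 32-bit int, so they are held in size_t):
#include <stdio.h>

int main(void) {
    size_t per_array = (size_t)1500 * 1500 * 500 * sizeof(double);   /* 9,000,000,000 bytes */
    size_t all_six   = 6 * per_array;
    printf("one array : %zu bytes (~%.1f GiB)\n", per_array, per_array / (1024.0 * 1024.0 * 1024.0));
    printf("six arrays: %zu bytes (~%.1f GiB)\n", all_six, all_six / (1024.0 * 1024.0 * 1024.0));
    return 0;
}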
Related
I followed the example from "Using cudaMemcpy3D to transfer *** pointer".
However, my task is to copy a 3D subsection of a device global memory array to another device global memory array, for example:
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
... // here I am populating sourceTensor with some Data
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget * sizeof(int), NyTarget, NzTarget));
// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
So in the above I am looking for a way to copy from sourceTensor to targetTensor all the data where
x indices are in range (1,6)
y indices are in range (2,7)
z indices are in range (3,8)
So only a subsection of the source array, but I do not know how to define make_cudaPitchedPtr and make_cudaExtent properly in order to achieve my goal.
The srcPos parameter in your cudaMemcpy3DParms should make this pretty easy. Here is an example:
$ cat t1957.cu
#include <cstdio>
typedef int it; // index type
typedef int dt; // data type
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {
for (it z = 0; z < Nz; z++)
for (it y = 0; y < Ny; y++)
for (it x = 0; x < Nx; x++) {
char *ptr = (char *)sourceTensor.ptr + sourceTensor.pitch*(z*Ny+y);
((dt *)ptr)[x] = z*100+y*10+x;
}
};
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {
if (((dt *)targetTensor.ptr)[0] != 321) {
printf("%d\n", ((dt *)targetTensor.ptr)[0]);
}
};
int main(){
it Nx =10;
it Ny=10;
it Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz));
populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
it NxTarget = 5;
it NyTarget = 5;
it NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(dt), NyTarget, NzTarget));
cudaMemcpy3DParms cpy = { 0 };
it NxOffset = 1;
it NyOffset = 2;
it NzOffset = 3;
cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
cpy.srcPtr = sourceTensor;
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
cudaDeviceSynchronize();
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that when neither the source nor the destination is specified as a cudaArray type, the element size is always assumed to be unsigned char (i.e. 1 byte).
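So the x components of the position and the extent are expressed in bytes, while y and z are in elements. A hedged sketch of the same copy if the data type were double instead of int (reusing the names from the example above):
// Sketch only: for non-cudaArray (pitched) copies, scale the x components by
// the element size; y and z stay in element counts.
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPos = make_cudaPos(NxOffset * sizeof(double), NyOffset, NzOffset);
cpy.srcPtr = sourceTensor;
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(double), NyTarget, NzTarget);
cpy.kind   = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);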
I want to run the following simple code on two GPUs simultaneously. Here I have an array A[i] = [0 1 2 3 4 5 6 7 8 9] and want to calculate C[i] = A[i+1] + A[i] + A[i-1]. This is the result I get: C[i] = [1 3 6 9 7 11 18 21 24 17], where the values at the boundary between the two devices' chunks (C[4] = 7 and C[5] = 11) are wrong. For two devices, C[4] from device 1 needs to access A[5] from device 2. How can I do it in the simplest way?
My expertise is not programming, and I am supposed to use multiple GPUs to solve a PDE. So, I would really appreciate any help modifying this code for my current problem.
Thank you.
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include<time.h>
__global__ void iKernel(float *A, float *C, const int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i-1] + A[i] + A[i+1];
}
int main(int argc, char **argv)
{
int ngpus;
printf("> starting %s", argv[0]);
cudaGetDeviceCount(&ngpus);
printf(" CUDA-capable devices: %i\n", ngpus);
ngpus = 2;
int size = 10;
int iSize = size / ngpus;
size_t iBytes = iSize * sizeof(float);
printf("> total array size %d M, using %d devices with each device "
"handling %d M\n", size / 1024 / 1024, ngpus, iSize / 1024 / 1024);
// allocate device memory
float **d_A = (float **)malloc(sizeof(float *) * ngpus);
float **d_C = (float **)malloc(sizeof(float *) * ngpus);
float **h_A = (float **)malloc(sizeof(float *) * ngpus);
float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * ngpus);
for (int i = 0; i < ngpus; i++){
// set current device
cudaSetDevice(i);
// allocate device memory
cudaMalloc((void **)&d_A[i], iBytes);
cudaMalloc((void **)&d_C[i], iBytes);
// allocate page locked host memory for asynchronous data transfer
cudaMallocHost((void **)&h_A[i], iBytes);
cudaMallocHost((void **)&gpuRef[i], iBytes);
// create streams for timing and synchronizing
cudaStreamCreate(&stream[i]);
}
dim3 block(512);
dim3 grid((iSize + block.x - 1) / block.x);
//h_A[ngpus][index]
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
for (int j = 0; j < iSize; j++){
h_A[i][j] = j + i*iSize;
printf("%d %d %d %0.8f \n", i,j,iSize, h_A[i][j]);
}
}
// record start time
double iStart = clock();
// distributing the workload across multiple devices
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
cudaMemcpyAsync(d_A[i], h_A[i], iBytes, cudaMemcpyHostToDevice, stream[i]);
iKernel << <grid, block, 0, stream[i] >> >(d_A[i], d_C[i], iSize);
cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost,
stream[i]);
}
// synchronize streams
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
cudaStreamSynchronize(stream[i]);
}
for (int i = 0; i < ngpus; i++){
for (int j = 0; j < iSize; j++){
printf("%d %d %0.8f \n", i,j,gpuRef[i][j]);
}
}
return EXIT_SUCCESS;
}
You have to upload the overlap regions to both devices. You can't (easily) read values from another device, so you have to duplicate and pad at least some of the input values as required. iSize is obviously not enough input size when accessing iSize + 2 different input values.
If this were a multi pass algorithm, you would need to explicitly perform a copy of relevant regions in between passes.
Try modeling data dependencies formally on paper when attempting to target multi GPU systems.
Both GPUs can access memory allocated with cudaMallocHost, but it's usually not advisable to use that memory type as performance over PCIe bus is pretty bad compared to device local memory. There is also driver managed memory, but that isn't suited for two GPUs sharing the same active working set either.
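A hedged sketch of what that duplication could look like for this 1D stencil, reusing the variable names from the question. It assumes a single contiguous host array h_A of `size` floats (the posted code uses one pinned buffer per device instead), uses synchronous copies for brevity, and simply clamps the halo at the two outer ends; your PDE boundary condition may require something else.
/* Sketch only: pad each device's input chunk with one halo element on each side. */
for (int i = 0; i < ngpus; i++) {
    cudaSetDevice(i);
    cudaMalloc((void **)&d_A[i], (iSize + 2) * sizeof(float)); /* chunk + 2 halo cells */
    cudaMalloc((void **)&d_C[i], iSize * sizeof(float));

    int first = i * iSize;                                   /* first global index of this chunk */
    int left  = (first == 0)            ? first : first - 1; /* left-halo source index  */
    int right = (first + iSize == size) ? first + iSize - 1
                                        : first + iSize;     /* right-halo source index */

    /* interior elements land at offsets 1..iSize, halo cells at 0 and iSize+1 */
    cudaMemcpy(d_A[i] + 1,         h_A + first, iSize * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_A[i],             h_A + left,  sizeof(float),         cudaMemcpyHostToDevice);
    cudaMemcpy(d_A[i] + iSize + 1, h_A + right, sizeof(float),         cudaMemcpyHostToDevice);
}
/* inside the kernel, each output element j of a chunk then reads only local data:
   C[j] = A[j] + A[j + 1] + A[j + 2];   (A is the padded per-device buffer) */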
I created some code to do a 2D convolution on a 1300x1300 grayscale image with a 15x15 kernel, in standard C++ and in CUDA. Both versions are below:
CPU:
#include <iostream>
#include <exception>
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
inline int index(int x, int y)
{
return x*my + y;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
for (int x=0; x<N; ++x)
for (int y=0; y<N; ++y)
{
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
delete[] image;
delete[] kernel;
delete[] result;
}
GPU:
#include <iostream>
#include <exception>
// ignore, just for error handling
struct ErrorHandler {
int d_line;
char const *d_file;
ErrorHandler(int line, char const *file) : d_line(line), d_file(file) {};
};
#define EH ErrorHandler(__LINE__, __FILE__)
ErrorHandler operator<<(ErrorHandler eh, cudaError_t err)
{
if (err != cudaSuccess)
{
std::cerr << cudaGetErrorString( err ) << " in " << eh.d_file << " at line " << eh.d_line << '\n';
throw std::exception();
}
return eh;
}
// end.
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
__device__ inline int index(int x, int y)
{
return x*my + y;
}
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
int x = blockIdx.x;
int y = blockIdx.y; // becomes: int y = threadIdx.x;
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
double *image_cuda;
double *kernel_cuda;
double *result_cuda;
EH << cudaMalloc((void **) &image_cuda, N*N*sizeof(double));
EH << cudaMalloc((void **) &kernel_cuda, K*K*sizeof(double));
EH << cudaMalloc((void **) &result_cuda, N*N*sizeof(double));
EH << cudaMemcpy(image_cuda, image, N*N*sizeof(double), cudaMemcpyHostToDevice);
EH << cudaMemcpy(kernel_cuda, kernel, K*K*sizeof(double), cudaMemcpyHostToDevice);
dim3 grid ( N, N );
kernelkernel<<<grid, 1>>>(image_cuda, kernel_cuda, result_cuda);
// replace previous 2 statements with:
// kernelkernel<<<N, N>>>(image_cuda, kernel_cuda, result_cuda);
EH << cudaMemcpy(result, result_cuda, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree( image_cuda );
cudaFree( kernel_cuda );
cudaFree( result_cuda );
delete[] image;
delete[] kernel;
delete[] result;
}
I would expect the CUDA code to be a lot faster, however:
$ nvprof ./gpuversion
==17806== NVPROF is profiling process 17806, command: ./gpuversion
==17806== Profiling application: ./gpuversion
==17806== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.89% 3.83149s 1 3.83149s 3.83149s 3.83149s kernelkernel(double*, double*, double*)
0.07% 2.6420ms 1 2.6420ms 2.6420ms 2.6420ms [CUDA memcpy DtoH]
0.04% 1.5111ms 2 755.54us 736ns 1.5103ms [CUDA memcpy HtoD]
And:
$ time ./cpuversion
real 0m3.382s
user 0m3.371s
sys 0m0.012s
Their difference is statistically insignificant. The CUDA-kernel takes approximately 3-4 seconds, why isn't it a lot faster? Is my code run in parallel?
PS: I'm new to CUDA, so I could be missing something trivial.
SOLUTION
What I found out is that CUDA does not let you access memory willy-nilly from blocks. I guess the general strategy of CUDA programming is:
allocate memory on the GPU and copy the data from RAM to it using cudaMalloc and cudaMemcpy
divide the workload among blocks and threads in such a way that the memory accessed by different blocks doesn't overlap much.
If there is overlap between the memory used by blocks, start each block by copying the memory inside a shared array. Notice that:
the size of this array must be known at compile time
its size is limited
this memory is shared by all threads in ONE block, so __shared__ double foo[10] allocates 10 doubles for each BLOCK.
copy the memory needed by one block to the shared variables inside the kernel. Of course, you use the different threads to do this 'efficiently'
sync the threads, such that all data is there before it is used.
process the data, and write the result to the output array of the kernel
synch again, I'm not sure why, but everyone on the internet is doing it :S
copy the GPU memory back to RAM
clean up the GPU memory.
This gives the following code. It is MEX code for MATLAB computing the structural similarity (SSIM), which also works via a sliding kernel, but over two images and with a different aggregate than the dot product.
// author: Herbert Kruitbosch, CC: be nice, include my name in documentation/papers/publications when used
#include <matrix.h>
#include <mex.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iostream>
#include <stdio.h>
static void HandleError(
cudaError_t err,
const char *file,
int line )
{
if (err != cudaSuccess)
{
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TILE_WIDTH 31
__device__ inline double sim(double v0, double v1, double c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
__device__ inline int index(int rows, int cols, int row, int col)
{
return row + col*rows;
}
__global__ void ssimkernel(double *test, double *reference, const double * __restrict__ kernel, double *ssim, int k, int rows, int cols, int tile_batches_needed)
{
int radius = k / 2;
int block_width = TILE_WIDTH - k + 1;
__shared__ double tile_test [TILE_WIDTH][TILE_WIDTH];
__shared__ double tile_reference[TILE_WIDTH][TILE_WIDTH];
for(int offset=0; offset < tile_batches_needed; ++offset)
{
int dest = block_width*block_width*offset + threadIdx.y * block_width + threadIdx.x;
int destRow = dest / TILE_WIDTH;
int destCol = dest % TILE_WIDTH;
int srcRow = blockIdx.y * block_width + destRow - radius;
int srcCol = blockIdx.x * block_width + destCol - radius;
int src = srcCol * rows + srcRow;
if (destRow < TILE_WIDTH)
{
if (srcRow >= 0 and srcRow < rows and
srcCol >= 0 and srcCol < cols)
{
tile_test [destRow][destCol] = test [src];
tile_reference[destRow][destCol] = reference[src];
}
else
{
tile_test [destRow][destCol] = 0;
tile_reference[destRow][destCol] = 0;
}
}
}
__syncthreads();
double mean_test = 0;
double mean_reference = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
mean_test += w * tile_test [threadIdx.y+i][threadIdx.x+j];
mean_reference += w * tile_reference[threadIdx.y+i][threadIdx.x+j];
}
double var_test = 0;
double var_reference = 0;
double correlation = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
double a = (tile_test [threadIdx.y+i][threadIdx.x+j] - mean_test );
double b = (tile_reference[threadIdx.y+i][threadIdx.x+j] - mean_reference);
var_test += w * a * a;
var_reference += w * b * b;
correlation += w * a * b;
}
int destRow = blockIdx.y * block_width + threadIdx.y;
int destCol = blockIdx.x * block_width + threadIdx.x;
if (destRow < rows and destCol < cols)
ssim[destCol * rows + destRow] = sim(mean_test, mean_reference, 0.01) * (0.03 + 2*correlation) / (0.03 + var_test + var_reference);
__syncthreads();
}
template<typename T>
inline T sim(T v0, T v1, T c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
inline int upperdiv(int a, int b) {
return (a + b - 1) / b;
}
void mexFunction(int nargout, mxArray *argout[], int nargin, const mxArray *argin[])
{
mwSize rows = mxGetDimensions(argin[0])[0];
mwSize cols = mxGetDimensions(argin[0])[1];
mwSize k = mxGetDimensions(argin[2])[0];
mwSize channels = mxGetNumberOfDimensions(argin[0]) <= 2 ? 1 : mxGetDimensions(argin[0])[2];
int dims[] = {rows, cols, channels};
argout[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
double *test = (double *)mxGetData(argin[0]);
double *reference = (double *)mxGetData(argin[1]);
double *gaussian = (double *)mxGetData(argin[2]);
double *ssim = (double *)mxGetData(argout[0]);
double *test_cuda;
double *reference_cuda;
double *gaussian_cuda;
double *ssim_cuda;
HANDLE_ERROR( cudaMalloc((void **) &test_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &reference_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &gaussian_cuda, k*k*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &ssim_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMemcpy(gaussian_cuda, gaussian, k*k*sizeof(double), cudaMemcpyHostToDevice) );
int block_width = TILE_WIDTH - k + 1;
int tile_batches_needed = upperdiv(TILE_WIDTH*TILE_WIDTH, block_width*block_width);
for(int c=0; c<channels; ++c)
{
HANDLE_ERROR( cudaMemcpy(test_cuda, test + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(reference_cuda, reference + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
dim3 dimGrid(upperdiv(cols, block_width), upperdiv(rows, block_width), 1);
dim3 dimBlock(block_width, block_width, 1);
ssimkernel<<<dimGrid, dimBlock>>>(test_cuda, reference_cuda, gaussian_cuda, ssim_cuda, k, rows, cols, tile_batches_needed);
HANDLE_ERROR( cudaMemcpy(ssim + rows*cols*c, ssim_cuda, rows*cols*sizeof(double), cudaMemcpyDeviceToHost) );
}
cudaFree( test_cuda );
cudaFree( reference_cuda );
cudaFree( gaussian_cuda );
cudaFree( ssim_cuda );
}
kernelkernel<<<grid, 1>>>
This is a significant issue; threads on nVidia GPUs work in warps of 32 threads. However, you've only assigned a single thread to each block, which means 31 of those threads will sit idle while a single thread does work. And for kernels where you have the flexibility, you'll usually want several warps per block rather than just one.
You could get an immediate speedup by using N blocks and N threads per block, rather than using N^2 blocks.
Actually, N might be too big, since there's an upper limit on the number of threads per block. Although you could choose a suitable M so that you use N/M threads per block, and N * M blocks.
In fact, you'll probably get the best results in this regard by picking some M (I'm guessing 256 will probably be near optimal) and launching with L = ceiling(N*N/M) blocks and M threads per block. Then each thread reconstructs an index in [0, M*L) based on its block and thread ID, and those whose index is in [0, N*N) split that index into an x and y coordinate and do the work.
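A hedged sketch of that launch (M here is just the illustrative 256, and the index reconstruction is the only change to the posted kernel):
// Sketch: M threads per block, L = ceiling(N*N/M) blocks; each thread
// reconstructs one flat index and exits if it is past the end of the image.
#define M 256
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   // in [0, M*L)
    if (idx >= N * N) return;
    int x = idx / N;
    int y = idx % N;
    // ... the same convolution loops over i and j as before, using x and y ...
}
// launch:
int L = (N * N + M - 1) / M;   // ceiling(N*N / M)
kernelkernel<<<L, M>>>(image_cuda, kernel_cuda, result_cuda);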
Accessing global memory in a kernel is costly, because of its latency. A global memory request (both reading and writing) takes hundreds of clock cycles to complete. You want to minimise the amount of times global memory is accessed, and access it in contiguous blocks.
If each piece of data is accessed exactly once, there's nothing to do about the latency, but that's seldom the case. And definitely not the case in your code, where the kernel array is accessed by all threads in the same pattern, and a lot of image is accessed by multiple threads as well.
The solution for that is to start the kernel by fetching the data from the high-latency global memory into the low-latency shared memory. Shared memory is a block of memory on the multiprocessor, and its latency is comparable to that of registers. So most simple kernels follow a structure like this:
Each thread fetches data from global memory to shared memory. You want to fetch data in contiguous sequences if possible, as global memory is accessed through transactions. If there's not enough data for all threads to fetch, leave some of them idle.
Threads operate on the data in shared memory.
Data is written from shared memory back to global memory in the same pattern as it was fetched in step 1.
Shared memory is shared by all threads within a thread block. Which leads us to the second big issue in your code: you're not using thread blocks at all. Threads in one block run on one multiprocessor, share shared memory, can be synchronised with each other etc. You need to organise threads into blocks well to get the most out of them.
The grid of blocks is just a mechanism to be able to run more blocks at one invocation. All the goodies of parallel instruction execution and shared memory access are within a block. The grid of blocks is just "yeah, sorry, my data's so big a single block won't do, just run many of them."
You're doing the exact opposite: your blocks have one thread each, which means that in each step, only one thread from each warp runs on the multiprocessor (based on your device's compute capability and the number of warp schedulers available, this means something like 2–4 threads on one multiprocessor at most).
You'll have to re-structure your threads to mirror the data access patterns, and prefetch data into shared memory. This will give you the performance boost you expect.
The above is just a short summary. Refer to the CUDA programming guide for details on block organisation, shared memory, and global memory transactions.
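For illustration, a minimal sketch of the structure described above applied to your convolution. It reuses the N, K and K2 macros from your code; BLOCK, TILE and the cooperative-load loop are just one possible choice, and the kernel would be launched with dim3 block(BLOCK, BLOCK) and dim3 grid((N + BLOCK - 1) / BLOCK, (N + BLOCK - 1) / BLOCK):
// Sketch: each block stages the image tile it needs (output tile plus the
// kernel radius on every side) into shared memory, synchronizes, then
// convolves entirely from shared memory.
#define BLOCK 16
#define TILE (BLOCK + K - 1)
__global__ void conv_shared(const double *image, const double *kernel, double *result)
{
    __shared__ double tile[TILE][TILE];
    int x0 = blockIdx.x * BLOCK;                       // top-left output corner of this block
    int y0 = blockIdx.y * BLOCK;
    // step 1: cooperative load, each thread fetches several tile elements
    for (int t = threadIdx.x * BLOCK + threadIdx.y; t < TILE * TILE; t += BLOCK * BLOCK) {
        int tx = t / TILE, ty = t % TILE;
        int ix = x0 + tx - K2, iy = y0 + ty - K2;
        tile[tx][ty] = (ix >= 0 && ix < N && iy >= 0 && iy < N) ? image[ix * N + iy] : 0.0;
    }
    __syncthreads();                                   // step 2: wait until the tile is complete
    // step 3: compute from shared memory, write the result to global memory
    int x = x0 + threadIdx.x, y = y0 + threadIdx.y;
    if (x < N && y < N) {
        double r = 0;
        for (int i = 0; i < K; ++i)
            for (int j = 0; j < K; ++j)
                r += kernel[i * K + j] * tile[threadIdx.x + i][threadIdx.y + j];
        result[x * N + y] = r;
    }
}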
If you're using global memory in CUDA, all the data accesses will be serialized in something like a queue, and you'll get an almost sequential solution, not a parallel one.
Also, transferring a large dataset from your RAM to GPU memory takes a lot of time (the speed of the bus is limited).
So, I think you have to somehow distribute your data across the compute units in your GPU (partition it into shared memory).
Check this to see a solution for improving GPU memory usage in a case similar to yours.
I have a program to compute the values of an array:
array A: has 32 elements, with values from 0 -> 31.
array B: has 16 elements, with values = 0.
I want to compute the value of B[i] following this rule: B[i] = A[i*2] + A[i*2+1], for i from 0 to 15.
I use CUDA programming with my example code:
Main.cu
__global__ void Kernel(int *devB, int *devA)
{
// Use shared memory, 16 threads per block, so I use 16 elements of shared memory per block
__shared__ int smA[16];
//copy data from global memory to shared memory
//1 thread copies 1 element
smA[threadIdx.x] = devA[threadIdx.x + blockIdx.x * blockDim.x];
__syncthreads();
//8 thread in Block
if (threadIdx.x < 8)
{
devB[threadIdx.x + blockIdx.x * blockDim.x] =
smA[threadIdx.x * 2] + smA[threadIdx.x * 2 + 1];
}
}
Void main
void main()
{
int *A = (int*)malloc(sizeof(int) * 32);
int *B = (int*)malloc(sizeof(int) * 16);
for (int i = 0; i < 32; i++)
A[i] = i;
int *devA = NULL;
cudaMalloc((void**)&devA, sizeof(int) * 32);
cudaMemcpy(devA, A, sizeof(int) * 32, cudaMemcpyHostToDevice);
int * devB = NULL;
cudaMalloc((void**)&devB, sizeof(int) * 16);
dim3 block(16, 1, 1);
dim3 grid(2, 1, 1);
Kernel<<<grid, block>>>(devB, devA);
//copy back data to host
cudaMemcpy(B, devB, sizeof(int) * 16, cudaMemcpyDeviceToHost);
for (int i = 0; i < 16; i++) printf("%d\t", B[i]);
if (A != NULL) free(A);
if (B != NULL) free(B);
if (devA != NULL) cudaFree(devA);
if (devB != NULL) cudaFree(devB);
}
So, I want to ask: following my code above, I use shared memory int smA[16] in the kernel, and with 2 blocks = 2*16 threads. Because each thread executes the kernel (from Seland.pdf),
=> will I have 16x16 = 256 elements in shared memory? => that makes no sense!
No, your assumption is wrong. Because shared memory is used for interaction between threads within the same block, shared memory is allocated for a whole thread block, not per thread.
In your example you will use 16 integer elements for every thread block. In total your kernel requires 32 integer elements to run all thread blocks simultaneously.
Even though it's not the same thing, you could compare it with static variables in C code.
If you write something like the following code example in your kernel, every thread will use its own array with 16 elements. But this array can't be accessed by other threads (the exception being shuffle instructions).
__global__ void kernel (...)
{
int array_single_thread[16]; // Every thread instance has its own array.
...
__shared__ int array_thread_block[16]; // Once allocated for complete thread block.
}
I have written a small program in CUDA that counts how many 3's are in a C array and prints the count.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ int s_a[512]; // one for each thread
//s_a[threadIdx.x] = a[id];
if( id < N )
{
//if( s_a[threadIdx.x] == 3 )
if( a[id] == 3 )
{
atomicAdd(count, 1);
}
}
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
int N = 16777216;
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
// do calculation on device
int blockSize = 512;
int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", count);
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers that are assigned to this block to a shared block of memory (you can see it in the comments) and the time is the same again.
This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are issues (probably) with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.