Atomic Add on CUDA not working - C++

My problem is to find the number of integer points in an n-dimensional sphere using CUDA. I don't understand what is wrong with the code below, but it gives 0 as output all the time. The CUDA compute capability is 2.0 and the toolkit version is 3.10.
Thanks for all the help.
__global__ void count_in(int pow_rad, int ndim, int *digit, int w, unsigned int *count, double radius)
{
    long int i, j;
    int rem, idx, sq, num;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int width = gridDim.x * blockDim.x;
    int h = 2 * w + 1;
    i = iy * width + ix;
    if (i > pow_rad) return;
    sq = 0;
    idx = 0;
    num = i;
    for (j = 0; j < ndim; j++)
        digit[j] = 0;
    while (num != 0)
    {
        rem = num % w;
        num /= w;
        digit[idx] = rem;
        idx++;
    }
    for (j = 0; j < ndim; j++)
        sq += (digit[j] - h) * (digit[j] - h);
    if (sq < (radius * radius))
        atomicInc(count, (unsigned int)1);
    __syncthreads();
}
int main(int argc, char* argv[])
{
    const long ntrials = 5;
    int i;
    for (int n = 0; n < ntrials; ++n) {
        int *digit;
        unsigned int *count;
        std::cout << n << std::endl;
        int pow_rad;
        unsigned int num;
        // Select radius and number of dimensions at random
        const double r = drand48() * (RMAX - RMIN) + RMIN;
        const int nd = lrand48() % (MAXDIM - 1) + 1;
        cudaMalloc((void**)&digit, sizeof(int) * nd);
        cudaMalloc((void**)&count, sizeof(unsigned int));
        cudaMemset(count, 0, sizeof(unsigned int));
        int h = (int)floor(r);
        int w = 2 * h + 1;
        std::cout << "###" << r << " " << nd << std::endl;
        for (i = 1; i <= nd; i++)
            pow_rad *= w;
        int width = (int)sqrt(pow_rad);
        // Call your function
        dim3 dimBlock(32, 32);
        dim3 dimGrid((width / 32) + 1, (width / 32) + 1);
        count_in<<<dimGrid, dimBlock>>>(pow_rad, nd, digit, w, count, r);
        cudaMemcpy(&num, count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        std::cout << "-->" << num << std::endl;
    }
}

I didn't look at all of your code, but the line
atomicInc(count, (unsigned int)1);
seems to show a common misunderstanding of the atomicInc function. The second argument is not the amount to increment by, but the modulus: when the global variable reaches that amount, it resets to zero. With the value you specified, each time the statement executes, the variable count is reset to 0.
If you change atomicInc to atomicAdd, or if you change the modulus to something large enough that it will never be reached, it should work better.
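For example, either of these avoids the wraparound (a minimal sketch of the two options; the huge modulus value is my choice, not from the original code):
atomicAdd(count, 1u);           // adds 1; here the second argument really is the amount to add
// or
atomicInc(count, 0xFFFFFFFFu);  // a modulus so large it is never reached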

Related

CUDA Speed Slower than expected - Image Processing

I am new to CUDA development and wanted to write a simple benchmark to test some image processing feasibility. I have 32 images that are each 720x540, one byte per pixel greyscale.
I am running benchmarks for 10 seconds, and counting how many times they are able to process. There are three benchmarks I am running:
The first is just transferring the images into the GPU global memory, via cudaMemcpy
The second is transferring and processing the images.
The third is running the equivalent test on a CPU.
As a simple starting test, the image processing just counts the number of pixels above a certain greyscale value. I'm finding that accessing global memory on the GPU is very slow. I have my benchmark structured such that it creates one block per image, and one thread per row in each image. Each thread counts its pixels into a shared memory array, after which the first thread sums them up (see below).
The issue I am having is that this all runs very slowly - about 50fps, much slower than the CPU version at about 230fps. If I comment out the pixel value comparison, resulting in just a count of all pixels, I get 6x the performance. I tried using texture memory but didn't see a performance gain. I am running a Quadro K2000. Also: the image-copy-only benchmark is able to copy at around 330fps, so that doesn't appear to be the issue.
Any help / pointers would be appreciated. Thank you.
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; // the third kernel launch parameter "<<<, , >>>" sets the size
    // see here for indexing: https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];
    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd = pixelStart + W;
    unsigned char *pixelItr = pixelStart;
    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) // REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;
    __syncthreads();
    if (myStartRow == 0)
    { // first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}
extern "C" void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
Two changes to your kernel design can result in a significant speedup:
Perform the operations column-wise instead of row-wise (a condensed sketch of the access-pattern difference follows this list). The general background for why this matters/helps is described here.
Replace your final operation with a canonical parallel reduction.
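To illustrate the first change (a condensed sketch of the two access patterns, using the variable names from the full listing below): in the row-per-thread kernel, neighbouring threads in a warp read addresses W bytes apart at every step, while in the column-per-thread kernel they read adjacent bytes, so each warp's loads coalesce.
// Row-per-thread (original): thread t scans row t; neighbours are W bytes apart -> uncoalesced
for (int i = 0; i < W; i++)
    row_count += (imageStart[myStartRow * W + i] > Threshold);
// Column-per-thread (suggested): thread t scans column t; neighbours are adjacent -> coalesced
for (int i = 0; i < H; i++)
    col_count += (imageStart[myStartCol + i * W] > Threshold);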
According to my testing, those 2 changes result in ~22x speedup in kernel performance:
$ cat t49.cu
#include <iostream>
#include <helper_cuda.h>
typedef unsigned char U8;
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; // the third kernel launch parameter "<<<, , >>>" sets the size
    // see here for indexing: https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];
    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd = pixelStart + W;
    unsigned char *pixelItr = pixelStart;
    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) // REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;
    __syncthreads();
    if (myStartRow == 0)
    { // first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}
__global__ void ThreadPerColCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns, int rsize)
{
    extern __shared__ int col_counts[]; // the third kernel launch parameter "<<<, , >>>" sets the size
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned char *imageStart = AllPixels[myImage];
    int myStartCol = (threadIdx.y * blockDim.x + threadIdx.x);
    int col_count = 0;
    for (int i = 0; i < H; i++)
        if (imageStart[myStartCol + i * W] > Threshold) col_count++;
    col_counts[threadIdx.x] = col_count;
    __syncthreads();
    for (int i = rsize; i > 0; i >>= 1) {
        if ((threadIdx.x + i < W) && (threadIdx.x < i)) col_counts[threadIdx.x] += col_counts[threadIdx.x + i];
        __syncthreads();
    }
    if (!threadIdx.x) AllReturns[myImage] = col_counts[0];
}
void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
unsigned next_power_of_2(unsigned v) {
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}
void cuda_Benchmark1(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    int rsize = next_power_of_2(W/2);
    ThreadPerColCounter<<<nImages, W, sizeof(int)*W>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns, rsize);
    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
int main() {
    const int my_W = 720;
    const int my_H = 540;
    const int n_img = 128;
    const int my_thresh = 10;
    U8 **img_p, **img_ph;
    U8 *img, *img_h;
    int *res, *res_h, *res_h1;
    img_ph = (U8 **)malloc(n_img*sizeof(U8*));
    cudaMalloc(&img_p, n_img*sizeof(U8*));
    cudaMalloc(&img, n_img*my_W*my_H*sizeof(U8));
    img_h = new U8[n_img*my_W*my_H];
    for (int i = 0; i < n_img*my_W*my_H; i++) img_h[i] = rand()%20;
    cudaMemcpy(img, img_h, n_img*my_W*my_H*sizeof(U8), cudaMemcpyHostToDevice);
    for (int i = 0; i < n_img; i++) img_ph[i] = img + my_W*my_H*i;
    cudaMemcpy(img_p, img_ph, n_img*sizeof(U8*), cudaMemcpyHostToDevice);
    cudaMalloc(&res, n_img*sizeof(int));
    cuda_Benchmark(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h = new int[n_img];
    cudaMemcpy(res_h, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h1 = new int[n_img];
    cudaMemcpy(res_h1, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_img; i++)
        if (res_h[i] != res_h1[i]) {
            std::cout << "mismatch at: " << i << " was: " << res_h1[i] << " should be: " << res_h[i] << std::endl;
            return 0;
        }
}
$ nvcc -o t49 t49.cu -I/usr/local/cuda/samples/common/inc
$ cuda-memcheck ./t49
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t49
==1756== NVPROF is profiling process 1756, command: ./t49
==1756== Profiling application: ./t49
==1756== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 72.02% 54.325ms 1 54.325ms 54.325ms 54.325ms ThreadPerRowCounter(int, int, int, unsigned char**, int*)
24.71% 18.639ms 2 9.3195ms 1.2800us 18.638ms [CUDA memcpy HtoD]
3.26% 2.4586ms 1 2.4586ms 2.4586ms 2.4586ms ThreadPerColCounter(int, int, int, unsigned char**, int*, int)
0.00% 3.1040us 2 1.5520us 1.5360us 1.5680us [CUDA memcpy DtoH]
API calls: 43.63% 59.427ms 3 19.809ms 18.514us 59.159ms cudaMalloc
41.70% 56.789ms 2 28.394ms 2.4619ms 54.327ms cudaDeviceSynchronize
14.02% 19.100ms 4 4.7749ms 17.749us 18.985ms cudaMemcpy
0.52% 705.26us 96 7.3460us 203ns 327.21us cuDeviceGetAttribute
0.05% 69.268us 1 69.268us 69.268us 69.268us cuDeviceTotalMem
0.04% 50.688us 1 50.688us 50.688us 50.688us cuDeviceGetName
0.04% 47.683us 2 23.841us 14.352us 33.331us cudaLaunchKernel
0.00% 3.1770us 1 3.1770us 3.1770us 3.1770us cuDeviceGetPCIBusId
0.00% 1.5610us 3 520ns 249ns 824ns cuDeviceGetCount
0.00% 1.0550us 2 527ns 266ns 789ns cuDeviceGet
$
(Quadro K2000, CUDA 9.2.148, Fedora Core 27)
(The next_power_of_2 code is lifted from this answer)
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the questions in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

CUDA - Parallel Reduction Sum

I am trying to implement a parallel reduction sum in CUDA 7.5. I have been trying to follow the NVIDIA PDF that walks you through the initial algorithm and then steadily more optimised versions. I am currently filling an array with 1 in every position so that I can check that the output is correct, but I am getting a value of -842159451 for an array of size 64. I expect the kernel code to be correct, as I have followed the exact code from NVIDIA, but here is my kernel:
__global__ void reduce0(int *input, int *output) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = input[i];
    __syncthreads();
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
Here is my code calling the kernel, which is where I expect my problem to be:
int main()
{
    int numThreadsPerBlock = 1024;
    int *hostInput;
    int *hostOutput;
    int *deviceInput;
    int *deviceOutput;
    int numInputElements = 64;
    int numOutputElements; // number of elements in the output list, initialised below
    numOutputElements = numInputElements / (numThreadsPerBlock / 2);
    if (numInputElements % (numThreadsPerBlock / 2)) {
        numOutputElements++;
    }
    hostInput = (int *)malloc(numInputElements * sizeof(int));
    hostOutput = (int *)malloc(numOutputElements * sizeof(int));
    for (int i = 0; i < numInputElements; ++i) {
        hostInput[i] = 1;
    }
    const dim3 blockSize(numThreadsPerBlock, 1, 1);
    const dim3 gridSize(numOutputElements, 1, 1);
    cudaMalloc((void **)&deviceInput, numInputElements * sizeof(int));
    cudaMalloc((void **)&deviceOutput, numOutputElements * sizeof(int));
    cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(int), cudaMemcpyHostToDevice);
    reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
    cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(int), cudaMemcpyDeviceToHost);
    for (int ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii]; // accumulates the sum in the first element
    }
    int sumGPU = hostOutput[0];
    printf("GPU Result: %d\n", sumGPU);
    std::string wait;
    std::cin >> wait;
    return 0;
}
I have also tried bigger and smaller array sizes for the input and I get the same result of a very large negative value no matter the size of the array.
Seems you are using a dynamically allocated shared array:
extern __shared__ int sdata[];
but you are not allocating it in the kernel invocation:
reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
You have two options:
Option 1
Allocate the shared memory statically in the kernel, e.g.
constexpr int threadsPerBlock = 1024;
__shared__ int sdata[threadsPerBlock];
More often than not I find this the cleanest approach, as it works without a problem when you have multiple arrays in shared memory. The drawback is that while the size usually depends on the number of threads in the block, you need the size to be known at compile-time.
Option 2
Specify the amount of dynamically allocated shared memory in the kernel invocation:
reduce0<<<gridSize, blockSize, numThreadsPerBlock*sizeof(int)>>>(deviceInput, deviceOutput);
This will work for any value of numThreadsPerBlock (provided it is within the allowed range, of course). The drawback is that if you have multiple extern shared arrays, you need to figure out how to lay them out in memory yourself, so that one does not overwrite the other.
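For example, two logical arrays are typically carved out of the single extern buffer by hand (a hedged sketch, not from the original answer; the names and sizes are illustrative):
extern __shared__ int smem[];
int *sdata = smem;               // first blockDim.x ints
int *stemp = smem + blockDim.x;  // next blockDim.x ints
// launched with: kernel<<<gridSize, blockSize, 2 * numThreadsPerBlock * sizeof(int)>>>(...);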
Note, there may be other problems in your code. I didn't test it. This is something I spotted immediately upon glancing over your code.
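One example of such a problem (my observation, not part of the original answer): with numInputElements = 64 and 1024 threads per block, the unguarded load sdata[tid] = input[i]; reads past the end of deviceInput. A common guard is to pass the element count and zero-fill the excess slots:
__global__ void reduce0(int *input, int *output, unsigned int n) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? input[i] : 0; // zeros do not affect the sum
    __syncthreads();
    // ... reduction loop as before ...
}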

CUDA Histogram an illegal memory access was encountered (77) [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
So here is my almost complete code. The first kernel, which is a normal global histogram, works correctly, but I get the error "an illegal memory access was encountered (77)" at the final memcpy, after calculating the shared histogram. I don't know what is wrong with the code; it seems as if the shared histogram kernel writes outside of d_hist2. I also checked whether bin_count changed, but it didn't. So is my shared_histog kernel wrong, or am I making a mistake in the memcpy?
Note: w * h * nc is the size of my input image.
__global__ void histog(int *img, int *hist, int bin_count, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    if (x >= n) return;
    unsigned char value = img[x];
    int bin = value % bin_count;
    atomicAdd(&hist[bin], 1);
}
__global__ void shared_histog(int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;
    if (x > n) return;
    __shared__ int shHist[256];
    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();
    unsigned char value = img[x];
    __syncthreads();
    atomicAdd((int*)&shHist[value], 1);
    __syncthreads();
    atomicAdd((int*)&(hist[indx]), shHist[indx]);
}
int main(int argc, char **argv)
{
    cudaDeviceSynchronize(); CUDA_CHECK;
    int *imgval = new int[(size_t)w*h*nc];
    for (int i = 0; i < w*h*nc; i++)
        imgval[i] = (imgIn[i])*256 + 1;
    int bin_count = 256;
    int *Histogram = new int[bin_count];
    int *Histogram2 = new int[bin_count];
    for (int i = 0; i < bin_count; i++)
        Histogram2[i] = 0;
    Timer timer; timer.start();
    for (int i = 0; i < bin_count; i++)
        Histogram[i] = 0;
    for (int i = 0; i < w*h*nc; i++)
        Histogram[(imgval[i])]++;
    showHistogram256("CPU_Histo", Histogram, 100 + w + 40, 100);
    timer.end(); float t = timer.get(); // elapsed time in seconds
    cout << "CPU time: " << t*1000 << " ms" << endl;
    int *d_img = NULL;
    int nbytes = w * h * nc * sizeof(int);
    cudaMalloc(&d_img, nbytes); CUDA_CHECK;
    cudaMemcpy(d_img, imgval, nbytes, cudaMemcpyHostToDevice); CUDA_CHECK;
    int *d_hist = NULL;
    cudaMalloc(&d_hist, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist, 0, bin_count * sizeof(int)); CUDA_CHECK;
    int *d_hist2 = NULL;
    cudaMalloc(&d_hist2, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist2, 0, bin_count * sizeof(int)); CUDA_CHECK;
    dim3 block = dim3(1024, 1, 1);
    dim3 grid = dim3((w*h*nc + block.x - 1) / block.x, 1, 1);
    Timer timer2; timer2.start();
    histog<<<grid, block>>>(d_img, d_hist, bin_count, nbytes); CUDA_CHECK;
    timer2.end(); float t2 = timer2.get(); // elapsed time in seconds
    cout << "GPU time: " << t2*1000 << " ms" << endl;
    cudaMemcpy(Histogram, d_hist, bin_count * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
    showHistogram256("GPU_Histo", Histogram, 100 + w + 40, 100 + h/2 + 10);
    Timer timer3; timer3.start();
    shared_histog<<<grid, block>>>(d_img, d_hist2, nbytes); CUDA_CHECK;
    timer3.end(); float t3 = timer3.get(); // elapsed time in seconds
    cout << "Shared time: " << t3*1000 << " ms" << endl;
    // * here comes the error *
    cudaMemcpy(Histogram2, d_hist2, 256 * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
    showHistogram256("GPU_Histo_Shared", Histogram2, 100 + w + 40, 100 + h + 10);
    return 0;
}
You're using __syncthreads() after a conditional statement:
if(x>n) return;
that may prevent all threads in the block from reaching it. That is not correct usage:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
But it is probably not connected to the illegal memory access.
You are launching this kernel with 1024 threads per block:
dim3 block = dim3(1024,1,1);
which means in the kernel, your indx variable:
int indx = threadIdx.x;
will go from 0..1023 depending on the thread, which means that this line:
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
                        ^^^^           ^^^^
will attempt to index into both hist and shHist out-of-bounds for threads whose indx value is greater than 255, since both hist and shHist are only allocated with 256 elements.
You can probably fix this by adding a conditional statement:
if (indx < 256)
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
If you compile with -lineinfo and use cuda-memcheck, you can actually have cuda-memcheck pinpoint the line of source code that is generating the out-of-bounds access.
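Putting both fixes together, the kernel might look like this (a hedged sketch of the corrections described above; I have not tested it against the asker's full program):
__global__ void shared_histog(int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;
    __shared__ int shHist[256];
    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();   // no early return above, so every thread reaches this barrier
    if (x < n)         // guard the load instead of returning early
    {
        unsigned char value = img[x];
        atomicAdd(&shHist[value], 1);
    }
    __syncthreads();
    if (indx < 256)    // only the first 256 threads of the block write the bins out
        atomicAdd(&hist[indx], shHist[indx]);
}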

cudaMemcpy invalid argument, no clues about what is wrong

I wrote some code and can't get it to work for some reason... but it's essentially copy-pasted from the working test program generated by Visual Studio:
__device__ int pseudoRandomFunction(int seed)
{
    unsigned int m_w = 150;
    unsigned int m_z = 40;
    m_z = 36969 * (m_z & 65535) + (m_z >> 16);
    m_w = 18000 * (m_w & 65535) + (m_w >> 16);
    return (m_z << 16) + m_w;
}
__global__ void fillArrayWithRandom(int* vector, int seed = 0)
{
    int i = threadIdx.x;
    vector[i] = pseudoRandomFunction(seed ^ i);
}
void Lab1(int* array, int size)
{
    int* gpuArray = 0;
    gpuErrorCheck(cudaSetDevice(0));
    gpuErrorCheck(cudaMalloc(&gpuArray, size));
    gpuErrorCheck(cudaMemcpy(gpuArray, array, size * sizeof(int), cudaMemcpyHostToDevice)); // <--- here is the invalid argument error!
    fillArrayWithRandom<<<1, size>>>(gpuArray, 0);
    gpuErrorCheck(cudaGetLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    gpuErrorCheck(cudaMemcpy(array, gpuArray, size * sizeof(int), cudaMemcpyDeviceToHost));
    cudaFree(gpuArray);
    gpuErrorCheck(cudaDeviceReset());
}
I even tried to change the argument types, but no progress... please help! My GPU is a GeForce 9600GT, and CUDA is 6.5.
The only problem I can see is that this line is not correct:
gpuErrorCheck(cudaMalloc(&gpuArray, size));
it should be:
gpuErrorCheck(cudaMalloc(&gpuArray, size*sizeof(int)));
with that change, I can run your code without error, when size = 10.
Note that your pseudoRandomFunction routine does not use seed, so every value it creates will be the same.
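If you do want the output to vary, one option is to fold the seed into the generator state (a hedged sketch of one possible fix, not part of the original answer):
__device__ int pseudoRandomFunction(int seed)
{
    unsigned int m_w = 150 + seed;  // fold the seed into the initial state
    unsigned int m_z = 40 ^ seed;   // so different seeds produce different values
    m_z = 36969 * (m_z & 65535) + (m_z >> 16);
    m_w = 18000 * (m_w & 65535) + (m_w >> 16);
    return (m_z << 16) + m_w;
}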

CUDA, calculate distance matrix between 3D objects

I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.
struct atom
{
    double x, y, z;
};
int main()
{
    // N: number of atoms in a molecule
    double DistanceMatrix[N][N];
    double d;
    atom Atoms[N];
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            DistanceMatrix[i][j] = (Atoms[i].x - Atoms[j].x) * (Atoms[i].x - Atoms[j].x) +
                                   (Atoms[i].y - Atoms[j].y) * (Atoms[i].y - Atoms[j].y) +
                                   (Atoms[i].z - Atoms[j].z) * (Atoms[i].z - Atoms[j].z);
}
Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.
If you meant to calculate the Euclidean distance, your calculation is not correct: it sums the squared coordinate differences but never takes the square root. You need the 3D version of the Pythagorean theorem, d = sqrt(dx*dx + dy*dy + dz*dz).
I would use a structure of arrays (SoA) for storing the coordinates.
You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies within the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if blockDim.x is divisible by 32, only threads sharing the same threadIdx.x / 32 value are within the same warp.
I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.
Addresses generated from the j index are less important, as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if blockDim.x is divisible by 32).
#include "cuda_runtime.h"
#include <iostream>
using namespace std;
const int N(20);
#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, char *file, int line)
{
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
int div_up(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z);
int main(int argc, char **argv)
{
double* atoms_x_h;
check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));
double* atoms_y_h;
check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));
double* atoms_z_h;
check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));
for (int i(0); i < N; ++i) {
atoms_x_h[i] = i;
atoms_y_h[i] = i;
atoms_z_h[i] = i;
}
double* atoms_x_d;
check(cudaMalloc(&atoms_x_d, N * sizeof(double)));
double* atoms_y_d;
check(cudaMalloc(&atoms_y_d, N * sizeof(double)));
double* atoms_z_d;
check(cudaMalloc(&atoms_z_d, N * sizeof(double)));
check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));
double* distances_d;
check(cudaMalloc(&distances_d, N * N * sizeof(double)));
const int threads_per_block(256);
dim3 n_blocks(div_up(N, threads_per_block));
calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);
check(cudaPeekAtLastError());
check(cudaDeviceSynchronize());
double* distances_h;
check(cudaMallocHost(&distances_h, N * N * sizeof(double)));
check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int i(0); i < N; ++i) {
for (int j(0); j < N; ++j) {
cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
}
}
check(cudaFree(distances_d));
check(cudaFreeHost(distances_h));
check(cudaFree(atoms_x_d));
check(cudaFreeHost(atoms_x_h));
check(cudaFree(atoms_y_d));
check(cudaFreeHost(atoms_y_h));
check(cudaFree(atoms_z_d));
check(cudaFreeHost(atoms_z_h));
return 0;
}
__global__ void calc_distances(double* distances,
                               double* atoms_x, double* atoms_y, double* atoms_z)
{
    int i(threadIdx.x + blockIdx.x * blockDim.x);
    int j(threadIdx.y + blockIdx.y * blockDim.y);
    if (i >= N || j >= N) {
        return;
    }
    distances[i + N * j] =
        (atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
        (atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
        (atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
}