How to fix my block and grid layout to handle large data? - c++

I'm trying to implement the naive closest-pair algorithm for points with 3D coordinates.
The input is two files, each containing three floats per line.
I read the inputs into variables of type float3*:
float3* teamA;
float3* teamB;
float3* results;
handleFileInput(argv[1], argv[2], teamA, teamB, numPoints);
results = new float3[numPoints[0]];
After this, I allocated device memory and copied the host data to the device like this:
#define CHECKERROR(val) { if (val != cudaSuccess) {fprintf(stderr, "Error %s at line %d in file %s\n", cudaGetErrorString(val), __LINE__, __FILE__); exit(1);} }
CHECKERROR(cudaMalloc(&d_tA, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_tA, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMalloc(&d_tB, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMemset(d_tB, 0, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMalloc(&d_results, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_results, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemcpy(d_tA, teamA, sizeof(float3) * numPoints[0], cudaMemcpyHostToDevice));
CHECKERROR(cudaMemcpy(d_tB, teamB, sizeof(float3) * numPoints[1], cudaMemcpyHostToDevice));
I set up my block and grid like this:
dim3 block(512);
dim3 grid(ceil((float)numPoints[0] / 512));
naive_algorithm <<< block, grid >>> (d_tA, d_tB, d_results, numPoints[0], numPoints[1]);
My kernel code is simple:
__global__ void naive_algorithm(float3* d_tA, float3* d_tB, float3* d_r, int a_size, int b_size)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < a_size)
    {
        float min_distance = -1;
        for (int y = 0; y < b_size; y++)
        {
            float i = MUL(SUB(d_tA[idx].x, d_tB[y].x), SUB(d_tA[idx].x, d_tB[y].x));
            float j = MUL(SUB(d_tA[idx].y, d_tB[y].y), SUB(d_tA[idx].y, d_tB[y].y));
            float k = MUL(SUB(d_tA[idx].z, d_tB[y].z), SUB(d_tA[idx].z, d_tB[y].z));
            float distance = SQRT(ADD(ADD(i, j), k));
            if (min_distance > distance || min_distance == -1)
            {
                d_r[idx].x = (float)idx;
                d_r[idx].y = (float)y;
                d_r[idx].z = distance;
                min_distance = distance;
            }
        }
        __syncthreads();
    }
}
Environment : RTX 2080Ti
There are five different data samples:
Team A - 1000000 points / Team B - 500000 points -> Test Failed
Team A - 700000 points / Team B - 500000 points -> Test Failed
Team A - 500000 points / Team B - 300000 points -> Test OK!
Team A - 500000 points / Team B - 100000 points -> Test OK!
Team A - 300000 points / Team B - 100000 points -> Test OK!
In my opinion this is caused by the thread layout.
Do I have to change the block/grid layout from 1D/1D to 2D/2D?
If so, how should I set up my grid?

As Robert Crovella pointed out, this was just a typo on my part: the kernel launch arguments are reversed. The launch syntax is <<< grid, block >>>, but I wrote <<< block, grid >>>, so the value intended as the grid's x dimension (numPoints[0] / BLOCK_SIZE) was actually used as the block size.
Since one block can hold at most 1024 threads, while a grid's x dimension can be far larger, the launch fails as soon as numPoints[0] / BLOCK_SIZE exceeds 1024, which is exactly what happens for the two largest data samples.
Thanks a lot for checking my code!
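For reference, the corrected launch is a minimal change (a sketch reusing the variables above; the first launch argument is the grid, the second is the block):
dim3 block(512);
dim3 grid((numPoints[0] + block.x - 1) / block.x);   // integer ceiling division
naive_algorithm <<< grid, block >>> (d_tA, d_tB, d_results, numPoints[0], numPoints[1]);
CHECKERROR(cudaGetLastError());       // catches launch-configuration errors
CHECKERROR(cudaDeviceSynchronize());  // catches errors raised while the kernel runs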

Related

2D tiled convolution taking more time than untiled version

I'm writing code that performs a 2D convolution on a float matrix, in both a tiled and an untiled version. I'm taking the width of the tile to be BLOCK_SIZE - MASK_WIDTH + 1, using halo cells.
But for a 1024x1024 matrix and masks varying from 3 to 9, I get the untiled version performing better:
[timing screenshots: untiled version vs. tiled version]
Both the matrix and the mask are defined deterministically and are identical for the tiled and untiled runs; no random values or sizes are used.
I guess I'm making some wrong assumption about the tile size, but even after doing some research the implementation seems quite legitimate.
#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32
Here's the kernel code for the tiled version
__global__ void convolution_2D_tiled(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    float outputPixel = 0; //minimize write to global memory: stored in register
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tile_width = BLOCK_WIDTH - mask_width + 1; //since BLOCK_WIDTH = TILE_WIDTH + MASK_WIDTH - 1
    int col = blockIdx.x * tile_width + tx;
    int row = blockIdx.y * tile_width + ty;
    //picking the starting indexes of input matrix inside the mask
    //(TOP-LEFT of the mask)
    int inputRow = row - (mask_width / 2);
    int inputCol = col - (mask_width / 2);
    __shared__ float tile[BLOCK_WIDTH][BLOCK_WIDTH];
    // Load tile elements
    if (inputRow >= 0 && inputRow < h && inputCol >= 0 && inputCol < w)
        tile[ty][tx] = in[inputRow * w + inputCol];
    else
        tile[ty][tx] = 0.0;
    // Wait until all tile elements are loaded
    __syncthreads();
    //some thread won't write any outputs, only need to calculate tile_width elements
    if (col < w && row < h && ty < tile_width && tx < tile_width) {
        //get the neighbour in the mask
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) { //(Mask_Width^2) access for each thread in block -> for each block (Mask_Width^2) * (Block_width^2)
                outputPixel += tile[i + ty][j + tx] * mask[i * mask_width + j];
            }
        }
        out[(row * w) + col] = (float)(outputPixel);
    }
}
The main function, with the matrix generation and size assumptions:
void errorCheck(unsigned int line){
    cudaError_t cudaError = cudaGetLastError();
    // if error code wasn't a code describing success
    if (cudaError != cudaSuccess)
    {
        // output that there has been a CUDA error in the line of the CUDA function call
        // and exit the program
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}
int main(int argc, char const* argv[]){
for (size_t mask_width = 3; mask_width <= 9; mask_width += 2) {
printf("Testing with mask size = %d\n\n", mask_width);
float* a;
float* b;
float* c;
cudaMallocManaged((void **) &a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE);
cudaMallocManaged((void **) &b, sizeof(int)*mask_width*mask_width);
cudaMallocManaged((void **) &c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
// initialize matrix A
for (int i = 0; i < MATRIX_SIZE; ++i) {
for (int j = 0; j < MATRIX_SIZE; ++j) {
a[i * MATRIX_SIZE + j] = (float)(1 +(3 * j % 20));
}
}
// initialize matrix B
for (int i = 0; i < mask_width; ++i) {
for (int j = 0; j < mask_width; ++j) {
b[i * mask_width + j] = (float)(1 + (((2 * i) + j) % mask_width));
}
}
float naive_gpu_elapsed_time_ms;
// some events to count the execution time
//clock_t st, end;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int tile_width = BLOCK_WIDTH - mask_width + 1;
dim3 dimGrid(MATRIX_SIZE / tile_width, MATRIX_SIZE / tile_width);
dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
errorCheck(__LINE__);
cudaEventRecord(start, 0);
convolution_2D_tiled <<<dimGrid, dimBlock >>> (a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
errorCheck(__LINE__);
cudaThreadSynchronize();
//time counting terminate
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//compute time elapsed on GPU computing
cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
//free memory
cudaFree(a);
cudaFree(b);
cudaFree(c);
printf("________________________________________________________________________\n\n");
}
return 0;
}
I'm using Google Colab with a Tesla T4 GPU, and no CUDA error is thrown.
I also tried bigger masks (11, 15, ...), but the comparison between tiled and untiled did not change.
You are making inefficient usage of managed memory as discussed here and here.
Nearly all of your ~2ms of execution time is used in inefficient demand-paged copying of data from host to device. As a result, your ability to resolve the difference in performance in the two cases due to the device code changes is almost completely obscured.
If you add these 3 lines of code immediately before float naive_gpu_elapsed_time_ms;, you will observe that your reported execution times decrease dramatically, and you should be able to better judge the performance difference between the shared memory tiled version and the non-tiled version:
cudaMemPrefetchAsync(a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE, 0);
cudaMemPrefetchAsync(b, sizeof(int)*mask_width*mask_width, 0);
cudaMemPrefetchAsync(c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE, 0);
You haven't shown your non-tiled code, so I can't demonstrate that for you. Here's an example profiling output using a non-tiled convolution code that I wrote, comparing to your tiled kernel, and including the cudaMemPrefetchAsync() statements:
$ nvprof ./t2140
Testing with mask size = 3
==13236== NVPROF is profiling process 13236, command: ./t2140
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.032832 ms.
________________________________________________________________________
Testing with mask size = 5
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.061120 ms.
________________________________________________________________________
Testing with mask size = 7
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.086080 ms.
________________________________________________________________________
Testing with mask size = 9
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.118688 ms.
________________________________________________________________________
==13236== Profiling application: ./t2140
==13236== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.59% 311.69us 4 77.922us 41.089us 119.08us convolution_2D(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
47.41% 280.97us 4 70.241us 28.449us 114.28us convolution_2D_tiled(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
API calls: 96.10% 365.32ms 12 30.443ms 12.906us 365.10ms cudaMallocManaged
1.32% 5.0301ms 4 1.2575ms 586.91us 3.2433ms cuDeviceTotalMem
0.66% 2.4917ms 404 6.1670us 320ns 268.82us cuDeviceGetAttribute
0.56% 2.1277ms 12 177.31us 8.3020us 578.90us cudaMemPrefetchAsync
0.50% 1.9035ms 4 475.88us 295.08us 549.01us cudaDeviceSynchronize
0.49% 1.8594ms 12 154.95us 75.533us 328.85us cudaFree
0.14% 526.53us 4 131.63us 42.014us 220.14us cudaEventSynchronize
0.11% 399.28us 4 99.820us 61.310us 210.74us cuDeviceGetName
0.09% 351.52us 8 43.940us 11.426us 116.52us cudaLaunchKernel
0.01% 45.911us 8 5.7380us 4.1870us 10.243us cudaEventRecord
0.01% 25.946us 8 3.2430us 935ns 10.182us cudaEventCreate
0.01% 21.643us 4 5.4100us 3.1450us 8.6700us cuDeviceGetPCIBusId
0.00% 10.304us 8 1.2880us 430ns 5.0980us cuDeviceGet
0.00% 9.6790us 4 2.4190us 1.9560us 3.7180us cudaEventElapsedTime
0.00% 3.3390us 3 1.1130us 617ns 1.6520us cuDeviceGetCount
0.00% 3.2480us 4 812ns 700ns 1.0470us cuDeviceGetUuid
0.00% 3.1420us 8 392ns 229ns 1.2110us cudaGetLastError
==13236== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
12 1.3346MB 4.0000KB 2.0000MB 16.01563MB 1.405760ms Host To Device
Total CPU Page faults: 52
$
You can see that in each case, the tiled/shared memory kernel is faster.
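For context, a non-tiled kernel for this problem could look like the following minimal sketch (this is an assumption for illustration, not necessarily the convolution_2D kernel profiled above; it keeps the same signature):
__global__ void convolution_2D(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= w || row >= h) return;
    float outputPixel = 0.0f;
    int halo = mask_width / 2;
    for (int i = 0; i < mask_width; ++i) {
        for (int j = 0; j < mask_width; ++j) {
            int r = row + i - halo;   // neighbour row (mask centred on (row, col))
            int c = col + j - halo;   // neighbour column
            if (r >= 0 && r < h && c >= 0 && c < w)
                outputPixel += in[r * w + c] * mask[i * mask_width + j];
        }
    }
    out[row * w + col] = outputPixel;
}
Each thread reads its mask_width x mask_width neighbourhood straight from global memory, which is exactly the traffic the shared-memory tile is meant to reduce; it would be launched with dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH) and dim3 dimGrid(MATRIX_SIZE / BLOCK_WIDTH, MATRIX_SIZE / BLOCK_WIDTH).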

Why doesn't my OpenCL 3d image lookup work?

I have been having trouble with an OpenCL kernel I've written that produces incorrect results (compared to a reference brute-force CPU implementation).
I tracked the problem down to a 3D lookup table I'm using, which seems to return garbage rather than the values I passed in.
I have the following (simplified) OpenCL kernel for reading a precomputed function from a 3D image type:
__constant sampler_t legSampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

inline float normalizedLegendre(int n, int m, float z, image3d_t legendreLUT)
{
    float nCoord = (((float) n) / get_image_width(legendreLUT));
    float mCoord = (((float) m) / get_image_height(legendreLUT));
    float zCoord = ((z + 1.0f) / 2.0f);
    float4 coord = (float4)(floor(nCoord) + 0.5f, floor(mCoord) + 0.5f, zCoord, 0.0f);
    return read_imagef(legendreLUT, legSampler, coord).x;
}

__kernel void noiseMain(__read_only image3d_t legendreLUT, __global float* outLegDump)
{
    //k is the linear index into the array.
    int k = get_global_id(0);
    if(k < get_image_depth(legendreLUT))
    {
        float z = ((float) k / (float) get_image_depth(legendreLUT)) * 2.0 - 1.0;
        float legLookup = normalizedLegendre(5, 4, z, legendreLUT);
        float texCoord = ((float) k / 1024.0) * 2 - 1;
        outLegDump[k] = legLookup;
    }
}
On the host side, I generate the 3D image, legendreLUT, using the following code:
static const size_t NLEGPOLYBINS = 1024;
static const size_t NLEGPOLYORDERS = 16;
boost::scoped_array<float> legendreHostBuffer(new float[NLEGPOLYORDERS * NLEGPOLYORDERS * NLEGPOLYBINS]);
float stepSize = 1.0 / (((float) NLEGPOLYBINS/2.0) - 0.5);
float z = -1.0;
std::cout << "Generating legendre polynomials..." << std::endl;
for(size_t n = 0; n < NLEGPOLYORDERS; n++)
{
for(size_t m = 0; m < NLEGPOLYORDERS; m++)
{
for(size_t zI = 0; zI < NLEGPOLYBINS; zI++)
{
using namespace boost::math;
size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + zI;
//-1..1 in NLEGPOLYBINS steps...
float val;
if(m > n)
{
legendreHostBuffer[index] = 0;
continue;
}
else
{
//boost::math::legendre_p
val = legendre_p<float>(n,m,z);
}
float nPm = n+m;
float nMm = n-m;
float factNum;
float factDen;
factNum = factorial<float>(n-m);
factDen = factorial<float>(n+m);
float nrmTerm;
nrmTerm = pow(-1.0, m) * sqrt((n + 0.5) * (factNum/factDen));
legendreHostBuffer[index] = val;
z += stepSize;
if(z > 1.0) z + 1.0;
}
z = -1.0;
}
}
//DEBUGGING STEP: Dump everything we've just generated for m = 4, n = 5, z=-1..1
std::ofstream legDump("legDump.txt");
for(size_t i = 0; i < NLEGPOLYBINS; i++)
{
int n =5; int m = 4;
size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + i;
float texCoord = ((float) i / (float) NLEGPOLYBINS) * 2 - 1;
legDump << i << " " << texCoord << " " << legendreHostBuffer[index] << std::endl;
}
legDump.close();
std::cout << "Creating legendre polynomial look up table image..." << std::endl;
cl::ImageFormat legFormat(CL_R, CL_FLOAT);
//Generate out legendre polynomials image...
m_legendreTable = cl::Image3D(m_clContext,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
legFormat,
NLEGPOLYORDERS,
NLEGPOLYORDERS,
NLEGPOLYBINS,
0,
0,
legendreHostBuffer.get());
Other than the index, the actual generation of the values is more or less irrelevant, but I've included it here for completeness.
And here is how I execute the kernel and read back the results:
cl::Buffer outLegDump = cl::Buffer(m_clContext, CL_MEM_WRITE_ONLY, NLEGPOLYBINS * sizeof(float));
//Create out kernel...
cl::Kernel kernel(m_program, "noiseMain");
kernel.setArg(0, m_legendreTable);
kernel.setArg(1, outLegDump);
size_t kernelSize = 1024;
cl::NDRange globalRange(kernelSize);
cl::NDRange localRange(1);
m_commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, cl::NullRange);
m_commandQueue.finish();
boost::scoped_array<float> legDumpHost(new float[NLEGPOLYBINS]);
m_commandQueue.enqueueReadBuffer(outLegDump, CL_TRUE, 0, NLEGPOLYBINS * sizeof(float), legDumpHost.get());
std::ofstream legreadback("legreadback.txt");
for(size_t i = 0; i < NLEGPOLYBINS; i++)
{
legreadback << i << " " << legDumpHost[i] << std::endl;
}
legreadback.close();
When I look at the dumped data (i.e. that put out in legdump.txt from the host-side buffer), I get the expected data. However, when I compare it to the data received back from the device side (i.e. that looked up by the kernel and put out in legreadback.txt), I get incorrect values.
Since I'm calculating 1024 values in both cases, I'll spare everyone the whole dump; here are the first and last few values of each:
legdump.txt (host side sanity check):
0 -0
1 -0.0143913
2 -0.0573401
3 -0.12851
4 -0.227566
5 -0.354175
..
..
1020 0.12859
1021 0.0144185
1022 0.0144185
1023 1.2905e-8
legreadback.txt (device-side lookup and readback)
0 1
1 1
2 1
3 1
4 0.5
5 0
..
..
1020 7.74249e+11
1021 -1.91171e+15
1022 -3.81029e+15
1023 -1.91173e+15
Note that these values are the same across multiple runs of the code, so I don't think it's an initialization problem.
I can only assume that I'm calculating indices wrong somewhere, but I don't know where. I've checked the calculation of the Z coordinate (which naturally is defined on -1..1), its conversion to texture coordinates (0..1 range), and the conversion of M and N to texture coordinates (which should be done without interpolation), and found nothing to be wrong.
So my question is thus:
What is the proper way to create and index a 3D lookup table in OpenCL?
As expected, the problem turned out to be in the indexing on the host-side used to generate the lookup table.
The previous index calculation:
size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + zI;
was based on C++ 3D array indexing, which is not how addressing works for a 3D image in OpenCL. A 3D image can be thought of as a "stack" of 2D images on top of each other, where the depth coordinate (Z in this case) selects the image, and the horizontal and vertical coordinates (m and n in this case) select the pixel within the selected image.
The correct indexing calculation is:
size_t index = m * NLEGPOLYORDERS + n + (zI * NLEGPOLYORDERS * NLEGPOLYORDERS);
As one can see, this new approach fits the "stacked image" layout described previously, whereas the previous calculation does not.
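To make the layout concrete, here is a sketch of the host-side fill loop reordered around the corrected index (it reuses the constants and the boost call from the question; the mapping of zI to z shown here is just one reasonable choice, not the original code):
// OpenCL reads the 3D image with x = n (fastest-varying), y = m, z = zI (slice).
for (size_t zI = 0; zI < NLEGPOLYBINS; zI++)
{
    float z = -1.0f + 2.0f * zI / (NLEGPOLYBINS - 1);    // bin index -> z in [-1, 1]
    for (size_t m = 0; m < NLEGPOLYORDERS; m++)          // row within a slice (y)
    {
        for (size_t n = 0; n < NLEGPOLYORDERS; n++)      // column within a row (x)
        {
            size_t index = zI * NLEGPOLYORDERS * NLEGPOLYORDERS
                         + m * NLEGPOLYORDERS
                         + n;
            legendreHostBuffer[index] = (m > n) ? 0.0f
                : boost::math::legendre_p((int)n, (int)m, z);
        }
    }
}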

Efficiently find minimum of large array using Opencl

I am working on an implementation of a hierarchical clustering algorithm in OpenCL. For each step, I have to find the minimum value in a very large array (approx. 10^8 entries) so that I know which elements have to be combined into a new cluster. The minimum has to be identified 9999 times. With my current kernels, it takes about 200 seconds to find the minimum value (accumulated over all iterations).
I approached the problem by dividing the array into 2560 equally sized fragments (there are 2560 stream processors on my Radeon 7970) and finding the minimum of each fragment individually. Then I run a second kernel that combines these minima into a global minimum.
Is there any more efficient way to approach this problem? The initial idea was to speed up HCA by using OpenCL, but with the amount of time the minimum search takes, it is much slower than the MATLAB HCA on the CPU. What am I doing wrong?
__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * elementsToWorkOn, __global int * arraysize){
    int gid = get_global_id(0);
    int minloc = 0;
    float mymin = INFINITY;
    int eltoWorkOn = *elementsToWorkOn;
    int offset = gid*eltoWorkOn;
    int target = offset + eltoWorkOn;
    if (offset<*arraysize){
        //make sure the array size is not exceeded
        if (target > *arraysize){
            target = *arraysize;
        }
        //find minimum for the kernel
        for (int i = offset; i < target; i++){
            if (*(myArray + i) < mymin){
                mymin = *(myArray + i);
                minloc = i;
            }
        }
    }
    *(mins + gid * 2) = minloc;
    *(mins + gid * 2 + 1) = mymin;
}
__kernel void getGlobalMin(__global double * mins, __global double * gmin, __global int * pixelsInImage){
    int nWorkitems = 2560;
    float globalMin = INFINITY;
    double globalMinLoc;
    float tempMin;
    for (int i = 0; i < nWorkitems; i++){
        tempMin = *(mins + 2 * i + 1);
        if (tempMin < globalMin){
            globalMin = tempMin;
            globalMinLoc = *(mins + 2 * i);
        }
    }
    *(gmin + 0) = globalMinLoc;
    *(gmin + 1) = globalMin;
}
UPDATE
I redesigned the findMinValue kernel based on your suggestions. The memory access is now coalesced, and I divided the work into work groups so that I can reduce the number of global memory accesses. Before, every work item wrote its minimum value to the global mins buffer. Now only one work item per work group writes one value (i.e. the group minimum). Furthermore, I increased the global work size in order to hide memory latency.
These changes reduced the time required for identifying the minima from >200 s to only 59 s! Thank you very much for your help!
Is there anything else I could have missed while optimizing the kernel? Do you have any further suggestions? I could not figure out how to use setArg(). Do I have to pass a pointer to the int value (like this: err = clSetKernelArg(kernel[2], 3, sizeof(int), &variable);)? How would the kernel declaration look in this case?
Here is my new Kernel:
__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * arraysize,__global int * elToWorkOn,__global int * dummy){
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
int lsize = get_local_size(0);
int gsize = get_global_id(0);
int minloc = 0;
int arrSize = *arraysize;
int elPerGroup = *elToWorkOn;
float mymin = INFINITY;
__local float lmins[128];
//initialize local memory
*(lmins + lid) = INFINITY;
__local int lminlocs[128];
//this private value will reduce global memory access in the for loop (temp = *(myArray + i);)
float temp;
//ofset and target of the for loop
int offset = elPerGroup*groupID + lid;
int target = elPerGroup*(groupID + 1);
//prevent that target<arrsize (may happen due to rounding errors or arrSize not a multiple of elPerGroup
target = min(arrSize, target);
//find minimum for the kernel
//offset is different for each lid, leading to sequential memory access
if (offset < arrSize){
for (int i = offset; i < target; i += lsize){
temp = *(myArray + i);
if (temp < mymin){
mymin = temp;
minloc = i;
}
}
//store kernel minimum in local memory
*(lminlocs + lid) = minloc;
*(lmins + lid) = mymin;
//find work group minimum (reduce global memory accesses)
lsize = lsize >> 1;
while (lsize > 0){
if (lid < lsize){
if (*(lmins + lid)> *(lmins + lid + lsize)){
*(lmins + lid) = *(lmins + lid + lsize);
*(lminlocs + lid) = *(lminlocs + lid + lsize);
}
}
lsize = lsize >> 1;
}
}
//write group minimum to global buffer
if (lid == 0){
*(mins + groupID * 2 + 0) = *(lminlocs + 0);
*(mins + groupID * 2 + 1) = *(lmins + 0);
}
}
If each work item iterates through a contiguous block of the global array, there is ZERO coalescing of reads. If you change it so each work item strides by the warp or wavefront size, you'd get a huge speed gain.
It is much more efficient for the work items to access consecutive memory rather than scattered memory. In addition, you should reduce within work groups first, then write to global memory. And use a single setArg() of an int, not a buffer, for that purpose.
At least, you should do it this way:
__kernel void findMinValue(__global float * myArray, __global double * mins, int arraysize){
    int gid = get_global_id(0);
    int minloc = 0;
    float mymin = INFINITY;
    //find minimum for the kernel
    for (int i = gid; i < arraysize; i += get_global_size(0)){
        if (*(myArray + i) < mymin){
            mymin = *(myArray + i);
            minloc = i;
        }
    }
    *(mins + gid * 2) = minloc;
    *(mins + gid * 2 + 1) = mymin;
}
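Regarding the setArg() question in the update: yes, a scalar is passed by value, so the kernel parameter is declared as a plain int (no __global qualifier, no buffer), as in the signature above, and on the host you pass sizeof(int) together with the address of the host variable. A minimal sketch, assuming a cl_kernel named kernel and that arraysize is argument index 2:
int arraysize = 100000000;   /* example host-side value */
cl_int err = clSetKernelArg(kernel, 2, sizeof(int), &arraysize);
if (err != CL_SUCCESS) { /* handle the error */ }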
The coalesced memory access sped up the calculation by roughly a factor of 4. That was, however, still too slow for our purpose. The brute-force method of recalculating the minima of all entries was just not suitable.
I therefore changed the algorithm so that it retains only the minimum (+ its location) of each row. After changing the 2 rows and columns in each iteration, the row minima are updated if required, and then the global minimum is obtained by finding the minimum of the row minima. Therefore, if we have a 22500*22500 matrix, I only need to get the minimum of 22500 entries as opposed to 506250000. Of course this implementation requires additional calculations, but in the end we could reduce the time spent searching for minima from 200 s (non-coalesced) over 59 s (coalesced) all the way down to 8 s.
I hope this will help someone in the future :-)
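For anyone curious, a rough CPU-side sketch of that bookkeeping (the names and structure are mine, not the actual implementation): keep one minimum per row, recompute only the rows affected by a merge, and then scan the N row minima instead of the full N x N matrix.
#include <cmath>

// Recompute the minimum (and its column) of row i of an N x N row-major distance matrix.
void updateRowMin(const float* dist, int N, int i, float* rowMin, int* rowMinCol)
{
    float m = INFINITY;
    int col = 0;
    for (int j = 0; j < N; ++j) {
        if (dist[i * N + j] < m) { m = dist[i * N + j]; col = j; }
    }
    rowMin[i] = m;
    rowMinCol[i] = col;
}

// Global minimum: scan N row minima (22500 entries) instead of N * N (506250000).
int globalMinRow(const float* rowMin, int N)
{
    int best = 0;
    for (int i = 1; i < N; ++i)
        if (rowMin[i] < rowMin[best]) best = i;
    return best;
}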

Calculating a 2D pixel array using CUDA, declaring proper Grid and Block sizes

I'm using CUDA to calculate the values of a flattened 64x64x4 array. The array contains GLubytes, and the z dimension stores the RGBA values for each pixel. I've created a kernel to use with CUDA, but I figure the dimensions of my blocks and grids are off. The end result is that instead of drawing circles, I'm only drawing quarters of circles. The kernel and the function that calls it follow:
For clarification: DIAMETER = 64, RADIUS = 32.
__global__ void drawKernel(GLubyte *ball)
{
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
    ball[4 * (x * DIAMETER + y)+3] = (GLubyte) 0x0;
    if ((x * x) + (y * y) <= (RADIUS * RADIUS)){
        ball[4 * ((x+32) * DIAMETER + (y+32))+0] = (GLubyte) 0xffffff;
        ball[4 * ((x+32) * DIAMETER + (y+32))+1] = (GLubyte) 0x0;
        ball[4 * ((x+32) * DIAMETER + (y+32))+2] = (GLubyte) 0x0;
        ball[4 * ((x+32) * DIAMETER + (y+32))+3] = (GLubyte) 0xaaaaaa;
    }
}
cudaError_t drawWithCuda()
{
size_t memorySize = DIAMETER * DIAMETER * 4 *sizeof(GLubyte);
GLubyte *dev_ball = 0; //device ball
cudaError_t cudaStatus; //CUDA error status
dim3 threadsPerBlock(8, 8);
dim3 numBlocks(DIAMETER/threadsPerBlock.x, DIAMETER/threadsPerBlock.y);
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice(0) failed! CUDA-capable GPU not on board.");
goto Error;
}
// Allocate GPU buffers for GLubyte array
cudaStatus = cudaMalloc((void**)&dev_ball, memorySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
//Begin CUDA-kernal call
drawKernel<<<numBlocks, threadsPerBlock>>>(dev_ball);
cudaDeviceSynchronize();
//Copy from Device
cudaStatus = cudaMemcpy(ball, dev_ball, memorySize, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Device to Host failed!");
goto Error;
}
Error:
cudaFree(dev_ball);
return cudaStatus;
}
My question is: is my problem in the dimensions of my block and grid? Or is it something else?
The output (once I run the array ball through OpenGL) is the following image:
I should add that when I don't use CUDA and just calculate the array values with regular for loops, the logic used inside the kernel works perfectly well and draws circles.
You're allocating this much memory (in ball):
size_t memorySize = DIAMETER * DIAMETER * 4 *sizeof(GLubyte);
i.e. an array that is 64 x 64 x 4 bytes deep
Now let's look at your array index calculations in the kernel:
ball[4 * ((x+32) * DIAMETER + (y+32))+0] = (GLubyte) 0xffffff;
Your x and y are computed as follows:
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
Given your kernel launch dimensions, you are launching a DIAMETER x DIAMETER array of threads, i.e. 64x64. So each x varies from 0 to 63 and each y varies from 0 to 63, depending on the thread.
When we plug some of these x and y values into your kernel, the index computations blow up (exceed the memory allocated):
ball[4 * ((63+32) * 64 + (63+32))+0] = (GLubyte) 0xffffff;
This exceeds the 64x64x4 available area in ball. If you ran this code with cuda-memcheck, I'm pretty sure you would see out-of-bounds indexing errors.
It seems like maybe your array indexing should be something like:
ball[4 * ((x) * DIAMETER + (y))+0] = (GLubyte) 0xffffff;
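Putting that together, a corrected kernel might look like the sketch below (it reuses DIAMETER, RADIUS, and GLubyte from the question; my guess at the intent is to index by x and y directly and to measure the distance from the circle's centre at (RADIUS, RADIUS) rather than from the origin):
__global__ void drawKernel(GLubyte *ball)
{
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
    if (x >= DIAMETER || y >= DIAMETER) return;   // stay inside the 64 x 64 image

    int idx = 4 * (x * DIAMETER + y);             // base offset of this pixel's RGBA quad
    int dx = x - RADIUS;
    int dy = y - RADIUS;
    ball[idx + 3] = (GLubyte) 0x0;                // default: fully transparent
    if (dx * dx + dy * dy <= RADIUS * RADIUS) {
        ball[idx + 0] = (GLubyte) 0xff;           // casting 0xffffff to GLubyte keeps only 0xff anyway
        ball[idx + 1] = (GLubyte) 0x0;
        ball[idx + 2] = (GLubyte) 0x0;
        ball[idx + 3] = (GLubyte) 0xaa;
    }
}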

C++ and CUDA: why does the code return different results each time?

Update: I found the bug. Since the code I posted before is very complicated, I have simplified it and kept only the part where the problem is.
if (number >= dim * num_points)
return;
But actually I only have num_points points and want to use num_points threads, so the correct guard should be
if (number >= num_points)
return;
Thank you all for the help.
I'm rewriting some C++ code from CPU to GPU. The code is pasted below. Sorry it's long; I think the problems are easier to detect this way.
In the code, every thread needs some matrix-shaped intermediate results, so I allocate device memory for them, such as d_dir2, d_R, d_Stick, and d_PStick. The results turned out not to be what I expected, so to debug, I tried to output one intermediate result, R, in this way:
if (k == 0)
{
results[tmp_int1 + i * dim + j] = R[tmp_int1 + i * dim + j];
}
and later, in C++, I print results.
However, I found that results gives different values each time. Sometimes it gives the correct answer R, sometimes the value of PStick, sometimes a combination of R and PStick, and sometimes a combination of R and 0 (results is initialized to 0 at the beginning).
I'm very confused about what causes the problem. Any idea? Thank you very much :)
__global__ void stickvote(const int dim, const int num_points, const int gridx, float Sigma, float* input, float* dir2, float* R, float* Stick, float* PStick, float* results) {
    float threshold = 4 * Sigma;
    float c = (- 16 * log(0.1f) * (sqrt(Sigma) - 1)) / 3.1415926f / 3.1415926f;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int number = row * BLOCK_SIZE * gridx + col;
    if (number >= dim * num_points)  //// The bug is here!
        return;
}
extern "C" void KernelStickVote(int dim, int num_points, float Sigma, float* input, float* results) {
const int totalpoints = num_points;
const int totalpoints_input = (dim + 1)* (dim + 1) * num_points;
const int totalpoints_output = dim * dim * num_points;
size_t size_input = totalpoints_input * sizeof(float);
size_t size_output = totalpoints_output * sizeof(float);
float* d_input;
cutilSafeCall(cudaMalloc((void**)&d_input, size_input));
float* d_result;
cutilSafeCall(cudaMalloc((void**)&d_result, size_output));
// used to save dir, and calculate dir * dir'
float* d_dir2;
cutilSafeCall(cudaMalloc((void**)&d_dir2, dim * num_points * sizeof(float)));
// used to save R: dim * dim * N
float* d_R;
cutilSafeCall(cudaMalloc((void**)&d_R, size_output));
// used to save Stick: dim * dim * N
float* d_Stick;
cutilSafeCall(cudaMalloc((void**)&d_Stick, size_output));
// used to save Stick: dim * dim * N
float* d_PStick;
cutilSafeCall(cudaMalloc((void**)&d_PStick, size_output));
// Copy input data from host to device
cudaMemcpy(d_input, input, size_input, cudaMemcpyHostToDevice);
int totalblock = (totalpoints % BLOCKPOINTS==0 ? totalpoints/BLOCKPOINTS : (int(totalpoints/BLOCKPOINTS) + 1));
int gridx = (65535 < totalblock ? 65535 : totalblock);
int gridy = (totalblock % gridx == 0 ? totalblock/gridx : (int(totalblock/gridx)+1) );
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(gridx, gridy);
stickvote<<<dimGrid, dimBlock>>>(dim, num_points, gridx, Sigma, d_input, d_dir2, d_R, d_Stick, d_PStick, d_result);
cudaMemcpy(results, d_result, size_output, cudaMemcpyDeviceToHost);
cudaFree(d_input);
cudaFree(d_result);
cudaFree(d_dir2);
cudaFree(d_R);
cudaFree(d_Stick);
cudaFree(d_PStick);
}
The original poster of the question performed some further code simplification and debugging themselves and discovered that the guard statement in the kernel:
if (number >= dim * num_points)
return;
was, in fact, incorrect and should have been
if (number >= num_points)
return;
This was the source of the error.
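For reference, a minimal sketch of the indexing pattern with the corrected guard (same naming as in the question; the per-point work is elided):
__global__ void stickvote(const int dim, const int num_points, const int gridx, /* ... */ float* results)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int number = row * BLOCK_SIZE * gridx + col;   // linear thread index

    // One thread per point: the grid is rounded up to whole blocks, so threads with
    // number >= num_points have no work and must exit before writing anything.
    if (number >= num_points)
        return;

    // ... per-point computation for element 'number' ...
}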
This answer has been added as a community wiki answer with the intention of removing this question from the unanswered queue.