CUDA: please help me find the error in my code - C++

Here is code that uses the GPU:
__global__ void gpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
float* in_d, *w_d, *out_d;
cudaMalloc((void**)&in_d, psize * sizeof(float));
cudaMalloc((void**)&w_d, psize * size * sizeof(float));
cudaMalloc((void**)&out_d, size * sizeof(float));
for(size_t i = 0; i < size; i++)
output[i] = 0;
cudaMemcpy(in_d, input, psize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w_d, weights, psize * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(out_d, output, size * sizeof(float), cudaMemcpyHostToDevice);
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(ceil(float(rx) / block_x), ceil(float(ry) / block_y));
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
cudaThreadSynchronize();
cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(in_d);
cudaFree(out_d);
cudaFree(w_d);
}
Here is code that does the same thing, but uses only the CPU:
int blockIdxx, blockIdxy, blockDimx, blockDimy, threadIdxx, threadIdxy;
void cpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdxx*blockDimx + threadIdxx;
int j = blockIdxy*blockDimy + threadIdxy;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
for(size_t i = 0; i < size; i++)
output[i] = 0;
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
blockDimx = block_x;
blockDimy = block_y;
int gridDimx = ceil(float(rx) / block_x), gridDimy = ceil(float(ry) / block_y);
for(blockIdxx = 0; blockIdxx < gridDimx; blockIdxx++)
for(blockIdxy = 0; blockIdxy < gridDimy; blockIdxy++)
for(threadIdxx = 0; threadIdxx < blockDimx; threadIdxx++)
for(threadIdxy = 0; threadIdxy < blockDimy; threadIdxy++)
cpu_process(input, weights, output, psize, size);
}
Why does the CPU variant work correctly while the GPU variant returns garbage in the output? What differs between them?
Version of cuda-toolkit: 4.0
OS: Debian GNU/Linux, CUDA installed from its repositories.
GPU: NVIDIA GeForce GT 525M.

cudaThreadSynchronize is deprecated and should not be used; use cudaDeviceSynchronize instead. Check the error codes returned by these calls, since they will return an error if the kernel has failed. They also block all code that follows until the work is complete, so you could also add some timing code in between to find bottlenecks.
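For illustration only, here is a minimal sketch of what that error checking could look like around the launch in process(); the CUDA_CHECK macro is a hypothetical helper, not something from the question:
#include <cstdio>
#include <cstdlib>
// Hypothetical helper: abort with a readable message on any CUDA error.
#define CUDA_CHECK(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
// ...
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
CUDA_CHECK(cudaGetLastError()); // catches launch-configuration errors
CUDA_CHECK(cudaDeviceSynchronize()); // catches errors raised while the kernel runs
CUDA_CHECK(cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost));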

Related

CUDA optimization for a vector tensor product using a custom kernel or CUBLAS

I have two vectors a and b. Each vector contains the coordinates of 3D points (x, y, z) as a Vector3f:
struct Vector3f
{
float x;
float y;
float z;
};
Vector a has a size of n = 5000 points and vector b has a size of m = 4000. I need to do a tensor product between them, like on the right side of the picture. The resulting array should have a length of 5000 * 4000, with the floating-point results stored in c.
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// int j = blockIdy.y * blockDim.y + threadIdx.y;
//check if the idx is out of range
if (i < n) {
for (int j = 0; j < m; j++) {
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
}
dim3 blockSize(32, 1, 1);
dim3 gridSize((n + blockSize.x - 1) / blockSize.x, 1, 1);
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I get a high execution time on the Volta architecture.
My question is: how can I optimize the kernel to reduce the execution time, which is mainly due to the for loop inside the kernel? I know that the global reads and writes here are not coalesced.
You can make the kernel move through both a and b simultaneously, like this:
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < n && j < m)
{
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
dim3 blockSize(32, 32);
dim3 gridSize((int)ceil(n / 32.0), (int)ceil(m / 32.0));
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
Update
I tried to modify the code to use a single array, with and without shared memory; the code without shared memory was always 3 or 4 times faster.
With shared memory:
#define BLOCK_SIZE 32
void tensor3dProdcut(const int n, const int m, const float* a, const float* b, float* c)
{
float* d_a;
size_t size = (uint64_t)n * 3 * sizeof(float);
cudaMalloc(&d_a, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
float* d_b;
size = (uint64_t)m * 3 * sizeof(float);
cudaMalloc(&d_b, size);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
float* d_c;
size = (uint64_t)n * m * sizeof(float);
cudaMalloc(&d_c, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((int)ceil((double)n / BLOCK_SIZE), (int)ceil((double)m / BLOCK_SIZE));
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
__shared__ double as[BLOCK_SIZE][3];
__shared__ double bs[BLOCK_SIZE][3];
for (i = 0; i < 3; i++)
{
as[row][i] = a[(BLOCK_SIZE * blockRow + row) * 3 + i];
bs[col][i] = b[(BLOCK_SIZE * blockCol + col) * 3 + i];
}
__syncthreads();
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += as[row][i] * bs[col][i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
Without shared memory:
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += a[(BLOCK_SIZE * blockRow + row) * 3 + i] * b[(BLOCK_SIZE * blockCol + col) * 3 + i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}

CUDA array filtering kernel without a for loop

I have a large array A with size_A rows and 6 columns. I am going to check the third element of each row, and if it is not zero, copy the row into another array B. Can I get the index into B without using a for loop? Please see the code below.
I would probably need to make b_ptr static somehow (similar to what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, we can remove the loop entirely. We first need to determine which rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into shared memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
// read the neighbour's value first, then sync before writing, to avoid a read/write race
int prev = 0;
if (tid < size_A && tid >= i) {
prev = filter[tid - i];
}
__syncthreads();
if (tid < size_A && tid >= i) {
filter[tid] += prev;
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as is, filter needs to be shared across the whole kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing it where noted), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then copy B in place to make it dense (or download the dense part of each portion from the device using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
This problem is complicated and can't be done with CUDA in one step. You can't search for the desired rows and put them in array B hoping that they will end up in the correct order, because CUDA threads don't necessarily process the rows in order. However, there is a multi-step solution that does the trick. First, run a kernel that locates the zeros within the third column (whose index is 2, not 3, by the way) and marks those rows with a value of 1 in an array P. After that, a simple for loop counts these locations and stores them in another array Ind. Finally, a second kernel copies the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
Ind = new int[size_A];
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}
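As a side note that is not part of either answer above: this row filter is a stream-compaction problem, so a library routine such as thrust::copy_if can express it in a single call. A minimal sketch, assuming the rows already live on the device and can be viewed as plain structs of six floats (flip the predicate to != 0 if you want the criterion as stated in the question text):
#include <thrust/copy.h>
#include <thrust/device_vector.h>

struct Row6 { float v[6]; };

struct ThirdColumnIsZero {
__host__ __device__ bool operator()(const Row6& r) const {
return r.v[2] == 0.0f; // third column has index 2
}
};

// Packs all rows of d_A whose third column is zero into d_B, preserving row order.
void filter_rows(const thrust::device_vector<Row6>& d_A, thrust::device_vector<Row6>& d_B, int& size_B)
{
d_B.resize(d_A.size()); // worst case: every row passes the filter
auto end = thrust::copy_if(d_A.begin(), d_A.end(), d_B.begin(), ThirdColumnIsZero());
size_B = static_cast<int>(end - d_B.begin());
d_B.resize(size_B);
}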

Why CUDA shared memory is slower than global memory in tiled matrix multiplication?

I have tiled matrix multiplication code with and without shared memory. Below is matrix multiplication using global memory:
__global__
void MatrixMulKernel(float* M, float* N, float* P, int Width)
{
int Row = blockIdx.y*blockDim.y + threadIdx.y;
int Col = blockIdx.x*blockDim.x + threadIdx.x;
if ((Row < Width) && (Col < Width)) {
float Pvalue = 0;
for (int k = 0; k < Width; ++k)
{
Pvalue += M[Row*Width + k] * N[k*Width + Col];
}
P[Row*Width + Col] = Pvalue;
}
}
Below is matrix multiplication using shared memory:
__global__
void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
{
__shared__ float Mds[blockWidth][blockWidth];
__shared__ float Nds[blockWidth][blockWidth];
int tx = threadIdx.x; int ty = threadIdx.y;
int bx = blockIdx.x; int by = blockIdx.y;
int row = by * blockWidth + ty;
int col = bx * blockWidth + tx;
float pvalue = 0;
for (int m = 0; m < Width / blockWidth; ++m)
{
Mds[ty][tx] = d_M[row * Width + m*blockWidth + tx];
Nds[ty][tx] = d_N[(m*blockWidth + ty)*Width + col];
__syncthreads();
for (int k = 0; k < blockWidth; ++k)
{
pvalue += Mds[ty][k]*Nds[k][tx];
}
__syncthreads();
}
d_P[row*Width + col] = pvalue;
}
As far as I know, using shared memory should be faster, but when comparing these two codes I found that the version without shared memory runs about 2 seconds faster for 1600x1600 matrices. Is there any explanation for this speed difference, or is something wrong with my code?
My teacher uses the book "Programming Massively Parallel Processors" as the main text; these two codes come from it.
EDIT:
Configuration for Kernel:
int NumBlocks =ceil( Width / blockWidth); // blockWidth = 16
dim3 dimGrid(NumBlocks, NumBlocks,1); // Width = 1600
dim3 dimBlock(blockWidth, blockWidth,1);
clock_t startGpuCalculation = clock();
MatrixMulKernel <<<dimGrid, dimBlock >>>(d_M, d_N, d_P, Width);
cudaThreadSynchronize();
clock_t endGpuCalculation = clock();
I was running the project in Debug mode (VS 2017 & CUDA 9). When I ran the code in Release mode, shared memory was much faster than global memory. My bad.
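As a side note that is not from the original post: a common alternative to clock() plus a synchronize call for timing a kernel is CUDA events; a minimal sketch, reusing the names from the launch above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
cudaEventRecord(stop);
cudaEventSynchronize(stop); // wait until the kernel and the stop event have completed
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
printf("kernel time: %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);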

row-major or column-major access of thread index in cuda?

I'm confused about whether an image is stored in row-major or column-major order in the global memory of the device.
I'm getting two different outputs for an image when accessing it in the two orders.
When accessing in row-major order:
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int m = numCols * y + x;
if (x >= numCols || y >= numRows)
return;
//marking column boundaries
if (x <= 2){
d_Image[m].x = 255;
d_Image[m].y = 0;
d_Image[m].z = 0;
}
else if (x >= numCols-2){
d_Image[m].x = 0;
d_Image[m].y = 0;
d_Image[m].z = 255;
}
else{
d_Image[m].x = d_sample[m].x;
d_Image[m].y = d_sample[m].y;
d_Image[m].z = d_sample[m].z;
}
d_Image[m].w = d_sample[m].w;
[output image: row-major access]
When accessing in column-major order:
int m = x * numRows + y;
[output image: column-major access]
Dimensions:
const dim3 blockSize(16,16);
const dim3 gridSize(numCols/16+1, numRows/16+1, 1);
blur << < gridSize, blockSize >> >(d_Image, d_sample, numRows, numCols);
I'm loading and saving the image using OpenCV.
In the first output, red and blue dots are scattered all over the image. In the second output (column-major), the boundary rows are marked even though I'm trying to mark the columns. I'm very confused.
Edit
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols);
cv::Mat sample;
cv::Mat Image;
size_t numRows() { return sample.rows; }
size_t numCols() { return sample.cols; }
__global__ void blur(const uchar4 *d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int m = y*numCols + x;
if (x >= numCols || y >= numRows)
return;
if (x <= 2){
d_Image[m].x = 255;
d_Image[m].y = 0;
d_Image[m].z = 0;
}
else if (x >= (numCols-2)){
d_Image[m].x = 0;
d_Image[m].y = 0;
d_Image[m].z = 255;
}
else{
d_Image[m].x = d_sample[m].x;
d_Image[m].y = d_sample[m].y;
d_Image[m].z = d_sample[m].z;
}
d_Image[m].w = d_sample[m].w;
}
int main(){
uchar4 *h_sample, *d_sample, *d_Image, *h_Image;
int filter[9];
sample = cv::imread("sample.jpg", CV_LOAD_IMAGE_COLOR);
if (sample.empty()){
std::cout << "error in loading image.";
system("pause");
}
cv::cvtColor(sample,sample,CV_BGR2RGBA);
Image.create(numRows(), numCols(), CV_8UC4);
if (!sample.isContinuous() || !Image.isContinuous()) {
std::cerr << "Images aren't continuous!! Exiting." << std::endl;
system("pause");
exit(1);
}
cv::cvtColor(Image,Image,CV_BGR2RGBA);
h_sample = (uchar4*)sample.data;
h_Image = (uchar4*)Image.data;
size_t numPixels = numRows() * numCols();
//allocate mmeory on device
checkCudaErrors(cudaMalloc((void**)&d_sample, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMalloc((void**)&d_Image, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_sample, 0, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_Image, 0, sizeof(uchar4) * numPixels));
//copy to device
checkCudaErrors(cudaMemcpy(d_sample, h_sample, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
helper(d_sample, d_Image, numCols(), numRows());
//copy back to host
checkCudaErrors(cudaMemcpy(h_Image, d_Image, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
cv::cvtColor(Image,Image,CV_RGBA2BGR);
cv::namedWindow("Image", CV_WINDOW_AUTOSIZE);
cv::imshow("Image", Image);
cv::waitKey(0);
cv::imwrite("sample.jpg", Image);
return 0;
}
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
const dim3 blockSize(16,16);
const dim3 gridSize(numCols/16+1, numRows/16+1, 1);
blur << < gridSize, blockSize >> >(d_sample, d_Image, numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
Your helper is declared as
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
and you call
helper(d_sample, d_Image, numCols(), numRows());
I think you may have switched cols and rows when you call helper...
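A minimal sketch of the fix that suggestion implies (nothing more than swapping the last two arguments so the call matches the parameter order):
// helper expects (d_sample, d_Image, numRows, numCols), so pass rows before cols:
helper(d_sample, d_Image, numRows(), numCols());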

can't enter into __global__ function using cuda

I have written code in Nsight that compiles and can be executed, but the first launch cannot be completed.
The strange thing is that when I run it in debug mode, it works perfectly but is too slow.
Here is the part of the code before the function that runs on the GPU (where I think there is an error I can't find):
void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
size_t inputBytes = range*128*sizeof(unsigned char);
size_t baseBytes = cardBase*128*sizeof(unsigned char);
size_t outputBytes = range*sizeof(int);
unsigned char * data_d;
unsigned char * descBase_d;
int * cardBase_d;
int * dataReturned_d;
cudaMalloc((void **) &data_d, inputBytes);
cudaMalloc((void **) &descBase_d, baseBytes);
cudaMalloc((void **) &cardBase_d, sizeof(int));
cudaMalloc((void **) &dataReturned_d, outputBytes);
int blockSize = 196;
int nBlocks = range/blockSize + (range%blockSize == 0?0:1);
cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice);
cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice);
FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);
cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost);
cudaFree(data_d);
cudaFree(descBase_d);
cudaFree(cardBase_d);
cudaFree(dataReturned_d);
}
And the function that runs on the GPU (I don't think the error is here):
__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned char descriptor1[128], descriptor2[128];
int part = 0;
int result = 0;
int winner = 0;
int minDistance = 0;
int itelimit = *cardBase;
for (int k = 0; k < 128; k++)
{
descriptor1[k] = data[idx*128+k];
}
// initialize minDistance
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[k];
}
for (int k = 0; k < 128; k++)
{
part = (descriptor1[k]-descriptor2[k]);
part *= part;
minDistance += part;
}
// test all descriptors in the base :
for (int i = 1; i < itelimit; i++)
{
result = 0;
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[i*128+k];
// Calculate squared l2 distance :
part = (descriptor1[k]-descriptor2[k]);
part *= part;
result += part;
}
// Compare to minDistance
if (result < minDistance)
{
minDistance = result;
winner = i;
}
}
// Write the result in dataReturned
dataReturned[idx] = winner;
}
Thank you in advance if you can help me.
EDIT: the last cudaMemcpy returns the error "the launch timed out and was terminated".

Linux has a watchdog mechanism. If your kernel runs for a long time (you say it is slow in debug mode), you can hit the Linux watchdog and receive the "launch timed out and was terminated" error. In this case there are several things you might try; the options are covered here.
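For example, a minimal sketch (not from the original answer) of checking the launch explicitly, so the timeout is reported at the kernel rather than at the later cudaMemcpy:
FindClosestDescriptor<<<nBlocks, blockSize>>>(dataReturned_d, data_d, descBase_d, cardBase_d);
cudaError_t launchErr = cudaGetLastError(); // configuration / launch errors
cudaError_t syncErr = cudaDeviceSynchronize(); // errors raised while the kernel runs, e.g. the watchdog timeout
if (launchErr != cudaSuccess || syncErr != cudaSuccess)
printf("launch: %s, sync: %s\n", cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));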