Printing elements of a string vector using cuPrintf in a kernel function - c++

I am trying to print the elements of a string vector passed as an argument to a kernel function, using the cuPrintf function.
The code of the kernel
// Kernel from the question: every thread tries to print one entry of wordList with cuPrintf.
// NOTE: this version is rejected by nvcc -- the build reports that the std::basic_string
// destructor (a host function) is called from testKernel.
__global__ void testKernel(string wordList[10000])
{
//access thread id
const unsigned int bid = blockIdx.x;
const unsigned int tid = threadIdx.x;
// Flat global thread index; used directly as the word index.
const unsigned int index = bid * blockDim.x + tid;
// Hands a std::string object to a "%s" conversion. index is not compared against
// the number of words before wordList is indexed.
cuPrintf("wordList[%d]: %s \n", index, wordList[index]);
}
Code from main function to setup execution parameters and launch the kernel
// Host-side setup for the std::string version of testKernel (fragment of main).
//Allocate device memory for word list
string* d_wordList;
cudaMalloc((void**)&d_wordList, sizeof(string)*number_of_words);
//Copy word list from host to device
// This copies the bytes of the std::string objects themselves.
// NOTE(review): presumably the character data they refer to stays in host memory -- confirm.
cudaMemcpy(d_wordList, wordList, sizeof(string)*number_of_words, cudaMemcpyHostToDevice);
//Setup execution parameters
// Number of 256-thread blocks, rounded up so every word gets a thread.
int n_blocks = (number_of_words + 255)/256;
int threads_per_block = 256;
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// cuPrintf life cycle: init, launch, wait for the kernel, dump to stdout, tear down.
cudaPrintfInit();
testKernel<<<grid, threads>>>(d_wordList);
cudaDeviceSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
I am getting the error:
"Error 44 error : calling a host function("std::basic_string, std::allocator >::~basic_string") from a global function("testKernel") is not allowed D:...\kernel.cu 44 1 CUDA_BF_large_word_list
"
What have I missed?
Thanks in advance.

In general, you can't use functions from C++ libraries (including <string>) in CUDA device code.
Use an array of char instead to hold your string(s).
Here is an example of manipulating "strings" as C-style arrays of null-terminated char, and passing them to a kernel.

I modified the code and used an array of char instead of strings.
The updated version of kernel is:
// Prints the first 10 characters of the word owned by the calling thread.
//
// d_wordList is read as a flat array in which word k starts at byte k * 20.
// Expects a 1D grid of 1D blocks; the global thread index selects the word.
// NOTE(review): no word count is passed in, so threads whose index is beyond
// the last word still read d_wordList -- the caller has to size the buffer
// (or the grid) accordingly.
__global__ void testKernel(char* d_wordList)
{
    // Global thread index == word index.
    const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Offset of this thread's word inside the flat buffer (20 bytes per word).
    const unsigned int first = index * 20;

    cuPrintf("!! %c%c%c%c%c%c%c%c%c%c \n",
             d_wordList[first + 0], d_wordList[first + 1],
             d_wordList[first + 2], d_wordList[first + 3],
             d_wordList[first + 4], d_wordList[first + 5],
             d_wordList[first + 6], d_wordList[first + 7],
             d_wordList[first + 8], d_wordList[first + 9]);
}
I am also wondering if there is an easier way to print the words from the char array. (Basically, I need to print, and later work with, one word per kernel function.)
The code from the main function is:
// Host-side driver for the char-array version of testKernel (fragment of main).
//
// Changes relative to the posted fragment:
//  * host buffers are sized from number_of_words instead of a fixed 10000 rows;
//  * each word is read with a field width so it cannot overrun its 20-byte slot;
//  * the device buffer is padded to one slot per *launched* thread and zeroed,
//    because the grid is rounded up to whole blocks and testKernel has no
//    bounds check -- the surplus threads would otherwise read out of bounds;
//  * the cuPrintf buffer is sized for one record per launched thread. The
//    default (1 MB) cannot hold ~10000 records.
//    NOTE(review): 256 bytes per record is CUPRINTF_MAX_LEN in the SDK's
//    cuPrintf.cu -- confirm against the copy you build with.
const int text_length = 20;

//Setup execution parameters (needed up front: buffer sizes depend on them)
const int threads_per_block = 256;
const int n_blocks = (number_of_words + threads_per_block - 1) / threads_per_block;
const size_t launched_threads = (size_t)n_blocks * threads_per_block;
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);

const size_t word_bytes   = (size_t)number_of_words * text_length * sizeof(char);
const size_t padded_bytes = launched_threads * text_length * sizeof(char);

// Zero-initialised so short words and unread slots are NUL-terminated.
char (*wordList)[text_length] = new char[number_of_words][text_length]();
char *dev_wordList;
for(int i=0; i<number_of_words; i++)
{
    // operator>> into a char array honours width(): at most text_length-1
    // characters are stored, followed by the terminating '\0'.
    file.width(text_length);
    file>>wordList[i];
    cout<<wordList[i]<<endl;
}

cudaMalloc((void**)&dev_wordList, padded_bytes);
cudaMemset(dev_wordList, 0, padded_bytes);
cudaMemcpy(dev_wordList, &(wordList[0][0]), word_bytes, cudaMemcpyHostToDevice);

// Round trip: copy the words back and print them to verify the upload.
char (*resultWordList)[text_length] = new char[number_of_words][text_length]();
cudaMemcpy(resultWordList, dev_wordList, word_bytes, cudaMemcpyDeviceToHost);
for(int i=0; i<number_of_words; i++)
    cout<<resultWordList[i]<<endl;

if (n_blocks > 0)   // a grid with zero blocks is an invalid launch configuration
{
    const size_t cuprintf_record_bytes = 256;
    cudaPrintfInit(launched_threads * cuprintf_record_bytes);
    testKernel<<<grid, threads>>>(dev_wordList);
    // Launch-configuration errors show up in cudaGetLastError(), faults inside
    // the kernel in the return value of the synchronising call.
    cudaError_t launch_status = cudaGetLastError();
    cudaError_t run_status = cudaDeviceSynchronize();
    if (launch_status != cudaSuccess)
        cout<<"testKernel launch failed: "<<cudaGetErrorString(launch_status)<<endl;
    else if (run_status != cudaSuccess)
        cout<<"testKernel execution failed: "<<cudaGetErrorString(run_status)<<endl;
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
}
If I use smaller values for number of blocks/ threads like this:
dim3 grid(20, 1, 1);
dim3 threads(100, 1, 1);
The Kernel launch is correct, it displays one word per thread. But I need this procedure for 10000 words. What have I missed?

Related

2D tiled convolution taking more time than untiled version

I am writing code that performs a 2D convolution on a float matrix, in both a tiled and an untiled version. I'm assuming the width of the tile is
BLOCK_SIZE - MASK_WIDTH + 1
, using halo cells.
But for a 1024 x 1024 matrix and masks varying from 3 to 9, I get the untiled version performing better:
untiled version
vs
tiled
Both matrix and mask are defined in a constant manner, equal for tiled and untiled. No random values/sizes used.
I guess I'm doing some wrong assumption about the tile size, but even after doing some research the implementation seems quite legit.
#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32
Here's the kernel code for the tiled version
// 2D convolution using a shared-memory tile with halo cells.
//
// Launch contract (as used by the code below):
//  * blockDim must be (BLOCK_WIDTH, BLOCK_WIDTH): every thread loads exactly
//    one element of the BLOCK_WIDTH x BLOCK_WIDTH shared tile;
//  * each block produces (BLOCK_WIDTH - mask_width + 1)^2 outputs, so the grid
//    must be sized with that tile width, not with BLOCK_WIDTH;
//  * static shared memory: BLOCK_WIDTH * BLOCK_WIDTH floats.
//
// in   : w * h input matrix, row-major
// mask : mask_width * mask_width coefficients, row-major
// out  : w * h output matrix, row-major
// Input elements outside the matrix are treated as 0.
// NOTE(review): assumes mask_width <= BLOCK_WIDTH and that w, h fit in an int.
__global__ void convolution_2D_tiled(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    __shared__ float tile[BLOCK_WIDTH][BLOCK_WIDTH];

    // Signed copies of the sizes: the halo offsets below go negative at the
    // matrix border, and mixing int with size_t would silently turn those
    // subtractions into unsigned wraparound.
    const int maskWidth = static_cast<int>(mask_width);
    const int width = static_cast<int>(w);
    const int height = static_cast<int>(h);

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int tile_width = BLOCK_WIDTH - maskWidth + 1; //since BLOCK_WIDTH = TILE_WIDTH + MASK_WIDTH - 1
    const int col = blockIdx.x * tile_width + tx;
    const int row = blockIdx.y * tile_width + ty;

    // Input element this thread loads: the output position shifted to the
    // top-left corner of the mask.
    const int inputRow = row - maskWidth / 2;
    const int inputCol = col - maskWidth / 2;

    // Load tile elements (zero padding outside the matrix).
    if (inputRow >= 0 && inputRow < height && inputCol >= 0 && inputCol < width)
        tile[ty][tx] = in[static_cast<size_t>(inputRow) * w + inputCol];
    else
        tile[ty][tx] = 0.0f;

    // Every thread of the block reaches this barrier; the tile is complete after it.
    __syncthreads();

    // Only the first tile_width x tile_width threads produce an output; the
    // remaining threads existed solely to load halo cells.
    if (col < width && row < height && ty < tile_width && tx < tile_width) {
        float outputPixel = 0.0f; // accumulate in a register, single global write at the end
        for (int i = 0; i < maskWidth; ++i) {
            for (int j = 0; j < maskWidth; ++j) {
                outputPixel += tile[i + ty][j + tx] * mask[i * maskWidth + j];
            }
        }
        out[static_cast<size_t>(row) * w + col] = outputPixel;
    }
}
The main with the matrix generation and sizes assumptions:
// Aborts the program if the CUDA runtime has a pending error.
//
// line: pass __LINE__ from the statement directly after the CUDA call being
//       checked; the message reports line - 1, i.e. the line of that call.
// Reads (and thereby clears) the error state via cudaGetLastError().
void errorCheck(unsigned int line){
    const cudaError_t status = cudaGetLastError();

    // Nothing pending: return to the caller.
    if (status == cudaSuccess)
        return;

    printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// Benchmark driver: runs convolution_2D_tiled on a MATRIX_SIZE x MATRIX_SIZE
// matrix for mask widths 3, 5, 7, 9 and prints the elapsed GPU time of each run.
//
// Fixes relative to the posted version:
//  * the grid is sized with a ceiling division; MATRIX_SIZE / tile_width
//    truncated (1024 / 30 = 34 blocks = 1020 outputs) and left the last rows
//    and columns of the output uncomputed;
//  * size_t is printed with %zu instead of %d;
//  * the float buffers are sized with sizeof(float);
//  * the stop event is recorded directly after the launch and waited on with
//    cudaEventSynchronize(), replacing the deprecated cudaThreadSynchronize()
//    that sat between the kernel and the stop event;
//  * events are destroyed at the end of every iteration.
int main(int argc, char const* argv[]){
    for (size_t mask_width = 3; mask_width <= 9; mask_width += 2) {
        printf("Testing with mask size = %zu\n\n", mask_width);

        const size_t matrix_bytes = sizeof(float) * MATRIX_SIZE * MATRIX_SIZE;
        const size_t mask_bytes = sizeof(float) * mask_width * mask_width;

        float* a;   // input matrix
        float* b;   // convolution mask
        float* c;   // output matrix
        cudaMallocManaged((void **) &a, matrix_bytes);
        cudaMallocManaged((void **) &b, mask_bytes);
        cudaMallocManaged((void **) &c, matrix_bytes);
        errorCheck(__LINE__);

        // initialize matrix A
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                a[i * MATRIX_SIZE + j] = (float)(1 + (3 * j % 20));
            }
        }
        // initialize matrix B
        for (size_t i = 0; i < mask_width; ++i) {
            for (size_t j = 0; j < mask_width; ++j) {
                b[i * mask_width + j] = (float)(1 + (((2 * i) + j) % mask_width));
            }
        }

        float naive_gpu_elapsed_time_ms;
        // events used to time the kernel on the GPU
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Each block yields tile_width x tile_width outputs, so the grid is
        // sized by tile_width and rounded up to cover the matrix edge.
        const int tile_width = BLOCK_WIDTH - (int)mask_width + 1;
        const int blocks_per_dim = (MATRIX_SIZE + tile_width - 1) / tile_width;
        dim3 dimGrid(blocks_per_dim, blocks_per_dim);
        dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);

        cudaEventRecord(start, 0);
        convolution_2D_tiled <<<dimGrid, dimBlock >>> (a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
        errorCheck(__LINE__);   // launch-configuration errors
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        errorCheck(__LINE__);   // errors raised while the kernel was executing

        //compute time elapsed on GPU computing
        cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
        printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);

        //free per-iteration resources
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
        printf("________________________________________________________________________\n\n");
    }
    return 0;
}
I'm using google colab with Tesla T4 GPU, and no CUDA error is thrown.
Also tried to use bigger masks (11, 15 ..) but no changes in comparison between tiled and untiled.
You are making inefficient usage of managed memory as discussed here and here.
Nearly all of your ~2ms of execution time is used in inefficient demand-paged copying of data from host to device. As a result, your ability to resolve the difference in performance in the two cases due to the device code changes is almost completely obscured.
If you add these 3 lines of code immediately before float naive_gpu_elapsed_time_ms;, you will observe that your reported execution times decrease dramatically, and you should be able to better judge the performance difference between the shared memory tiled version and the non-tiled version:
cudaMemPrefetchAsync(a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE, 0);
cudaMemPrefetchAsync(b, sizeof(int)*mask_width*mask_width, 0);
cudaMemPrefetchAsync(c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE, 0);
You haven't shown your non-tiled code, so I can't demonstrate that for you. Here's an example profiling output using a non-tiled convolution code that I wrote, comparing to your tiled kernel, and including the cudaMemPrefetchAsync() statements:
$ nvprof ./t2140
Testing with mask size = 3
==13236== NVPROF is profiling process 13236, command: ./t2140
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.032832 ms.
________________________________________________________________________
Testing with mask size = 5
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.061120 ms.
________________________________________________________________________
Testing with mask size = 7
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.086080 ms.
________________________________________________________________________
Testing with mask size = 9
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.118688 ms.
________________________________________________________________________
==13236== Profiling application: ./t2140
==13236== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.59% 311.69us 4 77.922us 41.089us 119.08us convolution_2D(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
47.41% 280.97us 4 70.241us 28.449us 114.28us convolution_2D_tiled(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
API calls: 96.10% 365.32ms 12 30.443ms 12.906us 365.10ms cudaMallocManaged
1.32% 5.0301ms 4 1.2575ms 586.91us 3.2433ms cuDeviceTotalMem
0.66% 2.4917ms 404 6.1670us 320ns 268.82us cuDeviceGetAttribute
0.56% 2.1277ms 12 177.31us 8.3020us 578.90us cudaMemPrefetchAsync
0.50% 1.9035ms 4 475.88us 295.08us 549.01us cudaDeviceSynchronize
0.49% 1.8594ms 12 154.95us 75.533us 328.85us cudaFree
0.14% 526.53us 4 131.63us 42.014us 220.14us cudaEventSynchronize
0.11% 399.28us 4 99.820us 61.310us 210.74us cuDeviceGetName
0.09% 351.52us 8 43.940us 11.426us 116.52us cudaLaunchKernel
0.01% 45.911us 8 5.7380us 4.1870us 10.243us cudaEventRecord
0.01% 25.946us 8 3.2430us 935ns 10.182us cudaEventCreate
0.01% 21.643us 4 5.4100us 3.1450us 8.6700us cuDeviceGetPCIBusId
0.00% 10.304us 8 1.2880us 430ns 5.0980us cuDeviceGet
0.00% 9.6790us 4 2.4190us 1.9560us 3.7180us cudaEventElapsedTime
0.00% 3.3390us 3 1.1130us 617ns 1.6520us cuDeviceGetCount
0.00% 3.2480us 4 812ns 700ns 1.0470us cuDeviceGetUuid
0.00% 3.1420us 8 392ns 229ns 1.2110us cudaGetLastError
==13236== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
12 1.3346MB 4.0000KB 2.0000MB 16.01563MB 1.405760ms Host To Device
Total CPU Page faults: 52
$
You can see that in each case, the tiled/shared memory kernel is faster.

NVIDIA Visual profiler does not generate a timeline

My question is almost same as the question [asked here at SO before][1]. But no answer has been provided to it so, I am asking a separate question.
I am using CUDA 7.0 toolkit on a Windows-7 OS. I am using VS-2013.
I tried to generate the timeline of the vector addition sample program and it worked. But when I follow exactly the same steps to generate a timeline of my own code, it keeps showing the message "Running application to generate timeline". I know that the kernel gets called and everything is working.
cudaDeviceReset() call is also there after finishing everything related to CUDA.
Program: I have changed my original question to provide a minimal working example which can produce the same problem. The following code is not generating a timeline using nvvp irrespective of the place where I put cudaDeviceReset().
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//OpenCV
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <stdio.h>
using namespace cv;
// Copies the first three channel values of one pixel from ptr_source to ptr_dst.
//
// Expects a 2D launch: thread (x, y) handles pixel (x, y) of an iw x ih image
// stored row-major with numChannels interleaved values per pixel. Threads that
// fall outside the image do nothing.
__global__ void colorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Pixel coordinates of this thread
    const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Leave early when the thread does not map to a pixel
    if (x < 0 || x >= iw || y < 0 || y >= ih)
        return;

    // Index of the pixel's first channel value
    const int base = numChannels * (iw * y + x);
    for (int channel = 0; channel < 3; ++channel)
        ptr_dst[base + channel] = ptr_source[base + channel];
}
// Minimal reproduction from the question: endlessly uploads a 400x400 BGR test
// image, runs colorTransformation_kernel on it, downloads and displays the result.
//
// Fix relative to the posted version: the two device buffers allocated in every
// iteration are now released at the end of that iteration. Previously each pass
// through the endless loop leaked both allocations.
//
// The loop is intentionally left endless (it is what the question is about), so
// the cudaDeviceReset() after it is never reached.
int main()
{
    while (1)
    {
        Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
        unsigned char *h_src = image.data;
        size_t numBytes = image.rows * image.cols * 3;
        int numChannels = 3;
        unsigned char *dev_src, *dev_dst, *h_dst;
        //Allocate memory at device for SOURCE and DESTINATION and get their pointers
        cudaMalloc((void**)&dev_src, numBytes * sizeof(unsigned char));
        cudaMalloc((void**)&dev_dst, numBytes * sizeof(unsigned char));
        //Copy the source image to the device i.e. GPU
        cudaMemcpy(dev_src, h_src, numBytes * sizeof(unsigned char), cudaMemcpyHostToDevice);
        //KERNEL
        // NOTE(review): the kernel uses one thread per pixel, so the factor 3
        // launches three times as many blocks as there are pixels in each
        // dimension; the surplus threads are rejected by the kernel's bounds check.
        dim3 numOfBlocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied by 3 because we have 3 channel image now
        dim3 numOfThreadsPerBlocks(20, 20);
        colorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks >> >(numChannels, image.cols, image.rows, dev_src, dev_dst);
        cudaDeviceSynchronize();
        //Get the processed image
        Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3);
        h_dst = org_dijSDK_img.data;
        cudaMemcpy(h_dst, dev_dst, numBytes * sizeof(unsigned char), cudaMemcpyDeviceToHost);
        //Release this iteration's device buffers (the blocking copy above has finished with them)
        cudaFree(dev_src);
        cudaFree(dev_dst);
        //DISPLAY PROCESSED IMAGE
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }
    cudaDeviceReset();
    return 0;
}
Very Important Clue: If I comment the line while(1) and hence run the code only once then, the nvvp generates timeline. But in my original project, I cannot get the timeline profile by doing so because, it contain multi-threading and other stuff due to which, there is no image to process during the first run. So, I must need some way to generate the timeline with a code containing infinite while loop.
The problem in my code is the endless while loop, because of which cudaDeviceReset() was never being called. There are two possible solutions to deal with such situations:
If you are only interested in looking at the timeline profile, just comment out your while loop, and nvvp will be able to reach the cudaDeviceReset() present at the end of main().
There might be a situation where you must keep a loop inside your program. For example, in my original project, which contains multi-threading, there is no image to process during the initial 180 runs of the while loop. To deal with such situations, replace your while loop with a for loop that runs a limited number of times. For example, the following code helped me to get a timeline profile of 4 runs. I am posting only the modified main().
// Profiling-friendly variant of main(): processes the test image four times,
// splitting every frame into three equal parts that are uploaded, processed and
// downloaded on three CUDA streams, then resets the device so the profiler can
// flush its data.
//
// Fixes relative to the posted version:
//  * the three streams are created once and destroyed after the loop (they were
//    re-created in every iteration and never destroyed);
//  * the six device buffers are freed at the end of each iteration;
//  * a failed imread() ends the loop instead of continuing with an empty image;
//  * the redundant "org_dijSDK_img.data = h_dst" self-assignment is gone.
int main()
{
    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;
    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_two);
    cudaStreamCreate(&stream_three);

    //while (1)
    for (int i = 0; i < 4; i++)
    {
        Mat image = imread("DijSDK_test_image.jpg", 1);
        //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
        if (image.empty())
        {
            fprintf(stderr, "Could not read DijSDK_test_image.jpg\n");
            break;
        }

        size_t numBytes = image.rows * image.cols * 3;
        int numChannels = 3;
        // NOTE(review): the single-stream version passes (cols, rows) as
        // (iw, ih); here rows is stored in iw and cols in ih. Kept as posted
        // because callMultiStreamingCudaKernel is not visible -- confirm.
        int iw = image.rows;
        int ih = image.cols;
        size_t totalMemSize = numBytes * sizeof(unsigned char);
        size_t oneThirdMemSize = totalMemSize / 3;   // bytes handled by each stream

        unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;
        //Allocate memory at device for SOURCE and DESTINATION and get their pointers
        cudaMalloc((void**)&dev_src_1, oneThirdMemSize);
        cudaMalloc((void**)&dev_src_2, oneThirdMemSize);
        cudaMalloc((void**)&dev_src_3, oneThirdMemSize);
        cudaMalloc((void**)&dev_dst_1, oneThirdMemSize);
        cudaMalloc((void**)&dev_dst_2, oneThirdMemSize);
        cudaMalloc((void**)&dev_dst_3, oneThirdMemSize);

        //Host image that receives the processed data
        Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
        h_dst = org_dijSDK_img.data;
        h_src = image.data;

        //Copy the source image to the device i.e. GPU, one third per stream
        cudaMemcpyAsync(dev_src_1, h_src, oneThirdMemSize, cudaMemcpyHostToDevice, stream_one);
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, oneThirdMemSize, cudaMemcpyHostToDevice, stream_two);
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), oneThirdMemSize, cudaMemcpyHostToDevice, stream_three);

        //KERNEL--stream-1
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);
        //KERNEL--stream-2
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);
        //KERNEL--stream-3
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);

        //RESULT copy: GPU to CPU
        cudaMemcpyAsync(h_dst, dev_dst_1, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_three);

        // wait for results
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);

        // All work on the buffers has completed; release them before the next frame.
        cudaFree(dev_src_1);
        cudaFree(dev_src_2);
        cudaFree(dev_src_3);
        cudaFree(dev_dst_1);
        cudaFree(dev_dst_2);
        cudaFree(dev_dst_3);

        //DISPLAY PROCESSED IMAGE
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }

    cudaStreamDestroy(stream_one);
    cudaStreamDestroy(stream_two);
    cudaStreamDestroy(stream_three);
    cudaDeviceReset();
    return 0;
}

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
// Ray-casts one pixel against SPHERES_COUNT spheres and writes its colour.
//
// Expects a 2D launch in which the grid covers the image exactly:
// image width = blockDim.x * gridDim.x, image height = blockDim.y * gridDim.y.
// bitmap receives 3 bytes per pixel, pixels in row-major order.
// s must be dereferenceable on the device and hold SPHERES_COUNT spheres.
//
// Fix relative to the posted version: blockDim/gridDim are unsigned, so
// "x - blockDim.x * gridDim.x / 2" was evaluated in unsigned arithmetic and
// wrapped to a huge positive value for every pixel left of (or above) the
// image centre. The image size is now converted to int before subtracting.
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
    // Map threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    const int imageWidth = blockDim.x * gridDim.x;
    const int imageHeight = blockDim.y * gridDim.y;
    int offset = x + y * imageWidth;

    // Pixel position relative to the image centre
    float ox = x - imageWidth / 2;
    float oy = y - imageHeight / 2;

    // Background colour, replaced by the colour of the nearest hit sphere
    float r = 0.2f, g = 0.2f, b = 0.5f;
    float maxz = -INF;
    for (int i = 0; i < SPHERES_COUNT; i++) {
        float n, t = s[i].hit(ox, oy, &n);
        if (t > maxz) {
            // n (filled in by hit) scales the sphere colour
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }
    bitmap[offset*3] = (int)(r * 255);
    bitmap[offset*3 + 1] = (int)(g * 255);
    bitmap[offset*3 + 2] = (int)(b * 255);
}
// Sphere data in device constant memory; filled from the host with cudaMemcpyToSymbol below.
__constant__ Sphere s[SPHERES_COUNT];
// Renders SPHERES_COUNT random spheres into a WIDTH x HEIGHT image on the GPU
// and saves the result as render.bmp.
int main ()
{
//Capture start time
// NOTE: start is recorded here, but stop is never recorded and neither event is destroyed.
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
// NOTE(review): presumably reorders the buffer so the channel values of a pixel are adjacent,
// matching the 3-bytes-per-pixel writes in the kernel -- confirm against the CImg docs.
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
// The spheres are built in a temporary host array and uploaded with a single copy.
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from spere data
// One thread per pixel: 16x16 threads per block, WIDTH/16 x HEIGHT/16 blocks.
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
// NOTE: s is the __constant__ symbol, passed here from host code as the kernel's
// Sphere* argument. In host code the symbol is not a valid device pointer, so the
// kernel faults when it dereferences it; the fault is reported by the next checked
// call, the cudaMemcpy below.
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
// NOTE(review): presumably restores the original CImg axis order before saving -- confirm.
image.permute_axes("yzcx");
image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
I cannot understand why...
I know that If remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
The error is not reported, so I thought It may be an out of index error, reported later, but I have An identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
kernel<<<grids, threads>>>(dev_bitmap, s);
......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
......
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.

Strange acting of CUDA for large amount of threads

I want to prepare my CUDA kernels for working over large amount of particles (much exceeding 65535 which is max value of gridDim). I tried to create a proper thread index mapping working for any <<<numBlocks, threadsPerBlock>>> values.
I wrote this:
// Debug kernel: every thread stores its own flat global index into blabla.
//
// position    : not used by this kernel
// numElements : number of entries in blabla; threads with an index at or
//               beyond it write nothing
// blabla      : output array, blabla[i] = i
__global__ void step_k(float* position, size_t numElements, unsigned int* blabla)
{
    const unsigned int globalIndex = calculateIndex();

    // Surplus threads have no slot to write to.
    if (!(globalIndex < numElements))
        return;

    blabla[globalIndex] = globalIndex;
}
// Returns a flat index for the calling thread in a launch of up to three
// dimensions: x varies fastest, then y, then z, where each coordinate is the
// thread's global position (block offset plus thread offset) along that axis.
// All arithmetic is unsigned int.
__device__ unsigned int calculateIndex(){
    // Extent of the whole launch along x and y, in threads
    const unsigned int threadsAlongX = gridDim.x * blockDim.x;
    const unsigned int threadsAlongY = gridDim.y * blockDim.y;

    // Global coordinates of this thread
    const unsigned int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int gy = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int gz = blockIdx.z * blockDim.z + threadIdx.z;

    // (gz * sizeY + gy) * sizeX + gx
    return (gz * threadsAlongY + gy) * threadsAlongX + gx;
}
and I use it this way:
// Test driver from the question: launches step_k, copies the first 256 results
// back and prints them.
void CudaSphFluids::step(void)
{
//dim3 threadsPerBlock(1024, 1024, 64);
//dim3 numBlocks(65535, 65535, 65535);
dim3 numBlocks(1, 1, 1);
// NOTE: 256 x 256 = 65536 threads in one block, which exceeds the per-block
// limit of 512 or 1024 threads, so this launch is rejected.
dim3 threadsPerBlock(256, 256, 1);
// Host copy of the results, zero-initialised.
unsigned int result[256] = {};
unsigned int* d_results;
cudaMalloc( (void**) &d_results,sizeof(unsigned int)*256);
// Only 256 result slots exist, hence numElements = 256.
// No return value is checked and cudaGetLastError() is not called, so a failed
// launch goes unnoticed and result shows whatever d_results happened to contain.
step_k<<<numBlocks, threadsPerBlock>>>(d_position, 256, d_results);
cudaMemcpy(result,d_results,sizeof(unsigned int)*256,cudaMemcpyDeviceToHost);
CLOG(INFO, "SPH")<<"STEP";
for(unsigned int t=0; t<256;t++) {
cout<<result[t]<<"; ";
}
cout<<endl;
cudaFree(d_results);
Sleep(200);
}
It seems to be ok (incrementing numbers from 0 to 255) for :
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 1, 1);
It works for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 3, 1);
but when I try to run it for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 5, 1);
for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 10, 1);
and for larger values like:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 256, 1);
it's getting crazy:
Then I tried to use another mapping from some smart guy's website:
// Returns a flat thread index for a 3D grid of 3D blocks: blocks are numbered
// x-fastest across the grid, and the threads of one block occupy a contiguous
// range, numbered x-fastest inside the block.
__device__ int getGlobalIdx_3D_3D()
{
    // Position of this block in the grid (x fastest, then y, then z)
    const unsigned int blockNumber = blockIdx.x
                                   + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);

    // Position of this thread inside its block (x fastest, then y, then z)
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    const unsigned int localNumber = threadIdx.x
                                   + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

    return static_cast<int>(blockNumber * threadsPerBlock + localNumber);
}
Any ideas what is the reason of such a strange acting?
I use CUDA 6.0 on GeForce GTX 560Ti (sm_21) and VS2012 with NSight.
This is requesting 65536 threads per block:
dim3 threadsPerBlock(256, 256, 1);
That is not acceptable on any current CUDA GPU, which are limited to either 512 or 1024 threads per block.
These are also launching too many threads per block:
dim3 threadsPerBlock(256, 5, 1);
dim3 threadsPerBlock(256, 10, 1);
Start by adding proper cuda error checking to your program. I would suggest doing this on any CUDA code before posting here. You will be more informed, and others will be able to help you better.
Although you don't show your complete kernel, your kernel indexing seems to be set up correctly for 3D indexing. Therefore, it may just be a matter of also modifying this line:
dim3 numBlocks(1, 1, 1);
Which you will probably want to do to get reasonable performance out of the GPU.

C++ and CUDA: why does the code return different results each time?

Update: I found the bug. Since the code I posted before is very complicated, I simplify them and only keep the part when the problem is.
if (number >= dim * num_points)
return;
But actually, I only have num_points, I want to use num_points thread, so the correct way should be
if (number >= num_points)
return;
Thank you all for the help.
I'm rewriting some C++ code from CPU to GPU. And the code is pasted below. Sorry it's long, since I think the problems are easier to be detected in this way.
In the code, for every thread I need some matrix format intermediate results, so I allocate device memory for these intermediate results, such as d_dir2, d_R, d_Stick, d_PStick. The results turned out to be not what I expected, so to debug, I tried to output some intermediate results R in this way:
if (k == 0)
{
results[tmp_int1 + i * dim + j] = R[tmp_int1 + i * dim + j];
}
and later in C++, I print results.
However, I found that results give different values each time. Sometimes it gives the correct answer R, sometimes, the value of PStick, sometimes a combination of R and PStick, and sometimes a combination of R and 0 (results are initialized to 0 at the beginning).
I'm very confused what caused the problem. Any idea? Thank you very much :)
// Reduced version of the poster's kernel: only the thread-index computation and
// the early-exit guard are shown; the rest of the body was removed by the poster.
__global__ void stickvote(const int dim, const int num_points, const int gridx, float Sigma, float* input, float* dir2, float* R, float* Stick, float* PStick, float* results) {
// threshold and c are computed but not used in the part of the body shown here.
float threshold = 4 * Sigma;
float c = (- 16 * log(0.1f) * (sqrt(Sigma) - 1)) / 3.1415926f / 3.1415926f;
// 2D position of the thread within the whole launch.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Flat thread number; a row of the launch is BLOCK_SIZE * gridx threads wide.
int number = row * BLOCK_SIZE * gridx + col;
// Guard against surplus threads. It compares against dim * num_points, which lets
// through threads with num_points <= number < dim * num_points.
if (number >= dim * num_points) //// The bug is here!
return;
}
// Host entry point: uploads input, runs the stickvote kernel with one thread per
// point, and downloads the result.
//
// dim        : dimensionality of a point
// num_points : number of points; one kernel thread is launched per point
// Sigma      : passed through to the kernel
// input      : host array of (dim + 1) * (dim + 1) * num_points floats
// results    : host array of dim * dim * num_points floats, overwritten
//
// Fixes relative to the posted version:
//  * num_points == 0 produced gridx == 0 and then "totalblock % gridx", a
//    modulo by zero; empty input now returns before any CUDA call;
//  * element counts are computed in size_t instead of int;
//  * the copies, the launch and the frees are checked the same way as the
//    allocations (cutilSafeCall).
extern "C" void KernelStickVote(int dim, int num_points, float Sigma, float* input, float* results) {
    // Nothing to upload, compute or download.
    if (dim <= 0 || num_points <= 0)
        return;

    const int totalpoints = num_points;
    const size_t count_input = (size_t)(dim + 1) * (dim + 1) * num_points;
    const size_t count_output = (size_t)dim * dim * num_points;
    const size_t size_input = count_input * sizeof(float);
    const size_t size_output = count_output * sizeof(float);

    float* d_input;
    cutilSafeCall(cudaMalloc((void**)&d_input, size_input));
    float* d_result;
    cutilSafeCall(cudaMalloc((void**)&d_result, size_output));
    // used to save dir, and calculate dir * dir'
    float* d_dir2;
    cutilSafeCall(cudaMalloc((void**)&d_dir2, (size_t)dim * num_points * sizeof(float)));
    // used to save R: dim * dim * N
    float* d_R;
    cutilSafeCall(cudaMalloc((void**)&d_R, size_output));
    // used to save Stick: dim * dim * N
    float* d_Stick;
    cutilSafeCall(cudaMalloc((void**)&d_Stick, size_output));
    // used to save PStick: dim * dim * N
    float* d_PStick;
    cutilSafeCall(cudaMalloc((void**)&d_PStick, size_output));

    // Copy input data from host to device
    cutilSafeCall(cudaMemcpy(d_input, input, size_input, cudaMemcpyHostToDevice));

    // Number of blocks needed for totalpoints threads (rounded up), laid out as
    // a 2D grid because one grid dimension is capped at 65535 blocks here.
    const int totalblock = totalpoints / (BLOCKPOINTS) + (totalpoints % (BLOCKPOINTS) != 0 ? 1 : 0);
    const int gridx = (65535 < totalblock ? 65535 : totalblock);
    const int gridy = totalblock / gridx + (totalblock % gridx != 0 ? 1 : 0);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(gridx, gridy);

    stickvote<<<dimGrid, dimBlock>>>(dim, num_points, gridx, Sigma, d_input, d_dir2, d_R, d_Stick, d_PStick, d_result);
    // Launch-configuration errors; faults during execution surface in the
    // blocking copy that follows.
    cutilSafeCall(cudaGetLastError());
    cutilSafeCall(cudaMemcpy(results, d_result, size_output, cudaMemcpyDeviceToHost));

    cutilSafeCall(cudaFree(d_input));
    cutilSafeCall(cudaFree(d_result));
    cutilSafeCall(cudaFree(d_dir2));
    cutilSafeCall(cudaFree(d_R));
    cutilSafeCall(cudaFree(d_Stick));
    cutilSafeCall(cudaFree(d_PStick));
}
The original poster of the question performed some further code simplification and debugging him/herself and discovered that the guard statement in the kernel:
if (number >= dim * num_points)
return;
was, in fact, incorrect and should have been
if (number >= num_points)
return;
This was the source of the error.
This answer has been added as a community wiki answer with the intention of removing this question from the unanswered queue.