I was wondering what was the best way to generate one pseudo random number between 0 and 49k that would be the same for each thread, by using curand or something else.
I prefer to generate the random numbers inside the kernel because I will have to generate one at the time but about 10k times.
And I could use floats between 0.0 and 1.0, but I've no idea how to make my PRN available for all threads, because most post and example show how to have different PRN for each threads.
Thanks
Probably you just need to study the curand documentation, especially for the device API. The key to getting the same sequence for each thread is to create state for each thread (most examples do this) and then pass the same sequence number to the init function for each thread. In curand_init, the sequence of parameters is as follows:
curand_init(seed, subsequence number, offset, state)
by setting the seed for each init call the same, we generate the same sequence for each thread. by setting the subsequence and offset numbers the same, we select the same starting value within that sequence, for each thread.
Here is code to demonstrate:
// compile with: nvcc -arch=sm_20 -lcurand -o t89 t89.cu
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define SCALE 49000
#define DSIZE 5000
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__device__ float getnextrand(curandState *state){
return (float)(curand_uniform(state));
}
__device__ int getnextrandscaled(curandState *state, int scale){
return (int) scale * getnextrand(state);
}
__global__ void initCurand(curandState *state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, 0, 0, &state[idx]);
}
__global__ void testrand(curandState *state, int *a1, int *a2){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
a1[idx] = getnextrandscaled(&state[idx], SCALE);
a2[idx] = getnextrandscaled(&state[idx], SCALE);
}
int main() {
int *h_a1, *h_a2, *d_a1, *d_a2;
curandState *devState;
h_a1 = (int *)malloc(DSIZE*sizeof(int));
if (h_a1 == 0) {printf("malloc fail\n"); return 1;}
h_a2 = (int *)malloc(DSIZE*sizeof(int));
if (h_a2 == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_a1, DSIZE * sizeof(int));
cudaMalloc((void**)&d_a2, DSIZE * sizeof(int));
cudaMalloc((void**)&devState, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
cudaDeviceSynchronize();
cudaCheckErrors("kernels1");
testrand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a1, d_a2);
cudaDeviceSynchronize();
cudaCheckErrors("kernels2");
cudaMemcpy(h_a1, d_a1, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_a2, d_a2, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
printf("1st returned random value is %d\n", h_a1[0]);
printf("2nd returned random value is %d\n", h_a2[0]);
for (int i=1; i< DSIZE; i++){
if (h_a1[i] != h_a1[0]) {
printf("mismatch on 1st value at %d, val = %d\n", i, h_a1[i]);
return 1;
}
if (h_a2[i] != h_a2[0]) {
printf("mismatch on 2nd value at %d, val = %d\n", i, h_a2[i]);
return 1;
}
}
printf("thread values match!\n");
}
Related
The output of the following CUDA code
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>
__global__ void test(int *count, int *mutex) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
atomicAdd(count, 1);
__threadfence();
int const total_count = *count; // How to get all 512*256 here?
if (0 == atomicCAS(mutex, 0, 1)) {
printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
__FILE__, __LINE__, __func__, idx, total_count);
}
}
int main() {
thrust::device_vector<int> data(2, 0);
test<<<512, 256>>>(thrust::raw_pointer_cast(data.data()),
thrust::raw_pointer_cast(data.data()) + 1);
std::vector<int> host_data(2);
thrust::copy(data.begin(), data.end(), host_data.begin());
printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
host_data[0], host_data[1]);
return 0;
}
is
test.cu:15 test Thread idx(17568) got the mutex. total_count = 56320
test.cu:25 main host_data=(131072,1)
The total_count may differ. My goal is for all 131072=512*256 threads to call
atomicAdd(count ,1);
and for it to subsequently read it back from *count which should then equal 131072 (number of all launched threads.)
Question
What is the best way to read *count after all threads have incremented it?
__threadfence() doesn't seem to do it, since the value is read back before all threads have incremented it.
I know about cooperative_groups but that seems a bit heavy for something I would have expected to be more elementary. Also, even if cooperative_groups were used, would that limit the total number of blocks launched to some hardware-determined limit?
Btw this is a version that works w/ cooperative_groups, with the restricted number of blocks. (Thanks to #paleonix for the bugfix to kernel_args.)
#include <cooperative_groups.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>
__global__ void test(int *count, int *mutex) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
atomicAdd(count, 1);
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
grid.sync();
//__threadfence();
int const total_count = *count; // How to get all 512*256 here?
if (0 == atomicCAS(mutex, 0, 1)) {
printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
__FILE__, __LINE__, __func__, idx, total_count);
}
}
int main() {
thrust::device_vector<int> data(2, 0);
void * args[] { thrust::raw_pointer_cast(data.data()),
thrust::raw_pointer_cast(data.data()) + 1 };
void * kernel_args[] { &args[0], &args[1] };
int dev = 0;
int supportsCoopLaunch = 0;
cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
printf("%s:%u %s supportsCoopLaunch=%d\n", __FILE__, __LINE__, __func__, supportsCoopLaunch);
// This will launch a grid that can maximally fill the GPU, on the default stream with kernel arguments
int numBlocksPerSm = 0;
// Number of threads my_kernel will be launched with
int numThreads = 128;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, test, numThreads, 0);
dim3 dimBlock(numThreads, 1, 1);
dim3 dimGrid(deviceProp.multiProcessorCount*numBlocksPerSm, 1, 1);
printf("%s:%u %s deviceProp.multiProcessorCount=%d numBlocksPerSm=%d\n", __FILE__, __LINE__, __func__, deviceProp.multiProcessorCount, numBlocksPerSm);
cudaLaunchCooperativeKernel((void*)test, dimGrid, dimBlock, kernel_args);
std::vector<int> host_data(2);
thrust::copy(data.begin(), data.end(), host_data.begin());
printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
host_data[0], host_data[1]);
return 0;
}
I have code :
#define int4 unsigned long long int
int4 mer_thread = tex2D(STexture, col, row);
printf("\nTexture[%d][%d] = %d", row, col, tex2D(STexture, col, row));
Error "error : no instance of overloaded function "tex2D" matches the argument list"
but if define int4 unsigned long int, it work fine.
My code creat texture:
void Creat_TexttureS(int4 _S[nmax][NMAX])
{
cudaArray* carray;
cudaChannelFormatDesc channel;
channel = cudaCreateChannelDesc<int4>();
cudaMallocArray(&carray, &channel, NMAX, nmax);
cudaMemcpyToArray(carray, 0, 0, _S, sizeof(int4)*NMAX*nmax, cudaMemcpyHostToDevice);
STexture.filterMode = cudaFilterModePoint;
STexture.addressMode[0] = cudaAddressModeWrap;
STexture.addressMode[1] = cudaAddressModeClamp;
cudaBindTextureToArray(STexture, carray);
}
Thanks for your help !!
Below is a worked example that demonstrates the storing of data of type long long int in a 2D texture of type int2, then how to retrieve it via tex2D() and re-interpret it as long long int.
#include <stdlib.h>
#include <stdio.h>
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
__forceinline__ __device__ long long int int2_as_longlong (int2 a)
{
long long int res;
asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y));
return res;
}
texture<int2, 2, cudaReadModeElementType> tex;
__global__ void kernel (int m, int n)
{
int2 data;
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
data = tex2D (tex, col, row);
printf ("% 11lld ", int2_as_longlong (data));
}
printf ("\n");
}
}
int main (void)
{
int m = 4; // height = #rows
int n = 3; // width = #columns
size_t pitch, tex_ofs;
unsigned long long int arr[4][3]=
{{11111111LL, 11112222LL, 11113333LL},
{22221111LL, 22222222LL, 22223333LL},
{33331111LL, 33332222LL, 33333333LL},
{44441111LL, 44442222LL, 44443333LL}};
int2 *arr_d = 0;
CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
n, m, pitch));
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
printf ("printing texture content\n");
kernel<<<1,1>>>(m, n);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaUnbindTexture (tex));
CUDA_SAFE_CALL (cudaFree (arr_d));
return EXIT_SUCCESS;
}
I'm trying to check how to work with CUFFT and my code is the following
#include <iostream>
//For FFT
#include <cufft.h>
using namespace std;
typedef enum signaltype {REAL, COMPLEX} signal;
//Function to fill the buffer with random real values
void randomFill(cufftComplex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand() / (float) RAND_MAX;
h_signal[i].y = 0;
}
}
}
//Printing the random data in the buffer
void printData(cufftComplex *a, int size, char *msg) {
if (strcmp(msg,"")==0) printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
}
// FFT a signal that's on the _DEVICE_.
// Doing FFT
void signalFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan FFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS)
{
printf ("Failed Executing FFT\n");
exit(0);
}
}
// Doing IFFT
void signalIFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan IFFT\n");
exit(0);
}
// Execute the plan
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS)
{
printf ("Failed Executing IFFT\n");
exit(0);
}
}
int main(int argc, char **argv)
{
cudaDeviceSynchronize();
//Declaring two complex type variables;
cufftComplex *h_signal, *d_signal1;
//Declaring the size variable
int alloc_size;
alloc_size = 16;
//Allocating the memory for CPU version complex variable
h_signal = (cufftComplex *) malloc(sizeof(cufftComplex) * alloc_size);
//Allocating the memory for GPU version complex variable
cudaMalloc(&d_signal1, sizeof(cufftComplex) * alloc_size);
// Add random data to signal.
randomFill(h_signal, alloc_size, REAL);
printData(h_signal, alloc_size, "Random H1");
// Copying the data the data to CUDA
cudaMemcpy(d_signal1, h_signal, sizeof(cufftComplex) * alloc_size, cudaMemcpyHostToDevice);
//Applying FFT
signalFFT(d_signal1, alloc_size);
//Doing IFFT
signalIFFT(d_signal1, alloc_size);
cudaMemcpy(h_signal, d_signal1, sizeof(cufftComplex) * alloc_size, cudaMemcpyDeviceToHost);
printData(h_signal, alloc_size, "IFFT");
return 0;
}
And the MAKEFILE consists of the following:
main: main.cu Makefile nvcc -o main main.cu --ptxas-options=-v --use_fast_math
But I get compilation errors, the errors are as shown in the image:
Apparently the problem is occurring only when I call the functions cufftPlan1d and cufftExecC2C. Do I have to add anything extra in the makefile to make use of these functions? My CUDA version 5.5 and I'm doing it in Ubuntu.
Thanks
There are two problems here
The CUFFT library is not being linked. Change the compilation command to:
nvcc -o main main.cu --ptxas-options=-v --use_fast_math -lcufft
Set LD_LIBRARY_PATH to include the absolute path to the CUFFT library to allow runtime loading of the shared library. The syntax for this can be found here.
[This answer has been assembled from comments and added as a community wiki entry to get this question off the unanswered queue for the CUDA tag]
I am writing a c++ cuda program. I have a very simple struct:
struct A
{
int size;
float* tab;
}
and a kernel:
__global__ void Kernel(A* res, int n,args*) //
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n)
{
res[i] = AGenerator::Generate(args[i]);
}
}
Where AGenerator::Generate creates the A object and fills the tab array. What happens here is that when the results are send to the host the tab pointer is invalid. To prevent this I will need to apply the Rule of three to this class. Since there would be many classes like this I would like to avoid writing too many additional code.
I made the research and found that there is a thrust library which has device_vector and host_vector which will probably help with my problem but the thing is that I want the struct A and similar structs to be callable from both host and device so the device and host_vector are not good for this purpose. Is there any struct I can use to approach this?
EDIT
I found that passing the struct by value will help me but since performance is quite important it doesn't seem like a good solution.
Here is a rough outline of what I had in mind for a custom allocator and pool that would hide some of the mechanics of using a class both on the host and the device.
I don't consider it to be a paragon of programming excellence. It is merely intended to be a rough outline of the steps that I think would be involved. I'm sure there are many bugs. I didn't include it, but I think you would want a public method that would get the size as well.
#include <iostream>
#include <assert.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef float mytype;
__device__ unsigned int pool_allocated = 0;
__device__ unsigned int pool_size = 0;
__device__ mytype *pool = 0;
__device__ unsigned int pool_reserve(size_t size){
assert((pool_allocated+size) < pool_size);
unsigned int offset = atomicAdd(&pool_allocated, size);
assert (offset < pool_size);
return offset;
}
__host__ void init_pool(size_t psize){
mytype *temp;
unsigned int my_size = psize;
cudaMalloc((void **)&temp, psize*sizeof(mytype));
cudaCheckErrors("init pool cudaMalloc fail");
cudaMemcpyToSymbol(pool, &temp, sizeof(mytype *));
cudaCheckErrors("init pool cudaMemcpyToSymbol 1 fail");
cudaMemcpyToSymbol(pool_size, &my_size, sizeof(unsigned int));
cudaCheckErrors("init pool cudaMemcpyToSymbol 2 fail");
}
class A{
public:
mytype *data;
__host__ __device__ void pool_allocate_and_copy() {
assert(d_data == 0);
assert(size != 0);
#ifdef __CUDA_ARCH__
unsigned int offset = pool_reserve(size);
d_data = pool + offset;
memcpy(d_data, data, size*sizeof(mytype));
#else
cudaMalloc((void **)&d_data, size*sizeof(mytype));
cudaCheckErrors("pool_allocate_and_copy cudaMalloc fail");
cudaMemcpy(d_data, data, size*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("pool_allocate_and_copy cudaMemcpy fail");
#endif /* __CUDA_ARCH__ */
}
__host__ __device__ void update(){
#ifdef __CUDA_ARCH__
assert(data != 0);
data = d_data;
assert(data != 0);
#else
if (h_data == 0) h_data = (mytype *)malloc(size*sizeof(mytype));
data = h_data;
assert(data != 0);
cudaMemcpy(data, d_data, size*sizeof(mytype), cudaMemcpyDeviceToHost);
cudaCheckErrors("update cudaMempcy fail");
#endif
}
__host__ __device__ void allocate(size_t asize) {
assert(data == 0);
data = (mytype *)malloc(asize*sizeof(mytype));
assert(data != 0);
#ifndef __CUDA_ARCH__
h_data = data;
#endif
size = asize;
}
__host__ __device__ void copyobj(A *obj){
assert(obj != 0);
#ifdef __CUDA_ARCH__
memcpy(this, obj, sizeof(A));
#else
cudaMemcpy(this, obj, sizeof(A), cudaMemcpyDefault);
cudaCheckErrors("copy cudaMempcy fail");
#endif
this->update();
}
__host__ __device__ A();
private:
unsigned int size;
mytype *d_data;
mytype *h_data;
};
__host__ __device__ A::A(){
data = 0;
d_data = 0;
h_data = 0;
size = 0;
}
__global__ void mykernel(A obj, A *res){
A mylocal;
mylocal.copyobj(&obj);
A mylocal2;
mylocal2.allocate(24);
mylocal2.data[0]=45;
mylocal2.pool_allocate_and_copy();
res->copyobj(&mylocal2);
printf("kernel data %f\n", mylocal.data[0]);
}
int main(){
A my_obj;
A *d_result, h_result;
my_obj.allocate(32);
my_obj.data[0] = 12;
init_pool(1048576);
my_obj.pool_allocate_and_copy();
cudaMalloc((void **)&d_result, sizeof(A));
cudaCheckErrors("main cudaMalloc fail");
mykernel<<<1,1>>>(my_obj, d_result);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
h_result.copyobj(d_result);
printf("host data %f\n", h_result.data[0]);
return 0;
}
I am pretty sure that the direction of the question and related comments are ill fated. Device memory and host memory are totally different things, both conceptually and physically. Pointers just don't carry over!
Please go back to step 1 and learn about copying values between host and device by reading the reference manual and the progamming guide for more details.
To get a more precise answer to your question please show how those A structs are allocated on the device including the allocation of those tab floats. Also please show how AGenerator::Generate somehow manipulates those tabs in a meaningful way. My best bet is that you are working with unallocated device memory here and that you should probably use a preallocated array of floats and indizes into the array instead of device pointers here. Those indices would then carry over to the host gracefully.
#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
__global__ void setVal(char **c){
c[(blockIdx.y * gridDim.x) + blockIdx.x] = "hello\0";
}
int main(){
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, i revised my code to make my concept correct. But, the code is still not working :(. Any help will be appreciated
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
__global__ void setValues(char** word)
{
volatile char* myWord = word[blockIdx.x];
myWord[0] = 'H';
myWord[1] = 'o';
myWord[2] = 'l';
myWord[3] = 'a';
myWord[4] = '\0';
}
int main()
{
const size_t bufferSize = 32;
const int nObjects = 10;
char* h_x[nObjects];
char** d_x = 0;
cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) );
for ( int i=0; i < nObjects; i++ )
{
h_x[i] = NULL;
cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) );
printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
}
cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice);
printf("Copied h_x[] to d_x[]\n");
char msg[] = "Hello World!";
cudaMemcpy( h_x[0], msg, 13*sizeof(char), cudaMemcpyHostToDevice );
/* Force Thread Synchronization */
cudaError err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
setValues<<<nObjects,1>>>(d_x);
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
printf("Kernel Completed Successfully. Woot.\n\n");
char p[bufferSize];
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
cudaMemcpy( h_x, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost);
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
for ( int i=0; i < nObjects; i++ )
{
cudaMemcpy( &p, h_x[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost);
printf("%d p[] = %s\n",i,p);
}
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
getchar();
return 0;
}
As #Jon notes, you can't pass x (as you had declared) it to the GPU, because it's an address which lives on the CPU. In the code above, I create an array of char*'s and pass them to a char** which I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void setValues(char *arr){
arr[blockIdx.y * gridDim.x + blockIdx.x] = '4';
}
int main() {
const int NCHARS=6;
char *xd;
cudaMalloc(&xd, NCHARS);
dim3 grid(3,2);
setValues<<<grid,1>>>(xd);
char *p;
p = (char*) malloc(20*sizeof(char));
strcpy(p,"");
cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
p[NCHARS]='\0';
printf("<%s>\n", p);
getchar();
cudaFree(xd);
return 0;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.