Segmentation fault on cudaMalloc or cudaMemcpy - c++

New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
typedef struct password_t{
char word[56];
size_t length;
} password;
typedef struct libEntry_t{
uint8_t digest[16];
password pwd;
} libEntry;
// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
// numPwds - the number of passwords for which to generate hashes
// pwds - the list of passwords to hash
// library - the array in which to store the unhashed/hashed password library
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
// __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t hashed[16];
if (index < numPwds) {
cuda_md5(&pwds[index], hashed);
for (int j = 0; j < 16; j++) {
library[index].digest[j] = hashed[j];
}
library[index].pwd = pwds[index];
}
}
int crack_password (uint8_t* classified)
{
int count = 10;
unsigned int mem_size = sizeof(password) * count;
password *h_pwds = (password*) malloc(mem_size);
ifstream inFile("passwords.txt");
if (!inFile) {
cerr << "File passwords.txt not found." << endl;
return -1;
}
string line;
int i;
while (getline(inFile, line)) {
if (line.empty()) continue;
memcpy(h_pwds[i].word,line.c_str(),line.size());
h_pwds[i].length = line.size();
cout << "Password: " << h_pwds[i].word << "\n";
cout << "Length: " << h_pwds[i].length << "\n";
i++;
}
inFile.close();
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
int h_numPwds = i;
cout << "INT NUMPWDS: " << h_numPwds << "\n";
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
/*unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
return 0;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
fprintf(stderr, "usage: ./prog password\n");
return 1;
}
crack_password((uint8_t*) argv[1]);
cout << "Hack Password: " << argv[1] << "\n";
return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!
How I have been checking for return status:
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT2: I cleaned up everything and still can't figure it out. By having CUDA_SAFE_CALL on the following line CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get segmentation fault even when every other memory allocation command is commented out.

For someone wondering what went wrong, I was able to fix it. I am not exactly sure what exactly was wrong but I had improper memory allocations at some places and in other cases I didn't even needed to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.

Related

an illegal memory access was encountered

I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>
#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
#define CUDADEBUG(cudaError) \
if (cudaError != cudaSuccess) \
DEBUG(cudaGetErrorString(cudaError));
#define ERROR(str) \
{ \
DEBUG(str); \
exit(1); \
}
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
{
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned icn = i * cn;
printf("%u\n", i);
if (i < total)
{
float result = pimage[icn + 0] * .114 +
pimage[icn + 1] * .587 +
pimage[icn + 2] * .299;
pimage[icn + 0] = result; //B
pimage[icn + 1] = result; //G
pimage[icn + 2] = result; //R
// pimage[icn + 3] *= result; //A
}
}
int main(int argc, char **argv)
{
if (argc != 3)
ERROR("usage: executable in out");
cv::Mat image;
unsigned char *dimage;
image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
if (!image.data)
ERROR("Image null");
if (image.empty())
ERROR("Image empty");
if (!image.isContinuous())
ERROR("image is not continuous");
const size_t N = image.total();
const int cn = image.channels();
const size_t numOfElems = cn * N;
const int blockSize = 512;
const int gridSize = (N - 1) / blockSize + 1;
CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
if (errAsync != cudaSuccess)
std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
CUDADEBUG(cudaFree(dimage)); //line 74
cv::imwrite(argv[2], image);
return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked CV_VERSION macro which is 4.5.3-dev, and Cuda Toolkit 11.4 is installed (nvcc version 11.4). Also afaik, the kernel does not execute at all (I used Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
As mentioned in a comment, your GPU function takes arguments by references.
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
This is bad, passing a reference to a function means more or less that you're passing an address where you can find the value, not the value itself.
In your situation those values are in memory used by Host, NOT Device/GPU memory, when GPU tries to access those values it will most likely crash.
The types you are trying to pass, unsigned char*, int and size_t are very cheap to copy, there's no need to pass them by reference in the 1st place.
__global__ void makeGrey(
unsigned char *pimage,
const int cn,
const size_t total)
There are tools provided by nvidia to debug CUDA applications, but I'm not really familiar with them, you can also use printf inside GPU functions, but you will have to organize output from potentially thousand of threads.
In general, whenever you call GPU functions, be very cautious about what you're passing as parameters, as they need to be passed from Host memory to Device memory. Usually you want to pass everything by value, any pointers need to point to Device memory, and watch out from references.

cuda 9.2 curand_init extremely slow

I have a program where I generate arrays with random elements using cuda. Since I upgraded from cuda 9.1 to cuda 9.2, the time it takes do that has gone up from a fraction of a second (about 0.1s) to almost two minutes (without changing any of the code). The problem seems to be the curand_init() function, as the rest is running at about the same speed. Was there a change I missed in the library, is this a bug or is it a problem with my code?
This is an example
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#define cudaErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
std::cerr << "cudaAssert: " << cudaGetErrorString(code) << " " << file << ": " << line << std::endl;
if (abort) exit(code);
}
}
__global__
void setup_curand_state (curandState *state, int seed, int dim)
{
int index = threadIdx.x+blockDim.x*blockIdx.x;
if (index < dim)
curand_init(seed, index, 0, &state[index]);
}
__global__
void set_random (float* to, curandState* curand_state, int dim)
{
int index = threadIdx.x+ blockIdx.x* blockDim.x;
if (index < dim)
to [index] = curand_normal (&curand_state[index]);
}
int main () {
int dim = 100000;
float *data;
cudaErrchk (cudaMallocManaged ((void**) &data, dim * sizeof (float)));
curandState* curand_state;
cudaErrchk (cudaMalloc (&curand_state, (dim * sizeof (curandState))));
setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
cudaErrchk (cudaDeviceSynchronize());
set_random <<<(dim + 1023) / 1024, 1024>>> (data, curand_state, dim);
cudaFree (data);
return 0;
}
Answered by mrBonobo in the comment above:
Apparently, updating cuda through apt silently broke the install. Code
compiled for 9.1 would still work, but around 100/1000 times slower.
Reinstalling nvidia-cuda-toolkit solved the error

cudaMemcpy doesn't work

In the following code there's the cudaMemcpy not working, it returns an error, and the program exits. What can be the problem? It doesn't seem to me I'm doing something illegal, and the size of the vectors seem fine to me.
It might be possible the algorithm does something wrong at some point but the idea is correct I guess. The code is to sum n numbers by doing some partial sums in parallel, and then re-iterate.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
__device__ int aug_vec(int *vec, const int& i, const int& size) {
return (i >= size) ? 0 : vec[i];
}
__global__ void sumVectorElements(int *vec,const int& size) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x);
vec[i] = aug_vec(vec, 2*i, size) + aug_vec(vec, 2 * i + 1, size);
}
__host__ int parallel_sum(int *vec,const int& size) {
cudaError_t err;
int *d_vec, *cp_vec;
int n_threads = (size >> 1) + (size & 1);
cp_vec = new int[size];
err = cudaMalloc((void**)&d_vec, size * sizeof(int));
if (err != cudaSuccess) {
std::cout << "error in cudaMalloc!" << std::endl;
exit(1);
}
err = cudaMemcpy(d_vec, vec, size*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy!" << std::endl;
exit(1);
}
int curr_size = size;
while (curr_size > 1) {
std::cout << "size = " << curr_size << std::endl;
sumVectorElements<<<1,n_threads>>>(d_vec, curr_size);
curr_size = (curr_size >> 1) + (curr_size & 1);
}
err = cudaMemcpy(cp_vec, d_vec, size*sizeof(int), cudaMemcpyDeviceToHost); //THIS LINE IS THE PROBLEM!
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy" << std::endl;
exit(1);
}
err = cudaFree(d_vec);
if (err != cudaSuccess) {
std::cout << "error in cudaFree" << std::endl;
exit(1);
}
int rval = cp_vec[0];
delete[] cp_vec;
return rval;
}
int main(int argc, char **argv) {
const int n_blocks = 1;
const int n_threads_per_block = 12;
int vec[12] = { 0 };
for (auto i = 0; i < n_threads_per_block; ++i) vec[i] = i + 1;
int sum = parallel_sum(vec, n_threads_per_block);
std::cout << "Sum = " << sum << std::endl;
system("pause");
return 0;
}
The cudaMemcpy operation after the kernel is actually asynchronously reporting an error that is due to the kernel execution. Your error reporting is primitive. If you have an error code, you may get more useful information by printing out the result of passing that error code to cudaGetErrorString().
The error is occurring in the kernel due to use of the reference argument:
__global__ void sumVectorElements(int *vec,const int& size) {
^^^^^^^^^^^^^^^
Any argument you pass to a kernel and expect to be usable in kernel code must refer to data that is passed by value, or else data that is accessible/referenceable from device code. For example, passing a host pointer to device code is generally not legal in CUDA, because an attempt to dereference a host pointer in device code will fail.
The exceptions to the above would be data/pointers/references that are accessible in device code. Unified memory and pinned/mapped data are two examples, neither of which are being used here.
As a result, the reference parameter involves a reference (an address, basically) for a an item (size) in host memory. When the kernel code attempts to use this item, it must first de-reference it. The dereferenceing of a host item in device code is illegal in CUDA (unless using UM or pinned memory).
The solution in this case is simple: convert to an ordinary pass-by-value situation:
__global__ void sumVectorElements(int *vec,const int size) ...
^
remove ampersand

Cuda error undefined reference to 'cufftPlan1d'?

I'm trying to check how to work with CUFFT and my code is the following
#include <iostream>
//For FFT
#include <cufft.h>
using namespace std;
typedef enum signaltype {REAL, COMPLEX} signal;
//Function to fill the buffer with random real values
void randomFill(cufftComplex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand() / (float) RAND_MAX;
h_signal[i].y = 0;
}
}
}
//Printing the random data in the buffer
void printData(cufftComplex *a, int size, char *msg) {
if (strcmp(msg,"")==0) printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
}
// FFT a signal that's on the _DEVICE_.
// Doing FFT
void signalFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan FFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS)
{
printf ("Failed Executing FFT\n");
exit(0);
}
}
// Doing IFFT
void signalIFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan IFFT\n");
exit(0);
}
// Execute the plan
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS)
{
printf ("Failed Executing IFFT\n");
exit(0);
}
}
int main(int argc, char **argv)
{
cudaDeviceSynchronize();
//Declaring two complex type variables;
cufftComplex *h_signal, *d_signal1;
//Declaring the size variable
int alloc_size;
alloc_size = 16;
//Allocating the memory for CPU version complex variable
h_signal = (cufftComplex *) malloc(sizeof(cufftComplex) * alloc_size);
//Allocating the memory for GPU version complex variable
cudaMalloc(&d_signal1, sizeof(cufftComplex) * alloc_size);
// Add random data to signal.
randomFill(h_signal, alloc_size, REAL);
printData(h_signal, alloc_size, "Random H1");
// Copying the data the data to CUDA
cudaMemcpy(d_signal1, h_signal, sizeof(cufftComplex) * alloc_size, cudaMemcpyHostToDevice);
//Applying FFT
signalFFT(d_signal1, alloc_size);
//Doing IFFT
signalIFFT(d_signal1, alloc_size);
cudaMemcpy(h_signal, d_signal1, sizeof(cufftComplex) * alloc_size, cudaMemcpyDeviceToHost);
printData(h_signal, alloc_size, "IFFT");
return 0;
}
And the MAKEFILE consists of the following:
main: main.cu Makefile nvcc -o main main.cu --ptxas-options=-v --use_fast_math
But I get compilation errors, the errors are as shown in the image:
Apparently the problem is occurring only when I call the functions cufftPlan1d and cufftExecC2C. Do I have to add anything extra in the makefile to make use of these functions? My CUDA version 5.5 and I'm doing it in Ubuntu.
Thanks
There are two problems here
The CUFFT library is not being linked. Change the compilation command to:
nvcc -o main main.cu --ptxas-options=-v --use_fast_math -lcufft
Set LD_LIBRARY_PATH to include the absolute path to the CUFFT library to allow runtime loading of the shared library. The syntax for this can be found here.
[This answer has been assembled from comments and added as a community wiki entry to get this question off the unanswered queue for the CUDA tag]

Output of cuda program is not what was expected

#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
__global__ void setVal(char **c){
c[(blockIdx.y * gridDim.x) + blockIdx.x] = "hello\0";
}
int main(){
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, i revised my code to make my concept correct. But, the code is still not working :(. Any help will be appreciated
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
__global__ void setValues(char** word)
{
volatile char* myWord = word[blockIdx.x];
myWord[0] = 'H';
myWord[1] = 'o';
myWord[2] = 'l';
myWord[3] = 'a';
myWord[4] = '\0';
}
int main()
{
const size_t bufferSize = 32;
const int nObjects = 10;
char* h_x[nObjects];
char** d_x = 0;
cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) );
for ( int i=0; i < nObjects; i++ )
{
h_x[i] = NULL;
cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) );
printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
}
cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice);
printf("Copied h_x[] to d_x[]\n");
char msg[] = "Hello World!";
cudaMemcpy( h_x[0], msg, 13*sizeof(char), cudaMemcpyHostToDevice );
/* Force Thread Synchronization */
cudaError err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
setValues<<<nObjects,1>>>(d_x);
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
printf("Kernel Completed Successfully. Woot.\n\n");
char p[bufferSize];
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
cudaMemcpy( h_x, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost);
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
for ( int i=0; i < nObjects; i++ )
{
cudaMemcpy( &p, h_x[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost);
printf("%d p[] = %s\n",i,p);
}
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
getchar();
return 0;
}
As #Jon notes, you can't pass x (as you had declared) it to the GPU, because it's an address which lives on the CPU. In the code above, I create an array of char*'s and pass them to a char** which I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void setValues(char *arr){
arr[blockIdx.y * gridDim.x + blockIdx.x] = '4';
}
int main() {
const int NCHARS=6;
char *xd;
cudaMalloc(&xd, NCHARS);
dim3 grid(3,2);
setValues<<<grid,1>>>(xd);
char *p;
p = (char*) malloc(20*sizeof(char));
strcpy(p,"");
cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
p[NCHARS]='\0';
printf("<%s>\n", p);
getchar();
cudaFree(xd);
return 0;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.