The output of the following CUDA code
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>
__global__ void test(int *count, int *mutex) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
atomicAdd(count, 1);
__threadfence();
int const total_count = *count; // How to get all 512*256 here?
if (0 == atomicCAS(mutex, 0, 1)) {
printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
__FILE__, __LINE__, __func__, idx, total_count);
}
}
int main() {
thrust::device_vector<int> data(2, 0);
test<<<512, 256>>>(thrust::raw_pointer_cast(data.data()),
thrust::raw_pointer_cast(data.data()) + 1);
std::vector<int> host_data(2);
thrust::copy(data.begin(), data.end(), host_data.begin());
printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
host_data[0], host_data[1]);
return 0;
}
is
test.cu:15 test Thread idx(17568) got the mutex. total_count = 56320
test.cu:25 main host_data=(131072,1)
The total_count may differ from run to run. My goal is for all 131072 = 512*256 threads to call
atomicAdd(count, 1);
and for each thread to subsequently read the value back from *count, which should then equal 131072 (the number of launched threads).
Question
What is the best way to read *count after all threads have incremented it?
__threadfence() doesn't seem to do it, since the value is read back before all threads have incremented it.
I know about cooperative_groups but that seems a bit heavy for something I would have expected to be more elementary. Also, even if cooperative_groups were used, would that limit the total number of blocks launched to some hardware-determined limit?
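One lighter-weight pattern, sketched below under the assumption that it is enough for a single thread to observe the total: the thread whose atomicAdd returns n_threads - 1 performed the final increment, so it alone can reconstruct the full count from the return value, with no grid-wide barrier at all.
#include <stdio.h>
// Sketch only: just the last-arriving thread learns the total. Using the
// value returned by atomicAdd (old + 1) avoids re-reading *count entirely.
__global__ void test_last(int *count) {
    int const n_threads = gridDim.x * blockDim.x;
    int const old = atomicAdd(count, 1);
    if (old == n_threads - 1)
        printf("last thread sees total_count = %d\n", old + 1);
}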
By the way, here is a version that works with cooperative_groups, with the restricted number of blocks. (Thanks to #paleonix for the bugfix to kernel_args.)
#include <cooperative_groups.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>
__global__ void test(int *count, int *mutex) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
atomicAdd(count, 1);
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
grid.sync();
//__threadfence();
int const total_count = *count; // after grid.sync() every thread reads the full count
if (0 == atomicCAS(mutex, 0, 1)) {
printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
__FILE__, __LINE__, __func__, idx, total_count);
}
}
int main() {
thrust::device_vector<int> data(2, 0);
void * args[] { thrust::raw_pointer_cast(data.data()),
thrust::raw_pointer_cast(data.data()) + 1 };
void * kernel_args[] { &args[0], &args[1] };
int dev = 0;
int supportsCoopLaunch = 0;
cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
printf("%s:%u %s supportsCoopLaunch=%d\n", __FILE__, __LINE__, __func__, supportsCoopLaunch);
// This will launch a grid that can maximally fill the GPU, on the default stream with kernel arguments
int numBlocksPerSm = 0;
// Number of threads the test kernel will be launched with
int numThreads = 128;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, test, numThreads, 0);
dim3 dimBlock(numThreads, 1, 1);
dim3 dimGrid(deviceProp.multiProcessorCount*numBlocksPerSm, 1, 1);
printf("%s:%u %s deviceProp.multiProcessorCount=%d numBlocksPerSm=%d\n", __FILE__, __LINE__, __func__, deviceProp.multiProcessorCount, numBlocksPerSm);
cudaLaunchCooperativeKernel((void*)test, dimGrid, dimBlock, kernel_args);
std::vector<int> host_data(2);
thrust::copy(data.begin(), data.end(), host_data.begin());
printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
host_data[0], host_data[1]);
return 0;
}
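One practical note on the cooperative version: cudaLaunchCooperativeKernel returns an error code (for example cudaErrorCooperativeLaunchTooLarge when dimGrid exceeds the number of blocks that can be co-resident, which is exactly the hardware-determined limit asked about above), so it is worth checking. A minimal sketch:
    cudaError_t err = cudaLaunchCooperativeKernel((void*)test, dimGrid, dimBlock, kernel_args);
    if (err != cudaSuccess)
        printf("%s:%u %s cooperative launch failed: %s\n", __FILE__, __LINE__, __func__, cudaGetErrorString(err));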
Related
I have a program where I generate arrays with random elements using CUDA. Since I upgraded from CUDA 9.1 to CUDA 9.2, the time it takes to do that has gone up from a fraction of a second (about 0.1 s) to almost two minutes, without any change to the code. The problem seems to be the curand_init() function, as the rest runs at about the same speed. Was there a change I missed in the library, is this a bug, or is it a problem with my code?
This is an example
#include <iostream>
#include <ctime>   // time(), used to seed setup_curand_state
#include <curand.h>
#include <curand_kernel.h>
#define cudaErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
std::cerr << "cudaAssert: " << cudaGetErrorString(code) << " " << file << ": " << line << std::endl;
if (abort) exit(code);
}
}
__global__
void setup_curand_state (curandState *state, int seed, int dim)
{
int index = threadIdx.x+blockDim.x*blockIdx.x;
if (index < dim)
curand_init(seed, index, 0, &state[index]);
}
__global__
void set_random (float* to, curandState* curand_state, int dim)
{
int index = threadIdx.x+ blockIdx.x* blockDim.x;
if (index < dim)
to [index] = curand_normal (&curand_state[index]);
}
int main () {
int dim = 100000;
float *data;
cudaErrchk (cudaMallocManaged ((void**) &data, dim * sizeof (float)));
curandState* curand_state;
cudaErrchk (cudaMalloc (&curand_state, (dim * sizeof (curandState))));
setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
cudaErrchk (cudaDeviceSynchronize());
set_random <<<(dim + 1023) / 1024, 1024>>> (data, curand_state, dim);
cudaErrchk (cudaDeviceSynchronize());  // wait for set_random before freeing
cudaFree (curand_state);
cudaFree (data);
return 0;
}
Answered by mrBonobo in the comment above:
Apparently, updating CUDA through apt silently broke the install. Code
compiled for 9.1 would still work, but around 100 to 1000 times slower.
Reinstalling nvidia-cuda-toolkit solved the error.
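Independent of the install issue: curand_init with a distinct subsequence per thread (as in setup_curand_state above) performs an expensive skip-ahead for each state. When setup time dominates, a common trade-off is to vary the seed instead and keep the subsequence at 0, which is much cheaper at the cost of weaker statistical-independence guarantees between the per-thread streams. A hedged sketch of that variant:
__global__ void setup_curand_state_fast (curandState *state, int seed, int dim)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    if (index < dim)
        // per-thread seed, subsequence 0: no skip-ahead, weaker guarantees
        curand_init(seed + index, 0, 0, &state[index]);
}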
Need some help with pthreads. I want to keep over 1000 threads open at any time, something like a thread pool. Here is the code:
/*
gcc -o test2 test2.cpp -static -lpthread -lstdc++
*/
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>       // time(), used to seed rand()
#include <stdexcept>
#include <string>      // exec() returns std::string
#include <unistd.h>    // sleep(), usleep()
#include <pthread.h>
using namespace std;   // the code below uses unqualified string/runtime_error
int NUM_THREADS = 2000;
int MAX_THREADS = 100;
int THREADSTACK = 65536;
struct thread_struct{
int arg1;
int arg2;
};
pthread_mutex_t mutex_;
static unsigned int thread_count = 0;
string exec(const char* cmd)
{
char buffer[5000];
string result = "";
FILE* pipe = popen(cmd, "r");
if (!pipe) throw runtime_error("popen() failed!"); // a NULL pipe must not reach feof()/fgets()
try
{
while (!feof(pipe))
{
if (fgets(buffer, 128, pipe) != NULL)
{
result += buffer;
}
}
}
catch(...)
{
pclose(pipe);
throw;
}
pclose(pipe);
return result;
}
void *thread_test(void *arguments)
{
pthread_mutex_lock(&mutex_);
thread_count++;
pthread_mutex_unlock(&mutex_);
// long tid;
// tid = (long)threadid;
struct thread_struct *args = (thread_struct*)arguments;
/*
printf("ARG1=%d\n",args->arg1);
printf("ARG2=%d\n",args->arg2);
*/
int thread_id = (int) args->arg1;
/*
int random_sleep;
random_sleep = rand() % 10 + 1;
printf ("RAND=[%d]\n", random_sleep);
sleep(random_sleep);
*/
int random_sleep;
random_sleep = rand() % 10 + 5;
// printf ("RAND=[%d]\n", random_sleep);
char command[100];
memset(command,0,sizeof(command));
sprintf(command,"sleep %d",random_sleep);
exec(command);
random_sleep = rand() % 100000 + 500000;
usleep(random_sleep);
// simulation of a work between 5 and 10 seconds
// sleep(random_sleep);
// printf("#%d -> sleep=%d total_threads=%u\n",thread_id,random_sleep,thread_count);
pthread_mutex_lock(&mutex_);
thread_count--;
pthread_mutex_unlock(&mutex_);
pthread_exit(NULL);
}
int main()
{
// pthread_t threads[NUM_THREADS];
int rc;
int i;
usleep(10000);
srand ((unsigned)time(NULL));
unsigned int thread_count_now = 0;
pthread_attr_t attrs;
pthread_attr_init(&attrs);
pthread_attr_setstacksize(&attrs, THREADSTACK);
pthread_mutex_init(&mutex_, NULL);
for( i=0; i < NUM_THREADS; i++ )
{
create_thread:
pthread_mutex_lock(&mutex_);
thread_count_now = thread_count;
pthread_mutex_unlock(&mutex_);
// printf("thread_count in for = [%d]\n",thread_count_now);
if(thread_count_now < MAX_THREADS)
{
printf("CREATE thread [%d]\n",i);
struct thread_struct struct1;
struct1.arg1 = i;
struct1.arg2 = 999;
pthread_t temp_thread;
rc = pthread_create(&temp_thread, NULL, &thread_test, (void *)&struct1);
if (rc)
{
printf("Unable to create thread %d\n",rc);
sleep(1);
pthread_detach(temp_thread);
goto create_thread;
}
}
else
{
printf("Thread POOL full %d of %d\n",thread_count_now,MAX_THREADS);
sleep(1);
goto create_thread;
}
}
pthread_attr_destroy(&attrs);
pthread_mutex_destroy(&mutex_);
// pthread_attr_destroy(&attrs);
printf("Proccess completed!\n");
pthread_exit(NULL);
return 1;
}
After spawning 300 threads it begins to give errors: the return code from pthread_create() is 11, and after that it keeps executing them one by one.
What am I doing wrong?
According to this website, error code 11 corresponds to EAGAIN, which according to this means:
Insufficient resources to create another thread.
A system-imposed limit on the number of threads was encountered.
Hence, to solve your problem, either create fewer threads or wait for running ones to finish before creating new ones.
You can also change the default thread stack size; see pthread_attr_setstacksize.
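For completeness, here is a sketch of what those fixes could look like in the creation loop above (illustrative, not a drop-in patch): pass &attrs so the configured stack size is actually used, give each thread its own heap-allocated argument block (the original passes the address of a stack variable that is reused every iteration), and detach successfully created threads so their resources are reclaimed when they finish.
    struct thread_struct *args = (struct thread_struct *)malloc(sizeof(*args));
    args->arg1 = i;
    args->arg2 = 999;
    pthread_t temp_thread;
    rc = pthread_create(&temp_thread, &attrs, &thread_test, args);
    if (rc == 0)
        pthread_detach(temp_thread); /* finished threads now release resources */
    else
        free(args);                  /* creation failed; the thread won't free it */
    /* ...and thread_test should free(arguments) before pthread_exit(NULL). */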
New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
typedef struct password_t{
char word[56];
size_t length;
} password;
typedef struct libEntry_t{
uint8_t digest[16];
password pwd;
} libEntry;
// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
// numPwds - the number of passwords for which to generate hashes
// pwds - the list of passwords to hash
// library - the array in which to store the unhashed/hashed password library
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
// __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t hashed[16];
if (index < numPwds) {
cuda_md5(&pwds[index], hashed);
for (int j = 0; j < 16; j++) {
library[index].digest[j] = hashed[j];
}
library[index].pwd = pwds[index];
}
}
int crack_password (uint8_t* classified)
{
int count = 10;
unsigned int mem_size = sizeof(password) * count;
password *h_pwds = (password*) malloc(mem_size);
ifstream inFile("passwords.txt");
if (!inFile) {
cerr << "File passwords.txt not found." << endl;
return -1;
}
string line;
int i;
while (getline(inFile, line)) {
if (line.empty()) continue;
memcpy(h_pwds[i].word,line.c_str(),line.size());
h_pwds[i].length = line.size();
cout << "Password: " << h_pwds[i].word << "\n";
cout << "Length: " << h_pwds[i].length << "\n";
i++;
}
inFile.close();
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
int h_numPwds = i;
cout << "INT NUMPWDS: " << h_numPwds << "\n";
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
/*unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
return 0;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
fprintf(stderr, "usage: ./prog password\n");
return 1;
}
crack_password((uint8_t*) argv[1]);
cout << "Hack Password: " << argv[1] << "\n";
return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment out the cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet; I am just focusing on the memory allocation before I actually execute and debug the kernel. Any help will be appreciated!
How I have been checking for return status:
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc; apparently I didn't have to allocate or copy it at all and could have just passed it by value. So the error is due to the following lines, and I am not sure which one or why:
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT2: I cleaned everything up and still can't figure it out. With CUDA_SAFE_CALL on the following line, CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get a segmentation fault even when every other memory allocation command is commented out.
For anyone wondering what went wrong: I was able to fix it. I am not exactly sure what was wrong, but I had improper memory allocations in some places, and in other cases I didn't even need to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.
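For what it's worth, one plausible root cause is visible in the original code: i is never initialized before the getline loop, so h_pwds[i].word can be written far outside the allocation, corrupting the heap; the crash then surfaces later at an unrelated cudaMalloc or cudaMemcpy. The hedged one-line fix would be:
    int i = 0;  // was `int i;` (uninitialized); h_pwds[i] then writes out of bounds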
I was wondering what the best way is to generate one pseudorandom number between 0 and 49k that would be the same for each thread, using curand or something else.
I prefer to generate the random numbers inside the kernel because I will have to generate one at a time, about 10k times.
I could use floats between 0.0 and 1.0, but I have no idea how to make my PRN available to all threads, because most posts and examples show how to get a different PRN for each thread.
Thanks
Probably you just need to study the curand documentation, especially for the device API. The key to getting the same sequence for each thread is to create state for each thread (most examples do this) and then pass the same sequence number to the init function for each thread. In curand_init, the sequence of parameters is as follows:
curand_init(seed, subsequence number, offset, state)
By setting the seed the same for each init call, we generate the same sequence for each thread. By setting the subsequence and offset numbers the same, we select the same starting value within that sequence for each thread.
Here is code to demonstrate:
// compile with: nvcc -arch=sm_20 -lcurand -o t89 t89.cu
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define SCALE 49000
#define DSIZE 5000
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__device__ float getnextrand(curandState *state){
return (float)(curand_uniform(state));
}
__device__ int getnextrandscaled(curandState *state, int scale){
  return (int)(scale * getnextrand(state));  // cast the product, not just scale
}
__global__ void initCurand(curandState *state, unsigned long seed){
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < DSIZE)  // the grid is rounded up to a multiple of nTPB, so guard
    curand_init(seed, 0, 0, &state[idx]);
}
__global__ void testrand(curandState *state, int *a1, int *a2){
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < DSIZE){
    a1[idx] = getnextrandscaled(&state[idx], SCALE);
    a2[idx] = getnextrandscaled(&state[idx], SCALE);
  }
}
int main() {
int *h_a1, *h_a2, *d_a1, *d_a2;
curandState *devState;
h_a1 = (int *)malloc(DSIZE*sizeof(int));
if (h_a1 == 0) {printf("malloc fail\n"); return 1;}
h_a2 = (int *)malloc(DSIZE*sizeof(int));
if (h_a2 == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_a1, DSIZE * sizeof(int));
cudaMalloc((void**)&d_a2, DSIZE * sizeof(int));
cudaMalloc((void**)&devState, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
cudaDeviceSynchronize();
cudaCheckErrors("kernels1");
testrand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a1, d_a2);
cudaDeviceSynchronize();
cudaCheckErrors("kernels2");
cudaMemcpy(h_a1, d_a1, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_a2, d_a2, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
printf("1st returned random value is %d\n", h_a1[0]);
printf("2nd returned random value is %d\n", h_a2[0]);
for (int i=1; i< DSIZE; i++){
if (h_a1[i] != h_a1[0]) {
printf("mismatch on 1st value at %d, val = %d\n", i, h_a1[i]);
return 1;
}
if (h_a2[i] != h_a2[0]) {
printf("mismatch on 2nd value at %d, val = %d\n", i, h_a2[i]);
return 1;
}
}
printf("thread values match!\n");
}
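Since every per-thread state is identical anyway, an alternative sketch (hypothetical names, same SCALE/DSIZE setup as above, one curandState per block initialized exactly like the per-thread states) is to let thread 0 of each block draw the value and broadcast it through shared memory; every thread in the grid then sees the same number while storing far fewer states:
__global__ void testrand_broadcast(curandState *blockState, int *out){
  __shared__ int value;
  if (threadIdx.x == 0)  // one draw per block; identical states give identical draws
    value = (int)(SCALE * curand_uniform(&blockState[blockIdx.x]));
  __syncthreads();
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < DSIZE)
    out[idx] = value;  // same value for every thread in the grid
}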
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
__global__ void setVal(char **c){
c[(blockIdx.y * gridDim.x) + blockIdx.x] = "hello\0";
}
int main(){
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, I revised my code to make the concept correct. But the code is still not working :(. Any help will be appreciated.
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
__global__ void setValues(char** word)
{
volatile char* myWord = word[blockIdx.x];
myWord[0] = 'H';
myWord[1] = 'o';
myWord[2] = 'l';
myWord[3] = 'a';
myWord[4] = '\0';
}
int main()
{
const size_t bufferSize = 32;
const int nObjects = 10;
char* h_x[nObjects];
char** d_x = 0;
cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) );
for ( int i=0; i < nObjects; i++ )
{
h_x[i] = NULL;
cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) );
printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
}
cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice);
printf("Copied h_x[] to d_x[]\n");
char msg[] = "Hello World!";
cudaMemcpy( h_x[0], msg, 13*sizeof(char), cudaMemcpyHostToDevice );
/* Force Thread Synchronization */
cudaError err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
setValues<<<nObjects,1>>>(d_x);
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
printf("Kernel Completed Successfully. Woot.\n\n");
char p[bufferSize];
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
cudaMemcpy( h_x, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost);
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
for ( int i=0; i < nObjects; i++ )
{
cudaMemcpy( &p, h_x[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost);
printf("%d p[] = %s\n",i,p);
}
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
getchar();
return 0;
}
As #Jon notes, you can't pass x (as you had declared it) to the GPU, because it's an address that lives on the CPU. In the code above, I create an array of char*'s and pass them to a char** which I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void setValues(char *arr){
arr[blockIdx.y * gridDim.x + blockIdx.x] = '4';
}
int main() {
const int NCHARS=6;
char *xd;
cudaMalloc(&xd, NCHARS);
dim3 grid(3,2);
setValues<<<grid,1>>>(xd);
char *p;
p = (char*) malloc(20*sizeof(char));
strcpy(p,"");
cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
p[NCHARS]='\0';
printf("<%s>\n", p);
getchar();
cudaFree(xd);
return 0;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.
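A minimal sketch of those two fixes, with hypothetical names (d_arr is assumed to be a char** array already allocated on the device with cudaMalloc):
    // Copy the 2-byte string "4" (character plus NUL) into a device buffer,
    // then store that device pointer into slot 0 of the device pointer array.
    char *d_str = NULL;
    cudaMalloc((void**)&d_str, 2);
    cudaMemcpy(d_str, "4", 2, cudaMemcpyHostToDevice);
    cudaMemcpy(&d_arr[0], &d_str, sizeof(char*), cudaMemcpyHostToDevice);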