CUDA cudaMemcpy Segmentation fault when copying array of object pointers - c++

I am trying to move an array of pointers to the device, where each pointer is pointing to a class object. However, I get a Segmentation fault at the line using cudaMemcpy. I am trying to follow the lines used in this post.
main.cu
#include "testclass.cuh"
#include <iostream>
// Debug kernel: prints the `hello` field of the Test object that `test`
// points to, then constructs a second Test in device code and prints that
// object's field as well. `test` is dereferenced inside the kernel.
__global__ void printtest(Test* test){
    printf("HELLO FROM CUDA\n");

    const int received = test->hello;
    printf("CUDA1 : %i\n", received);

    // Built on the device through the __host__ __device__ constructor.
    const Test builtOnDevice(6);
    printf("CUDA2 : %i\n", builtOnDevice.hello);

    printf("BYEEE FROM CUDA\n");
}
// Host driver for two experiments: (1) copy one Test object to the device and
// print it from the printtest kernel; (2) try the same through an array of
// Test pointers, which is where the reported crash occurs.
// Nothing allocated here (new / cudaMalloc) is released before returning.
int main(){
printf("hello\n");
Test* test = new Test(512);
printf("CPU : %i\n", test->hello);
// devtest holds a device address; it is only passed to CUDA calls and to the
// kernel, never dereferenced by host code.
Test* devtest;
cudaMalloc(&devtest, sizeof(Test));
// The error report below uses __LINE__-3 to refer back to this cudaMemcpy line.
cudaError_t err = cudaMemcpy(devtest, test, sizeof(Test), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Error %s at line %d in file %s\n",
cudaGetErrorString(err), __LINE__-3, __FILE__);
}
printtest<<<1, 1>>>(devtest);
cudaDeviceSynchronize();
printf("hello2\n");
// test3: host array of two host pointers, each to a host-side Test.
Test** test3 = new Test*[2];
test3[0] = new Test(12299);
test3[1] = new Test(234923);
printf("CPU : %i\n", test3[0]->hello);
// devtest3 receives a device address: the two Test* slots it points to live
// in device memory.
Test** devtest3;
cudaMalloc(&devtest3, 2*sizeof(Test*));
printf("CPU2\n");
// BUG (the reported segfault): devtest3[0] is evaluated by host code, i.e. the
// CPU reads through a device address. Independently of that, no device storage
// for a Test object has been allocated at this point - only the two pointer
// slots - so there is no valid destination for the copy.
err = cudaMemcpy(devtest3[0], test3[0], sizeof(Test), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Error %s at line %d in file %s\n",
cudaGetErrorString(err), __LINE__-3, __FILE__);
}
printf("CPU3\n");
// Same host-side dereference of the device pointer as in the copy above.
printtest<<<1, 1>>>(devtest3[0]);
cudaDeviceSynchronize();
}
testclass.cu
#include "testclass.cuh"
// Stores `in` in the public `hello` field. Compiled for both host and device,
// so a Test can be constructed on either side.
__host__ __device__ Test::Test(int in)
    : hello(in)
{
}
testclass.cuh
// Minimal payload type used to exercise host<->device copies: a single public
// int plus a constructor callable from host and device code.
class Test {
public:
    // Initialises `hello` from the argument.
    __host__ __device__ Test(int);

    // The only data member; read directly by callers.
    int hello;
};

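Solved it using @molbdnilo's comment: devtest3 has to be a host-side array whose elements are device pointers, with each element allocated by its own cudaMalloc call.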
main.cu
...
printf("hello2\n");
// Host array of two host pointers, each to a host-side Test.
Test** test3 = new Test*[2];
test3[0] = new Test(12299);
test3[1] = new Test(234923);
printf("CPU : %i\n", test3[0]->hello);
// devtest3 is now an ordinary host array; its elements hold device addresses,
// so indexing it from host code is legal.
Test* devtest3[2];
// Allocate device storage for one Test and keep its address in slot 0.
// Slot 1 is left unallocated in this fragment.
cudaMalloc(&devtest3[0], sizeof(Test));
printf("CPU2\n");
// The error report below uses __LINE__-3 to refer back to this cudaMemcpy line.
err = cudaMemcpy(devtest3[0], test3[0], sizeof(Test), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Error %s at line %d in file %s\n",
cudaGetErrorString(err), __LINE__-3, __FILE__);
}
printf("CPU3\n");
// Pass the device address stored in slot 0 to the kernel.
printtest<<<1, 1>>>(devtest3[0]);
cudaDeviceSynchronize();
...

Related

Same address but different string

The results below show two string pointers that hold the same address but print different contents.
Why?
v8::String::Utf8Value just gives its string member, see https://v8docs.nodesource.com/node-0.8/d4/da0/v8_8h_source.html#l01286
#include <nan.h>
#include <string>
// Addon method: converts its first argument to a V8 string and prints the
// UTF-8 text and the buffer address several times, alternating between a fresh
// v8::String::Utf8Value temporary and the pointer `ptr` saved from an earlier
// temporary.
NAN_METHOD(Print) {
Nan::MaybeLocal<v8::String> maybeString = Nan::To<v8::String>(info[0]);
if (maybeString.IsEmpty() == false) {
v8::Local<v8::String> str = maybeString.ToLocalChecked();
// BUG: the Utf8Value created here is a temporary that is destroyed at the end
// of this statement, so ptr is dangling from the next line onwards.
char *ptr = *v8::String::Utf8Value(info.GetIsolate(), str);
// Fine: this temporary stays alive until the printf call has returned.
printf("string %s\n", *v8::String::Utf8Value(info.GetIsolate(), str));
// Undefined behaviour: reads characters through the dangling ptr.
printf(" %s\n", ptr);
// The %p lines only print addresses; presumably each new temporary reuses the
// same storage, which would explain identical addresses in the output.
printf("ptr %p\n", *v8::String::Utf8Value(info.GetIsolate(), str));
printf(" %p\n", ptr);
printf("ptr %p\n", (char *)*v8::String::Utf8Value(info.GetIsolate(), str));
printf(" %p\n", (char *)ptr);
printf("string %s\n", (char *)*v8::String::Utf8Value(info.GetIsolate(), str));
// Undefined behaviour again: same dangling read as above.
printf(" %s\n", (char *)ptr);
}
}
// Module initialiser: wraps the native Print method in a function and stores
// it on `target` under the property name "print".
NAN_MODULE_INIT(Init) {
    v8::Local<v8::String> exportName = Nan::New("print").ToLocalChecked();
    v8::Local<v8::FunctionTemplate> printTemplate = Nan::New<v8::FunctionTemplate>(Print);
    v8::Local<v8::Function> printFunction = Nan::GetFunction(printTemplate).ToLocalChecked();
    Nan::Set(target, exportName, printFunction);
}
NODE_MODULE(myaddon, Init);
Result of info[0]='hello world':
string hello world
�AZ
ptr 0x65a4140
0x65a4140
ptr 0x65a4140
0x65a4140
string hello world
�AZ
printf("string %s\n", *v8::String::Utf8Value(info.GetIsolate(), str));
Here, you create a new object of type v8::String::Utf8Value. It's a temporary object. As soon as this statement completes, it will no longer exist. This is fine, but this is not:
char *ptr = *v8::String::Utf8Value(info.GetIsolate(), str);
printf(" %s\n", ptr);
After the first line of code executes, the v8::String::Utf8Value no longer exists, so ptr is a dangling pointer: it still holds the old address, but the buffer behind it has been released. (If you think it still points to valid memory, explain how you would free the memory it points to.) You therefore can't read through it later, yet your printf does exactly that.

CUDA : error: "transfer of control bypasses initialization of" when creating thrust::device_ptr

I have this code line in my Cuda - C application :
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
// Fills a host array with 0..size-1, copies it to the device and runs an
// in-place thrust::inclusive_scan over the device buffer. Errors jump to the
// Error label via goto.
int main()
{
const int size = 32;
unsigned int * dev_ips_range_end;
unsigned int * ips_range_end = new unsigned int[size];
for (int i = 0; i < size; i++)
ips_range_end[i] = i;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
// Each `goto Error` in this function jumps forward past the initialised
// declaration of dev_ips_range_end_ptr below; this is what the compiler
// reports as "transfer of control bypasses initialization".
// On this first jump dev_ips_range_end has not been assigned yet.
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ips_range_end, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
// NOTE(review): the byte count is size * sizeof(unsigned char), although both
// buffers hold `size` unsigned ints, so only part of the array is copied.
cudaStatus = cudaMemcpy(dev_ips_range_end, ips_range_end, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
// Declaration with initialisation that the gotos above skip over.
thrust::device_ptr<unsigned int> dev_ips_range_end_ptr(dev_ips_range_end);
thrust::inclusive_scan(dev_ips_range_end_ptr, dev_ips_range_end_ptr + size, dev_ips_range_end_ptr);
// The success path returns without releasing either buffer.
return 0;
Error:
cudaFree(dev_ips_range_end);
}
here is the command I used and the output:
[Test]$ nvcc -I/usr/local/cuda/include -L/usr/local/cuda/lib kernel.cu -o test.run
kernel.cu(27): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
kernel.cu(32): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
kernel.cu(39): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
3 errors detected in the compilation of "/tmp/tmpxft_000022ad_00000000-9_kernel.cpp1.ii".
the same code is working without any problem in visual studio on windows.
how to solve this issue ?
Some people might tell you that the use of goto in C/C++ isn't a great idea. But to avoid arguments, and allow you to keep the same code structure, you can declare your thrust device pointer at the top of your program (before any goto statements) and then set the pointer value when you are ready to use it, like this:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
// Fills a host array with 0..size-1, copies it to the device and runs an
// in-place thrust::inclusive_scan over the device buffer.
//
// The goto-based error handling of the original is kept. Every variable with
// an initialiser is therefore declared before the first `goto`, so no jump
// bypasses an initialisation. Success and failure both fall through the Error
// label, which acts as the single cleanup point.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int size = 32;
    // NULL-initialised so cudaFree() is safe even if cudaMalloc never ran.
    unsigned int * dev_ips_range_end = NULL;
    unsigned int * ips_range_end = new unsigned int[size];
    for (int i = 0; i < size; i++)
        ips_range_end[i] = i;

    // Default-constructed here and assigned later, so that no goto jumps over
    // its initialisation.
    thrust::device_ptr<unsigned int> dev_ips_range_end_ptr;
    cudaError_t cudaStatus;
    int exitCode = 1;   // assume failure until the scan has been issued

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_ips_range_end, size * sizeof(unsigned int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Problem !");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    // The byte count must use the element type (unsigned int); the previous
    // sizeof(unsigned char) copied only the first `size` bytes.
    cudaStatus = cudaMemcpy(dev_ips_range_end, ips_range_end, size * sizeof(unsigned int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Problem !");
        goto Error;
    }

    dev_ips_range_end_ptr = thrust::device_pointer_cast(dev_ips_range_end);
    thrust::inclusive_scan(dev_ips_range_end_ptr, dev_ips_range_end_ptr + size, dev_ips_range_end_ptr);
    exitCode = 0;

Error:
    // Shared cleanup for both outcomes; cudaFree(NULL) is a no-op.
    cudaFree(dev_ips_range_end);
    delete[] ips_range_end;
    return exitCode;
}

Segmentation fault on cudaMalloc or cudaMemcpy

New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
// One candidate password: a fixed-size character buffer plus a length field.
struct password_t {
    char word[56];    // password characters
    size_t length;    // length recorded for `word`
};
typedef struct password_t password;
// One library record: a 16-byte digest together with the password it belongs to.
struct libEntry_t {
    uint8_t digest[16];
    password pwd;
};
typedef struct libEntry_t libEntry;
// Builds the password/digest library on the device, one entry per thread.
//
// Params:
//   numPwds - number of entries to process; threads whose global index is at
//             or beyond this value do nothing
//   pwds    - the passwords to hash
//   library - output array; entry k receives the 16-byte digest computed for
//             pwds[k] and a copy of pwds[k]
//
// Hashing is delegated to cuda_md5(const password *pwd, uint8_t *digest),
// which is not defined in this listing.
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid >= numPwds) {
        return;
    }

    // Hash into a thread-local buffer first, then copy byte by byte.
    uint8_t scratch[16];
    cuda_md5(&pwds[tid], scratch);

    libEntry* entry = &library[tid];
    for (int b = 0; b < 16; b++) {
        entry->digest[b] = scratch[b];
    }
    entry->pwd = pwds[tid];
}
// Loads up to `count` candidate passwords from passwords.txt into host memory,
// copies them to the device and allocates the device-side library buffer.
// The kernel launch itself is still disabled (see the commented-out block).
//
// Fixes relative to the earlier version:
//   - the write index `i` starts at 0 (it was uninitialised, so h_pwds[i]
//     could address arbitrary memory and crash before any CUDA call ran);
//   - at most `count` passwords are stored, and lines that do not fit into
//     password::word are skipped instead of overflowing the buffer;
//   - the host array is zero-filled, so every stored word is NUL-terminated;
//   - the password count is kept as a plain host int (a kernel takes it by
//     value, so no device allocation is needed for it);
//   - CUDA return codes are checked and all host/device memory is released.
//
// Params:
//   classified - the value to crack; not used yet
//
// Returns 0 on success, -1 if passwords.txt cannot be opened, a host
// allocation fails, or a CUDA call reports an error.
int crack_password (uint8_t* classified)
{
    (void) classified;

    const int count = 10;
    const size_t mem_size = sizeof(password) * count;
    const size_t lib_size = sizeof(libEntry) * count;

    // Open the file before allocating so the early return cannot leak.
    ifstream inFile("passwords.txt");
    if (!inFile) {
        cerr << "File passwords.txt not found." << endl;
        return -1;
    }

    password *h_pwds = (password*) calloc(count, sizeof(password));
    libEntry *h_library = (libEntry*) malloc(lib_size);
    if (h_pwds == NULL || h_library == NULL) {
        cerr << "Out of host memory." << endl;
        free(h_library);
        free(h_pwds);
        return -1;
    }

    string line;
    int i = 0;
    while (i < count && getline(inFile, line)) {
        if (line.empty()) continue;
        // Keep one byte for the terminating NUL that calloc already provided.
        if (line.size() >= sizeof(h_pwds[i].word)) {
            cerr << "Skipping password longer than "
                 << (sizeof(h_pwds[i].word) - 1) << " characters." << endl;
            continue;
        }
        memcpy(h_pwds[i].word, line.c_str(), line.size());
        h_pwds[i].length = line.size();
        cout << "Password: " << h_pwds[i].word << "\n";
        cout << "Length: " << h_pwds[i].length << "\n";
        i++;
    }
    inFile.close();

    /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
    password* d_pwds = NULL;
    libEntry* d_library = NULL;
    int status = 0;

    cudaError_t err = cudaMalloc( (void**) &d_pwds, mem_size);
    if (err == cudaSuccess)
        err = cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc( (void**) &d_library, lib_size);
    if (err != cudaSuccess) {
        cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
        status = -1;
    }

    int h_numPwds = i;
    cout << "INT NUMPWDS: " << h_numPwds << "\n";

    /*unsigned int threads_per_block = 1024;
    dim3 grid(1024, 1, 1);
    dim3 threads(threads_per_block, 1, 1);
    // generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
    generateLibraryKernel<<<grid, threads>>>(h_numPwds, d_pwds, d_library);
    cudaMemcpy( h_library, d_library, lib_size, cudaMemcpyDeviceToHost);*/

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(d_library);
    cudaFree(d_pwds);
    free(h_library);
    free(h_pwds);
    return status;
}
// Entry point: expects exactly one command-line argument, hands it to
// crack_password and then echoes it. The result of crack_password is not
// inspected.
int main(int argc, char *argv[])
{
    const bool haveOneArgument = (argc == 2);
    if (!haveOneArgument) {
        fprintf(stderr, "usage: ./prog password\n");
        return 1;
    }

    char *target = argv[1];
    crack_password((uint8_t*) target);
    cout << "Hack Password: " << target << "\n";
    return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!
How I have been checking for return status:
// Passes `call` to CUDA_SAFE_CALL_NO_SYNC, then synchronises; if the
// synchronisation reports an error, prints file, line and the CUDA error string
// to stderr and exits with EXIT_FAILURE.
// NOTE(review): CUDA_SAFE_CALL_NO_SYNC is not defined anywhere in this listing;
// presumably it comes from an SDK helper header - confirm.
// NOTE(review): cudaThreadSynchronize() is deprecated in favour of
// cudaDeviceSynchronize().
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT2: I cleaned up everything and still can't figure it out. By having CUDA_SAFE_CALL on the following line CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get segmentation fault even when every other memory allocation command is commented out.
For anyone wondering what went wrong: I was able to fix it. I am not sure exactly what was wrong, but I had improper memory allocations in some places, and in other cases I didn't even need to use cudaMalloc or cudaMemcpy. Also, using the error-checking approach from "What is the canonical way to check for errors using the CUDA runtime API?" instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
// Fragment of a larger function: pwds_size, h_pwds, count and i are defined in
// code that is not shown here.
// Launch shape: 1024 blocks of 1024 threads, one dimension each.
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// Device copy of the host password array.
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
// Device output buffer with room for `count` library entries.
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
// i is passed by value as numPwds.
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
// First check reports launch errors; the second waits for the kernel and
// reports errors raised while it ran.
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
// Wraps a CUDA runtime call: forwards its cudaError_t to gpuAssert together
// with the file and line of the call site.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA call on stderr and, unless `abort` is false, exits the
// process using the error code as exit status. Does nothing for cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(code);
    fprintf(stderr,"GPUassert: %s %s %d\n", description, file, line);
    if (abort)
        exit(code);
}
I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.

CUDA texture can't return value type of unsigned long long

I have code :
// NOTE(review): `int4` is also the name of a built-in CUDA vector type; this
// macro rewrites every later use of that name - confirm this is intended.
#define int4 unsigned long long int
// Fetch one texel from the 2D texture STexture at (col, row).
int4 mer_thread = tex2D(STexture, col, row);
// NOTE(review): %d formats an int, while the fetched value is meant to be a
// 64-bit unsigned long long; %llu would be the matching specifier.
printf("\nTexture[%d][%d] = %d", row, col, tex2D(STexture, col, row));
This gives the error "error : no instance of overloaded function "tex2D" matches the argument list",
but if I define int4 as unsigned long int instead, it works fine.
My code to create the texture:
// Uploads the host table `_S` (nmax rows of NMAX elements) into a newly
// allocated CUDA array and binds the texture reference STexture to it.
// Sampling is set to point filtering, with wrap addressing for addressMode[0]
// and clamp addressing for addressMode[1]. No CUDA return code is checked and
// the array is not released here.
void Creat_TexttureS(int4 _S[nmax][NMAX])
{
    cudaChannelFormatDesc elementFormat = cudaCreateChannelDesc<int4>();

    cudaArray* textureStorage;
    cudaMallocArray(&textureStorage, &elementFormat, NMAX, nmax);

    const size_t totalBytes = sizeof(int4)*NMAX*nmax;
    cudaMemcpyToArray(textureStorage, 0, 0, _S, totalBytes, cudaMemcpyHostToDevice);

    STexture.filterMode = cudaFilterModePoint;
    STexture.addressMode[0] = cudaAddressModeWrap;
    STexture.addressMode[1] = cudaAddressModeClamp;
    cudaBindTextureToArray(STexture, textureStorage);
}
Thanks for your help !!
Below is a worked example that demonstrates the storing of data of type long long int in a 2D texture of type int2, then how to retrieve it via tex2D() and re-interpret it as long long int.
#include <stdlib.h>
#include <stdio.h>
// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once; on any result other than cudaSuccess it prints
// file, line and the CUDA error string to stderr and exits with EXIT_FAILURE.
// The argument is parenthesised and the local uses a reserved-looking name, so
// an expression that itself mentions a variable called `err` is not shadowed.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t safeCallErr_ = (call);                                \
    if (cudaSuccess != safeCallErr_) {                                \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(safeCallErr_) ); \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a <<<...>>> launch. It first reads the pending
// launch error, then blocks until the device is idle to surface errors raised
// while the kernel was running. Either failure prints file, line and the CUDA
// error string to stderr and exits with EXIT_FAILURE.
// Uses cudaDeviceSynchronize(); the former cudaThreadSynchronize() is deprecated.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Reinterprets the two 32-bit components of an int2 as one 64-bit integer,
// without any arithmetic conversion.
// The inline PTX `mov.b64 d, {x, y}` packs two 32-bit registers into a 64-bit
// register; per the PTX ISA description of `mov`, the first element (a.x)
// becomes the low half and the second (a.y) the high half.
__forceinline__ __device__ long long int int2_as_longlong (int2 a)
{
long long int res;
// "=l" binds a 64-bit output register, "r" binds 32-bit input registers.
asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y));
return res;
}
texture<int2, 2, cudaReadModeElementType> tex;
// Prints the m x n contents of the texture `tex`, one row per output line.
// Each texel is fetched as an int2 and reinterpreted as a 64-bit integer for
// printing. The kernel uses no thread or block indices, so every launched
// thread prints the whole table.
__global__ void kernel (int m, int n)
{
    for (int r = 0; r < m; ++r) {
        for (int c = 0; c < n; ++c) {
            const int2 texel = tex2D (tex, c, r);
            printf ("% 11lld ", int2_as_longlong (texel));
        }
        printf ("\n");
    }
}
// Demonstrates storing 64-bit integers in a 2D texture of int2 texels:
// uploads a 4x3 host table into pitched device memory, binds the texture `tex`
// to it, prints the contents from a kernel, then unbinds and frees.
int main (void)
{
int m = 4; // height = #rows
int n = 3; // width = #columns
size_t pitch, tex_ofs;
unsigned long long int arr[4][3]=
{{11111111LL, 11112222LL, 11113333LL},
{22221111LL, 22222222LL, 22223333LL},
{33331111LL, 33332222LL, 33333333LL},
{44441111LL, 44442222LL, 44443333LL}};
// Device buffer is typed int2; the row width is given in bytes of int2
// elements and the row pitch chosen by the runtime is returned in `pitch`.
int2 *arr_d = 0;
CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
// Host rows are tightly packed (n elements each); device rows use `pitch`.
CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
// Bind the texture to the pitched buffer; the byte offset that the binding
// reports is returned in tex_ofs.
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
n, m, pitch));
// A non-zero offset is treated as failure. Note that this early return leaves
// the texture bound and arr_d allocated.
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
printf ("printing texture content\n");
// Single thread: the kernel loops over the whole table itself.
kernel<<<1,1>>>(m, n);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaUnbindTexture (tex));
CUDA_SAFE_CALL (cudaFree (arr_d));
return EXIT_SUCCESS;
}

Output of cuda program is not what was expected

#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
// Stores the address of the string literal "hello\0" into one slot of `c` per
// block; the slot is the block's row-major position within the 2D grid.
__global__ void setVal(char **c){
    const unsigned int slot = (blockIdx.y * gridDim.x) + blockIdx.x;
    c[slot] = "hello\0";
}
// Allocates six char* slots on the device, lets setVal fill each slot, copies
// the six pointers back to the host and then copies 10 bytes from each pointed
// location for printing. No CUDA return code is checked and nothing is freed.
int main(){
// gpu: device array of 6 char* slots.
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
// 3 x 2 grid of single-thread blocks: one block per slot.
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
// x receives the six pointer values that the kernel stored in gpu[].
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
// NOTE(review): copies 10 bytes from each pointer, but the literal the kernel
// stores ("hello\0" plus its implicit terminator) is 7 bytes long, so this
// reads past the end of the literal.
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, i revised my code to make my concept correct. But, the code is still not working :(. Any help will be appreciated
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
// Writes the NUL-terminated text "Hola" into the buffer selected by the
// block index: block b writes through word[b]. Stores go through a volatile
// pointer, one byte at a time, in order.
__global__ void setValues(char** word)
{
    const char greeting[5] = { 'H', 'o', 'l', 'a', '\0' };
    volatile char* dst = word[blockIdx.x];
    for (int k = 0; k < 5; ++k) {
        dst[k] = greeting[k];
    }
}
// Prints a CUDA error with the given source line and reports whether `err`
// was cudaSuccess. Keeps the message format used throughout this example.
static bool cudaSucceeded(cudaError_t err, int line)
{
    if ( cudaSuccess == err )
    {
        return true;
    }
    fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
             __FILE__, line, cudaGetErrorString( err) );
    return false;
}

// Demonstrates a device-side array of char* whose elements point to separate
// device buffers:
//   1. h_x[] (host array) collects one device buffer address per object;
//   2. the whole pointer table is copied into d_x (device array);
//   3. setValues writes "Hola" into every buffer via d_x;
//   4. the pointer table is copied back and each buffer is printed.
//
// Changes from the earlier version: the deprecated cudaThreadSynchronize() is
// replaced by cudaDeviceSynchronize(); launch errors are read with
// cudaGetLastError(); the success message is printed only on success; a failed
// step stops the later steps that depend on it; all device memory is freed.
//
// Returns 0 on success, 1 if any CUDA call failed.
int main()
{
    const size_t bufferSize = 32;
    const int nObjects = 10;
    char*  h_x[nObjects] = { NULL };   // all NULL so cleanup may free every slot
    char*  h_back[nObjects] = { NULL }; // pointer table as read back from the device
    char** d_x = 0;
    char   p[bufferSize] = "";

    bool ok = cudaSucceeded( cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) ), __LINE__ );
    for ( int i=0; ok && i < nObjects; i++ )
    {
        ok = cudaSucceeded( cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) ), __LINE__ );
        printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
    }

    if ( ok )
    {
        ok = cudaSucceeded( cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice), __LINE__ );
    }
    if ( ok )
    {
        printf("Copied h_x[] to d_x[]\n");
        char msg[] = "Hello World!";
        // sizeof(msg) includes the terminating NUL (13 bytes).
        ok = cudaSucceeded( cudaMemcpy( h_x[0], msg, sizeof(msg), cudaMemcpyHostToDevice ), __LINE__ );
    }

    if ( ok )
    {
        setValues<<<nObjects,1>>>(d_x);
        /* Launch-configuration errors first, then errors raised while running. */
        ok = cudaSucceeded( cudaGetLastError(), __LINE__ )
          && cudaSucceeded( cudaDeviceSynchronize(), __LINE__ );
    }

    if ( ok )
    {
        printf("Kernel Completed Successfully. Woot.\n\n");
        printf("d_x = %lx\n", (unsigned long)d_x );
        printf("h_x = %lx\n", (unsigned long)h_x );
        // Read the table back into a separate array so h_x[] stays valid for cleanup.
        ok = cudaSucceeded( cudaMemcpy( h_back, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost), __LINE__ );
        printf("d_x = %lx\n", (unsigned long)d_x );
        printf("h_x = %lx\n", (unsigned long)h_x );
    }

    for ( int i=0; ok && i < nObjects; i++ )
    {
        ok = cudaSucceeded( cudaMemcpy( p, h_back[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost), __LINE__ );
        if ( ok )
        {
            p[bufferSize - 1] = '\0';   // buffers are only partly written by the kernel
            printf("%d p[] = %s\n",i,p);
        }
    }

    /* Release everything; cudaFree(NULL) is a no-op. */
    for ( int i=0; i < nObjects; i++ )
    {
        cudaFree( h_x[i] );
    }
    cudaFree( d_x );

    getchar();
    return ok ? 0 : 1;
}
As @Jon notes, you can't pass x (as you had declared it) to the GPU, because it's an address which lives on the CPU. In the code above, I create a host array of char*'s that hold device addresses and copy it into a char** array which I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Writes the character '4' into one element of `arr` per block; the element is
// the block's row-major position within the 2D grid.
__global__ void setValues(char *arr){
    const unsigned int cell = blockIdx.y * gridDim.x + blockIdx.x;
    arr[cell] = '4';
}
// Allocates NCHARS bytes on the device, lets a 3x2 grid of single-thread blocks
// fill them, copies the bytes back and prints them between angle brackets.
//
// Changes from the earlier version: the host buffer is freed, and the results
// of cudaMalloc, the kernel launch and cudaMemcpy are checked so a failure is
// reported instead of printing an empty string.
//
// Returns 0 on success, 1 on a host allocation or CUDA failure.
int main() {
    const int NCHARS=6;
    char *xd = NULL;   // NULL so cudaFree is safe if cudaMalloc fails

    char *p = (char*) malloc(20*sizeof(char));
    if (p == NULL) {
        fprintf(stderr, "Out of host memory\n");
        return 1;
    }
    strcpy(p,"");

    int status = 0;
    cudaError_t err = cudaMalloc(&xd, NCHARS);
    if (err == cudaSuccess) {
        dim3 grid(3,2);
        setValues<<<grid,1>>>(xd);
        err = cudaGetLastError();   // launch-configuration errors
    }
    if (err == cudaSuccess) {
        // Blocking copy: also waits for the kernel and reports its errors.
        err = cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        p[NCHARS]='\0';
        printf("<%s>\n", p);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    getchar();
    cudaFree(xd);
    free(p);
    return status;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.