Initialising nested arrays on the CUDA device - c++

I want to initialise an array of structs on the CUDA device, where each struct contains an array of doubles:
typedef struct TimeSeries
{
double* values;
} TimeSeries;
TimeSeries* allTimeSeries;
I tried using CUDA managed memory to initialise the allTimesSeries array of structs, then initialise each values array of doubles.
Here's my minimal working example:
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <array>
#include <iostream>
#define cudaCheckErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "%s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
typedef struct TimeSeries
{
double* values;
} TimeSeries;
void print_host(TimeSeries* all)
{
printf("CPU %f %f\n", all[0].values[0], all[0].values[1]);
printf("CPU %f\n", all[1].values[0]);
}
__global__ void print_device(TimeSeries* all)
{
printf("GPU %f %f\n", all[0].values[0], all[0].values[1]);
printf("GPU %f\n", all[1].values[0]);
}
int main()
{
TimeSeries* all;
cudaCheckErrors(cudaMallocManaged(reinterpret_cast<void**>(&all),
2*sizeof(TimeSeries)));
cudaCheckErrors(cudaMallocManaged(
reinterpret_cast<void**>(&(all[0].values)), 2*sizeof(double)));
cudaCheckErrors(cudaMallocManaged(
reinterpret_cast<void**>(&(all[1].values)), 1*sizeof(double)));
all[0].values[0] = 4.8;
all[0].values[1] = 3.3;
all[1].values[0] = 0.4;
print_host(all);
print_device<<<1, 1>>>(all);
cudaCheckErrors(cudaFree(all[0].values));
cudaCheckErrors(cudaFree(all[1].values));
cudaCheckErrors(cudaFree(all));
return 0;
}
Compiling with nvcc (CUDA 10) and running on a Tesla K80 gives:
CPU 4.800000 3.300000
CPU 0.400000
Command terminated
How can initialise this data without crashing the GPU? I don't mind if the solution uses unified (managed) memory or not.

Inevitably, I found the problem shortly after posting my question. I needed to call cudaDeviceSynchronize() before calling cudaFree():
print_host(all);
print_device<<<1, 1>>>(all);
cudaCheckErrors(cudaDeviceSynchronize());
cudaCheckErrors(cudaFree(all[0].values));
cudaCheckErrors(cudaFree(all[1].values));
cudaCheckErrors(cudaFree(all));
I found the problem using the cuda-gdb debugger.

Related

LLVM ERROR: MCJIT::runFunction does not support full-featured argument passing

I have got an example from here and I faced with a run error
LLVM ERROR: Target does not support MC emission!
which I fixed it by this.
nevertheless, I still observe runtime problem:
./example 3 5
LLVM ERROR: MCJIT::runFunction does not support full-featured argument passing. Please use ExecutionEngine::getFunctionAddress and cast the result to the desired function pointer type.
main.cpp
/**
* LLVM equivalent of:
*
* int sum(int a, int b) {
* return a + b;
* }
*/
#include <llvm-c/Core.h>
#include <llvm-c/ExecutionEngine.h>
#include <llvm-c/Target.h>
#include <llvm-c/Analysis.h>
#include <llvm-c/BitWriter.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char const *argv[]) {
LLVMModuleRef mod = LLVMModuleCreateWithName("my_module");
LLVMTypeRef param_types[] = { LLVMInt32Type(), LLVMInt32Type() };
LLVMTypeRef ret_type = LLVMFunctionType(LLVMInt32Type(), param_types, 2, 0);
LLVMValueRef sum = LLVMAddFunction(mod, "sum", ret_type);
LLVMBasicBlockRef entry = LLVMAppendBasicBlock(sum, "entry");
LLVMBuilderRef builder = LLVMCreateBuilder();
LLVMPositionBuilderAtEnd(builder, entry);
LLVMValueRef tmp = LLVMBuildAdd(builder, LLVMGetParam(sum, 0), LLVMGetParam(sum, 1), "tmp");
LLVMBuildRet(builder, tmp);
char *error = NULL;
LLVMVerifyModule(mod, LLVMAbortProcessAction, &error);
LLVMDisposeMessage(error);
LLVMExecutionEngineRef engine;
error = NULL;
LLVMLinkInMCJIT();
LLVMInitializeNativeTarget();
LLVMInitializeNativeAsmPrinter(); // added
LLVMInitializeNativeAsmParser(); // added
if (LLVMCreateExecutionEngineForModule(&engine, mod, &error) != 0)
{
fprintf(stderr, "failed to create execution engine\n");
abort();
}
if (error)
{
fprintf(stderr, "error: %s\n", error);
LLVMDisposeMessage(error);
exit(EXIT_FAILURE);
}
if (argc < 3)
{
fprintf(stderr, "usage: %s x y\n", argv[0]);
exit(EXIT_FAILURE);
}
long long x = strtoll(argv[1], NULL, 10);
long long y = strtoll(argv[2], NULL, 10);
LLVMGenericValueRef args[] = {
LLVMCreateGenericValueOfInt(LLVMInt32Type(), x, 0),
LLVMCreateGenericValueOfInt(LLVMInt32Type(), y, 0)
};
LLVMGenericValueRef res = LLVMRunFunction(engine, sum, 2, args);
printf("%d\n", (int)LLVMGenericValueToInt(res, 0));
// Write out bitcode to file
if (LLVMWriteBitcodeToFile(mod, "sum.bc") != 0) {
fprintf(stderr, "error writing bitcode to file, skipping\n");
}
LLVMDisposeBuilder(builder);
LLVMDisposeExecutionEngine(engine);
}
Though the meessage is clear for the view point of the code author, for me as a user it is so cryptic. How can I solve this?
From the documentation of MCJIT::runFunction:
For MCJIT execution engines, clients are encouraged to use the "GetFunctionAddress" method (rather than runFunction) and cast the returned uint64_t to the desired function pointer type. However, for backwards compatibility MCJIT's implementation can execute 'main-like' function (i.e. those returning void or int, and taking either no arguments or (int, char*[])).
So you can't call MCJIT::runFunction (and by extension the C API's LLVMRunFunction when used with an MCJIT engine) unless the arguments array is either empty or consists of only an i32 and an i8* (in that order). Your array contains two i32s, so it does not meet those restrictions.
As stated in the documentation (and the exception message), you should instead use ExecutionEngine::getFunctionAddress (or its C wrapper LLVMGetFunctionAddress), cast the result to int (*)(int, int) and then call it as f(0, 0);.

Linux - segmentation fault only sometimes - how to debug

I have a Linux program, that from time to time ends with a segmentation fault. The program is running periodically every hour, but the segmentation fault occurs only sometimes.
I have a problem to debug this, because if I run the program again with the same input, no error is reported and all is OK.
Is there a way, how to "report" in which part of the code error occured or what caused the problem?
The usual way is to have the crashing program generate a corefile and analyze this after the crash. Make sure, that:
the maximum corefile-size is big enough (i.e. unlimited) by calling ulimit -c unlimited in the shell, which starts the process.
The cwd is writable by the segfaulting process.
Then you can analyze the file with
gdb <exe> <corefile>
Since your code not crashing every time, you can use backtrace as well. Using this you can see the function call stack at the time of crash. There are many examples available. In my projects I normally use the following code for backtracing.
/*
* call reg_usr2 function from main
* gcc -rdynamic myfile.c -o output
*/
#include <stdio.h>
#include <stdarg.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h>
#include <execinfo.h>
#define FILE_NAME "/tmp/debug"
#define MODE 0xFFFF
void dbgprint(int flag, char* fmt, ...)
{
if(flag & MODE) {
char buf[100];
va_list vlist;
FILE *fp = fopen(FILE_NAME,"a");
va_start(vlist, fmt);
vsnprintf( buf, sizeof( buf), fmt, vlist);
va_end( vlist);
fprintf(fp,"[%x]->%s\n", flag, buf);
fclose(fp);
}
}
/** Here is the code to print backtrace **/
void print_stack_trace ()
{
void *array[20];
size_t size;
char **strings;
size_t i;
size = backtrace (array, 20);
strings = backtrace_symbols (array, size);
dbgprint(0xFFFF, "Obtained %zd stack frames.", size);
dbgprint(0xFFFF, "-------------------------");
dbgprint(0xFFFF, "---------Backtrace-------");
for (i = 0; i < size; i++)
dbgprint (0xFFFF, "%s", strings[i]);
dbgprint(0xFFFF, "-------------------------");
free (strings);
}
void sig_handler(int signo)
{
FILE *fp = fopen(FILE_NAME,"a");
if (signo == SIGUSR2){
dbgprint(0xFFFF, "received SIGUSR2");
dbgprint(0xFFFF, "----------------");
}
print_stack_trace();
exit(0);
}
void reg_usr2()
{
if (signal(SIGUSR2, sig_handler) == SIG_ERR)
printf("\ncan't catch SIGUSR2\n");
}
int main()
{
reg_usr2(); //should be first line of main after variables
//Code.....
return 0;
}
You can generate backtrace by catching SIGSEGV signal, and see where your application throw an invalid access.
see https://stackoverflow.com/a/77336/4490542
But there is more easier solution, try running your application with catchsegv
catchsegv './program args'
and better alternative, valgrind
valgrind --tool=none ./program args

Undefined Reference in constructors

I have been hammering at this for the past 2 days and i've already tried every single solution on the internet, so here goes.
I have a problem with undefined references. I am doing a project to compare 3 algorithms and i have compartmentalized them into 3 different sets of cpp files. I am using Dev C++ with gcc 4.9.2.6 as my compiler. I know it is a linker error, but all my solutions are not working and i can't seem to identify it.
My main file is
#include <iostream>
#include <stdio.h>
#include <string>
#include <fstream>
#include <time.h>
#include "SDL.h"
#include "bitmap_image.hpp" //ext
#include "Bresenham.hpp"
#include "XiaolinWu.hpp"
#include "GuptaSproull.hpp"
void GenerateBMPBlank(int xsize,int ysize,std::string fileid);
void BresenhamTest(int xsize,int ysize, std::string TestDataFile);
void XiaolinWuTest(int xsize,int ysize, std::string TestDataFile);
void GuptaSproullTest(int xsize, int ysize, std::string TestDataFile);
void executeTest(int xsize,int ysize, std::string TestDataFile); //resulting BMP file generated will have the format "x_y_algorithmName.bmp"
int main()
{
short x,y;
std::string TestFileLocation;
std::cout << "Please indicate file path of Test Data textfile"<< std::endl;
std::cin>>TestFileLocation;
std::cout << "Please indicate file dimensions" << std::endl;
std::cin>> x >> y;
executeTest(x,y, TestFileLocation);
std::cout<< "Procedure executed"<< std::endl;
std::cin.get();
return 0;
}
void GenerateBMPBlank(int xsize,int ysize,std::string fileid) //uses external library http://partow.net/programming/bitmap/ to generate a blank white bmp file
{
bitmap_image blankBMP(xsize,ysize); //creates bitmap image
blankBMP.set_all_channels(255,255,255); //sets entire image to be completely white
blankBMP.save_image(fileid);
} //tested
void executeTest(int xsize,int ysize, std::string TestDataFile)
{
SDL_Init( SDL_INIT_EVERYTHING );
std::cout<<"Beginning test of data set from "+TestDataFile<<std::endl;
std::cout<<"Now executing Bresenham's algorithm"<<std::endl;
BresenhamTest(xsize,ysize, TestDataFile);
std::cout<<"Now executing Xiaolin Wu's algorithm"<<std::endl;
XiaolinWuTest(xsize,ysize,TestDataFile);
std::cout<<"Now executing Gupta Sproull 's algorithm"<<std::endl;
GuptaSproullTest(xsize,ysize,TestDataFile);
SDL_Quit();
}
void BresenhamTest(int xsize,int ysize, std::string TestDataFile)
{
std::string ResultName= std::to_string(xsize) + "_" + std::to_string(ysize) + "_Bresenham.bmp";
GenerateBMPBlank(xsize,ysize,ResultName);
clock_t tStart = clock();
Bresenham b(ResultName,TestDataFile);
printf("Time taken for Bresenham: %.4fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
}
void XiaolinWuTest(int xsize,int ysize, std::string TestDataFile)
{
std::string ResultName= std::to_string(xsize) + "_" + std::to_string(ysize) + "_XiaolinWu.bmp";
GenerateBMPBlank(xsize,ysize,ResultName);
clock_t tStart = clock();
XiaolinWu w(ResultName,TestDataFile);
printf("Time taken for XiaolinWu: %.4fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
}
void GuptaSproullTest(int xsize,int ysize, std::string TestDataFile)
{
std::string ResultName= std::to_string(xsize) + "_" + std::to_string(ysize) + "_GuptaSproull.bmp";
GenerateBMPBlank(xsize,ysize,ResultName);
clock_t tStart = clock();
GuptaSproull g(ResultName,TestDataFile);
printf("Time taken for GuptaSproull: %.4fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
}
However, an error is produced as follows
C++ files/ComparatorMain.o:ComparatorMain.cpp:(.text+0x544): undefined reference to `Bresenham::Bresenham(std::string, std::string)'
C++ files/ComparatorMain.o:ComparatorMain.cpp:(.text+0x76b): undefined reference to `XiaolinWu::XiaolinWu(std::string, std::string)'
C++ files/ComparatorMain.o:ComparatorMain.cpp:(.text+0x992): undefined reference to `GuptaSproull::GuptaSproull(std::string, std::string)'collect2.exe: error: ld returned 1 exit status
As the implementation of the 3 different cpp files are nearly identical (with the main difference just being the algorithm implemented as well as some misc. functions which have complied and so far aren't showing errors), I will just show the main parts of Bresenham.cpp and hpp where the linker errors are occurring (if additional information is needed, just tell me). The definitions for GuptaSproull.cpp as well as XiaolinWu.cpp are pretty much identical for the code shown below. I cut out most of the function implementations for easier reading and i don't think its relavant (unless i got that part wrong).
Bresenham.hpp
#ifndef BRESENHAM_H
#define BRESENHAM_H
#include <iostream>
#include "SDL.h"
#undef main
class Bresenham{
public:
Bresenham(std::string BMPName,std::string TestDataFile);
SDL_Surface* OpenBMP(std::string BMPName);
void CloseBMP(SDL_Surface* surface,std::string Filename);
void putpixel(SDL_Surface *surface, int x, int y, Uint32 pixel);
void bresenhamDrawLine(int x1,int y1,int x2, int y2, SDL_Surface *surface);
};
#endif
Bresenham.cpp
#ifndef BRESENHAM_H
#define BRESENHAM_H
#include <iostream>
#include <cstdio>
#include <cmath>
#include <fstream>
#include "SDL.h"
#include "Bresenham.hpp"
#undef main
#endif
class Bresenham
{
Bresenham(std::string BMPName,std::string TestDataFile)
{
std::ifstream testFile(TestDataFile);
SDL_Surface *image;
image=OpenBMP(BMPName);
if ( SDL_MUSTLOCK(image) ) //surface must be locked before pixels can be drawn
{
if ( SDL_LockSurface(image) < 0 ) {
fprintf(stderr, "Can't lock screen: %s\n", SDL_GetError());
return;
}
}
int x1,y1,x2,y2;
while(testFile>>x1>>y1>>x2>>y2)
{
bresenhamDrawLine(x1,y1,x2,y2,image); //loops through the dataset and calls the bresenham draw line function
}
if ( SDL_MUSTLOCK(image) )
{
SDL_UnlockSurface(image);
}
CloseBMP(image,BMPName);
}
void bresenhamDrawLine(int x1,int y1,int x2, int y2, SDL_Surface *surface)
{
/* implemented */
}
SDL_Surface* OpenBMP(std::string BMPName)
{
/* implemented */
}
void CloseBMP(SDL_Surface *surface,std::string FileName)
{
/* implemented */
}
void putpixel(SDL_Surface *surface, int x, int y, double brightness) //this function allows us to place a pixel at coordinates (x,y) in SDL.
{
/*implemented*/
}
};
Now I have done a few messy attempts to try and fix this problem (such as adding #ifndef BRESENHAM_H #define BRESENHAM_H #include "Bresenham.hpp" into the .cpp file. However, the error above still occurs.
Is this linked to the way I implemented my code to do the testing? I used a constructor to basically run my test on the algorithms (which i suspect you might find a shoddy way of implementing such a test). I have done the following (so yeah those didn't work):
Verified that all the files are in the build path (under the same project)
Tried adding namespaces to see if it fixed the problem (it didn't)
I've searched under pretty much every single link in google in order to find a potential fix (none of them seems to work).
There are no compiler errors so far (in all the files).
I suspect i might need to abandon this style of implementing the test and migrate over to using a static function instead (Could someone comment if this would work?). I'm not really used to C++ (this is my first "big" program in this language so far), so pardon me if I'm missing something glaringly obvious (which i hope i didn't).
What should I do?
You actually have two declarations of Bresenham class, one in Bresenham.hpp and one in Bresenham.cpp. Change your cpp file in following way:
Bresenham::Bresenham(std::string BMPName,std::string TestDataFile)
{
std::ifstream testFile(TestDataFile);
SDL_Surface *image;
image=OpenBMP(BMPName);
if ( SDL_MUSTLOCK(image) ) //surface must be locked before pixels can be drawn
{
if ( SDL_LockSurface(image) < 0 ) {
fprintf(stderr, "Can't lock screen: %s\n", SDL_GetError());
return;
}
}
int x1,y1,x2,y2;
while(testFile>>x1>>y1>>x2>>y2)
{
bresenhamDrawLine(x1,y1,x2,y2,image); //loops through the dataset and calls the bresenham draw line function
}
if ( SDL_MUSTLOCK(image) )
{
SDL_UnlockSurface(image);
}
CloseBMP(image,BMPName);
}
void Bresenham::bresenhamDrawLine(int x1,int y1,int x2, int y2, SDL_Surface *surface)
{
/* implemented */
}
SDL_Surface* Bresenham::OpenBMP(std::string BMPName)
{
/* implemented */
}
void Bresenham::CloseBMP(SDL_Surface *surface,std::string FileName)
{
/* implemented */
}
void Bresenham::putpixel(SDL_Surface *surface, int x, int y, double brightness) //this function allows us to place a pixel at coordinates (x,y) in SDL.
{
/*implemented*/
}
What should I do?
First of all, you need to set up correct code units:
remove #undef main (makes no sense)
remove the include guards from your cpp files, they belong only in header files. With these, the code just doesn't get compiled, hence the linking problem !
As CodeFuller states it clearly in his answer, you must separate the class declaration (in .hpp file) and the implementation of the methods (in the .cpp file)
For more, you need to gives us an MVCE that demonstrates your problem (I agree, that is some bit of work).

What do I have to do to execute code in data areas, ( segment protection )

I work on a linux platform and I use g++ with the above program that copies a function from the code area to the data area. How do I change protection of data segment in order to allow me to execute the copied function ?
The code is bellow:
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#define Return asm volatile("pop %rbp; retq; retq; retq; retq; retq;")
int64_t funcEnd=0xc35dc3c3c3c3c35d;
constexpr int maxCode=0x800;
int8_t code[maxCode];
void testCode(void){
int a=8,b=7;
a+=b*a;
Return;
}
typedef void (*action)(void);
int main(int argc, char **argv)
{
action a=&testCode;
testCode();
int8_t *p0=(int8_t*)a,*p=p0,*p1=p0+maxCode;
for(;p!=p1;p++)
if ( (*(int64_t*)p)==funcEnd ) break;
if(p!=p1){
p+=sizeof(int64_t);
printf("found\n");
memcpy(&code,(void*)a,p-(int8_t*)a);
((action)&code)();
}
printf("returning 0\n");
return 0;
}
It depends if you are trying to do this statically (at build-time), or at dynamically (at run-time).
Build-time
You need to tell GCC to put your blob in a section that is executable. We use __attribute__((section)), and this trick to specify the attributes of the section when we create it.
Run-time
TL;DR: Jump to the end of my answer, where I use mmap.
Although others might be questioning why you'd want do allow something like this at run-time, keep in mind that this is exactly what a VM with a JIT compiler (e.g. Java VM, .NET CLR, etc.) do when emitting native code.
You need to change the memory protections of the memory where you're trying to execute. We do that with mprotect(addr, PROT_EXEC). Note that addr must be aligned to the page size of your platform. On x86, the page size is 4K. We use aligned_alloc to guarantee this alignment.
Example (of both):
#define _ISOC11_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h> /* mprotect() */
__attribute__((section(".my_executable_blob,\"awx\",#progbits#")))
static uint8_t code[] = {
0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
0xC3, /* ret */
};
int main(void)
{
int (*func)(void);
/* Execute a static blob of data */
func = (void*)code;
printf("(static) code returned %d\n", func());
/* Execute a dynamically-allocated blob of data */
void *p = aligned_alloc(0x1000, sizeof(code));
if (!p) {
fprintf(stderr, "aligned_alloc() failed\n");
return 2;
}
memcpy(p, code, sizeof(code));
if (mprotect(p, sizeof(code), PROT_EXEC) < 0) {
perror("mprotect");
return 2;
}
func = p;
printf("(dynamic) code returned %d\n", func());
return 0;
}
Output:
$ ./a.out
(static) code returned 42
(dynamic) code returned 42
SELinux Impact
Note that this puts your executable code on the heap which might be a bit dangerous. SELinux on my CentOS 7 machine actually denied the mprotect call:
SELinux is preventing /home/jreinhart/so/a.out from using the execheap access on a process.
***** Plugin allow_execheap (53.1 confidence) suggests ********************
If you do not think /home/jreinhart/so/a.out should need to map heap memory that is both writable and executable.
Then you need to report a bug. This is a potentially dangerous access.
So I had to temporarily sudo setenforce 0 to get this to work.
I'm not sure why, however, because looking in /proc/[pid]/maps, the pages are clearly marked only as executable, not as "writable and executable" as SELinux indicated. If I move the memcpy after the mprotect, my process segfaults, because I'm trying to write to non-writable memory. So it seems SELinux is being a bit too over-zealous here.
Use mmap instead
Instead of mprotecting a region of the heap (allocated with aligned_alloc), it is more straightforward to use mmap. This also avoids any issues with SELinux, as we're not trying to execute on the heap.
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h> /* mmap() */
static uint8_t code[] = {
0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
0xC3, /* ret */
};
int main(void)
{
void *p = mmap(NULL, sizeof(code), PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (p==MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return 2;
}
memcpy(p, code, sizeof(code));
int (*func)(void) = p;
printf("(dynamic) code returned %d\n", func());
pause();
return 0;
}
The final solution
The mmap solution is good, but it doesn't provide us any safety; our mmaped region of code is readable, writable, and executable. It would be better to only allow the memory to be writable while we're putting our code in place, then making it executable only. The following code does just that:
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h> /* mmap(), mprotect() */
static uint8_t code[] = {
0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
0xC3, /* ret */
};
int main(void)
{
const size_t len = sizeof(code);
/* mmap a region for our code */
void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, /* No PROT_EXEC */
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (p==MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return 2;
}
/* Copy it in (still not executable) */
memcpy(p, code, len);
/* Now make it execute-only */
if (mprotect(p, len, PROT_EXEC) < 0) {
fprintf(stderr, "mprotect failed to mark exec-only\n");
return 2;
}
/* Go! */
int (*func)(void) = p;
printf("(dynamic) code returned %d\n", func());
pause();
return 0;
}

cudaMemcpyToSymbol using or not using string

I was trying to copy a structure to constant memory in this way:
struct Foo {
int a, b, c;
};
__constant__ Foo cData;
int main() {
Foo hData = {1, 2, 3};
cudaMemcpyToSymbol(cData, &hData, sizeof(Foo));
// ...
}
And this worked fine, in my kernel I could access the constant data directly:
__global__ void kernel() {
printf("Data is: %d %d %d\n", cData.a, cData.b, cData.c); // 1 2 3
}
But then I tried to use a const char * as symbol name, and things stopped working:
cudaMemcpyToSymbol("cData", &hData, sizeof(Foo)); // prints 0 0 0
I thought both versions were similar, but it seems I was wrong.
What is happening?
EDIT:
I'd like to report this same behavior with cudaGetSymbolAddress, which works for me if no const char * is used:
__constant__ int someData[10];
__constant__ int *ptrToData;
int *dataPosition;
cudaGetSymbolAddress((void **)&dataPosition, someData); // Works
// cudaGetSymbolAddress((void **)&dataPosition, "someData"); // Do not work
cudaMemcpyToSymbol(ptrToData, &dataPosition, sizeof(int *));
As of CUDA 5, using a string for symbol names is no longer supported. This is covered in the CUDA 5 release notes here
•The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.
One of the reasons for this has to do with enabling of a true device linker, which is new functionality in CUDA 5.
Because of getting the same error again and again, I want to share this sample code that shows nearly all of the example cases for this problem (so I may refer here later when I make same mistakes again).
//file: main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
__constant__ float constData[256];
__device__ float devData;
__device__ float* devPointer;
int main(int argc, char **argv)
{
cudaFree(0);
float data[256];
cudaError_t err = cudaMemcpyToSymbol(constData, data, sizeof(data));
printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));
float value = 3.14f;
err = cudaMemcpyToSymbol(devData, &value, sizeof(float));
printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));
float* ptr;
cudaMalloc(&ptr, 256 * sizeof(float));
err = cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));
printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));
cudaFree(ptr);
return EXIT_SUCCESS;
}
I was getting "invalid device symbol" and many others which are related to _constant_ _device_ memory usage. This code gives no such errors at runtime.