cudaMemcpyToSymbol using or not using string - c++

I was trying to copy a structure to constant memory in this way:
struct Foo {
    int a, b, c;
};

__constant__ Foo cData;

int main() {
    Foo hData = {1, 2, 3};
    cudaMemcpyToSymbol(cData, &hData, sizeof(Foo));
    // ...
}
This worked fine; in my kernel I could access the constant data directly:
__global__ void kernel() {
    printf("Data is: %d %d %d\n", cData.a, cData.b, cData.c); // 1 2 3
}
But then I tried to use a const char * as the symbol name, and things stopped working:
cudaMemcpyToSymbol("cData", &hData, sizeof(Foo)); // prints 0 0 0
I thought both versions were equivalent, but it seems I was wrong.
What is happening?
EDIT:
I see the same behavior with cudaGetSymbolAddress, which only works for me when no const char * is used:
__constant__ int someData[10];
__constant__ int *ptrToData;

int *dataPosition;
cudaGetSymbolAddress((void **)&dataPosition, someData);      // Works
// cudaGetSymbolAddress((void **)&dataPosition, "someData"); // Does not work
cudaMemcpyToSymbol(ptrToData, &dataPosition, sizeof(int *));

As of CUDA 5, using a string for symbol names is no longer supported. This is covered in the CUDA 5 release notes here:
• The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.
One of the reasons for this has to do with enabling of a true device linker, which is new functionality in CUDA 5.
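To make the distinction concrete, here is a minimal sketch (using the names from the snippets above) of the supported form next to the removed string form:

// Supported (CUDA 5 and later): pass the symbol itself
cudaMemcpyToSymbol(cData, &hData, sizeof(Foo));
cudaGetSymbolAddress((void **)&dataPosition, someData);

// No longer supported: passing the symbol name as a string
// cudaMemcpyToSymbol("cData", &hData, sizeof(Foo));          // invalid device symbol; the copy does not happen
// cudaGetSymbolAddress((void **)&dataPosition, "someData");  // invalid device symbol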

Because I kept running into the same error, I want to share this sample code, which covers nearly all of the common cases for this problem (so I can refer back here when I make the same mistakes again).
// file: main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

__constant__ float constData[256];
__device__ float devData;
__device__ float* devPointer;

int main(int argc, char **argv)
{
    cudaFree(0);

    float data[256];
    cudaError_t err = cudaMemcpyToSymbol(constData, data, sizeof(data));
    printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));

    float value = 3.14f;
    err = cudaMemcpyToSymbol(devData, &value, sizeof(float));
    printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));

    float* ptr;
    cudaMalloc(&ptr, 256 * sizeof(float));
    err = cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));
    printf("Err id: %d, str: %s\n", err, cudaGetErrorString(err));

    cudaFree(ptr);
    return EXIT_SUCCESS;
}
I was getting "invalid device symbol" and many other errors related to __constant__ and __device__ memory usage. This code produces no such errors at runtime.
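For completeness, here is a minimal kernel sketch (not part of the original post; the kernel name is illustrative) that reads the symbols set by the host code above. It would need to be launched before cudaFree(ptr):

__global__ void readSymbols()
{
    // constData was filled from the host array, devData holds 3.14f,
    // and devPointer points at the 256-float device allocation.
    printf("constData[0]=%f devData=%f devPointer[0]=%f\n",
           constData[0], devData, devPointer[0]);
}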


LLVM ERROR: MCJIT::runFunction does not support full-featured argument passing

I got an example from here, and I ran into this error:
LLVM ERROR: Target does not support MC emission!
which I fixed by this.
Nevertheless, I still see a runtime problem:
./example 3 5
LLVM ERROR: MCJIT::runFunction does not support full-featured argument passing. Please use ExecutionEngine::getFunctionAddress and cast the result to the desired function pointer type.
main.cpp
/**
 * LLVM equivalent of:
 *
 *     int sum(int a, int b) {
 *         return a + b;
 *     }
 */
#include <llvm-c/Core.h>
#include <llvm-c/ExecutionEngine.h>
#include <llvm-c/Target.h>
#include <llvm-c/Analysis.h>
#include <llvm-c/BitWriter.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char const *argv[]) {
    LLVMModuleRef mod = LLVMModuleCreateWithName("my_module");

    LLVMTypeRef param_types[] = { LLVMInt32Type(), LLVMInt32Type() };
    LLVMTypeRef ret_type = LLVMFunctionType(LLVMInt32Type(), param_types, 2, 0);
    LLVMValueRef sum = LLVMAddFunction(mod, "sum", ret_type);

    LLVMBasicBlockRef entry = LLVMAppendBasicBlock(sum, "entry");

    LLVMBuilderRef builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, entry);
    LLVMValueRef tmp = LLVMBuildAdd(builder, LLVMGetParam(sum, 0), LLVMGetParam(sum, 1), "tmp");
    LLVMBuildRet(builder, tmp);

    char *error = NULL;
    LLVMVerifyModule(mod, LLVMAbortProcessAction, &error);
    LLVMDisposeMessage(error);

    LLVMExecutionEngineRef engine;
    error = NULL;
    LLVMLinkInMCJIT();
    LLVMInitializeNativeTarget();
    LLVMInitializeNativeAsmPrinter(); // added
    LLVMInitializeNativeAsmParser();  // added
    if (LLVMCreateExecutionEngineForModule(&engine, mod, &error) != 0)
    {
        fprintf(stderr, "failed to create execution engine\n");
        abort();
    }
    if (error)
    {
        fprintf(stderr, "error: %s\n", error);
        LLVMDisposeMessage(error);
        exit(EXIT_FAILURE);
    }

    if (argc < 3)
    {
        fprintf(stderr, "usage: %s x y\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    long long x = strtoll(argv[1], NULL, 10);
    long long y = strtoll(argv[2], NULL, 10);

    LLVMGenericValueRef args[] = {
        LLVMCreateGenericValueOfInt(LLVMInt32Type(), x, 0),
        LLVMCreateGenericValueOfInt(LLVMInt32Type(), y, 0)
    };
    LLVMGenericValueRef res = LLVMRunFunction(engine, sum, 2, args);
    printf("%d\n", (int)LLVMGenericValueToInt(res, 0));

    // Write out bitcode to file
    if (LLVMWriteBitcodeToFile(mod, "sum.bc") != 0) {
        fprintf(stderr, "error writing bitcode to file, skipping\n");
    }

    LLVMDisposeBuilder(builder);
    LLVMDisposeExecutionEngine(engine);
}
Though the message may be clear from the point of view of the code's author, to me as a user it is cryptic. How can I solve this?
From the documentation of MCJIT::runFunction:
For MCJIT execution engines, clients are encouraged to use the "GetFunctionAddress" method (rather than runFunction) and cast the returned uint64_t to the desired function pointer type. However, for backwards compatibility MCJIT's implementation can execute 'main-like' function (i.e. those returning void or int, and taking either no arguments or (int, char*[])).
So you can't call MCJIT::runFunction (and by extension the C API's LLVMRunFunction when used with an MCJIT engine) unless the arguments array is either empty or consists of only an i32 and an i8* (in that order). Your array contains two i32s, so it does not meet those restrictions.
As stated in the documentation (and the exception message), you should instead use ExecutionEngine::getFunctionAddress (or its C wrapper LLVMGetFunctionAddress), cast the result to int (*)(int, int), and call the resulting function pointer directly with your two integer arguments.
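As a rough sketch (assuming the same module and engine set up in the code above; the typedef and pointer names are illustrative), the LLVMRunFunction call could be replaced along these lines:

/* Replaces LLVMRunFunction: look up the JITed address and call it directly. */
typedef int (*sum_fn)(int, int);
sum_fn f = (sum_fn)(intptr_t)LLVMGetFunctionAddress(engine, "sum");
printf("%d\n", f((int)x, (int)y));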

TensorRT increasing memory usage (leak?)

I have a loop where I parse an ONNX model with TensorRT, create an engine, and run inference.
I make sure I call x->destroy() on all objects, and I use cudaFree for each cudaMalloc.
Yet memory usage reported by nvidia-smi keeps increasing over consecutive iterations.
I'm really not sure where the problem comes from. The cuda-memcheck tool reports no leaks either.
Running Ubuntu 18.04, TensorRT 7.0.0, CUDA 10.2 and using a GTX 1070.
The code, the ONNX file, and a CMakeLists.txt are available in this repo.
Here's the code
#include <memory>
#include <iostream>
#include <cuda_runtime_api.h>
#include <NvOnnxParser.h>
#include <NvInfer.h>

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        // suppress info-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

int main(int argc, char * argv[])
{
    Logger gLogger;

    auto builder = nvinfer1::createInferBuilder(gLogger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(explicitBatch);
    auto config = builder->createBuilderConfig();
    auto parser = nvonnxparser::createParser(*network, gLogger);
    parser->parseFromFile("../model.onnx", static_cast<int>(0));

    builder->setMaxBatchSize(1);
    config->setMaxWorkspaceSize(128 * (1 << 20)); // 128 MiB

    auto engine = builder->buildEngineWithConfig(*network, *config);

    builder->destroy();
    network->destroy();
    parser->destroy();
    config->destroy();

    for (int i = 0; i < atoi(argv[1]); i++)
    {
        auto context = engine->createExecutionContext();

        void* deviceBuffers[2]{0};

        int inputIndex = engine->getBindingIndex("input_rgb:0");
        constexpr int inputNumel = 1 * 128 * 64 * 3;
        int outputIndex = engine->getBindingIndex("truediv:0");
        constexpr int outputNumel = 1 * 128;

        // TODO: Remove batch size hardcoding
        cudaMalloc(&deviceBuffers[inputIndex], 1 * sizeof(float) * inputNumel);
        cudaMalloc(&deviceBuffers[outputIndex], 1 * sizeof(float) * outputNumel);

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        float inBuffer[inputNumel] = {0};
        float outBuffer[outputNumel] = {0};

        cudaMemcpyAsync(deviceBuffers[inputIndex], inBuffer, 1 * sizeof(float) * inputNumel, cudaMemcpyHostToDevice, stream);
        context->enqueueV2(deviceBuffers, stream, nullptr);
        cudaMemcpyAsync(outBuffer, deviceBuffers[outputIndex], 1 * sizeof(float) * outputNumel, cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

        cudaFree(deviceBuffers[inputIndex]);
        cudaFree(deviceBuffers[outputIndex]);
        cudaStreamDestroy(stream);
        context->destroy();
    }

    engine->destroy();
    return 0;
}
Looks like the issue was coming from the repeated IExecutionContext creation, despite destroying it at the end of every iteration. Creating and destroying the context together with the engine (outside the loop) fixed the issue for me. Nevertheless, it could still be a bug where context creation leaks a little memory that accumulates over time. I filed a GitHub issue.
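In outline, the fix described above amounts to hoisting the context out of the loop (a sketch of the relevant part, not the full program):

auto engine = builder->buildEngineWithConfig(*network, *config);
auto context = engine->createExecutionContext();   // create once, next to the engine

for (int i = 0; i < atoi(argv[1]); i++)
{
    // allocate buffers, enqueueV2, synchronize, and free buffers as before,
    // but reuse the same context on every iteration
}

context->destroy();
engine->destroy();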

Initialising nested arrays on the CUDA device

I want to initialise an array of structs on the CUDA device, where each struct contains an array of doubles:
typedef struct TimeSeries
{
    double* values;
} TimeSeries;

TimeSeries* allTimeSeries;
I tried using CUDA managed memory to initialise the allTimeSeries array of structs, and then initialise each values array of doubles.
Here's my minimal working example:
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <array>
#include <iostream>

#define cudaCheckErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "%s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

typedef struct TimeSeries
{
    double* values;
} TimeSeries;

void print_host(TimeSeries* all)
{
    printf("CPU %f %f\n", all[0].values[0], all[0].values[1]);
    printf("CPU %f\n", all[1].values[0]);
}

__global__ void print_device(TimeSeries* all)
{
    printf("GPU %f %f\n", all[0].values[0], all[0].values[1]);
    printf("GPU %f\n", all[1].values[0]);
}

int main()
{
    TimeSeries* all;
    cudaCheckErrors(cudaMallocManaged(reinterpret_cast<void**>(&all),
                                      2*sizeof(TimeSeries)));
    cudaCheckErrors(cudaMallocManaged(
        reinterpret_cast<void**>(&(all[0].values)), 2*sizeof(double)));
    cudaCheckErrors(cudaMallocManaged(
        reinterpret_cast<void**>(&(all[1].values)), 1*sizeof(double)));

    all[0].values[0] = 4.8;
    all[0].values[1] = 3.3;
    all[1].values[0] = 0.4;

    print_host(all);
    print_device<<<1, 1>>>(all);

    cudaCheckErrors(cudaFree(all[0].values));
    cudaCheckErrors(cudaFree(all[1].values));
    cudaCheckErrors(cudaFree(all));

    return 0;
}
Compiling with nvcc (CUDA 10) and running on a Tesla K80 gives:
CPU 4.800000 3.300000
CPU 0.400000
Command terminated
How can I initialise this data without crashing the GPU? I don't mind whether or not the solution uses unified (managed) memory.
Inevitably, I found the problem shortly after posting my question: I needed to call cudaDeviceSynchronize() before calling cudaFree(). Kernel launches are asynchronous, so without the synchronization the managed memory was being freed while print_device was still running:
print_host(all);
print_device<<<1, 1>>>(all);
cudaCheckErrors(cudaDeviceSynchronize());

cudaCheckErrors(cudaFree(all[0].values));
cudaCheckErrors(cudaFree(all[1].values));
cudaCheckErrors(cudaFree(all));
I found the problem using the cuda-gdb debugger.
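For reference, a typical way to reproduce this kind of diagnosis is to build with device debug info and run the program under the debugger (the binary name here is illustrative):

$ nvcc -g -G main.cu -o main
$ cuda-gdb ./main
(cuda-gdb) run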

What do I have to do to execute code in data areas, ( segment protection )

I work on a Linux platform and I use g++ with the program below, which copies a function from the code area to the data area. How do I change the protection of the data segment so that I can execute the copied function?
The code is below:
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define Return asm volatile("pop %rbp; retq; retq; retq; retq; retq;")

int64_t funcEnd = 0xc35dc3c3c3c3c35d;
constexpr int maxCode = 0x800;
int8_t code[maxCode];

void testCode(void) {
    int a = 8, b = 7;
    a += b * a;
    Return;
}

typedef void (*action)(void);

int main(int argc, char **argv)
{
    action a = &testCode;
    testCode();
    int8_t *p0 = (int8_t*)a, *p = p0, *p1 = p0 + maxCode;
    for (; p != p1; p++)
        if ((*(int64_t*)p) == funcEnd) break;
    if (p != p1) {
        p += sizeof(int64_t);
        printf("found\n");
        memcpy(&code, (void*)a, p - (int8_t*)a);
        ((action)&code)();
    }
    printf("returning 0\n");
    return 0;
}
It depends on whether you are trying to do this statically (at build time) or dynamically (at run time).
Build-time
You need to tell GCC to put your blob in a section that is executable. We use __attribute__((section)), and this trick to specify the attributes of the section when we create it.
Run-time
TL;DR: Jump to the end of my answer, where I use mmap.
Although others might question why you'd want to allow something like this at run time, keep in mind that this is exactly what a VM with a JIT compiler (e.g. the Java VM, the .NET CLR, etc.) does when emitting native code.
You need to change the memory protections of the memory where you're trying to execute. We do that with mprotect(addr, len, PROT_EXEC). Note that addr must be aligned to the page size of your platform. On x86, the page size is 4K. We use aligned_alloc to guarantee this alignment.
Example (of both):
#define _ISOC11_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h> /* mprotect() */

__attribute__((section(".my_executable_blob,\"awx\",#progbits#")))
static uint8_t code[] = {
    0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
    0xC3,                     /* ret */
};

int main(void)
{
    int (*func)(void);

    /* Execute a static blob of data */
    func = (void*)code;
    printf("(static) code returned %d\n", func());

    /* Execute a dynamically-allocated blob of data */
    void *p = aligned_alloc(0x1000, sizeof(code));
    if (!p) {
        fprintf(stderr, "aligned_alloc() failed\n");
        return 2;
    }
    memcpy(p, code, sizeof(code));
    if (mprotect(p, sizeof(code), PROT_EXEC) < 0) {
        perror("mprotect");
        return 2;
    }
    func = p;
    printf("(dynamic) code returned %d\n", func());

    return 0;
}
Output:
$ ./a.out
(static) code returned 42
(dynamic) code returned 42
SELinux Impact
Note that this puts your executable code on the heap which might be a bit dangerous. SELinux on my CentOS 7 machine actually denied the mprotect call:
SELinux is preventing /home/jreinhart/so/a.out from using the execheap access on a process.
***** Plugin allow_execheap (53.1 confidence) suggests ********************
If you do not think /home/jreinhart/so/a.out should need to map heap memory that is both writable and executable.
Then you need to report a bug. This is a potentially dangerous access.
So I had to temporarily sudo setenforce 0 to get this to work.
I'm not sure why, however, because looking in /proc/[pid]/maps, the pages are clearly marked only as executable, not as "writable and executable" as SELinux indicated. If I move the memcpy after the mprotect, my process segfaults, because I'm trying to write to non-writable memory. So it seems SELinux is being a bit too over-zealous here.
Use mmap instead
Instead of mprotecting a region of the heap (allocated with aligned_alloc), it is more straightforward to use mmap. This also avoids any issues with SELinux, as we're not trying to execute on the heap.
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h> /* mmap() */

static uint8_t code[] = {
    0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
    0xC3,                     /* ret */
};

int main(void)
{
    void *p = mmap(NULL, sizeof(code), PROT_READ|PROT_WRITE|PROT_EXEC,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        fprintf(stderr, "mmap() failed\n");
        return 2;
    }
    memcpy(p, code, sizeof(code));

    int (*func)(void) = p;
    printf("(dynamic) code returned %d\n", func());

    pause();
    return 0;
}
The final solution
The mmap solution is good, but it doesn't provide any safety; the mmapped region of code is readable, writable, and executable. It would be better to allow the memory to be writable only while we're putting our code in place, and then make it executable only. The following code does just that:
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h> /* mmap(), mprotect() */

static uint8_t code[] = {
    0xB8,0x2A,0x00,0x00,0x00, /* mov eax,0x2a */
    0xC3,                     /* ret */
};

int main(void)
{
    const size_t len = sizeof(code);

    /* mmap a region for our code */
    void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, /* No PROT_EXEC */
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        fprintf(stderr, "mmap() failed\n");
        return 2;
    }

    /* Copy it in (still not executable) */
    memcpy(p, code, len);

    /* Now make it execute-only */
    if (mprotect(p, len, PROT_EXEC) < 0) {
        fprintf(stderr, "mprotect failed to mark exec-only\n");
        return 2;
    }

    /* Go! */
    int (*func)(void) = p;
    printf("(dynamic) code returned %d\n", func());

    pause();
    return 0;
}

Problems compiling simple OpenCL example code under Xcode 6.1.1 with C++

I have a simple hello world OpenCL program. For some reason the compiler gives an error when I try to reference the kernel name from within the code.
In kernel.cl I have
kernel void square_kernel(global float *input, global float *output, const unsigned int count)
{
    size_t i = get_global_id(0);
    if (i < count)
        output[i] = input[i] * input[i];
}
In main.cpp I have:
...
#include "kernel.cl.h"
...
int main()
{
    ...
    dispatch_sync(queue, ^{
        size_t wgs;
        gcl_get_kernel_block_workgroup_info(square_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL);

        cl_ndrange range = {
            1,                  // number of dims
            {0, 0, 0},          // offset in each dim
            {DATA_SIZE, 0, 0},  // global range (total)
            {wgs, 0, 0}         // local size of each work group; #work_groups = DATA_SIZE / wgs
        };

        // call kernel
        square_kernel(&range, (cl_float *)mem_in, (cl_float *)mem_out, DATA_SIZE);

        // copy the output
        gcl_memcpy(results, mem_out, sizeof(cl_float) * DATA_SIZE);
    });
    ...
}
I get "use of undeclared identifier square_kernel" where it is referenced. The cl file must be building because it generates the byte code files.
Now it's just occurred to me that it might be because I am referencing it from a C++ file. However, I am not sure how to fix that problem, if that is indeed the cause. I don't want to use C if I can help it. Any ideas?
In situations like this, it's worth checking the auto-generated kernel.cl.h header for the actual declaration of the function you are trying to call. I've just created a new Xcode project and built your OpenCL kernel with it, and the resulting function definition looks like this:
extern void (^square_kernel_kernel)(const cl_ndrange *ndrange, cl_float* input, cl_float* output, cl_uint count);
Note the extra _kernel in the function name.
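So, assuming the same variables as in your question, the call site would presumably need to use the generated block name:

square_kernel_kernel(&range, (cl_float *)mem_in, (cl_float *)mem_out, DATA_SIZE);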