So I wrote a CUDA version of an OpenCL program I wrote. The OpenCL version works, but the CUDA version doesn't. Converting OpenCL code to CUDA isn't 1-to-1, but I'm confused as to why the CUDA version wouldn't work, since I based the translation on a CUDA example.
I am getting "an illegal memory access was encountered" (error code # = 77) during a cudaMemcpy(... cudaMemcpyDeviceToHost); (line 227). Although the error is reported during the memcpy, the problem appears to be an illegal memory access during the kernel run. Here is an example of what cuda-memcheck reports for the program:
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (53,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
=========
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (52,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
I don't understand the differences between CUDA and OpenCL well enough to know what I am doing wrong. I tried changing MoveoutAndStackCuda<<<grid, threads>>> to something like MoveoutAndStackCuda<<<grid, threads, (localGroupSize * sizeof(float))>>>, but no luck. I've also tried commenting out parts of my kernel; the problem occurs even when most of the kernel is commented out.
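One way to confirm that the failure really originates in the kernel rather than in the cudaMemcpy itself (asynchronous kernel errors only surface at the next CUDA API call) is to synchronize right after the launch and check both error codes; a minimal sketch:
// right after the MoveoutAndStackCuda<<<grid, threads>>>(...) launch:
cudaError_t launchErr = cudaGetLastError();    // problems with the launch configuration itself
cudaError_t syncErr = cudaDeviceSynchronize(); // errors raised while the kernel was running
if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
  fprintf(stderr, "Kernel failed: %s / %s\n",
          cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));
}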
Hopefully this is reproducible for you, though there is a chance it isn't, since it could depend on my hardware. I am running a Quadro M5000 on CentOS 6.8 (Final).
I tried to cut out as much as possible that is irrelevant to this problem. I would also provide the working OpenCL version of this MCVE, but I am out of space. I recommend debugging with the arguments 100 50 40 for now, because I also have a problem with spawning too many global threads, which I will tackle after this one is solved.
Here is the Minimal, Complete, and Verifiable example:
#include <math.h>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <cuda.h>
#include <assert.h>
#include <unistd.h>
const bool _VERBOSE = true;
const bool _PRINT_ALLOC_SIZE = true;
const bool _PRINT_RUN_TIME = true;
const int MIN_LOCAL_SIZE = 8;
__global__ void MoveoutAndStackCuda(float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], int startIndices[], int exitIndices[],
int sampleShift[], const unsigned int samplesPerT, const unsigned int readIns,
const unsigned int nOuts) {
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
float stackF = 0.0;
float powerF = 0.0;
unsigned int readIndex = (globalId % samplesPerT);
unsigned int jobNum = (globalId / samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
if ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd))) {
float value = prestackTraces[readIndex + (x * samplesPerT) + sampleShift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[globalId] = stackF;
powerTracesOut[globalId] = powerF;
}
/*
* Single-threaded version that mimics what the kernel executes as closely as possible.
*/
void MoveoutAndStackSingleThread(const float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], const int startIndices[], const int exitIndices[], const int shift[],
const unsigned int samplesPerT, const unsigned int readIns, const unsigned int nOuts,
const unsigned int jobNum, const unsigned int readIndex) {
float stackF = 0.0f;
float powerF = 0.0f;
int outputIndex = readIndex + (jobNum * samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
bool shouldRead = ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd)));
if (shouldRead) {
float value = prestackTraces[readIndex + (x * samplesPerT) + shift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[outputIndex] = stackF;
powerTracesOut[outputIndex] = powerF;
}
/**
* Used to keep track of how long it takes to execute this.
*/
double GetTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + (1e-6 * tv.tv_usec);
}
/*
* Print message to stderr and exit.
*/
void Fatal(const char* format, ...) {
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(1);
}
/*
* We have an error; which one? Also print out where this occurred.
*/
void CudaWhichError(cudaError_t errorCode, char* location) {
if (errorCode == cudaSuccess) {
// This shouldn't happen; the caller should ensure errorCode != cudaSuccess before calling this function.
printf("Reported error not actually an error... (cudaSuccess) %s\n", location);
return;
}
Fatal("%s %s (error code # = %d)\n", location, cudaGetErrorString(errorCode), errorCode);
}
/*
* Check for errors.
*/
void CheckForErrors(char* location) {
cudaError_t errorCode = cudaGetLastError();
if (errorCode != cudaSuccess) {
CudaWhichError(errorCode, location);
}
}
/*
* Finds and initializes the fastest graphics card for CUDA use.
*
* Returns the max number of threads per block for the selected device.
*/
int GetFastestDevice() {
// Get the number of CUDA devices
int num;
if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
if (num<1) Fatal("No CUDA devices found\n");
// Props
cudaDeviceProp currentDevice;
int fastestGflops = -1;
cudaDeviceProp bestDevice;
int fastestDeviceID = -1;
// Get fastest device
for (int dev=0;dev<num;dev++) {
if (cudaGetDeviceProperties(&currentDevice, dev)) {
Fatal("Error getting device %d properties\n", dev);
}
int Gflops = currentDevice.multiProcessorCount * currentDevice.clockRate;
if (_VERBOSE) {
printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",
dev,
currentDevice.name,
(1e-6 * Gflops),
currentDevice.multiProcessorCount,
currentDevice.maxThreadsPerBlock);
}
if (Gflops > fastestGflops) {
fastestGflops = Gflops;
fastestDeviceID = dev;
bestDevice = currentDevice;
}
}
// Check to see if we get a device
if (fastestDeviceID == -1) {
Fatal("bestDevice == NULL");
}
// Print and set device
if (cudaGetDeviceProperties(&bestDevice, fastestDeviceID)) {
Fatal("Error getting device %d properties\n", fastestDeviceID);
}
cudaSetDevice(fastestDeviceID);
if (_VERBOSE) {
printf("Fastest CUDA Device %d: %s\n", fastestDeviceID, bestDevice.name);
printf("bestDevice.maxThreadsPerBlock: %d\n", bestDevice.maxThreadsPerBlock);
}
CheckForErrors((char*)("GetFastestDevice()"));
// Return max thread count
return bestDevice.maxThreadsPerBlock;
}
/*
* Allocate memory on the GPU, also copy the data over.
*
* CudaPtr variables point to the arrays on the GPU side.
* Host variables point to the arrays on the CPU side.
* Sizes variables determine sizes of the arrays.
*/
void AllocateAndCopyCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr, float** powerOutCudaPtr,
int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
float *prestackHost, int *startIndicesHost, int *endIndicesHost, int *sampleShiftHost,
size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
if (_PRINT_ALLOC_SIZE) {
size_t totalMemoryAllocated = (prestackSizes + (outputSizes * 2) + (inputSizes * 3));
printf(" Total memory allocated for run: %zu\n", totalMemoryAllocated);
printf(" Prestack array size: %zu\n", prestackSizes);
printf(" Output array sizes: %zu\n", outputSizes);
printf(" EtartIndices, EndIndices, & SampleShift array size: %zu\n", inputSizes);
}
cudaError_t cudaCode;
// Allocate memory on the graphics card
cudaCode = cudaMalloc((void**)prestackCudaPtr, prestackSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for prestack array\n")));
cudaCode = cudaMalloc((void**)stackOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for stackOut array\n")));
cudaCode = cudaMalloc((void**)powerOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for powerOut array\n")));
cudaCode = cudaMalloc((void**)startIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for startIndices array\n")));
cudaCode = cudaMalloc((void**)endIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for endIndices array\n")));
cudaCode = cudaMalloc((void**)sampleShiftCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for sampleShift array\n")));
// Copy data over (for the arrays that need it)
cudaCode = cudaMemcpy(*prestackCudaPtr, prestackHost, prestackSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy prestack data over to device.\n")));
cudaCode = cudaMemcpy(*startIndicesCudaPtr, startIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy startIndices data over to device.\n")));
cudaCode = cudaMemcpy(*endIndicesCudaPtr, endIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy endIndices data over to device.\n")));
cudaCode = cudaMemcpy(*sampleShiftCudaPtr, sampleShiftHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy sampleSgift data over to device.\n")));
}
/*
* Enqueue the kernel to be run on the GPU. Pointers that are passed in point to
* device side memory.
*/
void RunCudaMoveAndStackJobs(float** prestackTracesCudaPtr, float** stackTracesOutCudaPtr,
float** powerTracesOutCudaPtr, int** startIndicesCudaPtr, int** exitIndicesCudaPtr,
int** sampleShiftCudaPtr, unsigned int samplesPerT, unsigned int readIns,
unsigned int nOuts, size_t localGroupSize) {
// Set the size
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
if (*prestackTracesCudaPtr == NULL) printf("*prestackTracesCudaPtr == NULL\n");
// Execute the kernel
MoveoutAndStackCuda<<<grid, threads>>>(*prestackTracesCudaPtr,
*stackTracesOutCudaPtr, *powerTracesOutCudaPtr, *startIndicesCudaPtr, *exitIndicesCudaPtr,
*sampleShiftCudaPtr, samplesPerT, readIns, nOuts);
CheckForErrors((char*)("RunCudaMoveAndStackJobs()"));
}
/*
* Copy results back to the host and free memory on the GPU device.
*/
void RetrieveAndCleanupCudaDeviceMemory(float **prestackCudaPtr, float **stackOutCudaPtr,
float **powerOutCudaPtr, int **startIndicesCudaPtr, int **endIndicesCudaPtr, int **sampleShiftCudaPtr,
float *stackOutHost, float *powerOutHost, size_t outputSizes) {
// Copy results from device to host
cudaError_t cudaCode;
cudaCode = cudaMemcpy(stackOutHost, *stackOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy stackOut data back to host.\n"));
cudaCode = cudaMemcpy(powerOutHost, *powerOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy powerOut data back to host.\n"));
// Free device memory (TODO: reverse order)
cudaFree(*prestackCudaPtr);
cudaFree(*stackOutCudaPtr);
cudaFree(*powerOutCudaPtr);
cudaFree(*startIndicesCudaPtr);
cudaFree(*endIndicesCudaPtr);
cudaFree(*sampleShiftCudaPtr);
}
/*
* Runs the program given the arrays passed in the parameters.
*
* Return the time it took to run the program, if desired.
*/
double CommenceCUDAMoveoutAndStack(float* prestackTraces, float* stackOut, float* powerOut,
int* startIndices, int* endIndices, int* sampleShift,
unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
size_t localGroupSize, size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
double returnVal = 0.0;
if (_PRINT_RUN_TIME) {
printf("CommenceCUDAMoveoutAndStack:\n samplesPerTrace=%u nTracesIn=%u nTracesOut=%u\n"
" localGroupSize=%zu\n",
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
}
// Init CUDA
int maxThreadsPerBlock = GetFastestDevice();
// Check the desired local size
if (((int)localGroupSize) > maxThreadsPerBlock) {
Fatal("Error: local group (%zu) size exceeds the max local group size of the selected graphics card (%d).\n",
localGroupSize, maxThreadsPerBlock);
} else if (((int)localGroupSize) < MIN_LOCAL_SIZE) {
Fatal("Error: local group (%zu) size is less than MIN_LOCAL_SIZE (%d).\n",
localGroupSize, MIN_LOCAL_SIZE);
}
// Allocate memory on the device. These pointers will point to memory on the GPU.
double preInitTime = GetTime();
float* prestackCudaPtr = NULL;
float* stackOutCudaPtr = NULL;
float* powerOutCudaPtr = NULL;
int* startIndicesCudaPtr = NULL;
int* endIndicesCudaPtr = NULL;
int* sampleShiftCudaPtr = NULL;
AllocateAndCopyCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
prestackTraces, startIndices, endIndices, sampleShift,
prestackSizes, outputSizes, inputSizes);
// Run the program
RunCudaMoveAndStackJobs(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
// Retrieve the data and clean up graphics card memory
RetrieveAndCleanupCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
stackOut, powerOut,
(size_t)(nTracesOut * samplesPerTrace * sizeof(float)));
// Print the run time (if requested)
if (_PRINT_RUN_TIME) {
returnVal = (GetTime() - preInitTime);
if (_PRINT_RUN_TIME) {
printf(" Run Time: %f secs\n", returnVal);
}
}
return returnVal;
}
// Returns a float 0.0 - 1.0, inclusive
float RandomFloat() {
return static_cast <float> (rand()) / static_cast <float>(RAND_MAX);
}
// Fill in the prestack traces array
void FillFloatArrayRandomly(float* fillArray, unsigned int length) {
for (unsigned int r = 0; r < length; r++) {
fillArray[r] = RandomFloat() * 1000.0f;
}
}
// Fill the start and end arrays randomly
void FillStartEndShiftArraysRandomly(int* startArray, int* nSampsArray, int* shiftArray,
int arrayLength, int rangeOfStartEndMax, int samplesPerT) {
for (int r = 0; r < arrayLength; r++) {
startArray[r] = (rand() % rangeOfStartEndMax);
int endIndex = samplesPerT - (rand() % rangeOfStartEndMax);
nSampsArray[r] = endIndex - startArray[r];
int range = startArray[r] + (samplesPerT - endIndex);
int ra = rand();
if (range != 0) shiftArray[r] = (ra % range) - startArray[r];
else shiftArray[r] = 0;
// Check to make sure we won't go out of bounds
assert((startArray[r] + nSampsArray[r]) <= samplesPerT);
assert(endIndex > startArray[r]);
assert(startArray[r] >= 0);
assert(nSampsArray[r] >= 0);
assert((startArray[r] + shiftArray[r]) >= 0);
assert((nSampsArray[r] + shiftArray[r]) <= samplesPerT);
}
}
// Create arrays for the CUDA program to use
double GenerateArraysAndRun(unsigned int samplesPerTrace,
unsigned int nTracesIn, unsigned int nTracesOut, size_t localGroupS) {
srand(time(NULL)); // Set random seed to current time
double returnVal;
// Create the arrays to be used in the program
float* prestackTraces1D;
float* stackOut1D;
float* powerOut1D;
int* startIndices1D;
int* endIndices1D;
int* shift1D;
// Get sizes of arrays
size_t prestackSizes = samplesPerTrace * nTracesIn * sizeof(float);
size_t outputSizes = nTracesOut * samplesPerTrace * sizeof(float);
size_t inputSizes = nTracesOut * nTracesIn * sizeof(int);
// Fill in the arrays
prestackTraces1D = (float*)malloc(prestackSizes);
stackOut1D = (float*)malloc(outputSizes);
powerOut1D = (float*)malloc(outputSizes);
startIndices1D = (int*)malloc(inputSizes);
endIndices1D = (int*)malloc(inputSizes);
shift1D = (int*)malloc(inputSizes);
FillFloatArrayRandomly(prestackTraces1D, samplesPerTrace * nTracesIn);
FillStartEndShiftArraysRandomly(startIndices1D, endIndices1D, shift1D,
(int)(nTracesOut * nTracesIn), (int)(((float)samplesPerTrace) * 0.1), (int)samplesPerTrace);
// Check if arrays were created
if (prestackTraces1D == NULL) Fatal("GenerateArraysAndRun(): prestackTraces1D == NULL\n");
if (stackOut1D == NULL) Fatal("GenerateArraysAndRun(): stackOut1D == NULL\n");
if (powerOut1D == NULL) Fatal("GenerateArraysAndRun(): powerOut1D == NULL\n");
if (startIndices1D == NULL) Fatal("GenerateArraysAndRun(): startIndices1D == NULL\n");
if (endIndices1D == NULL) Fatal("GenerateArraysAndRun(): endIndices1D == NULL\n");
if (shift1D == NULL) Fatal("GenerateArraysAndRun(): shift1D == NULL\n");
// Run the program
returnVal = CommenceCUDAMoveoutAndStack(prestackTraces1D, stackOut1D, powerOut1D, startIndices1D,
endIndices1D, shift1D, samplesPerTrace, nTracesIn, nTracesOut,
localGroupS, prestackSizes, outputSizes, inputSizes);
// Finished: free the memory on CPU side in reverse order
free(shift1D);
free(endIndices1D);
free(startIndices1D);
free(powerOut1D);
free(stackOut1D);
free(prestackTraces1D);
// Return the time that the program gave us
return returnVal;
}
// Main
int main(int argc, char* argv[]) {
// TODO: Errors here
if (argc != 5)
Fatal("Incorrect # of Arguments (5 Needed) <samplesPerTrace> <nTracesIn> <nTracesOut> <LocalGroupSize>\n"
" argc = %d\n", argc);
unsigned int samplesPerTrace = atoi(argv[1]);
unsigned int nTracesIn = atoi(argv[2]);
unsigned int nTracesOut = atoi(argv[3]);
size_t localGroupS = atoi(argv[4]);
GenerateArraysAndRun(samplesPerTrace, nTracesIn, nTracesOut, localGroupS);
return 0;
}
The problem was that I was spawning too many blocks. In OpenCL, you tell the kernel the total number of threads and how many threads are in each block, and the total number of blocks is determined from that. In CUDA, by contrast, you tell the kernel how many blocks there are and how many threads per block there are, and the total number of threads is determined by those. So:
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
Should be:
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts) / localGroupSize);
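If samplesPerT * nOuts is not an exact multiple of localGroupSize, the usual idiom (sketched here under that assumption, using the kernel's existing parameter names) is to round the block count up and guard the extra threads inside the kernel:
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts + localGroupSize - 1) / localGroupSize); // ceiling division
// and at the top of MoveoutAndStackCuda, before any global memory accesses:
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (globalId >= samplesPerT * nOuts) return; // leftover threads in the last block do nothing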
I have this code in my CUDA C application:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
int main()
{
const int size = 32;
unsigned int * dev_ips_range_end;
unsigned int * ips_range_end = new unsigned int[size];
for (int i = 0; i < size; i++)
ips_range_end[i] = i;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ips_range_end, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_ips_range_end, ips_range_end, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
thrust::device_ptr<unsigned int> dev_ips_range_end_ptr(dev_ips_range_end);
thrust::inclusive_scan(dev_ips_range_end_ptr, dev_ips_range_end_ptr + size, dev_ips_range_end_ptr);
return 0;
Error:
cudaFree(dev_ips_range_end);
}
Here is the command I used and the output:
[Test]$ nvcc -I/usr/local/cuda/include -L/usr/local/cuda/lib kernel.cu -o test.run
kernel.cu(27): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
kernel.cu(32): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
kernel.cu(39): error: transfer of control bypasses initialization of:
variable "dev_ips_range_end_ptr"
(42): here
3 errors detected in the compilation of "/tmp/tmpxft_000022ad_00000000-9_kernel.cpp1.ii".
The same code works without any problem in Visual Studio on Windows.
How do I solve this issue?
Some people might tell you that the use of goto in C/C++ isn't a great idea. But to avoid arguments, and allow you to keep the same code structure, you can declare your thrust device pointer at the top of your program (before any goto statements) and then set the pointer value when you are ready to use it, like this:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
int main()
{
const int size = 32;
unsigned int * dev_ips_range_end;
unsigned int * ips_range_end = new unsigned int[size];
for (int i = 0; i < size; i++)
ips_range_end[i] = i;
thrust::device_ptr<unsigned int> dev_ips_range_end_ptr;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ips_range_end, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_ips_range_end, ips_range_end, size * sizeof(unsigned int), cudaMemcpyHostToDevice); // sizeof(unsigned int): the buffer holds unsigned ints, so copy all of the data
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Problem !");
goto Error;
}
dev_ips_range_end_ptr = thrust::device_pointer_cast(dev_ips_range_end);
thrust::inclusive_scan(dev_ips_range_end_ptr, dev_ips_range_end_ptr + size, dev_ips_range_end_ptr);
return 0;
Error:
cudaFree(dev_ips_range_end);
}
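Another option, shown here only as a sketch, is to keep main() free of local C++ objects altogether by moving the Thrust calls into a helper function, so there is no initialization for the gotos to bypass (the helper name below is made up for illustration):
// hypothetical helper; main() would call it in place of the two thrust lines
static void inclusiveScanOnDevice(unsigned int *dev_data, int n)
{
    thrust::device_ptr<unsigned int> p = thrust::device_pointer_cast(dev_data);
    thrust::inclusive_scan(p, p + n, p);
}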
I'm trying to learn how to work with cuFFT, and my code is the following:
#include <iostream>
#include <cstdio>   // printf
#include <cstdlib>  // rand, exit
#include <cstring>  // strcmp
//For FFT
#include <cufft.h>
using namespace std;
typedef enum signaltype {REAL, COMPLEX} signal;
//Function to fill the buffer with random real values
void randomFill(cufftComplex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand() / (float) RAND_MAX;
h_signal[i].y = 0;
}
}
}
//Printing the random data in the buffer
void printData(cufftComplex *a, int size, char *msg) {
if (strcmp(msg,"")==0) printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
}
// FFT a signal that's on the _DEVICE_.
// Doing FFT
void signalFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan FFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS)
{
printf ("Failed Executing FFT\n");
exit(0);
}
}
// Doing IFFT
void signalIFFT(cufftComplex *d_signal, int signal_size)
{
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS)
{
printf("Failed to plan IFFT\n");
exit(0);
}
// Execute the plan
if (cufftExecC2C(plan, d_signal, d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS)
{
printf ("Failed Executing IFFT\n");
exit(0);
}
}
int main(int argc, char **argv)
{
cudaDeviceSynchronize();
//Declaring two complex type variables;
cufftComplex *h_signal, *d_signal1;
//Declaring the size variable
int alloc_size;
alloc_size = 16;
//Allocating the memory for CPU version complex variable
h_signal = (cufftComplex *) malloc(sizeof(cufftComplex) * alloc_size);
//Allocating the memory for GPU version complex variable
cudaMalloc(&d_signal1, sizeof(cufftComplex) * alloc_size);
// Add random data to signal.
randomFill(h_signal, alloc_size, REAL);
printData(h_signal, alloc_size, "Random H1");
// Copying the data the data to CUDA
cudaMemcpy(d_signal1, h_signal, sizeof(cufftComplex) * alloc_size, cudaMemcpyHostToDevice);
//Applying FFT
signalFFT(d_signal1, alloc_size);
//Doing IFFT
signalIFFT(d_signal1, alloc_size);
cudaMemcpy(h_signal, d_signal1, sizeof(cufftComplex) * alloc_size, cudaMemcpyDeviceToHost);
printData(h_signal, alloc_size, "IFFT");
return 0;
}
And the MAKEFILE consists of the following:
main: main.cu Makefile
	nvcc -o main main.cu --ptxas-options=-v --use_fast_math
But I get compilation errors. Apparently the problem occurs only when I call the functions cufftPlan1d and cufftExecC2C. Do I have to add anything extra in the Makefile to make use of these functions? My CUDA version is 5.5 and I'm doing this on Ubuntu.
Thanks
There are two problems here
The CUFFT library is not being linked. Change the compilation command to:
nvcc -o main main.cu --ptxas-options=-v --use_fast_math -lcufft
Set LD_LIBRARY_PATH to include the absolute path to the cuFFT library so that the shared library can be loaded at run time.
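For example, assuming a default CUDA installation under /usr/local/cuda (the library directory may be lib or lib64 depending on the system), something along these lines before running the program:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64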
[This answer has been assembled from comments and added as a community wiki entry to get this question off the unanswered queue for the CUDA tag]
#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
__global__ void setVal(char **c){
c[(blockIdx.y * gridDim.x) + blockIdx.x] = "hello\0";
}
int main(){
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, I revised my code to make my concept correct. But the code is still not working. :( Any help will be appreciated.
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
__global__ void setValues(char** word)
{
volatile char* myWord = word[blockIdx.x];
myWord[0] = 'H';
myWord[1] = 'o';
myWord[2] = 'l';
myWord[3] = 'a';
myWord[4] = '\0';
}
int main()
{
const size_t bufferSize = 32;
const int nObjects = 10;
char* h_x[nObjects];
char** d_x = 0;
cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) );
for ( int i=0; i < nObjects; i++ )
{
h_x[i] = NULL;
cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) );
printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
}
cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice);
printf("Copied h_x[] to d_x[]\n");
char msg[] = "Hello World!";
cudaMemcpy( h_x[0], msg, 13*sizeof(char), cudaMemcpyHostToDevice );
/* Force Thread Synchronization */
cudaError err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
setValues<<<nObjects,1>>>(d_x);
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
printf("Kernel Completed Successfully. Woot.\n\n");
char p[bufferSize];
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
cudaMemcpy( h_x, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost);
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
for ( int i=0; i < nObjects; i++ )
{
cudaMemcpy( &p, h_x[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost);
printf("%d p[] = %s\n",i,p);
}
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
getchar();
return 0;
}
As #Jon notes, you can't pass x (as you had declared it) to the GPU, because it's an address that lives on the CPU. In the code above, I create an array of char*'s and pass them to a char** that I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void setValues(char *arr){
arr[blockIdx.y * gridDim.x + blockIdx.x] = '4';
}
int main() {
const int NCHARS=6;
char *xd;
cudaMalloc(&xd, NCHARS);
dim3 grid(3,2);
setValues<<<grid,1>>>(xd);
char *p;
p = (char*) malloc(20*sizeof(char));
strcpy(p,"");
cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
p[NCHARS]='\0';
printf("<%s>\n", p);
getchar();
cudaFree(xd);
return 0;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.
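Putting those two points together, a minimal sketch (the variable names here are illustrative, not taken from the original code) of getting a host string into device memory and keeping a device pointer to it:
const char h_msg[] = "4";                                        // string constant in host memory
char *d_msg = NULL;
cudaMalloc((void**)&d_msg, sizeof(h_msg));                       // device buffer for the string
cudaMemcpy(d_msg, h_msg, sizeof(h_msg), cudaMemcpyHostToDevice); // copy "4" to the device
char **d_arr = NULL;
cudaMalloc((void**)&d_arr, 6 * sizeof(char*));                   // device array of device pointers
// d_msg (a device address) can now be stored into an element of d_arr,
// either from a kernel or with another cudaMemcpy of the pointer value itself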
Certain operations in my app are using more memory than I think they should, and I would like to log the current memory usage to help identify which they are.
Is there a system call that will return the amount of memory currently in use?
The following C function returns the CPU time and resident memory of process pid. To get the resources of other processes, you need root permission. You may also try getrusage(), but I never got it to work properly for memory usage. Getting CPU time with getrusage() always works for me.
The function is adapted from the source code of the ps and top commands. It is part of a program of mine that monitors the memory of other processes.
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <mach/mach_init.h>
#include <mach/mach_host.h>
#include <mach/mach_port.h>
#include <mach/mach_traps.h>
#include <mach/task_info.h>
#include <mach/thread_info.h>
#include <mach/thread_act.h>
#include <mach/vm_region.h>
#include <mach/vm_map.h>
#include <mach/task.h>
#include <mach/shared_memory_server.h>
typedef struct vmtotal vmtotal_t;
typedef struct { /* dynamic process information */
size_t rss, vsize;
double utime, stime;
} RunProcDyn;
/* On Mac OS X, the only way to get enough information is to become root. Pretty frustrating!*/
int run_get_dynamic_proc_info(pid_t pid, RunProcDyn *rpd)
{
task_t task;
kern_return_t error;
mach_msg_type_number_t count;
thread_array_t thread_table;
thread_basic_info_t thi;
thread_basic_info_data_t thi_data;
unsigned table_size;
struct task_basic_info ti;
error = task_for_pid(mach_task_self(), pid, &task);
if (error != KERN_SUCCESS) {
/* fprintf(stderr, "++ Probably you have to set suid or become root.\n"); */
rpd->rss = rpd->vsize = 0;
rpd->utime = rpd->stime = 0;
return 0;
}
count = TASK_BASIC_INFO_COUNT;
error = task_info(task, TASK_BASIC_INFO, (task_info_t)&ti, &count);
assert(error == KERN_SUCCESS);
{ /* adapted from ps/tasks.c */
vm_region_basic_info_data_64_t b_info;
vm_address_t address = GLOBAL_SHARED_TEXT_SEGMENT;
vm_size_t size;
mach_port_t object_name;
count = VM_REGION_BASIC_INFO_COUNT_64;
error = vm_region_64(task, &address, &size, VM_REGION_BASIC_INFO,
(vm_region_info_t)&b_info, &count, &object_name);
if (error == KERN_SUCCESS) {
if (b_info.reserved && size == (SHARED_TEXT_REGION_SIZE) &&
ti.virtual_size > (SHARED_TEXT_REGION_SIZE + SHARED_DATA_REGION_SIZE))
{
ti.virtual_size -= (SHARED_TEXT_REGION_SIZE + SHARED_DATA_REGION_SIZE);
}
}
rpd->rss = ti.resident_size;
rpd->vsize = ti.virtual_size;
}
{ /* calculate CPU times, adapted from top/libtop.c */
unsigned i;
rpd->utime = ti.user_time.seconds + ti.user_time.microseconds * 1e-6;
rpd->stime = ti.system_time.seconds + ti.system_time.microseconds * 1e-6;
error = task_threads(task, &thread_table, &table_size);
assert(error == KERN_SUCCESS);
thi = &thi_data;
for (i = 0; i != table_size; ++i) {
count = THREAD_BASIC_INFO_COUNT;
error = thread_info(thread_table[i], THREAD_BASIC_INFO, (thread_info_t)thi, &count);
assert(error == KERN_SUCCESS);
if ((thi->flags & TH_FLAGS_IDLE) == 0) {
rpd->utime += thi->user_time.seconds + thi->user_time.microseconds * 1e-6;
rpd->stime += thi->system_time.seconds + thi->system_time.microseconds * 1e-6;
}
if (task != mach_task_self()) {
error = mach_port_deallocate(mach_task_self(), thread_table[i]);
assert(error == KERN_SUCCESS);
}
}
error = vm_deallocate(mach_task_self(), (vm_offset_t)thread_table, table_size * sizeof(thread_array_t));
assert(error == KERN_SUCCESS);
}
mach_port_deallocate(mach_task_self(), task);
return 0;
}
#endif /* __APPLE__ */
Launch your application with Instruments, put it through its paces, and evaluate the results...
Following #user172818's advice, I tried getrusage() and it worked for me:
#include <sys/time.h>
#include <sys/resource.h>
long getMemoryUsage()
{
struct rusage usage;
if(0 == getrusage(RUSAGE_SELF, &usage))
return usage.ru_maxrss; // bytes
else
return 0;
}
I am using Mac OS X 10.9.4, with compiler Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
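For reference, a minimal caller might look like the following. Note that ru_maxrss is the peak resident set size and its units differ by platform: bytes on Mac OS X but kilobytes on Linux, so check the getrusage man page on your system.
#include <stdio.h>
int main()
{
    printf("Peak resident set size: %ld\n", getMemoryUsage());
    return 0;
}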
You can give the mallocDebug function a try: http://developer.apple.com/mac/library/DOCUMENTATION/Performance/Conceptual/ManagingMemory/Articles/FindingPatterns.html