I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>
#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
#define CUDADEBUG(cudaError) \
if (cudaError != cudaSuccess) \
DEBUG(cudaGetErrorString(cudaError));
#define ERROR(str) \
{ \
DEBUG(str); \
exit(1); \
}
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
{
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned icn = i * cn;
printf("%u\n", i);
if (i < total)
{
float result = pimage[icn + 0] * .114 +
pimage[icn + 1] * .587 +
pimage[icn + 2] * .299;
pimage[icn + 0] = result; //B
pimage[icn + 1] = result; //G
pimage[icn + 2] = result; //R
// pimage[icn + 3] *= result; //A
}
}
int main(int argc, char **argv)
{
if (argc != 3)
ERROR("usage: executable in out");
cv::Mat image;
unsigned char *dimage;
image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
if (!image.data)
ERROR("Image null");
if (image.empty())
ERROR("Image empty");
if (!image.isContinuous())
ERROR("image is not continuous");
const size_t N = image.total();
const int cn = image.channels();
const size_t numOfElems = cn * N;
const int blockSize = 512;
const int gridSize = (N - 1) / blockSize + 1;
CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
if (errAsync != cudaSuccess)
std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
CUDADEBUG(cudaFree(dimage)); //line 74
cv::imwrite(argv[2], image);
return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked CV_VERSION macro which is 4.5.3-dev, and Cuda Toolkit 11.4 is installed (nvcc version 11.4). Also afaik, the kernel does not execute at all (I used Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
As mentioned in a comment, your GPU function takes arguments by references.
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
This is bad, passing a reference to a function means more or less that you're passing an address where you can find the value, not the value itself.
In your situation those values are in memory used by Host, NOT Device/GPU memory, when GPU tries to access those values it will most likely crash.
The types you are trying to pass, unsigned char*, int and size_t are very cheap to copy, there's no need to pass them by reference in the 1st place.
__global__ void makeGrey(
unsigned char *pimage,
const int cn,
const size_t total)
There are tools provided by nvidia to debug CUDA applications, but I'm not really familiar with them, you can also use printf inside GPU functions, but you will have to organize output from potentially thousand of threads.
In general, whenever you call GPU functions, be very cautious about what you're passing as parameters, as they need to be passed from Host memory to Device memory. Usually you want to pass everything by value, any pointers need to point to Device memory, and watch out from references.
So I wrote a Cuda version of an OpenCL program I wrote. The OpenCL versions works, meanwhile the Cuda version doesn't. Now converting OpenCL code to Cuda code isn't 1-to-1, but I'm confused as to why the cuda version wouldn't work after all I did base my code around an cuda example when translating it over.
I am getting an illegal memory access was encountered (error code # = 77) during a cudaMemcpy(... cudaMemcpyDeviceToHost); (line 227) Although it's during a memcpy the problem appears to be an illegal memory access during the kernel run. Here is an example of what I get with cuda-memcheck checking the program:
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (53,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
=========
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (52,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
I don't understand the differences between Cuda and OpenCL well enough to know what I am doing wrong. I tried mucking around with MoveoutAndStackCuda<<<grid, threads>>> and change it to something like MoveoutAndStackCuda<<<grid, threads, (localGroupSize * sizeof(float))>>> but no luck. I've also tried commenting out parts of my kernel the problem appears to occur even when I have commented out most of my kernel.
Hopefully this is verifiable for you, but there is a chance that it isn't since it could depend on my hardware. I am running a Quadro M5000 on CentOS 6.8 (Final).
I tried to cut out as much stuff that is useless for this problem as possible. I would also provide the working OpenCL version of this MCV example however I am out of text. I recommend debugging using the arguments 100 50 40 for now, because I also have a problem of spawning too many global threads that I will tackle after this one is solved.
Here is the Minimal, Complete, and Verifiable example:
#include <math.h>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <cuda.h>
#include <assert.h>
#include <unistd.h>
const bool _VERBOSE = true;
const bool _PRINT_ALLOC_SIZE = true;
const bool _PRINT_RUN_TIME = true;
const int MIN_LOCAL_SIZE = 8;
__global__ void MoveoutAndStackCuda(float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], int startIndices[], int exitIndices[],
int sampleShift[], const unsigned int samplesPerT, const unsigned int readIns,
const unsigned int nOuts) {
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
float stackF = 0.0;
float powerF = 0.0;
unsigned int readIndex = (globalId % samplesPerT);
unsigned int jobNum = (globalId / samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
if ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd))) {
float value = prestackTraces[readIndex + (x * samplesPerT) + sampleShift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[globalId] = stackF;
powerTracesOut[globalId] = powerF;
}
/*
* Single threaded version that somewhat mimics what is executed in the OpenCL code as close as possible.
*/
void MoveoutAndStackSingleThread(const float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], const int startIndices[], const int exitIndices[], const int shift[],
const unsigned int samplesPerT, const unsigned int readIns, const unsigned int nOuts,
const unsigned int jobNum, const unsigned int readIndex) {
float stackF = 0.0f;
float powerF = 0.0f;
int outputIndex = readIndex + (jobNum * samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
bool shouldRead = ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd)));
if (shouldRead) {
float value = prestackTraces[readIndex + (x * samplesPerT) + shift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[outputIndex] = stackF;
powerTracesOut[outputIndex] = powerF;
}
/**
* Used to keep track of how long it takes to execute this.
*/;
double GetTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + (1e-6 * tv.tv_usec);
}
/*
* Print message to stderr and exit.
*/
void Fatal(const char* format, ...) {
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(1);
}
/*
* We have an error, which one? Also print out where this occured.
*/
void CudaWhichError(cudaError_t errorCode, char* location) {
if (errorCode == cudaSuccess) {
// This shouldn't happen. It should be made sure that errorCode != cudaSuccess before calling this function.
printf("Reported error not actually an error... (cudaSuccess) %s\n", location);
return;
}
Fatal("%s %s (error code # = %d)\n", location, cudaGetErrorString(errorCode), errorCode);
}
/*
* Check for errors.
*/
void CheckForErrors(char* location) {
cudaError_t errorCode = cudaGetLastError();
if (errorCode != cudaSuccess) {
CudaWhichError(errorCode, location);
}
}
/*
* Finds and initializes the fastest graphics card for CUDA use.
*
* Returns the max number of threads per block for the selected device.
*/
int GetFastestDevice() {
// Get the number of CUDA devices
int num;
if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
if (num<1) Fatal("No CUDA devices found\n");
// Props
cudaDeviceProp currentDevice;
int fastestGflops = -1;
cudaDeviceProp bestDevice;
int fastestDeviceID = -1;
// Get fastest device
for (int dev=0;dev<num;dev++) {
if (cudaGetDeviceProperties(¤tDevice, dev)) {
Fatal("Error getting device %d properties\n", dev);
}
int Gflops = currentDevice.multiProcessorCount * currentDevice.clockRate;
if (_VERBOSE) {
printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",
dev,
currentDevice.name,
(1e-6 * Gflops),
currentDevice.multiProcessorCount,
currentDevice.maxThreadsPerBlock);
}
if (Gflops > fastestGflops) {
fastestGflops = Gflops;
fastestDeviceID = dev;
bestDevice = currentDevice;
}
}
// Check to see if we get a device
if (fastestDeviceID == -1) {
Fatal("bestDevice == NULL");
}
// Print and set device
if (cudaGetDeviceProperties(&bestDevice, fastestDeviceID)) {
Fatal("Error getting device %d properties\n", fastestDeviceID);
}
cudaSetDevice(fastestDeviceID);
if (_VERBOSE) {
printf("Fastest CUDA Device %d: %s\n", fastestDeviceID, bestDevice.name);
printf("bestDevice.maxThreadsPerBlock: %d\n", bestDevice.maxThreadsPerBlock);
}
CheckForErrors((char*)("GetFastestDevice()"));
// Return max thread count
return bestDevice.maxThreadsPerBlock;
}
/*
* Allocate memory on the GPU, also copy the data over.
*
* CudaPtr variables point to the arrays on the GPU side.
* Host variables point to the arrays on the CPU side.
* Sizes variables determine sizes of the arrays.
*/
void AllocateAndCopyCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr, float** powerOutCudaPtr,
int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
float *prestackHost, int *startIndicesHost, int *endIndicesHost, int *sampleShiftHost,
size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
if (_PRINT_ALLOC_SIZE) {
size_t totalMemoryAllocated = (prestackSizes + (outputSizes * 2) + (inputSizes * 3));
printf(" Total memory allocated for run: %zu\n", totalMemoryAllocated);
printf(" Prestack array size: %zu\n", prestackSizes);
printf(" Output array sizes: %zu\n", outputSizes);
printf(" EtartIndices, EndIndices, & SampleShift array size: %zu\n", inputSizes);
}
cudaError_t cudaCode;
// Allocate memory on the graphics card
cudaCode = cudaMalloc((void**)prestackCudaPtr, prestackSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for prestack array\n")));
cudaCode = cudaMalloc((void**)stackOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for stackOut array\n")));
cudaCode = cudaMalloc((void**)powerOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for powerOut array\n")));
cudaCode = cudaMalloc((void**)startIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for startIndices array\n")));
cudaCode = cudaMalloc((void**)endIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for endIndices array\n")));
cudaCode = cudaMalloc((void**)sampleShiftCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for sampleShift array\n")));
// Copy data over (for the arrays the need it)
cudaCode = cudaMemcpy(*prestackCudaPtr, prestackHost, prestackSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy prestack data over to device.\n")));
cudaCode = cudaMemcpy(*startIndicesCudaPtr, startIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy startIndices data over to device.\n")));
cudaCode = cudaMemcpy(*endIndicesCudaPtr, endIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy endIndices data over to device.\n")));
cudaCode = cudaMemcpy(*sampleShiftCudaPtr, sampleShiftHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy sampleSgift data over to device.\n")));
}
/*
* Enqueue the kernels to be ran on the gpu. Pointers that are passed in are pointing to
* device side memory.
*/
void RunCudaMoveAndStackJobs(float** prestackTracesCudaPtr, float** stackTracesOutCudaPtr,
float** powerTracesOutCudaPtr, int** startIndicesCudaPtr, int** exitIndicesCudaPtr,
int** sampleShiftCudaPtr, unsigned int samplesPerT, unsigned int readIns,
unsigned int nOuts, size_t localGroupSize) {
// Set the size
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
if (*prestackTracesCudaPtr == NULL) printf("*prestackTracesCudaPtr == NULL\n");
// Execute the kernel
MoveoutAndStackCuda<<<grid, threads>>>(*prestackTracesCudaPtr,
*stackTracesOutCudaPtr, *powerTracesOutCudaPtr, *startIndicesCudaPtr, *exitIndicesCudaPtr,
*sampleShiftCudaPtr, samplesPerT, readIns, nOuts);
CheckForErrors((char*)("RunCudaMoveAndStackJobs()"));
}
/*
* Free memory on the GPU device as well as free the remaining OpenCL objects for the host side.
*/
void RetrieveAndCleanupCudaDeviceMemory(float **prestackCudaPtr, float **stackOutCudaPtr,
float **powerOutCudaPtr, int **startIndicesCudaPtr, int **endIndicesCudaPtr, int **sampleShiftCudaPtr,
float *stackOutHost, float *powerOutHost, size_t outputSizes) {
// Copy C from device to host
cudaError_t cudaCode;
cudaCode = cudaMemcpy(stackOutHost, *stackOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy stackOut data back to host.\n"));
cudaCode = cudaMemcpy(powerOutHost, *powerOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy powerOut data back to host.\n"));
// Free device memory (TODO: reverse order)
cudaFree(*prestackCudaPtr);
cudaFree(*stackOutCudaPtr);
cudaFree(*powerOutCudaPtr);
cudaFree(*startIndicesCudaPtr);
cudaFree(*endIndicesCudaPtr);
cudaFree(*sampleShiftCudaPtr);
}
/*
* Runs the program given the arrays passed in the parameters.
*
* Return the time it took to run the program, if desired.
*/
double CommenceCUDAMoveoutAndStack(float* prestackTraces, float* stackOut, float* powerOut,
int* startIndices, int* endIndices, int* sampleShift,
unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
size_t localGroupSize, size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
double returnVal = 0.0;
if (_PRINT_RUN_TIME) {
printf("CommenceCUDAMoveoutAndStack:\n samplesPerTrace=%u nTracesIn=%u nTracesOut=%u\n"
" localGroupSize=%zu\n",
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
}
// Init CUDA
int maxThreadsPerBlock = GetFastestDevice();
// Check the desirec local size
if (((int)localGroupSize) > maxThreadsPerBlock) {
Fatal("Error: local group (%zu) size exceeds the max local group size of the selected graphics card (%d).\n",
localGroupSize, maxThreadsPerBlock);
} else if (((int)localGroupSize) < MIN_LOCAL_SIZE) {
Fatal("Error: local group (%zu) size is less than MIN_LOCAL_SIZE (%d).\n",
localGroupSize, MIN_LOCAL_SIZE);
}
// Allocate memory on the device. These pointers will point to memory on the GPU.
double preInitTime = GetTime();
float* prestackCudaPtr = NULL;
float* stackOutCudaPtr = NULL;
float* powerOutCudaPtr = NULL;
int* startIndicesCudaPtr = NULL;
int* endIndicesCudaPtr = NULL;
int* sampleShiftCudaPtr = NULL;
AllocateAndCopyCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
prestackTraces, startIndices, endIndices, sampleShift,
prestackSizes, outputSizes, inputSizes);
// Run the program
RunCudaMoveAndStackJobs(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
// Retrieve the data and clean up graphics card memory
RetrieveAndCleanupCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
stackOut, powerOut,
(size_t)(nTracesOut * samplesPerTrace * sizeof(float)));
// Print the run time (if requested)
if (_PRINT_RUN_TIME) {
returnVal = (GetTime() - preInitTime);
if (_PRINT_RUN_TIME) {
printf(" Run Time: %f secs\n", returnVal);
}
}
return returnVal;
}
// Returns a float 0.0 - 1.0, inclusive
float RandomFloat() {
return static_cast <float> (rand()) / static_cast <float>(RAND_MAX);
}
// Fill in the prestack traces array
void FillFloatArrayRandomly(float* fillArray, unsigned int length) {
for (unsigned int r = 0; r < length; r++) {
fillArray[r] = RandomFloat() * 1000.0f;
}
}
// Fill the start and end arrays randomly
void FillStartEndShiftArraysRandomly(int* startArray, int* nSampsArray, int* shiftArray,
int arrayLength, int rangeOfStartEndMax, int samplesPerT) {
for (int r = 0; r < arrayLength; r++) {
startArray[r] = (rand() % rangeOfStartEndMax);
int endIndex = samplesPerT - (rand() % rangeOfStartEndMax);
nSampsArray[r] = endIndex - startArray[r];
int range = startArray[r] + (samplesPerT - endIndex);
int ra = rand();
if (range != 0) shiftArray[r] = (ra % range) - startArray[r];
else shiftArray[r] = 0;
// Check to make sure we won't go out of bounds
assert((startArray[r] + nSampsArray[r]) <= samplesPerT);
assert(endIndex > startArray[r]);
assert(startArray[r] >= 0);
assert(nSampsArray[r] >= 0);
assert((startArray[r] + shiftArray[r]) >= 0);
assert((nSampsArray[r] + shiftArray[r]) <= samplesPerT);
}
}
// Create arrays for the OpenCL program to use
double GenerateArraysAndRun(unsigned int samplesPerTrace,
unsigned int nTracesIn, unsigned int nTracesOut, size_t localGroupS) {
srand(time(NULL)); // Set random seed to current time
double returnVal;
// Create the arrays to be used in the program
float* prestackTraces1D;
float* stackOut1D;
float* powerOut1D;
int* startIndices1D;
int* endIndices1D;
int* shift1D;
// Get sizes or arrays
size_t prestackSizes = samplesPerTrace * nTracesIn * sizeof(float);
size_t outputSizes = nTracesOut * samplesPerTrace * sizeof(float);
size_t inputSizes = nTracesOut * nTracesIn * sizeof(int);
// Fill in the arrays
prestackTraces1D = (float*)malloc(prestackSizes);
stackOut1D = (float*)malloc(outputSizes);
powerOut1D = (float*)malloc(outputSizes);
startIndices1D = (int*)malloc(inputSizes);
endIndices1D = (int*)malloc(inputSizes);
shift1D = (int*)malloc(inputSizes);
FillFloatArrayRandomly(prestackTraces1D, samplesPerTrace * nTracesIn);
FillStartEndShiftArraysRandomly(startIndices1D, endIndices1D, shift1D,
(int)(nTracesOut * nTracesIn), (int)(((float)samplesPerTrace) * 0.1), (int)samplesPerTrace);
// Check if arrays were created
if (prestackTraces1D == NULL) Fatal("GenerateArraysAndRun(): prestackTraces1D == NULL\n");
if (stackOut1D == NULL) Fatal("GenerateArraysAndRun(): stackOut1D == NULL\n");
if (powerOut1D == NULL) Fatal("GenerateArraysAndRun(): powerOut1D == NULL\n");
if (startIndices1D == NULL) Fatal("GenerateArraysAndRun(): startIndices1D == NULL\n");
if (endIndices1D == NULL) Fatal("GenerateArraysAndRun(): endIndices1D == NULL\n");
if (shift1D == NULL) Fatal("GenerateArraysAndRun(): shift1D == NULL\n");
// Run the program
returnVal = CommenceCUDAMoveoutAndStack(prestackTraces1D, stackOut1D, powerOut1D, startIndices1D,
endIndices1D, shift1D, samplesPerTrace, nTracesIn, nTracesOut,
localGroupS, prestackSizes, outputSizes, inputSizes);
// Finished: free the memory on CPU side in reverse order
free(shift1D);
free(endIndices1D);
free(startIndices1D);
free(powerOut1D);
free(stackOut1D);
free(prestackTraces1D);
// Return the time that the program gave us
return returnVal;
}
// Main
int main(int argc, char* argv[]) {
// TODO: Errors here
if (argc != 5)
Fatal("Incorrect # of Arguments (5 Needed) <samplesPerTrace> <nTracesIn> <nTracesOut> <LocalGroupSize>\n"
" argc = %d\n", argc);
unsigned int samplesPerTrace = atoi(argv[1]);
unsigned int nTracesIn = atoi(argv[2]);
unsigned int nTracesOut = atoi(argv[3]);
size_t localGroupS = atoi(argv[4]);
GenerateArraysAndRun(samplesPerTrace, nTracesIn, nTracesOut, localGroupS);
return 0;
}
The problem was that I was spawning too many blocks. In OpenCL, you tell the kernel the total number of threads and how many threads are in each block, and the total # of blocks is determined from that. Meanwhile in Cuda, you tell the kernel how many blocks there are and how many threads per block there is, and the total # of threads is determined by those. So:
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
Should be:
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts) / localGroupSize);
So I am trying to write a program that turns RGB images to greyscale.
I got the idea from the Udacity problem set. The problem is that when I write out the kernel in the Udacity web environment, it says my code works, however, when I try to do it locally on my computer, I get no errors, but my image instead of coming out greyscale, comes out completely grey. It looks like one grey box the dimensions of the image I loaded. Can you help me find the error in my code, I've compared it with the Udacity version and I can't seem to find it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <cuda.h>
#include <stdio.h>
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <iostream>
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
__global__ void rgb_2_grey(uchar* const greyImage, const uchar4* const rgbImage, int rows, int columns)
{
int rgb_x = blockIdx.x * blockDim.x + threadIdx.x; //x coordinate of pixel
int rgb_y = blockIdx.y * blockDim.y + threadIdx.y; //y coordinate of pixel
if ((rgb_x >= columns) && (rgb_y >= rows)) {
return;
}
int rgb_ab = rgb_y*columns + rgb_x; //absolute pixel position
uchar4 rgb_Img = rgbImage[rgb_ab];
greyImage[rgb_ab] = uchar((float(rgb_Img.x))*0.299f + (float(rgb_Img.y))*0.587f + (float(rgb_Img.z))*0.114f);
}
using namespace cv;
using namespace std;
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage);
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols);
void Save_Img();
Mat img_RGB;
Mat img_Grey;
uchar4 *d_rgbImg;
uchar *d_greyImg;
int main()
{
uchar4* h_rgbImg;
//uchar4* d_rgbImge=0;
uchar* h_greyImg;
//uchar* d_greyImge=0;
Proc_Img(&h_rgbImg, &h_greyImg, &d_rgbImg, &d_greyImg);
RGB_2_Greyscale(d_greyImg, d_rgbImg, img_RGB.rows, img_RGB.cols);
Save_Img();
return 0;
}
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage){
cudaFree(0);
CudaCheckError();
//loads image into a matrix object along with the colors in BGR format (must convert to rgb).
Mat img = imread("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG", CV_LOAD_IMAGE_COLOR);
if (img.empty()){
cerr << "couldnt open file dumbas..." << "C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG" << endl;
exit(1);
}
//converts color type from BGR to RGB
cvtColor(img, img_RGB, CV_BGR2RGBA);
//allocate memory for new greyscale image.
//img.rows returns the range of pixels in y, img.cols returns range of pixels in x
//CV_8UC1 means 8 bit unsigned(non-negative) single channel of color, aka greyscale.
//all three of the parameters allow the create function in the Mat class to determine how much memory to allocate
img_Grey.create(img.rows, img.cols, CV_8UC1);
//creates rgb and greyscale image arrays
*h_RGBImage = (uchar4*)img_RGB.ptr<uchar>(0); //.ptr is a method in the mat class that returns a pointer to the first element of the matrix.
*h_greyImage = (uchar*)img_Grey.ptr<uchar>(0); //this is just like a regular array/pointer mem address to first element of the array. This is templated
//in this case the compiler runs the function for returning pointer of type unsigned char. for rgb image it is
//cast to uchar4 struct to hold r,g, and b values.
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols); //amount of pixels
//allocate memory on gpu
cudaMalloc(d_RGBImage, sizeof(uchar4) * num_pix); //bites of 1 uchar4 times # of pixels gives number of bites necessary for array
CudaCheckError();
cudaMalloc(d_greyImage, sizeof(uchar) * num_pix);//bites of uchar times # pixels gives number of bites necessary for array
CudaCheckError();
cudaMemset(*d_greyImage, 0, sizeof(uchar) * num_pix);
CudaCheckError();
//copy array into allocated space
cudaMemcpy(*d_RGBImage, *h_RGBImage, sizeof(uchar4)*num_pix, cudaMemcpyHostToDevice);
CudaCheckError();
d_rgbImg = *d_RGBImage;
d_greyImg = *d_greyImage;
}
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols){
const int BS = 16;
const dim3 blockSize(BS, BS);
const dim3 gridSize((num_Cols / BS) + 1, (num_Rows / BS) + 1);
rgb_2_grey <<<gridSize, blockSize>>>(d_greyImage, d_RGBImage, num_Rows, num_Cols);
cudaDeviceSynchronize(); CudaCheckError();
}
void Save_Img(){
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols);
cudaMemcpy(img_Grey.ptr<uchar>(0), d_greyImg, sizeof(uchar)*num_pix, cudaMemcpyDeviceToHost);
CudaCheckError();
imwrite("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581GR.JPG", img_Grey);
cudaFree(d_rgbImg);
cudaFree(d_greyImg);
}
EDIT: I realized that the local var in my main is the same name as the global var, I have edited the code here, now I get the error from visual studio that the
variable d_rgbIme is being used without being initialized
when I have already initialized it above. If I set them equal to zero I get a CUDA error saying
an illegal memory access was encountered
I tried running cuda-memcheck, but then I get the error that i could not run the file...
I have found the error thanks to one of the comments by Robert Crovella, he has been very helpful with this! it is in my kernel the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {
I was working on the same problem in JCUDA. See if you can use any part of this solution:
//Read Height and Width of image in Height & Width variables
int Width = image.getWidth();
int Height = image.getHeight();
int N = Height * Width;
int[] grayScale = new int[N];
//Allocate separate arrays to store Alpha, Red, Green and
//Blue values for every pixel
int[] redHost = new int[N];
int[] greenHost = new int[N];
int[] blueHost = new int[N];
int[] alphaHost = new int[N];
for(int i=0; i<Height; i++)
{
for(int j=0; j<Width; j++)
{
int pixel = image.getRGB(j, i);
//Read the ARGB data
alphaHost[i*Width+j] = (pixel >> 24) & 0xff;
redHost[i*Width+j] = (pixel >> 16) & 0xff;
greenHost[i*Width+j] = (pixel >> 8) & 0xff;
blueHost[i*Width+j] = (pixel) & 0xff;
}
}
/* Following are the CUDA Kernel parameters*/
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{N}), //Total size of each array W * H
Pointer.to(redDev), // Pointer to redArray on device
Pointer.to(greenDev), // Pointer to greenArray on device
Pointer.to(blueDev), // Pointer to blueArray on device
Pointer.to(Output)); //Pointer to output array
/*Following is my RGBToGrayScale.cu..i.e. CUDA Kernel */
__global__ void RGBtoGrayScale(int N, int *red, int *green, int *blue, int *Output)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id<N)
{
Output[id] = (red[id]*0.2989) + (green[id]*0.587) + (blue[id]*0.114);
}
}
/* Get the output data back to Host memory */
cuMemcpyDtoH(Pointer.to(grayScale), Output, N * Sizeof.INT);
/* Write the image with the new RBG values*/
BufferedImage im = new BufferedImage(Width,Height,BufferedImage.TYPE_BYTE_GRAY);
WritableRaster raster = im.getRaster();
for(int i=0;i<Height;i++)
{
for(int j=0;j<Width;j++)
{
raster.setSample(j, i, 0, grayScale[i*Width+j]);
}
}
try
{
ImageIO.write(im,"JPEG",new File("glpattern.jpeg"));
} catch (IOException e)
{
e.printStackTrace();
}
I am writing a c++ cuda program. I have a very simple struct:
struct A
{
int size;
float* tab;
}
and a kernel:
__global__ void Kernel(A* res, int n,args*) //
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n)
{
res[i] = AGenerator::Generate(args[i]);
}
}
Where AGenerator::Generate creates the A object and fills the tab array. What happens here is that when the results are send to the host the tab pointer is invalid. To prevent this I will need to apply the Rule of three to this class. Since there would be many classes like this I would like to avoid writing too many additional code.
I made the research and found that there is a thrust library which has device_vector and host_vector which will probably help with my problem but the thing is that I want the struct A and similar structs to be callable from both host and device so the device and host_vector are not good for this purpose. Is there any struct I can use to approach this?
EDIT
I found that passing the struct by value will help me but since performance is quite important it doesn't seem like a good solution.
Here is a rough outline of what I had in mind for a custom allocator and pool that would hide some of the mechanics of using a class both on the host and the device.
I don't consider it to be a paragon of programming excellence. It is merely intended to be a rough outline of the steps that I think would be involved. I'm sure there are many bugs. I didn't include it, but I think you would want a public method that would get the size as well.
#include <iostream>
#include <assert.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef float mytype;
__device__ unsigned int pool_allocated = 0;
__device__ unsigned int pool_size = 0;
__device__ mytype *pool = 0;
__device__ unsigned int pool_reserve(size_t size){
assert((pool_allocated+size) < pool_size);
unsigned int offset = atomicAdd(&pool_allocated, size);
assert (offset < pool_size);
return offset;
}
__host__ void init_pool(size_t psize){
mytype *temp;
unsigned int my_size = psize;
cudaMalloc((void **)&temp, psize*sizeof(mytype));
cudaCheckErrors("init pool cudaMalloc fail");
cudaMemcpyToSymbol(pool, &temp, sizeof(mytype *));
cudaCheckErrors("init pool cudaMemcpyToSymbol 1 fail");
cudaMemcpyToSymbol(pool_size, &my_size, sizeof(unsigned int));
cudaCheckErrors("init pool cudaMemcpyToSymbol 2 fail");
}
class A{
public:
mytype *data;
__host__ __device__ void pool_allocate_and_copy() {
assert(d_data == 0);
assert(size != 0);
#ifdef __CUDA_ARCH__
unsigned int offset = pool_reserve(size);
d_data = pool + offset;
memcpy(d_data, data, size*sizeof(mytype));
#else
cudaMalloc((void **)&d_data, size*sizeof(mytype));
cudaCheckErrors("pool_allocate_and_copy cudaMalloc fail");
cudaMemcpy(d_data, data, size*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("pool_allocate_and_copy cudaMemcpy fail");
#endif /* __CUDA_ARCH__ */
}
__host__ __device__ void update(){
#ifdef __CUDA_ARCH__
assert(data != 0);
data = d_data;
assert(data != 0);
#else
if (h_data == 0) h_data = (mytype *)malloc(size*sizeof(mytype));
data = h_data;
assert(data != 0);
cudaMemcpy(data, d_data, size*sizeof(mytype), cudaMemcpyDeviceToHost);
cudaCheckErrors("update cudaMempcy fail");
#endif
}
__host__ __device__ void allocate(size_t asize) {
assert(data == 0);
data = (mytype *)malloc(asize*sizeof(mytype));
assert(data != 0);
#ifndef __CUDA_ARCH__
h_data = data;
#endif
size = asize;
}
__host__ __device__ void copyobj(A *obj){
assert(obj != 0);
#ifdef __CUDA_ARCH__
memcpy(this, obj, sizeof(A));
#else
cudaMemcpy(this, obj, sizeof(A), cudaMemcpyDefault);
cudaCheckErrors("copy cudaMempcy fail");
#endif
this->update();
}
__host__ __device__ A();
private:
unsigned int size;
mytype *d_data;
mytype *h_data;
};
__host__ __device__ A::A(){
data = 0;
d_data = 0;
h_data = 0;
size = 0;
}
__global__ void mykernel(A obj, A *res){
A mylocal;
mylocal.copyobj(&obj);
A mylocal2;
mylocal2.allocate(24);
mylocal2.data[0]=45;
mylocal2.pool_allocate_and_copy();
res->copyobj(&mylocal2);
printf("kernel data %f\n", mylocal.data[0]);
}
int main(){
A my_obj;
A *d_result, h_result;
my_obj.allocate(32);
my_obj.data[0] = 12;
init_pool(1048576);
my_obj.pool_allocate_and_copy();
cudaMalloc((void **)&d_result, sizeof(A));
cudaCheckErrors("main cudaMalloc fail");
mykernel<<<1,1>>>(my_obj, d_result);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
h_result.copyobj(d_result);
printf("host data %f\n", h_result.data[0]);
return 0;
}
I am pretty sure that the direction of the question and related comments are ill fated. Device memory and host memory are totally different things, both conceptually and physically. Pointers just don't carry over!
Please go back to step 1 and learn about copying values between host and device by reading the reference manual and the progamming guide for more details.
To get a more precise answer to your question please show how those A structs are allocated on the device including the allocation of those tab floats. Also please show how AGenerator::Generate somehow manipulates those tabs in a meaningful way. My best bet is that you are working with unallocated device memory here and that you should probably use a preallocated array of floats and indizes into the array instead of device pointers here. Those indices would then carry over to the host gracefully.
To have more comfort handling memory on host and device, I created the following class.
In theory it should manage copying from host to device and vice versa.
struct CudaArray
{
int* memoryHost;
int* memoryDevice;
int size;
CudaArray(int datasize) // creates array on host and allocates memory on device with cudaMalloc
{
size = datasize;
memoryHost = new int[size];
for (int i = 0; i < size; i++)
{
memoryHost[i] = 0;
}
cudaMalloc((void**)&memoryDevice, sizeof(int) * size);
}
~CudaArray() // frees memory on device and host
{
delete[] memoryHost;
cudaFree(memoryDevice);
}
void Upload() // upload data from host to device
{
cudaMemcpy(memoryDevice, memoryHost, sizeof(int) * size, cudaMemcpyHostToDevice);
}
void Download() // download data from device to host
{
cudaMemcpy(memoryHost, memoryDevice, sizeof(int) * size, cudaMemcpyDeviceToHost);
}
void Insert(int* src); // copy from src to memoryHost
void Retrieve(int* dest); // copy from memoryHost to dest
};
Internally, everything is alright. But when I use an object of my CudaArray, there are problems with the pointers:
CudaArray cuda_ar(1000);
kernel <<<blocks, threads_per_block>>> (cuda_ar.memoryDevice, cuda_ar.size);
By using the debugger, I managed to read the pointer memoryDevice. Inside the struct (e.g. when stepping through Upload(), it is 0x01000000. But at the place where the kernel is executed, memoryDevice points to 0x00000400 (numbers are examples).
I know that memoryDevice is a pointer pointing to memory on the device.
Is there a way to explain this behavior and give a solution to my problem?
When I run the following program
#include <cstdio>
struct CudaArray
{
int* memoryHost;
int* memoryDevice;
int size;
CudaArray(int datasize) // creates array on host and allocates memory on device with cudaMalloc
{
size = datasize;
memoryHost = new int[size];
for (int i = 0; i < size; i++)
{
memoryHost[i] = 0;
}
cudaMalloc((void**)&memoryDevice, sizeof(int) * size);
}
~CudaArray() // frees memory on device and host
{
delete[] memoryHost;
cudaFree(memoryDevice);
}
void Upload() // upload data from host to device
{
cudaMemcpy(memoryDevice, memoryHost, sizeof(int) * size, cudaMemcpyHostToDevice);
}
void Download() // download data from device to host
{
cudaMemcpy(memoryHost, memoryDevice, sizeof(int) * size, cudaMemcpyDeviceToHost);
}
};
__global__ void kernel(int *ptr, int n)
{
printf("On Device : %p %d\n", ptr, n);
}
int main(void)
{
CudaArray cuda_ar(1000);
printf("On Host : %p %d\n", cuda_ar.memoryDevice, cuda_ar.size);
kernel<<<1, 1>>>(cuda_ar.memoryDevice, cuda_ar.size);
return 0;
}
I get
On Host : 0x200400000 1000
On Device : 0x200400000 1000
You should make sure your CUDA runtime calls like cudaMalloc, cudaMemcpy and kernel launches are all returning successfully. You can try this code after all your CUDA runtime calls to verify that :
if (cudaSuccess != cudaGetLastError())
printf( "Error!\n" );