i'm fairly new to cuda and i want to use the concept of constant memory, but i'm getting an illegal memory access was encountered when running the code.
My kernel looks like this
__global__ void nonceKernel(int inLen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int nonceLen, int* finishedFlag, int *mutex, int size)
{
if(!*finishedFlag) return;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
bool found = true;
BYTE tempNonce[2];
BYTE tempSha1[20];
tempNonce[1]=((tid+size) >> 8) & 0x000000FF;
tempNonce[0]=(tid+size) & 0x000000FF;
CUDA_SHA1 ctx;
cuda_sha1_init(&ctx); //init context
cuda_sha1_update(&ctx, device_input_data, inLen); // add input buffer
cuda_sha1_update(&ctx, tempNonce, nonceLen); //add nonce
cuda_sha1_final(&ctx, tempSha1); //compute sha1
for(int i=0; i<shaTermLength; i++) {
if(tempSha1[19 - i] != device_sha1_term[shaTermLength - 1 - i])
found=false;
}
if(found == true) {
lock(mutex);
memcpy(outSha1, tempSha1, 20); //20 bytes for sha1
memcpy(outNonce, tempNonce, nonceLen); //2 bytes for nonce
*finishedFlag = 0;
unlock(mutex);
}
}
My intermediary function like this:
cudaError_t nonceWithCuda(int intlen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int *finishFlag, int nonceLen, int size)
{
BYTE *gpuSha1Out;
BYTE *gpuNonceOut;
int *gpuFinishedFlag;
cudaError_t cudaStatus;
int *mutex;
cudaStatus= cudaSetDevice(0);
if(cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a cuda gpu installed?");
goto Error;
}
....
cudaStatus=cudaMalloc((void**)&gpuFinishedFlag, 1*sizeof(int));
if(cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc for gpuFinishedFlag failed");
goto Error;
}
cudaStatus=cudaMemcpy(gpuFinishedFlag, finishFlag, sizeof(int), cudaMemcpyHostToDevice);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudamemcpy 0 to gpuFinishedFlag failed!");
goto Error;
}
....
while(*finishFlag) {
nonceKernel<<<128, 1024>>>(intlen, shaTermLength, gpuSha1Out, gpuNonceOut, nonceLen, gpuFinishedFlag, mutex, size);
size++;
cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
goto Error;
}
}
......
Error:
cudaFree(gpuSha1Out);
cudaFree(gpuNonceOut);
cudaFree(gpuFinishedFlag);
return cudaStatus;
}
Also i'm declaring the constant variables as such:
__constant__ BYTE* device_input_data;
__constant__ BYTE* device_sha1_term;
where BYTE is defined as an unsigned char typedef unsigned char BYTE;.
And finally the main function.
int main(int argc, char** argv) {
size_t input_block_size=5; //bytes
int nonceLen=2;
int finishedFlag=1;
...
BYTE* inputData = (BYTE*) malloc(input_block_size * sizeof(BYTE)); //input byte buffer
inputData[0]=0x23; //#
inputData[1]=0x30; //0
inputData[2]=0x42; //B
inputData[3]=0x69; //i
inputData[4]=0x61; //a
BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE));
shaTerm[0]=0x7E;
shaTerm[1]=0x46;
int shaTermLength = sizeof(shaTerm)/sizeof(shaTerm[0]);//ouput sha buffer
cudaStatus=cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
cudaStatus=cudaMemcpyToSymbol(device_sha1_term, shaTerm, shaTermLength * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
...
nonceWithCuda(input_block_size, shaTermLength, outputSha1Buffer, outputNonceBuffer, &finishedFlag, 2, size);
The error occurs in the while from the nonceWithCuda function, when i'm copying back the value from the gpu to host, i mean this piece of code:
cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
goto Error;
}
The output:
$ ./nonce_v3
MemcpyToSymbol: no error
MemcpyToSymbol: no error
cudaMemcpy from gpuFinishedFlag failed, with code: an illegal memory access was encountered!
Note that the same code works fine when i'm not using constant for those two variables and cannot understand why. Can someone point me in the right direction?
Thank you for your help!!!
I am assuming that you want to store the 5 elements of inputData in constant memory.
The line __constant__ BYTE* device_input_data; will reserve constant memory to store a single pointer. It will not reserve constant memory for 5 BYTE values.
Then, with
cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
the memory adress to which this pointer points to is set to the elements of inputData, i.e. after transfer, the pointer could have the value 0x2330426961000000 .
Most likely, this is not a valid address to device memory. This causes the observed memory error when trying to access this memory location in your kernel.
To fix this, you need to declare the constant memory as a BYTE array of size 5.
__constant__ BYTE device_input_data[5];
Yesterday I asked you guys a question and realize that reading and writing a chunk of blocks is more efficient instead of just one block at once. So now i try to read and write a chunk of blocks but the segmentation error is occurred for me.
#define POSITIONAL_TOKEN_CHUNK_SIZE 1000000
#define POSITIONAL_TOKEN_WORD_LENGTH 15
#define POSITIONAL_TOKEN_ID_LENGTH 4
#define POSITIONAL_TOKEN_LENGTH (POSITIONAL_TOKEN_WORD_LENGTH + POSITIONAL_TOKEN_ID_LENGTH)
struct PTBlk {
char w[POSITIONAL_TOKEN_WORD_LENGTH + 1];
int id;
};
int transform_to_bin_a(FILE * fin, int fd) {
int n = 0;
int m = 0;
char buf[TRANSFORM_TO_BIN_BUF_LENGTH] = {};
struct PTBlk * blks = (struct PTBlk *)malloc(POSITIONAL_TOKEN_CHUNK_SIZE * POSITIONAL_TOKEN_LENGTH);
if (blks == NULL) {
puts("no memory being allocated");
return 0;
}
memset(buf, 0, TRANSFORM_TO_BIN_BUF_LENGTH);
printf("total of size being allocated is %d bytes\n", POSITIONAL_TOKEN_CHUNK_SIZE * POSITIONAL_TOKEN_LENGTH);
while (fgets(buf, TRANSFORM_TO_BIN_BUF_LENGTH, fin) != NULL) {
++n;
sscanf(buf, "%s %d", blks[m].w, &blks[m].id);
++m;
if (m >= POSITIONAL_TOKEN_CHUNK_SIZE) { // error !!
write(fd, (void *)blks, POSITIONAL_TOKEN_CHUNK_SIZE * POSITIONAL_TOKEN_LENGTH);
m = 0;
}
memset(buf, 0, TRANSFORM_TO_BIN_BUF_LENGTH);
}
if (m > 0) {
printf("n:%d m:%d\n", n, m);
printf("m:%d\n", m);
write(fd, (void *)blks, m * POSITIONAL_TOKEN_LENGTH);
}
printf("n:%d\n", n);
lseek(fd, 0, SEEK_SET);
write(fd, (void *)&n, POSITIONAL_TOKEN_SEARCH_BEGIN);
free(blks);
return 1;
}
I guess too large POSITIONAL_TOKEN_CHUNK_SIZE is one of problems. But i don't understand why it is reason for segmentation fault. Because in the code i try to allocate only 20,000,000 bytes on heap. Sometimes i coded a long int array as global variable such as 'int arr[20000000];'. But it doesn't matter.
What do i misunderstand in the code?
So I wrote a Cuda version of an OpenCL program I wrote. The OpenCL versions works, meanwhile the Cuda version doesn't. Now converting OpenCL code to Cuda code isn't 1-to-1, but I'm confused as to why the cuda version wouldn't work after all I did base my code around an cuda example when translating it over.
I am getting an illegal memory access was encountered (error code # = 77) during a cudaMemcpy(... cudaMemcpyDeviceToHost); (line 227) Although it's during a memcpy the problem appears to be an illegal memory access during the kernel run. Here is an example of what I get with cuda-memcheck checking the program:
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (53,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
=========
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (52,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
I don't understand the differences between Cuda and OpenCL well enough to know what I am doing wrong. I tried mucking around with MoveoutAndStackCuda<<<grid, threads>>> and change it to something like MoveoutAndStackCuda<<<grid, threads, (localGroupSize * sizeof(float))>>> but no luck. I've also tried commenting out parts of my kernel the problem appears to occur even when I have commented out most of my kernel.
Hopefully this is verifiable for you, but there is a chance that it isn't since it could depend on my hardware. I am running a Quadro M5000 on CentOS 6.8 (Final).
I tried to cut out as much stuff that is useless for this problem as possible. I would also provide the working OpenCL version of this MCV example however I am out of text. I recommend debugging using the arguments 100 50 40 for now, because I also have a problem of spawning too many global threads that I will tackle after this one is solved.
Here is the Minimal, Complete, and Verifiable example:
#include <math.h>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <cuda.h>
#include <assert.h>
#include <unistd.h>
const bool _VERBOSE = true;
const bool _PRINT_ALLOC_SIZE = true;
const bool _PRINT_RUN_TIME = true;
const int MIN_LOCAL_SIZE = 8;
__global__ void MoveoutAndStackCuda(float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], int startIndices[], int exitIndices[],
int sampleShift[], const unsigned int samplesPerT, const unsigned int readIns,
const unsigned int nOuts) {
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
float stackF = 0.0;
float powerF = 0.0;
unsigned int readIndex = (globalId % samplesPerT);
unsigned int jobNum = (globalId / samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
if ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd))) {
float value = prestackTraces[readIndex + (x * samplesPerT) + sampleShift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[globalId] = stackF;
powerTracesOut[globalId] = powerF;
}
/*
* Single threaded version that somewhat mimics what is executed in the OpenCL code as close as possible.
*/
void MoveoutAndStackSingleThread(const float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], const int startIndices[], const int exitIndices[], const int shift[],
const unsigned int samplesPerT, const unsigned int readIns, const unsigned int nOuts,
const unsigned int jobNum, const unsigned int readIndex) {
float stackF = 0.0f;
float powerF = 0.0f;
int outputIndex = readIndex + (jobNum * samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
bool shouldRead = ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd)));
if (shouldRead) {
float value = prestackTraces[readIndex + (x * samplesPerT) + shift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[outputIndex] = stackF;
powerTracesOut[outputIndex] = powerF;
}
/**
* Used to keep track of how long it takes to execute this.
*/;
double GetTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + (1e-6 * tv.tv_usec);
}
/*
* Print message to stderr and exit.
*/
void Fatal(const char* format, ...) {
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(1);
}
/*
* We have an error, which one? Also print out where this occured.
*/
void CudaWhichError(cudaError_t errorCode, char* location) {
if (errorCode == cudaSuccess) {
// This shouldn't happen. It should be made sure that errorCode != cudaSuccess before calling this function.
printf("Reported error not actually an error... (cudaSuccess) %s\n", location);
return;
}
Fatal("%s %s (error code # = %d)\n", location, cudaGetErrorString(errorCode), errorCode);
}
/*
* Check for errors.
*/
void CheckForErrors(char* location) {
cudaError_t errorCode = cudaGetLastError();
if (errorCode != cudaSuccess) {
CudaWhichError(errorCode, location);
}
}
/*
* Finds and initializes the fastest graphics card for CUDA use.
*
* Returns the max number of threads per block for the selected device.
*/
int GetFastestDevice() {
// Get the number of CUDA devices
int num;
if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
if (num<1) Fatal("No CUDA devices found\n");
// Props
cudaDeviceProp currentDevice;
int fastestGflops = -1;
cudaDeviceProp bestDevice;
int fastestDeviceID = -1;
// Get fastest device
for (int dev=0;dev<num;dev++) {
if (cudaGetDeviceProperties(¤tDevice, dev)) {
Fatal("Error getting device %d properties\n", dev);
}
int Gflops = currentDevice.multiProcessorCount * currentDevice.clockRate;
if (_VERBOSE) {
printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",
dev,
currentDevice.name,
(1e-6 * Gflops),
currentDevice.multiProcessorCount,
currentDevice.maxThreadsPerBlock);
}
if (Gflops > fastestGflops) {
fastestGflops = Gflops;
fastestDeviceID = dev;
bestDevice = currentDevice;
}
}
// Check to see if we get a device
if (fastestDeviceID == -1) {
Fatal("bestDevice == NULL");
}
// Print and set device
if (cudaGetDeviceProperties(&bestDevice, fastestDeviceID)) {
Fatal("Error getting device %d properties\n", fastestDeviceID);
}
cudaSetDevice(fastestDeviceID);
if (_VERBOSE) {
printf("Fastest CUDA Device %d: %s\n", fastestDeviceID, bestDevice.name);
printf("bestDevice.maxThreadsPerBlock: %d\n", bestDevice.maxThreadsPerBlock);
}
CheckForErrors((char*)("GetFastestDevice()"));
// Return max thread count
return bestDevice.maxThreadsPerBlock;
}
/*
* Allocate memory on the GPU, also copy the data over.
*
* CudaPtr variables point to the arrays on the GPU side.
* Host variables point to the arrays on the CPU side.
* Sizes variables determine sizes of the arrays.
*/
void AllocateAndCopyCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr, float** powerOutCudaPtr,
int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
float *prestackHost, int *startIndicesHost, int *endIndicesHost, int *sampleShiftHost,
size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
if (_PRINT_ALLOC_SIZE) {
size_t totalMemoryAllocated = (prestackSizes + (outputSizes * 2) + (inputSizes * 3));
printf(" Total memory allocated for run: %zu\n", totalMemoryAllocated);
printf(" Prestack array size: %zu\n", prestackSizes);
printf(" Output array sizes: %zu\n", outputSizes);
printf(" EtartIndices, EndIndices, & SampleShift array size: %zu\n", inputSizes);
}
cudaError_t cudaCode;
// Allocate memory on the graphics card
cudaCode = cudaMalloc((void**)prestackCudaPtr, prestackSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for prestack array\n")));
cudaCode = cudaMalloc((void**)stackOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for stackOut array\n")));
cudaCode = cudaMalloc((void**)powerOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for powerOut array\n")));
cudaCode = cudaMalloc((void**)startIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for startIndices array\n")));
cudaCode = cudaMalloc((void**)endIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for endIndices array\n")));
cudaCode = cudaMalloc((void**)sampleShiftCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for sampleShift array\n")));
// Copy data over (for the arrays the need it)
cudaCode = cudaMemcpy(*prestackCudaPtr, prestackHost, prestackSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy prestack data over to device.\n")));
cudaCode = cudaMemcpy(*startIndicesCudaPtr, startIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy startIndices data over to device.\n")));
cudaCode = cudaMemcpy(*endIndicesCudaPtr, endIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy endIndices data over to device.\n")));
cudaCode = cudaMemcpy(*sampleShiftCudaPtr, sampleShiftHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy sampleSgift data over to device.\n")));
}
/*
* Enqueue the kernels to be ran on the gpu. Pointers that are passed in are pointing to
* device side memory.
*/
void RunCudaMoveAndStackJobs(float** prestackTracesCudaPtr, float** stackTracesOutCudaPtr,
float** powerTracesOutCudaPtr, int** startIndicesCudaPtr, int** exitIndicesCudaPtr,
int** sampleShiftCudaPtr, unsigned int samplesPerT, unsigned int readIns,
unsigned int nOuts, size_t localGroupSize) {
// Set the size
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
if (*prestackTracesCudaPtr == NULL) printf("*prestackTracesCudaPtr == NULL\n");
// Execute the kernel
MoveoutAndStackCuda<<<grid, threads>>>(*prestackTracesCudaPtr,
*stackTracesOutCudaPtr, *powerTracesOutCudaPtr, *startIndicesCudaPtr, *exitIndicesCudaPtr,
*sampleShiftCudaPtr, samplesPerT, readIns, nOuts);
CheckForErrors((char*)("RunCudaMoveAndStackJobs()"));
}
/*
* Free memory on the GPU device as well as free the remaining OpenCL objects for the host side.
*/
void RetrieveAndCleanupCudaDeviceMemory(float **prestackCudaPtr, float **stackOutCudaPtr,
float **powerOutCudaPtr, int **startIndicesCudaPtr, int **endIndicesCudaPtr, int **sampleShiftCudaPtr,
float *stackOutHost, float *powerOutHost, size_t outputSizes) {
// Copy C from device to host
cudaError_t cudaCode;
cudaCode = cudaMemcpy(stackOutHost, *stackOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy stackOut data back to host.\n"));
cudaCode = cudaMemcpy(powerOutHost, *powerOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy powerOut data back to host.\n"));
// Free device memory (TODO: reverse order)
cudaFree(*prestackCudaPtr);
cudaFree(*stackOutCudaPtr);
cudaFree(*powerOutCudaPtr);
cudaFree(*startIndicesCudaPtr);
cudaFree(*endIndicesCudaPtr);
cudaFree(*sampleShiftCudaPtr);
}
/*
* Runs the program given the arrays passed in the parameters.
*
* Return the time it took to run the program, if desired.
*/
double CommenceCUDAMoveoutAndStack(float* prestackTraces, float* stackOut, float* powerOut,
int* startIndices, int* endIndices, int* sampleShift,
unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
size_t localGroupSize, size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
double returnVal = 0.0;
if (_PRINT_RUN_TIME) {
printf("CommenceCUDAMoveoutAndStack:\n samplesPerTrace=%u nTracesIn=%u nTracesOut=%u\n"
" localGroupSize=%zu\n",
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
}
// Init CUDA
int maxThreadsPerBlock = GetFastestDevice();
// Check the desirec local size
if (((int)localGroupSize) > maxThreadsPerBlock) {
Fatal("Error: local group (%zu) size exceeds the max local group size of the selected graphics card (%d).\n",
localGroupSize, maxThreadsPerBlock);
} else if (((int)localGroupSize) < MIN_LOCAL_SIZE) {
Fatal("Error: local group (%zu) size is less than MIN_LOCAL_SIZE (%d).\n",
localGroupSize, MIN_LOCAL_SIZE);
}
// Allocate memory on the device. These pointers will point to memory on the GPU.
double preInitTime = GetTime();
float* prestackCudaPtr = NULL;
float* stackOutCudaPtr = NULL;
float* powerOutCudaPtr = NULL;
int* startIndicesCudaPtr = NULL;
int* endIndicesCudaPtr = NULL;
int* sampleShiftCudaPtr = NULL;
AllocateAndCopyCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
prestackTraces, startIndices, endIndices, sampleShift,
prestackSizes, outputSizes, inputSizes);
// Run the program
RunCudaMoveAndStackJobs(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
// Retrieve the data and clean up graphics card memory
RetrieveAndCleanupCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
stackOut, powerOut,
(size_t)(nTracesOut * samplesPerTrace * sizeof(float)));
// Print the run time (if requested)
if (_PRINT_RUN_TIME) {
returnVal = (GetTime() - preInitTime);
if (_PRINT_RUN_TIME) {
printf(" Run Time: %f secs\n", returnVal);
}
}
return returnVal;
}
// Returns a float 0.0 - 1.0, inclusive
float RandomFloat() {
return static_cast <float> (rand()) / static_cast <float>(RAND_MAX);
}
// Fill in the prestack traces array
void FillFloatArrayRandomly(float* fillArray, unsigned int length) {
for (unsigned int r = 0; r < length; r++) {
fillArray[r] = RandomFloat() * 1000.0f;
}
}
// Fill the start and end arrays randomly
void FillStartEndShiftArraysRandomly(int* startArray, int* nSampsArray, int* shiftArray,
int arrayLength, int rangeOfStartEndMax, int samplesPerT) {
for (int r = 0; r < arrayLength; r++) {
startArray[r] = (rand() % rangeOfStartEndMax);
int endIndex = samplesPerT - (rand() % rangeOfStartEndMax);
nSampsArray[r] = endIndex - startArray[r];
int range = startArray[r] + (samplesPerT - endIndex);
int ra = rand();
if (range != 0) shiftArray[r] = (ra % range) - startArray[r];
else shiftArray[r] = 0;
// Check to make sure we won't go out of bounds
assert((startArray[r] + nSampsArray[r]) <= samplesPerT);
assert(endIndex > startArray[r]);
assert(startArray[r] >= 0);
assert(nSampsArray[r] >= 0);
assert((startArray[r] + shiftArray[r]) >= 0);
assert((nSampsArray[r] + shiftArray[r]) <= samplesPerT);
}
}
// Create arrays for the OpenCL program to use
double GenerateArraysAndRun(unsigned int samplesPerTrace,
unsigned int nTracesIn, unsigned int nTracesOut, size_t localGroupS) {
srand(time(NULL)); // Set random seed to current time
double returnVal;
// Create the arrays to be used in the program
float* prestackTraces1D;
float* stackOut1D;
float* powerOut1D;
int* startIndices1D;
int* endIndices1D;
int* shift1D;
// Get sizes or arrays
size_t prestackSizes = samplesPerTrace * nTracesIn * sizeof(float);
size_t outputSizes = nTracesOut * samplesPerTrace * sizeof(float);
size_t inputSizes = nTracesOut * nTracesIn * sizeof(int);
// Fill in the arrays
prestackTraces1D = (float*)malloc(prestackSizes);
stackOut1D = (float*)malloc(outputSizes);
powerOut1D = (float*)malloc(outputSizes);
startIndices1D = (int*)malloc(inputSizes);
endIndices1D = (int*)malloc(inputSizes);
shift1D = (int*)malloc(inputSizes);
FillFloatArrayRandomly(prestackTraces1D, samplesPerTrace * nTracesIn);
FillStartEndShiftArraysRandomly(startIndices1D, endIndices1D, shift1D,
(int)(nTracesOut * nTracesIn), (int)(((float)samplesPerTrace) * 0.1), (int)samplesPerTrace);
// Check if arrays were created
if (prestackTraces1D == NULL) Fatal("GenerateArraysAndRun(): prestackTraces1D == NULL\n");
if (stackOut1D == NULL) Fatal("GenerateArraysAndRun(): stackOut1D == NULL\n");
if (powerOut1D == NULL) Fatal("GenerateArraysAndRun(): powerOut1D == NULL\n");
if (startIndices1D == NULL) Fatal("GenerateArraysAndRun(): startIndices1D == NULL\n");
if (endIndices1D == NULL) Fatal("GenerateArraysAndRun(): endIndices1D == NULL\n");
if (shift1D == NULL) Fatal("GenerateArraysAndRun(): shift1D == NULL\n");
// Run the program
returnVal = CommenceCUDAMoveoutAndStack(prestackTraces1D, stackOut1D, powerOut1D, startIndices1D,
endIndices1D, shift1D, samplesPerTrace, nTracesIn, nTracesOut,
localGroupS, prestackSizes, outputSizes, inputSizes);
// Finished: free the memory on CPU side in reverse order
free(shift1D);
free(endIndices1D);
free(startIndices1D);
free(powerOut1D);
free(stackOut1D);
free(prestackTraces1D);
// Return the time that the program gave us
return returnVal;
}
// Main
int main(int argc, char* argv[]) {
// TODO: Errors here
if (argc != 5)
Fatal("Incorrect # of Arguments (5 Needed) <samplesPerTrace> <nTracesIn> <nTracesOut> <LocalGroupSize>\n"
" argc = %d\n", argc);
unsigned int samplesPerTrace = atoi(argv[1]);
unsigned int nTracesIn = atoi(argv[2]);
unsigned int nTracesOut = atoi(argv[3]);
size_t localGroupS = atoi(argv[4]);
GenerateArraysAndRun(samplesPerTrace, nTracesIn, nTracesOut, localGroupS);
return 0;
}
The problem was that I was spawning too many blocks. In OpenCL, you tell the kernel the total number of threads and how many threads are in each block, and the total # of blocks is determined from that. Meanwhile in Cuda, you tell the kernel how many blocks there are and how many threads per block there is, and the total # of threads is determined by those. So:
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
Should be:
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts) / localGroupSize);
New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
typedef struct password_t{
char word[56];
size_t length;
} password;
typedef struct libEntry_t{
uint8_t digest[16];
password pwd;
} libEntry;
// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
// numPwds - the number of passwords for which to generate hashes
// pwds - the list of passwords to hash
// library - the array in which to store the unhashed/hashed password library
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
// __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t hashed[16];
if (index < numPwds) {
cuda_md5(&pwds[index], hashed);
for (int j = 0; j < 16; j++) {
library[index].digest[j] = hashed[j];
}
library[index].pwd = pwds[index];
}
}
int crack_password (uint8_t* classified)
{
int count = 10;
unsigned int mem_size = sizeof(password) * count;
password *h_pwds = (password*) malloc(mem_size);
ifstream inFile("passwords.txt");
if (!inFile) {
cerr << "File passwords.txt not found." << endl;
return -1;
}
string line;
int i;
while (getline(inFile, line)) {
if (line.empty()) continue;
memcpy(h_pwds[i].word,line.c_str(),line.size());
h_pwds[i].length = line.size();
cout << "Password: " << h_pwds[i].word << "\n";
cout << "Length: " << h_pwds[i].length << "\n";
i++;
}
inFile.close();
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
int h_numPwds = i;
cout << "INT NUMPWDS: " << h_numPwds << "\n";
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
/*unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
return 0;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
fprintf(stderr, "usage: ./prog password\n");
return 1;
}
crack_password((uint8_t*) argv[1]);
cout << "Hack Password: " << argv[1] << "\n";
return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!
How I have been checking for return status:
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT2: I cleaned up everything and still can't figure it out. By having CUDA_SAFE_CALL on the following line CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get segmentation fault even when every other memory allocation command is commented out.
For someone wondering what went wrong, I was able to fix it. I am not exactly sure what exactly was wrong but I had improper memory allocations at some places and in other cases I didn't even needed to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.
I would just like to start by saying I am new to CUDA and OpenGL. I get the above runtime error when it runs glutMainLoop() and the mainloop runs glDrawPixels(). I have looked everywhere to figure out why this isn't working any help would be appreciated.
#include<stdio.h>
#include<stdlib.h>
#include<gl/glut.h>
#include<stb_image.c>
#include<cuda.h>
#include<cuda_gl_interop.h>
#include<gl/glext.h>
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
system("PAUSE");
//exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define DIM 512
#define GET_PROC_ADDRESS( str ) wglGetProcAddress( str )
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;
GLuint bufferObj; /* how OpenGL calls the buffer */
cudaGraphicsResource *resource; /* how CUDA calls this same buffer */
uchar4* devPtr; /* friendly pointer to the handles above */
const int size = DIM * DIM;
struct cuComplex
{
float r;
float i;
__device__ cuComplex(float a, float b) : r(a), i(b){ }
__device__ float magnitude2(void)
{
return r * r + i * i;
}
__device__ cuComplex operator *(const cuComplex & a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex & a)
{
return cuComplex(r+a.r, i+a.i);
}
};
__device__ int julia (int x, int y)
{
const float scale = 1.5;
float jx = scale *(float)(DIM/2 - x)/(DIM/2);
float jy = scale *(float)(DIM/2 - y)/(DIM/2);
cuComplex c(-0.8, 0.156);
cuComplex a(jx,jy);
int i = 0;
for(i=0; i<200; i++)
{
a = a * a + c;
if(a.magnitude2() > 1000)
{
return 0;
}
}
return 1;
}
__global__ void kernel(uchar4 *ptr)
{
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y * gridDim.x * blockDim.x;
int juliaValue = julia(x,y);
ptr[offset].x = 255 * juliaValue;
ptr[offset].y = 0;
ptr[offset].z = 0;
ptr[offset].w = 255;
}
static void Draw( void ) {
glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE,0); //ERROR HAPPENS HERE
glutSwapBuffers();
}
void display_and_exit() {
cudaDeviceProp prop;
int dev;
memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 0;
HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );
/*
* Interoperability with OpenGL requires that the CUDA device
* be specified by cudaGLSetGLDevice() before any other runtime calls.
*/
HANDLE_ERROR( cudaGLSetGLDevice( dev ) );
/*
* a bug in the Windows GLUT implementation prevents us from
* passing zero arguments to glutInit()
*/
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize (DIM, DIM);
glutInitWindowPosition (100, 100);
glutCreateWindow("CUDA and OpenGL example" );
printf("OpenGL version: %s\n", glGetString(GL_VERSION));
dim3 grid(DIM, DIM);
kernel<<<grid,1>>>(devPtr);
glutDisplayFunc(Draw);
/*
* Get pointers to functions (glext.h)
*/
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
glutMainLoop();
}
int main() {
display_and_exit();
return(0);
}
Unless you are using a Pixel Buffer Object, then this is exactly the sort of behavior you should expect from glDrawPixels (..., 0). That last parameter is a pointer to pixel data, it should be non-NULL whenever the source of pixel data is client memory. It can be NULL if you have a Pixel Buffer Object, the same way that the pointer in a call to glVertexPointer (...) can be NULL when using a Vertex Buffer Object.
You have acquired the necessary function pointers to create a Pixel Buffer Object in your code, however have not actually created one. Thus, passing NULL to glDrawPixels (...) will cause the driver to dereference a NULL pointer.