In the following code, the cudaMemcpy call is not working: it returns an error and the program exits. What could be the problem? I don't think I'm doing anything illegal, and the sizes of the vectors look fine to me.
It may be that the algorithm does something wrong at some point, but I believe the idea is correct. The code sums n numbers by computing partial sums in parallel and then iterating.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
__device__ int aug_vec(int *vec, const int& i, const int& size) {
return (i >= size) ? 0 : vec[i];
}
__global__ void sumVectorElements(int *vec,const int& size) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x);
vec[i] = aug_vec(vec, 2*i, size) + aug_vec(vec, 2 * i + 1, size);
}
__host__ int parallel_sum(int *vec,const int& size) {
cudaError_t err;
int *d_vec, *cp_vec;
int n_threads = (size >> 1) + (size & 1);
cp_vec = new int[size];
err = cudaMalloc((void**)&d_vec, size * sizeof(int));
if (err != cudaSuccess) {
std::cout << "error in cudaMalloc!" << std::endl;
exit(1);
}
err = cudaMemcpy(d_vec, vec, size*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy!" << std::endl;
exit(1);
}
int curr_size = size;
while (curr_size > 1) {
std::cout << "size = " << curr_size << std::endl;
sumVectorElements<<<1,n_threads>>>(d_vec, curr_size);
curr_size = (curr_size >> 1) + (curr_size & 1);
}
err = cudaMemcpy(cp_vec, d_vec, size*sizeof(int), cudaMemcpyDeviceToHost); //THIS LINE IS THE PROBLEM!
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy" << std::endl;
exit(1);
}
err = cudaFree(d_vec);
if (err != cudaSuccess) {
std::cout << "error in cudaFree" << std::endl;
exit(1);
}
int rval = cp_vec[0];
delete[] cp_vec;
return rval;
}
int main(int argc, char **argv) {
const int n_blocks = 1;
const int n_threads_per_block = 12;
int vec[12] = { 0 };
for (auto i = 0; i < n_threads_per_block; ++i) vec[i] = i + 1;
int sum = parallel_sum(vec, n_threads_per_block);
std::cout << "Sum = " << sum << std::endl;
system("pause");
return 0;
}
The cudaMemcpy operation after the kernel is actually reporting, asynchronously, an error that occurred during the kernel execution. Your error reporting is primitive: when you have an error code, you can get more useful information by passing it to cudaGetErrorString() and printing the result.
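For instance, a minimal checking helper along these lines (the helper name is ours, not part of the CUDA API) prints the runtime's own description of the failure:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Hypothetical helper: report a CUDA error with its human-readable description
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// usage, for example:
// checkCuda(cudaMemcpy(cp_vec, d_vec, size * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");

With that in place, the failing copy reports the runtime's description of the underlying kernel error instead of a bare "error in cudaMemcpy".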
The error is occurring in the kernel due to use of the reference argument:
__global__ void sumVectorElements(int *vec,const int& size) {
                                           ^^^^^^^^^^^^^^^
Any argument you pass to a kernel and expect to be usable in kernel code must refer to data that is passed by value, or else data that is accessible/referenceable from device code. For example, passing a host pointer to device code is generally not legal in CUDA, because an attempt to dereference a host pointer in device code will fail.
The exceptions to the above would be data/pointers/references that are accessible in device code. Unified memory and pinned/mapped data are two examples, neither of which are being used here.
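As a hedged illustration of the unified-memory exception (the names here are illustrative, not taken from the question): a pointer obtained from cudaMallocManaged can be dereferenced from both host and device code, which is exactly what an ordinary host address cannot do:

#include <cstdio>
#include "cuda_runtime.h"

__global__ void readManaged(const int *p) {
    printf("device sees %d\n", *p); // legal: p refers to managed memory
}

int main() {
    int *p;
    cudaMallocManaged(&p, sizeof(int)); // accessible from host and device
    *p = 42;                            // host write
    readManaged<<<1, 1>>>(p);           // device dereferences the same pointer
    cudaDeviceSynchronize();
    cudaFree(p);
    return 0;
}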
As a result, the reference parameter involves a reference (an address, basically) to an item (size) in host memory. When the kernel code attempts to use this item, it must first dereference it. Dereferencing a host item in device code is illegal in CUDA (unless using unified or pinned memory).
The solution in this case is simple: convert to an ordinary pass-by-value situation:
__global__ void sumVectorElements(int *vec,const int size) ...
                                                    ^
remove ampersand
I would like to measure the maximum memory usage of abc.exe on random tests generated by gen.exe. How could I do that?
My code that runs abc.exe on tests from gen.exe looks like this:
#include <bits/stdc++.h>
using namespace std;

int main()
{
    int i = 0;
    while (true)
    {
        string si = to_string(i);
        cout << i << "\n";
        if (system(("echo " + si + "| ./gen.exe > test.in").c_str())) // gen.exe is the test generator
        {
            cout << "gen error\n";
            break;
        }
        if (system(("./abc.exe < test.in > a.out"))) // abc.exe is the program I want to test
        {
            cout << "abc error\n";
            break;
        }
        i++;
    }
}
I know that I can use time -v ./abc.exe, but then the memory usage is printed to the terminal, whereas I'd like to save it to a variable.
You can use getrusage( RUSAGE_CHILDREN, ... ) to obtain the maximum resident memory. Note that this call will return the maximum memory used by the biggest child at that point in time.
In the example below I used boost::process because it gives better control, but it's up to you whether to use std::system instead; it works the same way.
#include <string>
#include <cstdint>
#include <cstdlib>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <boost/process/child.hpp>
#include <sys/resource.h>

namespace bp = boost::process;

int parent( const std::string& exename )
{
    // Loop from 0 to 10 megabytes
    for ( int j=0; j<10; ++j )
    {
        // Command name is the name of this executable plus one argument with the size
        std::string gencmd = exename + " " + std::to_string(j);
        // Start the process
        bp::child child( gencmd );
        // Wait for it to allocate memory
        sleep(1);
        // Query the memory usage at this point in time
        struct rusage ru;
        getrusage( RUSAGE_CHILDREN, &ru );
        std::cerr << "Loop:" << j << " mem:" << ru.ru_maxrss/1024. << " MB" << std::endl;
        // Wait for the process to quit
        child.wait();
        if ( child.exit_code()!=0 )
        {
            std::cerr << "Error executing child:" << child.exit_code() << std::endl;
            return 1;
        }
    }
    return 0;
}

int child( int size ) {
    // Allocate "size" megabytes explicitly
    size_t memsize = size*1024*1024;
    uint8_t* ptr = (uint8_t*)malloc( memsize );
    memset( ptr, size, memsize );
    // Wait for the parent to sample our memory usage
    sleep( 2 );
    // Free the memory
    free( ptr );
    return 0;
}

int main( int argc, char* argv[] )
{
    // Without arguments, this is the parent.
    // It passes the name of the binary to the child.
    if ( argc==1 ) return parent( argv[0] );
    return child( std::atoi( argv[1] ) );
}
It prints
$ ./env_test
Loop:0 mem:0 MB
Loop:1 mem:3.5625 MB
Loop:2 mem:4.01953 MB
Loop:3 mem:5.05469 MB
Loop:4 mem:6.04688 MB
Loop:5 mem:7.05078 MB
Loop:6 mem:7.78516 MB
Loop:7 mem:8.97266 MB
Loop:8 mem:9.82031 MB
Loop:9 mem:10.8867 MB
If you cannot use the Boost libraries, you'll have to work a little harder, but it is still feasible.
If you just want to know the maximum size your child processes ever reached, then the following works with std::system:
#include <cstdio>
#include <cstdlib>
#include <cerrno>
#include <string>
#include <iostream>
#include <sstream>
#include <string.h>
#include <unistd.h>
#include <sys/resource.h>

int main(int argc, char* argv[]) {
    if (argc > 1) {
        // Child: allocate "size" megabytes, hold them while the parent samples, then exit
        size_t size = ::atol(argv[1]);
        size_t memsize = size * 1024 * 1024;
        void* ptr = ::malloc(memsize);
        memset(ptr, 0, memsize);
        ::sleep(2);
        ::free(ptr);
        return 0;
    }
    for (int j = 0; j < 10; ++j) {
        std::ostringstream cmd;
        cmd << argv[0] << " " << j;
        int res = std::system(cmd.str().c_str());
        if (res < 0) {
            fprintf(stderr, "ERROR system: %s\n", strerror(errno));
            break;
        }
        struct rusage ru;
        res = getrusage(RUSAGE_CHILDREN, &ru);
        size_t maxmem = ru.ru_maxrss;
        fprintf(stderr, "Loop:%d MaxMem:%zu\n", j, maxmem);
    }
    return 0;
}
It prints
Loop:0 MaxMem:3552
Loop:1 MaxMem:4192
Loop:2 MaxMem:5148
Loop:3 MaxMem:6228
Loop:4 MaxMem:7364
Loop:5 MaxMem:8456
Loop:6 MaxMem:9120
Loop:7 MaxMem:10188
Loop:8 MaxMem:11324
Loop:9 MaxMem:12256
However, if you want to track memory usage during the child process's execution, you cannot use std::system(). Instead, you need to call fork() to spawn a new process and then execv() to execute a bash command.
#include <string>
#include <cstdint>
#include <cstdlib>
#include <cerrno>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <sys/resource.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <vector>

int parent(const std::string& exename) {
    // Loop from 0 to 10 megabytes
    for (int j = 0; j < 10; ++j) {
        // Command name is the name of this executable plus one argument with the size
        std::string gencmd = exename + " " + std::to_string(j);
        // Start the process
        pid_t pid = fork();
        if (pid == 0) { // child
            const char* args[] = {"/bin/bash", "-c", gencmd.c_str(), (char*)0};
            execv("/bin/bash", (char**)args);
            // Should never return
            std::cerr << "execv error: " << strerror(errno) << std::endl;
            return 1;
        }
        // parent
        long maxmem = 0;
        while (true) {
            int status;
            pid_t rid = ::waitpid(pid, &status, WNOHANG);
            if (rid < 0) {
                if (errno != ECHILD) {
                    std::cerr << "waitpid:" << strerror(errno) << std::endl;
                    return 2;
                }
                break;
            }
            if (rid == pid) {
                if (WIFEXITED(status)) {
                    break;
                }
            }
            // Wait for it to allocate memory
            usleep(10000);
            // Query the memory usage at this point in time
            struct rusage ru;
            int res = getrusage(RUSAGE_CHILDREN, &ru);
            if (res != 0) {
                if (errno != ECHILD) {
                    std::cerr << "getrusage:" << errno << strerror(errno) << std::endl;
                }
                break;
            }
            if (maxmem < ru.ru_maxrss) {
                maxmem = ru.ru_maxrss;
            }
        }
        std::cerr << "Loop:" << j << " mem:" << maxmem / 1024. << " MB" << std::endl;
    }
    return 0;
}

int child(int size) {
    // Allocate "size" megabytes explicitly
    size_t memsize = size * 1024 * 1024;
    uint8_t* ptr = (uint8_t*)malloc(memsize);
    memset(ptr, size, memsize);
    // Wait for the parent to sample our memory usage
    sleep(2);
    // Free the memory
    free(ptr);
    return 0;
}

int main(int argc, char* argv[]) {
    // Without arguments, this is the parent.
    // It passes the name of the binary to the child.
    if (argc == 1) return parent(argv[0]);
    return child(std::atoi(argv[1]));
}
The result on my machine is:
$ ./fork_test
Loop:0 mem:3.22656 MB
Loop:1 mem:3.69922 MB
Loop:2 mem:4.80859 MB
Loop:3 mem:5.92578 MB
Loop:4 mem:6.87109 MB
Loop:5 mem:8.05469 MB
Loop:6 mem:8.77344 MB
Loop:7 mem:9.71875 MB
Loop:8 mem:10.7422 MB
Loop:9 mem:11.6797 MB
I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>

#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
#define CUDADEBUG(cudaError)      \
    if (cudaError != cudaSuccess) \
        DEBUG(cudaGetErrorString(cudaError));
#define ERROR(str)  \
    {               \
        DEBUG(str); \
        exit(1);    \
    }

__global__ void makeGrey(
    unsigned char *&pimage,
    const int &cn,
    const size_t &total)
{
    unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned icn = i * cn;
    printf("%u\n", i);
    if (i < total)
    {
        float result = pimage[icn + 0] * .114 +
                       pimage[icn + 1] * .587 +
                       pimage[icn + 2] * .299;
        pimage[icn + 0] = result; //B
        pimage[icn + 1] = result; //G
        pimage[icn + 2] = result; //R
        // pimage[icn + 3] *= result; //A
    }
}

int main(int argc, char **argv)
{
    if (argc != 3)
        ERROR("usage: executable in out");
    cv::Mat image;
    unsigned char *dimage;
    image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
    if (!image.data)
        ERROR("Image null");
    if (image.empty())
        ERROR("Image empty");
    if (!image.isContinuous())
        ERROR("image is not continuous");
    const size_t N = image.total();
    const int cn = image.channels();
    const size_t numOfElems = cn * N;
    const int blockSize = 512;
    const int gridSize = (N - 1) / blockSize + 1;
    CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
    CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
    makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
    if (errAsync != cudaSuccess)
        std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
    CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); // line 73
    CUDADEBUG(cudaFree(dimage));                                                                           // line 74
    cv::imwrite(argv[2], image);
    return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked the CV_VERSION macro, which is 4.5.3-dev, and CUDA Toolkit 11.4 is installed (nvcc version 11.4). Also, as far as I know, the kernel does not execute at all (I used the Nsight debugger and printf). I cannot understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
As mentioned in a comment, your GPU function takes its arguments by reference.
__global__ void makeGrey(
    unsigned char *&pimage,
    const int &cn,
    const size_t &total)
This is bad: passing a reference to a function means, more or less, that you're passing an address where the value can be found, not the value itself.
In your situation those values are in host memory, NOT device/GPU memory; when the GPU tries to access those values it will most likely crash.
The types you are trying to pass, unsigned char*, int, and size_t, are very cheap to copy, so there's no need to pass them by reference in the first place.
__global__ void makeGrey(
    unsigned char *pimage,
    const int cn,
    const size_t total)
There are tools provided by NVIDIA to debug CUDA applications, but I'm not really familiar with them. You can also use printf inside GPU functions, but you will have to organize the output from potentially thousands of threads.
In general, whenever you call GPU functions, be very cautious about what you're passing as parameters, as they need to be passed from host memory to device memory. Usually you want to pass everything by value, any pointers need to point to device memory, and watch out for references.
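A hedged sketch of that pattern (names are illustrative): scalar parameters travel by value, and the only pointer refers to device memory obtained from cudaMalloc:

__global__ void scale(unsigned char *img, int n, float factor) // everything by value
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        img[i] = (unsigned char)(img[i] * factor);
}

// usage sketch: d_img must come from cudaMalloc, never from a host array
// unsigned char *d_img;
// cudaMalloc(&d_img, n);
// cudaMemcpy(d_img, h_img, n, cudaMemcpyHostToDevice);
// scale<<<(n + 255) / 256, 256>>>(d_img, n, 0.5f);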
I have a program where multiple host threads try to capture a cuda graph and execute it.
It produces the correct results, but it cannot be run with cuda-memcheck.
When run with cuda-memcheck, the following error appears.
Program hit cudaErrorStreamCaptureInvalidated (error 901) due to "operation failed due to a previous error during capture" on CUDA API call to cudaLaunchKernel.
When only one host thread is used cuda-memcheck shows no error.
Here is example code, which can be compiled with nvcc 10.2: nvcc -arch=sm_61 -O3 main.cu -o main
#include <iostream>
#include <memory>
#include <algorithm>
#include <cassert>
#include <vector>
#include <thread>
#include <iterator>

#ifndef CUERR
#define CUERR { \
    cudaError_t err; \
    if ((err = cudaGetLastError()) != cudaSuccess) { \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : " \
                  << __FILE__ << ", line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}
#endif

__global__
void kernel(int id, int num){
    printf("kernel %d, id %d\n", num, id);
}

struct Data{
    bool isValidGraph = false;
    int id = 0;
    int deviceId = 0;
    cudaGraphExec_t execGraph = nullptr;
    cudaStream_t stream = nullptr;
};

void buildGraphViaCapture(Data& data){
    cudaSetDevice(data.deviceId); CUERR;
    if(!data.isValidGraph){
        std::cerr << "rebuild graph\n";
        if(data.execGraph != nullptr){
            cudaGraphExecDestroy(data.execGraph); CUERR;
        }
        assert(data.stream != cudaStreamLegacy);
        cudaStreamCaptureStatus captureStatus;
        cudaStreamIsCapturing(data.stream, &captureStatus); CUERR;
        assert(captureStatus == cudaStreamCaptureStatusNone);
        cudaStreamBeginCapture(data.stream, cudaStreamCaptureModeRelaxed); CUERR;
        for(int i = 0; i < 64; i++){
            kernel<<<1,1,0,data.stream>>>(data.id, i);
        }
        cudaGraph_t graph;
        cudaStreamEndCapture(data.stream, &graph); CUERR;
        cudaGraphExec_t execGraph;
        cudaGraphNode_t errorNode;
        auto logBuffer = std::make_unique<char[]>(1025);
        std::fill_n(logBuffer.get(), 1025, 0);
        cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, logBuffer.get(), 1025);
        if(status != cudaSuccess){
            if(logBuffer[1024] != '\0'){
                std::cerr << "cudaGraphInstantiate: truncated error message: ";
                std::copy_n(logBuffer.get(), 1025, std::ostream_iterator<char>(std::cerr, ""));
                std::cerr << "\n";
            }else{
                std::cerr << "cudaGraphInstantiate: error message: ";
                std::cerr << logBuffer.get();
                std::cerr << "\n";
            }
            CUERR;
        }
        cudaGraphDestroy(graph); CUERR;
        data.execGraph = execGraph;
        data.isValidGraph = true;
    }
}

void execute(Data& data){
    buildGraphViaCapture(data);
    assert(data.isValidGraph);
    cudaGraphLaunch(data.execGraph, data.stream); CUERR;
}

void initData(Data& data, int id, int deviceId){
    data.id = id;
    data.deviceId = deviceId;
    cudaStreamCreate(&data.stream); CUERR;
}

void destroyData(Data& data){
    if(data.execGraph != nullptr){
        cudaGraphExecDestroy(data.execGraph); CUERR;
    }
    cudaStreamDestroy(data.stream); CUERR;
}

int main(){
    std::vector<int> deviceIds{0};
    std::vector<std::thread> threads;
    for(int deviceId : deviceIds){
        for(int k = 0; k < 4; k++){
            threads.emplace_back([&,deviceId](){
                std::vector<Data> vec(3);
                initData(vec[0], deviceId * 10 + 4*k + 0, deviceId);
                initData(vec[1], deviceId * 10 + 4*k + 1, deviceId);
                int cur = 0;
                for(int iter = 0; iter < 10; iter++){
                    cudaStreamSynchronize(vec[cur].stream); CUERR;
                    execute(vec[cur]); CUERR;
                    cur = 1 - cur;
                }
                cudaStreamSynchronize(vec[0].stream); CUERR;
                cudaStreamSynchronize(vec[1].stream); CUERR;
                destroyData(vec[0]);
                destroyData(vec[1]);
            });
        }
    }
    for(auto& t : threads){
        t.join();
    }
    cudaDeviceReset();
    return 0;
}
Why does the error only appear when multiple threads are used, and why exactly is the capture invalidated?
Edit, 20th March 2022:
The error still exists with CUDA-MEMCHECK version 11.5.114. However, cuda-memcheck is now deprecated in favor of compute-sanitizer. The latter no longer reports cudaErrorStreamCaptureInvalidated.
CUDA graphs are not thread-safe. If you read the documentation, it says:
Graph objects (cudaGraph_t, CUgraph) are not internally synchronized and must not be accessed concurrently from multiple threads. API calls accessing the same graph object must be serialized externally.
You need to access the graph object in a critical section.
We ran into this problem as well: even though we are working on different CUDA graph objects, we still get errors. Our (ugly) solution is to wrap cudaStreamBeginCapture and cudaStreamEndCapture in a RAII struct with a static mutex; a sketch follows below.
It solves the problem for now, but I'm going to inquire further on the CUDA developer forums.
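A minimal sketch of such a guard (the struct and its name are ours, not a CUDA API), assuming every capturing thread shares one static mutex that is held from begin-capture until the guard goes out of scope:

#include <mutex>

// Illustrative RAII guard: serializes the whole capture phase across host threads.
struct ScopedCapture {
    static std::mutex mtx;  // one lock shared by every capture
    cudaStream_t stream;
    explicit ScopedCapture(cudaStream_t s) : stream(s) {
        mtx.lock();
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
    }
    cudaGraph_t end() {  // call exactly once, before the guard is destroyed
        cudaGraph_t graph = nullptr;
        cudaStreamEndCapture(stream, &graph);
        return graph;
    }
    ~ScopedCapture() { mtx.unlock(); }
};
std::mutex ScopedCapture::mtx;

// usage sketch inside buildGraphViaCapture:
// ScopedCapture cap(data.stream);
// for (int i = 0; i < 64; i++) kernel<<<1,1,0,data.stream>>>(data.id, i);
// cudaGraph_t graph = cap.end();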
I have a program where I generate arrays with random elements using CUDA. Since I upgraded from CUDA 9.1 to CUDA 9.2, the time it takes to do that has gone up from a fraction of a second (about 0.1 s) to almost two minutes, without any change to the code. The problem seems to be the curand_init() function, as the rest runs at about the same speed. Was there a change I missed in the library, is this a bug, or is it a problem with my code?
This is an example:
#include <iostream>
#include <ctime>
#include <curand.h>
#include <curand_kernel.h>

#define cudaErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        std::cerr << "cudaAssert: " << cudaGetErrorString(code) << " " << file << ": " << line << std::endl;
        if (abort) exit(code);
    }
}

__global__
void setup_curand_state (curandState *state, int seed, int dim)
{
    int index = threadIdx.x + blockDim.x*blockIdx.x;
    if (index < dim)
        curand_init(seed, index, 0, &state[index]);
}

__global__
void set_random (float* to, curandState* curand_state, int dim)
{
    int index = threadIdx.x + blockIdx.x*blockDim.x;
    if (index < dim)
        to[index] = curand_normal(&curand_state[index]);
}

int main () {
    int dim = 100000;
    float *data;
    cudaErrchk (cudaMallocManaged ((void**) &data, dim * sizeof (float)));
    curandState* curand_state;
    cudaErrchk (cudaMalloc (&curand_state, dim * sizeof (curandState)));
    setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
    cudaErrchk (cudaDeviceSynchronize());
    set_random <<<(dim + 1023) / 1024, 1024>>> (data, curand_state, dim);
    cudaFree (data);
    return 0;
}
Answered by mrBonobo in the comment above:
Apparently, updating CUDA through apt silently broke the install. Code compiled for 9.1 would still work, but around 100 to 1000 times slower. Reinstalling nvidia-cuda-toolkit solved the error.
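Assuming a Debian/Ubuntu setup like the one described, the reinstall would look something like this:

$ sudo apt-get install --reinstall nvidia-cuda-toolkit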
New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;

typedef struct password_t {
    char word[56];
    size_t length;
} password;

typedef struct libEntry_t {
    uint8_t digest[16];
    password pwd;
} libEntry;

// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
//   numPwds - the number of passwords for which to generate hashes
//   pwds    - the list of passwords to hash
//   library - the array in which to store the unhashed/hashed password library
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
    // __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint8_t hashed[16];
    if (index < numPwds) {
        cuda_md5(&pwds[index], hashed);
        for (int j = 0; j < 16; j++) {
            library[index].digest[j] = hashed[j];
        }
        library[index].pwd = pwds[index];
    }
}

int crack_password (uint8_t* classified)
{
    int count = 10;
    unsigned int mem_size = sizeof(password) * count;
    password *h_pwds = (password*) malloc(mem_size);
    ifstream inFile("passwords.txt");
    if (!inFile) {
        cerr << "File passwords.txt not found." << endl;
        return -1;
    }
    string line;
    int i;
    while (getline(inFile, line)) {
        if (line.empty()) continue;
        memcpy(h_pwds[i].word, line.c_str(), line.size());
        h_pwds[i].length = line.size();
        cout << "Password: " << h_pwds[i].word << "\n";
        cout << "Length: " << h_pwds[i].length << "\n";
        i++;
    }
    inFile.close();

    /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
    password* d_pwds;
    cudaMalloc( (void**) &d_pwds, mem_size);
    cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
    libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
    libEntry* d_library;
    cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
    int h_numPwds = i;
    cout << "INT NUMPWDS: " << h_numPwds << "\n";
    int* d_numPwds;
    cudaMalloc( (void**) &d_numPwds, sizeof(int));
    cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
    /*unsigned int threads_per_block = 1024;
    dim3 grid(1024, 1, 1);
    dim3 threads(threads_per_block, 1, 1);
    // generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
    generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
    cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
    return 0;
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "usage: ./prog password\n");
        return 1;
    }
    crack_password((uint8_t*) argv[1]);
    cout << "Hack Password: " << argv[1] << "\n";
    return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment out the cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet; I am just focusing on the memory allocation before I actually execute and debug the kernel. Any help will be appreciated!
How I have been checking the return status:
#define CUDA_SAFE_CALL(call) do {                                     \
    CUDA_SAFE_CALL_NO_SYNC(call);                                     \
    cudaError err = cudaThreadSynchronize();                          \
    if( cudaSuccess != err) {                                         \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) );       \
        exit(EXIT_FAILURE);                                           \
    } } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc; apparently I didn't need to allocate or copy it at all and could have just passed it by value. So the error is due to the following lines, and I am not sure which one, or why:
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT 2: I cleaned up everything and still can't figure it out. With CUDA_SAFE_CALL on the following line: CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get a segmentation fault even when every other memory allocation command is commented out.
For anyone wondering what went wrong: I was able to fix it. I am not sure exactly what was wrong, but I had improper memory allocations in some places, and in other cases I didn't even need to use cudaMalloc or cudaMemcpy. Also, using "What is the canonical way to check for errors using the CUDA runtime API?" for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
I still don't fully understand memory management in CUDA (device and host allocations), but my code works now! Thank you all.