cuda 9.2 curand_init extremely slow - c++

I have a program where I generate arrays with random elements using CUDA. Since I upgraded from CUDA 9.1 to CUDA 9.2, the time it takes to do that has gone up from a fraction of a second (about 0.1 s) to almost two minutes, without changing any of the code. The problem seems to be the curand_init() function, as the rest runs at about the same speed. Was there a change I missed in the library, is this a bug, or is it a problem with my code?
This is an example:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <curand.h>
#include <curand_kernel.h>

#define cudaErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        std::cerr << "cudaAssert: " << cudaGetErrorString(code) << " " << file << ": " << line << std::endl;
        if (abort) exit(code);
    }
}

// One curand state per array element, seeded once.
__global__
void setup_curand_state (curandState *state, int seed, int dim)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    if (index < dim)
        curand_init(seed, index, 0, &state[index]);
}

// Fill the array with normally distributed random values.
__global__
void set_random (float* to, curandState* curand_state, int dim)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < dim)
        to[index] = curand_normal(&curand_state[index]);
}

int main () {
    int dim = 100000;
    float *data;
    cudaErrchk (cudaMallocManaged ((void**) &data, dim * sizeof (float)));
    curandState* curand_state;
    cudaErrchk (cudaMalloc (&curand_state, dim * sizeof (curandState)));
    setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
    cudaErrchk (cudaDeviceSynchronize());
    set_random <<<(dim + 1023) / 1024, 1024>>> (data, curand_state, dim);
    cudaErrchk (cudaDeviceSynchronize()); // wait for the kernel before freeing
    cudaFree (curand_state);
    cudaFree (data);
    return 0;
}
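To isolate where the time goes, one option (a sketch, not part of the original program) is to bracket the setup kernel with CUDA events; this should confirm that curand_init dominates:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "curand_init took " << ms << " ms" << std::endl;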

Answered by mrBonobo in a comment above:
Apparently, updating CUDA through apt silently broke the install. Code
compiled for 9.1 would still work, but around 100 to 1000 times slower.
Reinstalling nvidia-cuda-toolkit solved the problem.

Related

Read value incremented from all launched kernel threads

The output of the following CUDA code
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>

__global__ void test(int *count, int *mutex) {
    auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
    atomicAdd(count, 1);
    __threadfence();
    int const total_count = *count; // How to get all 512*256 here?
    if (0 == atomicCAS(mutex, 0, 1)) {
        printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
               __FILE__, __LINE__, __func__, idx, total_count);
    }
}

int main() {
    thrust::device_vector<int> data(2, 0);
    test<<<512, 256>>>(thrust::raw_pointer_cast(data.data()),
                       thrust::raw_pointer_cast(data.data()) + 1);
    std::vector<int> host_data(2);
    thrust::copy(data.begin(), data.end(), host_data.begin());
    printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
           host_data[0], host_data[1]);
    return 0;
}
is
test.cu:15 test Thread idx(17568) got the mutex. total_count = 56320
test.cu:25 main host_data=(131072,1)
The total_count may differ between runs. My goal is for all 131072 = 512*256 threads to call
atomicAdd(count, 1);
and for each thread to subsequently read *count back, which should then equal 131072 (the number of all launched threads).
Question
What is the best way to read *count after all threads have incremented it?
__threadfence() doesn't seem to do it, since the value is read back before all threads have incremented it.
I know about cooperative_groups, but that seems a bit heavy for something I would have expected to be more elementary. Also, even if cooperative_groups were used, would that limit the total number of blocks launched to some hardware-determined limit?
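A pattern that avoids a cooperative launch entirely is the classic "last block done" idiom. This is not from the original post; it is a minimal sketch (the kernel name and the blocks_done counter are illustrative), and it reports the total from the final block instead of using the mutex:
#include <cstdio>

__device__ unsigned int blocks_done = 0; // assumed zeroed before launch

__global__ void count_then_report(int *count) {
    atomicAdd(count, 1);   // every thread increments
    __syncthreads();       // the whole block has incremented
    if (threadIdx.x == 0) {
        __threadfence();   // make this block's increments visible device-wide
        // atomicAdd returns the previous value; the block that sees
        // gridDim.x - 1 is the last one to finish, so all increments are done
        if (atomicAdd(&blocks_done, 1) == gridDim.x - 1) {
            printf("total_count = %d\n", *count); // 512*256 with the launch above
        }
    }
}
Unlike grid.sync(), this does not make every thread wait, and it places no cooperative-launch limit on the number of blocks; only the last block reads the final total.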
Btw, this is a version that works with cooperative_groups, with the restricted number of blocks. (Thanks to @paleonix for the bugfix to kernel_args.)
#include <cooperative_groups.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <vector>

__global__ void test(int *count, int *mutex) {
    auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
    atomicAdd(count, 1);
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
    grid.sync();
    //__threadfence();
    int const total_count = *count; // How to get all 512*256 here?
    if (0 == atomicCAS(mutex, 0, 1)) {
        printf("%s:%u %s Thread idx(%d) got the mutex. total_count = %d\n",
               __FILE__, __LINE__, __func__, idx, total_count);
    }
}

int main() {
    thrust::device_vector<int> data(2, 0);
    void *args[] { thrust::raw_pointer_cast(data.data()),
                   thrust::raw_pointer_cast(data.data()) + 1 };
    void *kernel_args[] { &args[0], &args[1] };
    int dev = 0;
    int supportsCoopLaunch = 0;
    cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
    printf("%s:%u %s supportsCoopLaunch=%d\n", __FILE__, __LINE__, __func__, supportsCoopLaunch);
    // This will launch a grid that can maximally fill the GPU, on the default stream with kernel arguments
    int numBlocksPerSm = 0;
    // Number of threads my_kernel will be launched with
    int numThreads = 128;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, test, numThreads, 0);
    dim3 dimBlock(numThreads, 1, 1);
    dim3 dimGrid(deviceProp.multiProcessorCount * numBlocksPerSm, 1, 1);
    printf("%s:%u %s deviceProp.multiProcessorCount=%d numBlocksPerSm=%d\n", __FILE__, __LINE__, __func__,
           deviceProp.multiProcessorCount, numBlocksPerSm);
    cudaLaunchCooperativeKernel((void*)test, dimGrid, dimBlock, kernel_args);
    std::vector<int> host_data(2);
    thrust::copy(data.begin(), data.end(), host_data.begin());
    printf("%s:%u %s host_data=(%d,%d)\n", __FILE__, __LINE__, __func__,
           host_data[0], host_data[1]);
    return 0;
}
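One detail worth adding (not in the original post): cudaLaunchCooperativeKernel returns a cudaError_t, and an oversized grid fails to launch rather than launching partially, so checking the result is cheap insurance. A drop-in replacement for the launch line above:
cudaError_t err = cudaLaunchCooperativeKernel((void*)test, dimGrid, dimBlock, kernel_args);
if (err != cudaSuccess) // e.g. cudaErrorCooperativeLaunchTooLarge
    printf("cooperative launch failed: %s\n", cudaGetErrorString(err));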

Why do libraries prefer to obfuscate the code or provide no debug information instead of creating a special version containing debug information?

In libraries, we often see code that exists specifically for debugging.
It can sometimes really clutter the code.
Yet I don't see libraries that ship two versions of the code:
one version without debug information and one with debug information.
There are advantages and disadvantages:
for a large library, this could considerably increase the size of the code and force the same code to be maintained twice.
On the other hand, it would make possible a very thorough debug build, with special structures and many different checks that can be enabled via macros,
making the code much easier to debug.
Why do libraries prefer to obfuscate the code or provide no debug information instead of creating a special version containing debug information?
For example:
#include <iostream>
#include <cstdlib>
#include <unistd.h>

#ifdef NDEBUG
// Release build: no checks at all.
void send_message(int fd, const char* buffer, int len) {
    int total = 0;
    while (total < len) {
        int n = write(fd, buffer + total, len - total);
        total += n;
    }
}
#else
void print_and_exit(const char* message) {
    std::cerr << message << std::endl;
    exit(1);
}
#ifdef ENABLE_WARNING
void warning(const char* message) { std::cerr << message << std::endl; }
#else
void warning(const char*) {}
#endif
// Debug build: validate every argument before doing the work.
void send_message(int fd, const char* buffer, int len) {
    if (fd < 0) {
        print_and_exit("fd is invalid");
    }
    if (!buffer) {
        print_and_exit("NULL buffer");
    }
    if (len <= 0) {
        warning("buffer size <= 0");
    }
    int total = 0;
    while (total < len) {
        int n = write(fd, buffer + total, len - total);
        total += n;
    }
}
#endif
The debug code here is in the same file, but we can imagine separate files that are included depending on what the user defines.
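For instance, a minimal sketch of that separate-file idea (the .inl file names are hypothetical):
// send_message.cpp -- pick one implementation at compile time
#ifdef NDEBUG
#include "send_message_release.inl" // bare implementation, no checks
#else
#include "send_message_debug.inl"   // argument checks, warnings, debug structures
#endif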

an illegal memory access was encountered

I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>

#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
#define CUDADEBUG(cudaError)      \
    if (cudaError != cudaSuccess) \
        DEBUG(cudaGetErrorString(cudaError));
#define ERROR(str)  \
    {               \
        DEBUG(str); \
        exit(1);    \
    }

__global__ void makeGrey(
    unsigned char *&pimage,
    const int &cn,
    const size_t &total)
{
    unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned icn = i * cn;
    printf("%u\n", i);
    if (i < total)
    {
        float result = pimage[icn + 0] * .114 +
                       pimage[icn + 1] * .587 +
                       pimage[icn + 2] * .299;
        pimage[icn + 0] = result; //B
        pimage[icn + 1] = result; //G
        pimage[icn + 2] = result; //R
        // pimage[icn + 3] *= result; //A
    }
}

int main(int argc, char **argv)
{
    if (argc != 3)
        ERROR("usage: executable in out");
    cv::Mat image;
    unsigned char *dimage;
    image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
    if (!image.data)
        ERROR("Image null");
    if (image.empty())
        ERROR("Image empty");
    if (!image.isContinuous())
        ERROR("image is not continuous");
    const size_t N = image.total();
    const int cn = image.channels();
    const size_t numOfElems = cn * N;
    const int blockSize = 512;
    const int gridSize = (N - 1) / blockSize + 1;
    CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
    CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
    makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
    if (errAsync != cudaSuccess)
        std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
    CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
    CUDADEBUG(cudaFree(dimage)); //line 74
    cv::imwrite(argv[2], image);
    return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked the CV_VERSION macro, which is 4.5.3-dev, and CUDA Toolkit 11.4 is installed (nvcc version 11.4). Also, as far as I know, the kernel does not execute at all (I used the Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
As mentioned in a comment, your GPU function takes its arguments by reference.
__global__ void makeGrey(
    unsigned char *&pimage,
    const int &cn,
    const size_t &total)
This is bad: passing a reference to a function means, more or less, that you're passing an address where the value can be found, not the value itself.
In your situation those values are in host memory, NOT device/GPU memory; when the GPU tries to access them, it will most likely crash.
The types you are trying to pass, unsigned char*, int and size_t, are very cheap to copy; there's no need to pass them by reference in the first place.
__global__ void makeGrey(
    unsigned char *pimage,
    const int cn,
    const size_t total)
There are tools provided by NVIDIA to debug CUDA applications, but I'm not really familiar with them; you can also use printf inside GPU functions, but then you have to organize the output from potentially thousands of threads.
In general, whenever you call GPU functions, be very cautious about what you're passing as parameters, as they need to be passed from host memory to device memory. Usually you want to pass everything by value, any pointers need to point to device memory, and watch out for references.
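As an illustration of that advice (a sketch, not from the original answer; GreyParams and makeGreyV2 are made-up names), cheap scalar parameters can even be bundled in a small struct and passed by value, since kernel arguments are copied into the kernel's parameter space:
struct GreyParams { int cn; size_t total; };

__global__ void makeGreyV2(unsigned char *pimage, GreyParams p) // p is copied, not referenced
{
    unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < p.total)
    {
        unsigned icn = i * p.cn;
        float result = pimage[icn + 0] * .114f +
                       pimage[icn + 1] * .587f +
                       pimage[icn + 2] * .299f;
        pimage[icn + 0] = result; //B
        pimage[icn + 1] = result; //G
        pimage[icn + 2] = result; //R
    }
}
// launch: makeGreyV2<<<gridSize, blockSize>>>(dimage, GreyParams{cn, N});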

cudaMemcpy doesn't work

In the following code, cudaMemcpy doesn't work: it returns an error and the program exits. What could the problem be? It doesn't seem to me that I'm doing anything illegal, and the sizes of the vectors look fine to me.
It's possible the algorithm does something wrong at some point, but the idea is correct, I guess. The code sums n numbers by doing some partial sums in parallel and then re-iterating.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
__device__ int aug_vec(int *vec, const int& i, const int& size) {
return (i >= size) ? 0 : vec[i];
}
__global__ void sumVectorElements(int *vec,const int& size) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x);
vec[i] = aug_vec(vec, 2*i, size) + aug_vec(vec, 2 * i + 1, size);
}
__host__ int parallel_sum(int *vec,const int& size) {
cudaError_t err;
int *d_vec, *cp_vec;
int n_threads = (size >> 1) + (size & 1);
cp_vec = new int[size];
err = cudaMalloc((void**)&d_vec, size * sizeof(int));
if (err != cudaSuccess) {
std::cout << "error in cudaMalloc!" << std::endl;
exit(1);
}
err = cudaMemcpy(d_vec, vec, size*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy!" << std::endl;
exit(1);
}
int curr_size = size;
while (curr_size > 1) {
std::cout << "size = " << curr_size << std::endl;
sumVectorElements<<<1,n_threads>>>(d_vec, curr_size);
curr_size = (curr_size >> 1) + (curr_size & 1);
}
err = cudaMemcpy(cp_vec, d_vec, size*sizeof(int), cudaMemcpyDeviceToHost); //THIS LINE IS THE PROBLEM!
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy" << std::endl;
exit(1);
}
err = cudaFree(d_vec);
if (err != cudaSuccess) {
std::cout << "error in cudaFree" << std::endl;
exit(1);
}
int rval = cp_vec[0];
delete[] cp_vec;
return rval;
}
int main(int argc, char **argv) {
const int n_blocks = 1;
const int n_threads_per_block = 12;
int vec[12] = { 0 };
for (auto i = 0; i < n_threads_per_block; ++i) vec[i] = i + 1;
int sum = parallel_sum(vec, n_threads_per_block);
std::cout << "Sum = " << sum << std::endl;
system("pause");
return 0;
}
The cudaMemcpy operation after the kernel is actually asynchronously reporting an error that is due to the kernel execution. Your error reporting is primitive: if you have an error code, you may get more useful information by printing the result of passing that error code to cudaGetErrorString().
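For example, a minimal sketch of that kind of reporting (not in the original answer), as a drop-in for the launch inside the loop:
sumVectorElements<<<1, n_threads>>>(d_vec, curr_size);
cudaError_t kerr = cudaGetLastError();                    // launch-time errors
if (kerr == cudaSuccess) kerr = cudaDeviceSynchronize();  // execution-time errors
if (kerr != cudaSuccess)
    std::cout << "kernel error: " << cudaGetErrorString(kerr) << std::endl;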
The error occurs in the kernel due to the use of the reference argument:
__global__ void sumVectorElements(int *vec, const int& size) {
                                            ^^^^^^^^^^^^^^^
Any argument you pass to a kernel and expect to be usable in kernel code must either be passed by value or refer to data that is accessible/referenceable from device code. For example, passing a host pointer to device code is generally not legal in CUDA, because an attempt to dereference a host pointer in device code will fail.
The exceptions to the above are data/pointers/references that are accessible in device code; unified memory and pinned/mapped data are two examples, neither of which is being used here.
As a result, the reference parameter involves a reference (an address, basically) to an item (size) in host memory. When the kernel code attempts to use this item, it must first dereference it, and dereferencing a host item in device code is illegal in CUDA (unless using UM or pinned memory).
The solution in this case is simple: convert to an ordinary pass-by-value situation:
__global__ void sumVectorElements(int *vec, const int size) ...
                                                     ^
                                             remove ampersand

Segmentation fault on cudaMalloc or cudaMemcpy

New to CUDA programming and extremely confused as to why I am getting a segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;

typedef struct password_t {
    char word[56];
    size_t length;
} password;

typedef struct libEntry_t {
    uint8_t digest[16];
    password pwd;
} libEntry;

// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
//   numPwds - the number of passwords for which to generate hashes
//   pwds    - the list of passwords to hash
//   library - the array in which to store the unhashed/hashed password library
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
    // __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint8_t hashed[16];
    if (index < numPwds) {
        cuda_md5(&pwds[index], hashed);
        for (int j = 0; j < 16; j++) {
            library[index].digest[j] = hashed[j];
        }
        library[index].pwd = pwds[index];
    }
}

int crack_password (uint8_t* classified)
{
    int count = 10;
    unsigned int mem_size = sizeof(password) * count;
    password *h_pwds = (password*) malloc(mem_size);
    ifstream inFile("passwords.txt");
    if (!inFile) {
        cerr << "File passwords.txt not found." << endl;
        return -1;
    }
    string line;
    int i;
    while (getline(inFile, line)) {
        if (line.empty()) continue;
        memcpy(h_pwds[i].word, line.c_str(), line.size());
        h_pwds[i].length = line.size();
        cout << "Password: " << h_pwds[i].word << "\n";
        cout << "Length: " << h_pwds[i].length << "\n";
        i++;
    }
    inFile.close();
    /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
    password* d_pwds;
    cudaMalloc( (void**) &d_pwds, mem_size);
    cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
    libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
    libEntry* d_library;
    cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
    int h_numPwds = i;
    cout << "INT NUMPWDS: " << h_numPwds << "\n";
    int* d_numPwds;
    cudaMalloc( (void**) &d_numPwds, sizeof(int));
    cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
    /*unsigned int threads_per_block = 1024;
    dim3 grid(1024, 1, 1);
    dim3 threads(threads_per_block, 1, 1);
    // generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
    generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
    cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
    return 0;
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "usage: ./prog password\n");
        return 1;
    }
    crack_password((uint8_t*) argv[1]);
    cout << "Hack Password: " << argv[1] << "\n";
    return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment out the cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet; I am just focusing on the memory allocation before I actually execute and debug the kernel. Any help will be appreciated!
How I have been checking the return status:
#define CUDA_SAFE_CALL(call) do {                                     \
    CUDA_SAFE_CALL_NO_SYNC(call);                                     \
    cudaError err = cudaThreadSynchronize();                          \
    if (cudaSuccess != err) {                                         \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) );       \
        exit(EXIT_FAILURE);                                           \
    } } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc; apparently I didn't need to allocate or copy it at all, and could have just passed the value directly. So the error is due to the following lines, and I am not sure which one or why:
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT 2: I cleaned everything up and still can't figure it out. With CUDA_SAFE_CALL on the following line: CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get a segmentation fault, even when every other memory allocation command is commented out.
For anyone wondering what went wrong: I was able to fix it. I am not exactly sure what was wrong, but I had improper memory allocations in some places, and in other cases I didn't even need to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
I still don't fully understand memory management in CUDA (device and host allocations), but my code works now! Thank you all.
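For readers in the same spot, here is a minimal sketch of the basic host/device memory pattern (not from the original post; increment is a made-up kernel):
#include <cstdio>

__global__ void increment(int *d) { *d += 1; } // made-up kernel for illustration

int main() {
    int h_val = 41;       // lives in host memory
    int *d_val = nullptr; // will point to device memory
    cudaMalloc(&d_val, sizeof(int));                                // allocate on the device
    cudaMemcpy(d_val, &h_val, sizeof(int), cudaMemcpyHostToDevice); // host -> device
    increment<<<1, 1>>>(d_val); // kernels may dereference d_val, never &h_val
    cudaMemcpy(&h_val, d_val, sizeof(int), cudaMemcpyDeviceToHost); // device -> host
    cudaFree(d_val);
    printf("%d\n", h_val);      // prints 42
    return 0;
}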