cudaMemcpy throws InvalidValue error when copying from device to host - c++

I've been trying to implement a one-dimensional FFT using cuFFT. An InvalidValue error is thrown and no meaningful results are produced.
I've tried to ensure that each error is caught, and I believe that the cudaMemcpy from DeviceToHost causes the issue, though I am not sure why, nor how to fix it. The data size parameter in cudaMemcpy follows the same relation as given in the cuFFT documentation.
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
// cuda macros
#define NX 100 // number of points
#define BATCH 1 // number of ffts to perform
#define RANK 1 //
#define IDIST 1 // distance between 1st elements of batches
#define ISTRIDE 1 // do every ISTRIDEth index
#define ODIST 1 // distance between 1st elements of output
#define OSTRIDE 1 // distance between output elements
void fft1d() {
    // create plan for performing fft
    cufftHandle plan;
    if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
        printf("Failed to create 1D plan\n");
        return;
    }
    // assemble data
double temp_data[] = {2.598076211353316, 3.2402830701637395, 3.8494572900049224, 4.419388724529261, 4.944267282795252, 5.41874215947433, 5.837976382931011, 6.197696125093141, 6.494234270429254, 6.724567799874842, 6.886348608602047, 6.97792744346504, 6.998370716093996, 6.9474700202387565, 6.8257442563389, 6.6344343416615565, 6.37549055993378, 6.051552679431957, 5.665923042211819, 5.222532898817316, 4.725902331664744, 4.181094175657916, 3.5936624057845576, 2.9695955178498603, 2.315255479544737, 1.6373128742041732, 0.9426788984240022, 0.23843490677753865, -0.46823977812093664, -1.1701410542749289, -1.8601134815746807, -2.531123226988873, -3.176329770049035, -3.7891556376344524, -4.363353457155562, -4.893069644570959, -5.3729040779788875, -5.797965148448726, -6.163919626883915, -6.467036838555256, -6.704226694973039, -6.873071195387157, -6.971849076777267, -6.999553361041935, -6.955901620504255, -6.84133885708361, -6.657032965782207, -6.404862828733319, -6.0873991611848375, -5.707878304681281, -5.270169234606201, -4.778734118422206, -4.23858282669252, -3.6552218606153755, -3.0345982167228436, -2.383038761007964, -1.707185730522749, -1.0139290199674, -0.31033594356630245, 0.39642081173600463, 1.0991363072871054, 1.7906468025248339, 2.463902784786862, 3.1120408346390414, 3.728453594100783, 4.306857124485735, 4.841354967187034, 5.326498254347925, 5.757341256627454, 6.129491801786784, 6.439156050110601, 6.683177170206378, 6.859067520906216, 6.965034011197066, 6.999996379650895, 6.963598207007518, 6.85621054964381, 6.678928156888352, 6.433558310743566, 6.122602401787424, 5.749230429076629, 5.317248684008804, 4.831060947586139, 4.295623596650021, 3.7163950767501706, 3.0992802567403803, 2.4505702323708074, 1.7768781925409076, 1.0850720020162676, 0.3822041878858906, -0.3245599564963766, -1.0280154171511335, -1.7209909100394047, -2.3964219877733033, -3.0474230571943477, -3.667357573646071, -4.249905696354359, -4.78912871521179, -5.279529592175676, -5.716109000098287};
    cufftReal *idata;
    cudaMalloc((void**) &idata, sizeof(cufftComplex)*NX);
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to allocate memory space for input data.\n");
        return;
    }
    cudaMemcpy(idata, temp_data, sizeof(temp_data)/sizeof(double), cudaMemcpyHostToDevice);
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to load time data to memory.\n");
        return;
    }
    // prepare memory for return data
    cufftComplex *odata;
    cudaMalloc((void**) &odata, sizeof(cufftComplex)*(NX/2 + 1));
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to allocate memory for output data.\n");
    }
    // perform fft
    if (cufftExecR2C(plan, idata, odata) != CUFFT_SUCCESS) {
        printf("Failed to perform fft.\n");
        return;
    }
I think the error is thrown here, at the cudaMemcpy.
    // grab data from graphics and print (memcpy waits until complete) cuda memcopy doesn't complete
    // can return errors from previous cuda calls if they haven't been caught
    cufftComplex *out_temp_data;
    size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
    cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
    int error_value = cudaGetLastError();
    printf("cudaMemcpy from device state: %i\n", error_value);
    if(error_value != cudaSuccess) {
        printf("Failed to pull data from device.\n");
        return;
    }
    for (size_t i = 0; i < (NX/2 + 1); i++) {
        printf("%lu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
    }
    // clean up
    cufftDestroy(plan);
    cudaFree(idata);
}

int main() {
    fft1d();
    return 0;
}

Memory must be allocated on the host before cudaMemcpy can write data to it. Thanks to generic-opto-guy for pointing this out.
In this case:
out_temp_data = new cufftComplex[NX/2 + 1];
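To make the fix concrete, here is a minimal sketch of the tail of fft1d() with that allocation in place (assuming the rest of the function is unchanged; as a small extra, this sketch checks cudaMemcpy's return value directly rather than calling cudaGetLastError afterwards):

    // host buffer must exist before the device-to-host copy
    cufftComplex *out_temp_data = new cufftComplex[NX/2 + 1];
    size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
    if (cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Failed to pull data from device.\n");
        return;
    }
    for (size_t i = 0; i < (NX/2 + 1); i++) {
        printf("%zu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
    }
    delete[] out_temp_data;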

Related

Why does my data not fit into a CUDA Texture Object?

I'm trying to fill a CUDA Texture Object with some data but the call to cudaCreateTextureObject fails with the following error (edit: on both a GTX 1080TI and a RTX 2080TI):
GPU ERROR! 'invalid argument' (err code 11)
It works if I put less data into my texture, so my guess is that my computation of how much data I can fit into a texture is off.
My thought process is as follows:
(executable code follows below)
My data comes in the form of (76,76) images where each pixel is a float. What I would like to do is to store a column of images in a Texture Object; as I understand it, cudaMallocPitch is the way to do this.
When computing the number of images I can store in one texture I'm using the following formula to determine how much space a single image needs:
GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float)
Where the first factor is the memory pitch on a GTX 1080TI card (512 bytes). The number of bytes that I can store in a 1D texture is given as 2^27 here. When I divide the latter by the former (512 * 76 * 4 = 155,648 bytes per image; 2^27 / 155,648 ≈ 862.3) I get 862.3, which I take to be the number of images I can store in one Texture Object. However, when I try to store more than 855 images in my buffer, the program crashes with the error above.
Here's the code:
In the following the main function (a) sets up all the relevant parameters, (b) allocates the memory using cudaMallocPitch, and (c) configures and creates a CUDA Texture Object:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>

#define GTX_1080TI_MEM_PITCH 512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27

//=====================================================================[ util ]
// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit( EXIT_FAILURE );
    }
}

// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
    cudaError cu_err = cudaGetLastError();
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit(EXIT_FAILURE);
    }
}

//=====================================================================[ main ]
int main(){
    // setup
    unsigned int img_dim_x = 76;
    unsigned int img_dim_y = 76;
    unsigned int img_num = 856; // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?
    unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
    unsigned int img_num_per_tex = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;
    fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
    fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );

    // allocate pitched memory
    size_t img_tex_pitch;
    float* d_img_tex_data;
    CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );
    assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
    fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n",
             img_num*img_dim_x*img_dim_y*sizeof(float),
             img_num*img_tex_pitch*img_dim_y*sizeof(float),
             img_tex_pitch,
             GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
             GTX_1080TI_1DTEX_WIDTH );

    // generic resource descriptor
    cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypePitch2D;
    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res_desc.res.pitch2D.devPtr = d_img_tex_data;
    res_desc.res.pitch2D.width = img_dim_x;
    res_desc.res.pitch2D.height = img_dim_y*img_num;
    res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;

    // texture descriptor
    cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0] = cudaAddressModeClamp;
    tex_desc.addressMode[1] = cudaAddressModeClamp;
    tex_desc.filterMode = cudaFilterModeLinear; // for linear interpolation (NOTE: this breaks normal integer indexing!)
    tex_desc.readMode = cudaReadModeElementType;
    tex_desc.normalizedCoords = false; // we want to index using [0;img_dim] rather than [0;1]

    // make sure there are no lingering errors
    GPU_ERR_CHK();
    fprintf(stderr, "No CUDA error until now..\n");

    // create texture object
    cudaTextureObject_t img_tex_obj;
    CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );
    fprintf(stderr, "bluppi\n");
}
This should crash when cudaCreateTextureObject is called. If the img_num parameter (at the start of main) is changed from 856 to 855, however, the code should execute successfully. (edit: The expected behavior would be that the code runs through with a value of 862 but fails with a value of 863 since that actually requires more bytes than the documented buffer size offers.)
Any help would be appreciated!
Since you're working with a 2D texture, the number of bytes you can store in a 1D texture (the "width") is of no relevance here.
2D textures may have different characteristics depending on the type of memory that provides the backing for the texture. Two examples are linear memory and CUDA Array. You have chosen to use a linear memory backing (that which is provided by cudaMalloc* operations other than cudaMallocArray).
The primary problem you are running into is the maximum texture height. To discover what this is, we can refer to Table 14 in the programming guide, which lists:
Maximum width and height for a 2D texture reference bound to linear memory: 65000 x 65000
You exceed this 65000 limit when going from 855 to 856 images at an image height of 76 rows: 856 * 76 = 65056, while 855 * 76 = 64980.
"But wait," you say, "that Table 14 entry says texture reference, and I am using a texture object."
You are correct, and Table 14 doesn't explicitly list the corresponding limit for texture objects. In that case, we have to refer to the device properties readable from the device at runtime, using cudaGetDeviceProperties(). If we review the data available there, we see this readable item:
maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
(I suspect the 3 is a typo, but no matter; we only need the first 2 values.)
This is the limit we need to observe. If we modify your code to obey that limit, there are no problems:
$ cat t382.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>

#define GTX_1080TI_MEM_PITCH 512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27

//=====================================================================[ util ]
// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit( EXIT_FAILURE );
    }
}

// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
    cudaError cu_err = cudaGetLastError();
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit(EXIT_FAILURE);
    }
}

//=====================================================================[ main ]
int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    size_t max2Dtexturelinearwidth = prop.maxTexture2DLinear[0]; // texture x dimension
    size_t max2Dtexturelinearheight = prop.maxTexture2DLinear[1]; // texture y dimension
    fprintf( stderr, "maximum 2D linear texture dimensions (width,height): %lu,%lu\n", max2Dtexturelinearwidth, max2Dtexturelinearheight);

    // setup
    unsigned int img_dim_x = 76;
    unsigned int img_dim_y = 76;
    //unsigned int img_num = 856; // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?
    unsigned int img_num = max2Dtexturelinearheight/img_dim_y;
    fprintf( stderr, "maximum number of images per texture: %u\n", img_num);
    unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
    unsigned int img_num_per_tex = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;
    fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
    fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );

    // allocate pitched memory
    size_t img_tex_pitch;
    float* d_img_tex_data;
    CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );
    assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
    fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n",
             img_num*img_dim_x*img_dim_y*sizeof(float),
             img_num*img_tex_pitch*img_dim_y*sizeof(float),
             img_tex_pitch,
             GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
             GTX_1080TI_1DTEX_WIDTH );

    // generic resource descriptor
    cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypePitch2D;
    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res_desc.res.pitch2D.devPtr = d_img_tex_data;
    res_desc.res.pitch2D.width = img_dim_x;
    res_desc.res.pitch2D.height = img_dim_y*img_num;
    res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;

    // texture descriptor
    cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0] = cudaAddressModeClamp;
    tex_desc.addressMode[1] = cudaAddressModeClamp;
    tex_desc.filterMode = cudaFilterModeLinear; // for linear interpolation (NOTE: this breaks normal integer indexing!)
    tex_desc.readMode = cudaReadModeElementType;
    tex_desc.normalizedCoords = false; // we want to index using [0;img_dim] rather than [0;1]

    // make sure there are no lingering errors
    GPU_ERR_CHK();
    fprintf(stderr, "No CUDA error until now..\n");

    // create texture object
    cudaTextureObject_t img_tex_obj;
    CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );
    fprintf(stderr, "bluppi\n");
}
$ nvcc -o t382 t382.cu
$ cuda-memcheck ./t382
========= CUDA-MEMCHECK
maximum 2D linear texture dimensions (width,height): 131072,65000
maximum number of images per texture: 855
We should be able to stuff 862 images into one texture.
We use 855 (more than 855 leads to a crash).
Asking for 19753920 bytes allocates 133079040 bytes using pitch 512. Available: 1138688/134217728
No CUDA error until now..
bluppi
========= ERROR SUMMARY: 0 errors
$

OpenCL: CL_OUT_OF_HOST_MEMORY on clCreateCommandQueueWithProperties (with Minimal, Complete, and Verifiable example)

I have an MSI Radeon R9 390X 8GB video card (named "Hawaii", as seen below). I have OpenCL installed on my Windows 10 desktop, and I am using Cygwin to compile and run the program.
I am trying to run an example OpenCL program I have kept around from a class from my college days, modified a little.
It won't run on my graphics card. Here is what I get:
$ ./ex26.exe -v 30 40
Bw=30 Bn=40 n=1200
OpenCL Platform 0: AMD Accelerated Parallel Processing
----- OpenCL Device # 0: Hawaii-----
Gflops: 47.520000
Max Compute Units: 44
Max Clock Frequency: 1080
Total Memory of Device (bytes): 8589934592
Max Size of Memory Object Allocation (bytes): 4244635648
Max Work Group Size: 256
Fastest OpenCL Device: Hawaii
Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY
winnerPlatform: 140717488209200
You can see in the code below where this error statement prints out. For whatever reason clCreateCommandQueueWithProperties is returning CL_OUT_OF_HOST_MEMORY. I don't understand how my CPU-side memory could even be close to running out, especially since all this method is doing is creating the queue.
In fact, if I switch CL_DEVICE_TYPE_GPU to CL_DEVICE_TYPE_CPU then the program executes without problem on the CPU.
It's all in just one .cpp file. I couldn't really find anything to cut to make the MCVE more minimal, since it is already an example, so the code you see is pretty much exactly what I have.
Here is all of the code below:
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <CL/opencl.h>
#include <windows.h>
#include <sys/time.h>

/*
 * Return elapsed wall time since last call (seconds)
 */
static double t0=0;
float Elapsed(void)
{
#ifdef _WIN32
    // Windows version of wall time
    LARGE_INTEGER tv,freq;
    QueryPerformanceCounter((LARGE_INTEGER*)&tv);
    QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
    double t = tv.QuadPart/(double)freq.QuadPart;
#else
    // Unix/Linux/OSX version of wall time
    struct timeval tv;
    gettimeofday(&tv,NULL);
    double t = tv.tv_sec+1e-6*tv.tv_usec;
#endif
    float s = t-t0;
    t0 = t;
    return s;
}

/*
 * Print message to stderr and exit
 */
void Fatal(const char* format , ...)
{
    va_list args;
    va_start(args,format);
    vfprintf(stderr,format,args);
    va_end(args);
    exit(1);
}

/*
 * Initialize matrix with random values
 */
void RandomInit(float x[],const unsigned int n)
{
    for (unsigned int i=0;i<n*n;i++)
        x[i] = rand() / (float)RAND_MAX;
}

/*
 * OpenCL notify callback (echo to stderr)
 */
void Notify(const char* errinfo,const void* private_info,size_t cb,void* user_data)
{
    fprintf(stderr,"%s\n",errinfo);
}

class ErrorReader {
public:
private:
};

/*
 * Initialize fastest OpenCL device
 */
cl_device_id _DEV_ID;
cl_context _CONTEXT;
cl_command_queue _QUEUE;
int InitGPU(int verbose)
{
    cl_uint Nplat;
    cl_int err;
    char name[1024];
    int MaxGflops = -1;
    cl_platform_id winnerPlatform = 0;

    // Get platforms
    cl_platform_id platforms[8];
    if (clGetPlatformIDs(8, platforms, &Nplat)) Fatal("Cannot get number of OpenCL platforms\n");
    else if (Nplat<1) Fatal("No OpenCL platforms found\n");

    // Loop over platforms
    for (unsigned int platform = 0; platform < Nplat; platform++) {
        if (clGetPlatformInfo(platforms[platform], CL_PLATFORM_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL platform name\n");
        if (verbose) printf("OpenCL Platform %d: %s\n", platform, name);

        // Get GPU device IDs
        cl_uint Ndev;
        cl_device_id id[1024];
        if (clGetDeviceIDs(platforms[platform], CL_DEVICE_TYPE_GPU, 1024, id, &Ndev))
            Fatal("Cannot get number of OpenCL devices\n");
        else if (Ndev<1)
            Fatal("No OpenCL devices found\n");

        // Find the fastest device
        for (unsigned int devId = 0; devId < Ndev; devId++) {
            cl_uint compUnits, freq;
            cl_ulong memSize, maxAlloc;
            size_t maxWorkGrps;
            if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compUnits), &compUnits, NULL)) Fatal("Cannot get OpenCL device units\n");
            if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL)) Fatal("Cannot get OpenCL device frequency\n");
            if (clGetDeviceInfo(id[devId], CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");
            if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memSize), &memSize, NULL)) Fatal("Cannot get OpenCL memory size.\n");
            if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(memSize), &maxAlloc, NULL)) Fatal("Cannot get OpenCL memory size.\n");
            if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGrps), &maxWorkGrps, NULL)) Fatal("Cannot get OpenCL max work group size\n");
            int Gflops = compUnits * freq;
            if (verbose) printf(" ----- OpenCL Device # %d: %s-----\n"
                                "Gflops: %f\n"
                                "Max Compute Units: %d\n"
                                "Max Clock Frequency: %d\n"
                                "Total Memory of Device (bytes): %lu\n"
                                "Max Size of Memory Object Allocation (bytes): %lu\n"
                                "Max Work Group Size: %zu\n\n",
                                devId,
                                name,
                                1e-3*Gflops,
                                compUnits,
                                freq,
                                memSize,
                                maxAlloc,
                                maxWorkGrps);
            if (Gflops > MaxGflops)
            {
                _DEV_ID = id[devId];
                MaxGflops = Gflops;
                winnerPlatform = platforms[platform];
            }
        }
    }

    // Print fastest device info
    if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
    printf("Fastest OpenCL Device: %s\n",name);

    // Check thread count
    size_t mwgs;
    if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
    printf("winnerPlatform: %zu", winnerPlatform);

    // cl_platform_id platform = NULL;
    // int retValue = GetPlatform(&platform, winnerPlatform, true);
    // Create OpenCL _CONTEXT for fastest device
    // _CONTEXT = clCreateContext(0,1,&_DEV_ID,Notify,NULL,&err);
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)winnerPlatform,
        (cl_context_properties)0
    };
    _CONTEXT = clCreateContextFromType(cps,
        CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    if (!_CONTEXT || err) Fatal("Cannot create OpenCL Context\n");

    cl_command_queue_properties *propers;
    cl_command_queue_properties prop = 0;
    //prop |= CL_QUEUE_PROFILING_ENABLE;
    //prop |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    propers = &prop;
    _QUEUE = clCreateCommandQueueWithProperties(_CONTEXT, _DEV_ID, propers, &err); // Create OpenCL command _QUEUE for fastest device
    if (err) {
        if (err == CL_INVALID_CONTEXT) Fatal("Cannot create OpenCL command cue: CL_INVALID_CONTEXT\n");
        else if (err == CL_INVALID_DEVICE) Fatal("Cannot create OpenCL command cue: CL_INVALID_DEVICE\n");
        else if (err == CL_INVALID_VALUE) Fatal("Cannot create OpenCL command cue: CL_INVALID_VALUE\n");
        else if (err == CL_INVALID_QUEUE_PROPERTIES) Fatal("Cannot create OpenCL command cue: CL_INVALID_QUEUE_PROPERTIES\n");
        else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY\n");
        else Fatal("Cannot create OpenCL command cue: ???????????? Unknown Error\n");
    } else if (!_QUEUE) {
        Fatal("Cannot create OpenCL command cue: NULL\n");
    }
    return mwgs;
}

/*
 * C = A * B -- host
 */
void AxBh(float C[], const float A[], const float B[], unsigned int n)
{
    for (unsigned int i=0;i<n;i++)
        for (unsigned int j=0;j<n;j++)
        {
            double sum=0;
            for (unsigned int k=0;k<n;k++)
                sum += (double)A[i*n+k] * (double)B[k*n+j];
            C[i*n+j] = (float)sum;
        }
}

/*
 * Compute one element of A * B
 */
const char* source =
    "__kernel void AxB(__global float C[],__global const float A[],__global const float B[],const unsigned int n)\n"
    "{\n"
    "   unsigned int j = get_global_id(0);\n"
    "   unsigned int i = get_global_id(1);\n"
    "   float sum =0;\n"
    "   for (int k=0;k<n;k++)\n"
    "      sum += A[i*n+k] * B[k*n+j];\n"
    "   C[i*n+j] = sum;\n"
    "}\n";

/*
 * C = A * B -- device
 */
void AxBd(float Ch[],float Ah[],float Bh[],const unsigned int Bw,const unsigned int Bn)
{
    // Calculate matrix dimensions
    int n = Bw*Bn;
    int N = n*n*sizeof(float);

    // Allocate device memory and copy A&B from host to device
    cl_int err;
    cl_mem Ad = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Ah, &err);
    if (err) Fatal("Cannot create and copy A from host to device\n");
    cl_mem Bd = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Bh, &err);
    if (err) Fatal("Cannot create and copy B from host to device\n");

    // Allocate device memory for C on device
    cl_mem Cd = clCreateBuffer(_CONTEXT,CL_MEM_WRITE_ONLY,N,NULL,&err);
    if (err) Fatal("Cannot create C on device\n");

    // Compile kernel
    cl_program prog = clCreateProgramWithSource(_CONTEXT,1,&source,0,&err);
    if (err) Fatal("Cannot create program\n");
    if (clBuildProgram(prog,0,NULL,NULL,NULL,NULL))
    {
        char log[1048576];
        if (clGetProgramBuildInfo(prog,_DEV_ID,CL_PROGRAM_BUILD_LOG,sizeof(log),log,NULL))
            Fatal("Cannot get build log\n");
        else
            Fatal("Cannot build program\n%s\n",log);
    }
    cl_kernel kernel = clCreateKernel(prog,"AxB",&err);
    if (err) Fatal("Cannot create kernel\n");

    // Set parameters for kernel
    if (clSetKernelArg(kernel,0,sizeof(cl_mem),&Cd)) Fatal("Cannot set kernel parameter Cd\n");
    if (clSetKernelArg(kernel,1,sizeof(cl_mem),&Ad)) Fatal("Cannot set kernel parameter Ad\n");
    if (clSetKernelArg(kernel,2,sizeof(cl_mem),&Bd)) Fatal("Cannot set kernel parameter Bd\n");
    if (clSetKernelArg(kernel,3,sizeof(int),&n)) Fatal("Cannot set kernel parameter n\n");

    // Run kernel
    size_t Global[2] = {(size_t)n, (size_t)n};
    size_t Local[2] = {(size_t)Bw, (size_t)Bw};
    if (clEnqueueNDRangeKernel(_QUEUE,kernel,2,NULL,Global,Local,0,NULL,NULL)) Fatal("Cannot run kernel\n");

    // Release kernel and program
    if (clReleaseKernel(kernel)) Fatal("Cannot release kernel\n");
    if (clReleaseProgram(prog)) Fatal("Cannot release program\n");

    // Copy C from device to host (block until done)
    if (clEnqueueReadBuffer(_QUEUE,Cd,CL_TRUE,0,N,Ch,0,NULL,NULL)) Fatal("Cannot copy C from device to host\n");

    // Free device memory
    clReleaseMemObject(Ad);
    clReleaseMemObject(Bd);
    clReleaseMemObject(Cd);
}

/*
 * main
 */
int main(int argc, char* argv[])
{
    // Process options
    int opt;
    int verbose=0;
    while ((opt=getopt(argc,argv,"v"))!=-1)
    {
        if (opt=='v')
            verbose++;
        else
            Fatal("Usage: [-v] <block width> <number of blocks>\n");
    }
    argc -= optind;
    argv += optind;

    // Get width and number of blocks
    if (argc!=2) Fatal("Usage: [-v] <block width> <number of blocks>\n");
    int Bw = atoi(argv[0]);
    if (Bw<1) Fatal("Block width out of range %d\n",Bw);
    int Bn = atoi(argv[1]);
    if (Bn<1) Fatal("Number of blocks out of range %d\n",Bn);

    // Total width is block times number of blocks
    int n = Bw*Bn;
    int N = n*n*sizeof(float);
    printf("Bw=%d Bn=%d n=%d\n",Bw,Bn,n);

    // Initialize GPU
    int Mw = InitGPU(verbose);
    if (Mw<Bw*Bw) Fatal("Thread count %d exceeds max work group size of %d\n",Bw*Bw,Mw);

    // Allocate host matrices A/B/C/R
    float* Ah = (float*)malloc(N);
    float* Bh = (float*)malloc(N);
    float* Ch = (float*)malloc(N);
    float* Rh = (float*)malloc(N);
    if (!Ah || !Bh || !Ch || !Rh) Fatal("Cannot allocate host memory\n");

    // Initialize A & B
    srand(9999);
    RandomInit(Ah,n);
    RandomInit(Bh,n);

    // Compute R = AB on host
    Elapsed();
    AxBh(Rh,Ah,Bh,n);
    float Th = Elapsed();

    // Compute C = AB on device
    Elapsed();
    AxBd(Ch,Ah,Bh,Bw,Bn);
    float Td = Elapsed();

    // Compute difference between R and C
    double r2=0;
    for (int i=0;i<n*n;i++)
        r2 += fabs(Ch[i]-Rh[i]);
    r2 /= n*n;

    // Free host memory
    free(Ah);
    free(Bh);
    free(Ch);
    free(Rh);

    // Print results
    printf("Host Time = %6.3f s\n",Th);
    printf("Device Time = %6.3f s\n",Td);
    printf("Speedup = %.1f\n",Th/Td);
    printf("Difference = %.2e\n",r2);

    // Done
    return 0;
}
I compile it using (which you will obviously have to alter a little):
g++ -Wall -o exMatrixMult -I"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/include" -L"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/lib/x86_64" exMatrixMult.cpp -lOpenCL
My "Hawaii" graphics card can, however, run the example programs that came with the OpenCL SDK (in "AMD APP SDK\3.0\samples\opencl\bin\x86_64"). I spent most of the afternoon to see how their source code differs from mine, no success so far.
If at all useful the code where the error occurs use to look like this (same problem still occurred):
// Print fastest device info
if (clGetDeviceInfo(devid,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
printf("Fastest OpenCL Device: %s\n",name);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(devid,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
// Create OpenCL context for fastest device
context = clCreateContext(0,1,&devid,Notify,NULL,&err);
if(!context || err) Fatal("Cannot create OpenCL context\n");
// Create OpenCL command queue for fastest device
queue = clCreateCommandQueueWithProperties(context, devid, 0, &err);
if (err) Fatal("Cannot create OpenCL command cue\n");
Now, the problem may not be reproducible if you don't have a similar graphics card. But I don't know.
The problem went away when I updated my graphics card's driver.
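For anyone who hits the same error before a driver update is possible, one workaround to try (my own sketch, not part of the original fix) is to fall back to the OpenCL 1.x entry point, since clCreateCommandQueueWithProperties only has to exist on OpenCL 2.0 platforms and some drivers mishandle it:

    // Sketch: fall back to the OpenCL 1.x queue API if the 2.0 call fails.
    // Assumes _CONTEXT and _DEV_ID are set up as in the code above.
    cl_int qerr;
    cl_queue_properties qprops[] = { 0 }; // no special properties
    cl_command_queue q = clCreateCommandQueueWithProperties(_CONTEXT, _DEV_ID, qprops, &qerr);
    if (!q || qerr != CL_SUCCESS) {
        // Deprecated since OpenCL 2.0, but still exported by most drivers.
        q = clCreateCommandQueue(_CONTEXT, _DEV_ID, 0, &qerr);
    }
    if (!q || qerr != CL_SUCCESS) Fatal("Cannot create OpenCL command queue (err %d)\n", qerr);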

cuBLAS matrix inverse much slower than MATLAB

In my current project, I am attempting to calculate the inverse of a large (n > 2000) matrix with cuBLAS. The inverse calculation is performed, but for some reason calculation times are significantly slower than those in MATLAB.
I have attached a sample calculation performed on random matrices using my implementation in either language as well as performance results.
Any help or suggestions on what may be causing this slowdown would be greatly appreciated.
Thank you in advance.
Comparison
cuBLAS vs. MATLAB
N = 500 : cuBLAS ~ 0.130 sec, MATLAB ~ 0.066 sec -> ~1.97x slower
N = 1000 : cuBLAS ~ 0.898 sec, MATLAB ~ 0.311 sec -> ~2.89x slower
N = 2000 : cuBLAS ~ 6.667 sec, MATLAB ~ 0.659 sec -> ~10.12x slower
N = 4000 : cuBLAS ~ 51.860 sec, MATLAB ~ 4.296 sec -> ~12.07x slower
C++ Code
#include <stdio.h>   // for printf
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <conio.h>

#define CUDA_CALL(res, str) { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } }
#define CUBLAS_CALL(res, str) { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } }

static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

void d_CUDATimerStart(void)
{
    CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");
    CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");
    CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}

float d_CUDATimerStop(void)
{
    CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");
    CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");
    float ms;
    CUDA_CALL(cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");
    return ms;
}

float* d_GetInv(float* L, int n)
{
    cublasHandle_t cu_cublasHandle;
    CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");
    float** adL;
    float** adC;
    float* dL;
    float* dC;
    int* dLUPivots;
    int* dLUInfo;
    size_t szA = n * n * sizeof(float);
    CUDA_CALL(cudaMalloc(&adL, sizeof(float*)), "Failed to allocate adL!");
    CUDA_CALL(cudaMalloc(&adC, sizeof(float*)), "Failed to allocate adC!");
    CUDA_CALL(cudaMalloc(&dL, szA), "Failed to allocate dL!");
    CUDA_CALL(cudaMalloc(&dC, szA), "Failed to allocate dC!");
    CUDA_CALL(cudaMalloc(&dLUPivots, n * sizeof(int)), "Failed to allocate dLUPivots!");
    CUDA_CALL(cudaMalloc(&dLUInfo, sizeof(int)), "Failed to allocate dLUInfo!");
    CUDA_CALL(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "Failed to copy to dL!");
    CUDA_CALL(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adL!");
    CUDA_CALL(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adC!");

    d_CUDATimerStart();
    CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "Failed to perform LU decomp operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
    CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "Failed to perform Inverse operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
    float timed = d_CUDATimerStop();
    printf("cublas inverse in: %.5f ms.\n", timed);

    float* res = (float*)malloc(szA);
    CUDA_CALL(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");
    CUDA_CALL(cudaFree(adL), "Failed to free adL!");
    CUDA_CALL(cudaFree(adC), "Failed to free adC!");
    CUDA_CALL(cudaFree(dL), "Failed to free dL!");
    CUDA_CALL(cudaFree(dC), "Failed to free dC!");
    CUDA_CALL(cudaFree(dLUPivots), "Failed to free dLUPivots!");
    CUDA_CALL(cudaFree(dLUInfo), "Failed to free dLUInfo!");
    CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");
    return res;
}

int main()
{
    int n = 1000;
    float* L = (float*)malloc(n * n * sizeof(float));
    for(int i = 0; i < n * n; i++)
        L[i] = ((float)rand()/(float)(RAND_MAX));
    float* inv = d_GetInv(L, n);
    printf("done.");
    _getch();
    return 0;
}
MATLAB Code
A = rand(1000);
tic
X = inv(A);
toc
System Info:
GPU: GTX 780 3 GB
CPU: i7-4790S @ 3.20 GHz
As @RobertCrovella said, you should not use batched small-matrix APIs for a single large-matrix inversion.
Basically you could use the same method as in your code, but with the non-batched versions of getrf() and getri() to maximize performance for a large matrix.
For getrf(), you can find it here:
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf
For getri(), although the CUDA toolkit does not provide a getri() to solve AX=I, where A is LU-factored by getrf(), it does provide a getrs() to solve AX=B. All you need to do is set B=I before calling getrs().
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs
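Below is a minimal sketch of that approach using cuSOLVER's dense (non-batched) routines. It is an illustration, not drop-in code: it assumes a device-resident n x n single-precision matrix dA (which gets overwritten by its LU factors) and an output buffer dB of the same size, and status checks are elided for brevity.

#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <vector>

// On entry dA holds A; on exit dA holds its LU factors and dB holds inv(A).
void d_GetInvCusolver(float* dA, float* dB, int n)
{
    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);

    // Set B = I on the host and copy it to the device.
    std::vector<float> I(size_t(n) * n, 0.0f);
    for (int i = 0; i < n; i++) I[size_t(i) * n + i] = 1.0f;
    cudaMemcpy(dB, I.data(), sizeof(float) * n * n, cudaMemcpyHostToDevice);

    // Workspace query and allocation for the LU factorization.
    int lwork = 0;
    cusolverDnSgetrf_bufferSize(handle, n, n, dA, n, &lwork);
    float* dWork;   cudaMalloc(&dWork,   sizeof(float) * lwork);
    int*   dPivots; cudaMalloc(&dPivots, sizeof(int) * n);
    int*   dInfo;   cudaMalloc(&dInfo,   sizeof(int));

    // Factor A = P*L*U, then solve A*X = I so that X = inv(A) is left in dB.
    cusolverDnSgetrf(handle, n, n, dA, n, dWork, dPivots, dInfo);
    cusolverDnSgetrs(handle, CUBLAS_OP_N, n, n, dA, n, dPivots, dB, n, dInfo);

    cudaFree(dWork); cudaFree(dPivots); cudaFree(dInfo);
    cusolverDnDestroy(handle);
}

One dense factorization of the whole matrix gives the GPU enough parallel work per call, which is exactly what the batched API (tuned for many tiny matrices) is not designed for.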

tex1Dfetch unexpectedly returning 0

I don't believe this is the same issue as reported here:
Bound CUDA texture reads zero
CUDA 1D texture fetch always return 0
In my CUDA application I noticed that tex1Dfetch is not returning the expected value past a certain index in the buffer. An initial observation in the application was that a value at index 0 could be read correctly, but at index 12705625 the value read was 0. I made a small test program to investigate this, given below. The results are a little bit baffling to me. I'm trying to probe at what index the values are no longer read correctly, but as arraySize is changed, so does the firstBadIndex. Even with arraySize = 2, the second value is read incorrectly! As arraySize is made bigger, the firstBadIndex gets bigger. This happens when binding to arrays of float, float2, or float4. If the data are read from the device buffer instead (switch around the commented lines in FetchTextureData), then everything is fine. This is using CUDA 6.5, on a Tesla C2075.
Thanks for any insights or advice you might have.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define FLOATTYPE float4
texture<FLOATTYPE,cudaTextureType1D,cudaReadModeElementType> texture1D;
const unsigned int arraySize = 1000;
FLOATTYPE* host;
FLOATTYPE* device;
FLOATTYPE* dTemp;
FLOATTYPE hTemp[1];
__global__ void FetchTextureData(FLOATTYPE* data,FLOATTYPE* arr,int idx)
{
data[0] = tex1Dfetch(texture1D, idx);
//data[0] = arr[idx];
}
bool GetTextureValues(int idx){
FetchTextureData<<<1,1>>>(dTemp,device,idx);
// copy to the host
cudaError_t err = cudaMemcpy(hTemp,dTemp,sizeof(FLOATTYPE),cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
throw "cudaMemcpy failed!";
}
if (cudaDeviceSynchronize() != cudaSuccess) {
throw "cudaDeviceSynchronize failed!";
}
return hTemp[0].x == 1.0f;
}
int main()
{
try{
host = new FLOATTYPE[arraySize];
cudaError_t err = cudaMalloc((void**)&device,sizeof(FLOATTYPE) * arraySize);
cudaError_t err1 = cudaMalloc((void**)&dTemp,sizeof(FLOATTYPE));
if (err != cudaSuccess || err1 != cudaSuccess) {
throw "cudaMalloc failed!";
}
// make some host data
for(unsigned int i=0; i<arraySize; i++){
FLOATTYPE data = {1.0f, 0.0f, 0.0f, 0.0f};
host[i] = data;
}
// and copy it to the device
err = cudaMemcpy(device,host,sizeof(FLOATTYPE) * arraySize,cudaMemcpyHostToDevice);
if (err != cudaSuccess){
throw "cudaMemcpy failed!";
}
// set up the textures
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<FLOATTYPE>();
texture1D.addressMode[0] = cudaAddressModeClamp;
texture1D.filterMode = cudaFilterModePoint;
texture1D.normalized = false;
cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
// do a texture fetch and find where the fetches stop working
int lastGoodValue = -1, firstBadValue = -1;
float4 badValue = {-1.0f,0.0f,0.0f,0.0f};
for(unsigned int i=0; i<arraySize; i++){
if(i % 100000 == 0) printf("%d\n",i);
bool isGood = GetTextureValues(i);
if(firstBadValue == -1 && !isGood)
firstBadValue = i;
if(isGood)
lastGoodValue = i;
else
badValue = hTemp[0];
}
printf("lastGoodValue %d, firstBadValue %d\n",lastGoodValue,firstBadValue);
printf("Bad value is (%.2f)\n",badValue.x);
}catch(const char* err){
printf("\nCaught an error : %s\n",err);
}
return 0;
}
The problem lies in the texture setup. This:
cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
should be:
cudaBindTexture(NULL, texture1D, device, channelDesc,
arraySize * sizeof(FLOATTYPE));
As per the documentation, the size argument is the size of the memory area in bytes, not the number of elements. I would have expected that with the clamped addressing mode, the code would still work as expected. With border mode, you should get a zero value, which looks like it would trigger your bad-value detection. I haven't actually run your code, so perhaps there is a subtlety I'm missing somewhere. For such a simple repro case, your code structure is rather convoluted and hard to follow (at least on the mobile phone screen I am reading it on).
EDIT to add that between the time I started writing this and finished, @njuffa pointed out the same mistake in comments.
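As an aside (my addition, not part of the original answer): cudaBindTexture also returns an error code and can report a required offset, so the binding in the repro could be written defensively, with the size computed in bytes:

    size_t offset = 0;
    cudaError_t bindErr = cudaBindTexture(&offset, texture1D, device, channelDesc,
                                          arraySize * sizeof(FLOATTYPE)); // size in bytes
    if (bindErr != cudaSuccess || offset != 0) {
        throw "cudaBindTexture failed or returned a nonzero offset!";
    }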

CUBLAS memory allocation error

I tried to allocate 17,338,896 floating-point elements as follows (which is roughly 70 MB):
state = cublasAlloc(theSim->Ndim*theSim->Ndim,
                    sizeof(*(theSim->K0)),
                    (void**)&K0cuda);
if(state != CUBLAS_STATUS_SUCCESS) {
    printf("Error allocation video memory.\n");
    return -1;
}
However, I'm receiving the error CUBLAS_STATUS_ALLOC_FAILED in the variable state. Would this have anything to do with the amount of video card memory available on the machine (128 MB on mine), or is it a limit on the amount of memory that can be allocated with the cublasAlloc() function (i.e. unrelated to the amount of memory available on the machine)? I tried using the cudaMalloc() function and I run into the same problem. Thanks in advance for looking into this.
--------------Addition of Error Reproduction-------------------------------------
#include <cuda.h>
#include <cublas.h>   // legacy cuBLAS API: cublasInit/cublasAlloc/cublasFree
#include <stdio.h>

int main (int argc, char *argv[]) {
    // CUDA setup
    cublasStatus state;
    if(cublasInit() == CUBLAS_STATUS_NOT_INITIALIZED) {
        printf("CUBLAS init error.\n");
        return -1;
    }

    // Instantiate video memory pointers
    float *K0cuda;

    // Allocate video memory needed
    state = cublasAlloc(20000000,
                        sizeof(float),
                        (void**)&K0cuda);
    if(state != CUBLAS_STATUS_SUCCESS) {
        printf("Error allocation video memory.\n");
        return -1;
    }

    // Copy K0 from CPU memory to GPU memory
    // Note: before so, decide whether to integrate as a part of InsertionSim or
    // CUDA content as a separate class
    //state = cublasSetMatrix(theSim->Ndim, theSim->Ndim, sizeof(*theSim->K0),
    //                        theSim->K0, theSim->Ndim, K0cuda, theSim->Ndim);
    //if(state != CUBLAS_STATUS_SUCCESS) {
    //    printf("Error copy to video memory.\n");
    //    return -1;
    //}

    // Free memory
    if(cublasFree(K0cuda) != CUBLAS_STATUS_SUCCESS) {
        printf("Error freeing video memory.\n");
        return -1;
    }

    // CUDA shutdown
    if(cublasShutdown() != CUBLAS_STATUS_SUCCESS) {
        printf("CUBLAS shutdown error.\n");
        return -1;
    }

    //if(theSim != NULL) delete theSim; // theSim is not declared in this repro
    return 0;
}
Memory can fragment, which means that you can still allocate multiple smaller blocks but not a single large block. Your video card obviously needs some memory for its normal 2D duties. If that happens to break the 128 MB into two blocks of almost 64 MB each, then you'd see this kind of failure.
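If you want to see whether fragmentation rather than total capacity is the issue, a small probe like the following can help. This is my own sketch, not part of the original answer: it prints how much memory the runtime reports as free, then binary-searches for the largest single cudaMalloc that actually succeeds.

#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    size_t freeB = 0, totalB = 0;
    cudaMemGetInfo(&freeB, &totalB);
    printf("free: %zu bytes, total: %zu bytes\n", freeB, totalB);

    // Binary-search the largest contiguous block we can actually allocate.
    size_t lo = 0, hi = freeB;
    while (lo < hi) {
        size_t mid = lo + (hi - lo + 1) / 2;
        void* p = NULL;
        if (cudaMalloc(&p, mid) == cudaSuccess) {
            cudaFree(p);
            lo = mid;           // mid bytes fit in one contiguous block
        } else {
            cudaGetLastError(); // clear the allocation error
            hi = mid - 1;       // mid bytes do not fit
        }
    }
    printf("largest single allocation: %zu bytes\n", lo);
    return 0;
}

If the largest single allocation is much smaller than the reported free total, fragmentation is the culprit.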