cuBLAS matrix inverse much slower than MATLAB

cuBLAS matrix inverse much slower than MATLAB - c++

In my current project, I am attempting to calculate the inverse of a large (n > 2000) matrix with cuBLAS. The inverse calculation is performed, but for some reason calculation times are significantly slower than compared to those when done in MATLAB.
I have attached a sample calculation performed on random matrices using my implementation in either language as well as performance results.
Any help or suggestions on what may be causing this slowdown would be greatly appreciated.
Thank you in advance.
Comparison
cuBLAS vs. MATLAB
N = 500 : cuBLAS ~ 0.130 sec, MATLAB ~ 0.066 sec -> ~1.97x slower
N = 1000 : cuBLAS ~ 0.898 sec, MATLAB ~ 0.311 sec -> ~2.89x slower
N = 2000 : cuBLAS ~ 6.667 sec, MATLAB ~ 0.659 sec -> ~10.12x slower
N = 4000 : cuBLAS ~ 51.860 sec, MATLAB ~ 4.296 sec -> ~12.07x slower
C++ Code
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <conio.h>
#define CUDA_CALL(res, str) { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } }
#define CUBLAS_CALL(res, str) { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } }
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;
void d_CUDATimerStart(void)
{
CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");
CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");
CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}
float d_CUDATimerStop(void)
{
CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");
CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");
CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");
CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");
return ms;
}
float* d_GetInv(float* L, int n)
{
cublasHandle_t cu_cublasHandle;
CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");
float** adL;
float** adC;
float* dL;
float* dC;
int* dLUPivots;
int* dLUInfo;
size_t szA = n * n * sizeof(float);
CUDA_CALL(cudaMalloc(&adL, sizeof(float*)), "Failed to allocate adL!");
CUDA_CALL(cudaMalloc(&adC, sizeof(float*)), "Failed to allocate adC!");
CUDA_CALL(cudaMalloc(&dL, szA), "Failed to allocate dL!");
CUDA_CALL(cudaMalloc(&dC, szA), "Failed to allocate dC!");
CUDA_CALL(cudaMalloc(&dLUPivots, n * sizeof(int)), "Failed to allocate dLUPivots!");
CUDA_CALL(cudaMalloc(&dLUInfo, sizeof(int)), "Failed to allocate dLUInfo!");
CUDA_CALL(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "Failed to copy to dL!");
CUDA_CALL(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adL!");
CUDA_CALL(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adC!");
d_CUDATimerStart();
CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "Failed to perform LU decomp operation!");
CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "Failed to perform Inverse operation!");
CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
float timed = d_CUDATimerStop();
printf("cublas inverse in: %.5f ms.\n", timed);
float* res = (float*)malloc(szA);
CUDA_CALL(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");
CUDA_CALL(cudaFree(adL), "Failed to free adL!");
CUDA_CALL(cudaFree(adC), "Failed to free adC!");
CUDA_CALL(cudaFree(dL), "Failed to free dL!");
CUDA_CALL(cudaFree(dC), "Failed to free dC!");
CUDA_CALL(cudaFree(dLUPivots), "Failed to free dLUPivots!");
CUDA_CALL(cudaFree(dLUInfo), "Failed to free dLUInfo!");
CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");
return res;
}
int main()
{
int n = 1000;
float* L = (float*)malloc(n * n * sizeof(float));
for(int i = 0; i < n * n; i++)
L[i] = ((float)rand()/(float)(RAND_MAX));
float* inv = d_GetInv(L, n);
printf("done.");
_getch();
return 0;
}
MATLAB Code
A = rand(1000);
tic
X = inv(A);
toc
System Info:
GPU: GTX 780 3gb
CPU: i7-4790S # 3.20 GHz

As #RobertCrovella said, you should not use batched small matrix APIs for a single large matrix inversion.
Basically you could use the same method as in your code, but with the non-batched version of getrf() and getri() to maximum the performance for large matrix.
For getrf() you could find it here.
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf
For getri(), although CUDA toolkit does not provide a getri() to solve AX=I, where A is LU-facotored by getrf(), it does provide a getrs() to solve AX=B. All you need to do is to set B=I before calling getrs().
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs

Related

cudaMemcpy throws InvalidValue error when copying from device to host

I've been trying to implement a one dimensional FFT using cuFFT. An InvalidValue error is thrown and no meaningful results are produced.
I've tried to ensure that each error is caught, and I believe that the cudaMemcpy from DeviceToHost causes the issue, though I am not sure why, nor how to fix it. The data size parameter in cudaMemcpy follows the same relation as supplied by the cuFFT documentation.
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
// cuda macros
#define NX 100 // number of points
#define BATCH 1 // number of ffts to perform
#define RANK 1 //
#define IDIST 1 // distance between 1st elements of batches
#define ISTRIDE 1 // do every ISTRIDEth index
#define ODIST 1 // distance between 1st elements of output
#define OSTRIDE 1 // distance between output elements
void fft1d() {
// create plan for performing fft
cufftHandle plan;
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
printf("Failed to create 1D plan\n");
return;
}
// assemble data
double temp_data[] = {2.598076211353316, 3.2402830701637395, 3.8494572900049224, 4.419388724529261, 4.944267282795252, 5.41874215947433, 5.837976382931011, 6.197696125093141, 6.494234270429254, 6.724567799874842, 6.886348608602047, 6.97792744346504, 6.998370716093996, 6.9474700202387565, 6.8257442563389, 6.6344343416615565, 6.37549055993378, 6.051552679431957, 5.665923042211819, 5.222532898817316, 4.725902331664744, 4.181094175657916, 3.5936624057845576, 2.9695955178498603, 2.315255479544737, 1.6373128742041732, 0.9426788984240022, 0.23843490677753865, -0.46823977812093664, -1.1701410542749289, -1.8601134815746807, -2.531123226988873, -3.176329770049035, -3.7891556376344524, -4.363353457155562, -4.893069644570959, -5.3729040779788875, -5.797965148448726, -6.163919626883915, -6.467036838555256, -6.704226694973039, -6.873071195387157, -6.971849076777267, -6.999553361041935, -6.955901620504255, -6.84133885708361, -6.657032965782207, -6.404862828733319, -6.0873991611848375, -5.707878304681281, -5.270169234606201, -4.778734118422206, -4.23858282669252, -3.6552218606153755, -3.0345982167228436, -2.383038761007964, -1.707185730522749, -1.0139290199674, -0.31033594356630245, 0.39642081173600463, 1.0991363072871054, 1.7906468025248339, 2.463902784786862, 3.1120408346390414, 3.728453594100783, 4.306857124485735, 4.841354967187034, 5.326498254347925, 5.757341256627454, 6.129491801786784, 6.439156050110601, 6.683177170206378, 6.859067520906216, 6.965034011197066, 6.999996379650895, 6.963598207007518, 6.85621054964381, 6.678928156888352, 6.433558310743566, 6.122602401787424, 5.749230429076629, 5.317248684008804, 4.831060947586139, 4.295623596650021, 3.7163950767501706, 3.0992802567403803, 2.4505702323708074, 1.7768781925409076, 1.0850720020162676, 0.3822041878858906, -0.3245599564963766, -1.0280154171511335, -1.7209909100394047, -2.3964219877733033, -3.0474230571943477, -3.667357573646071, -4.249905696354359, -4.78912871521179, -5.279529592175676, -5.716109000098287};
cufftReal *idata;
cudaMalloc((void**) &idata, sizeof(cufftComplex)*NX);
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to allocate memory space for input data.\n");
return;
}
cudaMemcpy(idata, temp_data, sizeof(temp_data)/sizeof(double), cudaMemcpyHostToDevice);
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to load time data to memory.\n");
return;
}
// prepare memory for return data
cufftComplex *odata;
cudaMalloc((void**) &odata, sizeof(cufftComplex)*(NX/2 + 1));
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to allocate memory for output data.\n");
}
// perform fft
if (cufftExecR2C(plan, idata, odata) != CUFFT_SUCCESS) {
printf("Failed to perform fft.\n");
return;
}
I think the error is thrown here, at the cudaMemcpy.
// grab data from graphics and print (memcpy waits until complete) cuda memcopy doesn't complete
// can return errors from previous cuda calls if they haven't been caught
cufftComplex *out_temp_data;
size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
int error_value = cudaGetLastError();
printf("cudaMemcpy from device state: %i\n", error_value);
if(error_value != cudaSuccess) {
printf("Failed to pull data from device.\n");
return;
}
for (size_t i = 0; i < (NX/2 + 1); i++) {
printf("%lu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
}
// clean up
cufftDestroy(plan);
cudaFree(idata);
}
int main() {
fft1d();
return 0;
}

Memory must be allocated before cudaMemcpy can write the data. Thanks to generic-opto-guy for pointing this out.
In this case:
out_temp_data = new cufftComplex[NX/2 + 1];

Why does my data not fit into a CUDA Texture Object?

I'm trying to fill a CUDA Texture Object with some data but the call to cudaCreateTextureObject fails with the following error (edit: on both a GTX 1080TI and a RTX 2080TI):
GPU ERROR! 'invalid argument' (err code 11)
It works if I put less data into my texture so my guess is that my computation about how much data I can fit into a texture is off.
My thought process is as follows:
(executable code follows below)
My data comes in the form of (76,76) images where each pixel is a float. What I would like to do is to store a column of images in a Texture Object; as I understand it, cudaMallocPitch is the way to do this.
When computing the number of images I can store in one texture I'm using the following formula to determine how much space a single image needs:
GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float)
Where the first argument should be the memory pitch on a GTX 1080TI card (512 bytes). The number of bytes that I can store in a 1D texture is given as 2^27 here. When I divide the latter by the former I get 862.3, assuming this is the number of images I can store in one Texture Object. However, when I try to store more than 855 images in my buffer the program crashes with the error above.
Here's the code:
In the following the main function (a) sets up all the relevant parameters, (b) allocates the memory using cudaMallocPitch, and (c) configures and creates a CUDA Texture Object:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#define GTX_1080TI_MEM_PITCH 512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27
//=====================================================================[ util ]
// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
if( cu_err != cudaSuccess ){
fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
exit( EXIT_FAILURE );
}
}
// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
cudaError cu_err = cudaGetLastError();
if( cu_err != cudaSuccess ){
fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
exit(EXIT_FAILURE);
}
}
//=====================================================================[ main ]
int main(){
// setup
unsigned int img_dim_x = 76;
unsigned int img_dim_y = 76;
unsigned int img_num = 856; // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?
unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
unsigned int img_num_per_tex = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;
fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );
// allocate pitched memory
size_t img_tex_pitch;
float* d_img_tex_data;
CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );
assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n",
img_num*img_dim_x*img_dim_y*sizeof(float),
img_num*img_tex_pitch*img_dim_y*sizeof(float),
img_tex_pitch,
GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
GTX_1080TI_1DTEX_WIDTH );
// generic resource descriptor
cudaResourceDesc res_desc;
memset(&res_desc, 0, sizeof(res_desc));
res_desc.resType = cudaResourceTypePitch2D;
res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
res_desc.res.pitch2D.devPtr = d_img_tex_data;
res_desc.res.pitch2D.width = img_dim_x;
res_desc.res.pitch2D.height = img_dim_y*img_num;
res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;
// texture descriptor
cudaTextureDesc tex_desc;
memset(&tex_desc, 0, sizeof(tex_desc));
tex_desc.addressMode[0] = cudaAddressModeClamp;
tex_desc.addressMode[1] = cudaAddressModeClamp;
tex_desc.filterMode = cudaFilterModeLinear; // for linear interpolation (NOTE: this breaks normal integer indexing!)
tex_desc.readMode = cudaReadModeElementType;
tex_desc.normalizedCoords = false; // we want to index using [0;img_dim] rather than [0;1]
// make sure there are no lingering errors
GPU_ERR_CHK();
fprintf(stderr, "No CUDA error until now..\n");
// create texture object
cudaTextureObject_t img_tex_obj;
CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );
fprintf(stderr, "bluppi\n");
}
This should crash when cudaCreateTextureObject is called. If the img_num parameter (at the start of main) is changed from 856 to 855, however, the code should execute successfully. (edit: The expected behavior would be that the code runs through with a value of 862 but fails with a value of 863 since that actually requires more bytes than the documented buffer size offers.)
Any help would be appreciated!

Since you're working with a 2D texture here, the number of bytes you can store in a 1D texture (the "width") is of no relevance here.
2D textures may have different characteristics depending on the type of memory that provides the backing for the texture. Two examples are linear memory and CUDA Array. You have chosen to use a linear memory backing (that which is provided by cudaMalloc* operations other than cudaMallocArray).
The primary problem you are running into is the maximum texture height. To discover what this is, we could refer to the table 14 in the programming guide, which lists:
Maximum width and height for a 2D texture reference bound to linear memory 65000 x 65000
You are exceeding this 65000 number when going from 855 to 856 images, for an image height of 76 rows. 856*76 = 65056, 855*76 = 64980
"But wait" you say, that table 14 entry says texture reference, and I am using a texture object.
You are correct, and table 14 doesn't explicitly list the corresponding limit for texture objects. In that case, we have to refer to the device properties readable from the device at runtime, using cudaGetDeviceProperties(). If we review the data available there, we see this readable item:
maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
(I suspect the 3 is a typo, but no matter, we only need the first 2 values).
This is the value we want to be sure. If we modify your code to obey that limit, there are no problems:
$ cat t382.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#define GTX_1080TI_MEM_PITCH 512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27
//=====================================================================[ util ]
// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
if( cu_err != cudaSuccess ){
fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
exit( EXIT_FAILURE );
}
}
// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
cudaError cu_err = cudaGetLastError();
if( cu_err != cudaSuccess ){
fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
exit(EXIT_FAILURE);
}
}
//=====================================================================[ main ]
int main(){
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
size_t max2Dtexturelinearwidth = prop.maxTexture2DLinear[0]; // texture x dimension
size_t max2Dtexturelinearheight = prop.maxTexture2DLinear[1]; // texture y dimension
fprintf( stderr, "maximum 2D linear texture dimensions (width,height): %lu,%lu\n", max2Dtexturelinearwidth, max2Dtexturelinearheight);
// setup
unsigned int img_dim_x = 76;
unsigned int img_dim_y = 76;
//unsigned int img_num = 856; // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?
unsigned int img_num = max2Dtexturelinearheight/img_dim_y;
fprintf( stderr, "maximum number of images per texture: %u\n", img_num);
unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
unsigned int img_num_per_tex = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;
fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );
// allocate pitched memory
size_t img_tex_pitch;
float* d_img_tex_data;
CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );
assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n",
img_num*img_dim_x*img_dim_y*sizeof(float),
img_num*img_tex_pitch*img_dim_y*sizeof(float),
img_tex_pitch,
GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
GTX_1080TI_1DTEX_WIDTH );
// generic resource descriptor
cudaResourceDesc res_desc;
memset(&res_desc, 0, sizeof(res_desc));
res_desc.resType = cudaResourceTypePitch2D;
res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
res_desc.res.pitch2D.devPtr = d_img_tex_data;
res_desc.res.pitch2D.width = img_dim_x;
res_desc.res.pitch2D.height = img_dim_y*img_num;
res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;
// texture descriptor
cudaTextureDesc tex_desc;
memset(&tex_desc, 0, sizeof(tex_desc));
tex_desc.addressMode[0] = cudaAddressModeClamp;
tex_desc.addressMode[1] = cudaAddressModeClamp;
tex_desc.filterMode = cudaFilterModeLinear; // for linear interpolation (NOTE: this breaks normal integer indexing!)
tex_desc.readMode = cudaReadModeElementType;
tex_desc.normalizedCoords = false; // we want to index using [0;img_dim] rather than [0;1]
// make sure there are no lingering errors
GPU_ERR_CHK();
fprintf(stderr, "No CUDA error until now..\n");
// create texture object
cudaTextureObject_t img_tex_obj;
CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );
fprintf(stderr, "bluppi\n");
}
$ nvcc -o t382 t382.cu
$ cuda-memcheck ./t382
========= CUDA-MEMCHECK
maximum 2D linear texture dimensions (width,height): 131072,65000
maximum number of images per texture: 855
We should be able to stuff 862 images into one texture.
We use 855 (more than 855 leads to a crash).
Asking for 19753920 bytes allocates 133079040 bytes using pitch 512. Available: 1138688/134217728
No CUDA error until now..
bluppi
========= ERROR SUMMARY: 0 errors
$

OpenCL: CL_OUT_OF_HOST_MEMORY on clCreateCommandQueueWithProperties (with Minimal, Complete, and Verifiable example)

I have a MSI Radeon R9 390X 8GB Video Card (named "Hawaii" as seen below). I have OpenCL installed on my Windows 10 Desktop, I am using Cygwin to compile and run the program.
I am trying to run an example OpenCL program I have kept around from a class from my college days, modified a little.
It won't run on my graphics card. Here is what I get:
$ ./ex26.exe -v 30 40
Bw=30 Bn=40 n=1200
OpenCL Platform 0: AMD Accelerated Parallel Processing
----- OpenCL Device # 0: Hawaii-----
Gflops: 47.520000
Max Compute Units: 44
Max Clock Frequency: 1080
Total Memory of Device (bytes): 8589934592
Max Size of Memory Object Allocation (bytes): 4244635648
Max Work Group Size: 256
Fastest OpenCL Device: Hawaii
Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY
winnerPlatform: 140717488209200
You can see the code below to see where this error statements prints out. For whatever reason clCreateCommandQueueWithProperties is returning CL_OUT_OF_HOST_MEMORY. I don't understand how my CPU side memory could even be close to running out of memory to operate. I really don't know. Especially since all this method is doing is creating the queue.
In fact, if I switch CL_DEVICE_TYPE_GPU to CL_DEVICE_TYPE_CPU then the program executes without problem on the CPU.
It's all just on .cpp file. I couldn't really find anything to cut to make the MCV more minimal since it already an example so you are pretty the code you see if exactly what I have.
Here is all of the code below:
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <CL/opencl.h>
#include <windows.h>
#include <sys/time.h>
/*
* Return elapsed wall time since last call (seconds)
*/
static double t0=0;
float Elapsed(void)
{
#ifdef _WIN32
// Windows version of wall time
LARGE_INTEGER tv,freq;
QueryPerformanceCounter((LARGE_INTEGER*)&tv);
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
double t = tv.QuadPart/(double)freq.QuadPart;
#else
// Unix/Linux/OSX version of wall time
struct timeval tv;
gettimeofday(&tv,NULL);
double t = tv.tv_sec+1e-6*tv.tv_usec;
#endif
float s = t-t0;
t0 = t;
return s;
}
/*
* Print message to stderr and exit
*/
void Fatal(const char* format , ...)
{
va_list args;
va_start(args,format);
vfprintf(stderr,format,args);
va_end(args);
exit(1);
}
/*
* Initialize matrix with random values
*/
void RandomInit(float x[],const unsigned int n)
{
for (unsigned int i=0;i<n*n;i++)
x[i] = rand() / (float)RAND_MAX;
}
/*
* OpenCL notify callback (echo to stderr)
*/
void Notify(const char* errinfo,const void* private_info,size_t cb,void* user_data)
{
fprintf(stderr,"%s\n",errinfo);
}
class ErrorReader {
public:
private:
};
/*
* Initialize fastest OpenCL device
*/
cl_device_id _DEV_ID;
cl_context _CONTEXT;
cl_command_queue _QUEUE;
int InitGPU(int verbose)
{
cl_uint Nplat;
cl_int err;
char name[1024];
int MaxGflops = -1;
cl_platform_id winnerPlatform = 0;
// Get platforms
cl_platform_id platforms[8];
if (clGetPlatformIDs(8, platforms, &Nplat)) Fatal("Cannot get number of OpenCL platforms\n");
else if (Nplat<1) Fatal("No OpenCL platforms found\n");
// Loop over platforms
for (unsigned int platform = 0; platform < Nplat; platform++) {
if (clGetPlatformInfo(platforms[platform], CL_PLATFORM_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL platform name\n");
if (verbose) printf("OpenCL Platform %d: %s\n", platform, name);
// Get GPU device IDs
cl_uint Ndev;
cl_device_id id[1024];
if (clGetDeviceIDs(platforms[platform], CL_DEVICE_TYPE_GPU, 1024, id, &Ndev))
Fatal("Cannot get number of OpenCL devices\n");
else if (Ndev<1)
Fatal("No OpenCL devices found\n");
// Find the fastest device
for (unsigned int devId = 0; devId < Ndev; devId++) {
cl_uint compUnits, freq;
cl_ulong memSize, maxAlloc;
size_t maxWorkGrps;
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compUnits), &compUnits, NULL)) Fatal("Cannot get OpenCL device units\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL)) Fatal("Cannot get OpenCL device frequency\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memSize), &memSize, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(memSize), &maxAlloc, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGrps), &maxWorkGrps, NULL)) Fatal("Cannot get OpenCL max work group size\n");
int Gflops = compUnits * freq;
if (verbose) printf(" ----- OpenCL Device # %d: %s-----\n"
"Gflops: %f\n"
"Max Compute Units: %d\n"
"Max Clock Frequency: %d\n"
"Total Memory of Device (bytes): %lu\n"
"Max Size of Memory Object Allocation (bytes): %lu\n"
"Max Work Group Size: %zu\n\n",
devId,
name,
1e-3*Gflops,
compUnits,
freq,
memSize,
maxAlloc,
maxWorkGrps);
if (Gflops > MaxGflops)
{
_DEV_ID = id[devId];
MaxGflops = Gflops;
winnerPlatform = platforms[platform];
}
}
}
// Print fastest device info
if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
printf("Fastest OpenCL Device: %s\n",name);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
printf("winnerPlatform: %zu", winnerPlatform);
// cl_platform_id platform = NULL;
// int retValue = GetPlatform(&platform, winnerPlatform, true);
// Create OpenCL _CONTEXT for fastest device
// _CONTEXT = clCreateContext(0,1,&_DEV_ID,Notify,NULL,&err);
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)winnerPlatform,
(cl_context_properties)0
};
_CONTEXT = clCreateContextFromType(cps,
CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
if (!_CONTEXT || err) Fatal("Cannot create OpenCL Context\n");
cl_command_queue_properties *propers;
cl_command_queue_properties prop = 0;
//prop |= CL_QUEUE_PROFILING_ENABLE;
//prop |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
propers = ∝
_QUEUE = clCreateCommandQueueWithProperties(_CONTEXT, _DEV_ID, propers, &err); // Create OpenCL command _QUEUE for fastest device
if (err) {
if (err == CL_INVALID_CONTEXT) Fatal("Cannot create OpenCL command cue: CL_INVALID_CONTEXT\n");
else if (err == CL_INVALID_DEVICE) Fatal("Cannot create OpenCL command cue: CL_INVALID_DEVICE\n");
else if (err == CL_INVALID_VALUE) Fatal("Cannot create OpenCL command cue: CL_INVALID_VALUE\n");
else if (err == CL_INVALID_QUEUE_PROPERTIES) Fatal("Cannot create OpenCL command cue: CL_INVALID_QUEUE_PROPERTIES\n");
else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY\n");
else Fatal("Cannot create OpenCL command cue: ???????????? Unknown Error\n");
} else if (!_QUEUE) {
Fatal("Cannot create OpenCL command cue: NULL\n");
}
return mwgs;
}
/*
* C = A * B -- host
*/
void AxBh(float C[], const float A[], const float B[], unsigned int n)
{
for (unsigned int i=0;i<n;i++)
for (unsigned int j=0;j<n;j++)
{
double sum=0;
for (unsigned int k=0;k<n;k++)
sum += (double)A[i*n+k] * (double)B[k*n+j];
C[i*n+j] = (float)sum;
}
}
/*
* Compute one element of A * B
*/
const char* source =
"__kernel void AxB(__global float C[],__global const float A[],__global const float B[],const unsigned int n)\n"
"{\n"
" unsigned int j = get_global_id(0);\n"
" unsigned int i = get_global_id(1);\n"
" float sum =0;\n"
" for (int k=0;k<n;k++)\n"
" sum += A[i*n+k] * B[k*n+j];\n"
" C[i*n+j] = sum;\n"
"}\n";
/*
* C = A * B -- device
*/
void AxBd(float Ch[],float Ah[],float Bh[],const unsigned int Bw,const unsigned int Bn)
{
// Calculate matrix dimensions
int n = Bw*Bn;
int N = n*n*sizeof(float);
// Allocate device memory and copy A&B from host to device
cl_int err;
cl_mem Ad = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Ah, &err);
if (err) Fatal("Cannot create and copy A from host to device\n");
cl_mem Bd = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Bh, &err);
if (err) Fatal("Cannot create and copy B from host to device\n");
// Allocate device memory for C on device
cl_mem Cd = clCreateBuffer(_CONTEXT,CL_MEM_WRITE_ONLY,N,NULL,&err);
if (err) Fatal("Cannot create C on device\n");
// Compile kernel
cl_program prog = clCreateProgramWithSource(_CONTEXT,1,&source,0,&err);
if (err) Fatal("Cannot create program\n");
if (clBuildProgram(prog,0,NULL,NULL,NULL,NULL))
{
char log[1048576];
if (clGetProgramBuildInfo(prog,_DEV_ID,CL_PROGRAM_BUILD_LOG,sizeof(log),log,NULL))
Fatal("Cannot get build log\n");
else
Fatal("Cannot build program\n%s\n",log);
}
cl_kernel kernel = clCreateKernel(prog,"AxB",&err);
if (err) Fatal("Cannot create kernel\n");
// Set parameters for kernel
if (clSetKernelArg(kernel,0,sizeof(cl_mem),&Cd)) Fatal("Cannot set kernel parameter Cd\n");
if (clSetKernelArg(kernel,1,sizeof(cl_mem),&Ad)) Fatal("Cannot set kernel parameter Ad\n");
if (clSetKernelArg(kernel,2,sizeof(cl_mem),&Bd)) Fatal("Cannot set kernel parameter Bd\n");
if (clSetKernelArg(kernel,3,sizeof(int),&n)) Fatal("Cannot set kernel parameter n\n");
// Run kernel
size_t Global[2] = {(size_t)n, (size_t)n};
size_t Local[2] = {(size_t)Bw, (size_t)Bw};
if (clEnqueueNDRangeKernel(_QUEUE,kernel,2,NULL,Global,Local,0,NULL,NULL)) Fatal("Cannot run kernel\n");
// Release kernel and program
if (clReleaseKernel(kernel)) Fatal("Cannot release kernel\n");
if (clReleaseProgram(prog)) Fatal("Cannot release program\n");
// Copy C from device to host (block until done)
if (clEnqueueReadBuffer(_QUEUE,Cd,CL_TRUE,0,N,Ch,0,NULL,NULL)) Fatal("Cannot copy C from device to host\n");
// Free device memory
clReleaseMemObject(Ad);
clReleaseMemObject(Bd);
clReleaseMemObject(Cd);
}
/*
* main
*/
int main(int argc, char* argv[])
{
// Process options
int opt;
int verbose=0;
while ((opt=getopt(argc,argv,"v"))!=-1)
{
if (opt=='v')
verbose++;
else
Fatal("Usage: [-v] <block width> <number of blocks>\n");
}
argc -= optind;
argv += optind;
// Get width and number of blocks
if (argc!=2) Fatal("Usage: [-v] <block width> <number of blocks>\n");
int Bw = atoi(argv[0]);
if (Bw<1) Fatal("Block width out of range %d\n",Bw);
int Bn = atoi(argv[1]);
if (Bn<1) Fatal("Number of blocks out of range %d\n",Bn);
// Total width is block times number of blocks
int n = Bw*Bn;
int N = n*n*sizeof(float);
printf("Bw=%d Bn=%d n=%d\n",Bw,Bn,n);
// Initialize GPU
int Mw = InitGPU(verbose);
if (Mw<Bw*Bw) Fatal("Thread count %d exceeds max work group size of %d\n",Bw*Bw,Mw);
// Allocate host matrices A/B/C/R
float* Ah = (float*)malloc(N);
float* Bh = (float*)malloc(N);
float* Ch = (float*)malloc(N);
float* Rh = (float*)malloc(N);
if (!Ah || !Bh || !Ch || !Rh) Fatal("Cannot allocate host memory\n");
// Initialize A & B
srand(9999);
RandomInit(Ah,n);
RandomInit(Bh,n);
// Compute R = AB on host
Elapsed();
AxBh(Rh,Ah,Bh,n);
float Th = Elapsed();
// Compute C = AB on device
Elapsed();
AxBd(Ch,Ah,Bh,Bw,Bn);
float Td = Elapsed();
// Compute difference between R and C
double r2=0;
for (int i=0;i<n*n;i++)
r2 += fabs(Ch[i]-Rh[i]);
r2 /= n*n;
// Free host memory
free(Ah);
free(Bh);
free(Ch);
free(Rh);
// Print results
printf("Host Time = %6.3f s\n",Th);
printf("Device Time = %6.3f s\n",Td);
printf("Speedup = %.1f\n",Th/Td);
printf("Difference = %.2e\n",r2);
// Done
return 0;
}
I compile it using (which you will obviously have to alter a little):
g++ -Wall -o exMatrixMult -I"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/include" -L"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/lib/x86_64" exMatrixMult.cpp -lOpenCL
My "Hawaii" graphics card can, however, run the example programs that came with the OpenCL SDK (in "AMD APP SDK\3.0\samples\opencl\bin\x86_64"). I spent most of the afternoon to see how their source code differs from mine, no success so far.
If at all useful the code where the error occurs use to look like this (same problem still occurred):
// Print fastest device info
if (clGetDeviceInfo(devid,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
printf("Fastest OpenCL Device: %s\n",name);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(devid,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
// Create OpenCL context for fastest device
context = clCreateContext(0,1,&devid,Notify,NULL,&err);
if(!context || err) Fatal("Cannot create OpenCL context\n");
// Create OpenCL command queue for fastest device
queue = clCreateCommandQueueWithProperties(context, devid, 0, &err);
if (err) Fatal("Cannot create OpenCL command cue\n");
Now the problem may not be verifiable if you don't have a similar graphics card. But I don't know.

The problem went away when I updated my graphics card's driver.

Why does my program run significantly faster on my CPU device than on my GPU device?

I'm pretty new to OpenCL. I learned a little bit in college, and by a little bit I mean my graphics professor taught us about GPGPU and OpenCL for a single day (whereas the rest of the class was focused on shaders and OpenGL and so on).
I took an example program and changed it to work with the computations that I want it to run. However my program runs significantly faster on the CPU than my GPU, and I am trying to understand why.
My program takes in one input float array and has two output arrays. Under single threaded circumstances it has three arguments . The input array's size is: samplesPerTracetracesInsizeof(float), and the output array's size are: samplesPerTracetracesOutsizeof(float).
My test cases have been using the parameters 25000 2500 250, because that is on average the size of the arrays that I will be using (perhaps a little above average). The values are filled in randomly.
Here is the source code that OpenCL builds and runs on the kernel;
const char* M_AND_S_OPENCL_SOURCE_TEXT =
"__kernel void sumAllCL(__global const float prestackTraces[],\n"
" __global float stackTracesOut[],\n"
" __global float powerTracesOut[], const unsigned int nTracesOut, const unsigned int nTracesIn,\n"
" const unsigned int samplesPerTrace) {\n"
"\n"
" unsigned int k = get_global_id(0);\n" // Thread ID
"\n"
" unsigned int kTimesIn = k * nTracesIn;\n" // Store repeat ints
" unsigned int kTimesSamples = k * samplesPerTrace;\n"
"\n"
" for (int j = 0; j < ? ; j++) {\n" // ? position to be replaced (nTracesOut)"
"\n"
" int jTimesSamplesPT = j * samplesPerTrace;\n"
"\n"
" for (int i = 0; i < # ; i++) {\n" // # position to be replaced ()
"\n"
" int valueIndex = i + jTimesSamplesPT;\n"
" float value = prestackTraces[valueIndex];\n"
"\n"
" stackTracesOut[i + kTimesSamples] += value;\n"
" powerTracesOut[i + kTimesSamples] += (value * value);\n"
"\n"
" }\n"
" }\n"
"}\n";
Note that the ? and the # are replaced at run time with fixed numbers, I do this because I thought it would help the compiler unroll the rl
With the above parameters stated (25000 2500 250 ~10 <1 or 2>) it takes my CPU about 0.6 seconds to complete the program and my GPU about 40 seconds to complete. That's a bigger difference. Fyi, I have been messing around with the 4th parameters to see which value runs faster which is what is meant by the ~10.
My graphics card is a MSI Radeon R9 390X 8GB, given the name Hawaii. When I have OpenCL print out information about both of my devices this is what I get:
OpenCL Platform 0: AMD Accelerated Parallel Processing
----- OpenCL Device # 0: Hawaii-----
Gflops: 47.520000
Max Clock Frequency: 1080
Max Compute Units: 44
Max Work Group Size: 256
MEMORY...
Total Memory of Device: 8.000G (CL_DEVICE_GLOBAL_MEM_SIZE)
Local Memory of Device: 32.000K (CL_DEVICE_LOCAL_MEM_SIZE)
Max Memory Object Allocation: 3.999G (CL_DEVICE_MAX_MEM_ALLOC_SIZE)
Cache Size: 16.000K (CL_DEVICE_GLOBAL_MEM_CACHE_SIZE)
Cacheline Size: 64 bytes (CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE)
VERSIONS...
Device Vendor: Advanced Micro Devices, Inc.
Device Version: OpenCL 2.0 AMD-APP (2117.13)
Driver Version: 2117.13 (VM)
Device OpenCL Version: OpenCL C 2.0
----- OpenCL Device # 1: Intel(R) Core(TM) i7-6700K CPU ? 4.00GHz-----
Gflops: 32.064000
Max Clock Frequency: 4008
Max Compute Units: 8
Max Work Group Size: 1024
MEMORY...
Total Memory of Device: 15.967G (CL_DEVICE_GLOBAL_MEM_SIZE)
Local Memory of Device: 32.000K (CL_DEVICE_LOCAL_MEM_SIZE)
Max Memory Object Allocation: 3.1028G (CL_DEVICE_MAX_MEM_ALLOC_SIZE)
Cache Size: 32.000K (CL_DEVICE_GLOBAL_MEM_CACHE_SIZE)
Cacheline Size: 64 bytes (CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE)
VERSIONS...
Device Vendor: GenuineIntel
Device Version: OpenCL 1.2 AMD-APP (2117.13)
Driver Version: 2117.13 (sse2,avx)
Device OpenCL Version: OpenCL C 1.2
Here is the code relevant to OpenCL. I would have posted a entire minimal verifiable complete example however it puts me over the character limit.
/*
* Prints the given int (numToInsert) at location inside chars.
*/
void PrintIntInStr(char* chars, int location, int numToInsert) {
std::stringstream strs;
strs << numToInsert;
std::string temp_str = strs.str();
char const* numToChars = temp_str.c_str();
int numberLength = strlen(numToChars);
int w;
for (w = 0; w < numberLength; w++) {
chars[location + w] = numToChars[w];
}
}
/*
* Initialize fastest OpenCL device.
*/
int InitOpenCL(int verbose, cl_int deviceType) {
cl_uint Nplat;
cl_int err;
char name[1024];
int MaxGflops = -1;
cl_platform_id winnerPlatform = 0;
// Reset (TODO)
_deviceID = NULL;
_context = NULL;
_queue = NULL;
// Get platforms
cl_platform_id platforms[4];
if (clGetPlatformIDs(4, platforms, &Nplat)) Fatal("Cannot get number of OpenCL platforms\n");
else if (Nplat<1) Fatal("No OpenCL platforms found\n");
// Loop over platforms
for (unsigned int platform = 0; platform < Nplat; platform++) {
if (clGetPlatformInfo(platforms[platform], CL_PLATFORM_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL platform name\n");
if (verbose) printf("OpenCL Platform %d: %s\n", platform, name);
// Get GPU device IDs
cl_uint Ndev;
cl_device_id id[4];
if (clGetDeviceIDs(platforms[platform], deviceType, 4, id, &Ndev))
Fatal("Cannot get number of OpenCL devices: %d\n", platform);
else if (Ndev < 1) Fatal("No OpenCL devices found.\n");
// Find the fastest device
for (unsigned int devId = 0; devId < Ndev; devId++) {
// Print informatio about the device
cl_uint compUnits, freq, cacheLineSize;
cl_ulong memSize, maxAlloc, localMemSize, globalCacheSize;
size_t maxWorkGrps;
char deviceVendor[50];
char deviceVersion[50];
char driverVersion[50];
char deviceOpenCLVersion[50];
// Computing Power...
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compUnits), &compUnits, NULL)) Fatal("Cannot get OpenCL device units\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL)) Fatal("Cannot get OpenCL device frequency\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGrps), &maxWorkGrps, NULL)) Fatal("Cannot get OpenCL max work group size\n");
// Memory...
if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memSize), &memSize, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMemSize), &localMemSize, NULL)) localMemSize = 0;
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlloc), &maxAlloc, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(globalCacheSize), &globalCacheSize, NULL)) globalCacheSize = 0;
if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheLineSize), &cacheLineSize, NULL)) cacheLineSize = 0;
// Versions...
clGetDeviceInfo(id[devId], CL_DEVICE_VENDOR, sizeof(deviceVendor), deviceVendor, NULL);
clGetDeviceInfo(id[devId], CL_DEVICE_VERSION, sizeof(deviceVersion), deviceVersion, NULL);
clGetDeviceInfo(id[devId], CL_DRIVER_VERSION, sizeof(driverVersion), driverVersion, NULL);
clGetDeviceInfo(id[devId], CL_DEVICE_OPENCL_C_VERSION, sizeof(deviceOpenCLVersion), deviceOpenCLVersion, NULL);
int Gflops = compUnits * freq;
if (verbose) printf(" ----- OpenCL Device # %d: %s-----\n"
"Gflops: %f\n"
"Max Clock Frequency: %d\n"
"Max Compute Units: %d\n"
"Max Work Group Size: %zu\n"
" MEMORY...\n"
"Total Memory of Device: %s (CL_DEVICE_GLOBAL_MEM_SIZE)\n"
"Local Memory of Device: %s (CL_DEVICE_LOCAL_MEM_SIZE)\n"
"Max Memory Object Allocation: %s (CL_DEVICE_MAX_MEM_ALLOC_SIZE)\n"
"Cache Size: %s (CL_DEVICE_GLOBAL_MEM_CACHE_SIZE)\n"
"Cacheline Size: %s (CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE)\n"
" VERSIONS...\n"
"Device Vendor: %s\n"
"Device Version: %s\n"
"Driver Version: %s\n"
"Device OpenCL Version: %s\n",
devId,
name,
(1e-3 * Gflops),
freq,
compUnits,
maxWorkGrps,
byteConverter((unsigned long)memSize),
byteConverter((unsigned long)localMemSize),
byteConverter((unsigned long)maxAlloc),
byteConverter((unsigned long)globalCacheSize),
byteConverter((unsigned long)cacheLineSize),
deviceVendor,
deviceVersion,
driverVersion,
deviceOpenCLVersion);
if(Gflops > MaxGflops)
{
_deviceID = id[devId];
MaxGflops = Gflops;
winnerPlatform = platforms[platform];
}
}
}
// Print fastest device info (TODO: don't get name twice)
if (clGetDeviceInfo(_deviceID, CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");
printf("\n Selected Fastest Open CL Device: %s (#%lu)\n", name, (unsigned long)_deviceID);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(_deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(mwgs), &mwgs, NULL))
Fatal("Cannot get OpenCL max work group size\n");
// Create OpenCL context for fastest device
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)winnerPlatform,
(cl_context_properties)0
};
_context = clCreateContextFromType(cps, deviceType, NULL, NULL, &err);
if (!_context || err) Fatal("Cannot create OpenCL Context\n");
// Properties for create command queue; currently nothing
// cl_command_queue_properties *propers;
cl_command_queue_properties prop = 0;
//prop |= CL_QUEUE_PROFILING_ENABLE;
//prop |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
// propers = ∝
_queue = clCreateCommandQueueWithProperties(_context, _deviceID, &prop, &err); // Create OpenCL command queue for fastest device
// _queue = clCreateCommandQueue(_context, _deviceID, &prop, &err);
if (!_queue || err) {
if (err == CL_INVALID_CONTEXT) Fatal("Cannot create OpenCL command cue: CL_INVALID_CONTEXT\n");
else if (err == CL_INVALID_DEVICE) Fatal("Cannot create OpenCL command cue: CL_INVALID_DEVICE\n");
else if (err == CL_INVALID_VALUE) Fatal("Cannot create OpenCL command cue: CL_INVALID_VALUE\n");
else if (err == CL_INVALID_QUEUE_PROPERTIES) Fatal("Cannot create OpenCL command cue: CL_INVALID_QUEUE_PROPERTIES\n");
else if (err == CL_OUT_OF_RESOURCES) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_RESOURCES\n");
else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY\n");
else if (!_queue) Fatal("Cannot create OpenCL command cue: !queue\n");
else Fatal("Cannot create OpenCL command cue: ?????\n");
}
if (_VERBOSE) printf("Init complete.\n");
return mwgs;
}
/*
* Modify the source text to fit this run.
*/
char* ModifySourceText(unsigned int nTracesIn, unsigned int samplesPerT) {
size_t sourceSize = strlen(M_AND_S_OPENCL_SOURCE_TEXT) + 1;
char* moveStackSourceCode = new char[sourceSize];
strncpy(moveStackSourceCode, M_AND_S_OPENCL_SOURCE_TEXT, sourceSize);
moveStackSourceCode[sourceSize] = '\0';
// Print out the locations of the characters where we should insert other text if asked to do so
if (_FIND_INSERT_LOCATIONS) {
size_t z;
for (z = 0; z < sourceSize; z++) {
if (moveStackSourceCode[z] == '#') {
printf("Found # at position %zu\n", z);
break;
}
}
for (z = 0; z < sourceSize; z++) {
if (moveStackSourceCode[z] == '#') {
printf("Found # at position %zu\n", z);
break;
}
}
}
// Insert the digit that for loops go to inside of the source
PrintIntInStr(moveStackSourceCode, INSERT_LOCATION_1, nTracesIn);
PrintIntInStr(moveStackSourceCode, INSERT_LOCATION_2, samplesPerT);
// Print the modified source code if verbose
if (_FIND_INSERT_LOCATIONS) {
printf("\n GPU Source Code: \n");
printf("%s\n", moveStackSourceCode);
}
return moveStackSourceCode;
}
/*
* Wait for event and then release it.
*/
static void WaitForEventAndRelease(cl_event *event) {
printf("WaitForEventAndRelease()\n");
cl_int status = CL_SUCCESS;
status = clWaitForEvents(1, event);
if (status) Fatal("clWaitForEvents Failed with Error Code");
printf("About to release event...\n");
status = clReleaseEvent(*event);
if (status) Fatal("clReleaseEvent Failed with Error Code");
}
// Runs the program via open CL
static double RunOpenCL(float prestackTracesArray[], float stackTracesOut1DArray[], float powerTracesOut1DArray[],
unsigned int nTracesOut, unsigned int nTracesIn, unsigned int samplesPerT,
size_t inXsamples, size_t outXsamples,
unsigned int localThreadCount)
{
cl_int err;
// Get the source code
char* modifiedGpuSource = ModifySourceText(nTracesIn, samplesPerT);
// Allocate device memory
// CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD (?)
// Input...
cl_mem prestackTracesCL = clCreateBuffer(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
inXsamples * sizeof(cl_float), prestackTracesArray, &err);
if (err) FatalBufferCreation("Prestack traces", err);
// Output... TODO: How do we know that the output is zeroed out?
cl_mem stackTracesOutCL = clCreateBuffer(_context, CL_MEM_WRITE_ONLY,
outXsamples * sizeof(cl_float), NULL, &err);
if (err) FatalBufferCreation("Stack traces", err);
cl_mem powerTracesOutCL = clCreateBuffer(_context, CL_MEM_WRITE_ONLY,
outXsamples * sizeof(cl_float), NULL, &err);
if (err) FatalBufferCreation("Power traces", err);
// Compile the source code
char* gpuSourceText[1];
gpuSourceText[0] = modifiedGpuSource;
size_t sourceLength[1];
sourceLength[0] = strlen(modifiedGpuSource);
cl_program moveoutAndStackCLProgram = clCreateProgramWithSource(_context, 1, (const char**)gpuSourceText,
(const size_t*)sourceLength, &err);
if (err != CL_SUCCESS) {
if (err == CL_INVALID_CONTEXT) Fatal("Cannot create program: CL_INVALID_CONTEXT\n");
else if (err == CL_INVALID_VALUE) Fatal("Cannot create program: CL_INVALID_VALUE\n");
else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create program: CL_OUT_OF_HOST_MEMORY\n");
else Fatal("Cannot create program_S %d\n", err);
}
// Build the program
cl_int buildCode = clBuildProgram(moveoutAndStackCLProgram, 0, NULL, NULL, NULL, NULL);
if (buildCode != CL_SUCCESS) {
// Attempt to get compile errors
char log[1048576];
if (clGetProgramBuildInfo(moveoutAndStackCLProgram, _deviceID, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL)) {
log[0] = '\0'; // Failed to get the log file
}
if (buildCode == CL_INVALID_PROGRAM) Fatal("Cannot build program: CL_INVALID_PROGRAM\n%s", log);
else if (buildCode == CL_INVALID_VALUE) Fatal("Cannot build program: CL_INVALID_VALUE\n%s", log);
else if (buildCode == CL_INVALID_DEVICE) Fatal("Cannot build program: CL_INVALID_DEVICE\n%s", log);
else if (buildCode == CL_INVALID_BINARY) Fatal("Cannot build program: CL_INVALID_BINARY\n%s", log);
else if (buildCode == CL_INVALID_BUILD_OPTIONS) Fatal("Cannot build program: CL_INVALID_BUILD\n_OPTIONS\n%s", log);
else if (buildCode == CL_INVALID_OPERATION) Fatal("Cannot build program: CL_INVALID_OPERATION\n%s", log);
else if (buildCode == CL_COMPILER_NOT_AVAILABLE) Fatal("Cannot build program: CL_COMPILER_NOT_AVAILABLE\n%s", log);
else if (buildCode == CL_BUILD_PROGRAM_FAILURE) Fatal("Cannot build program: CL_BUILD_PROGRAM_FAILURE\n%s", log);
else if (buildCode == CL_INVALID_OPERATION) Fatal("Cannot build program: CL_INVALID_OPERATION\n%s", log);
else if (buildCode == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot build program: CL_OUT_OF_HOST_MEMORY\n%s", log);
else Fatal("Cannot build program: %d\n%s", buildCode, log);
}
// Compile the source code & build the kernel
cl_kernel kernel = clCreateKernel(moveoutAndStackCLProgram, "sumAllCL", &err);
if (err) {
if (err == CL_INVALID_PROGRAM) Fatal("Cannot create kernel: CL_INVALID_PROGRAM\n");
else if (err == CL_INVALID_PROGRAM_EXECUTABLE) Fatal("Cannot create kernel: CL_INVALID_PROGRAM_EXECUTABLE\n");
else if (err == CL_INVALID_KERNEL_NAME) Fatal("Cannot create kernel: CL_INVALID_KERNEL_NAME\n");
else if (err == CL_INVALID_KERNEL_DEFINITION) Fatal("Cannot create kernel: CL_INVALID_KERNEL_DEFINITION\n");
else if (err == CL_INVALID_VALUE) Fatal("Cannot create kernel: CL_INVALID_VALUE\n");
else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create kernel: CL_OUT_OF_HOST_MEMOR\n");
else Fatal("Cannot create kernel: %d\n", err);
}
// Set program parameters
cl_int returnValArgSet;
returnValArgSet = clSetKernelArg(kernel, 0, sizeof(cl_mem), &prestackTracesCL);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("prestackTracesCL", returnValArgSet);
returnValArgSet = clSetKernelArg(kernel, 1, sizeof(cl_mem), &stackTracesOutCL);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("stackTracesOutCL", returnValArgSet);
returnValArgSet = clSetKernelArg(kernel, 2, sizeof(cl_mem), &powerTracesOutCL);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("powerTracesOutCL", returnValArgSet);
returnValArgSet = clSetKernelArg(kernel, 3, sizeof(unsigned int), &nTracesOut);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("nTracesOut", returnValArgSet);
returnValArgSet = clSetKernelArg(kernel, 4, sizeof(unsigned int), &nTracesIn);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("nTracesIn", returnValArgSet);
returnValArgSet = clSetKernelArg(kernel, 5, sizeof(unsigned int), &samplesPerT);
if (returnValArgSet != CL_SUCCESS) FatalSetArgs("samplesPerT", returnValArgSet);
// TODO: verbose
printf("About to run Kernel...\n");
// Start timer TODO: move?
double runTime = GetTime();
// Run the kernel (& also set the number of threads)
cl_event runEvent;
size_t Global[1] = { nTracesOut };
size_t Local[1] = { localThreadCount };
if (localThreadCount > 0) err = clEnqueueNDRangeKernel(_queue, kernel, 1, NULL, Global, Local, 0, NULL, &runEvent);
else err = clEnqueueNDRangeKernel(_queue, kernel, 1, NULL, Global, NULL, 0, NULL, &runEvent);
if (err) {
if (err == CL_INVALID_PROGRAM_EXECUTABLE) {
Fatal("Cannot run Kernel: No successfully built program executable available.\n");
} else if (err == CL_INVALID_COMMAND_QUEUE) {
Fatal("Cannot run Kernel: Command_queue is not a valid command-queue.\n");
} else if (err == CL_INVALID_KERNEL) {
Fatal("Cannot run Kernel: Kernel is not a valid kernel object.\n");
} else if (err == CL_INVALID_CONTEXT) {
Fatal("Cannot run Kernel: Context associated with command_queue and kernel is not the same or if "
"the context associated with command_queue and events in event_wait_list are not the same.\n");
} else if (err == CL_INVALID_KERNEL_ARGS) {
Fatal("Cannot run Kernel: Kernel argument values have not been specified.\n");
} else if (err == CL_INVALID_WORK_DIMENSION) {
Fatal("Cannot run Kernel: work_dim is not a valid value (must be between 1 and 3).\n");
} else if (err == CL_INVALID_WORK_GROUP_SIZE) {
Fatal("Cannot run Kernel: local_work_size is specified and number of work-items specified by global_work_size "
"is not evenly divisable by size of work-group given by local_work_size or does not match the "
"work-group size specified for kernel using the __attribute__((reqd_work_group_size(X, Y, Z))) "
"qualifier in program source.\n");
} else if (err == CL_INVALID_WORK_ITEM_SIZE) {
Fatal("Cannot run Kernel: If the number of work-items specified in any of local_work_size[0], ... "
"local_work_size[work_dim - 1] is greater than the corresponding values specified "
"by CL_DEVICE_MAX_WORK_ITEM_SIZES[0], .... CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1]. .\n");
} else if (err == CL_INVALID_GLOBAL_OFFSET) {
Fatal("Cannot run Kernel: Global_work_offset is not NULL.\n");
} else if (err == CL_OUT_OF_RESOURCES) {
Fatal("Cannot run Kernel: CL_OUT_OF_RESOURCES.\n");
} else if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE) {
Fatal("Cannot run Kernel: Failure to allocate memory for data store associated with image or buffer "
"objects specified as arguments to kernel.\n");
} else if (err == CL_INVALID_EVENT_WAIT_LIST) {
Fatal("Cannot run Kernel: event_wait_list is NULL and num_events_in_wait_list > 0, or event_wait_list "
"is not NULL and num_events_in_wait_list is 0, or if event objects in event_wait_list "
"are not valid events..\n");
} else if (err == CL_OUT_OF_HOST_MEMORY) {
Fatal("Cannot run Kernel: Failure to allocate resources required by the OpenCL implementation on the host.\n");
} else {
Fatal("Cannot run Kernel: Unknown Error. (clEnqueueNDRangeKernel)");
}
}
// Flush the program & wait for the program to finish executing
if (clFlush(_queue)) printf("Flush Fail (Run)");
WaitForEventAndRelease(&runEvent);
// Copy the end result back to CPU memory side
if (clEnqueueReadBuffer(_queue, stackTracesOutCL, CL_TRUE, 0, outXsamples * sizeof(cl_float), stackTracesOut1DArray, 0, NULL, NULL))
Fatal("Cannot copy stackTracesOutCL from device to host\n");
if (clEnqueueReadBuffer(_queue, powerTracesOutCL, CL_TRUE, 0, outXsamples * sizeof(cl_float), powerTracesOut1DArray, 0, NULL, NULL))
Fatal("Cannot copy powerTracesOutCL from device to host\n");
// Release kernel and program
if (clReleaseKernel(kernel)) Fatal("Cannot release kernel\n");
if (clReleaseProgram(moveoutAndStackCLProgram)) Fatal("Cannot release program\n");
// Free device memory
clReleaseMemObject(prestackTracesCL);
clReleaseMemObject(stackTracesOutCL);
clReleaseMemObject(powerTracesOutCL);
// Release the context and queue
clReleaseCommandQueue(_queue);
clReleaseContext(_context);
// Return the time it took to run this program
return runTime;
}
double RunProg(unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
unsigned int localThreadCount, unsigned int deviceType) {
// Stores sizes of the various arrays
size_t tracesInxSample = nTracesIn * samplesPerTrace;
size_t tracesOutxSample = nTracesOut * samplesPerTrace;
// Allocate arrays
float* prestackTraces1D = (float*)malloc(tracesInxSample * sizeof(float));
float* stackTracesOut1Dgpu = (float*)calloc(tracesOutxSample, sizeof(float)); // output; zero-out
float* powerTracesOut1Dgpu = (float*)calloc(tracesOutxSample, sizeof(float)); // output; zero-out
// Count how much memory all of this is
if (_VERBOSE)
{
// Make sure it is consistent with above allocation
unsigned long allocatedMemory = 0;
allocatedMemory += tracesInxSample * sizeof(float);
allocatedMemory += tracesOutxSample * sizeof(float);
allocatedMemory += tracesOutxSample * sizeof(float);
printf("TOTAL MEMORY ALLOCATED: %s\n", byteConverter(allocatedMemory));
printf("Input Array Sizes: %s\n", byteConverter((unsigned int)(tracesInxSample * sizeof(float))));
printf("Output Array Sizes: %s\n", byteConverter((unsigned int)(tracesOutxSample * sizeof(float))));
}
// Fill in array with randoms
RandomFillArray(prestackTraces1D, (unsigned int)tracesInxSample);
// Init OpenCL using the desired device type
double preInitTime = GetTime();
int maxWorkGroupSize;
if (deviceType == 0) maxWorkGroupSize = InitOpenCL(_VERBOSE, CL_DEVICE_TYPE_ALL);
else if (deviceType == 1) maxWorkGroupSize = InitOpenCL(_VERBOSE, CL_DEVICE_TYPE_GPU);
else maxWorkGroupSize = InitOpenCL(_VERBOSE, CL_DEVICE_TYPE_CPU);
printf("Max work size for the device is: %d\n", maxWorkGroupSize);
// --- ACTUAL TEST ---
// Run OpenCL
double startTime = GetTime();
double runTime = RunOpenCL(prestackTraces1D, stackTracesOut1Dgpu, powerTracesOut1Dgpu, // arrays
nTracesOut, nTracesIn, samplesPerTrace, // ints
tracesInxSample, tracesOutxSample,
localThreadCount); // samples
// Display run time
double endTime = GetTime();
printf("Elapsed Time: %fsecs\n", (endTime - runTime));
printf(" %fsecs (Before Function Call)\n", (endTime - startTime));
printf(" %fsecs (Including Init)\n\n", (endTime - preInitTime));
// Free the 1D arrays
free(powerTracesOut1Dgpu);
free(stackTracesOut1Dgpu);
free(prestackTraces1D);
return (endTime - startTime);
}
My first thought as to why it's running so much slower on my GPU than my CPU is that maybe it's because I am busing so much data over the graphics card before anything runs. Perhaps a better implementation would involve splitting the workload in multiple runs, so that code can be executing while more data is being bused over (I presume that's a thing). However now that I think about it this is almost certainly false, because as I said I wrote this program based on an example, and that example did matrix multiplication, and that example runs much much faster on the GPU than my CPU. I don't really know what the difference is.

The problem was caching; I was read and writing from array a lot. So I made a version that writes to local variables as much as possible before writing to an array and now it runs much faster on the GPU.

tex1Dfetch unexpectedly returning 0

I don't believe this is the same issue as reported here :
Bound CUDA texture reads zero
CUDA 1D texture fetch always return 0
In my CUDA application I noticed that tex1Dfetch is not returning the expected value, past a certain index in the buffer. An initial observation in the application was that a value at index 0 could be read correctly, but at 12705625, the value read was 0. I made a small test program to investigate this, given below. The results are a little bit baffling to me. I'm trying to probe at what index the values no longer are read correctly. But as the value arraySize is changed, so does the "firstBadIndex". Even with arraySize =2, the second value is read incorrectly! As arraySize is made bigger, the firstBadIndex gets bigger. This happens when binding to arrays of float, float2, or float4. If the data are read from the device buffer instead (switch around the commented lines in FetchTextureData), then everything is fine. This is using CUDA 6.5, on a Tesla c2075.
Thanks for any insights or advice you might have.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define FLOATTYPE float4
texture<FLOATTYPE,cudaTextureType1D,cudaReadModeElementType> texture1D;
const unsigned int arraySize = 1000;
FLOATTYPE* host;
FLOATTYPE* device;
FLOATTYPE* dTemp;
FLOATTYPE hTemp[1];
__global__ void FetchTextureData(FLOATTYPE* data,FLOATTYPE* arr,int idx)
{
data[0] = tex1Dfetch(texture1D, idx);
//data[0] = arr[idx];
}
bool GetTextureValues(int idx){
FetchTextureData<<<1,1>>>(dTemp,device,idx);
// copy to the host
cudaError_t err = cudaMemcpy(hTemp,dTemp,sizeof(FLOATTYPE),cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
throw "cudaMemcpy failed!";
}
if (cudaDeviceSynchronize() != cudaSuccess) {
throw "cudaDeviceSynchronize failed!";
}
return hTemp[0].x == 1.0f;
}
int main()
{
try{
host = new FLOATTYPE[arraySize];
cudaError_t err = cudaMalloc((void**)&device,sizeof(FLOATTYPE) * arraySize);
cudaError_t err1 = cudaMalloc((void**)&dTemp,sizeof(FLOATTYPE));
if (err != cudaSuccess || err1 != cudaSuccess) {
throw "cudaMalloc failed!";
}
// make some host data
for(unsigned int i=0; i<arraySize; i++){
FLOATTYPE data = {1.0f, 0.0f, 0.0f, 0.0f};
host[i] = data;
}
// and copy it to the device
err = cudaMemcpy(device,host,sizeof(FLOATTYPE) * arraySize,cudaMemcpyHostToDevice);
if (err != cudaSuccess){
throw "cudaMemcpy failed!";
}
// set up the textures
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<FLOATTYPE>();
texture1D.addressMode[0] = cudaAddressModeClamp;
texture1D.filterMode = cudaFilterModePoint;
texture1D.normalized = false;
cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
// do a texture fetch and find where the fetches stop working
int lastGoodValue = -1, firstBadValue = -1;
float4 badValue = {-1.0f,0.0f,0.0f,0.0f};
for(unsigned int i=0; i<arraySize; i++){
if(i % 100000 == 0) printf("%d\n",i);
bool isGood = GetTextureValues(i);
if(firstBadValue == -1 && !isGood)
firstBadValue = i;
if(isGood)
lastGoodValue = i;
else
badValue = hTemp[0];
}
printf("lastGoodValue %d, firstBadValue %d\n",lastGoodValue,firstBadValue);
printf("Bad value is (%.2f)\n",badValue.x);
}catch(const char* err){
printf("\nCaught an error : %s\n",err);
}
return 0;
}

The problem lies in the texture set up. This:
cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
should be:
cudaBindTexture(NULL, texture1D, device, channelDesc,
arraySize * sizeof(FLOATTYPE));
As per the documentation, the size argument is the size of the memory area in bytes, not the number of elements. I would have expected that with the clamped addressing mode, the code would still work as expected. With border mode, you should get a zero value which looks like it would trigger your bad value detection. I haven't actually run your code, so perhaps there is a subtley I'm missing somewhere. For such a simple repro case, your code structure is rather convoluted and hard to follow (at least on the mobile phone screen I am reading it on).
EDIT to add that between the time I started writing this and finished, #njuffa pointed out the same mistake in comments

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

cuBLAS matrix inverse much slower than MATLAB - c++

Related

cudaMemcpy throws InvalidValue error when copying from device to host

Why does my data not fit into a CUDA Texture Object?

OpenCL: CL_OUT_OF_HOST_MEMORY on clCreateCommandQueueWithProperties (with Minimal, Complete, and Verifiable example)

Why does my program run significantly faster on my CPU device than on my GPU device?

tex1Dfetch unexpectedly returning 0

Categories

Resources