cuda concurrent kernel execution was not successful - concurrency

Following is a code I have editted from the book "cuda by example" for a testing of the CUDA concurrently kernel execution.
static void HandleError( cudaError_t err,
const char *file,
int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define N (1024*1024*10)
#define FULL_DATA_SIZE (N*2)
__global__ void kernel( int *a, int *b, int *c ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
int idx1 = (idx + 1) % 256;
int idx2 = (idx + 2) % 256;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
c[idx] = (as + bs) / 2;
}
}
int main( void ) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
if (!prop.deviceOverlap) {
printf( "Device will not handle overlaps, so no speed up from streams\n" );
return 0;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream0, stream1;
int *host_a, *host_b, *host_c;
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1;
// start the timers
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
// initialize the streams
HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
HANDLE_ERROR( cudaStreamCreate( &stream1 ) );
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a0,
N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b0,
N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c0,
N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_a1,
N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b1,
N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c1,
N * sizeof(int) ) );
// allocate host locked memory, used to stream
HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault ) );
for (int i=0; i<FULL_DATA_SIZE; i++) {
host_a[i] = rand();
host_b[i] = rand();
}
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
// now loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+= N*2) {
// enqueue kernels in stream0 and stream1
kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );
}
HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf( "Time taken: %3.1f ms\n", elapsedTime );
// cleanup the streams and memory
HANDLE_ERROR( cudaFreeHost( host_a ) );
HANDLE_ERROR( cudaFreeHost( host_b ) );
HANDLE_ERROR( cudaFreeHost( host_c ) );
HANDLE_ERROR( cudaFree( dev_a0 ) );
HANDLE_ERROR( cudaFree( dev_b0 ) );
HANDLE_ERROR( cudaFree( dev_c0 ) );
HANDLE_ERROR( cudaFree( dev_a1 ) );
HANDLE_ERROR( cudaFree( dev_b1 ) );
HANDLE_ERROR( cudaFree( dev_c1 ) );
HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
return 0;
}
First I did a CUDA profiling using nvvp, and found the two kernels are not overlapping at all:
Some previous posts on SO stated that the profiler may disable the concurrent kernel execution, so I did a plain running. The total time in the kernel loop was reported as 2.2ms, but he profiler reported the execution time of each kernel as 1.1ms. This still implies no (or very poor) overlap between the two kernels.
I am using CUDA4.0 on Tesla M2090. It seems that the kernel resource requirement (~10s of MB) should be small on this device (6G), and concurrent execution should be practical. Not sure where is the problem. Should I do something special to enable concurrent kernels (some APIs, some environment setup...)?

Do you specify for which compute architecture the code should compile? The default is 1.0, which doesn't support concurrent kernels if I'm not mistaken. Try adding the following to your nvcc invocation:
--generate_code code=sm_21,arch=compute_20
I don't know which compute architecture is supported by your card, but you should be able to find that somewhere on the net. But maybe just try the above first, if it fails, try sm_20 instead of sm_21.

Related

ArrayFire Matrix Multiplication Vectorization

I am using ArrayFire library for signal processing. I am just curious about how to make more efficient my code. I read vectorization guide in docs, but i just ended up using gfor construct. Is it possible to improve efficiency ? Is there a better way for vectorization ? ( I hope there is :)
Note : I am aiming CUDA performance.
Here is the code which i am trying to improve :
#include <arrayfire.h>
#include <stdio.h>
#include <af/util.h>
static int proc_size = 1024;
static int fft_size = proc_size * 4;
static int staves = 288;
static int beams = 256;
static af::array S;
static af::array B;
static af::array R;
void fn()
{
gfor ( af::seq i, fft_size )
R( i , af::span ) = matmul( S( i , af::span ) , B( af::span , af::span , i ) );
}
int main(int, char **)
{
S = af::randn( fft_size , staves , c32 );
gfor ( af::seq i, fft_size )
S( i , af::span ) = af::randn( 1 , staves , c32 );
B = af::randn( staves , beams , fft_size , af::dtype::c32 );
R = af::constant( af::cfloat { 0 , 0 } , fft_size , beams );
try
{
af::setDevice( 0 );
af::info();
double time = af::timeit(fn);
printf( "Took %f secs.\n" , time );
}
catch (const af::exception &ex)
{
fprintf(stderr, "%s\n", ex.what());
throw;
}
return 0;
}

Using OpenGL and Cuda get a run time error for an Access violation reading location

I would just like to start by saying I am new to CUDA and OpenGL. I get the above runtime error when it runs glutMainLoop() and the mainloop runs glDrawPixels(). I have looked everywhere to figure out why this isn't working any help would be appreciated.
#include<stdio.h>
#include<stdlib.h>
#include<gl/glut.h>
#include<stb_image.c>
#include<cuda.h>
#include<cuda_gl_interop.h>
#include<gl/glext.h>
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
system("PAUSE");
//exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define DIM 512
#define GET_PROC_ADDRESS( str ) wglGetProcAddress( str )
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;
GLuint bufferObj; /* how OpenGL calls the buffer */
cudaGraphicsResource *resource; /* how CUDA calls this same buffer */
uchar4* devPtr; /* friendly pointer to the handles above */
const int size = DIM * DIM;
struct cuComplex
{
float r;
float i;
__device__ cuComplex(float a, float b) : r(a), i(b){ }
__device__ float magnitude2(void)
{
return r * r + i * i;
}
__device__ cuComplex operator *(const cuComplex & a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex & a)
{
return cuComplex(r+a.r, i+a.i);
}
};
__device__ int julia (int x, int y)
{
const float scale = 1.5;
float jx = scale *(float)(DIM/2 - x)/(DIM/2);
float jy = scale *(float)(DIM/2 - y)/(DIM/2);
cuComplex c(-0.8, 0.156);
cuComplex a(jx,jy);
int i = 0;
for(i=0; i<200; i++)
{
a = a * a + c;
if(a.magnitude2() > 1000)
{
return 0;
}
}
return 1;
}
__global__ void kernel(uchar4 *ptr)
{
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y * gridDim.x * blockDim.x;
int juliaValue = julia(x,y);
ptr[offset].x = 255 * juliaValue;
ptr[offset].y = 0;
ptr[offset].z = 0;
ptr[offset].w = 255;
}
static void Draw( void ) {
glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE,0); //ERROR HAPPENS HERE
glutSwapBuffers();
}
void display_and_exit() {
cudaDeviceProp prop;
int dev;
memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 0;
HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );
/*
* Interoperability with OpenGL requires that the CUDA device
* be specified by cudaGLSetGLDevice() before any other runtime calls.
*/
HANDLE_ERROR( cudaGLSetGLDevice( dev ) );
/*
* a bug in the Windows GLUT implementation prevents us from
* passing zero arguments to glutInit()
*/
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize (DIM, DIM);
glutInitWindowPosition (100, 100);
glutCreateWindow("CUDA and OpenGL example" );
printf("OpenGL version: %s\n", glGetString(GL_VERSION));
dim3 grid(DIM, DIM);
kernel<<<grid,1>>>(devPtr);
glutDisplayFunc(Draw);
/*
* Get pointers to functions (glext.h)
*/
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
glutMainLoop();
}
int main() {
display_and_exit();
return(0);
}
Unless you are using a Pixel Buffer Object, then this is exactly the sort of behavior you should expect from glDrawPixels (..., 0). That last parameter is a pointer to pixel data, it should be non-NULL whenever the source of pixel data is client memory. It can be NULL if you have a Pixel Buffer Object, the same way that the pointer in a call to glVertexPointer (...) can be NULL when using a Vertex Buffer Object.
You have acquired the necessary function pointers to create a Pixel Buffer Object in your code, however have not actually created one. Thus, passing NULL to glDrawPixels (...) will cause the driver to dereference a NULL pointer.

Why is ZLIB uncompress hanging?

I'm compiling ZLIB directly into the executable with is being compiled on VS2010 - 64-bit and this simple program always hangs, any ideas? More info below:
Call Stack:
zlibtest.exe!inflate(z_stream_s * strm, int flush) Line 607
zlibtest.exe!uncompress(unsigned char * dest, unsigned long * destLen, const unsigned char * source, unsigned long sourceLen) Line 44
zlibtest.exe!main(int argc, char * * argv) Line 137
It hangs here:
for (;;)
switch (state->mode) {
case HEAD:
if (state->wrap == 0) {
state->mode = TYPEDO;
break;
}
**NEEDBITS(16);**
Program:
#include "zlib.h"
int main( int argc, char * argv[] )
{
char * buffer = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
ULONG buffer_size = strlen( buffer ) + 1;
ULONG compressed_size = compressBound( buffer_size );
BYTE * compressed_buffer = new BYTE[ compressed_size ];
if ( Z_OK != compress2( compressed_buffer, &compressed_size, ( Bytef * ) buffer, buffer_size, Z_DEFAULT_COMPRESSION ) )
printf( "Failed to compress." );
ULONG uncompressed_size = buffer_size;
BYTE * uncompressed_buffer = new BYTE[ uncompressed_size ];
if ( Z_OK != uncompress( uncompressed_buffer, &uncompressed_size, compressed_buffer, compressed_size ) )
printf( "Failed to uncompress" );
printf( "Press <ENTER> to exit..." );
std::cin.ignore();
}

C-Style unsigned char parsing and manipulation in C/C++ - segmentation fault

Note that I'm using a C++ compiler ( hence, the cast on the calloc function calls) to do this, but the code is essentially C.
Basically, I have a typedef to an unsigned char known as viByte, which I'm using to create a string buffer to parse a file from binary (a TGA file, to be exact - but, that's irrelevant).
I'm writing basic functions for it right now; append, prepend, new, etc.
The problem is that, on the first iteration of the first loop in viByteBuf_Prepend, I get a segmentation fault. I need to know why, exactly, as this is something which could keep me up all night without some pointers (pun intended).
I also would like to know if my algorithms are correct in terms of how the buffer is pre-pending the viByte string. For example, I have a feeling that using memset too much might be a bad idea, and whether or not my printf format for the unsigned char is correct (I have a feeling it isn't, as nothing is getting output to my console).
Compiling on GCC, Linux.
Ze Code
#ifdef VI_BYTEBUF_DEBUG
void viByteBuf_TestPrepend( void )
{
viByteBuf* buf = viByteBuf_New( 4 );
buf->str = ( viByte* ) 0x1;
printf(" Before viByteBuf_Prepend => %uc ", buf->str);
viByteBuf_Prepend( buf, 3, ( viByte* ) 0x2 );
printf(" After viByteBuf_Prepend => %uc ", buf->str);
}
#endif
viByteBuf* viByteBuf_New( unsigned int len )
{
viByteBuf* buf = ( viByteBuf* ) calloc( sizeof( viByteBuf ), 1 );
const int buflen = len + 1;
buf->str = ( viByte* ) calloc( sizeof( viByte ), buflen );
buf->len = buflen;
buf->str[ buflen ] = '\0';
return buf;
}
void viByteBuf_Prepend( viByteBuf* buf, unsigned int len, viByte* str )
{
unsigned int pos, i;
const unsigned int totallen = buf->len + len;
viByteBuf* tmp = viByteBuf_New( totallen );
viByte* strpos = buf->str;
memset( tmp->str, 0, tmp->len );
int index;
for( i = 0; i < buf->len; ++i )
{
index = ( buf->len - i ) - 1;
*strpos = buf->str[ 0 ];
++strpos;
}
memset( buf->str, 0, buf->len );
printf( "%uc\n", buf->str );
i = totallen;
for ( pos = 0; pos < len; ++pos )
{
tmp->str[ pos ] = str[ pos ];
tmp->str[ i ] = buf->str[ i ];
--i;
}
memset( buf->str, 0, buf->len );
buf->len = tmp->len;
memcpy( buf->str, tmp->str, tmp->len );
viByteBuf_Free( tmp );
//memset( )
//realloc( ( viByteBuf* ) buf, sizeof( viByteBuf ) * tmp->len );
}
Many thank yous.
Update
Sorry, I should have explicitly posted the code where the segmentation fault lies. It is right here:
for( i = 0; i < buf->len; ++i )
{
index = ( buf->len - i ) - 1;
*strpos = buf->str[ 0 ]; //<--segmentation fault.
++strpos;
}
On your code you have buf->str[ buflen ] = '\0';, but you only allocate space for buflen. I think you meant buf->str[ len ] = '\0';.

Output of cuda program is not what was expected

#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
__global__ void setVal(char **c){
c[(blockIdx.y * gridDim.x) + blockIdx.x] = "hello\0";
}
int main(){
char **gpu = NULL;
cudaMalloc((void**)&gpu, 6 * sizeof(char *));
int i;
/*
I cannot access second level directly
for( i =0 ; i < 6 ;i++){
cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
}*/
dim3 grid(3,2);
setVal<<<grid, 1>>>(gpu);
char *p = (char*)malloc(10 * sizeof(char));
char *x[6];
cudaMemcpy(x, gpu, 6*sizeof(char*), cudaMemcpyDeviceToHost);
for( i =0 ; i< 6; i++){
cudaMemcpy(p, x[i], 10*sizeof(char), cudaMemcpyDeviceToHost);
//put synchronize here if problem
printf("%s\n",p);
}
getchar();
return 0;
}
Based on all the suggestions, i revised my code to make my concept correct. But, the code is still not working :(. Any help will be appreciated
Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.
#include<stdio.h>
#include<string.h>
__global__ void setValues(char** word)
{
volatile char* myWord = word[blockIdx.x];
myWord[0] = 'H';
myWord[1] = 'o';
myWord[2] = 'l';
myWord[3] = 'a';
myWord[4] = '\0';
}
int main()
{
const size_t bufferSize = 32;
const int nObjects = 10;
char* h_x[nObjects];
char** d_x = 0;
cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) );
for ( int i=0; i < nObjects; i++ )
{
h_x[i] = NULL;
cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) );
printf("h_x[%d] = %lx\n",i,(unsigned long)h_x[i]);
}
cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice);
printf("Copied h_x[] to d_x[]\n");
char msg[] = "Hello World!";
cudaMemcpy( h_x[0], msg, 13*sizeof(char), cudaMemcpyHostToDevice );
/* Force Thread Synchronization */
cudaError err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
setValues<<<nObjects,1>>>(d_x);
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
printf("Kernel Completed Successfully. Woot.\n\n");
char p[bufferSize];
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
cudaMemcpy( h_x, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost);
printf("d_x = %lx\n", (unsigned long)d_x );
printf("h_x = %lx\n", (unsigned long)h_x );
for ( int i=0; i < nObjects; i++ )
{
cudaMemcpy( &p, h_x[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost);
printf("%d p[] = %s\n",i,p);
}
/* Force Thread Synchronization */
err = cudaThreadSynchronize();
/* Check for and display Error */
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
getchar();
return 0;
}
As #Jon notes, you can't pass x (as you had declared) it to the GPU, because it's an address which lives on the CPU. In the code above, I create an array of char*'s and pass them to a char** which I also allocated on the GPU. Hope this helps!
The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void setValues(char *arr){
arr[blockIdx.y * gridDim.x + blockIdx.x] = '4';
}
int main() {
const int NCHARS=6;
char *xd;
cudaMalloc(&xd, NCHARS);
dim3 grid(3,2);
setValues<<<grid,1>>>(xd);
char *p;
p = (char*) malloc(20*sizeof(char));
strcpy(p,"");
cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost);
p[NCHARS]='\0';
printf("<%s>\n", p);
getchar();
cudaFree(xd);
return 0;
}
There are several problems I'm seeing here. Here are some of the most obvious ones:
First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.
Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.