Problem feeding Thrust vector into getrf/getri - c++

Continuing my CUDA beginner's adventure, I've been introduced to Thrust, which seems like a convenient library that saves me the hassle of explicit memory (de-)allocation.
I've already tried combining it with a few cuBLAS routines, e.g. gemv, by generating a raw pointer to the underlying storage with thrust::raw_pointer_cast(array.data()) and then feeding this to the routines, and it works just fine.
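For reference, the gemv pattern I mean is roughly the following (a simplified sketch with illustrative values and names, not my actual code):
#include <thrust/device_vector.h>
#include <cublas_v2.h>
void gemv_example()
{
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<double> A(4); // 2x2 matrix, column-major
thrust::device_vector<double> x(2, 1.0);
thrust::device_vector<double> y(2, 0.0);
A[0] = 1; A[1] = 3; A[2] = 2; A[3] = 4;
const double alpha = 1.0, beta = 0.0;
// y = alpha*A*x + beta*y, passing raw device pointers obtained from Thrust
cublasDgemv(handle, CUBLAS_OP_N, 2, 2, &alpha,
thrust::raw_pointer_cast(A.data()), 2,
thrust::raw_pointer_cast(x.data()), 1, &beta,
thrust::raw_pointer_cast(y.data()), 1);
cublasDestroy(handle);
}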
The current task is to get the inverse of a matrix, and for that I'm using getrfBatched and getriBatched. From the documentation:
cublasStatus_t cublasDgetrfBatched(cublasHandle_t handle,
int n,
double *Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize);
where
Aarray - device - array of pointers to <type> array
Naturally I thought I could use another layer of Thrust vector to express this array of pointers and again feed its raw pointer to cuBLAS, so here's what I did:
void test()
{
thrust::device_vector<double> in(4);
in[0] = 1;
in[1] = 3;
in[2] = 2;
in[3] = 4;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
thrust::device_vector<double> out(4, 0);
thrust::device_vector<int> pivot(2, 0);
int info = 0;
thrust::device_vector<double*> in_array(1);
in_array[0] = thrust::raw_pointer_cast(in.data());
thrust::device_vector<double*> out_array(1);
out_array[0] = thrust::raw_pointer_cast(out.data());
stat = cublasDgetrfBatched(handle, 2,
(double**)thrust::raw_pointer_cast(in_array.data()), 2,
thrust::raw_pointer_cast(pivot.data()), &info, 1);
stat = cublasDgetriBatched(handle, 2,
(const double**)thrust::raw_pointer_cast(in_array.data()), 2,
thrust::raw_pointer_cast(pivot.data()),
(double**)thrust::raw_pointer_cast(out_array.data()), 2, &info, 1);
}
When executed, stat says CUBLAS_STATUS_SUCCESS (0) and info says 0 (execution successful), yet if I try to access the elements of in, pivot or out with standard bracket notation, I hit a thrust::system::system_error. Seems to me that the corresponding memory got corrupted somehow.
Anything obvious that I'm missing here?

The documentation for cublas<t>getrfBatched indicates that the infoArray parameter is expected to be a pointer to device memory.
Instead you have passed a pointer to host memory:
int info = 0;
...
stat = cublasDgetrfBatched(handle, 2,
(double**)thrust::raw_pointer_cast(in_array.data()), 2,
thrust::raw_pointer_cast(pivot.data()), &info, 1);
^^^^^
If you run your code with cuda-memcheck (always a good practice, in my opinion, any time you are having trouble with a CUDA code, before asking others for help) you will receive an error of "invalid global write of size 4". This is because a kernel launched by cublasDgetrfBatched() attempts to write the info data through the ordinary host pointer you provided, and writing to a host address from device code is illegal in CUDA.
CUBLAS itself does not trap errors like this, for performance reasons. However, the thrust API uses more rigorous synchronization and error checking in some cases. Therefore, the use of thrust code after this error reports the error, even though the error had nothing to do with thrust (it was an asynchronously reported error from a previous kernel launch).
The solution is straightforward; provide device storage for info:
$ cat t329.cu
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
void test()
{
thrust::device_vector<double> in(4);
in[0] = 1;
in[1] = 3;
in[2] = 2;
in[3] = 4;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
thrust::device_vector<double> out(4, 0);
thrust::device_vector<int> pivot(2, 0);
thrust::device_vector<int> info(1, 0);
thrust::device_vector<double*> in_array(1);
in_array[0] = thrust::raw_pointer_cast(in.data());
thrust::device_vector<double*> out_array(1);
out_array[0] = thrust::raw_pointer_cast(out.data());
stat = cublasDgetrfBatched(handle, 2,
(double**)thrust::raw_pointer_cast(in_array.data()), 2,
thrust::raw_pointer_cast(pivot.data()), thrust::raw_pointer_cast(info.data()), 1);
stat = cublasDgetriBatched(handle, 2,
(const double**)thrust::raw_pointer_cast(in_array.data()), 2,
thrust::raw_pointer_cast(pivot.data()),
(double**)thrust::raw_pointer_cast(out_array.data()), 2, thrust::raw_pointer_cast(info.data()), 1);
for (int i = 0; i < 4; i++) {
double test = in[i];
std::cout << test << std::endl;
}
}
int main(){
test();
}
$ nvcc -o t329 t329.cu -lcublas
t329.cu(12): warning: variable "stat" was set but never used
$ cuda-memcheck ./t329
========= CUDA-MEMCHECK
3
0.333333
4
0.666667
========= ERROR SUMMARY: 0 errors
$
You'll note that this change in the above code is applied to the usage of both cublas calls, as the infoArray parameter has the same expectations for both.
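If you want to surface such asynchronous errors yourself rather than relying on a later thrust call, one option (my addition, not part of the fix above) is to synchronize and inspect the error state after the cublas calls, e.g. at the end of test():
// force completion of prior asynchronous work and report any pending error
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
std::cout << "async error: " << cudaGetErrorString(err) << std::endl;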

Related

CUDA deep copy with other data

I'm trying to copy my struct Test to the GPU, change the data, and copy it back to the CPU. This is what I've tried so far; note that my code crashes on the last (commented-out) line:
struct Test {
int x, y;
int* data;
};
// Test kernel
static __global__ void TestKernel(Test* d) {
const uint32_t index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
// increment some values
++d->data[0];
++d->data[1];
++d->data[2];
++d->x;
++d->y;
}
// Test snippet:
Test* host = new Test{ 10, 20,new int[3]{1, 2, 3} };
Test* device = nullptr;
int* deviceData;
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));
TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));
printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]); // crashes
Note that I've seen multiple related questions, but none of them also copy some data apart from the deep copied data pointer.
My error message:
Exception thrown at 0x00007FF7A2C5297D in VFD.exe: 0xC0000005: Access
violation reading location 0x0000000B01600208.
Note that I'm probably copying the memory incorrectly, or something along those lines. If I remove the COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost)); line I'm able to access the host->data array, but the x and y values stay unincremented for obvious reasons.
When you cudaMemcpy the struct back from the GPU into the host struct, you overwrite its data pointer with the device data pointer, which is not valid on the host.
In order to fix it you need to restore the original data pointer (and then copy the actual data).
Working version:
struct Test
{
int x, y;
int* data;
};
static __global__ void TestKernel(Test* d)
{
++(d->data[0]);
++(d->data[1]);
++(d->data[2]);
++(d->x);
++(d->y);
}
int main()
{
int* hostData = new int[3]{ 1, 2, 3 };
Test* host = new Test{ 10, 20, hostData };
int* deviceData = nullptr;
Test* device = nullptr;
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(int*), cudaMemcpyHostToDevice));
TestKernel<<<1, 1>>>(device);
COMPUTE_SAFE(cudaDeviceSynchronize());
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
host->data = hostData; // Restore host data pointer
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));
printf("\nhost:\n");
printf("%d %d\n", host->x, host->y);
printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
return 0;
}
Output:
host:
11 21
2 3 4
Some notes:
For clarity I added () in the kernel increment statements.
You used sizeof(float*) for the data pointer, although it is an int* (of course the size is the same).
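An alternative to restoring the pointer (a sketch of my own, not from the answer above) is to copy the struct into a temporary so that host->data is never overwritten in the first place:
Test tmp;
COMPUTE_SAFE(cudaMemcpy(&tmp, device, sizeof(Test), cudaMemcpyDeviceToHost));
host->x = tmp.x; // copy back only the scalar members
host->y = tmp.y;
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));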

cudaMemcpyAsync from page-locked host memory to device memory returns reading access violation error

I am coding a memory-heavy multi-GPU CUDA program. I found that my cudaMemcpyAsync calls weren't actually performed asynchronously. After some research I found out that I would have to copy the data from page-locked host memory to the device. So what I now attempt to do is to copy a part of the whole host input data array into a chunk of page-locked host memory, and then copy that to the device. The H2H cudaMemcpyAsync works fine without outputting any errors; the H2D afterwards gives me this error: Access violation reading address 0xWHATEVER. Additionally, in the sample code I am providing there is an identical error with the cudaMallocHost call (this works fine in the main project).
I have tried to make a much simpler sample project (the one below). This still gives me errors, so I don't really know what to do.
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#ifndef KERNEL_H
#define KERNEL_H
typedef struct
{
int device = 0;
double *d_array, //device array ptr
*h_array_pl; //page locked array ptr
} IOdataPtr;
#endif
void printCudaError(cudaError_t error, char err_src[]) { //error printing function to reduce line count
if (error != cudaSuccess) {
printf("Error: %i while performing %s \n", error, err_src);
}
}
int main() {
const int GPU_N = 2;
const int CALC_N = 1024*1024*1024;
cudaError_t error;
cudaStream_t stream[GPU_N];
double *h_array;
h_array = (double*)malloc(sizeof(double) * CALC_N);
for (int i = 0; i < CALC_N; i++) {
h_array[i] = 2;
}
IOdataPtr ptr[GPU_N];
for (int i = 0; i < GPU_N; i++) {
//normal host alloc
ptr[i].device = i;
error = cudaSetDevice(ptr[i].device); //select device
printCudaError(error, "cudaSetDevice");
cudaStreamCreate(&stream[i]);
printCudaError(error, "cudaStreamCreate");
error = cudaMalloc((void**)&(ptr[i].d_array),
CALC_N / GPU_N * sizeof(double));
printCudaError(error, "cudaMalloc");
error = cudaMallocHost((void **)&ptr[i].h_array_pl,
CALC_N / GPU_N * sizeof(double));
printCudaError(error, "cudaMallocHost");
//xre
//data -> pl
error = cudaMemcpyAsync(ptr[i].h_array_pl, //dst
&h_array[i * CALC_N / GPU_N], //src
CALC_N / GPU_N * sizeof(double), //amt
cudaMemcpyHostToHost, //kind
stream[i]); //stream
printCudaError(error, "cudaMemcpyAsync H2H");
//pl -> dev
error = cudaMemcpyAsync(ptr[i].d_array, //dst
ptr[i].h_array_pl, //src
CALC_N / GPU_N * sizeof(double), //amt
cudaMemcpyHostToDevice, //kind
stream[i]); //stream
printCudaError(error, "cudaMemcpyAsync H2D");
cudaStreamDestroy(stream[i]);
error = cudaFree(ptr[i].d_array);
printCudaError(error, "cudaFree");
}
printf("Well it worked");
free(h_array);
getchar();
}
The output my code gives me:
Error: 2 while performing cudaMallocHost
Error: 2 while performing cudaMemcpyAsync H2H
Error: 2 while performing cudaMemcpyAsync H2D
Error: 2 while performing cudaFree
Well it worked
Error 2 is cudaErrorMemoryAllocation
In the code you currently have posted, this line of code is wrong:
error = cudaMemcpyAsync(ptr[i].d_array, &ptr[i].h_array_pl, CALC_N / GPU_N * sizeof(double), cudaMemcpyHostToDevice, stream[i]);
^
That ampersand doesn't belong there. ptr[i].h_array_pl is already a pointer to the source of the data transfer; you should not be taking the address of that pointer.
Using the address of this pointer as the data source of the copy operation would result in incorrect and illegal host memory accesses for the size of the transfer indicated in this code. Whether or not this would be detected depends on a number of factors, but it is quite possibly the reason for the "Access violation reading location..." report, which generally refers to an illegal access to host memory.
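In other words, the H2D copy should pass the pointer value itself (a sketch of the corrected call):
//pl -> dev
error = cudaMemcpyAsync(ptr[i].d_array, //dst (device)
ptr[i].h_array_pl, //src (pinned host), no ampersand
CALC_N / GPU_N * sizeof(double), //amt
cudaMemcpyHostToDevice, //kind
stream[i]); //stream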

OpenCL vs CUDA: Pinned memory

I have been porting my RabbitCT CUDA implementation to OpenCL and I'm running into issues with pinned memory.
For CUDA, a host buffer is created that buffers the input images to be processed in pinned memory. This allows the host to capture the next batch of input images while the GPU processes the current batch. A simplified mockup of my CUDA implementation is as follows:
// globals
float** hostProjBuffer = new float*[BUFFER_SIZE];
float* devProjection[STREAMS_MAX];
cudaStream_t stream[STREAMS_MAX];
void initialize()
{
// initiate streams
for( uint s = 0; s < STREAMS_MAX; s++ ){
cudaStreamCreateWithFlags (&stream[s], cudaStreamNonBlocking);
cudaMalloc( (void**)&devProjection[s], imgSize);
}
// initiate buffers
for( uint b = 0; b < BUFFER_SIZE; b++ ){
cudaMallocHost((void **)&hostProjBuffer[b], imgSize);
}
}
// main function called for all input images
void backproject(imgdata* r)
{
uint projNr = r->imgnr % BUFFER_SIZE;
uint streamNr = r->imgnr % STREAMS_MAX;
// When buffer is filled, wait until work in current stream has finished
if(projNr == 0) {
cudaStreamSynchronize(stream[streamNr]);
}
// copy received image data to buffer (maps double precision to float)
std::copy(r->I_n, r->I_n+(imgSizeX * imgSizeY), hostProjBuffer[projNr]);
// copy image and matrix to device
cudaMemcpyAsync( devProjection[streamNr], hostProjBuffer[projNr], imgSize, cudaMemcpyHostToDevice, stream[streamNr] );
// call kernel
backproject<<<numBlocks, threadsPerBlock, 0 , stream[streamNr]>>>(devProjection[streamNr]);
}
So, for CUDA, I create a pinned host pointer for each buffer item and copy the data to the device before executing kernel of each stream.
For OpenCL I initially did something similar when following the Nvidia OpenCL Best Practices Guide. Here they recommend creating two buffers, one for copying the kernel data to and one for the pinned memory. However, this leads to the implementation using double the device memory as both the kernel and pinned memory buffers are allocated on the device.
To get around this memory issue, I created an implementation where only a mapping is made to the device as it is needed. This can be seen in the following implementation:
// globals
float** hostProjBuffer = new float* [BUFFER_SIZE];
cl_mem devProjection[STREAMS_MAX], devMatrix[STREAMS_MAX];
cl_command_queue queue[STREAMS_MAX];
// initiate streams
void initialize()
{
for( uint s = 0; s < STREAMS_MAX; s++ ){
queue[s] = clCreateCommandQueueWithProperties(context, device, NULL, &status);
devProjection[s] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, imgSize, NULL, &status);
}
}
// main function called for all input images
void backproject(imgdata* r)
{
const uint projNr = r->imgnr % BUFFER_SIZE;
const uint streamNr = r->imgnr % STREAMS_MAX;
// when buffer is filled, wait until work in current stream has finished
if(projNr == 0) {
status = clFinish(queue[streamNr]);
}
// map host memory region to device buffer
hostProjBuffer[projNr] = (float*) clEnqueueMapBuffer(queue[streamNr], devProjection[streamNr], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION, 0, imgSize, 0, NULL, NULL, &status);
// copy received image data to hostbuffers
std::copy(imgPtr, imgPtr + (imgSizeX * imgSizeY), hostProjBuffer[projNr]);
// unmap the allocated pinned host memory
clEnqueueUnmapMemObject(queue[streamNr], devProjection[streamNr], hostProjBuffer[projNr], 0, NULL, NULL);
// set stream specific arguments
clSetKernelArg(kernel, 0, sizeof(devProjection[streamNr]), (void *) &devProjection[streamNr]);
// launch kernel
clEnqueueNDRangeKernel(queue[streamNr], kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
clFlush(queue[streamNr]);
clFinish(queue[streamNr]); //should be removed!
}
This implementation does use a similar amount of device memory as the CUDA implementation. However, I have been unable to get this last code example working without a clFinish after each loop, which significantly hampers the performance of the application. This indicates data is lost as the host moves ahead of the kernel. I tried increasing my buffer size to the number of input images, but this did not work either. So somehow during execution, the hostBuffer data gets lost.
So, with the goal to write OpenCL code similar to CUDA, I have three questions:
What is the recommended implementation for OpenCL pinned memory?
Is my OpenCL implementation similar to how CUDA handles pinned memory?
What causes the wrong data to be used in the OpenCL example?
Thanks in advance!
Kind regards,
Remy
PS: Question initially asked at the Nvidia developer forums

Segmentation fault when sending struct having std::vector member

Why do I get the following error for the following code with the mpirun -np 2 ./out command? I called make_layout() after resizing the std::vector, so normally I should not get this error. It works if I do not resize. What is the reason?
main.cpp:
#include <iostream>
#include <vector>
#include "mpi.h"
MPI_Datatype MPI_CHILD;
struct Child
{
std::vector<int> age;
void make_layout();
};
void Child::make_layout()
{
int nblock = 1;
int age_size = age.size();
int block_count[nblock] = {age_size};
MPI_Datatype block_type[nblock] = {MPI_INT};
MPI_Aint offset[nblock] = {0};
MPI_Type_struct(nblock, block_count, offset, block_type, &MPI_CHILD);
MPI_Type_commit(&MPI_CHILD);
}
int main()
{
int rank, size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
Child kid;
kid.age.resize(5);
kid.make_layout();
int datasize;
MPI_Type_size(MPI_CHILD, &datasize);
std::cout << datasize << std::endl; // output: 20 (5x4 seems OK).
if (rank == 0)
{
MPI_Send(&kid, 1, MPI_CHILD, 1, 0, MPI_COMM_WORLD);
}
if (rank == 1)
{
MPI_Recv(&kid, 1, MPI_CHILD, 0, 0, MPI_COMM_WORLD, NULL);
}
MPI_Finalize();
return 0;
}
Error message:
*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
Failing at address: 0x14ae7b8
[ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x113d0)[0x7fe1ad91c3d0]
[ 1] /lib/x86_64-linux-gnu/libc.so.6(cfree+0x22)[0x7fe1ad5c5a92]
[ 2] ./out[0x400de4]
[ 3] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7fe1ad562830]
[ 4] ./out[0x400ec9]
*** End of error message ***
Here is an example with several std::vector members that uses MPI datatypes with absolute addresses:
struct Child
{
int foo;
std::vector<float> bar;
std::vector<int> baz;
Child() : dtype(MPI_DATATYPE_NULL) {}
~Child() { if (dtype != MPI_DATATYPE_NULL) MPI_Type_free(&dtype); }
const MPI_Datatype mpi_dtype();
void invalidate_dtype();
private:
MPI_Datatype dtype;
void make_dtype();
};
const MPI_Datatype Child::mpi_dtype()
{
if (dtype == MPI_DATATYPE_NULL)
make_dtype();
return dtype;
}
void Child::invalidate_dtype()
{
if (dtype != MPI_DATATYPE_NULL)
MPI_Type_free(&dtype);
}
void Child::make_dtype()
{
const int nblock = 3;
int block_count[nblock] = {1, bar.size(), baz.size()};
MPI_Datatype block_type[nblock] = {MPI_INT, MPI_FLOAT, MPI_INT};
MPI_Aint offset[nblock];
MPI_Get_address(&foo, &offset[0]);
MPI_Get_address(&bar[0], &offset[1]);
MPI_Get_address(&baz[0], &offset[2]);
MPI_Type_struct(nblock, block_count, offset, block_type, &dtype);
MPI_Type_commit(&dtype);
}
Sample use of that class:
Child kid;
kid.foo = 5;
kid.bar.resize(5);
kid.baz.resize(10);
if (rank == 0)
{
MPI_Send(MPI_BOTTOM, 1, kid.mpi_dtype(), 1, 0, MPI_COMM_WORLD);
}
if (rank == 1)
{
MPI_Recv(MPI_BOTTOM, 1, kid.mpi_dtype(), 0, 0, MPI_COMM_WORLD, NULL);
}
Notice the use of MPI_BOTTOM as the buffer address. MPI_BOTTOM specifies the bottom of the address space, which is 0 on architectures with flat address space. Since the offsets passed to MPI_Type_create_struct are the absolute addresses of the structure members, when those are added to 0, the result is again the absolute address of each structure member. Child::mpi_dtype() returns a lazily constructed MPI datatype specific to that instance.
Since resize() reallocates memory, which could result in the data being moved to a different location in memory, the invalidate_dtype() method should be used to force the recreation of the MPI datatype after resize() or any other operation that might trigger memory reallocation:
// ...
kid.bar.resize(100);
kid.invalidate_dtype();
// MPI_Send / MPI_Recv
Please excuse any sloppy C++ code above.
The problem here is that you're telling MPI to send a block of integers starting at &kid, but that's not where your data is. &kid points to the std::vector object itself, which holds an internal pointer to your block of integers allocated somewhere on the heap.
Replace &kid with kid.age.data() and it should work. The reason it "works" when you don't resize is that the vectors will be of 0 size, so MPI will try to send an empty message and no actual memory access takes place.
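A minimal sketch of that change (everything else stays the same; MPI_STATUS_IGNORE is used instead of NULL for portability):
if (rank == 0)
{
MPI_Send(kid.age.data(), 1, MPI_CHILD, 1, 0, MPI_COMM_WORLD);
}
if (rank == 1)
{
MPI_Recv(kid.age.data(), 1, MPI_CHILD, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}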
Be careful, you are facing several problems.
First, std::vector stores its elements on the heap, so the data is not really stored inside your struct.
Second, you cannot reliably send STL containers even between dynamic libraries, and the same is true between application instances, because they may be compiled with different versions of the STL and behave differently on different architectures.
Here is a good answer about this part of the question: https://stackoverflow.com/a/22797419/440168
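For the simple case of a single vector, a common portable pattern (an illustrative sketch, not taken from the linked answer) is to send the element count first and then the raw contiguous data:
if (rank == 0)
{
int n = static_cast<int>(kid.age.size());
MPI_Send(&n, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
MPI_Send(kid.age.data(), n, MPI_INT, 1, 1, MPI_COMM_WORLD);
}
if (rank == 1)
{
int n = 0;
MPI_Recv(&n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
kid.age.resize(n); // allocate room before receiving the payload
MPI_Recv(kid.age.data(), n, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}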

Multithreading for image processing at GPU using CUDA

Problem Statement:
I have to continuously process 8-megapixel images captured from a camera. Several image processing algorithms have to run on each image, such as color interpolation, color transformation, etc. These operations take a long time on the CPU, so I decided to do them on the GPU using CUDA kernels. I have already written a working CUDA kernel for color transformation, but I still need more of a boost in performance.
There are basically two computational costs:
Copying the source image from CPU to GPU and vice versa
Processing of the source image on the GPU
While the image is being copied from CPU to GPU, nothing else happens. Similarly, while the GPU is processing the image, nothing else happens.
MY IDEA: I want to use multithreading to save some time. I want to capture the next image while the previous image is being processed on the GPU, so that when the GPU finishes the previous image, the next one is already there to be transferred from CPU to GPU.
What I need: I am completely new to the world of multithreading, and I am watching some tutorials and other material to learn more about it. So, I am looking for suggestions about the proper steps and proper logic.
I'm not sure you really need threads for this. CUDA has the ability to allow for asynchronous concurrent execution between host and device (without the need to use multiple CPU threads). What you're asking for is a pretty standard "pipelined" algorithm. It would look something like this:
$ cat t832.cu
#include <stdio.h>
#define IMGSZ 8000000
// for this example, NUM_FRAMES must be less than 255
#define NUM_FRAMES 128
#define nTPB 256
#define nBLK 64
unsigned char cur_frame = 0;
unsigned char validated_frame = 0;
bool validate_image(unsigned char *img) {
validated_frame++;
for (int i = 0; i < IMGSZ; i++) if (img[i] != validated_frame) {printf("image validation failed at %d, was: %d, should be: %d\n",i, img[i], validated_frame); return false;}
return true;
}
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data) {
validate_image((unsigned char *)data);
}
bool capture_image(unsigned char *img){
for (int i = 0; i < IMGSZ; i++) img[i] = cur_frame;
if (++cur_frame == NUM_FRAMES) {cur_frame--; return true;}
return false;
}
__global__ void img_proc_kernel(unsigned char *img){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
while(idx < IMGSZ){
img[idx]++;
idx += gridDim.x*blockDim.x;}
}
int main(){
// setup
bool done = false;
unsigned char *h_imgA, *h_imgB, *d_imgA, *d_imgB;
size_t dsize = IMGSZ*sizeof(unsigned char);
cudaHostAlloc(&h_imgA, dsize, cudaHostAllocDefault);
cudaHostAlloc(&h_imgB, dsize, cudaHostAllocDefault);
cudaMalloc(&d_imgA, dsize);
cudaMalloc(&d_imgB, dsize);
cudaStream_t st1, st2;
cudaStreamCreate(&st1); cudaStreamCreate(&st2);
unsigned char *cur = h_imgA;
unsigned char *d_cur = d_imgA;
unsigned char *nxt = h_imgB;
unsigned char *d_nxt = d_imgB;
cudaStream_t *curst = &st1;
cudaStream_t *nxtst = &st2;
done = capture_image(cur); // grabs a frame and puts it in cur
// enter main loop
while (!done){
cudaMemcpyAsync(d_cur, cur, dsize, cudaMemcpyHostToDevice, *curst); // send frame to device
img_proc_kernel<<<nBLK, nTPB, 0, *curst>>>(d_cur); // process frame
cudaMemcpyAsync(cur, d_cur, dsize, cudaMemcpyDeviceToHost, *curst);
// insert a cuda stream callback here to copy the cur frame to output
cudaStreamAddCallback(*curst, &my_callback, (void *)cur, 0);
cudaStreamSynchronize(*nxtst); // prevent overrun
done = capture_image(nxt); // capture nxt image while GPU is processing cur
unsigned char *tmp = cur;
cur = nxt;
nxt = tmp; // ping - pong
tmp = d_cur;
d_cur = d_nxt;
d_nxt = tmp;
cudaStream_t *st_tmp = curst;
curst = nxtst;
nxtst = st_tmp;
}
}
$ nvcc -o t832 t832.cu
$ cuda-memcheck ./t832
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
There are also many CUDA sample codes which may be helpful, such as simpleStreams, asyncAPI, and simpleCallbacks.
Since your question is very broad, I can only offer the following general advice:
1) Use CUDA streams
When using more than one CUDA stream, the CPU->GPU memory transfer, the GPU processing, and the GPU->CPU memory transfer can overlap. This way the processing of the next image can already begin while the previous result is being transferred back.
You can also decompose each frame: use n streams per frame and launch the image processing kernels n times with an offset, as in the sketch below.
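A rough sketch of that per-frame decomposition (the names imgPixels, h_img, d_img, streams and the process kernel are hypothetical, and the chunk size assumes the image divides evenly):
const int NCHUNKS = 4; // one stream per chunk
int chunkPixels = imgPixels / NCHUNKS; // assumes imgPixels % NCHUNKS == 0
size_t chunkBytes = chunkPixels * sizeof(float); // h_img / d_img are float*
for (int c = 0; c < NCHUNKS; ++c) {
size_t off = (size_t)c * chunkPixels;
// h_img must be pinned (cudaMallocHost) for the async copies to overlap with kernels
cudaMemcpyAsync(d_img + off, h_img + off, chunkBytes, cudaMemcpyHostToDevice, streams[c]);
process<<<blocks, threads, 0, streams[c]>>>(d_img + off, chunkPixels);
cudaMemcpyAsync(h_img + off, d_img + off, chunkBytes, cudaMemcpyDeviceToHost, streams[c]);
}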
2) Apply the producer-consumer scheme
The producer thread captures the frames from the camera and stores them in a thread-safe container. The consumer thread(s) fetch(es) a frame from this source container, upload(s) it to the GPU using its/their own CUDA stream(s), launches the kernel and copies the result back to the host.
Each consumer thread would synchronize with its stream(s) before trying to get a new image from the source container.
A simple implementation could look like this:
#include <vector>
#include <thread>
#include <memory>
struct ThreadSafeContainer{ /*...*/ };
struct Producer
{
Producer(std::shared_ptr<ThreadSafeContainer> c) : container(c)
{
}
void run()
{
while(true)
{
// grab image from camera
// store image in container
}
}
std::shared_ptr<ThreadSafeContainer> container;
};
struct Consumer
{
Consumer(std::shared_ptr<ThreadSafeContainer> c) : container(c)
{
cudaStreamCreate(&stream);
}
~Consumer()
{
cudaStreamDestroy(stream);
}
void run()
{
while(true)
{
// read next image from container
// upload to GPU
cudaMemcpyAsync(...,...,...,stream);
// run kernel
kernel<<<..., ..., ..., stream>>>(...);
// copy results back
cudaMemcpyAsync(...,...,...,stream);
// wait for results
cudaStreamSynchronize(stream);
// do something with the results
}
}
std::shared_ptr<ThreadSafeContainer> container;
cudaStream_t stream; // or multiple streams per consumer
};
int main()
{
// create an instance of ThreadSafeContainer which will be shared between Producer and Consumer instances
auto container = std::make_shared<ThreadSafeContainer>();
// create one instance of Producer, pass the shared container as an argument to the constructor
auto p = std::make_shared<Producer>(container);
// create a separate thread which executes Producer::run
std::thread producer_thread(&Producer::run, p);
const int consumer_count = 2;
std::vector<std::thread> consumer_threads;
std::vector<std::shared_ptr<Consumer>> consumers;
// create as many consumers as specified
for (int i=0; i<consumer_count;++i)
{
// create one instance of Consumer, pass the shared container as an argument to the constructor
auto c = std::make_shared<Consumer>(container);
// create a separate thread which executes Consumer::run
consumer_threads.push_back(std::thread(&Consumer::run, c));
}
// wait for the threads to finish, otherwise the program will just exit here and the threads will be killed
// in this example, the program will never exit since the infinite loop in the run() methods never end
producer_thread.join();
for (auto& t : consumer_threads)
{
t.join();
}
return 0;
}