increase performance of opencl

increase performance of opencl - c++

I am trying to implement some image processing algorithm using opencl. But as i see when i use opencl it is taking around 0.5 ms to complet one process i.e one frame. Isn't there any way than i initialize the opencl parameters only once using class object declaration than only call a function run the main kernel? I tried like this by creating class but as i find context, device can't be declared and used seperately and needs to be created each time.
#include <CL/cl.hpp>
#include <chrono>
#include <iostream>
using namespace std::chrono;
using namespace std;
namespace Color {
enum Code {
FG_RED = 31,
FG_GREEN = 32,
FG_BLUE = 34,
FG_DEFAULT = 39,
BG_RED = 41,
BG_GREEN = 42,
BG_BLUE = 44,
BG_DEFAULT = 49
};
class Modifier {
Code code;
public:
Modifier(Code pCode) : code(pCode) {}
friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
return os << "\033[" << mod.code << "m";
}
};
} // namespace Color
class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
cl::Buffer buffer_A;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;
// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
}
void useOpenCL::backgroundSub() {
int A[size], B[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
auto start1 = high_resolution_clock::now();
cl::Context context({default_device});
cl::Program program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create buffers on the device
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * size);
// create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);
// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
/*cl::KernelFunctor
simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange);
simple_add(buffer_A,buffer_B,buffer_C);*/
// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
queue.finish();
int C[size];
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
/*std::cout<<" result: \n";
for(int i=0;i<size;i++){
std::cout<<C[i]<<"\t";
}*/
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
int main() {
useOpenCL img;
while (true) {
img.backgroundSub();
}
return 0;
}
It is giving me below results:
Segmentation FPS=13.2557 Execution Time(sec)=0.075439
Segmentation FPS=15.7602 Execution Time(sec)=0.063451
Segmentation FPS=14.3872 Execution Time(sec)=0.069506
Segmentation FPS=12.7525 Execution Time(sec)=0.078416
Which is not good since fps is only 12, 13 fps. So how can i make this program faster?

Put the initialization part that you only need to call once in the beginning in the constructor. This initialization should contain ALL memory allocation, OpenCL C code compilation and any initial memory transfers from host to device:
useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;
// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
context = cl::Context({default_device});
program = cl::Program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create queue to which we will push commands for the device.
queue = cl::CommandQueue(context, default_device);
// create buffers on host
int A[size], B[size];
int C[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
// create buffers on the device
buffer_A = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_B = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_C = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
}
Therefore make context, program, queue, buffer_A, buffer_B, buffer_C member variables of your class useOpenCL. Especially the memory allocation and compilation take a long time, so do them only once and reuse the buffers.
class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_C;
cl::Context context;
cl::Program program;
cl::CommandQueue queue;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
Then only the kernel call and eventually memory transfers host<->device remain for every frame calculation:
void useOpenCL::backgroundSub() {
auto start1 = high_resolution_clock::now();
// write arrays A and B to the device (ONLY IF NECESSARY FOR EVERY FRAME)
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
queue.finish();
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
The latter code can be called over and over again and should be much faster than if you re-initialize everything over and over again. Also make sure that size is large enough, otherwise the GPU might not be utilized at its full potential and the latencies for host<->device memory transfers will make every frame disproportunately slower.

Related

Convert NV12 to BGR by NVIDIA Performance Primitives

I'm trying to convert NV12 image to BGR by npp, but in the final array i have zeroes.
int lumaStepBytes, chromaStepBytes;
int rgbStepBytes;
auto dpNV12LumaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetHeight(), &lumaStepBytes);
auto dpNV12ChromaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetChromaHeight(), &chromaStepBytes);
auto dpBGRFrame = nppiMalloc_8u_C3(dec.GetWidth(), dec.GetHeight(), &rgbStepBytes);
cudaMemcpy2D(dpNV12LumaFrame, lumaStepBytes, pFrame, dec.GetWidth(),
dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);
cudaMemcpy2D(dpNV12ChromaFrame, chromaStepBytes, pFrame + dec.GetLumaPlaneSize(), dec.GetWidth(),
dec.GetWidth(), dec.GetChromaHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);
Npp8u *planesAddres[2];
planesAddres[0] = dpNV12LumaFrame;
planesAddres[1] = dpNV12ChromaFrame;
nppiNV12ToBGR_8u_P2C3R(planesAddres, lumaStepBytes,
dpBGRFrame, rgbStepBytes,
{dec.GetWidth(), dec.GetHeight()});
res.m_data.resize(dec.GetWidth() * dec.GetHeight() * 3);
cudaMemcpy2D(res.m_data.data(), dec.GetWidth(), dpBGRFrame, rgbStepBytes,
dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyDeviceToHost);
nppiFree(dpBGRFrame);
nppiFree(dpNV12ChromaFrame);
nppiFree(dpNV12LumaFrame);
dec is a video decoder which gives pFrame in NV12 format and provide additional information about that, like offsets, dimensions, NV12 planes, etc.
The same result I have if I use cu... and cuda... functions for allocating without alignment.
Do anybody have any ideas about the problem?

For questions like this the SO expectation is that you provide a complete example, see item 1 here. So I haven't tried to determine exactly what is wrong with your code.
However I can show you a complete code that converts NV12 to RGB (and other things as well) which is working correctly for me:
// sample compile command line: nvcc -o rs rs.cu -lnppicc -lnppig -DUSE_DEBUG -DUNIT_TEST
#include <nppi.h>
#include <iostream>
template <typename T>
__global__ void pack_uv(T * __restrict__ u, T * __restrict__ v, T * __restrict__ uv, const int w, const int h, const int pitch_uv, const int pitch_u, const int pitch_v){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < w) && (idy < h)){
T *o = (T *)(((char *)uv) + idy*pitch_uv);
T *iu = (T *)(((char *)u) + idy*pitch_u);
T *iv = (T *)(((char *)v) + idy*pitch_v);
int idx2 = idx >> 1;
o[idx] = (idx&1)?iv[idx2]:iu[idx2];}
}
int rs(const int ish, const int isw, const int ipitch, const int osh, const int osw, const int opitch, const unsigned char *iy, const unsigned char *iuv, unsigned char *oy, unsigned char *ouv, unsigned char *tempbuff, int method = 0, int eInterpolation = NPPI_INTER_LANCZOS){
#ifdef USE_DEBUG
if ((iy != NULL) && (tempbuff == NULL)) std::cout << "error: tempbuff is NULL" << std::endl;
if ((iy != NULL) && (iuv == NULL)) std::cout << "error: iuv is NULL" << std::endl;
if ((iy != NULL) && (oy == NULL)) std::cout << "error: oy is NULL" << std::endl;
if ((iy != NULL) && (ouv == NULL)) std::cout << "error: ouv is NULL" << std::endl;
if (isw < 2) std::cout << "error on input width: " << isw << std::endl;
if (ish < 2) std::cout << "error on input height: " << ish << std::endl;
if (ipitch < isw) std::cout << "error on input pitch: " << ipitch << std::endl;
if (osw < 1) std::cout << "error on output width: " << osw << std::endl;
if (osh < 1) std::cout << "error on output height: " << osh << std::endl;
if (opitch < osw) std::cout << "error on output pitch: " << opitch << std::endl;
#endif
cudaError_t err;
NppStatus stat;
// convert NV12 input to RGB
if (iy == NULL){ // temp buffer sizing
// for method 1
NppiSize oSrcROI;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiSize oDstROI;
oDstROI.width = osw;
oDstROI.height = osh;
int bufferSize;
stat = nppiResizeAdvancedGetBufferHostSize_8u_C1R(oSrcROI, oDstROI, &bufferSize, NPPI_INTER_LANCZOS3_ADVANCED);
return ((ish*isw + osh*osw)*3*sizeof(unsigned char))+bufferSize; // temp buffer sizing
}
if (method == 0){
const Npp8u *pSrc[2] = {iy, iuv};
NppiSize oSizeROI;
oSizeROI.width = isw;
oSizeROI.height = ish;
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#endif
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "NV12 to RGB CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "NV12 to RGB NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -1;
// perform resize
NppiSize oSrcSize;
oSrcSize.width = isw;
oSrcSize.height = ish;
NppiRect oSrcROI;
oSrcROI.x = 0;
oSrcROI.y = 0;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiRect oDstROI;
oDstROI.x = 0;
oDstROI.y = 0;
oDstROI.width = osw;
oDstROI.height = osh;
double nXFactor = osw/(double)isw;
double nYFactor = osh/(double)ish;
double nXShift = 0;
double nYShift = 0;
stat = nppiResizeSqrPixel_8u_C3R(tempbuff, oSrcSize, isw*3*sizeof(Npp8u), oSrcROI, tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "RGB LANCZOS RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "RGB LANCZOS RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -2;
// convert resized RGB to YUV420
Npp8u *pDst[3] = { oy, ouv, ouv + osh*opitch/4 };
int rDstStep[3] = { opitch, opitch/2, opitch/2 };
oSizeROI.width = osw;
oSizeROI.height = osh;
stat = nppiRGBToYUV420_8u_C3P3R(tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "RGB TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "RGB TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -3;
// pack uv
dim3 block(32, 8);
dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
pack_uv<<< grid, block >>>(ouv, ouv + osh*opitch/4, tempbuff, osw, osh/2, osw, osw/2, osw/2);
err = cudaGetLastError();
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -4;
// move packed uv to output
err = cudaMemcpy2D(ouv, opitch, tempbuff, osw*sizeof(Npp8u), osw*sizeof(Npp8u), osh/2, cudaMemcpyDeviceToDevice);
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV COPY CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -5;
}
else{ // method 1
// NV12 to YUV420 planar
const Npp8u *const pSrc[2] = {iy, iuv};
Npp8u *pDst[3] = {tempbuff, tempbuff+isw*ish, tempbuff+isw*ish+(isw*ish)/4};
int aDstStep[3] = {isw, isw/2, isw/2};
NppiSize oSizeROI;
oSizeROI.width = isw;
oSizeROI.height = ish;
stat = nppiNV12ToYUV420_8u_P2P3R(pSrc, ipitch, pDst, aDstStep, oSizeROI);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "NV12 TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "NV12 TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -6;
// resize each plane individually
NppiSize oSrcSize = oSizeROI;
NppiRect oSrcROI;
oSrcROI.x = 0;
oSrcROI.y = 0;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiRect oDstROI;
oDstROI.x = 0;
oDstROI.y = 0;
oDstROI.width = osw;
oDstROI.height = osh;
double nXFactor = osw/(double)isw;
double nYFactor = osh/(double)ish;
// resize Y
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff, oSrcSize, isw, oSrcROI, oy, opitch, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "Y RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "Y RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -7;
// resize U
oSrcSize.width /= 2;
oSrcSize.height /= 2;
oSrcROI.width /= 2;
oSrcROI.height /= 2;
oDstROI.width /= 2;
oDstROI.height /= 2;
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw, oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "U RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "U RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -8;
// resize V
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw+(ish*isw/4), oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3)+(osh*osw/4), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "V RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "V RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -9;
// pack_uv
dim3 block(32, 8);
dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
pack_uv<<< grid, block >>>(tempbuff+(ish*isw*3), tempbuff+(ish*isw*3)+(osh*osw/4), ouv, osw, osh/2, opitch, osw/2, osw/2);
err = cudaGetLastError();
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -10;
}
return 0;
}
#ifdef UNIT_TEST
// timing
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// bitmap file handling
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
#include <fstream>
Info readBMP(const char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
if ((i-offset+1) < (3*height*width)) std::cout << "size: " << i-offset+1 << " expected: " << height*width*3 << std::endl;
std::copy(info,
info + offset,
dat.info);
std::copy(info + offset,
info + i,
dat.data);
delete[] info;
return dat;
}
void writeBMP(const char *filename, Info dat){
std::ofstream fout;
fout.open(filename, std::ios::binary | std::ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(dat.data), dat.size - dat.offset );
fout.close();
}
int main(int argc, char *argv[]){
int eInterpolation = NPPI_INTER_LANCZOS;
if (argc > 1) eInterpolation = atoi(argv[1]);
else{
std::cout << "Must specify a valid interpolation mode:" << std::endl;
std::cout << NPPI_INTER_NN << " :NPPI_INTER_NN" << std::endl;
std::cout << NPPI_INTER_LINEAR << " :NPPI_INTER_LINEAR" << std::endl;
std::cout << NPPI_INTER_CUBIC << " :NPPI_INTER_CUBIC" << std::endl;
std::cout << NPPI_INTER_LANCZOS << " :NPPI_INTER_LANCZOS" << std::endl;
return 0;}
int method = 0;
if (argc > 2) method = atoi(argv[2]);
// input to NV12
Info rfile = readBMP("input.bmp");
const int H = rfile.height;
const int W = rfile.width;
std::cout << "Height = " << rfile.height << std::endl;
std::cout << "Width = " << rfile.width << std::endl;
Npp8u *rgbdata, *ty, *tu, *tv, *tuv;
cudaMalloc(&rgbdata, H*W*3);
cudaMalloc(&ty, H*W);
cudaMalloc(&tu, H*W/4);
cudaMalloc(&tv, H*W/4);
cudaMalloc(&tuv, H*W/2);
cudaMemcpy(rgbdata, rfile.data, H*W*3, cudaMemcpyHostToDevice);
Npp8u *pDst[3] = { ty, tu, tv};
int rDstStep[3] = { W, W/2, W/2 };
NppiSize oSizeROI;
oSizeROI.width = W;
oSizeROI.height = H;
NppStatus stat = nppiRGBToYUV420_8u_C3P3R(rgbdata, W*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
if (stat != NPP_SUCCESS) { std::cout << "Input NPP error" << std::endl; return 0;}
dim3 block(32, 8);
dim3 grid((W+block.x-1)/block.x, (H+block.y-1)/block.y);
pack_uv<<< grid, block >>>(tu, tv, tuv, W, H/2, W, W/2, W/2);
// 1:1 test
int buff_size = rs(H, W, W, H, W, W, NULL, NULL, NULL, NULL, NULL);
unsigned char *tbuff;
cudaError_t err = cudaMalloc(&tbuff, buff_size);
if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
unsigned char *oy, *ouv;
err = cudaMalloc(&oy, H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
err = cudaMalloc(&ouv, H*W*sizeof(unsigned char)/2);
if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
int error = rs(H, W, W, H, W, W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
if (error != 0) std::cout << "Function Failure: " << error << std::endl;
// output to RGB
const Npp8u *pSrc[2] = {ty, tuv};
oSizeROI.width = W;
oSizeROI.height = H;
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#endif
if (stat != NPP_SUCCESS) { std::cout << "Output NPP error" << std::endl; return 0;}
cudaMemcpy(rfile.data, rgbdata, H*W*3, cudaMemcpyDeviceToHost);
writeBMP("output.bmp", rfile);
// 2x upscale test
cudaFree(tbuff);
buff_size = rs(H, W, W, 2*H, 2*W, 2*W, NULL, NULL, NULL, NULL, NULL);
err = cudaMalloc(&tbuff, buff_size);
if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
cudaFree(oy);
cudaFree(ouv);
err = cudaMalloc(&oy, 4*H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
err = cudaMalloc(&ouv, 2*H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
unsigned long long dt = dtime_usec(0);
error = rs(H, W, W, 2*H, 2*W, 2*W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (error != 0) std::cout << "Function Failure: " << error << std::endl;
std::cout << "2x resize time: " << dt/(float)USECPSEC << "s" << std::endl;
// output to RGB
const Npp8u *pSrc2[2] = {oy, ouv};
oSizeROI.width = 2*W;
oSizeROI.height = 2*H;
cudaFree(rgbdata);
cudaMalloc(&rgbdata, H*W*12);
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#endif
if (stat != NPP_SUCCESS) { std::cout << "Output NPP error" << std::endl; return 0;}
delete[] rfile.data;
rfile.data = new unsigned char[H*W*12];
cudaMemcpy(rfile.data, rgbdata, H*W*12, cudaMemcpyDeviceToHost);
int osize = rfile.size - rfile.offset;
int nsizeinc = H*W*12 - osize;
rfile.size += nsizeinc;
*((int*)(rfile.info+18)) = 2*W;
*((int*)(rfile.info+22)) = 2*H;
writeBMP("output2.bmp", rfile);
return 0;
}
#endif
The above code does the following steps:
read in a .bmp file from disk into RGB storage
convert to YUV420
convert to NV12
resize the NV12 image (there are multiple steps here)
convert the resized NV12 image to RGB
write RGB image as a .bmp file

the problem went away when I changed the video card. GeForce 740 to 1080

NppiSize oSizeROI;
oSizeROI.width = frame_dec->width;
oSizeROI.height = frame_dec->height;
DBUG_PRINT("width: %d, height: %d, stepBytes: %d\n", oSizeROI.width, oSizeROI.height, stepBytes);
// NppStatus stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->width, bgrData, frame_dec->width*3*sizeof(Npp8u), oSizeROI);
NppStatus stat;
#ifdef USE_709
stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width*3, oSizeROI);
#else
stat = nppiNV12ToBGR_709HDTV_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width * 3, oSizeROI);
#endif
unsigned char *data = (unsigned char *)malloc(frame_dec->width * frame_dec->height * 3);
cudaMemcpy(data, bgrData, frame_dec->height * frame_dec->width * 3, cudaMemcpyDeviceToHost);
cv::Mat mat_test(frame_dec->height, frame_dec->width, CV_8UC3, data);
imwrite("test.jpg", mat_test);
free(data);
nppiFree(bgrData);
exit(0);
frame_dec decoded by ffmpeg cuda decoder

c++ cuda: cudaMallocManaged access outside of constructor

I have a c++ class that uses cudaMallocManaged like so:
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
rows = new_rows;
cols = new_cols;
padr = padrr;
padc = padcc;
cout << "allocating memory" << endl;
float *data;
cudaError_t cudaStatus = cudaMallocManaged(&data, new_rows*new_cols*sizeof(float));
if (cudaStatus != cudaSuccess){
cout << cudaStatus << endl << flush;
exit(1);
}
cudaDeviceSynchronize();
cout << "allocating memory successful:" << cudaStatus << endl;
// I CAN ACCESS DATA HERE
//data[15] = 5.5; //fine
}
MyMatrix::~MyMatrix(void)
{
cudaFree(data); // delete the data array
}
I have a header .h file too:
class MyMatrix
{
public:
MyMatrix(int new_rows, int new_cols, int padr, int padt);
~MyMatrix(void);
float *data;
int padr;
int padc;
int rows;
int cols;
}
I can access the data array fine within this constructor.
However, as soon as I try to access it (read or write) outside of it, I get terminated by signal SIGSEGV (Address boundary error). E.g.,:
MyMatrix *newmat = new MyMatrix(totalr, totalc, padr, padc);
cout << (*newmat).data[0] << endl;
Or
MyMatrix newmat = new MyMatrix(totalr, totalc, padr, padc);
cout << newmat.data[0] << endl;
How can I "persist" this pointer?

In your constructor you create a local variable called data:
float *data;
After allocating the cuda memory and assigning the value to the local variable data, the memory address does not seem to be stored anywhere permanently. So, your memory becomes unreachable.
You have this line newmat.data, but in the constructro you never assigned any value to a member data. You did use just the local variable with the same name.

Your problem is that in
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
rows = new_rows;
cols = new_cols;
padr = padrr;
padc = padcc;
cout << "allocating memory" << endl;
float *data;
cudaError_t cudaStatus = cudaMallocManaged(&data, new_rows*new_cols*sizeof(float));
if (cudaStatus != cudaSuccess){
cout << cudaStatus << endl << flush;
exit(1);
}
cudaDeviceSynchronize();
cout << "allocating memory successful:" << cudaStatus << endl;
// I CAN ACCESS DATA HERE
//data[15] = 5.5; //fine
}
the data you use in cudaMallocManaged(&data, new_rows*new_cols*sizeof(float)) is the float *data; you declared in the line above, not the data member of your class. You just need to get rid of the local float *data; so you use the data class member like
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
rows = new_rows;
cols = new_cols;
padr = padrr;
padc = padcc;
cout << "allocating memory" << endl;
cudaError_t cudaStatus = cudaMallocManaged(&data, new_rows*new_cols*sizeof(float));
if (cudaStatus != cudaSuccess){
cout << cudaStatus << endl << flush;
exit(1);
}
cudaDeviceSynchronize();
cout << "allocating memory successful:" << cudaStatus << endl;
// I CAN ACCESS DATA HERE
//data[15] = 5.5; //fine
}

OpenCL - results are not the same as CPU version

I want to use openCl for a pixel value comparison between two images. One image should be "transformed" by a transformation matrix.
1) I am facing one problem, that the result of the openCL version is not the same as the CPU version.
The pixel value difference is in my example images (imageA: all pixels are 5, imagesB: all pixel are 6) always one, so in total with 1000*1000 pixels it should be 1000000.
The CPU version is always correct, but the openCL version is always a little unprecise and also differs from time to time (e.g. 998895 or 998829).
2) Another issue I have is the runtime, because adding the difference of two compared pixel to a result variable, takes a long time. But my feelings says that it could be solved through another memory layout.
Any ideas for the problems I have?
Maybe also the way of using a two dimensional workingset leads to mistakes?
Thank you and kind regards
Hendrik
Here is the kernel:
Basically it gets two images and 700 transformation matrices (at the moment all are representing the identity).
__kernel void compliance(
__read_only image2d_t imageA,
__read_only image2d_t imageB,
__constant float *matrix,
__global int *result
)
{
for (int i = 0; i < 700; i++)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
float t1 = matrix[0 + i * 6];
float t2 = matrix[1 + i * 6];
float t3 = matrix[2 + i * 6];
float t4 = matrix[3 + i * 6];
float t5 = matrix[4 + i * 6];
float t6 = matrix[5 + i * 6];
//calculate the other coords of the comparing pixel
int x_new = x * t1 + y * t2 + 1 * t3;
int y_new = x * t4 + y * t5 + 1 * t6;
int a = (read_imagei(imageA, (int2)(x, y)).x);
int b = (read_imagei(imageB, (int2)(x_new, y_new)).x);
int diff = b - a;
//add every different of two compared pixels to the result
result[i] += diff;
}
}
Here is my host code:
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <opencv2\core.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\highgui.hpp>
using namespace std;
int main(int argc, char** argv) {
//700 transformation matrices
int numberMatrices = 700;
bool opencl = true;
//iamge width
int width = 1000;
//image height
int height = 1000;
//total number of pixels of one image
int size = width*height;
// Create two example images
const int LIST_SIZE = size;
int *imageA = new int[LIST_SIZE];
int *imageB = new int[LIST_SIZE];
for (int i = 0; i < LIST_SIZE; i++) {
//every pixel value of imageA is 5
imageA[i] = 5;
//every pixel value of imageA is 6
imageB[i] = 6;
}
//creation of n transformation matrices
const int MATRIX_SIZE = 6* numberMatrices;
float *indi = new float[MATRIX_SIZE];
//all the matrices are the same
for (int i = 0; i < numberMatrices; i++)
{
//identity matrix
indi[0 + i * 6] = 1;
indi[1 + i * 6] = 0;
indi[2 + i * 6] = 0;
indi[3 + i * 6] = 0;
indi[4 + i * 6] = 1;
indi[5 + i * 6] = 0;
}
//array to save the results of the comparison
const int RESULT_SIZE = numberMatrices;
int *result = new int[RESULT_SIZE];
if (opencl)
{
try {
// Get available platforms
vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cerr << "Platform number is: " << platforms.size() << std::endl;
std::string platformVendor;
platforms[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
std::cerr << "Platform is by: " << platformVendor << "\n";
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
cl::Context context(CL_DEVICE_TYPE_CPU, cps);
vector<cl::ImageFormat> format;
context.getSupportedImageFormats(CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format);
/* for (int i = 0; i < format.size(); i++)
{
cout << "Channel Data Type: " << format.at(i).image_channel_data_type
<< " Channel order: " << format.at(i).image_channel_order << endl;
}*/
// Get a list of devices on this platform
vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
for (int i = 0; i < devices.size(); i++)
{
cout << "Device: " << devices.at(i).getInfo<CL_DEVICE_NAME>() << endl;
cout << "DOUBLE FP: " << devices.at(i).getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>() << endl;
cout << "Image Max Height: " << devices.at(i).getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() << endl;
cout << "Image Support: " << devices.at(i).getInfo<CL_DEVICE_IMAGE_SUPPORT>() << endl;
cout << "Local Memory Size: " << devices.at(i).getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << endl;
cout << "Clock Frequency: " << devices.at(i).getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << endl;
cout << "CUs: " << devices.at(i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
cout << "Driver: " << devices.at(i).getInfo<CL_DRIVER_VERSION>() << endl;
cout << "Version: " << devices.at(i).getInfo<CL_DEVICE_VERSION>() << endl;
cout << "Work Group: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;
cout << "Items: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>();
cout << endl;
}
//Create opencl image
cl::Image2D clImage_A = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageA);
cl::Image2D clImage_B = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageB);
// Create a command queue and use the first device
cl::CommandQueue queue = cl::CommandQueue(context, devices[0]);
// Read kernel source file
std::ifstream sourceFile("difference.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));
// Make program of the source code in the context
cl::Program program = cl::Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
cl::Kernel kernel(program, "compliance");
// Create memory buffers
cl::Buffer buffer_matrix = cl::Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(float));
cl::Buffer buffer_result = cl::Buffer(context, CL_MEM_READ_WRITE, RESULT_SIZE * sizeof(int));
// Copy list of results to the memory buffers
queue.enqueueWriteBuffer(buffer_matrix, CL_TRUE, 0, MATRIX_SIZE * sizeof(float), indi);
// Set arguments to kernel
kernel.setArg(0, clImage_A);
kernel.setArg(1, clImage_B);
kernel.setArg(2, buffer_matrix);
kernel.setArg(3, buffer_result);
cl::Event event;
std::cout << "Start OpenCL processing.." << endl;
chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();
// Run the kernel n-times on specific ND range
for (int i = 0; i < 1; i++)
{
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange((size_t)width, (size_t)height),
cl::NDRange(1, 1),
NULL,
&event);
cout << i << " ";
event.wait();
}
chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now();
auto duration_opencl = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
std::cout << "OpenCL processing done.." << endl;
std::cout << "Start CPU Processing.." << endl;
// Read buffer_result into result
queue.enqueueReadBuffer(buffer_result, CL_TRUE, 0, RESULT_SIZE * sizeof(int), result);
//cpu version to calculate the difference between the two arryays
t1 = chrono::high_resolution_clock::now();
int different = 0;
int x_new;
int x;
for (int i = 0; i < numberMatrices; i++)
{
different = 0;
for (int n = 0; n < LIST_SIZE; n++)
{
x = imageA[n];
x_new = x;;
int a = imageA[x];
int b = imageB[x_new];
int diff = imageB[x_new] - imageA[x];
different += diff;
}
}
t2 = chrono::high_resolution_clock::now();
auto duration_cpu = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
std::cout << "CPU processing done.." << endl;
//output of the results
std::cout << "opencl: diff " << result[0] << endl;
std::cout << "Runtime opencl: " << duration_opencl << endl;
std::cout << "CPU: diff " << different << endl;
std::cout << "Runtime CPU: " << duration_cpu << endl;
double times = (double)duration_cpu / (double)duration_opencl;
std::cout << "OpenCL is " << times << " times faster!!!" << endl;
char c;
std::cin >> c;
}
catch (cl::Error error) {
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
char c;
std::cin >> c;
}
}
return 0;
}

Don't you think that you have race condition here case in this line result[i] += diff; of your OpenCL code your program do it in each work item simultaneously? So maybe it could be a problem.

The floating point arithmetics differ on the platforms. You are very likely seeing the effect of a hardware specific MAD optimization performed by the OpenCL compiler. As far as I know, disabling optimizations using -cl-opt-disable will not help in this case.

c++ opencl return CL_OUT_OF_RESOURCES

I'm learning OpenCL and trying to apply a black and white on a picture but enqueueNDRangeKernel return CL_OUT_OF_RESOURCES and I don't understand why. OpenCL is running on a GTX 980M, and OpenCL 1.2 .
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <string>
#include <CL/cl.hpp>
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_write.h"
int main() {
unsigned int vectorSize = 1000;
const std::string progCode = "__kernel \n"
"void img_kernel( __read_only image2d_t inputImage, __write_only image2d_t outputImage) \n"
"{ \n"
"const sampler_t sampler=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
" int width = get_image_width(inputImage); \n"
" int height = get_image_height(inputImage); \n"
" int2 pixelcoord = (int2) (get_global_id(0), get_global_id(1)); \n"
" float4 pixel = read_imagef(inputImage, sampler, pixelcoord); \n"
" float color = (pixel.x + pixel.y + pixel.z)/3; \n"
" float4 outColor = (float4)(pixel.x,pixel.y,pixel.z, 1.0); \n"
" write_imagef(outputImage, pixelcoord, outColor); } \n";
int imageX, imageY, imageN;
unsigned char *dataImage = stbi_load("test.jpg", &imageX, &imageY, &imageN, 3);
if (dataImage == nullptr)
{
std::cout << "Unable to load picture" << std::endl;
getchar();
return 1;
}
cl_int error;
std::vector<cl::Platform> platformsList;
error = cl::Platform::get(&platformsList);
if (error != CL_SUCCESS)
{
std::cout << "Unable to find any OpenCL platforms" << std::endl;
getchar();
return 1;
}
std::vector<cl::Device> devicesList;
error = platformsList[0].getDevices(CL_DEVICE_TYPE_DEFAULT, &devicesList);
if (error != CL_SUCCESS)
{
std::cout << "Unable to find any OpenCL device" << std::endl;
getchar();
return 1;
}
cl::Device currentDevice = devicesList[0];
std::string nameDevice, driverDevice;
error = currentDevice.getInfo(CL_DEVICE_NAME, &nameDevice);
error = currentDevice.getInfo(CL_DRIVER_VERSION, &driverDevice);
std::cout << "Device : " << nameDevice << " " << driverDevice << std::endl;;
cl::Context context(currentDevice);
cl::Program::Sources source;
source.push_back({ progCode.c_str(), progCode.size() });
cl::Program program(context, source);
if (program.build({ currentDevice }) != CL_SUCCESS)
{
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(currentDevice) << "\n";
getchar();
exit(1);
}
cl::ImageFormat globalImgFormat;
globalImgFormat.image_channel_data_type = CL_UNSIGNED_INT8;
globalImgFormat.image_channel_order = CL_RGB;
cl::size_t<3> origin;
origin[0] = 0; origin[1] = 0, origin[2] = 0;
cl::size_t<3> region;
region[0] = imageX; region[1] = imageY; region[2] = 1;
cl::Image2D inputImage(context, CL_MEM_READ_ONLY, globalImgFormat, imageX, imageY, 0, dataImage, &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create cl Image for input." << std::endl;
getchar();
return 1;
}
cl::Image2D outputImage(context, CL_MEM_WRITE_ONLY, globalImgFormat, imageX, imageY, 0, nullptr, &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create cl Image for output." << std::endl;
getchar();
return 1;
}
cl::CommandQueue queue(context, currentDevice);
cl::Kernel image_kernel(program, "img_kernel", &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create kernel." << std::endl;
getchar();
return 1;
}
error = image_kernel.setArg(0, inputImage);
if (error != CL_SUCCESS)
{
std::cout << "Unable to set param." << std::endl;
getchar();
return 1;
}
error = image_kernel.setArg(1, outputImage);
if (error != CL_SUCCESS)
{
std::cout << "Unable to set param." << std::endl;
getchar();
return 1;
}
cl::NDRange globalSize(imageX, imageY);
error = queue.enqueueNDRangeKernel(image_kernel, cl::NullRange, globalSize, cl::NullRange);
if (error != CL_SUCCESS)
{
std::cout << "Unable to compute Image data." << std::endl;
getchar();
return 1;
}
queue.finish();
unsigned char *resultPros = new unsigned char[imageX * imageY * imageN];
error = queue.enqueueReadImage(outputImage, CL_TRUE, origin, region, 0, 0, resultPros);
stbi_write_bmp("testresul.jpg", imageX, imageY, imageN, resultPros);
stbi_image_free(dataImage);
stbi_image_free(resultPros);
getchar();
return 0;
}

On NVIDIA hardware if you write outside of a buffer or image (which is an undefined operation) then CL_OUT_OF_RESOURCES is a common error to get. It's better than the early hardware or drivers that simply crashed! Double-check your writes.

It seems that the combination of image_channel_data_type and image_channel_order is not correct. Could be your problem related with this?
Please, take a look here:
https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_image_format.html
Regards

Draw Mandelbrot using OpenCl

I want to write a program to draw a Mandelbrot set to an image.
I'm using OpenCl and cl.hpp - wrapper to c++, but I don't know why this isn't working.
I try draw image with different width and height, but mostly I get white image with random colored pixel (when in kernel I try write to output hardcoded values) or black image.
I also wondering about passing image2d_t to the kernel and "write" pixels directly to the image but when I execute my kernel I get -52 error that I set invalid arguments..
Declaration of my kernel(using image2d_t) looks like this:
__kernel syf(__write_only image2d_t img)
and I set arguments like this:
cl_mem image= clCreateImage2D(context, CL_MEM_WRITE_ONLY, &outputFormat, ImageWidth, ImageHeight, 0, 0, &err);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image)
Could you look at the code and help me?
#define __CL_ENABLE_EXCEPTIONS
//#define __NO_STD_VECTOR
#include <ctime>
#include <fstream>
#include <iostream>
#include <exception>
#include "D:/Users/cuda/cuda/bmp.cpp"
#if defined (__APPLE__) || defined (MACOSX)
#include <OpenCl/cl.cpp>
#else
#include <CL/OpenCL.h>
#include <CL/cl.hpp>
#endif
const int N = 1024 * 1024 * 3;
const char * kernelSource =
"__kernel void prepareVector(__global char *output)"
"{"
" size_t xDimension = get_global_id(0); "
" size_t yDimension = get_global_id(1); "
" "
" size_t currentIndex = 3 * 1024.0f*yDimension + 3 * xDimension; "
" float xOriginal = 3.25f*((float) xDimension / 1024.0f) - 2.0f; "
" float yOriginal = 2.5f*((float) yDimension / 1024.0f) - 1.25f;"
" "
" int iteration = 0; "
" int maxIteration = 256; "
" float temp; "
" float x = 0.0f; "
" float y = 0.0f; "
" while (x*x + y*y <= 4.0f && iteration < maxIteration)"
" { "
" temp = x*x - y*y + xOriginal; "
" y = 2.0*x*y + yOriginal; "
" x = temp; "
" ++iteration; "
" } "
" "
" if (iteration == maxIteration) "
" { "
" iteration = 0; "
" } "
" output[currentIndex] = iteration; "
" output[currentIndex] = iteration;"
" output[currentIndex] = iteration;"
"}";
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
std::vector<cl::Kernel> allKernels;
cl::Program program;
cl_int cli_err_msg;
char *host_out_c;
int main(int argc, char* argv [])
{
try
{
size_t dimensionSize = 2;
cl::NDRange *globalSize = new cl::NDRange( 6, 8 );
cl::NDRange *localSize = new cl::NDRange( 3, 4 );
cl::NDRange *offsetSize = new cl::NDRange( 0, 0 );
host_out_c = new char[N];
cl::Platform::get(&platforms);
platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
cl_context_properties properties [] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) (platforms[0])(), 0
};
//cl_context context = clCreateContext(0, devices.size(), devices, NULL, NULL, NULL);
cl::Context context(CL_DEVICE_TYPE_ALL, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program::Sources source(1, std::make_pair(kernelSource, strlen(kernelSource)));
program = cl::Program(context, source);
program.build(devices);
cl::Kernel kernel(program, "prepareVector", &cli_err_msg);
cl::Buffer device_out_c(context, CL_MEM_WRITE_ONLY, N*sizeof(char));
cl::Event event;
cl::CommandQueue queue(context, devices[0], 0, &cli_err_msg);
kernel.setArg(0, device_out_c);
queue.enqueueNDRangeKernel(kernel, *offsetSize, *globalSize, *localSize, NULL, &event);
queue.enqueueReadBuffer(device_out_c, true, 0, N*sizeof(char), (void*) host_out_c);
//printArray("kernel output:\n", host_out_c);
write_bmp("lel.bmp", 1024, 1024, host_out_c);
}
catch (cl::Error e)
{
std::cout << "Status: "<< program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]) << std::endl;
std::cout << "Options: "<< program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]) << std::endl;
std::cout << "Log: "<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
std::cout << e.what() << ": Error code: " << e.err() << std::endl;
}
return 0;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

increase performance of opencl - c++

Related

Convert NV12 to BGR by NVIDIA Performance Primitives

c++ cuda: cudaMallocManaged access outside of constructor

OpenCL - results are not the same as CPU version

c++ opencl return CL_OUT_OF_RESOURCES

Draw Mandelbrot using OpenCl

Categories

Resources