CUDA error with processing the image - c++

I'm trying to get a black-and-white image as the output, with a color image as the input. I'm using OpenCV to load the image and write the output, and CUDA to convert the image to black and white in a kernel. I tried the same code without OpenCV and it worked fine, but now the output is different from what I expect to get.
I think the CUDA code needs some modification to work with OpenCV. I worked on it for a while but couldn't find a way to do that. Can somebody give me advice or modify my code, please? I'm really confused by this problem.
__global__ void addMatrix(uchar4 *DataIn, unsigned char *DataOut)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    DataOut[idx] = (DataIn[idx].x + DataIn[idx].y + DataIn[idx].z)/3;
}
int main()
{
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    char* c = "";
    printf("Input source of image\n Example of right directory file: E:\henrik-evensen-castle-valley-v03.jpg\n Your turn:\n");
    char *tbLEN;
    tbLEN = new char [1024];
    cin.getline(tbLEN, 1024);
    cout << endl << "Your image: " << tbLEN << endl;
    //Data for input image
    IplImage* image;
    image = cvLoadImage(tbLEN, 1);
    int height = image->height;
    int width = image->width;
    int step = image->widthStep;
    int SizeIn = (step*height);
    printf("\nProcessing image\n");
    //Data for output image
    IplImage *image2 = cvCreateImage(cvSize(width, height), IPL_DEPTH_8U, 1);
    int step2 = image2->widthStep;
    int SizeOut = step2 * height;
    //GPU
    uchar4* DatIn = (uchar4*)image->imageData;
    unsigned char* DatOut = (unsigned char*)image2->imageData;
    uchar4 *datIndev;
    unsigned char *datOutdev;
    printf("Allocating memory on Device\n");
    /* Allocate memory on Device */
    cudaMalloc(&datIndev, SizeIn * sizeof(unsigned char));
    cudaMalloc(&datOutdev, SizeOut * sizeof(unsigned char));
    printf("Copy data on Device\n");
    /* Copy data on Device */
    cudaMemcpy(datIndev, DatIn, SizeIn * sizeof(unsigned char), cudaMemcpyHostToDevice);
    cudaMemcpy(datOutdev, DatOut, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice);
    int NumThreadsX = deviceProp.maxThreadsPerBlock;
    int NumBlocksX = (width * height)/NumThreadsX;
    dim3 blocks(NumBlocksX, 1, 1);
    dim3 threads(NumThreadsX, 1, 1);
    addMatrix <<< blocks, threads >>> (datIndev, datOutdev);
    cudaMemcpy(DatOut, datOutdev, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    cvNamedWindow("Imagecolor");
    cvShowImage("Imagecolor", image);
    cvNamedWindow("Gray");
    cvShowImage("Gray", image2);
    const char* filename1 = "CcPwSwMW4AELPUc.jpg";
    printf("Saving an output image\n");
    cvSaveImage(filename1, image2);
    cudaFree(datOutdev);
    cudaFree(datIndev);
    cvWaitKey(0);
    return 0;
}

There are several problems here:
1. Your assumption about four-channel data is incorrect. Your code will load a three-channel BGR image into memory from the file, so you need to change the references from uchar4 to uchar and then have each thread load three bytes from the source image within your kernel.
2. Your kernel itself contains a potential arithmetic error. The sum of three unsigned char pixel values can overflow an unsigned char intermediate result and produce an incorrect average. You should use a larger type for the calculation.
Taken together, your kernel should look something like this:
__global__ void addMatrix(unsigned char *DataIn, unsigned char *DataOut)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int b = DataIn[3*idx];
    int g = DataIn[3*idx+1];
    int r = DataIn[3*idx+2];
    DataOut[idx] = (unsigned char)((b + r + g)/3);
}
Then you might find your image looks correct.
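For completeness, a minimal sketch of the matching host-side changes (my addition, not part of the original answer), assuming the loaded image's rows are packed with no padding (i.e. step == width * 3):

    unsigned char *datIndev, *datOutdev;
    cudaMalloc(&datIndev, SizeIn);    // SizeIn = step * height is already a byte count
    cudaMalloc(&datOutdev, SizeOut);
    cudaMemcpy(datIndev, image->imageData, SizeIn, cudaMemcpyHostToDevice);

    int NumThreadsX = deviceProp.maxThreadsPerBlock;
    // Round the grid up so the last partial block is launched; the kernel then also
    // needs the pixel count passed in for a guard like: if (idx >= numPixels) return;
    int NumBlocksX = (width * height + NumThreadsX - 1) / NumThreadsX;
    addMatrix <<< NumBlocksX, NumThreadsX >>> (datIndev, datOutdev);
    cudaMemcpy(image2->imageData, datOutdev, SizeOut, cudaMemcpyDeviceToHost);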

Related

how to use cv::Mat imgbuf created by pointer to video data

I am reading raw video data from the read buffer using
cv::Mat imgbuf(Size(640, 480), CV_8UC3, &mem[0], (640*3));
I pass this imgbuf variable to a face detection algorithm, which detects the face and draws a rectangle around it; after that the output I get looks wrong.
I tried the code below, where I perform a resize operation before passing the frame to the face detection algorithm. With this method it works fine, but without the resize call the output with the rectangle around the face is visibly wrong.
while(1)
{
    unsigned char *mem = (unsigned char*)mmap(NULL, page_offset + len,
                                              PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, page_base);
    cv::Mat imgbuf(Size(640, 480), CV_8UC3, &mem[0], (640*3));
    cv::resize(imgbuf, imgbuf, (640,480)); //Dummy function to get the right output.
    auto result = v->facedetection(imgbuf);
    for (const auto &r : result.rects) {
        cv::rectangle(imgbuf, cv::Rect{ cv::Point(r.x * imgbuf.cols, r.y * imgbuf.rows),
                                        cv::Size{ (int)(r.width * imgbuf.cols),
                                                  (int)(r.height * imgbuf.rows) } }, 0xff);
    }
    imshow("face-detection", imgbuf);
    waitKey(1);
}
Can anybody help me sort out this problem?
Test this method:
unsigned char *mem = (unsigned char*)mmap(NULL, page_offset + len,
                                          PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, page_base);
cv::Mat imgbuf(480, 640, CV_8UC3, &mem[0]);
cv::Mat img_2, img_3;
cv::resize(imgbuf, img_2, cv::Size(640, 480));
img_2.copyTo(img_3);
auto result = v->facedetection(img_2);
for (const auto &r : result.rects)
{
    cv::Rect myR = cv::Rect(r.x * img_2.cols, r.y * img_2.rows,
                            (int)(r.width * img_2.cols), (int)(r.height * img_2.rows));
    cv::rectangle(img_3, myR, Scalar(0, 0, 255), 1);
}
imshow("Result", img_3);
waitKey(0);
After getting a valid result you can optimize this and use fewer Mats.
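Since the dummy resize seems to be serving only as a copy out of the mmap'd region into memory that OpenCV owns, a plain clone() might achieve the same thing (a guess on my part, not part of the original answer):

    cv::Mat mapped(480, 640, CV_8UC3, &mem[0]);  // header over the mmap'd buffer
    cv::Mat frame = mapped.clone();              // contiguous copy owned by OpenCV
    auto result = v->facedetection(frame);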

uchar* to std::vector<uchar> and back?

I have some image data as a uchar*. I need to run processing on it as a std::vector<uchar>, and then convert it back. I am using this code:
unsigned char* buffer = inputImg.data; //Image data from cv::Mat
std::vector<uchar> vec;
size_t size_of_buffer = sizeof(buffer);
vec.assign(buffer, buffer + size_of_buffer);
uchar* _compressed = reinterpret_cast<uchar*>(vec.data());
When I then view the result with:
cv::Mat mat = cv::Mat(_height, _width, inputImg.type(), _compressed );
this results in a black image. Where am I going wrong?
EDIT:
Based on the comments below, I have changed the code to:
//from Mat
int COLOR_COMPONENTS = inputImg.channels();
int _width = inputImg.cols;
int _height = inputImg.rows;
//to std::vector and back
std::vector<uchar> vec;
size_t size_of_buffer = _width * _height*COLOR_COMPONENTS;
vec.assign(buffer, buffer + size_of_buffer);
uchar* _compressed = reinterpret_cast<uchar*>(vec.data());
As in the answer below, this works.
This code works for me and displays the image correctly:
int main()
{
    cv::Mat input = cv::imread("C:/StackOverflow/Input/Lenna.png");
    cv::Mat inputImg = input;
    int COLOR_COMPONENTS = inputImg.channels();
    int _width = inputImg.cols;
    int _height = inputImg.rows;
    //to std::vector and back
    std::vector<uchar> vec;
    size_t size_of_buffer = _width * _height * COLOR_COMPONENTS;
    unsigned char* buffer = inputImg.data;
    vec.assign(buffer, buffer + size_of_buffer);
    uchar* _compressed = reinterpret_cast<uchar*>(vec.data());
    cv::Mat mat = cv::Mat(_height, _width, inputImg.type(), _compressed);
    cv::imshow("output", mat);
    cv::waitKey(0);
    return 0;
}
Visual Studio 2013 with OpenCV 3.4
sizeof(buffer) yields the size of the pointer to the buffer, not the amount of data inside the buffer. You must get the buffer size from somewhere else.
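As a sketch of a safer way to compute that size from the Mat itself (my addition, using standard cv::Mat accessors):

    CV_Assert(inputImg.isContinuous());  // rows must be packed for a flat copy to be valid
    size_t size_of_buffer = inputImg.total() * inputImg.elemSize();  // pixel count * bytes per pixel
    std::vector<uchar> vec(inputImg.data, inputImg.data + size_of_buffer);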

How to use cv::mat in opencl

Problem
I have OpenCL code which must take a cv::Mat as input and return a cv::Mat as output.
For now I convert the input to a plain array of chars, pass it to OpenCL, and convert the output (which is a char array) back to a cv::Mat.
What I have
I tried to use the cv::Mat raw data directly, but there are gaps (row padding) in the data. For that reason I copy the cv::Mat into a contiguous array, although I'm sure OpenCL can be made to use the data with the gaps.
Question
Can someone explain how I can avoid copying data to and from the array, and use a cv::Mat directly as input and output?
Array to Mat: you can use the "constructor for matrix headers pointing to user-allocated data" (see this answer):
Mat(int rows, int cols, int type, void* data, size_t step=AUTO_STEP);
Mat to Ptr: you can use the data attribute (from this answer)
unsigned char *input = (unsigned char*)(img.data);
for(int j = 0;j < img.rows;j++){
for(int i = 0;i < img.cols;i++){
unsigned char b = input[img.step * j + img.channels() * i ] ;
unsigned char g = input[img.step * j + img.channels() * i + 1];
unsigned char r = input[img.step * j + img.channels() * i + 2];
}
}
Of course, you need to adapt this to your data type.
Maybe this answer helps you with your question: How to launch custom OpenCL kernel in OpenCV (3.0.0) OCL?
You could maybe use the UMat class that OpenCV provides.
cv::Mat mat = ...;
// Upload input mat
cv::UMat input_gpu = mat.getUMat(cv::ACCESS_READ, cv::USAGE_ALLOCATE_DEVICE_MEMORY);
// Create output mat on the GPU
cv::UMat output_gpu(mat.size(), CV_32F, cv::USAGE_ALLOCATE_DEVICE_MEMORY);
// Download output mat
cv::Mat output = output_gpu.getMat(cv::ACCESS_READ);
From what I understand you should be able to pass the UMat directly to your kernel
using cv::ocl::KernelArg::ReadWrite(output_gpu). The kernel argument for that mat would then be __global uchar*. I'm not sure though, I have only used OpenCV in combination with CUDA so far.
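To flesh that suggestion out, here is a rough, untested sketch of launching a custom OpenCL kernel on a UMat through the cv::ocl API in OpenCV 3.x (the kernel body and its name are invented for illustration; KernelArg::ReadWrite passes the buffer pointer plus step, offset, rows, and cols, which is why the kernel takes those extra parameters):

    #include <opencv2/core.hpp>
    #include <opencv2/core/ocl.hpp>

    static const char *kernelSrc =
        "__kernel void invert(__global uchar *ptr, int step, int offset, int rows, int cols)\n"
        "{\n"
        "    int x = get_global_id(0);\n"
        "    int y = get_global_id(1);\n"
        "    if (x < cols && y < rows)\n"
        "        ptr[mad24(y, step, x + offset)] = 255 - ptr[mad24(y, step, x + offset)];\n"
        "}\n";

    void runCustomKernel(cv::UMat &m)   // expects a single-channel CV_8UC1 UMat
    {
        cv::ocl::ProgramSource source(kernelSrc);
        cv::String errmsg;
        cv::ocl::Kernel k("invert", source, "", &errmsg);
        k.args(cv::ocl::KernelArg::ReadWrite(m));
        size_t globalsize[2] = { (size_t)m.cols, (size_t)m.rows };
        k.run(2, globalsize, NULL, true);  // true = block until the kernel finishes
    }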

Efficient way to generate a three-channel LUT mask

I have some code to create an HSV mask that looks approximately like this (taken from this Japanese-language page):
void colorExtraction(const cv::gpu::GpuMat &src,
                     cv::gpu::GpuMat *dst)
{
    cv::Mat lut(256, 1, CV_8UC3);
    for (int i = 0; i < 256; i++)
    {
        cv::Vec3b thisHSV;
        thisHSV[0] = inHRange(i) ? 255 : 0;
        thisHSV[1] = inSRange(i) ? 255 : 0;
        thisHSV[2] = inVRange(i) ? 255 : 0;
        lut.at<cv::Vec3b>(i) = thisHSV;
    }
    /* apply LUT to input image */
    cv::gpu::GpuMat extracted(src.size(), CV_8UC3);
    cv::gpu::LUT(src, lut, extracted);
    /* divide image into each channel */
    std::vector<cv::gpu::GpuMat> channels;
    cv::gpu::split(extracted, channels);
    /* create mask */
    cv::gpu::bitwise_and(channels[0], channels[1], *dst);
    cv::gpu::bitwise_and(*dst, channels[2], *dst);
}
This works, but despite the operations being mostly in the GPU, it is slower than I would like, perhaps due to a number of intermediate GpuMats. I suspect there might be a nice way to fold this all up into just one or two calls, but I don't know what it could be. Writing my own CUDA kernel is, of course, an option, but I want to check to see if I don't need to reinvent the wheel.
To self-answer, I ended up coding my own kernel to do the LUT(), split(), and two bitwise_and()s in a single call:
__global__ void colorExtractionKernel(cv::gpu::PtrStepSz<uchar3> const src,
                                      cv::gpu::PtrStepSz<unsigned char> dst,
                                      cv::gpu::PtrStepSz<uchar3> const lut)
{
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against out-of-range threads: the grid below is rounded up, so the
    // last blocks can extend past the image edges.
    if (row >= src.rows || col >= src.cols)
        return;
    // Extract post-LUT hsv flags
    uchar3 srcHSV = src(row, col);
    unsigned char h = lut(srcHSV.x, 0).x;
    unsigned char s = lut(srcHSV.y, 0).y;
    unsigned char v = lut(srcHSV.z, 0).z;
    // Result pixel is the AND of the pixels
    dst(row, col) = (h & s & v);
}

void colorExtraction_cuda(const cv::gpu::GpuMat &src, // input HSV image
                          cv::gpu::GpuMat &dst,       // specified color extracted binarized image
                          cv::Mat const &lut)         // Look-up thresholds
{
    cudaStream_t thisStream;
    gpuErrChk(cudaStreamCreate(&thisStream));
    dim3 Threads(32, 16);
    dim3 Blocks((src.cols + Threads.x - 1)/Threads.x, (src.rows + Threads.y - 1)/Threads.y);
    cv::gpu::GpuMat gpuLut(lut);
    colorExtractionKernel<<<Blocks, Threads, 0, thisStream>>>(src, dst, gpuLut);
    gpuErrChk(cudaStreamSynchronize(thisStream));
    gpuErrChk(cudaStreamDestroy(thisStream));
    gpuErrChk(cudaGetLastError());
}
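One usage detail worth noting (my observation, not part of the original answer): dst is written by the kernel but never allocated inside colorExtraction_cuda, so the caller has to size it before the call; gpuErrChk is assumed to be the usual CUDA error-checking macro. Roughly:

    cv::gpu::GpuMat hsv = ...;                  // input image, already converted to HSV
    cv::gpu::GpuMat mask(hsv.size(), CV_8UC1);  // pre-allocate the single-channel output
    colorExtraction_cuda(hsv, mask, lut);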

OpenCV CUDA C++ C Image Gray

I am new here and a beginner in coding, and I need help with the following code.
I am trying to convert a color image in .bmp format to a grayscale image using CUDA and OpenCV.
Can anyone find the error, bug, or mistake I have made in my code and help me?
I am also attaching the input I used and the output (screenshot, image named "output") I got from the code. In the screenshot, the image in the background is the original image. You can use whatever image you want.
#include "cuda_runtime.h"
#include <iostream>
#include <ctime>
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace std;
using namespace cv;

__global__ void convertImage(int width, int height, int nchannels, int step, uchar *d_data, int nchannels2, int step2, uchar *d_data2)
{
    int i, j, r, g, b, byte, z = 0;
    for(i=0; i<height; i++)
        for(j=0; j<width; j++)
        {
            r = d_data[i*step + j*nchannels + 0];
            g = d_data[i*step + j*nchannels + 1];
            b = d_data[i*step + j*nchannels + 2];
            byte = (r+g+b)/3;
            d_data2[i*step2 + j*nchannels2 + 0] = byte;
            d_data2[i*step2 + j*nchannels2 + 1] = byte;
            d_data2[i*step2 + j*nchannels2 + 2] = byte;
        }
}

int main()
{
    IplImage *img = cvLoadImage("D://1.bmp", CV_LOAD_IMAGE_COLOR);
    int width = img->width;
    int height = img->height;
    int nchannels = img->nChannels;
    int step = img->widthStep;
    cout << "Image 1 : " << width << "\t" << height << "\t" << nchannels << "\t" << step << endl;
    uchar *data = (uchar*)img->imageData;
    uchar *d_data;
    int size = sizeof(data);
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, data, size, cudaMemcpyHostToDevice);
    IplImage *img2 = cvCreateImage(cvSize(img->height, img->width), IPL_DEPTH_8U, 1);
    int width2 = img2->width;
    int height2 = img2->height;
    int nchannels2 = img2->nChannels;
    int step2 = img2->widthStep;
    cout << "Image 2 : " << width2 << "\t" << height2 << "\t" << nchannels2 << "\t" << step2 << endl;
    uchar *data2 = (uchar*)img2->imageData;
    uchar *d_data2;
    int size2 = sizeof(data2);
    cudaMalloc(&d_data2, size2);
    long long i;
    uchar *temp = data;
    convertImage<<<1,1>>>(width, height, nchannels, step, d_data, nchannels2, step2, d_data2);
    cudaMemcpy(data2, d_data2, sizeof(data2), cudaMemcpyHostToDevice);
    cvNamedWindow("Imagecolor");
    cvShowImage("Imagecolor", img);
    cvNamedWindow("Gray");
    cvShowImage("Gray", img2);
    cvWaitKey();
    return 0;
}
There are a lot of problems with the code!
1: The size of device memory being allocated:
int size = sizeof(data);
sizeof(data) returns the size of a pointer on the current platform, which is most likely 4 or 8 bytes. So you are allocating at most 8 bytes of device memory and copying the whole image into it!
The actual number of bytes of the image should be calculated as:
int size = step * height;
and
int size2 = step2 * height2;
2: Direction flag and data size of second cudaMemcpy call:
As pointed out in another answer...
cudaMemcpy(data2, d_data2, sizeof(data2), cudaMemcpyHostToDevice);
should be
cudaMemcpy(data2, d_data2, size2, cudaMemcpyDeviceToHost);
3: Type of output image
In the kernel, 3 values are being written to the output in each iteration, while the output image has a single channel. Either write only one value to the output, or create the output image with 3 channels.
The cvSize function should be called as cvSize(width,height) instead of cvSize(height, width) when creating img2.
Also, the kernel is being launched with 1 thread only, and most likely, it would trigger an execution timeout error if the image size is large.
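Putting those fixes together, a corrected version might look roughly like this (a sketch of one possible fix, not a tested drop-in: one thread per pixel with a bounds guard, a single-channel output, and img2 created with cvSize(width, height)):

    __global__ void convertImage(int width, int height, int nchannels, int step,
                                 const uchar *d_data, int step2, uchar *d_data2)
    {
        int j = blockIdx.x * blockDim.x + threadIdx.x;   // column
        int i = blockIdx.y * blockDim.y + threadIdx.y;   // row
        if (i >= height || j >= width) return;           // guard for partial blocks
        int r = d_data[i*step + j*nchannels + 0];
        int g = d_data[i*step + j*nchannels + 1];
        int b = d_data[i*step + j*nchannels + 2];
        d_data2[i*step2 + j] = (uchar)((r + g + b) / 3); // one value per output pixel
    }

    // Host side, replacing the sizeof() allocations and the <<<1,1>>> launch:
    int size  = step * height;    // real byte counts, not sizeof(pointer)
    int size2 = step2 * height2;
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, data, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_data2, size2);
    dim3 threads(16, 16);
    dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);
    convertImage<<<blocks, threads>>>(width, height, nchannels, step, d_data, step2, d_data2);
    cudaMemcpy(data2, d_data2, size2, cudaMemcpyDeviceToHost);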
One thing that I can point out by looking at your code is that in the second call to cudaMemcpy (after convertImage) you should pass in 'cudaMemcpyDeviceToHost' as the flag and NOT 'cudaMemcpyHostToDevice'. You want to get the converted image back from the card.
I am not sure if this is all it will take to get your program working.