Covariance matrix with opencl and opencv - c++

I recently started learning OpenCL and how to write OpenCL kernels for OpenCV.
I am still working on the basics.
I need to implement an OpenCL kernel that calculates the covariance matrix.
OpenCV does not already provide an OpenCL kernel for this function.
Technically, this is what I want to compute:
// Build two random 1 x sz row vectors and stack them into the 2 x sz sample matrix f.
cv::RNG tutu(std::time(nullptr)); // Seeded from the clock so that every execution checks the method on different data
int sz = tutu.uniform(1,20);
cv::Mat_<float> a1(1,sz);
cv::Mat_<float> a2(a1.size());
cv::Mat_<float> c2;
for(std::size_t i=0;i<sz;i++)
{
a1(i) = tutu.uniform(0,300);
a2(i) = tutu.uniform(300,600);
}
cv::Mat_<float> f;
cv::vconcat(a1,a2,f);
// Compute the covariance matrix with two matrix products.
// NOTE(review): `one` is not declared in this snippet; presumably a 2x2 matrix of ones — confirm.
// Step 1: c2 = -0.5 * one * f + f
cv::gemm(one,f,-0.5f,f,1.f,c2,0);
// Step 2: c2 = c2^T * c2 (the output aliases the second input)
cv::gemm(c2.t(),c2,1.f,cv::noArray(),0.f,c2);
I found that OpenCV already implements OpenCL kernels for both gemm and transpose.
So I tried to derive an implementation from an example taken directly from OpenCV's source.
I wrote this:
ocl.h :
void CovarMatrix( cv::Mat_<float>& src,cv::Mat_<float>& covar);
ocl.cpp :
#include <memory>
#include <fstream>
#include <sstream>
#include <iterator>
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
namespace test
{
namespace ocl
{
namespace
{
std::unique_ptr<cv::ocl::ProgramSource> cov_src;
void init_cov()
{
std::ifstream stream("../mahalanobis/covarianceMatrix.cl");
std::ostringstream sstream;
sstream << stream.rdbuf();
cv::String norm_file_content = sstream.str();
stream.close();
cov_src.reset(new cv::ocl::ProgramSource(norm_file_content));
}
}
// Launches the "covarianceMatrix" OpenCL kernel built from cov_src on matA and matB.
// matD is (re)allocated as a square sizeD.width x sizeD.width matrix and bound to the
// kernel's last argument. Returns the result of cv::ocl::Kernel::run.
// cov_src must have been initialised before this is called (it is dereferenced below).
static bool ocl_gemm( cv::Mat_<float>& matA, cv::Mat_<float>& matB, cv::Mat_<float>& CV_OUT matD)
{
// Host-side scratch matrices backing the intermediate device buffers D and E.
cv::Mat_<float> tmp;
cv::Mat_<float> tmp2;
cv::Size sizeA = matA.size(), sizeB = matB.size();
// Size of the product A*B: B.width columns by A.height rows.
cv::Size sizeD(sizeB.width, sizeA.height);
const cv::ocl::Device & dev = cv::ocl::Device::getDefault();
int max_wg_size = (int)dev.maxWorkGroupSize();
// Largest of 32/16/8 whose square fits into the device work-group limit, otherwise 1.
int block_size = (max_wg_size / 32 < 32) ? (max_wg_size / 16 < 16) ? (max_wg_size / 8 < 8) ? 1 : 8 : 16 : 32;
// matD.create(sizeD);
// tmp2.create(matD.t().size());
tmp.create(sizeD);
tmp2.create(tmp.t().size());
matD.create(sizeD.width,sizeD.width);
cv::UMat A = matA.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
cv::UMat B = matB.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
// cv::UMat D = matD.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
cv::UMat D = tmp.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
// cv::UMat E(sizeD.width,sizeD.height,CV_32FC1,cv::Scalar::all(0.),cv::USAGE_ALLOCATE_DEVICE_MEMORY);
cv::UMat E = tmp2.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
cv::UMat F = matD.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
// D starts as a copy of B; presumably it serves as the additive term of the first
// gemm (the build options below define HAVE_C) — TODO confirm against gemm.cl.
matB.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, 1, -1 };
int kercn = cv::ocl::checkOptimalVectorWidth(vectorWidths, B, D);
// NOTE(review): the include path is an absolute path on the author's machine.
cv::String opts = cv::format(
"-I /home/administrateur/lib_dir/opencv_dir/opencv_304/opencv/modules/core/src/opencl/ -D T=float -D T1=float -D WT=%s -D cn=1 -D kercn=%d -D LOCAL_SIZE=%d %s -D HAVE_C -D TILE_DIM=32 -D BLOCK_ROWS=8 -D rowsPerWI=1 ",
cv::ocl::typeToStr(CV_32FC(kercn)),
kercn, block_size,
(sizeA.width % block_size !=0) ? "-D NO_MULT" : "");
cv::ocl::Kernel k("covarianceMatrix", *cov_src, opts);
// Argument order must match the parameter list of the covarianceMatrix kernel.
k.args(cv::ocl::KernelArg::ReadOnlyNoSize(A),
cv::ocl::KernelArg::ReadOnlyNoSize(B, 1, kercn),
cv::ocl::KernelArg::ReadWrite(D, 1, kercn),
sizeA.width,
cv::ocl::KernelArg::ReadWrite(E,kercn,1),
cv::ocl::KernelArg::ReadWrite(F,kercn,kercn)
);
// One work-item per kercn-wide column group and per row of the A*B product.
std::size_t globalsize[2] = { static_cast<std::size_t>(sizeD.width / kercn), static_cast<std::size_t>(sizeD.height)};
std::size_t localsize[2] = { static_cast<std::size_t>(block_size), static_cast<std::size_t>(block_size)};
// With block_size == 1 no explicit local size is passed to run().
return k.run(2, globalsize, block_size!=1 ? localsize : nullptr, false);
}
// Entry point: computes into `covar` the result of the covarianceMatrix OpenCL kernel
// applied to `src`. Any previous content of `covar` is dropped first. The boolean
// result of ocl_gemm is not inspected.
void CovarMatrix( cv::Mat_<float>& src,cv::Mat_<float>& covar)
{
    // Start from an empty output matrix.
    if (!covar.empty())
    {
        covar.release();
    }

    // Square all-ones matrix with one row/column per row of src.
    cv::Mat_<float> all_ones = cv::Mat_<float>::ones(src.rows, src.rows);

    // Load the kernel source lazily, on first use only.
    if (!cov_src)
    {
        init_cov();
    }

    ocl_gemm(all_ones, src, covar);
}
covarianceMatrix.cl :
#include "gemm.cl"
#include "transpose.cl"
// Fused covariance kernel: chains gemm -> transpose -> gemm using the routines pulled in
// from the included gemm.cl and transpose.cl.
//   A, B : inputs of the first product (pointer, step, offset each)
//   D    : output of the first product and second operand of the last product
//   n    : inner dimension passed to both gemm calls
//   E    : receives the transpose of D
//   F    : final output
// NOTE(review): all three stages run inside a single kernel launch and no synchronisation
// between them is visible here; a work-item may read parts of D or E that another
// work-item has not written yet — possibly why results differ for larger inputs. Confirm.
__kernel void covarianceMatrix
(
__global const uchar * A_ptr, int A_step, int A_offset,
__global const uchar * B_ptr, int B_step, int B_offset,
__global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols,
int n,
__global uchar * E_ptr, int E_step, int E_offset, int E_rows, int E_cols,
__global uchar * F_ptr, int F_step, int F_offset, int F_rows, int F_cols
)
{
// Host-side equivalent of the three calls below:
// cv::gemm(src2,src1,-0.5,src1,1.f,src2);
// cv::gemm(src2.t(),src2,1.f,cv::noArray(),0.f,dest);
// Stage 1: gemm on A and B into D with scalars -0.5 and 1.
gemm(A_ptr,A_step,A_offset,
B_ptr,B_step,B_offset,
D_ptr,D_step,D_offset,D_rows,D_cols,
n,-0.5f,1.f);
// Stage 2: transpose D into E; the column count is scaled by sizeof(float) for this call.
transpose(D_ptr,D_step,D_offset,D_rows,D_cols*sizeof(float),
E_ptr,E_step,E_offset);
// Stage 3: gemm on E and D into F with scalars 1 and 0.
// NOTE(review): the same inner dimension n as in stage 1 is reused here — confirm it is correct.
gemm(E_ptr,E_step,E_offset,
D_ptr,D_step,D_offset,
F_ptr,F_step,F_offset,F_rows,F_cols,
n,1.f,0.f);
}
If the size of the matrix is smaller than 6, it works perfectly :).
Otherwise ... not really.
It can be checked with this code:
// Check harness: compares test::ocl::CovarMatrix against cv::calcCovarMatrix on random data.
cv::RNG tutu(std::time(nullptr));
int sz = tutu.uniform(1,20);
// Two random row vectors of length sz, stacked into the 2 x sz matrix f.
cv::Mat_<float> a1(1,sz);
cv::Mat_<float> a2(a1.size());
for(std::size_t i=0;i<sz;i++)
{
a1(i) = tutu.uniform(0,300);
a2(i) = tutu.uniform(300,600);
}
cv::Mat_<float> f;
cv::vconcat(a1,a2,f);
cv::Mat_<float> c1;
cv::Mat_<float> c2;
cv::Mat_<float> mean;
// reference
cv::calcCovarMatrix(f,c1,mean,cv::COVAR_ROWS | cv::COVAR_NORMAL,CV_32F);
// check
test::ocl::CovarMatrix(f,c2);
// Count the elements that are bit-for-bit equal in both results.
// NOTE(review): exact float equality may undercount matches that differ only by rounding;
// the loop also assumes c2 has at least as many elements as c1.
std::size_t cnt(0.f);
for(auto it = c1.begin(),it2 = c2.begin();it != c1.end();it++,it2++)
if(*it == *it2)
cnt++;
// Prints "check <matching elements> <total elements>".
std::cout<<"check "<<cnt<<" "<<c1.total()<<std::endl;
I am still new to OpenCL and I am interested to know what I did wrong.
Has someone already implemented an OpenCL kernel that computes the covariance matrix with OpenCV?
Thanks in advance for any help.

I resigned myself to writing this code:
// Computes the (unscaled) covariance matrix of `src` through OpenCV's transparent API
// (cv::UMat), so the matrix products can be dispatched to OpenCL when it is available.
//
//   centered = src - (1/rows) * ones(rows, rows) * src
//   covar    = centered^T * centered
//
// @param src   input samples.
// @param covar receives the result; released and left empty when `src` is empty.
void ocl_CovarMatrix(cv::Mat_<float>& src,cv::Mat_<float>& covar)
{
    // Nothing to compute, and -1/rows below would divide by zero.
    if (src.empty())
    {
        covar.release();
        return;
    }

    const cv::UMat usrc = src.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    const cv::UMat ones = cv::UMat::ones(usrc.rows,usrc.rows,usrc.type());
    const double alpha = -1. / static_cast<double>(usrc.rows);

    // centered = alpha * ones * usrc + 1 * usrc
    cv::UMat centered;
    cv::gemm(ones,usrc,alpha,usrc,1.,centered);

    // GEMM_1_T transposes the first operand inside gemm, so no explicit transpose is
    // materialised, and writing to a separate matrix avoids aliasing input and output.
    cv::UMat ucovar;
    cv::gemm(centered,centered,1.,cv::noArray(),0.,ucovar,cv::GEMM_1_T);

    ucovar.copyTo(covar);
    // The UMat locals release their buffers automatically when they go out of scope.
}
I suspect that GPU memory is synchronized every time a function is called, which makes this code slower than it would be if it were written as a single kernel.
But it works efficiently.
I am still interested in another solution if someone has an idea.

Related

Assertion failed (trackbar) in getTrackbarPos

How to fix the following error?
OpenCV: terminate handler is called! The last OpenCV error is:
OpenCV(4.5.5) Error: Assertion failed (trackbar) in getTrackbarPos,
file C:\opencv-4.5.5\sources\modules\highgui\src\window.cpp, line 862
#include <cstdlib>
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
// Linearly maps a slider position `pos` in [0, TICKS] onto the interval [min, max].
// Arithmetic is carried out in the promoted type of T, so integral T uses integer division.
template <typename T>
T PosToActual(T min, T max, int pos, int TICKS)
{
    const auto span = max - min;
    const auto offset = span * pos / TICKS;
    return min + offset;
}
void Refresh(int, void *data)
{
Mat *original = (Mat *)data;
Mat modified;
vector<Mat> channels;
split(*original, channels);
int b = cv::getTrackbarPos("Blue", "Modified");
int g = cv::getTrackbarPos("Green", "Modified");
int r = cv::getTrackbarPos("Red", "Modified");
channels[0] *= PosToActual<float>(0, 1, b, 10);
channels[1] *= PosToActual<float>(0, 1, g, 10);
channels[2] *= PosToActual<float>(0, 1, r, 10);
merge(channels, modified);
imshow("Modified", modified);
}
int main()
{
const string filename = "family.jpg";
Mat original = imread(filename);
// Mat modified = Mat::zeros(original.size(), original.type());
cv::namedWindow("Modified", WINDOW_NORMAL);
int bFactor = 5;
int gFactor = 5;
int rFactor = 5;
createTrackbar("Blue", "Modified", &bFactor, 10, Refresh, &original);
createTrackbar("Green", "Modified", &gFactor, 10, Refresh, &original);
createTrackbar("Red", "Modified", &rFactor, 10, Refresh, &original);
Refresh(0, &original);
waitKey();
}
Edit:
This issue only occurs when using OpenCV for Windows (prebuilt) version
4.5.3
4.5.4
4.5.5
It works just fine for version 4.5.2. I have not checked for version lower than 4.5.2 yet because I am not interested in using any lower versions.

Using Tensorflow Lite C, Digit Classifier

Using Tensorflow Lite, on Android,
Image classification was successful.
Example: https://github.com/tensorflow/examples/tree/master/lite/codelabs/digit_classifier
core : https://github.com/tensorflow/examples/blob/master/lite/codelabs/digit_classifier/android/finish/app/src/main/java/org/tensorflow/lite/codelabs/digitclassifier/DigitClassifier.kt
I want to convert this project to C.
The image(.jpg) will be read using opencv.
The model file is (.tflite)
Give me some advice.
This is my program code.
#include <iostream>
#include <iomanip>
#include <fstream>
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/optional_debug_tools.h"
#include "opencv2/opencv.hpp"
using namespace std;
typedef cv::Point3_<float> Pixel;
const uint WIDTH = 28;
const uint HEIGHT = 28;
const uint CHANNEL = 3;
const uint OUTDIM = 10;
// Divides each of the three components of `pixel` by 255 in place.
void normalize(Pixel &pixel){
    pixel.x /= 255.0;
    pixel.y /= 255.0;
    pixel.z /= 255.0;
}
int main(){
std::vector<std::string> labels;
auto file_name="labels.txt";
std::ifstream input( file_name );
for( std::string line; getline( input, line ); )
{
labels.push_back( line);
}
// read image file
cv::Mat img = cv::imread("sample2.jpg");
cv::Mat inputImg;
img.convertTo(inputImg, CV_32FC3);
cv::cvtColor(inputImg, inputImg, cv::COLOR_BGR2RGB);
// normalize to -1 & 1
Pixel* pixel = inputImg.ptr<Pixel>(0,0);
const Pixel* endPixel = pixel + inputImg.cols * inputImg.rows;
for (; pixel != endPixel; pixel++)
normalize(*pixel);
// resize image as model input
cv::resize(inputImg, inputImg, cv::Size(WIDTH, HEIGHT));
// create model
std::unique_ptr<tflite::FlatBufferModel> model =
tflite::FlatBufferModel::BuildFromFile("mnist.tflite");
tflite::ops::builtin::BuiltinOpResolver resolver;
std::unique_ptr<tflite::Interpreter> interpreter;
tflite::InterpreterBuilder(*model.get(), resolver)(&interpreter);
interpreter->AllocateTensors();
float* inputLayer = interpreter->typed_input_tensor<float>(0);
float* inputImg_ptr = inputImg.ptr<float>(0);
memcpy(inputLayer, inputImg.ptr<float>(0),
WIDTH * HEIGHT * CHANNEL * sizeof(float));
interpreter->Invoke();
float* outputLayer = interpreter->typed_output_tensor<float>(0);
// TODO
return 0;
}
Summary of this question : Android -> C Convert
The result I wantPrediction Result : 6Confidence: 0~1
.tflite model file is here

Using extern on Halide with GPU

I am trying to use an extern function in Halide. In my context, I want to do it on the GPU.
I compile ahead-of-time (AOT) with the OpenCL target.
Of course, OpenCL can still use the CPU, so I use this:
halide_set_ocl_device_type("gpu");
For now, everything is scheduled with compute_root().
First question: if I use compute_root() with the OpenCL GPU device, will my pipeline be computed on the device, with some host-to-device and device-to-host copies? (Or will it run on the host buffer?)
Second question, more related to extern functions: we use some extern calls because some of our algorithms are not written in Halide.
Extern call:
foo.define_extern("cool_foo", args, Float(32), 4);
Extern retrieve:
extern "C" int cool_foo(buffer_t * in, int w, int h, int z, buffer_t * out){ .. }
But, in the cool_foo function, my buffer_t are load only in host memory. The dev address is 0 (default).
If I try to copy the memory before the algorithm:
halide_copy_to_dev(NULL, &in);
It does nothing.
If I make available only the device memory:
in.host = NULL;
My host pointer are null, but the device address is still 0.
(dev_dirty is true on my case and host_dirty is false)
Any idea?
EDIT (To answer dsharlet)
Here's the structure of my code:
Parse data correctly on CPU. --> Sent the buffer on the GPU (Using halide_copy_to_dev...) --> Enter in Halide structure, read parameter and Add a boundary condition --> Go in my extern function -->...
I don't have a valid buffer_t in my extern function.
I schedule everything in compute_root(), but use HL_TARGET=host-opencl and set ocl to gpu.
Before entering in Halide, I can read my device address and it's ok.
Here's my code:
Before Halide, everything was CPU stuff(The pointer) and we transfert it to GPU
buffer_t k = { 0, (uint8_t *) k_full, {w_k, h_k, num_patch_x * num_patch_y * 3}, {1, w_k, w_k * h_k}, {0}, sizeof(float), };
#if defined( USEGPU )
// Transfer into GPU
halide_copy_to_dev(NULL, &k);
k.host_dirty = false;
k.dev_dirty = true;
//k.host = NULL; // It's k_full
#endif
halide_func(&k)
Inside Halide:
ImageParam ...
Func process;
process = halide_sub_func(k, width, height, k.channels());
process.compute_root();
...
// Builds the sub-pipeline around the extern stage: pads and tiles the input k, shifts it by
// half its size, feeds it to extern_func and packs two slices of the result into a Tuple.
// The shifted input and the extern stage are both scheduled compute_root().
// NOTE(review): x, y, pi and extern_func are declared outside this snippet.
Func halide_sub_func(ImageParam k, Expr width, Expr height, Expr patches)
{
Func kBounded("kBounded"), kShifted("kShifted"), khat("khat"), khat_tuple("khat_tuple");
// Zero outside the image, then repeated over [0,width) x [0,height) x [0,patches).
kBounded = repeat_image(constant_exterior(k, 0.0f), 0, width, 0, height, 0, patches);
// Shift by half the input's width and height.
kShifted(x, y, pi) = kBounded(x + k.width() / 2, y + k.height() / 2, pi);
khat = extern_func(kShifted, width, height, patches);
// Pair up indices 0 and 1 of the first dimension of khat
// (presumably real and imaginary parts — TODO confirm).
khat_tuple(x, y, pi) = Tuple(khat(0, x, y, pi), khat(1, x, y, pi));
kShifted.compute_root();
khat.compute_root();
return khat_tuple;
}
Outside Halide(Extern function):
inline ....
{
//The buffer_t.dev and .host are 0 and null. I expect a null from the host, but the dev..
}
I find the solution for my problem.
I post the answer in code just here. (Since I did a little offline test, the variable name doesn't match)
Inside Halide: (Halide_func.cpp)
#include <Halide.h>
using namespace Halide;
using namespace Halide::BoundaryConditions;
Func thirdPartyFunction(ImageParam f);
Func fourthPartyFunction(ImageParam f);
Var x, y;
// Ahead-of-time generator: builds one of four demo pipelines over the 2-D float input "f"
// and writes it out as "halide_func".
//   argv[1] == 1 : f + 1, GPU-tiled
//   argv[1] == 2 : repeat_image boundary + shifted read, GPU-tiled
//   argv[1] == 3 : thirdPartyFunction, GPU-tiled
//   otherwise    : fourthPartyFunction (pipeline with an extern stage)
// Returns 1 when the test selector argument is missing.
int main(int argc, char **argv) {
    // Input:
    ImageParam f( Float( 32 ), 2, "f" );
    printf(" Argument: %d\n",argc);

    // argv[1] is mandatory; reading it without this check is undefined behaviour.
    if (argc < 2) {
        printf("Usage: %s <test: 1|2|3|other>\n", argc > 0 ? argv[0] : "halide_func_generator");
        return 1;
    }
    const int test = atoi(argv[1]);

    // Every variant takes the single image parameter f.
    std::vector<Argument> args( 1 );
    args[ 0 ] = f;

    if (test == 1) {
        Func f1;
        f1(x, y) = f(x, y) + 1.0f;
        f1.gpu_tile(x, 256);
        f1.compile_to_file("halide_func", args);
    } else if (test == 2) {
        Func fOutput("fOutput");
        Func fBounded("fBounded");
        fBounded = repeat_image(f, 0, f.width(), 0, f.height());
        fOutput(x, y) = fBounded(x-1, y) + 1.0f;
        fOutput.gpu_tile(x, 256);
        fOutput.compile_to_file("halide_func", args);
    } else if (test == 3) {
        Func h("hOut");
        h = thirdPartyFunction(f);
        h.gpu_tile(x, 256);
        h.compile_to_file("halide_func", args);
    } else {
        // The extern pipeline carries its own schedule, so no gpu_tile is applied here.
        Func h("hOut");
        h = fourthPartyFunction(f);
        h.compile_to_file("halide_func", args);
    }
    return 0;
}
// Two-stage demo pipeline over f with a repeat_image boundary: the first stage reads the
// left neighbour and adds 1, the second subtracts 1 again. No schedule is applied here.
Func thirdPartyFunction(ImageParam f) {
    Func plusOne("g");
    Func wrapped("fBounded");
    Func result("h");

    // Boundary: tile the input over its own extent.
    wrapped = repeat_image(f, 0, f.width(), 0, f.height());

    plusOne(x, y) = wrapped(x-1, y) + 1.0f;
    result(x, y) = plusOne(x, y) - 1.0f;

    // These must stay commented out if you want to use the GPU schedule.
    //g.compute_root(); //At least one stage schedule alone
    //h.compute_root();
    return result;
}
// Demo pipeline with an extern stage: a GPU-scheduled preprocessing stage g feeds the
// extern function "extern_func", whose result h is returned.
Func fourthPartyFunction(ImageParam f) {
Func fBounded("fBounded");
Func g("g");
Func h("h");
//Boundary
fBounded = repeat_image(f, 0, f.width(), 0, f.height());
// Preprocess
g(x, y) = fBounded(x-1, y) + 1.0f;
// The stage feeding the extern call is scheduled explicitly: compute_root plus a GPU tiling.
g.compute_root();
g.gpu_tile(x, y, 256, 1);
// Extern
// Arguments handed to the extern function: the Func g, then the input width and height.
std::vector < ExternFuncArgument > args = { g, f.width(), f.height() };
// NOTE(review): the extern output is declared as 16-bit int with 3 dimensions while g is a
// 2-D float Func — confirm this is intended.
h.define_extern("extern_func", args, Int(16), 3);
h.compute_root();
return h;
}
The external function: (external_func.h)
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cinttypes>
#include <cstring>
#include <fstream>
#include <map>
#include <vector>
#include <complex>
#include <chrono>
#include <iostream>
#include <clFFT.h> // All OpenCL I need are include.
using namespace std;
// Useful stuff.
// Debug helper: dumps the first two dimensions (extent, stride, min), the element size,
// the dirty flags and the host/device handles of a buffer_t to standard output.
void completeDetails2D(buffer_t buffer) {
    // Read all elements:
    std::cout << "Buffer information:" << std::endl
              << "Extent: " << buffer.extent[0] << ", " << buffer.extent[1] << std::endl
              << "Stride: " << buffer.stride[0] << ", " << buffer.stride[1] << std::endl
              << "Min: " << buffer.min[0] << ", " << buffer.min[1] << std::endl
              << "Elem size: " << buffer.elem_size << std::endl
              << "Host dirty: " << buffer.host_dirty << ", Dev dirty: " << buffer.dev_dirty << std::endl;
    printf("Host pointer: %p, Dev pointer: %" PRIu64 "\n\n\n", buffer.host, buffer.dev);
}
extern cl_context _ZN6Halide7Runtime8Internal11weak_cl_ctxE;
extern cl_command_queue _ZN6Halide7Runtime8Internal9weak_cl_qE;
// Extern stage called by the Halide pipeline. Prints both buffers, then either answers a
// bounds request (when the input has no device handle) or copies the input device buffer
// to the output device buffer through OpenCL. Always returns 0.
extern "C" int extern_func(buffer_t * in, int width, int height, buffer_t * out)
{
printf("In extern\n");
completeDetails2D(*in);
printf("Out extern\n");
completeDetails2D(*out);
// A zero device handle is treated as a bounds-inference call: report the required
// input region (the full width x height image) and do no work.
if(in->dev == 0) {
// Boundary stuff
in->min[0] = 0;
in->min[1] = 0;
in->extent[0] = width;
in->extent[1] = height;
return 0;
}
// Super awesome stuff on GPU
// ...
// The OpenCL context and queue are taken from the mangled Halide runtime symbols declared above.
cl_context & ctx = _ZN6Halide7Runtime8Internal11weak_cl_ctxE; // Found by zougloub
cl_command_queue & queue = _ZN6Halide7Runtime8Internal9weak_cl_qE; // Same
printf("ctx: %p\n", ctx);
printf("queue: %p\n", queue);
// The dev fields are reinterpreted as OpenCL memory objects.
cl_mem buffer_in;
buffer_in = (cl_mem) in->dev;
cl_mem buffer_out;
buffer_out = (cl_mem) out->dev;
// Just copying data from one buffer to another
// NOTE(review): the byte count is hard-coded to 256*256*4 (presumably a 256x256 buffer of
// 4-byte elements) rather than derived from the buffer extents — confirm.
int err = clEnqueueCopyBuffer(queue, buffer_in, buffer_out, 0, 0, 256*256*4, 0, NULL, NULL);
printf("copy: %d\n", err);
err = clFinish(queue);
printf("finish: %d\n\n", err);
// OpenCL error codes are only printed; they are not propagated to the caller.
return 0;
}
Finally, the non-Halide stuff: (Halide_test.cpp)
#include <halide_func.h>
#include <iostream>
#include <cinttypes>
#include <external_func.h>
// Extern function available inside the .o generated.
#include "HalideRuntime.h"
// Test driver: fills a 256x256 float image, uploads input and output buffers to the
// device, runs the generated halide_func with the host pointers hidden, copies the
// buffers back and prints the first ten output values.
int main(int argc, char **argv) {
// Init the kernel in GPU
halide_set_ocl_device_type("gpu");
// Create a buffer
int width = 256;
int height = 256;
// NOTE(review): the malloc results are not checked.
float * bufferHostIn = (float*) malloc(sizeof(float) * width * height);
float * bufferHostOut = (float*) malloc(sizeof(float) * width * height);
// Input pattern: value at (i, j) is i + j.
for( int j = 0; j < height; ++j) {
for( int i = 0; i < width; ++i) {
bufferHostIn[i + j * width] = i+j;
}
}
// Field order presumably follows buffer_t: dev, host, extent, stride, min, elem_size,
// host_dirty, dev_dirty — TODO confirm against HalideRuntime.h.
buffer_t bufferHalideIn = {0, (uint8_t *) bufferHostIn, {width, height}, {1, width, width * height}, {0, 0}, sizeof(float), true, false};
buffer_t bufferHalideOut = {0, (uint8_t *) bufferHostOut, {width, height}, {1, width, width * height}, {0, 0}, sizeof(float), true, false};
printf("IN\n");
completeDetails2D(bufferHalideIn);
printf("Data (host): ");
for(int i = 0; i < 10; ++ i) {
printf(" %f, ", bufferHostIn[i]);
}
printf("\n");
printf("OUT\n");
completeDetails2D(bufferHalideOut);
// Send to GPU
halide_copy_to_dev(NULL, &bufferHalideIn);
halide_copy_to_dev(NULL, &bufferHalideOut);
// Flip the dirty bits by hand so that the device copy is marked as the modified one.
bufferHalideIn.host_dirty = false;
bufferHalideIn.dev_dirty = true;
bufferHalideOut.host_dirty = false;
bufferHalideOut.dev_dirty = true;
// TRICKS Halide to force the use of device.
bufferHalideIn.host = NULL;
bufferHalideOut.host = NULL;
printf("IN After device\n");
completeDetails2D(bufferHalideIn);
// Halide function
halide_func(&bufferHalideIn, &bufferHalideOut);
// Get back to HOST
// Restore the host pointers first so the copies back have a destination.
bufferHalideIn.host = (uint8_t*)bufferHostIn;
bufferHalideOut.host = (uint8_t*)bufferHostOut;
halide_copy_to_host(NULL, &bufferHalideOut);
halide_copy_to_host(NULL, &bufferHalideIn);
// Validation
printf("\nOUT\n");
completeDetails2D(bufferHalideOut);
printf("Data (host): ");
for(int i = 0; i < 10; ++ i) {
printf(" %f, ", bufferHostOut[i]);
}
printf("\n");
// Free all
// Only the host allocations are freed here; no device-side release call is made in this function.
free(bufferHostIn);
free(bufferHostOut);
}
You can compile the halide_func with the test 4 to use all the Extern functionnality.
Here's some of the conclusion I have. (Thanks to Zalman and zougloub)
compute_root() alone does not make a stage run on the device.
We need gpu() or gpu_tile() in the schedule to call the GPU routines. (BTW, you need to put all your variables inside.)
A gpu_tile smaller than your item will crash your stuff.
BoundaryCondition works well in GPU.
Before calling extern function, the Func that goes as a input need to be:
f.compute_root(); f.gpu_tile(x,y,...,...); The compute_root in the middle stage is not implicit.
If the dev address is 0, it's normal, we resend the dimension and the extern will be called again.
Last stage as a compute_root() implicit.
Are you aware of the bounds inference protocol for external array functions? This takes place when the host pointer of any buffer is NULL. (Briefly, in this case, you need to fill in the extent fields of the buffer_t structures that have NULL host pointers and do nothing else.) If you have already taken care of that, then ignore the above.
If you've tested that the host pointers are non-NULL for all buffers, then calling halide_copy_to_dev should work. You may need to explicitly set host_dirty to true beforehand to get the copy part to happen, depending where the buffer came from. (I would hope Halide gets this right and it is already set if the buffer came from a previous pipeline stage on the CPU. But if the buffer came from something outside Halide, the dirty bits are probably false from initialization. It seems halide_dev_malloc should set dev_dirty if it allocates device memory, and currently it does not.)
I would expect the dev field to be populated after a call to halide_copy_to_dev as the first thing it does is call halide_dev_malloc. You can try calling halide_dev_malloc explicitly yourself, setting host_dirty and then calling halide_copy_to_dev.
Is the previous stage on the host or on the GPU? If it is on the GPU, I'd expect the input buffer to be on the GPU as well.
This API needs work. I am in the middle of a first refactoring of somethings that will help, but ultimately it will require changing the buffer_t structure. It is possible to get most things to work, but it requires a modifying the host_dirty and dev_dirty bits as well as calling the halide_dev* APIs in just the right way. Thank you for your patience.

Boost::mpi sending array

Good Morning
I'm implementing a distributed image normalization algorithm and I'm using Boost::mpi with a class Pixel that contains the serialization code:
#ifndef PIXEL_H
#define PIXEL_H
#include <boost/mpi.hpp>
#include <boost/serialization/access.hpp>
// An RGB pixel with one unsigned byte per channel, serializable with Boost.Serialization
// so that it can be transmitted through Boost.MPI.
class Pixel
{
private:
unsigned char m_red;
unsigned char m_green;
unsigned char m_blue;
// Grants Boost.Serialization access to the private serialize() below.
friend class boost::serialization::access;
// Serializes (or deserializes, depending on the archive) the three channels in
// red, green, blue order. `version` is not used.
template <class Archive>
void serialize(Archive &ar, const unsigned int version) {
ar & m_red;
ar & m_green;
ar & m_blue;
}
public:
// Default constructor; defined outside this header.
Pixel();
// Builds a pixel from explicit channel values.
Pixel(unsigned char red,unsigned char green,unsigned char blue) : m_red(red), m_green(green), m_blue(blue) {};
virtual ~Pixel();
// Per-channel accessors and mutators (defined outside this header).
unsigned char getRed();
void setRed(unsigned char val);
unsigned char getGreen();
void setGreen(unsigned char val);
unsigned char getBlue();
void setBlue(unsigned char val);
// Sets the three channels at once (defined outside this header).
void setColor (unsigned char red,unsigned char green,unsigned char blue);
};
The main.cpp is
#include <iostream>
#include <boost/mpi.hpp>
#include <vector>
#include "include/Pixel.h"
#include <cstdlib>
#include <ctime>
#define ALTEZZA 2
#define LARGHEZZA 2
namespace mpi=boost::mpi;
int main(int argc, char * argv[]) {
std::cout<<"Inizializzazione dell'ambiente MPI"<<std::endl;
mpi::environment env;
mpi::communicator world;
Pixel **vettore;
int i,j;
//Inizializzazione della matrice di test
if(world.rank() == 0){
std::cout<<"Inizializzazione matrice di test..."<<std::endl;
std::srand(std::time(0));
vettore = new Pixel *[ALTEZZA];
for (i = 0; i < ALTEZZA; i++) {
vettore[i] = new Pixel[LARGHEZZA];
}
for (i = 0; i < ALTEZZA; i++) {
for (j = 0; j < LARGHEZZA; j++) {
vettore[i][j].setColor(std::rand() % 256, std::rand() % 256, std::rand() % 256);
std::cout<<"Vettore["<<i<<"]["<<j<<"] = ("<<int(vettore[i][j].getRed())<<","<<int(vettore[i][j].getGreen())<<","<<int(vettore[i][j].getBlue())<<");"<<std::endl;
}
}
}
if (world.rank() == 0) {
std::cout<<"Invio matrice.."<<std::endl;
world.send(1, 0, vettore[0]);
}else {
Pixel *px;
world.recv(0, 0, px);
for (j = 0; j < LARGHEZZA; j++) {
std::cout<<int(px[j].getRed())<<" "<<int(px[j].getGreen())<<" "<<int(px[j].getBlue())<<std::endl;
}
}
return 0;
}
But when I run the program, the cout on the receiving process prints wrong values, like this:
Inizializzazione dell'ambiente MPI
Inizializzazione dell'ambiente MPI
Inizializzazione matrice di test...
Vettore[0][0] = (170,103,165);
Vettore[0][1] = (84,0,186);
Vettore[1][0] = (93,228,162);
Vettore[1][1] = (31,100,204);
Invio matrice..
170 103 165
217 1 0
I think that the problem is the 2D array, because if I use std::vector I don't have this problem, but I don't understand why.
I would imagine you have several problems (I can't test as I don't have a capable MPI installation..)
Firstly, your send() is wrong, currently you are triggering the overload:
template<typename T> void send(int, int, const T &) const;
But you are trying to send a raw array, I imagine the fix here has to be to pass the count, for example:
world.send(1, 0, vettore[0], 2); // 2 Pixels
Secondly, on the receiver side (this I'm not sure about), but I imagine you need to have a suitable array to read the data into.., for example:
Pixel px[LARGHEZZA];
world.recv(0, 0, px, 2);
I think this should fix your problems...

RGB to greyscale conversion using CUDA

So I am trying to write a program that turns RGB images to greyscale.
I got the idea from the Udacity problem set. The problem is that when I write out the kernel in the Udacity web environment, it says my code works, however, when I try to do it locally on my computer, I get no errors, but my image instead of coming out greyscale, comes out completely grey. It looks like one grey box the dimensions of the image I loaded. Can you help me find the error in my code, I've compared it with the Udacity version and I can't seem to find it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <cuda.h>
#include <stdio.h>
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <iostream>
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
// Aborts the program with a message on stderr when `err` is not cudaSuccess.
// `file` and `line` identify the call site (supplied by the CudaSafeCall macro).
// Compiles to a no-op unless CUDA_ERROR_CHECK is defined.
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    if (err == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
            file, line, cudaGetErrorString(err));
    exit(-1);
#endif
}
// Checks the last recorded CUDA error and then the result of a device synchronisation.
// On either failure, prints the call site and error string to stderr and exits.
// Compiles to a no-op unless CUDA_ERROR_CHECK is defined.
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    // First: the error recorded by the most recent runtime call or launch.
    const cudaError lastStatus = cudaGetLastError();
    if (lastStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString(lastStatus));
        exit(-1);
    }

    // Second: the result of synchronising with the device.
    const cudaError syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                file, line, cudaGetErrorString(syncStatus));
        exit(-1);
    }
#endif
}
// Converts an RGBA image to greyscale, one thread per pixel.
//   greyImage : output, rows * columns bytes
//   rgbImage  : input, rows * columns uchar4 pixels (x, y, z are weighted as R, G, B)
//   rows, columns : image dimensions in pixels
__global__ void rgb_2_grey(uchar* const greyImage, const uchar4* const rgbImage, int rows, int columns)
{
    const int rgb_x = blockIdx.x * blockDim.x + threadIdx.x; //x coordinate of pixel
    const int rgb_y = blockIdx.y * blockDim.y + threadIdx.y; //y coordinate of pixel

    // A thread is outside the image as soon as EITHER coordinate is out of range. With
    // `&&` only threads outside on both axes returned, so the rest read and wrote past
    // the row or image bounds.
    if ((rgb_x >= columns) || (rgb_y >= rows)) {
        return;
    }

    const int rgb_ab = rgb_y*columns + rgb_x; //absolute pixel position
    const uchar4 rgb_Img = rgbImage[rgb_ab];
    // Weighted sum of the three colour components.
    greyImage[rgb_ab] = uchar((float(rgb_Img.x))*0.299f + (float(rgb_Img.y))*0.587f + (float(rgb_Img.z))*0.114f);
}
using namespace cv;
using namespace std;
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage);
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols);
void Save_Img();
Mat img_RGB;
Mat img_Grey;
uchar4 *d_rgbImg;
uchar *d_greyImg;
// Program entry: loads the image and uploads it, runs the greyscale conversion on the
// device, then downloads and saves the result.
int main()
{
    // Host-side views of the image data; filled in by Proc_Img.
    uchar4* hostRgb;
    uchar* hostGrey;

    Proc_Img(&hostRgb, &hostGrey, &d_rgbImg, &d_greyImg);
    RGB_2_Greyscale(d_greyImg, d_rgbImg, img_RGB.rows, img_RGB.cols);
    Save_Img();

    return 0;
}
// Loads the source image, converts it to RGBA, allocates the greyscale host image and the
// two device buffers, and uploads the RGBA pixels.
//   h_RGBImage / h_greyImage : receive pointers to the host pixel data of img_RGB / img_Grey
//   d_RGBImage / d_greyImage : receive the device allocations (also stored in the globals)
// Exits the process if the image cannot be read or a CUDA call fails.
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage){
// Presumably forces CUDA context creation before any real work — TODO confirm.
cudaFree(0);
CudaCheckError();
// Load the image into a matrix object; colours arrive in BGR order and are converted below.
Mat img = imread("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG", CV_LOAD_IMAGE_COLOR);
if (img.empty()){
cerr << "couldnt open file dumbas..." << "C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG" << endl;
exit(1);
}
// Convert the colour layout from BGR to RGBA (four channels, matching uchar4).
cvtColor(img, img_RGB, CV_BGR2RGBA);
// Allocate memory for the new greyscale image.
// img.rows is the number of pixels in y, img.cols the number of pixels in x.
// CV_8UC1 means 8-bit unsigned, single channel, i.e. greyscale.
// Together these three parameters let Mat::create determine how much memory to allocate.
img_Grey.create(img.rows, img.cols, CV_8UC1);
// Expose the RGBA and greyscale pixel data as raw arrays.
*h_RGBImage = (uchar4*)img_RGB.ptr<uchar>(0); // ptr<uchar>(0) returns a pointer to the first element of the matrix.
*h_greyImage = (uchar*)img_Grey.ptr<uchar>(0); // Same for the greyscale image: the address of its first element.
// ptr is templated; here it returns an unsigned char pointer. For the RGBA image the
// pointer is cast to uchar4 so that each element holds the r, g, b (and alpha) values.
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols); // number of pixels
// Allocate memory on the GPU.
cudaMalloc(d_RGBImage, sizeof(uchar4) * num_pix); // bytes per uchar4 times the number of pixels
CudaCheckError();
cudaMalloc(d_greyImage, sizeof(uchar) * num_pix); // bytes per uchar times the number of pixels
CudaCheckError();
// Zero the device output buffer.
cudaMemset(*d_greyImage, 0, sizeof(uchar) * num_pix);
CudaCheckError();
// Copy the RGBA pixels into the allocated device buffer.
cudaMemcpy(*d_RGBImage, *h_RGBImage, sizeof(uchar4)*num_pix, cudaMemcpyHostToDevice);
CudaCheckError();
// Mirror the device pointers into the file-level globals read by Save_Img.
d_rgbImg = *d_RGBImage;
d_greyImg = *d_greyImage;
}
// Launches rgb_2_grey over the whole image with 16x16 thread blocks, then waits for the
// device and checks for errors.
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols){
    const int tile = 16;
    const dim3 threadsPerBlock(tile, tile);
    // One extra block per axis so that the image edges are always covered.
    const dim3 blocksPerGrid((num_Cols / tile) + 1, (num_Rows / tile) + 1);

    rgb_2_grey <<<blocksPerGrid, threadsPerBlock>>>(d_greyImage, d_RGBImage, num_Rows, num_Cols);

    cudaDeviceSynchronize();
    CudaCheckError();
}
// Downloads the greyscale result into img_Grey, writes it to disk and frees both device buffers.
void Save_Img(){
    const size_t pixelCount = (img_RGB.rows) * (img_RGB.cols);
    uchar* const hostGrey = img_Grey.ptr<uchar>(0);

    cudaMemcpy(hostGrey, d_greyImg, sizeof(uchar)*pixelCount, cudaMemcpyDeviceToHost);
    CudaCheckError();

    imwrite("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581GR.JPG", img_Grey);

    cudaFree(d_rgbImg);
    cudaFree(d_greyImg);
}
EDIT: I realized that the local var in my main is the same name as the global var, I have edited the code here, now I get the error from visual studio that the
variable d_rgbIme is being used without being initialized
when I have already initialized it above. If I set them equal to zero I get a CUDA error saying
an illegal memory access was encountered
I tried running cuda-memcheck, but then I get the error that i could not run the file...
I have found the error thanks to one of the comments by Robert Crovella, who has been very helpful with this! It is in my kernel: the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {
I was working on the same problem in JCUDA. See if you can use any part of this solution:
//Read Height and Width of image in Height & Width variables
// Host-side preparation: split the image into per-channel int arrays and build the
// kernel parameter list.
// NOTE(review): `image`, redDev, greenDev, blueDev and Output are defined outside this
// fragment; the upload of the host arrays to the device is not shown.
int Width = image.getWidth();
int Height = image.getHeight();
int N = Height * Width;
// Receives the greyscale result, one int per pixel.
int[] grayScale = new int[N];
// Separate arrays for the Alpha, Red, Green and
// Blue values of every pixel
int[] redHost = new int[N];
int[] greenHost = new int[N];
int[] blueHost = new int[N];
int[] alphaHost = new int[N];
for(int i=0; i<Height; i++)
{
for(int j=0; j<Width; j++)
{
int pixel = image.getRGB(j, i);
// Unpack the ARGB data: 8 bits per channel, alpha in the top byte.
alphaHost[i*Width+j] = (pixel >> 24) & 0xff;
redHost[i*Width+j] = (pixel >> 16) & 0xff;
greenHost[i*Width+j] = (pixel >> 8) & 0xff;
blueHost[i*Width+j] = (pixel) & 0xff;
}
}
/* CUDA kernel parameters, in the order of the kernel's parameter list */
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{N}), // Total size of each array, W * H
Pointer.to(redDev), // Pointer to the red array on the device
Pointer.to(greenDev), // Pointer to the green array on the device
Pointer.to(blueDev), // Pointer to the blue array on the device
Pointer.to(Output)); // Pointer to the output array
/*Following is my RGBToGrayScale.cu..i.e. CUDA Kernel */
// One thread per pixel: writes the weighted sum of the red, green and blue planes to
// Output. Threads with an index of N or more do nothing.
__global__ void RGBtoGrayScale(int N, int *red, int *green, int *blue, int *Output)
{
    const int pixelIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (pixelIndex >= N)
    {
        return;
    }
    Output[pixelIndex] = (red[pixelIndex]*0.2989) + (green[pixelIndex]*0.587) + (blue[pixelIndex]*0.114);
}
/* Get the output data back to Host memory */
// Copy N ints of kernel output from the device into the host array grayScale.
cuMemcpyDtoH(Pointer.to(grayScale), Output, N * Sizeof.INT);
/* Write the image with the new grey values */
BufferedImage im = new BufferedImage(Width,Height,BufferedImage.TYPE_BYTE_GRAY);
WritableRaster raster = im.getRaster();
// Store each grey value in band 0 of the raster, row by row.
for(int i=0;i<Height;i++)
{
for(int j=0;j<Width;j++)
{
raster.setSample(j, i, 0, grayScale[i*Width+j]);
}
}
// Encode the result as JPEG; an I/O failure is only logged via its stack trace.
try
{
ImageIO.write(im,"JPEG",new File("glpattern.jpeg"));
} catch (IOException e)
{
e.printStackTrace();
}