Get access to OpenCV GpuMat channels - C++

I am working on a ray tracer. I want to optimize my code by saving the data for each pixel in an OpenCV Mat, using the GPU.
For now, I save pixel values in the buffer fb, whose elements are vec3 values (RGB):
__global__ void render(vec3 *fb, int max_x, int max_y, Camera **cam, Triangle *data, size_t n)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i >= max_x) || (j >= max_y)) return;
int pixel_index = j * max_x + i;
float u = float(i) / float(max_x);
float v = float(j) / float(max_y);
Ray r = (*cam)->get_ray(u,v);
fb[pixel_index] = color(r, data,n);
}
and then I save the data into the Mat on the CPU:
for (int j = ny - 1; j >= 0; j--)
{
for (int i = 0; i < nx; i++)
{
size_t pixel_index = j * nx + i;
int ir = int(255.99*fb[pixel_index].r());
int ig = int(255.99*fb[pixel_index].g());
int ib = int(255.99*fb[pixel_index].b());
output.at<Vec3b>(j, i)[0] = (uchar)ib;
output.at<Vec3b>(j, i)[1] = (uchar)ig;
output.at<Vec3b>(j, i)[2] = (uchar)ir;
//std::cout << ir << " " << ig << " " << ib << "\n";
}
}
but it is a very slow process when I have a large pixel array. That is why I want to use an OpenCV GpuMat and save the data directly on the GPU.
The problem is that I can't really find an example of how to save data in each channel of a GpuMat. Is there an easy way to do it, similar to saving data on the CPU?

See the documentation. There it says there are
no functions that return references to their data (because references on GPU are not valid for CPU)
The only way to access the data is through the data pointer, but that pointer can only be dereferenced in (CUDA) kernel code, and there is no at() function as far as I can see. So you will have to calculate the offset yourself from data and step.
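A hedged sketch of what that offset arithmetic can look like (my own illustration, assuming a CV_8UC3 GpuMat; the kernel and host code below are not from the OpenCV documentation):
__global__ void fillBlue(unsigned char *data, size_t step, int rows, int cols)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;   // column
    int y = blockIdx.y * blockDim.y + threadIdx.y;   // row
    if (x >= cols || y >= rows) return;

    unsigned char *pixel = data + y * step + 3 * x;  // start of the BGR triple
    pixel[0] = 255;  // B
    pixel[1] = 0;    // G
    pixel[2] = 0;    // R
}

// Host side (assumed usage):
//   cv::cuda::GpuMat img(ny, nx, CV_8UC3);
//   dim3 block(16, 16), grid((nx + 15) / 16, (ny + 15) / 16);
//   fillBlue<<<grid, block>>>(img.data, img.step, img.rows, img.cols);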

Thank you for your answers. They made me think about how to do it another way. I am not sure it is the best solution, but it works and, in my opinion, is quite an easy way to fill the matrix on the GPU.
Reserve memory on the GPU for the matrix:
Mat output(ny, nx, CV_8UC3);
const size_t numBytes = output.step * output.rows;
unsigned char *d_output;
cudaMalloc<unsigned char>(&d_output, numBytes);
Fill the matrix on the GPU:
__global__ void render(vec3 *fb, int max_x, int max_y, Camera **cam, Triangle *data, size_t n, unsigned char* input, int step)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i >= max_x) || (j >= max_y)) return;
int pixel_index = j * max_x + i;
int index = j * step + 3 * i;
float u = float(i) / float(max_x);
float v = float(j) / float(max_y);
Ray r = (*cam)->get_ray(u,v);
fb[pixel_index] = color(r, data,n);
int ir = int(255.99*fb[pixel_index].r());
int ig = int(255.99*fb[pixel_index].g());
int ib = int(255.99*fb[pixel_index].b());
input[index] = ib;
input[index+1] = ig;
input[index+2] = ir;
}
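Copy the matrix back to the CPU (a hedged sketch of this remaining step; blocks, threads, fb, d_cam, d_data and n are placeholder names for the launch configuration and device objects used earlier in the post):
render<<<blocks, threads>>>(fb, nx, ny, d_cam, d_data, n, d_output, (int)output.step);
cudaDeviceSynchronize();
cudaMemcpy(output.ptr(), d_output, numBytes, cudaMemcpyDeviceToHost);
// Alternatively, wrap the device buffer in a GpuMat header without copying:
// cv::cuda::GpuMat gpuOutput(ny, nx, CV_8UC3, d_output, output.step);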
I would be grateful for any advice and comments on this code.

Related

How to transpose a huge arbitrary matrix in cuda using shared memory?

I have a task to transpose a matrix in CUDA using shared memory with no bank conflicts. The limits are: width*height <= 10^8. The key test sizes are 1 x 10^8, 10^4 x 10^4, and 10^8 x 1.
I have tried the solution provided here, Matrix Transpose (with shared Memory) with arbitary size on Cuda C, but it did not help me, because my matrix size is too large and falls outside the CUDA dimension limits (65536 blocks and 32 threads per block).
I tried to create a loop that lets me work with a huge matrix:
const int BLOCK_DIM = 32;
__global__ void transposeMatrixFast(double* inputMatrix, double* outputMatrix, int width, int height)
{
__shared__ double temp[BLOCK_DIM][BLOCK_DIM+1];
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
int offsetx = gridDim.x * blockDim.x;
int offsety = gridDim.y * blockDim.y;
for (int y = yIndex; y < height; y += offsety)
{
for (int x = xIndex; x < width; x += offsetx)
{
if ((xIndex < width) && (yIndex < height))
{
int idx = y * width + x;
temp[threadIdx.y][threadIdx.x] = inputMatrix[idx];
}
__syncthreads();
if ((x < width) && (y < height))
{
int idx = x * height + y;
outputMatrix[idx] = temp[threadIdx.y][threadIdx.x];
}
}
}
}
Now I am getting a "time limit exceeded" error on the testing server. The reason is that I can't take advantage of shared memory in this line:
outputMatrix[idx] = temp[threadIdx.x][threadIdx.y];
and my kernel slows down. I think there is another way to organise my loop, but I don't know how.
I found another way to organise my loop, and now I can transpose matrices of ANY size:
const int BLOCK_SIZE = 32;
__global__ void matrixTransposeSolveBankConflicts(const double *d_a, double *d_b, const unsigned long rows, const unsigned long cols) {
__shared__ double mat[BLOCK_SIZE][BLOCK_SIZE + 1];
unsigned long bh = ceil((double)rows / BLOCK_SIZE);
unsigned long bw = ceil((double)cols / BLOCK_SIZE);
for (unsigned long blocky = blockIdx.y; blocky < bh; blocky += gridDim.y) {
for (unsigned long blockx = blockIdx.x; blockx < bw; blockx += gridDim.x) {
unsigned long bx = blockx * BLOCK_SIZE;
unsigned long by = blocky * BLOCK_SIZE;
unsigned long i = by + threadIdx.y;
unsigned long j = bx + threadIdx.x;
if (i < rows && j < cols)
{
mat[threadIdx.x][threadIdx.y] = d_a[i*cols + j];
}
__syncthreads();
unsigned long ti = bx + threadIdx.y;
unsigned long tj = by + threadIdx.x;
if (tj < rows && ti < cols)
{
d_b[ti*rows + tj] = mat[threadIdx.y][threadIdx.x];
}
__syncthreads();
}
}
}
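A hedged launch sketch (my own addition, not part of the original answer): because the kernel strides over block indices, the grid can be capped at a modest size and the loops cover the remaining tiles, so grid-dimension limits no longer constrain the matrix size.
#include <algorithm>

// Assumed host-side launcher: cap the grid and let the block-stride loops
// inside the kernel cover the rest of the matrix.
void transpose(const double *d_a, double *d_b, unsigned long rows, unsigned long cols)
{
    unsigned long bw = (cols + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned long bh = (rows + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((unsigned)std::min(bw, 256ul), (unsigned)std::min(bh, 256ul));
    matrixTransposeSolveBankConflicts<<<grid, block>>>(d_a, d_b, rows, cols);
    cudaDeviceSynchronize();
}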

Why is CUDA shared memory slower than global memory in tiled matrix multiplication?

I have tiled matrix multiplication code with and without shared memory. Below is matrix multiplication using global memory:
__global__
void MatrixMulKernel(float* M, float* N, float* P, int Width)
{
int Row = blockIdx.y*blockDim.y + threadIdx.y;
int Col = blockIdx.x*blockDim.x + threadIdx.x;
if ((Row < Width) && (Col < Width)) {
float Pvalue = 0;
for (int k = 0; k < Width; ++k)
{
Pvalue += M[Row*Width + k] * N[k*Width + Col];
}
P[Row*Width + Col] = Pvalue;
}
}
Below is matrix multiplication using shared memory:
__global__
void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
{
__shared__ float Mds[blockWidth][blockWidth];
__shared__ float Nds[blockWidth][blockWidth];
int tx = threadIdx.x; int ty = threadIdx.y;
int bx = blockIdx.x; int by = blockIdx.y;
int row = by * blockWidth + ty;
int col = bx * blockWidth + tx;
float pvalue = 0;
for (int m = 0; m < Width / blockWidth; ++m)
{
Mds[ty][tx] = d_M[row * Width + m*blockWidth + tx];
Nds[ty][tx] = d_N[(m*blockWidth + ty)*Width + col];
__syncthreads();
for (int k = 0; k < blockWidth; ++k)
{
pvalue += Mds[ty][k]*Nds[k][tx];
}
__syncthreads();
}
d_P[row*Width + col] = pvalue;
}
As far as I know, using shared memory should be faster, but comparing these two kernels I found that the code without shared memory runs about 2 seconds faster for 1600x1600 matrices. Is there any explanation for this speed difference, or is something going wrong with my code?
My teacher uses the book "Programming Massively Parallel Processors" as the main text resource; these two kernels come from it.
EDIT:
Configuration for Kernel:
int NumBlocks =ceil( Width / blockWidth); // blockWidth = 16
dim3 dimGrid(NumBlocks, NumBlocks,1); // Width = 1600
dim3 dimBlock(blockWidth, blockWidth,1);
clock_t startGpuCalculation = clock();
MatrixMulKernel <<<dimGrid, dimBlock >>>(d_M, d_N, d_P, Width);
cudaThreadSynchronize();
clock_t endGpuCalculation = clock();
I was running the project in Debug mode (VS 2017 & CUDA 9). When I run the code in Release mode, shared memory is much faster than global memory. My bad.
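As an aside, to time just the kernel rather than host-side clock() calls, CUDA events are a common alternative; a minimal sketch using the same names as the configuration above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);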

Equivalent of curand for OpenCL

I am looking at switching from Nvidia to AMD for my compute card because I want double precision support. Before doing this, I decided to learn OpenCL on my Nvidia card to see if I like it. I want to convert the following code from CUDA to OpenCL. I am using the curand library to generate uniformly and normally distributed random numbers. Each thread needs to be able to create a different sequence of random numbers and generate a few million per thread. Here is the code. How would I go about this in OpenCL? Everything I have read online seems to imply that I should generate a buffer of random numbers and then use that on the GPU, but this is not practical for me.
template<int NArgs, typename OptimizationFunctor>
__global__
void statistical_solver_kernel(float* args_lbounds,
float* args_ubounds,
int trials,
int initial_temp,
unsigned long long seed,
float* results,
OptimizationFunctor f)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= trials)
return;
curandState rand;
curand_init(seed, idx, 0, &rand);
float x[NArgs];
for(int i = 0; i < NArgs; i++)
{
x[i] = curand_uniform(&rand) * (args_ubounds[i]- args_lbounds[i]) + args_lbounds[i];
}
float y = f(x);
for(int t = initial_temp - 1; t > 0; t--)
{
float t_percent = (float)t / initial_temp;
float x_prime[NArgs];
for(int i = 0; i < NArgs; i++)
{
x_prime[i] = curand_normal(&rand) * (args_ubounds[i] - args_lbounds[i]) * t_percent + x[i];
x_prime[i] = fmaxf(args_lbounds[i], x_prime[i]);
x_prime[i] = fminf(args_ubounds[i], x_prime[i]);
}
float y_prime = f(x_prime);
if(y_prime < y || (y_prime - y) / y_prime < t_percent)
{
y = y_prime;
for(int i = 0; i < NArgs; i++)
{
x[i] = x_prime[i];
}
}
}
float* rptr = results + idx * (NArgs + 1);
rptr[0] = y;
for(int i = 1; i <= NArgs; i++)
rptr[i] = x[i - 1];
}
The VexCL library provides an implementation of counter-based generators. You can use those inside larger expressions; see this slide for an example.
EDIT: Take this with a grain of salt, as I am the author of VexCL :).
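For reference, a minimal sketch of the kind of expression meant here, assuming a recent VexCL version (the generator, seed and vector size are illustrative choices, not a definitive recipe):
#include <cstdlib>
#include <vexcl/vexcl.hpp>

int main() {
    vex::Context ctx(vex::Filter::DoublePrecision);
    const size_t n = 1 << 20;
    vex::vector<double> x(ctx, n);

    // Counter-based generator: stateless, so every element gets an independent
    // value without pre-generating a host-side buffer of random numbers.
    vex::Random<double, vex::random::threefry> rnd;
    x = rnd(vex::element_index(), std::rand());  // uniform in [0, 1)
}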

CUDA: Is there a way to force each line to complete before moving on?

I am new to parallel programming and I appreciate your help in understanding how it works. This is a contrived example where I want the result of an operation to be 50 in every cell of the matrix.
The result depends on a value in an array at [index+1]. This doesn't work so well in parallel programming, because values aren't computed in order, and I get incorrect results every few cells. The band-aid I have is splitting the function into multiple kernels, but I think there should be a better solution, though I'm not exactly sure what to search for. Thank you.
CUDA code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#define TILE_WIDTH 16
using namespace std;
__global__ void cuda_arithmetic(int height, int width, float *B, float *C, float *initial_array, float *result_array){
int w = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int h = blockIdx.y * blockDim.y + threadIdx.y; // Row // height
int index = h * width + w;
if ((w < width) && h < (height)) //initial=20, B=2, C=10, result = 17;
initial_array[index] = powf(C[index],2);
if ((w < (width-1)) && h < (height))
result_array[index] = initial_array[index+1] / B[index];
}
__global__ void cuda_arithmetic_step_1(int height, int width, float *B, float *C, float *initial_array, float *result_array){
int w = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int h = blockIdx.y * blockDim.y + threadIdx.y; // Row // height
int index = h * width + w;
if ((w < width) && h < (height))
initial_array[index] = powf(C[index],2);
}
__global__ void cuda_arithmetic_step_2(int height, int width, float *B, float *C, float *initial_array, float *result_array){
int w = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int h = blockIdx.y * blockDim.y + threadIdx.y; // Row // height
int index = h * width + w;
if ((w < (width-1)) && h < (height))
result_array[index] = initial_array[index+1] / B[index];
}
int main(){
int height = 800;
int width = 8192;
float *A = new float[height * width];
float *B = new float[height * width];
float *C = new float[height * width];
float *result = new float[height * width];
for (int i = 0; i < height; i++){
for (int j = 0; j < width; j++){
A[i*width+j] = 20;
B[i*width+j] = 2;
C[i*width+j] = 10;
result[i*width+j] = 17;
}
}
float *gpu_A;
float *gpu_B;
float *gpu_C;
float *gpu_result;
cudaMalloc((void **)&gpu_A, (height * width * sizeof(float)));
cudaMalloc((void **)&gpu_B, (height * width * sizeof(float)));
cudaMalloc((void **)&gpu_C, (height * width * sizeof(float)));
cudaMalloc((void **)&gpu_result, (height * width * sizeof(float)));
cudaMemcpy(gpu_A, A, (height * width * sizeof(float)), cudaMemcpyHostToDevice);
cudaMemcpy(gpu_B, B, (height * width * sizeof(float)), cudaMemcpyHostToDevice);
cudaMemcpy(gpu_C, C, (height * width * sizeof(float)), cudaMemcpyHostToDevice);
cudaMemcpy(gpu_result, result, (height * width * sizeof(float)), cudaMemcpyHostToDevice);
dim3 dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
// CODE OPTION
// incorrect result
cuda_arithmetic<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
// correct result
//cuda_arithmetic_step_1<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
//cuda_arithmetic_step_2<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
cudaMemcpy(result, gpu_result, (height * width * sizeof(float)), cudaMemcpyDeviceToHost);
for (int i = 0; i < height; i++){
for (int j = 0; j < (width-1); j++){
if (abs((result[i*(width-1)+j] - 50)) > 0.001){
cout << "error: ";
cout << i << " * " << width-1 << " + " << j << ": " << result[i*(width-1)+j] << endl;
system("pause");
}
}
cout << endl;
}
cout << endl;
cudaFree(gpu_A);
cudaFree(gpu_B);
cudaFree(gpu_C);
cudaFree(gpu_result);
delete[] A;
delete[] B;
delete[] C;
delete[] result;
system("pause");
}
Since your example is contrived, my answer will be somewhat general.
In general, you're dealing with the problem of global synchronization.
As you've discovered, the only clean global synchronization point is the kernel launch, so breaking your code into pieces before and after the necessary synchronization point will insert a global synchronization, due to the kernel launch(es).
Another approach is to consider whether the necessary synchronization can be localized. If so, you can look at arranging your algorithm/data so that the necessary synchronization can be handled within a threadblock (where shared memory and __syncthreads() give us built-in coordination/synchronization capability.) This may have some challenges around data boundaries (e.g. inter-threadblock boundaries). One approach to handling the border data is to have adjacent threadblocks perform redundant calculations in the border region, so that each threadblock is guaranteed to produce all necessary intermediate results before any final results are computed. In this case, you can safely separate the computation of intermediate results with final results using __syncthreads(), which is an intra-threadblock barrier.
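To make the threadblock-local idea concrete for this contrived kernel, here is a hedged sketch (my own illustration, not part of the original answer): each block redundantly computes one extra halo column of the intermediate result into shared memory, so the dependent division only needs an intra-block __syncthreads(). For brevity it only produces result_array, and it assumes the kernel is launched with blockDim = (TILE, TILE), matching the TILE_WIDTH configuration in the question.
#define TILE 16

__global__ void cuda_arithmetic_tiled(int height, int width, const float *B,
                                      const float *C, float *result_array)
{
    __shared__ float tile[TILE][TILE + 1];    // extra column holds the halo value

    int w = blockIdx.x * TILE + threadIdx.x;  // column
    int h = blockIdx.y * TILE + threadIdx.y;  // row

    // Each thread computes its own intermediate value into shared memory...
    if (w < width && h < height)
        tile[threadIdx.y][threadIdx.x] = powf(C[h * width + w], 2);

    // ...and the last column of the block redundantly computes the value that
    // the neighbouring block will also compute for itself.
    if (threadIdx.x == TILE - 1 && w + 1 < width && h < height)
        tile[threadIdx.y][TILE] = powf(C[h * width + w + 1], 2);

    __syncthreads();  // intra-block barrier: all needed intermediates are ready

    if (w < width - 1 && h < height)
        result_array[h * width + w] = tile[threadIdx.y][threadIdx.x + 1] / B[h * width + w];
}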
In some cases, you can reduce the dependency to a single thread. For example, in your code, you can make a single thread perform the necessary computation:
initial_array[index+1] = powf(C[index+1],2);
and the dependent result computation:
result_array[index] = initial_array[index+1] / B[index];
Since the dependent calculation is guaranteed to be performed after the necessary intermediate results are computed, there is no other synchronization needed. Probably your actual code may not lend itself to such a trivial rewrite.
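To make that concrete, here is a sketch of such a rewrite of the kernel in the question (the kernel name is mine, and I use a local temporary instead of storing into initial_array[index+1], which keeps initial_array filled exactly as before and sidesteps the index+1 range issue noted in the aside below):
__global__ void cuda_arithmetic_fused(int height, int width, float *B, float *C,
                                      float *initial_array, float *result_array)
{
    int w = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int h = blockIdx.y * blockDim.y + threadIdx.y;  // row
    int index = h * width + w;

    // Each thread still produces its own intermediate value.
    if ((w < width) && (h < height))
        initial_array[index] = powf(C[index], 2);

    // For the dependent result, recompute the neighbour's intermediate value
    // locally instead of reading what another thread may not have written yet.
    if ((w < width - 1) && (h < height)) {
        float neighbour = powf(C[index + 1], 2);
        result_array[index] = neighbour / B[index];
    }
}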
As an aside, note that your use of index+1 will go out of range for the last threadblock in the kernel (w = width -1, h = height-1). Also, I don't think this indexing is what you intended:
if (abs((result[i*(width-1)+j] - 50)) > 0.001){
I think you probably meant this:
if (abs((result[i*(width)+j] - 50)) > 0.001){
With those changes, your cuda_arithmetic kernel runs correctly for me (even though it has a slight out-of-bounds issue.)
Use __syncthreads(). All the code before __syncthreads() will be executed by every thread in the block before any code after it is executed (note that it is a barrier for a single thread block, not for the whole grid).
Also keep in mind that it's a good idea to separate read and write operations to avoid conflicts (when multiple threads read from and write to the same addresses). For example,
array[i] = array2[i];
should be reworked as
temp = array2[i];
__syncthreads();
array[i] = temp;
__syncthreads();

3D Elementwise Matrix Multiplication in CUDA?

I have a 2D Matrix Multiplication program using the following kernel:
__global__ void multKernel(int *a, int *b, int *c, int N)
{
int column = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
int index = row * N + column;
if(column < N && row < N)
{
c[index] = a[index] * b[index];
}
}
Now, I'd like to create a 3D matrix multiplication kernel, but I'm having trouble finding examples of how to create one (also, I'm terrible at reading mathematical formulae, which is something I need to improve on).
I know the GPU example will involve using threadIdx.z and so on, but I'm a bit lost as to how to do it.
Could anyone point me in the right direction to either some formulae or sample code? Or even provide a basic example? I have a CPU example which should work, I think.
void matrixMult3D(int *a, int *b, int *c, int *z, int N)
{
int index;
// element-wise product over an N x N x N volume
for(int column = 0; column < N; column++)
{
for(int row = 0; row < N; row++)
{
for (int depth = 0; depth < N; depth++)
{
index = (depth * N + row) * N + column;
c[index] = a[index] * b[index] * z[index];
}
}
}
}
Am I at least on the right track?
Because what you are actually doing is just an element-wise product (I hesitate to call it a Hadamard Product because that isn't defined for hyper matrices AFAIK), you don't need to do anything differently from the simplest 1D version of your kernel code. Something like this:
template<int ndim>
__global__ void multKernel(int *a, int *b, int *c, int *z, int N)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
int idxmax = 1;
#pragma unroll
for(int i=0; i < ndim; i++) {
idxmax *= N;
}
for(; idx < idxmax; idx+=stride) {
c[idx] = a[idx] * b[idx] * z[idx];
}
}
[disclaimer: code written in browser, never compiled or run. use at own risk]
would work for an array of any dimensionality: N elements (ndim=1), N*N (ndim=2), N*N*N (ndim=3), and so on.
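For example, launching the 3D instantiation might look like this (a hedged usage sketch; d_a, d_b, d_c and d_z are assumed device arrays of N*N*N ints):
const int threads = 256;
const int blocks = 1024;  // any reasonable cap works; the stride loop covers the rest
multKernel<3><<<blocks, threads>>>(d_a, d_b, d_c, d_z, N);
cudaDeviceSynchronize();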