CUDA multiple image erosion not working - C++

I'm trying to implement erosion of multiple black (0) and white (255) images with CUDA, using a square (5x5) structuring element. The kernel I implemented takes an unsigned char buffer in which nImg images of 200x200 px are stored. To allow erosion of multiple images simultaneously, I build a grid with a 3D structure:
each block has the dimensions of the strel (5x5)
the grid has height = image_height/blockDim.y, width = image_width/blockDim.x, z = nImg
I tried to implement it by extending that sample.
The problem is that if I store the pixels that a block of threads considers into a buffer shared between the threads of the block (to allow fast memory access), the algorithm doesn't work properly. I tried to change the bindex, which I think is where the mistake is, but I cannot find a solution.
Any suggestions?
Here's my code:
//strel size
#define STREL_W 5
#define STREL_H 5
// distance from the center of the strel to its border (radius)
#define R (STREL_H/2)
//size of the 2D region that each block considers, i.e. all the neighbours that each thread in a block considers
#define BLOCK_W (STREL_W+(2*R))
#define BLOCK_H (STREL_H+(2*R))

__global__ void erode_multiple_img_SM(unsigned char * buffer_in,
                                      unsigned char * buffer_out,
                                      int w,
                                      int h){
    //array stored in shared memory, containing all the pixel neighbours that each thread in a block considers
    __shared__ unsigned char fast_acc_arr[BLOCK_W*BLOCK_H];

    // map thread into a 3D structure
    int col   = blockIdx.x * STREL_W + threadIdx.x - R;
    int row   = blockIdx.y * STREL_H + threadIdx.y - R;
    int plane = blockIdx.z * blockDim.z + threadIdx.z;

    // check whether a foreground px of the strel is not contained in the image region of strel size (if even one px is not contained, the pixel is eroded)
    bool is_contain = true;

    // clamp to edge of image
    col = max(0, col);
    col = min(col, w-1);
    row = max(0, row);
    row = min(row, h-1);

    //map each thread to a 1D coordinate, to map the 3D structure (grid) onto the image buffer (1D)
    unsigned int index  = (plane * h * w) + (row * w) + col;
    unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;

    //each thread copies its pixel of the block to shared memory (shared among the threads of a block)
    fast_acc_arr[bindex] = buffer_in[index];
    __syncthreads();

    //the strel must be contained in the image; threadIdx.x and threadIdx.y are the coords of the center of the mask that corresponds to the strel in the image, and that center must be contained in the image
    if((threadIdx.x >= R) && (threadIdx.x < BLOCK_W-R) && (threadIdx.y >= R) && (threadIdx.y < BLOCK_H-R)){
        for(int dy = -R; dy <= R; dy++){
            if(is_contain == false)
                break;
            for(int dx = -R; dx <= R; dx++){
                //if even one element under the mask differs from the strel value, the strel is not contained in the mask, so the center of the mask is eroded (and there is no need to look at the other elements of the mask: this is the reason for the break)
                if(fast_acc_arr[bindex + (dy * blockDim.x) + dx] != 255){
                    buffer_out[index] = 0;
                    is_contain = false;
                    break;
                }
            }
        }
        // if the strel is contained in the image then the center is not eroded
        if(is_contain == true)
            buffer_out[index] = 255;
    }
}
These are my kernel settings:
dim3 block(5,5,1);
dim3 grid(200/(block.x),200/(block.y),nImg);
My kernel call:
erode_multiple_img_SM<<<grid,block>>>(dimage_src,dimage_dst,200,200);
My image input and output (output shown for 150 buffer elements):
Code without shared memory (slower):
__global__ void erode_multiple_img(unsigned char * buffer_in,
                                   unsigned char * buffer_out,
                                   int w, int h){
    int col   = blockIdx.x * blockDim.x + threadIdx.x;
    int row   = blockIdx.y * blockDim.y + threadIdx.y;
    int plane = blockIdx.z * blockDim.z + threadIdx.z;
    bool is_contain = true;

    col = max(0, col);
    col = min(col, w-1);
    row = max(0, row);
    row = min(row, h-1);

    for(int dy = -STREL_H/2; dy <= STREL_H/2; dy++){
        if(is_contain == false)
            break;
        for(int dx = -STREL_W/2; dx <= STREL_W/2; dx++){
            if(buffer_in[(plane * h * w) + (row + dy) * w + (col + dx)] != 255){
                buffer_out[(plane * h * w) + row * w + col] = 0;
                is_contain = false;
                break;
            }
        }
    }
    if(is_contain == true)
        buffer_out[(plane * h * w) + row * w + col] = 255;
}
UPDATED ALGORITHM
I tried to follow those samples for doing convolution. I changed the input image, which now has size 512x512, and I wrote this algorithm:
#define STREL_SIZE 5
#define TILE_W 16
#define TILE_H 16
#define R (STREL_H/2)
#define BLOCK_W (TILE_W+(2*R))
#define BLOCK_H (TILE_H+(2*R))
__global__ void erode_multiple_img_SM_v2(unsigned char * buffer_in,
                                         unsigned char * buffer_out,
                                         int w, int h){
    // Data cache: threadIdx.x , threadIdx.y
    __shared__ unsigned char data[TILE_W + STREL_SIZE][TILE_W + STREL_SIZE];

    // global mem address of this thread
    int col   = blockIdx.x * blockDim.x + threadIdx.x;
    int row   = blockIdx.y * blockDim.y + threadIdx.y;
    int plane = blockIdx.z * blockDim.z + threadIdx.z;
    int gLoc  = (plane*h/w) + row*w + col;
    bool is_contain = true;

    // load cache (32x32 shared memory, 16x16 thread blocks)
    // each thread loads four values from global memory into shared mem
    int x, y;   // image based coordinate
    if((col < w) && (row < h)){
        data[threadIdx.x][threadIdx.y] = buffer_in[gLoc];
        if(threadIdx.y > (h - STREL_SIZE))
            data[threadIdx.x][threadIdx.y + STREL_SIZE] = buffer_in[gLoc + STREL_SIZE];
        if(threadIdx.x > (w - STREL_SIZE))
            data[threadIdx.x + STREL_SIZE][threadIdx.y] = buffer_in[gLoc + STREL_SIZE];
        if((threadIdx.x > (w - STREL_SIZE)) && (threadIdx.y > (h - STREL_SIZE)))
            data[threadIdx.x + STREL_SIZE][threadIdx.y + STREL_SIZE] = buffer_in[gLoc + 2*STREL_SIZE];

        //wait for all threads to finish reading
        __syncthreads();

        //buffer_out[gLoc] = data[threadIdx.x][threadIdx.y];
        unsigned char min_value = 255;
        for(x = 0; x < STREL_SIZE; x++){
            for(y = 0; y < STREL_SIZE; y++){
                min_value = min(data[threadIdx.x + x][threadIdx.y + y], min_value);
            }
        }
        buffer_out[gLoc] = min_value;
    }
}
My kernel settings now are:
dim3 block(16,16);
dim3 grid(512/(block.x),512/(block.y),nImg);
Input and output images: it seems that the pixels of the apron are not copied into the output buffer.

You may want to read the following links for a more detailed description and better example code on how to implement an image convolution CUDA kernel:
http://igm.univ-mlv.fr/~biri/Enseignement/MII2/Donnees/convolutionSeparable.pdf
https://www.evl.uic.edu/sjames/cs525/final.html
Basically, using a convolution filter of size (5 x 5) does not mean setting the size of the thread block to (5 x 5).
Typically, for a non-separable convolution, you could use a thread block of the size (16 x 16), to calculate a block of (16 x 16) pixels on the output image. To achieve this you need to read a block of ((2+16+2) x (2+16+2)) pixels from the input image to the shared memory, using the (16 x 16) threads collaboratively.
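To make that concrete, here is a minimal sketch of such a kernel (my own illustration, not the poster's code; the names erode_tiled, SM_W and SM_H are mine, and R = 2 is the radius of the 5 x 5 strel). Each (16 x 16) block cooperatively loads a (20 x 20) tile, apron included, into shared memory and then computes the erosion for its (16 x 16) output pixels:
__global__ void erode_tiled(const unsigned char *in, unsigned char *out,
                            int w, int h)
{
    // tile of (TILE_H + 2*R) x (TILE_W + 2*R) input pixels, TILE_W = TILE_H = 16, R = 2
    const int SM_W = TILE_W + 2*R;
    const int SM_H = TILE_H + 2*R;
    __shared__ unsigned char tile[(TILE_H + 2*R)][(TILE_W + 2*R)];

    int plane = blockIdx.z;                               // one image per z-slice
    const unsigned char *img = in  + plane * w * h;
    unsigned char       *res = out + plane * w * h;

    // top-left corner of the output tile this block produces
    int tileX = blockIdx.x * TILE_W;
    int tileY = blockIdx.y * TILE_H;

    // cooperative load: 16x16 threads fill the 20x20 tile in a strided manner
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < SM_W * SM_H;
         i += blockDim.x * blockDim.y)
    {
        int ty = i / SM_W, tx = i % SM_W;
        int gx = min(max(tileX + tx - R, 0), w - 1);      // clamp apron to the border
        int gy = min(max(tileY + ty - R, 0), h - 1);
        tile[ty][tx] = img[gy * w + gx];
    }
    __syncthreads();

    int col = tileX + threadIdx.x;
    int row = tileY + threadIdx.y;
    if (col >= w || row >= h) return;

    // erosion = minimum over the 5x5 neighbourhood
    unsigned char v = 255;
    for (int dy = -R; dy <= R; dy++)
        for (int dx = -R; dx <= R; dx++)
            v = min(v, tile[threadIdx.y + R + dy][threadIdx.x + R + dx]);

    res[row * w + col] = v;
}
A launch along the lines of dim3 block(16,16); dim3 grid((w+15)/16, (h+15)/16, nImg); would then produce one eroded 2D slice per z-layer of the grid.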

Related

Image subtraction with CUDA and textures

My goal is to use C++ with CUDA to subtract a dark frame from a raw image. I want to use textures for acceleration. The input of the images is cv::Mat with the type CV_8UC4 (I use the pointer to the data of the cv::Mat). This is the kernel I came up with, but I have no idea how to eventually subtract the textures from each other:
__global__ void DarkFrameSubtractionKernel(unsigned char* outputImage, size_t pitchOutputImage,
    cudaTextureObject_t inputImage, cudaTextureObject_t darkImage, int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    const float tx = (x + 0.5f);
    const float ty = (y + 0.5f);

    if (x >= width || y >= height) return;

    uchar4 inputImageTemp = tex2D<uchar4>(inputImage, tx, ty);
    uchar4 darkImageTemp = tex2D<uchar4>(darkImage, tx, ty);
    outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp; // this line will throw an error
}
This is the function that calls the kernel (you can see that I create the textures from unsigned char):
void subtractDarkImage(unsigned char* inputImage, size_t pitchInputImage, unsigned char* outputImage,
    size_t pitchOutputImage, unsigned char* darkImage, size_t pitchDarkImage, int width, int height,
    cudaStream_t stream)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.width = width;
    resDesc.res.pitch2D.height = height;
    resDesc.res.pitch2D.devPtr = inputImage;
    resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType;
    texDesc.addressMode[0] = cudaAddressModeBorder;
    texDesc.addressMode[1] = cudaAddressModeBorder;

    cudaTextureObject_t imageInputTex, imageDarkTex;
    CUDA_CHECK(cudaCreateTextureObject(&imageInputTex, &resDesc, &texDesc, 0));

    resDesc.res.pitch2D.devPtr = darkImage;
    resDesc.res.pitch2D.pitchInBytes = pitchDarkImage;
    CUDA_CHECK(cudaCreateTextureObject(&imageDarkTex, &resDesc, &texDesc, 0));

    dim3 block(32, 8);
    dim3 grid = paddedGrid(block.x, block.y, width, height);

    DarkImageSubtractionKernel << <grid, block, 0, stream >> > (reinterpret_cast<uchar4*>(outputImage), pitchOutputImage / sizeof(uchar4),
        imageInputTex, imageDarkTex, width, height);

    CUDA_CHECK(cudaDestroyTextureObject(imageInputTex));
    CUDA_CHECK(cudaDestroyTextureObject(imageDarkTex));
}
The code does not compile as I cannot subtract a uchar4 from another one (in the kernel). Is there an easy way of subtraction here?
Help is very much appreciated.
Is there an easy way of subtraction here?
There are no arithmetic operators defined for CUDA built-in vector types. If you replace
outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp;
with
uchar4 val;
val.x = inputImageTemp.x - darkImageTemp.x;
val.y = inputImageTemp.y - darkImageTemp.y;
val.z = inputImageTemp.z - darkImageTemp.z;
val.w = inputImageTemp.w - darkImageTemp.w;
outputImage[y * pitchOutputImage + x] = val;
things will work. If this offends you, I suggest writing a small library of helper functions to hide the mess.
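For example, helpers along these lines (my own sketch; CUDA does not ship such operators, and subSat is a name I made up) keep the per-component arithmetic out of the kernel body:
// Hypothetical helpers, not part of CUDA's headers: component-wise uchar4 subtraction.
__device__ __forceinline__ uchar4 operator-(uchar4 a, uchar4 b)
{
    return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);   // wraps on underflow
}

// Saturating variant that clamps at 0 instead of wrapping, which is often what
// you actually want when subtracting a dark frame.
__device__ __forceinline__ uchar4 subSat(uchar4 a, uchar4 b)
{
    return make_uchar4(max(a.x - b.x, 0), max(a.y - b.y, 0),
                       max(a.z - b.z, 0), max(a.w - b.w, 0));
}
With the operator defined, the original inputImageTemp - darkImageTemp line compiles as written.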

Pixel data unpacking to smaller sections

I'm trying to write a function that unpacks an image into separate quads. But for some reason the results are distorted (kind of stretched by 45 degrees), so I must be reading the pixel array incorrectly, though I can't see the problem with my function...
The function takes 2 unsigned char arrays, "source" and "target", and two unsigned int values, the "width" and "height" of the source image. Width is divisible by 4, and height is divisible by 3 (both give the same value, because the texture is 600 * 450), so each face is 150*150 px. So the w/h values are correct. It also takes 2 ints, "xIt" and "yIt", which determine the offset, i.e. which 150*150 block should be read.
Here's the function:
const unsigned int trgImgWidth = width / 4;
const unsigned int trgImgHeight = height / 3;
unsigned int trgBufferOffset = 0;

// Compute pixel offset to start reading from
unsigned int Yoffset = yIt * trgImgHeight * width * 3;
unsigned int Xoffset = xIt * trgImgWidth * 3;

for (unsigned int y = 0; y < trgImgHeight; y++)
{
    unsigned int o = Yoffset + Xoffset; // Offset of current line of pixels
    for (unsigned int x = 0; x < trgImgWidth * 3; x++) // for each pixel component (rgb) in the line
    {
        target[trgBufferOffset] = source[o + x];
        trgBufferOffset++;
    }
    Yoffset += width * 3;
}
Anyone see where I might be going wrong here?

Why doesn't my OpenCL 3d image lookup work?

I have been having trouble with an OpenCL kernel which I've written producing incorrect results (compared to a reference brute-force CPU implementation).
I tracked the problem down to a 3D lookup table I'm using which seems to be returning garbage results, rather than the values which I passed in.
I have the following (simplified) OpenCL kernel for reading a precomputed function from a 3D image type:
__constant sampler_t legSampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

inline float normalizedLegendre(int n, int m, float z, image3d_t legendreLUT)
{
    float nCoord = (((float) n) / get_image_width(legendreLUT));
    float mCoord = (((float) m) / get_image_height(legendreLUT));
    float zCoord = ((z + 1.0f) / 2.0f);

    float4 coord = (float4)(floor(nCoord) + 0.5f, floor(mCoord) + 0.5f, zCoord, 0.0f);
    return read_imagef(legendreLUT, legSampler, coord).x;
}

__kernel void noiseMain(__read_only image3d_t legendreLUT, __global float* outLegDump)
{
    //k is the linear index into the array.
    int k = get_global_id(0);

    if(k < get_image_depth(legendreLUT))
    {
        float z = ((float) k / (float) get_image_depth(legendreLUT)) * 2.0 - 1.0;
        float legLookup = normalizedLegendre(5, 4, z, legendreLUT);
        float texCoord = ((float) k / 1024.0) * 2 - 1;
        outLegDump[k] = legLookup;
    }
}
On the host side, I generate the 3D image, legendreLUT, using the following code:
static const size_t NLEGPOLYBINS = 1024;
static const size_t NLEGPOLYORDERS = 16;

boost::scoped_array<float> legendreHostBuffer(new float[NLEGPOLYORDERS * NLEGPOLYORDERS * NLEGPOLYBINS]);

float stepSize = 1.0 / (((float) NLEGPOLYBINS/2.0) - 0.5);
float z = -1.0;

std::cout << "Generating legendre polynomials..." << std::endl;
for(size_t n = 0; n < NLEGPOLYORDERS; n++)
{
    for(size_t m = 0; m < NLEGPOLYORDERS; m++)
    {
        for(size_t zI = 0; zI < NLEGPOLYBINS; zI++)
        {
            using namespace boost::math;
            size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + zI;

            //-1..1 in NLEGPOLYBINS steps...
            float val;
            if(m > n)
            {
                legendreHostBuffer[index] = 0;
                continue;
            }
            else
            {
                //boost::math::legendre_p
                val = legendre_p<float>(n,m,z);
            }

            float nPm = n+m;
            float nMm = n-m;
            float factNum;
            float factDen;
            factNum = factorial<float>(n-m);
            factDen = factorial<float>(n+m);

            float nrmTerm;
            nrmTerm = pow(-1.0, m) * sqrt((n + 0.5) * (factNum/factDen));

            legendreHostBuffer[index] = val;
            z += stepSize;
            if(z > 1.0) z + 1.0;
        }
        z = -1.0;
    }
}

//DEBUGGING STEP: Dump everything we've just generated for m = 4, n = 5, z=-1..1
std::ofstream legDump("legDump.txt");
for(size_t i = 0; i < NLEGPOLYBINS; i++)
{
    int n = 5; int m = 4;
    size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + i;
    float texCoord = ((float) i / (float) NLEGPOLYBINS) * 2 - 1;
    legDump << i << " " << texCoord << " " << legendreHostBuffer[index] << std::endl;
}
legDump.close();

std::cout << "Creating legendre polynomial look up table image..." << std::endl;
cl::ImageFormat legFormat(CL_R, CL_FLOAT);

//Generate out legendre polynomials image...
m_legendreTable = cl::Image3D(m_clContext,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              legFormat,
                              NLEGPOLYORDERS,
                              NLEGPOLYORDERS,
                              NLEGPOLYBINS,
                              0,
                              0,
                              legendreHostBuffer.get());
Other than the index, the actual generation of the values is more or less irrelevant, but I've included it here for completeness.
And here is how I execute the kernel and read back the results:
cl::Buffer outLegDump = cl::Buffer(m_clContext, CL_MEM_WRITE_ONLY, NLEGPOLYBINS * sizeof(float));

//Create out kernel...
cl::Kernel kernel(m_program, "noiseMain");
kernel.setArg(0, m_legendreTable);
kernel.setArg(1, outLegDump);

size_t kernelSize = 1024;
cl::NDRange globalRange(kernelSize);
cl::NDRange localRange(1);

m_commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, cl::NullRange);
m_commandQueue.finish();

boost::scoped_array<float> legDumpHost(new float[NLEGPOLYBINS]);
m_commandQueue.enqueueReadBuffer(outLegDump, CL_TRUE, 0, NLEGPOLYBINS * sizeof(float), legDumpHost.get());

std::ofstream legreadback("legreadback.txt");
for(size_t i = 0; i < NLEGPOLYBINS; i++)
{
    legreadback << i << " " << legDumpHost[i] << std::endl;
}
legreadback.close();
When I look at the dumped data (i.e. that put out in legdump.txt from the host-side buffer), I get the expected data. However, when I compare it to the data received back from the device side (i.e. that looked up by the kernel and put out in legreadback.txt), I get incorrect values.
Since I'm calculating 1024 values in both cases, I'll spare everyone the whole dump; here are the first and last few values of each:
legdump.txt (host side sanity check):
0 -0
1 -0.0143913
2 -0.0573401
3 -0.12851
4 -0.227566
5 -0.354175
..
..
1020 0.12859
1021 0.0144185
1022 0.0144185
1023 1.2905e-8
legreadback.txt (device-side lookup and readback)
0 1
1 1
2 1
3 1
4 0.5
5 0
..
..
1020 7.74249e+11
1021 -1.91171e+15
1022 -3.81029e+15
1023 -1.91173e+15
Note that these values are the same across multiple runs of the code, so I don't think it's an initialization problem.
I can only assume that I'm calculating indices wrong somewhere, but I don't know where. I've checked the calculation of the Z coordinate (which naturally is defined on -1..1), its conversion to texture coordinates (0..1 range), and the conversion of M and N to texture coordinates (which should be done without interpolation), and found nothing to be wrong.
So my question is thus:
What is the proper way to create and index a 3D lookup table in OpenCL?
As expected, the problem turned out to be in the indexing on the host-side used to generate the lookup table.
The previous index calculation:
size_t index = (n * NLEGPOLYORDERS * NLEGPOLYBINS) + (m * NLEGPOLYBINS) + zI;
Was based on C++ 3D array indexing, which is not the way addressing works in OpenCL for a 3D image. A 3D image can be thought of as a "stack" of 2D images on top of each other, where the depth coordinate (Z in this case) selects the image, and the horizontal and vertical coordinates (m and n in this case) select the pixel within the selected image.
The correct indexing calculation is:
size_t index = m * NLEGPOLYORDERS + n + (zI * NLEGPOLYORDERS * NLEGPOLYORDERS);
As one can see, this new approach fits the "stacked image" layout described previously, whereas the previous calculation does not.
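For example, the generation loop could be restructured so that the innermost dimension follows the image's x axis (a sketch reusing the question's names NLEGPOLYORDERS, NLEGPOLYBINS, stepSize and legendreHostBuffer; the normalization term is omitted exactly as in the original code):
// Fill order matching an image3d_t of size (width = NLEGPOLYORDERS, height = NLEGPOLYORDERS,
// depth = NLEGPOLYBINS): index = x + y*width + z*width*height, with x = n, y = m, z = zI.
for (size_t zI = 0; zI < NLEGPOLYBINS; zI++) {
    float z = -1.0f + zI * stepSize;                     // same -1..1 sweep, recomputed per bin
    for (size_t m = 0; m < NLEGPOLYORDERS; m++) {
        for (size_t n = 0; n < NLEGPOLYORDERS; n++) {
            size_t index = n
                         + m  * NLEGPOLYORDERS
                         + zI * NLEGPOLYORDERS * NLEGPOLYORDERS;
            legendreHostBuffer[index] =
                (m > n) ? 0.0f : boost::math::legendre_p<float>(n, m, z);
        }
    }
}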

CUDA, "illegal memory access was encountered" in Memcpy

I have this CUDA file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
// Map threadIdx/blockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = x - blockDim.x * gridDim.x / 2;
float oy = y - blockDim.y * gridDim.y / 2;
float r = 0.2, g = 0.2, b = 0.5;
float maxz = -INF;
for (int i = 0; i < SPHERES_COUNT; i++) {
float n, t = s[i].hit(ox, oy, &n);
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from spere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
image.permute_axes("yzcx");
image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
I cannot understand why...
I know that if I remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
The error is not reported, so I thought it might be an out-of-bounds error reported later, but I have an identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......
    kernel<<<grids, threads>>>(dev_bitmap, s);
    ......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......
    Sphere *d_s;
    cudaGetSymbolAddress((void **)&d_s, s);
    kernel<<<grids, threads>>>(dev_bitmap, d_s);
    ......
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.
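In other words, the usual pattern, which this answer alludes to, is to drop the kernel's pointer parameter and let the kernel reference the __constant__ array directly, so the compiler can emit constant-cache loads. A sketch based on the question's own kernel (not a drop-in patch):
__constant__ Sphere s[SPHERES_COUNT];   // file-scope device symbol, visible to the kernel

// The kernel reads s directly instead of taking a Sphere* argument.
__global__ void kernel(unsigned char* bitmap)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float ox = x - blockDim.x * gridDim.x / 2;
    float oy = y - blockDim.y * gridDim.y / 2;
    float r = 0.2f, g = 0.2f, b = 0.5f;
    float maxz = -INF;

    for (int i = 0; i < SPHERES_COUNT; i++) {
        float n, t = s[i].hit(ox, oy, &n);   // s resolved as the __constant__ symbol
        if (t > maxz) {
            r = s[i].r * n;
            g = s[i].g * n;
            b = s[i].b * n;
            maxz = t;
        }
    }

    bitmap[offset*3]     = (int)(r * 255);
    bitmap[offset*3 + 1] = (int)(g * 255);
    bitmap[offset*3 + 2] = (int)(b * 255);
}

// Host side stays the same except for the launch:
//   cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere) * SPHERES_COUNT);
//   kernel<<<grids, threads>>>(dev_bitmap);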

CUDA - Optimize mean of matrix rows calculation using shared memory

I am trying to optimize the computation of the mean of each row in my 512w x 1024h image, and then subtract from each row the mean computed from it. I wrote a piece of code which does it in 1.86 ms, but I want to make it faster. This piece of code works fine, but it does not use shared memory and it uses for loops. I want to do away with them.
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
    // height = 1024, width = 512
    int tidy = threadIdx.x + blockDim.x * blockIdx.x;
    float sum = 0.0f;
    float sumDiv = 0.0f;

    if(tidy < height) {
        for(int c = 0; c < width; c++) {
            sum += img[tidy*width + c];
        }
        sumDiv = (sum/width)/2;
        //__syncthreads();
        for(int cc = 0; cc < width; cc++) {
            lineImg[tidy*width + cc] = img[tidy*width + cc] - sumDiv;
        }
    }
    __syncthreads();
}
I called the above kernel using:
subtractMean <<< 2, 512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);
However, the following code I wrote uses shared memory for the optimization, but it does not work as expected. Any thoughts on what the problem might be?
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
    extern __shared__ float perRow[];

    int idx = threadIdx.x;    // set idx along x
    int stride = width/2;

    while(idx < width) {
        perRow[idx] = 0;
        idx += stride;
    }
    __syncthreads();

    int tidx = threadIdx.x;   // set idx along x
    int tidy = blockIdx.x;    // set idx along y

    if(tidy < height) {
        while(tidx < width) {
            perRow[tidx] = img[tidy*width + tidx];
            tidx += stride;
        }
    }
    __syncthreads();

    tidx = threadIdx.x;       // reset idx along x
    tidy = blockIdx.x;        // reset idx along y

    if(tidy < height) {
        float sumAllPixelsInRow = 0.0f;
        float sumDiv = 0.0f;

        while(tidx < width) {
            sumAllPixelsInRow += perRow[tidx];
            tidx += stride;
        }
        sumDiv = (sumAllPixelsInRow/width)/2;

        tidx = threadIdx.x;   // reset idx along x
        while(tidx < width) {
            lineImg[tidy*width + tidx] = img[tidy*width + tidx] - sumDiv;
            tidx += stride;
        }
    }
    __syncthreads();
}
The shared memory function was called using:
subtractMean <<< 1024, 256, sizeof(float)*512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);
Two blocks are hardly enough to saturate the GPU. You are heading in the right direction by using more blocks; however, you are on Kepler, and I would like to present an option that does not use shared memory at all.
Start with 32 threads in a block (this can be changed later using 2D blocks)
With those 32 threads you should do something along the lines of this:
int rowID = blockIdx.x;
int tid   = threadIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
float sum = 0.0;
while(index < width){
    sum += img[width*rowID + index];
    index += blockDim.x;
}
At this point you will have 32 threads, each holding a partial sum. Next you need to add them all together. You can do this without using shared memory (since we are within a warp) by using a shuffle reduction. For details on that, look here: http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ What you want is the shuffle warp reduce, but you need to change it to use the full 32 threads.
Now that thread 0 in each warp has the sum of its row, you can divide it by the width cast to a float, and broadcast it to the rest of the warp using __shfl(average, 0). http://docs.nvidia.com/cuda/cuda-c-programming-guide/#warp-description
With the average found and the warps synchronized implicitly and explicitly (with shfl), you can continue on in a similar method with the subtract.
Possible further optimizations would be to include more than one warp in a block to improve occupancy, and to manually unroll the loops over the width to improve instruction level parallelism.
Good Luck.
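Putting those pieces together, a minimal sketch of the suggested kernel might look like this (my own illustration, one warp per row, using the pre-CUDA-9 __shfl/__shfl_down intrinsics referred to above; on newer toolkits you would use __shfl_sync/__shfl_down_sync with a full mask):
// One 32-thread warp per image row: warp-shuffle reduction for the row sum,
// then a __shfl broadcast of the mean, then the subtraction pass.
__global__ void subtractMeanWarp(const float *__restrict__ img, float *lineImg,
                                 int height, int width)
{
    int rowID = blockIdx.x;          // one block (one warp) per row
    int lane  = threadIdx.x;         // 0..31
    if (rowID >= height) return;

    // each lane accumulates a partial sum over the row
    float sum = 0.0f;
    for (int c = lane; c < width; c += blockDim.x)
        sum += img[rowID * width + c];

    // warp-level reduction: after this, lane 0 holds the full row sum
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_down(sum, offset);

    // broadcast lane 0's sum, then apply the same (sum/width)/2 scaling as the question's sumDiv
    float mean = __shfl(sum, 0) / (float)width / 2.0f;

    // subtraction pass, again strided across the warp
    for (int c = lane; c < width; c += blockDim.x)
        lineImg[rowID * width + c] = img[rowID * width + c] - mean;
}

// Hypothetical launch matching the question's 512w x 1024h image:
//   subtractMeanWarp<<<1024, 32>>>(originalImage, rowMajorImage, 1024, 512);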