RGB to greyscale conversion using CUDA

RGB to greyscale conversion using CUDA - c++

So I am trying to write a program that turns RGB images to greyscale.
I got the idea from the Udacity problem set. The problem is that when I write out the kernel in the Udacity web environment, it says my code works, however, when I try to do it locally on my computer, I get no errors, but my image instead of coming out greyscale, comes out completely grey. It looks like one grey box the dimensions of the image I loaded. Can you help me find the error in my code, I've compared it with the Udacity version and I can't seem to find it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <cuda.h>
#include <stdio.h>
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <iostream>
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
__global__ void rgb_2_grey(uchar* const greyImage, const uchar4* const rgbImage, int rows, int columns)
{
int rgb_x = blockIdx.x * blockDim.x + threadIdx.x; //x coordinate of pixel
int rgb_y = blockIdx.y * blockDim.y + threadIdx.y; //y coordinate of pixel
if ((rgb_x >= columns) && (rgb_y >= rows)) {
return;
}
int rgb_ab = rgb_y*columns + rgb_x; //absolute pixel position
uchar4 rgb_Img = rgbImage[rgb_ab];
greyImage[rgb_ab] = uchar((float(rgb_Img.x))*0.299f + (float(rgb_Img.y))*0.587f + (float(rgb_Img.z))*0.114f);
}
using namespace cv;
using namespace std;
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage);
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols);
void Save_Img();
Mat img_RGB;
Mat img_Grey;
uchar4 *d_rgbImg;
uchar *d_greyImg;
int main()
{
uchar4* h_rgbImg;
//uchar4* d_rgbImge=0;
uchar* h_greyImg;
//uchar* d_greyImge=0;
Proc_Img(&h_rgbImg, &h_greyImg, &d_rgbImg, &d_greyImg);
RGB_2_Greyscale(d_greyImg, d_rgbImg, img_RGB.rows, img_RGB.cols);
Save_Img();
return 0;
}
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage){
cudaFree(0);
CudaCheckError();
//loads image into a matrix object along with the colors in BGR format (must convert to rgb).
Mat img = imread("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG", CV_LOAD_IMAGE_COLOR);
if (img.empty()){
cerr << "couldnt open file dumbas..." << "C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG" << endl;
exit(1);
}
//converts color type from BGR to RGB
cvtColor(img, img_RGB, CV_BGR2RGBA);
//allocate memory for new greyscale image.
//img.rows returns the range of pixels in y, img.cols returns range of pixels in x
//CV_8UC1 means 8 bit unsigned(non-negative) single channel of color, aka greyscale.
//all three of the parameters allow the create function in the Mat class to determine how much memory to allocate
img_Grey.create(img.rows, img.cols, CV_8UC1);
//creates rgb and greyscale image arrays
*h_RGBImage = (uchar4*)img_RGB.ptr<uchar>(0); //.ptr is a method in the mat class that returns a pointer to the first element of the matrix.
*h_greyImage = (uchar*)img_Grey.ptr<uchar>(0); //this is just like a regular array/pointer mem address to first element of the array. This is templated
//in this case the compiler runs the function for returning pointer of type unsigned char. for rgb image it is
//cast to uchar4 struct to hold r,g, and b values.
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols); //amount of pixels
//allocate memory on gpu
cudaMalloc(d_RGBImage, sizeof(uchar4) * num_pix); //bites of 1 uchar4 times # of pixels gives number of bites necessary for array
CudaCheckError();
cudaMalloc(d_greyImage, sizeof(uchar) * num_pix);//bites of uchar times # pixels gives number of bites necessary for array
CudaCheckError();
cudaMemset(*d_greyImage, 0, sizeof(uchar) * num_pix);
CudaCheckError();
//copy array into allocated space
cudaMemcpy(*d_RGBImage, *h_RGBImage, sizeof(uchar4)*num_pix, cudaMemcpyHostToDevice);
CudaCheckError();
d_rgbImg = *d_RGBImage;
d_greyImg = *d_greyImage;
}
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols){
const int BS = 16;
const dim3 blockSize(BS, BS);
const dim3 gridSize((num_Cols / BS) + 1, (num_Rows / BS) + 1);
rgb_2_grey <<<gridSize, blockSize>>>(d_greyImage, d_RGBImage, num_Rows, num_Cols);
cudaDeviceSynchronize(); CudaCheckError();
}
void Save_Img(){
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols);
cudaMemcpy(img_Grey.ptr<uchar>(0), d_greyImg, sizeof(uchar)*num_pix, cudaMemcpyDeviceToHost);
CudaCheckError();
imwrite("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581GR.JPG", img_Grey);
cudaFree(d_rgbImg);
cudaFree(d_greyImg);
}
EDIT: I realized that the local var in my main is the same name as the global var, I have edited the code here, now I get the error from visual studio that the
variable d_rgbIme is being used without being initialized
when I have already initialized it above. If I set them equal to zero I get a CUDA error saying
an illegal memory access was encountered
I tried running cuda-memcheck, but then I get the error that i could not run the file...

I have found the error thanks to one of the comments by Robert Crovella, he has been very helpful with this! it is in my kernel the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {

I was working on the same problem in JCUDA. See if you can use any part of this solution:
//Read Height and Width of image in Height & Width variables
int Width = image.getWidth();
int Height = image.getHeight();
int N = Height * Width;
int[] grayScale = new int[N];
//Allocate separate arrays to store Alpha, Red, Green and
//Blue values for every pixel
int[] redHost = new int[N];
int[] greenHost = new int[N];
int[] blueHost = new int[N];
int[] alphaHost = new int[N];
for(int i=0; i<Height; i++)
{
for(int j=0; j<Width; j++)
{
int pixel = image.getRGB(j, i);
//Read the ARGB data
alphaHost[i*Width+j] = (pixel >> 24) & 0xff;
redHost[i*Width+j] = (pixel >> 16) & 0xff;
greenHost[i*Width+j] = (pixel >> 8) & 0xff;
blueHost[i*Width+j] = (pixel) & 0xff;
}
}
/* Following are the CUDA Kernel parameters*/
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{N}), //Total size of each array W * H
Pointer.to(redDev), // Pointer to redArray on device
Pointer.to(greenDev), // Pointer to greenArray on device
Pointer.to(blueDev), // Pointer to blueArray on device
Pointer.to(Output)); //Pointer to output array
/*Following is my RGBToGrayScale.cu..i.e. CUDA Kernel */
__global__ void RGBtoGrayScale(int N, int *red, int *green, int *blue, int *Output)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id<N)
{
Output[id] = (red[id]*0.2989) + (green[id]*0.587) + (blue[id]*0.114);
}
}
/* Get the output data back to Host memory */
cuMemcpyDtoH(Pointer.to(grayScale), Output, N * Sizeof.INT);
/* Write the image with the new RBG values*/
BufferedImage im = new BufferedImage(Width,Height,BufferedImage.TYPE_BYTE_GRAY);
WritableRaster raster = im.getRaster();
for(int i=0;i<Height;i++)
{
for(int j=0;j<Width;j++)
{
raster.setSample(j, i, 0, grayScale[i*Width+j]);
}
}
try
{
ImageIO.write(im,"JPEG",new File("glpattern.jpeg"));
} catch (IOException e)
{
e.printStackTrace();
}

Related

libpng output is not the expected one

I'm trying to get a grasp of basic features in libpng. To do this, I've used this snippet and adapted to my own example.
int x, y;
png_byte color_type = PNG_COLOR_TYPE_RGBA;
png_byte bit_depth = 16;
png_structp png_ptr;
png_infop info_ptr;
auto create_image(const int height, const int width) {
png_byte **rows = new png_byte *[height];
for (auto i = 0; i < height; i++)
rows[i] = new png_byte[width * 4];
return rows;
}
auto modify_image(const int height, const int width, png_byte **rows) {
for (auto i = 0; i < height; i++) {
for (auto j = 0; j < width; j++) {
// Red channel
rows[i][j * 4 + 0] = (j * 127.) / width;
// Blue channel
rows[i][j * 4 + 2] = (i * 127.) / height;
// Alpha channel
rows[i][j * 4 + 3] = 127;
}
}
}
void write(const std::string& filename, const int height, const int width, png_byte** rows)
{
/* create file */
FILE *fp = fopen(filename.c_str(), "wb");
if (!fp)
abort_("[write_png_file] File %s could not be opened for writing", filename);
/* initialize stuff */
png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr)
abort_("[write_png_file] png_create_write_struct failed");
info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr)
abort_("[write_png_file] png_create_info_struct failed");
if (setjmp(png_jmpbuf(png_ptr)))
abort_("[write_png_file] Error during init_io");
png_init_io(png_ptr, fp);
/* write header */
if (setjmp(png_jmpbuf(png_ptr)))
abort_("[write_png_file] Error during writing header");
png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
png_write_info(png_ptr, info_ptr);
/* write bytes */
if (setjmp(png_jmpbuf(png_ptr)))
abort_("[write_png_file] Error during writing bytes");
png_write_image(png_ptr, rows);
/* end write */
if (setjmp(png_jmpbuf(png_ptr)))
abort_("[write_png_file] Error during end of write");
png_write_end(png_ptr, NULL);
fclose(fp);
}
Though, when I run my code:
void render(const int height, const int width, const std::string &filename) {
png_byte **rows = create_image(height, width);
modify_image(height, width, rows);
write(filename, height, width, rows);
}
The problem is... I don't get the expected result at all. While I was expecting a square image with some kind of gradient I get... two rectangular rectangles.
Also, I've noticed that these two rectangles are stretched: while trying to render a circle, I've found that the circle was distorted and doubling its width makes it an actual circle...
Finally, I've seen that on the second rectangle, the last row seems to be random data (which is not the case on the first rectangle).
Please suggest if you have any ideas.

You create image with 16 bit depth yet use 1 byte per channel. Output image consists of odd / even rows of your original image being put on the same row. Basically each line of the right rectangle is caused by a buffer overrun. You need to allocate buffers that are twice as big, that is width * 4 * 2 and fill higher and lower bytes of each channel separately.

Corrupted heap while display a BMP image on console

I have a exercise. It says, that the C program should be able to read the information of a bitmap file and after that it should display the picture on console.
I have already written a code but when it does not work correctly.
When I debugged the code it looks like the heap is corrupted. I thinks I have a known glitch/mistake in ScanPixelline function.
I don't know how to fix it. Can someone help me to check it?
I am relatively new to C programming.
#include "stdafx.h"
#include <conio.h>
#include <stdio.h>
#include <stdlib.h>
#include "stdint.h"
#include "windows.h"
#pragma pack(1)
struct BMP
{
char Type[2]; //File type. Set to "BM".
int32_t Size; //Size in BYTES of the file.
int16_t Reserved1; //Reserved. Set to zero.
int16_t Reserved2; //Reserved. Set to zero.
int32_t OffSet; //Offset to the data.
int32_t headsize; //Size of rest of header. Set to 40.
int32_t Width; //Width of bitmap in pixels.
int32_t Height; // Height of bitmap in pixels.
int16_t Planes; //Number of Planes. Set to 1.
int16_t BitsPerPixel; //Number of Bits per pixels.
int32_t Compression; //Compression. Usually set to 0.
int32_t SizeImage; //Size in bytes of the bitmap.
int32_t XPixelsPreMeter; //Horizontal pixels per meter.
int32_t YPixelsPreMeter; //Vertical pixels per meter.
int32_t ColorsUsed; //Number of colors used.
int32_t ColorsImportant; //Number of "important" colors.
};
struct Color
{
unsigned char B;
unsigned char G;
unsigned char R;
};
struct ColorTable
{
Color *colors;
unsigned long length;
};
struct PixelArray
{
Color **pixels;
unsigned long rowCount;
unsigned long columnCount;
};
void readBMP(char *File_Name, BMP &a)
{
FILE *p = fopen(File_Name, "rb");
if (p == NULL)
{
printf("Can't open file!");
fclose(p);
return;
}
else
{
fread(&a, sizeof(BMP), 1, p);
}
fclose(p);
}
void Get_Inf(BMP a)
{
if (a.Type[0] != 'B' || a.Type[1] != 'M')
{
printf("This is not a BMP file");
}
else
{
printf("This is a BMP file\n");
printf("The size of this file is %lu bytes\n", a.Size);
printf("The witdth of this image is %lu pixels\n", a.Width);
printf("The height of this image is %lu pixels\n", a.Height);
printf("The number of bits per pixels in this image is %u\n", a.BitsPerPixel);
}
}
void scanBmpPixelLine(Color *&line, unsigned long length)
{
FILE *pointer_ = fopen("test.bmp", "rb");
line = new Color[length];
fread(line, sizeof(Color), sizeof(Color)*length, pointer_);
fclose(pointer_);
//file.read((char *)line, length * sizeof(Color));
}
void skipBmpPadding(char count)
{
FILE *pointer__ = fopen("test.bmp", "rb");
if (count == 0)
{
fclose(pointer__);
return;
}
char padding[3];
fread(&padding, sizeof(char), count, pointer__);
fclose(pointer__);
//file.read((char *)&padding, count);
}
void ReadPixelArray(BMP a, PixelArray &data)
{
FILE *pointer = fopen("test.bmp", "rb");
data.rowCount = a.Height;
data.columnCount = a.Width;
data.pixels = new Color*[data.rowCount];
char paddingCount = (4 - (a.Width * (a.BitsPerPixel / 8) % 4)) % 4;
fseek(pointer, 54, SEEK_SET);
for (int i = 0; i < data.rowCount; i++)
{
scanBmpPixelLine(data.pixels[data.rowCount - i - 1], a.Width);
skipBmpPadding(paddingCount);
}
}
void drawBmp(BMP a, PixelArray data)
{
HWND console = GetConsoleWindow();
HDC hdc = GetDC(console);
for (int i = 0; i < a.Height; i++)
for (int j = 0; j < a.Width; j++)
{
Color pixel = data.pixels[i][j];
SetPixel(hdc, j, i, RGB(pixel.R, pixel.G, pixel.B));
}
ReleaseDC(console, hdc);
}
void releaseBmpPixelArray(PixelArray data)
{
for (int i = 0; i < data.rowCount; i++)
delete[]data.pixels[i];
delete[]data.pixels;
}
int main()
{
char file_name[] = "test.bmp";
BMP a;
PixelArray data;
readBMP(file_name, a);
Get_Inf(a);
ReadPixelArray(a, data);
drawBmp(a, data);
releaseBmpPixelArray(data);
}

This function:
void scanBmpPixelLine(Color *&line, unsigned long length)
{
FILE *pointer_ = fopen("test.bmp", "rb");
line = new Color[length];
fread(line, sizeof(Color), sizeof(Color)*length, pointer_);
fclose(pointer_);
//file.read((char *)line, length * sizeof(Color));
}
For starters, the intent of the function appears to be to read one line of pixel data from the file. But instead, it's re-opening the file and reading from the beginning (where the header bytes are). I'm not sure if you are aware of that...
But the crash is a result of this line:
fread(line, sizeof(Color), sizeof(Color)*length, pointer_);
The second parameter, sizeof(Color), is the size of each element. The third parameter is the number of elements to read. The total bytes read from the file will be the multiplication of the second parameter by the third parameter. So you've redundantly multiplied by sizeof(Color) one too many times. The result is that it will overwrite the line buffer.
To fix, it should be:
fread(line, sizeof(Color), length, pointer_);
You probably want to pass the FILE* pointer obtained from your ReadPixelArray function into this function instead of re-opening the file for every line.
Another code review comment. You should just read the entire file into memory instead of redundantly opening and closing the file for each operation. Then parse the header and set a pointer to the first "line" after the header.

C/CUDA: Only every fourth element in CudaArray can be indexed

This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C-project where a binary raw file with 3d-data is loaded, processed in CUDA and saved in a new binary raw file.
This is based on the simpleTexture3D project from CUDA Samples:
This is my cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>
typedef unsigned int uint;
typedef unsigned char uchar;
const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 64, height = 64, depth=64;
//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);
uint *d_output = NULL;
int *pArgc = NULL;
char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);
void loadVolumeData(char *exec_path);
// render image using CUDA
void render()
{
// call CUDA kernel
render_kernel(gridSize, blockSize, d_output, width, height, depth);
getLastCudaError("render_kernel failed");
}
void cleanup()
{
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
checkCudaErrors(cudaDeviceReset());
}
// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
FILE *fp = fopen(filename, "rb");
if (!fp)
{
fprintf(stderr, "Error opening file '%s'\n", filename);
return 0;
}
uchar *data = (uchar *) malloc(size);
size_t read = fread(data, 1, size, fp);
fclose(fp);
printf("Read '%s', %lu bytes\n", filename, read);
return data;
}
// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
{
int returnState=0;
// cut file extension from filename
char *a=strdup(filename); //via strdup you dumb a const char to char, you must free it yourself
int len = strlen(a);
a[len-4] = '\0'; //deletes '.raw'
//printf("%s\n",a);
char b[50];
sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);
//char b[]="_out.raw"; //Add suffix out to filename
char buffer[256]; // <- danger, only storage for 256 characters.
strncpy(buffer, a, sizeof(buffer));
strncat(buffer, b, sizeof(buffer));
free(a);
FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared
if (!fp)
{
fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
return 0;
}
size_t write = fwrite(data, 1, size, fp);
fclose(fp);
if (write==size)
{
printf("Wrote %lu bytes to '%s'\n", write, buffer);
return 0;
}
else
{
printf("Error writing data to file '%s'\n", buffer);
return 1;
}
}
// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
{
int result = 0;
result = findCudaDevice(argc, (const char **)argv);
return result;
}
void runAutoTest(char *exec_path, char *PathToFile)
{
// set path
char *path;
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
// Allocate output memory
checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
// zero out the output array with cudaMemset
cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
// render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, depth);
checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("render_kernel failed");
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
int wState=writeRawFile(path,h_output,width*height*depth);
checkCudaErrors(cudaFree(d_output));
free(h_output);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
//exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void loadVolumeData(char *exec_path, char *PathToFile)
{
char *path;
// load volume data
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
uchar *h_volume = loadRawFile(path, size);
//int wState=writeRawFile(path,h_volume,size);
initCuda(h_volume, volumeSize);
free(h_volume);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
pArgc = &argc;
pArgv = argv;
char *image_file = NULL;
printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) //Note cmd line argument is -file "PathToFile/File.raw"
{ // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"
getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
}
if (image_file)
{
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],image_file);
runAutoTest(argv[0],image_file);
}
else
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],NULL);
runAutoTest(argv[0],NULL);
}
printf("I am finished...\n"
"Can I get some ice cream please\n");
exit(EXIT_SUCCESS);
}
And this is my .cu
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
#include <helper_math.h>
typedef unsigned int uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex; // 3D texture
cudaArray *d_volumeArray = 0;
__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
// float u = x / (float) imageW;
// float v = y / (float) imageH;
//float w = z / (float) imageD;
// // read from 3D texture
// float voxel = tex3D(tex, u, v, w);
uint ps=__umul24(imageW,imageH);
if ((x < imageW) && (y < imageH) && (z < imageD))
{
// write output color
uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
d_output[1] = (uchar) 255;//+0*voxel*255;
}
}
extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
// create 3D array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&copyParams));
// set texture parameters
tex.normalized = true; // access with normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeBorder; // wrap texture coordinates
tex.addressMode[1] = cudaAddressModeBorder;
tex.addressMode[2] = cudaAddressModeBorder;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
{
d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
As you can see, currently, I set all values to zero except the index = 1, which is set to 255. Yet when I now open the image stack in Fiji, I see that the fourth pixel on the first slide is white. If I use index=i instead, I get white vertical lines across the image stack periodically every four columns. Generally spoken, it seems that only every fourth element is beeing indexed in the CudaArray. So I am wondering if there is somekind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. There would obviously be the factor 4 :)
I am eager to here from you experts
Cheers Mika

I figured it out by myself. The kernel works with uint* d_output while the copy to the host is written into a uchar* h_output
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
This led to this strange behavior

Flip and crop a bitmap

I'm getting a bytearray (32 bit or 16 bit) from a source.
If the size width is odd, the last pixel in each row needs to be dropped.
If the height is odd, the last row needs to be dropped.
If the height is negative the bitmap needs to be flipped vertically.
Here is my code so far:
m_pbmiLast = new BITMAPINFO(*m_pbmi);
m_pbmiLast->bmiHeader.biWidth = abs(m_pbmiLast->bmiHeader.biWidth) - (abs(m_pbmiLast->bmiHeader.biWidth) % 2);
m_pbmiLast->bmiHeader.biHeight = abs(m_pbmiLast->bmiHeader.biHeight) - (abs(m_pbmiLast->bmiHeader.biHeight) % 2);
int biWidth = m_pbmiLast->bmiHeader.biWidth;
int biHeight = m_pbmiLast->bmiHeader.biHeight;
int iAdjustedStride = ((((biWidth * m_pbmiLast->bmiHeader.biBitCount) + 31) & ~31) >> 3);
int iRealStride = ((((m_pbmi->bmiHeader.biWidth * m_pbmi->bmiHeader.biBitCount) + 31) & ~31) >> 3);
if (m_pbmi->bmiHeader.biHeight < 0) {
/* Copy the actual data */
int iLineOffsetSource = 0;
int iLineOffsetDest = (biHeight - 1) * iRealStride;
for (int i = 0; i < biHeight; ++i) {
memcpy(&pData[iLineOffsetDest], &m_inputBuffer[iLineOffsetSource], iAdjustedStride);
iLineOffsetSource += iRealStride;
iLineOffsetDest -= iRealStride;
}
} else {
int iLineOffset = 0;
for (int i = 0; i < biHeight; ++i) {
memcpy(&pData[iLineOffset], &m_inputBuffer[iLineOffset], iAdjustedStride);
iLineOffset += iRealStride;
}
}
It doesn't flip the bitmap, and when the bitmap is an odd width, it slants the bitmap.

Can be done like so.. I include the reading and writing just to make it an SSCCE. It has little to no error.
As for my comment about new BITMAPINFO. I was saying that you don't have to allocate such a small structure on the HEAP. Ditch the new part. The only allocation you need for a bitmap is the pixels. The header and other info does not need an allocation at all.
See the Flip function below.
#include <iostream>
#include <fstream>
#include <cstring>
#include <windows.h>
typedef struct
{
BITMAPFILEHEADER Header;
BITMAPINFO Info;
unsigned char* Pixels;
} BITMAPDATA;
void LoadBmp(const char* path, BITMAPDATA* Data)
{
std::ifstream hFile(path, std::ios::in | std::ios::binary);
if(hFile.is_open())
{
hFile.read((char*)&Data->Header, sizeof(Data->Header));
hFile.read((char*)&Data->Info, sizeof(Data->Info));
hFile.seekg(Data->Header.bfOffBits, std::ios::beg);
Data->Pixels = new unsigned char[Data->Info.bmiHeader.biSizeImage];
hFile.read((char*)Data->Pixels, Data->Info.bmiHeader.biSizeImage);
hFile.close();
}
}
void SaveBmp(const char* path, BITMAPDATA* Data)
{
std::ofstream hFile(path, std::ios::out | std::ios::binary);
if (hFile.is_open())
{
hFile.write((char*)&Data->Header, sizeof(Data->Header));
hFile.write((char*)&Data->Info, sizeof(Data->Info));
hFile.seekp(Data->Header.bfOffBits, std::ios::beg);
hFile.write((char*)Data->Pixels, Data->Info.bmiHeader.biSizeImage);
hFile.close();
}
}
void Flip(BITMAPDATA* Data)
{
unsigned short bpp = Data->Info.bmiHeader.biBitCount;
unsigned int width = std::abs(Data->Info.bmiHeader.biWidth);
unsigned int height = std::abs(Data->Info.bmiHeader.biHeight);
unsigned char* out = new unsigned char[Data->Info.bmiHeader.biSizeImage];
unsigned long chunk = (bpp > 24 ? width * 4 : width * 3 + width % 4);
unsigned char* dst = out;
unsigned char* src = Data->Pixels + chunk * (height - 1);
while(src != Data->Pixels)
{
std::memcpy(dst, src, chunk);
dst += chunk;
src -= chunk;
}
std::memcpy(dst, src, chunk); //for 24-bit.
std::swap(Data->Pixels, out);
delete[] out;
}
int main()
{
BITMAPDATA Data;
LoadBmp("C:/Users/Brandon/Desktop/Bar.bmp", &Data);
Flip(&Data);
SaveBmp("C:/Users/Brandon/Desktop/Foo.bmp", &Data);
delete[] Data.Pixels;
return 0;
}

Examples or tutorials of using libjpeg-turbo's TurboJPEG

The instructions for libjpeg-turbo here describes the TurboJPEG API thus: "This API wraps libjpeg-turbo and provides an easy-to-use interface for compressing and decompressing JPEG images in memory". Great, but are there some solid examples of using this API available? Just looking to decompress a fairly vanilla jpeg in memory.
I've found a few bits such as https://github.com/erlyvideo/jpeg/blob/master/c_src/jpeg.c, which appears to be using the TurboJPEG API, but are there any more solid/varied examples?
The source for libjpeg-turbo is well documented, so that does help.

Ok, I know that you did already solve your problem, but as some people, just like me, could be searching some simple example I will share what I created.
It is an example, compressing and decompressing an RGB image. Otherwise I think that the API documentation of TurboJPEG is quite easy to understand!
Compression:
#include <turbojpeg.h>
const int JPEG_QUALITY = 75;
const int COLOR_COMPONENTS = 3;
int _width = 1920;
int _height = 1080;
long unsigned int _jpegSize = 0;
unsigned char* _compressedImage = NULL; //!< Memory is allocated by tjCompress2 if _jpegSize == 0
unsigned char buffer[_width*_height*COLOR_COMPONENTS]; //!< Contains the uncompressed image
tjhandle _jpegCompressor = tjInitCompress();
tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
&_compressedImage, &_jpegSize, TJSAMP_444, JPEG_QUALITY,
TJFLAG_FASTDCT);
tjDestroy(_jpegCompressor);
//to free the memory allocated by TurboJPEG (either by tjAlloc(),
//or by the Compress/Decompress) after you are done working on it:
tjFree(&_compressedImage);
After that you have the compressed image in _compressedImage.
To decompress you have to do the following:
Decompression:
#include <turbojpeg.h>
long unsigned int _jpegSize; //!< _jpegSize from above
unsigned char* _compressedImage; //!< _compressedImage from above
int jpegSubsamp, width, height;
unsigned char buffer[width*height*COLOR_COMPONENTS]; //!< will contain the decompressed image
tjhandle _jpegDecompressor = tjInitDecompress();
tjDecompressHeader2(_jpegDecompressor, _compressedImage, _jpegSize, &width, &height, &jpegSubsamp);
tjDecompress2(_jpegDecompressor, _compressedImage, _jpegSize, buffer, width, 0/*pitch*/, height, TJPF_RGB, TJFLAG_FASTDCT);
tjDestroy(_jpegDecompressor);
Some random thoughts:
I just came back over this as I am writing my bachelor thesis, and I noticed that if you run the compression in a loop it is preferable to store the biggest size of the JPEG buffer to not have to allocate a new one every turn. Basically, instead of doing:
long unsigned int _jpegSize = 0;
tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
&_compressedImage, &_jpegSize, TJSAMP_444, JPEG_QUALITY,
TJFLAG_FASTDCT);
we would add an object variable, holding the size of the allocated memory long unsigned int _jpegBufferSize = 0; and before every compression round we would set the jpegSize back to that value:
long unsigned int jpegSize = _jpegBufferSize;
tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
&_compressedImage, &jpegSize, TJSAMP_444, JPEG_QUALITY,
TJFLAG_FASTDCT);
_jpegBufferSize = _jpegBufferSize >= jpegSize? _jpegBufferSize : jpegSize;
after the compression one would compare the memory size with the actual jpegSize and set it to the jpegSize if it is higher than the previous memory size.

I ended up using below code as a working example for both JPEG encoding and decoding. Best example that I can find, it's self-contained that initializes a dummy image and output the encoded image to a local file.
Below code is NOT my own, credit goes to https://sourceforge.net/p/libjpeg-turbo/discussion/1086868/thread/e402d36f/#8722 . Posting it here again to help anyone finds it's difficult to get libjpeg turbo working.
#include "turbojpeg.h"
#include <iostream>
#include <string.h>
#include <errno.h>
using namespace std;
int main(void)
{
unsigned char *srcBuf; //passed in as a param containing pixel data in RGB pixel interleaved format
tjhandle handle = tjInitCompress();
if(handle == NULL)
{
const char *err = (const char *) tjGetErrorStr();
cerr << "TJ Error: " << err << " UNABLE TO INIT TJ Compressor Object\n";
return -1;
}
int jpegQual =92;
int width = 128;
int height = 128;
int nbands = 3;
int flags = 0;
unsigned char* jpegBuf = NULL;
int pitch = width * nbands;
int pixelFormat = TJPF_GRAY;
int jpegSubsamp = TJSAMP_GRAY;
if(nbands == 3)
{
pixelFormat = TJPF_RGB;
jpegSubsamp = TJSAMP_411;
}
unsigned long jpegSize = 0;
srcBuf = new unsigned char[width * height * nbands];
for(int j = 0; j < height; j++)
{
for(int i = 0; i < width; i++)
{
srcBuf[(j * width + i) * nbands + 0] = (i) % 256;
srcBuf[(j * width + i) * nbands + 1] = (j) % 256;
srcBuf[(j * width + i) * nbands + 2] = (j + i) % 256;
}
}
int tj_stat = tjCompress2( handle, srcBuf, width, pitch, height,
pixelFormat, &(jpegBuf), &jpegSize, jpegSubsamp, jpegQual, flags);
if(tj_stat != 0)
{
const char *err = (const char *) tjGetErrorStr();
cerr << "TurboJPEG Error: " << err << " UNABLE TO COMPRESS JPEG IMAGE\n";
tjDestroy(handle);
handle = NULL;
return -1;
}
FILE *file = fopen("out.jpg", "wb");
if (!file) {
cerr << "Could not open JPEG file: " << strerror(errno);
return -1;
}
if (fwrite(jpegBuf, jpegSize, 1, file) < 1) {
cerr << "Could not write JPEG file: " << strerror(errno);
return -1;
}
fclose(file);
//write out the compress date to the image file
//cleanup
int tjstat = tjDestroy(handle); //should deallocate data buffer
handle = 0;
}

In the end I used a combination of random code found on the internet (e.g. https://github.com/erlyvideo/jpeg/blob/master/c_src/jpeg.c) and the .c and header files for libjeg-turbo, which are well documented.
This official API is a good information source aswell.

Here's a fragment of code what I use to load jpeg's from memory. Maybe it will require a bit of fixing, because I extracted it from different files in my project. It will load both - grayscale and rgb images (bpp will be set either to 1 or to 3).
struct Image
{
int bpp;
int width;
int height;
unsigned char* data;
};
struct jerror_mgr
{
jpeg_error_mgr base;
jmp_buf jmp;
};
METHODDEF(void) jerror_exit(j_common_ptr jinfo)
{
jerror_mgr* err = (jerror_mgr*)jinfo->err;
longjmp(err->jmp, 1);
}
METHODDEF(void) joutput_message(j_common_ptr)
{
}
bool Image_LoadJpeg(Image* image, unsigned char* img_data, unsigned int img_size)
{
jpeg_decompress_struct jinfo;
jerror_mgr jerr;
jinfo.err = jpeg_std_error(&jerr.base);
jerr.base.error_exit = jerror_exit;
jerr.base.output_message = joutput_message;
jpeg_create_decompress(&jinfo);
image->data = NULL;
if (setjmp(jerr.jmp)) goto bail;
jpeg_mem_src(&jinfo, img_data, img_size);
if (jpeg_read_header(&jinfo, TRUE) != JPEG_HEADER_OK) goto bail;
jinfo.dct_method = JDCT_FLOAT; // change this to JDCT_ISLOW on Android/iOS
if (!jpeg_start_decompress(&jinfo)) goto bail;
if (jinfo.num_components != 1 && jinfo.num_components != 3) goto bail;
image->data = new (std::nothrow) unsigned char [jinfo.output_width * jinfo.output_height * jinfo.output_components];
if (!image->data) goto bail;
{
JSAMPROW ptr = image->data;
while (jinfo.output_scanline < jinfo.output_height)
{
if (jpeg_read_scanlines(&jinfo, &ptr, 1) != 1) goto bail;
ptr += jinfo.output_width * jinfo.output_components;
}
}
if (!jpeg_finish_decompress(&jinfo)) goto bail;
image->bpp = jinfo.output_components;
image->width = jinfo.output_width;
image->height = jinfo.output_height;
jpeg_destroy_decompress(&jinfo);
return true;
bail:
jpeg_destroy_decompress(&jinfo);
if (image->data) delete [] data;
return false;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

RGB to greyscale conversion using CUDA - c++

I have found the error thanks to one of the comments by Robert Crovella, he has been very helpful with this! it is in my kernel the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {

Related

libpng output is not the expected one

Corrupted heap while display a BMP image on console

C/CUDA: Only every fourth element in CudaArray can be indexed

Flip and crop a bitmap

Examples or tutorials of using libjpeg-turbo's TurboJPEG

Categories

Resources