Please help me clean up my heap after loading a bitmap with FreeImage.
delete[] data;
causes an _ASSERTE(_CrtIsValidHeapPointer(pUserData)) assertion failure, and I cannot find a way to fix it other than commenting this line out. Will there be a memory leak if I do that?
Any help and explanation will be appreciated!
Full code at pastebin:
Visual Studio 2012 solution (with huge FreeImage static lib): (15.7 Mbytes!)
Full code here:
#include <iostream>
// FreeImage static linkage
#include "FreeImage/FreeImage.h"
#include "FreeImage/Utilities.h"
#pragma comment(lib, "FreeImage/FreeImaged.lib")
using namespace std;
static const wchar_t* sk_Filename = L"Test.tga";
// Error handler to use in callback
/// FreeImage error callback: writes the failing format (when known) and the
/// library-supplied message to stdout.
///
/// @param fif  Format associated with the error; may be FIF_UNKNOWN.
/// @param msg  Message text supplied by FreeImage.
void FreeImageErrorHandler(FREE_IMAGE_FORMAT fif, const char *msg)
{
    // The format lookup may yield NULL when the format is not known, and
    // passing NULL to "%s" is undefined behaviour, so use a placeholder.
    const char *format = (fif != FIF_UNKNOWN) ? FreeImage_GetFormatFromFIF(fif) : NULL;

    char buf[1024];
    // The original printed only the format name and dropped msg entirely.
    sprintf_s(buf, 1024, "Error: [%s] %s\n",
              format ? format : "unknown format",
              msg ? msg : "(no message)");
    cout << buf;
}
// Bitmap loader from FreeImage samples
/// Loads a bitmap from a wide-character path.
///
/// The format is taken from the file signature first and, if that fails,
/// from the file name.
///
/// @param lpszPathName  Path of the image file.
/// @param flag          Load flags forwarded to FreeImage_LoadU.
/// @return The loaded bitmap, or NULL when the format is unknown, cannot be
///         read, or loading fails.
FIBITMAP* GenericLoaderU(const wchar_t* lpszPathName, int flag)
{
    // fif was used without a declaration in the original snippet.
    FREE_IMAGE_FORMAT fif = FreeImage_GetFileTypeU(lpszPathName, 0);
    if (fif == FIF_UNKNOWN)
    {
        // No usable signature: fall back to the file name.
        fif = FreeImage_GetFIFFromFilenameU(lpszPathName);
    }

    if ((fif != FIF_UNKNOWN) && FreeImage_FIFSupportsReading(fif))
    {
        return FreeImage_LoadU(fif, lpszPathName, flag);
    }
    return NULL;
}
// Function gets filename and returns bitmap data array, its size and bits per pixel
void GetData(const wchar_t* szFilename, unsigned char* data, unsigned int& width, unsigned int& height, unsigned int& bpp)
FIBITMAP* src = GenericLoaderU(szFilename, 0);
if(src == 0)
FIBITMAP* src32 = FreeImage_ConvertTo32Bits(src);
// Get picture info
width = FreeImage_GetWidth(src32);
height = FreeImage_GetHeight(src32);
bpp = FreeImage_GetBPP(src32);
unsigned int scan_width = width * bpp/8;
if((width == 0) || (height == 0) || (bpp == 0))
memset(data, 0, height * scan_width);
SwapRedBlue32(src32); // Convert BGR to RGB
// Get bitmap data
FreeImage_ConvertToRawBits(data, src32, scan_width, bpp, FI_RGBA_RED_MASK, FI_RGBA_GREEN_MASK, FI_RGBA_BLUE_MASK, TRUE);
int main()
//Creating bitmap data array (size is unknown here)
unsigned char* data = new unsigned char[];
unsigned int width(0), height(0), bpp(0);
// Loading data here
GetData(sk_Filename, data, width, height, bpp);
//Using data here
cout << width << "x" << height << "x" << bpp << endl;
for (unsigned int i = 0; i < width * height * bpp/8; )
cout << "("
<< (unsigned int)data[i] << ", "
<< (unsigned int)data[i+1] << ", "
<< (unsigned int)data[i+2] << ", "
<< (unsigned int)data[i+3] << ")"
<< endl;
i += 4;
cout << endl;
delete[] data; // <-- Breaks with _ASSERTE(_CrtIsValidHeapPointer(pUserData));
// What's wrong here?
return 0;
Okay, first possible solution is to use std::vector.
It has nothing to do with the delete.
The thing is that Debug Crt Runtime can check the memory integrity only during calls to memory API like: malloc, free, realloc, new, delete.
You have a memory overrun that is detected by the Crt.
Obviously, new unsigned char[] does not allocate enough bytes for you.
Move the allocation into the GetData() proc and call it like:
unsigned char* data = GetData(sk_Filename, width, height, bpp);
Write a function that calculates the size of data based on the image.
Then allocate data with that size
// Calculates the size of the data buffer based on the image.
size_t GetDataSize(const wchar_t* szFilename);
It's easy to calculate the required size inside your GetData function, so allocate the array there and return it instead.
You would have
unsigned char* GetData(const wchar_t* szFilename,
unsigned int& width,
unsigned int& height,
unsigned int& bpp);
which contains
unsigned char* data = new unsigned char[height * scan_width];
// Do the conversion...
return data;
and main would say
unsigned char* data = GetData(sk_Filename, width, height, bpp);
I am trying to encode a bitmap image block and save it to memory (inside a vector). It all works fine until I try to do this in a multi-threaded fashion. I keep getting the following errors:
Error C2672 'std::invoke': no matching overloaded function found
Error C2893 Failed to specialize function template 'unknown-type std::invoke(_Callable &&,_Types &&...) noexcept()'
My code is a simple screen-shooting class called inside main(), this is where I try to do multi-threading:
bool Screenshot::threadfunc(Gdiplus::Bitmap* bmp, int i, int j, int x, int y, int bw, int bh, std::vector<std::vector< std::vector<BYTE> >> blocksBmpBytesMatrix, std::string dataFormat)
Gdiplus::Bitmap* tile = bmp->Clone(x, y, bw, bh, PixelFormat24bppRGB);
// write to IStream
IStream* istream = nullptr;
CreateStreamOnHGlobal(NULL, TRUE, &istream);
// define encoding
CLSID clsid;
CLSIDFromString(L"{557cf400-1a04-11d3-9a73-0000f81ef32e}", &clsid);
Gdiplus::Status status = tile->Save(istream, &clsid, NULL);
if (status != Gdiplus::Status::Ok)
std::wcout << "ERROR" << std::endl;
return false;
// get memory handle associated with istream
GetHGlobalFromStream(istream, &hg);
// copy IStream to buffer
int bufsize = GlobalSize(hg);
// lock & unlock memory
LPVOID pimage = GlobalLock(hg);
memcpy(&blocksBmpBytesMatrix[i][j][0], pimage, bufsize);
return true;
bool Screenshot::divideIntoBlocks(HWND chwnd, int screenshotId, RECT rcMonitors, int blockHeight, int blockWidth)
Gdiplus::Bitmap bmp(hbwindow, nullptr);
int nrows = height / blockHeight + 1 * int((height % blockHeight) != 0);
int ncols = width / blockWidth + 1 * int((width % blockWidth) != 0);
for (int i = 0; i < nrows; i++)
for (int j = 0; j < ncols; j++)
// compute block coordinates and dimensions
int x = j * blockWidth;
int y = i * blockHeight;
int bw = ((x + blockWidth) > width) * (width % blockWidth) + ((x + blockWidth) <= width) * blockWidth;
int bh = ((y + blockHeight) > height) * (height % blockHeight) + ((y + blockHeight) <= height) * blockHeight;
// append to vecs
blocksInfo.push_back({ i, j, x, y, bw, bh });
std::vector<std::thread*> pool(nrows * ncols);
for (auto& ij : blocksInfo)
std::cout << ij.size() << " : " << ij[0] << "," << ij[1] << "," << ij[2] << "," << ij[3] << "," << ij[4] << "," << ij[5] << std::endl;
std::thread t(&Screenshot::threadfunc, &bmp, ij[0], ij[1], ij[2], ij[3], ij[4], ij[5], this->blockPngBytesMatrix, "png");
for (auto& t : pool) { t->join(); }
return true;
Any idea what I am doing wrong here? And is it possible to do this encoding with multiple threads (and if so, how)?
Edit: This is Screenshot.h
// Holds a captured screenshot and splits it into rectangular blocks that are
// encoded into in-memory byte buffers (see divideIntoBlocks / threadfunc).
class Screenshot
// screenshot dimensions and coordinates
int width, height;
int screenx, screeny;
// per-block coordinates and dimensions: divideIntoBlocks pushes one
// {i, j, x, y, bw, bh} row per block
std::vector< std::vector<int>> blocksInfo;
// encoded bytes per block, indexed [i][j] by threadfunc
// NOTE(review): divideIntoBlocks refers to this->blockPngBytesMatrix (no 's') - confirm which spelling is intended
std::vector<std::vector< std::vector<BYTE> >> blocksPngBytesMatrix;
// bitmap handle of the captured window; divideIntoBlocks wraps it in a Gdiplus::Bitmap
HBITMAP hbwindow;
// handles for display contexts
HDC hwindowDC;
HDC hwindowCompatibleDC;
// constructor
Screenshot(RECT, int, int);
// member functions
BITMAPINFOHEADER createBitmapHeader(int, int);
bool capture();
bool divideIntoBlocks(HWND, int, RECT, int, int);
bool saveToMemory(Gdiplus::Bitmap*, std::vector<BYTE>&, std::string);
// NOTE(review): this is a non-static member function, so std::thread must be
// given the object pointer (this) as the first argument after
// &Screenshot::threadfunc. The byte matrix is taken by value, so whatever the
// worker writes into it lands in a copy - presumably a reference was intended.
bool threadfunc(Gdiplus::Bitmap*, int, int, int, int, int, int, std::vector<std::vector< std::vector<BYTE> >>, std::string);
// destructor
There's a lock inside GDI+ that prevents two threads from using the same Graphics object or the same Bitmap. Whatever thread gets to it first grabs the lock, the other one will die with an exception.
Refer: GDI+ objects and multithreading
And Thread Synchronization also pointed out:
Some GDI+ methods return ObjectBusy if a thread attempts to call a
method while another thread is executing a method on the same object.
Do not try to synchronize access to an object based on the ObjectBusy
return value.
Instead, each time you access a member or call a method of the object, place the call inside a critical section, or use some other standard synchronization technique.
My project needs to receive PNG file content over an HTTP request, do something to the image, and send the generated PNG back in the HTTP response. All code needs to be written in C/C++.
I'm new to libpng, so I tried to write a prototype: read a PNG file into an unsigned char buffer, get the RGB values out (ignoring alpha), do a no-op, create a new unsigned char buffer with the PNG file content, write the new file to disk, and validate that I "generate" the same image. I have referenced this question on StackOverflow.
My code (with some unrelated function removed, full runnable code is here):
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <png.h>
using namespace std;
typedef struct
png_bytep data;
int size;
} ImageTarget;
int read_png(string file_path, unsigned char** buffer)
// ... ...
void write_png(string file_path, unsigned char* buffer, int length)
// ... ...
static void pngReadCallback(png_structp png_ptr, png_bytep data, png_size_t length)
// ... ...
/// libpng write callback that appends each chunk to a growing heap buffer.
///
/// The io pointer registered with png_set_write_fn must point at an
/// ImageTarget whose data is nullptr (or a malloc'ed buffer) and whose size
/// is the number of bytes already stored.
///
/// @param png_ptr  Write struct whose io pointer is the ImageTarget.
/// @param data     Bytes libpng wants written.
/// @param length   Number of bytes in data.
void pngWriteCallback(png_structp png_ptr, png_bytep data, png_size_t length)
{
    cout << "- length ----------- " << length << endl;
    ImageTarget * itarget = (ImageTarget*)png_get_io_ptr(png_ptr);
    size_t nsize = itarget->size + length;
    cout << "- nsize ----------- " << nsize << endl;
    cout << "- data ------ " << (size_t) itarget->data << endl;

    // realloc(nullptr, n) behaves like malloc(n), so one call covers both the
    // first chunk and every later one. Without the lost 'else', the original
    // overwrote the realloc result with a fresh malloc on every call.
    png_bytep grown = (png_bytep)realloc(itarget->data, nsize);
    if (grown == nullptr)
    {
        // itarget->data still owns the old buffer; png_error does not return.
        png_error(png_ptr, "pngWriteCallback: out of memory");
    }
    itarget->data = grown;

    memcpy(itarget->data + itarget->size, data, length);
    itarget->size += length;
}
int main()
const string Input_PNG = "pic/c.png";
const string Output_PNG = "output/output.png";
unsigned char* buffer = nullptr;
int length = read_png(Input_PNG, &buffer);
png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
png_infop info_ptr = png_create_info_struct(png_ptr);
ImageSource imgsource; = buffer;
imgsource.size = length;
imgsource.offset = 0;
png_set_read_fn(png_ptr, &imgsource, pngReadCallback);
png_read_png(png_ptr, info_ptr, PNG_TRANSFORM_STRIP_ALPHA, 0);
int w = png_get_image_width( png_ptr, info_ptr );
int h = png_get_image_height( png_ptr, info_ptr );
cout << "Image width (from PNG file): " << w << endl;
cout << "Image height (from PNG file): " << h << endl;
png_bytep* row_pointers = png_get_rows( png_ptr, info_ptr );
png_bytep raw_rgb = (png_bytep)malloc(w * h * 3);
int i = 0;
for(int y=0; y<h; ++y ) {
for(int x=0; x<w*3; ) {
raw_rgb[i++] = row_pointers[y][x++]; // red
raw_rgb[i++] = row_pointers[y][x++]; // green
raw_rgb[i++] = row_pointers[y][x++]; // blue
// Do Something
png_destroy_read_struct( &png_ptr, &info_ptr, 0);
// ---------------------------------
png_structp wpng_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
png_infop winfo_ptr = png_create_info_struct(wpng_ptr);
png_set_IHDR(wpng_ptr, winfo_ptr, 720, 720, 8,
png_set_rows(wpng_ptr, winfo_ptr, &raw_rgb);
ImageTarget itarget; = nullptr;
itarget.size = 0;
png_set_write_fn(wpng_ptr, &itarget, pngWriteCallback, NULL);
png_write_png(wpng_ptr, winfo_ptr, PNG_TRANSFORM_IDENTITY, NULL);
cout << "Output file name: " << Output_PNG << endl;
write_png(Output_PNG,, length);
return 0;
The makefile I use: here.
After compile and run my code, I see the output (I have verified the c.png I used is a PNG file by Irfanview):
Image width (from PNG file): 720
Image height (from PNG file): 720
- length ----------- 8
- nsize ----------- 8
- data ------ 0
- length ----------- 8
- nsize ----------- 16
- data ------ 22161152
- length ----------- 13
- nsize ----------- 29
- data ------ 22161152
- length ----------- 4
- nsize ----------- 33
- data ------ 22161152
[1] 6675 segmentation fault (core dumped) ./png_from_buffer
Check the core file with gdb, here is the output:
#0 __memcpy_sse2_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S:35
#1 0x00007f3c41b3caf0 in png_write_row () from /lib/x86_64-linux-gnu/
#2 0x00007f3c41b3cd78 in png_write_image () from /lib/x86_64-linux-gnu/
#3 0x00007f3c41b3d61b in png_write_png () from /lib/x86_64-linux-gnu/
#4 0x000000000040269c in main () at main.cpp:181
I tried using a different buffer, and converting to a vector<unsigned char*> for the rows. No luck so far. Any ideas would be appreciated.
My environment, if matters:
Ubuntu 16.04
libpng 1.2
libpng12-0/xenial-updates,xenial-security,now 1.2.54-1ubuntu1.1 amd64 [installed]
libpng12-dev/xenial-updates,xenial-security,now 1.2.54-1ubuntu1.1 amd64 [installed,automatic]
g++ (Ubuntu 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609
I figured out a solution: using libpng v1.6 and the problem could be fixed easily as below:
// READING....
png_image image;
memset(&image, 0, (sizeof image));
image.version = PNG_IMAGE_VERSION;
if (png_image_begin_read_from_memory(&image, file_buffer, length) == 0)
return -1;
png_bytep buffer;
image.format = PNG_FORMAT_BGR;
size_t input_data_length = PNG_IMAGE_SIZE(image);
buffer = (png_bytep)malloc(input_data_length);
memset(buffer, 0, input_data_length);
if (png_image_finish_read(&image, NULL, buffer, 0, NULL) == 0)
return -1;
Writing to a memory buffer is also quite easy:
// WRITING......
png_image wimage;
memset(&wimage, 0, (sizeof wimage));
wimage.version = PNG_IMAGE_VERSION;
wimage.format = PNG_FORMAT_BGR;
wimage.height = 720;
wimage.width = 720;
// Get memory size
bool wresult = png_image_write_to_memory(&wimage, nullptr, &wlength, 0, buffer, 0, nullptr);
if (!wresult)
cout << "Error: " << image.message << endl;
// Real write to memory
unsigned char* wbuffer = (unsigned char*)malloc(wlength);
wresult = png_image_write_to_memory(&wimage, wbuffer, &wlength, 0, buffer, 0, nullptr);
write_png(Output_PNG, wbuffer, wlength);
libpng v1.6 provides a simple API. Note that the two png_image_write_to_memory calls you use will take about twice the time.
I found the better sample in github:
I hope that it helps.
I am using the libjpeg library to read and copy a JPEG into an editing program that I am writing in C++.
I have a display buffer, which is a vector of a datatype called ColorData.
ColorData consists of just 3 floats (RGB).
Here is my code that opens the JPEG files:
PixelBuffer * IOManager::load_jpg_to_pixel_buffer(const char *file_name){
struct jpeg_decompress_struct cinfo;
FILE * infile;
if ((infile = fopen(file_name, "rb")) == NULL) {
std::cout << "Could not open the jpg file: " << file_name << std::endl;
return nullptr;
struct jpeg_error_mgr jerr;
cinfo.err = jpeg_std_error(&jerr);
jpeg_stdio_src(&cinfo, infile);
jpeg_read_header(&cinfo, TRUE);
int width = static_cast<int>(cinfo.output_width);
int height = static_cast<int>(cinfo.output_height);
std::cout << typeid(cinfo.colormap).name() << std::endl;
std::cout << "Width: " << width << "Height: " << height << std::endl;
PixelBuffer * image_buffer = new PixelBuffer(width, height, ColorData());
std::cout << cinfo.output_components << std::endl;
buffer = (*cinfo.mem->alloc_sarray)
((j_common_ptr) &cinfo, JPOOL_IMAGE, cinfo.output_width * cinfo.output_components, 1);
/* Step 6: while (scan lines remain to be read) */
/* jpeg_read_scanlines(...); */
/* Here we use the library's state variable cinfo.output_scanline as the
* loop counter, so that we don't have to keep track ourselves.
while (cinfo.output_scanline < cinfo.output_height) {
/* jpeg_read_scanlines expects an array of pointers to scanlines.
* Here the array is only one element long, but you could ask for
* more than one scanline at a time if that's more convenient.
(void) jpeg_read_scanlines(&cinfo, buffer, 1);
/* Assume put_scanline_someplace wants a pointer and sample count. */
return nullptr;
How can I get the RGB value from the jpeg using the libjpeg?
The RGB values are in buffer. It's actually an array of arrays, so you have to index buffer[0].
Something like this:
while (cinfo.output_scanline < cinfo.output_height)
(void) jpeg_read_scanlines(&cinfo, buffer, 1);
// get the pointer to the row:
unsigned char* pixel_row = (unsigned char*)(buffer[0]);
// iterate over the pixels:
for(int i = 0; i < cinfo.output_width; i++)
// convert the RGB values to a float in the range 0 - 1
float red = (float)(*pixel_row++) / 255.0f;
float green = (float)(*pixel_row++) / 255.0f;
float blue = (float)(*pixel_row++) / 255.0f;
This is assuming cinfo.output_components is 3.
This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C-project where a binary raw file with 3d-data is loaded, processed in CUDA and saved in a new binary raw file.
This is based on the simpleTexture3D project from CUDA Samples:
This is my cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>
typedef unsigned int uint;
typedef unsigned char uchar;
const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 64, height = 64, depth=64;
//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);
uint *d_output = NULL;
int *pArgc = NULL;
char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);
void loadVolumeData(char *exec_path);
// render image using CUDA
void render()
// call CUDA kernel
render_kernel(gridSize, blockSize, d_output, width, height, depth);
getLastCudaError("render_kernel failed");
void cleanup()
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
// Load raw data from disk
/// Reads up to `size` bytes of a binary file into a freshly malloc'ed buffer.
///
/// @param filename  Path of the file to read.
/// @param size      Number of bytes to allocate and attempt to read.
/// @return Buffer of `size` bytes that the caller must free(), or 0 when the
///         file cannot be opened or the allocation fails. If the file is
///         shorter than `size`, the tail of the buffer is left uninitialised.
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");
    if (!fp)
    {
        fprintf(stderr, "Error opening file '%s'\n", filename);
        return 0;
    }

    uchar *data = (uchar *) malloc(size);
    if (!data)
    {
        fprintf(stderr, "Error allocating %lu bytes for '%s'\n", (unsigned long) size, filename);
        fclose(fp);
        return 0;
    }

    size_t read = fread(data, 1, size, fp);
    fclose(fp); // the original never closed the file

    // %lu expects unsigned long; size_t is a different type on some targets.
    printf("Read '%s', %lu bytes\n", filename, (unsigned long) read);
    return data;
}
// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
int returnState=0;
// cut file extension from filename
char *a=strdup(filename); //via strdup you dumb a const char to char, you must free it yourself
int len = strlen(a);
a[len-4] = '\0'; //deletes '.raw'
char b[50];
sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);
//char b[]="_out.raw"; //Add suffix out to filename
char buffer[256]; // <- danger, only storage for 256 characters.
strncpy(buffer, a, sizeof(buffer));
strncat(buffer, b, sizeof(buffer));
FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared
if (!fp)
fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
return 0;
size_t write = fwrite(data, 1, size, fp);
if (write==size)
printf("Wrote %lu bytes to '%s'\n", write, buffer);
return 0;
printf("Error writing data to file '%s'\n", buffer);
return 1;
// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
int result = 0;
result = findCudaDevice(argc, (const char **)argv);
return result;
void runAutoTest(char *exec_path, char *PathToFile)
// set path
char *path;
if (PathToFile == NULL)
path = sdkFindFilePath(volumeFilename, exec_path);
path = PathToFile;
if (path == NULL)
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
// Allocate output memory
checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
// zero out the output array with cudaMemset
cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
// render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, depth);
getLastCudaError("render_kernel failed");
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
int wState=writeRawFile(path,h_output,width*height*depth);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
//exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
void loadVolumeData(char *exec_path, char *PathToFile)
char *path;
// load volume data
if (PathToFile == NULL)
path = sdkFindFilePath(volumeFilename, exec_path);
path = PathToFile;
if (path == NULL)
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
uchar *h_volume = loadRawFile(path, size);
//int wState=writeRawFile(path,h_volume,size);
initCuda(h_volume, volumeSize);
// Program main
main(int argc, char **argv)
pArgc = &argc;
pArgv = argv;
char *image_file = NULL;
printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) //Note cmd line argument is -file "PathToFile/File.raw"
{ // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"
getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
if (image_file)
chooseCudaDevice(argc, argv);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
chooseCudaDevice(argc, argv);
printf("I am finished...\n"
"Can I get some ice cream please\n");
And this is my .cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
#include <helper_math.h>
typedef unsigned int uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex; // 3D texture
cudaArray *d_volumeArray = 0;
__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
// float u = x / (float) imageW;
// float v = y / (float) imageH;
//float w = z / (float) imageD;
// // read from 3D texture
// float voxel = tex3D(tex, u, v, w);
uint ps=__umul24(imageW,imageH);
if ((x < imageW) && (y < imageH) && (z < imageD))
// write output color
uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
d_output[1] = (uchar) 255;//+0*voxel*255;
extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
// create 3D array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
// set texture parameters
tex.normalized = true; // access with normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeBorder; // wrap texture coordinates
tex.addressMode[1] = cudaAddressModeBorder;
tex.addressMode[2] = cudaAddressModeBorder;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
As you can see, currently I set all values to zero except index 1, which is set to 255. Yet when I open the image stack in Fiji, I see that the fourth pixel on the first slice is white. If I use index = i instead, I get white vertical lines across the image stack, periodically every four columns. Generally speaking, it seems that only every fourth element is being indexed in the CudaArray. So I am wondering if there is some kind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. That would obviously give the factor of 4 :)
I am eager to hear from you experts.
Cheers Mika
I figured it out by myself. The kernel works with uint* d_output, while the copy to the host is written into a uchar* h_output:
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
This led to this strange behavior
The instructions for libjpeg-turbo here describes the TurboJPEG API thus: "This API wraps libjpeg-turbo and provides an easy-to-use interface for compressing and decompressing JPEG images in memory". Great, but are there some solid examples of using this API available? Just looking to decompress a fairly vanilla jpeg in memory.
I've found a few bits such as, which appears to be using the TurboJPEG API, but are there any more solid/varied examples?
The source for libjpeg-turbo is well documented, so that does help.
Ok, I know that you did already solve your problem, but as some people, just like me, could be searching some simple example I will share what I created.
It is an example, compressing and decompressing an RGB image. Otherwise I think that the API documentation of TurboJPEG is quite easy to understand!
#include <turbojpeg.h>
const int JPEG_QUALITY = 75;
const int COLOR_COMPONENTS = 3;
// const so that the array bound below is a constant expression
// (a variable-length array is not standard C++)
const int _width = 1920;
const int _height = 1080;
long unsigned int _jpegSize = 0;
unsigned char* _compressedImage = NULL; //!< Memory is allocated by tjCompress2 if _jpegSize == 0
unsigned char buffer[_width*_height*COLOR_COMPONENTS]; //!< Contains the uncompressed image

tjhandle _jpegCompressor = tjInitCompress();

tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
          &_compressedImage, &_jpegSize, TJSAMP_444, JPEG_QUALITY,
          TJFLAG_FASTDCT);

tjDestroy(_jpegCompressor);

//to free the memory allocated by TurboJPEG (either by tjAlloc(),
//or by the Compress/Decompress) after you are done working on it:
tjFree(_compressedImage);
After that you have the compressed image in _compressedImage.
To decompress you have to do the following:
#include <turbojpeg.h>
long unsigned int _jpegSize; //!< _jpegSize from above
unsigned char* _compressedImage; //!< _compressedImage from above
int jpegSubsamp, width, height;
unsigned char buffer[width*height*COLOR_COMPONENTS]; //!< will contain the decompressed image
tjhandle _jpegDecompressor = tjInitDecompress();
tjDecompressHeader2(_jpegDecompressor, _compressedImage, _jpegSize, &width, &height, &jpegSubsamp);
tjDecompress2(_jpegDecompressor, _compressedImage, _jpegSize, buffer, width, 0/*pitch*/, height, TJPF_RGB, TJFLAG_FASTDCT);
Some random thoughts:
I just came back over this as I am writing my bachelor thesis, and I noticed that if you run the compression in a loop it is preferable to store the biggest size of the JPEG buffer to not have to allocate a new one every turn. Basically, instead of doing:
long unsigned int _jpegSize = 0;
tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
          &_compressedImage, &_jpegSize, TJSAMP_444, JPEG_QUALITY,
          TJFLAG_FASTDCT);
we would add an object variable, holding the size of the allocated memory long unsigned int _jpegBufferSize = 0; and before every compression round we would set the jpegSize back to that value:
long unsigned int jpegSize = _jpegBufferSize;
tjCompress2(_jpegCompressor, buffer, _width, 0, _height, TJPF_RGB,
&_compressedImage, &jpegSize, TJSAMP_444, JPEG_QUALITY,
_jpegBufferSize = _jpegBufferSize >= jpegSize? _jpegBufferSize : jpegSize;
after the compression one would compare the memory size with the actual jpegSize and set it to the jpegSize if it is higher than the previous memory size.
I ended up using the code below as a working example for both JPEG encoding and decoding. It is the best example that I could find: it is self-contained, initializes a dummy image, and outputs the encoded image to a local file.
The code below is NOT my own; credit goes to its original author. I am posting it here again to help anyone who finds it difficult to get libjpeg-turbo working.
#include "turbojpeg.h"
#include <iostream>
#include <string.h>
#include <errno.h>
using namespace std;
int main(void)
unsigned char *srcBuf; //passed in as a param containing pixel data in RGB pixel interleaved format
tjhandle handle = tjInitCompress();
if(handle == NULL)
const char *err = (const char *) tjGetErrorStr();
cerr << "TJ Error: " << err << " UNABLE TO INIT TJ Compressor Object\n";
return -1;
int jpegQual =92;
int width = 128;
int height = 128;
int nbands = 3;
int flags = 0;
unsigned char* jpegBuf = NULL;
int pitch = width * nbands;
int pixelFormat = TJPF_GRAY;
int jpegSubsamp = TJSAMP_GRAY;
if(nbands == 3)
pixelFormat = TJPF_RGB;
jpegSubsamp = TJSAMP_411;
unsigned long jpegSize = 0;
srcBuf = new unsigned char[width * height * nbands];
for(int j = 0; j < height; j++)
for(int i = 0; i < width; i++)
srcBuf[(j * width + i) * nbands + 0] = (i) % 256;
srcBuf[(j * width + i) * nbands + 1] = (j) % 256;
srcBuf[(j * width + i) * nbands + 2] = (j + i) % 256;
int tj_stat = tjCompress2( handle, srcBuf, width, pitch, height,
pixelFormat, &(jpegBuf), &jpegSize, jpegSubsamp, jpegQual, flags);
if(tj_stat != 0)
const char *err = (const char *) tjGetErrorStr();
cerr << "TurboJPEG Error: " << err << " UNABLE TO COMPRESS JPEG IMAGE\n";
handle = NULL;
return -1;
FILE *file = fopen("out.jpg", "wb");
if (!file) {
cerr << "Could not open JPEG file: " << strerror(errno);
return -1;
if (fwrite(jpegBuf, jpegSize, 1, file) < 1) {
cerr << "Could not write JPEG file: " << strerror(errno);
return -1;
//write out the compress date to the image file
int tjstat = tjDestroy(handle); //should deallocate data buffer
handle = 0;
In the end I used a combination of random code found on the internet and the .c and header files for libjpeg-turbo, which are well documented.
The official API documentation is a good source of information as well.
Here's a fragment of the code that I use to load JPEGs from memory. It may need a bit of fixing, because I extracted it from different files in my project. It will load both grayscale and RGB images (bpp will be set to either 1 or 3).
/// Decoded image as filled in by Image_LoadJpeg.
struct Image
{
    int bpp;             // components per pixel: 1 (grayscale) or 3 (RGB)
    int width;           // width in pixels
    int height;          // height in pixels
    unsigned char* data; // width * height * bpp samples, allocated with new[]
};
struct jerror_mgr
jpeg_error_mgr base;
jmp_buf jmp;
/// error_exit hook: jumps back to the setjmp() point stored in the
/// jerror_mgr that owns this error manager.
METHODDEF(void) jerror_exit(j_common_ptr jinfo)
{
    jerror_mgr* err = (jerror_mgr*)jinfo->err;
    longjmp(err->jmp, 1);
}
/// output_message hook: intentionally empty, so the library's messages are discarded.
METHODDEF(void) joutput_message(j_common_ptr)
{
}
/// Decodes a JPEG held in memory into a tightly packed 8-bit image.
///
/// @param image     Receives bpp (1 or 3), width, height and a new[]-allocated
///                  data buffer that the caller must delete[]. On failure,
///                  data is NULL.
/// @param img_data  Compressed JPEG bytes.
/// @param img_size  Number of bytes in img_data.
/// @return true on success, false on any decoding or allocation error.
bool Image_LoadJpeg(Image* image, unsigned char* img_data, unsigned int img_size)
{
    jpeg_decompress_struct jinfo;
    jerror_mgr jerr;

    jinfo.err = jpeg_std_error(&jerr.base);
    jerr.base.error_exit = jerror_exit;
    jerr.base.output_message = joutput_message;
    // The decompressor must be created before any other libjpeg call on it.
    jpeg_create_decompress(&jinfo);

    image->data = NULL;

    // jerror_exit longjmps here on a fatal libjpeg error.
    if (setjmp(jerr.jmp)) goto bail;

    jpeg_mem_src(&jinfo, img_data, img_size);

    if (jpeg_read_header(&jinfo, TRUE) != JPEG_HEADER_OK) goto bail;

    jinfo.dct_method = JDCT_FLOAT; // change this to JDCT_ISLOW on Android/iOS

    if (!jpeg_start_decompress(&jinfo)) goto bail;

    if (jinfo.num_components != 1 && jinfo.num_components != 3) goto bail;

    image->data = new (std::nothrow) unsigned char [(size_t)jinfo.output_width * jinfo.output_height * jinfo.output_components];
    if (!image->data) goto bail;

    {
        // Own scope: 'goto bail' must not jump past an initialised variable.
        JSAMPROW ptr = image->data;
        while (jinfo.output_scanline < jinfo.output_height)
        {
            if (jpeg_read_scanlines(&jinfo, &ptr, 1) != 1) goto bail;

            ptr += jinfo.output_width * jinfo.output_components;
        }
    }

    if (!jpeg_finish_decompress(&jinfo)) goto bail;

    // Copy the results out before the decompressor is torn down.
    image->bpp = jinfo.output_components;
    image->width = jinfo.output_width;
    image->height = jinfo.output_height;

    jpeg_destroy_decompress(&jinfo);

    return true;

bail:
    jpeg_destroy_decompress(&jinfo);
    // The original deleted an undeclared 'data' and left the pointer dangling.
    if (image->data)
    {
        delete [] image->data;
        image->data = NULL;
    }

    return false;
}