Bad data coming from cudaMemcpy2D - c++

If this sort of question has been asked I apologize, link me to the thread please!
Anyhow I am new to CUDA (I'm coming from OpenCL) and wanted to try generating an image with it. The relevant CUDA code is:
// Fills a pitched RGBA8 image with opaque green, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; threads that land outside the image return without writing.
//
//   pixels  base of the pitched device allocation (4 bytes per pixel)
//   pitch   distance between consecutive rows, in bytes
//   width   image width in pixels
//   height  image height in pixels
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // Each axis uses its own block extent (the y axis previously reused
    // blockDim.x, which is only right for square blocks). The cast widens
    // the product before it can wrap in 32-bit arithmetic.
    const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;
    // Rows are `pitch` bytes apart, so step to the row in bytes first.
    uint8_t *row = (uint8_t *)((char *)pixels + y * pitch);
    uint8_t *px = row + x * 4;
    px[0] = 0;
    px[1] = 255;
    px[2] = 0;
    px[3] = 255;
}
// Status of the most recent call checked through CUDA_ERR.
cudaError_t err = cudaSuccess;
// Evaluates a CUDA runtime call once, stores its status in `err`, and on
// failure prints the error string together with the call site, then exits.
// The do/while(0) wrapper makes the macro a single statement, so it is safe
// inside an unbraced if/else. The message no longer claims that every
// failure is a device-vector allocation failure.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
// Host driver: allocates a pitched 1000x1000 RGBA image on the device, launches
// the fill kernel, copies the image back to the host and writes it to out.png.
int main(void) {
ulong2 dims = {1000, 1000};
// NOTE(review): 500 x 500 requests 250000 threads per block, which is above the
// 1024 threads-per-block limit, so the launch below is rejected.
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
// Ceil-divide the image extent by the block extent: one extra block per axis
// whenever the division leaves a remainder.
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
// Row width is passed in bytes (4 bytes per pixel); the runtime returns the padded row stride in `pitch`.
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
// NOTE(review): the launch status is never queried here, so a rejected launch goes unnoticed.
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
// Tightly packed host copy of the image, zero-filled before the transfer.
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
// NOTE(review): the 5th argument of cudaMemcpy2D is the row width in bytes; dims.x is the
// width in pixels, so only the first quarter of every row is transferred.
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
save_png("out.png", h_pixels, dims.x, dims.y);
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
The save_png function is a usual utility function I created for taking a block of data and saving it to a png:
void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr) {
std::cerr << "Failed to create png write struct" << std::endl;
return;
}
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) {
std::cerr << "Failed to create info_ptr" << std::endl;
png_destroy_write_struct(&png_ptr, NULL);
return;
}
FILE *fp = fopen(filename, "wb");
if (!fp) {
std::cerr << "Failed to open " << filename << " for writing" << std::endl;
png_destroy_write_struct(&png_ptr, &info_ptr);
return;
}
if (setjmp(png_jmpbuf(png_ptr))) {
png_destroy_write_struct(&png_ptr, &info_ptr);
std::cerr << "Error from libpng!" << std::endl;
return;
}
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
png_byte *row_pnts[height];
size_t i;
for (i = 0; i < height; i++) {
row_pnts[i] = buffer + width * 4 * i;
}
png_write_image(png_ptr, row_pnts);
png_write_end(png_ptr, info_ptr);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
Anyways the image that's generated is a weird whiteish strip that's speckled with random colored pixels which can be seen here.
Is there something glaring I did wrong? I tried to follow the introduction documentation on the CUDA site. Otherwise can anyone help me out to fix this? Here I'm simply trying to fill the pixels buffer with green pixels.
I am using a MBP retina with an NVIDIA GeForce GT 650M discrete graphics card. I can run and paste the output to print_devices from the cuda sample code if need be.
EDIT: Note no errors or warnings during compilation with the following makefile:
all:
nvcc -c mandlebrot.cu -o mandlebrot.cu.o
nvcc mandlebrot.cu.o -o mandlebrot -lpng
and no errors at runtime.

It's better if you provide complete code that someone can copy, paste, compile, and run without adding or changing anything. Stripping off the include headers isn't helpful, in my opinion, and making your test code dependent on a png library that others may not have is also not productive, if you want help.
Your error checking on kernel launches is broken. You may want to review proper cuda error checking. If you had proper error checking, or ran your code with cuda-memcheck, you would discover an error 9 on the kernel launch. This is an invalid configuration. If you print out your blocks and threads_per_block variables, you'll see something like this:
blocks: 2, 2
threads: 500, 500
You are in fact setting threads per block to 500,500 here:
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
That is illegal, as you are requesting 500x500 threads per block (i.e. 250000 threads) which exceeds the maximum limit of 1024 threads per block.
So your kernel is not running at all and you're getting garbage.
You can fix this error pretty simply by changing your block_size definition:
unsigned long block_size = 16;
After that there is still an issue, as you've misinterpreted the parameters for cudaMemcpy2D:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
The documentation states for the 5th parameter:
width - Width of matrix transfer (columns in bytes)
but you've passed the width in elements (groups of 4 bytes) rather than bytes.
This will fix that:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
With the above changes, I was able to get good results with a test version of your code:
#include <stdio.h>
#include <stdint.h>
// Paints every pixel of a pitched RGBA8 image opaque green.
//
// Expects a 2D launch (2D blocks in a 2D grid) that covers the whole image;
// surplus threads past the right or bottom edge do nothing.
//
//   pixels  start of the pitched device buffer, 4 bytes per pixel
//   pitch   row stride in bytes
//   width   image width in pixels
//   height  image height in pixels
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // The row index is derived from blockDim.y rather than blockDim.x so the
    // kernel is also correct for non-square blocks; widening before the
    // multiply avoids 32-bit wrap-around on large grids.
    const unsigned long col = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long line = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;
    if (col < width && line < height) {
        // Byte-wise row step (pitch), then 4 bytes per pixel within the row.
        uint8_t *rgba = (uint8_t *)((char *)pixels + line * pitch) + col * 4;
        rgba[0] = 0;
        rgba[1] = 255;
        rgba[2] = 0;
        rgba[3] = 255;
    }
}
// Holds the status returned by the last call wrapped in CUDA_ERR.
cudaError_t err = cudaSuccess;
// Runs one CUDA runtime call, records its status in `err`, and aborts the
// program with the error string and source location if it failed.
// Written as do/while(0) so `CUDA_ERR(x);` is exactly one statement, with the
// argument parenthesised. The text reports a generic CUDA failure instead of
// always blaming a device allocation.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
// Self-checking driver: fills a pitched 1000x1000 RGBA image on the device,
// copies it back tightly packed and verifies that every pixel is (0,255,0,255).
// Returns 0 and prints "Success" when all pixels match, 1 otherwise.
int main(void) {
    ulong2 dims = {1000, 1000};
    dim3 threads_per_block(16, 16);
    // Ceil-divide so a partial tile at the right/bottom edge still gets a block.
    dim3 blocks((dims.x + threads_per_block.x - 1) / threads_per_block.x,
                (dims.y + threads_per_block.y - 1) / threads_per_block.y);
    size_t pitch;
    uint8_t *pixels, *h_pixels = NULL;
    const size_t row_bytes = dims.x * 4 * sizeof(uint8_t);
    CUDA_ERR(cudaMallocPitch(&pixels, &pitch, row_bytes, dims.y));
    printf("blocks: %u, %u\n", blocks.x, blocks.y);
    printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
    mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
    // A kernel launch returns nothing: query the launch status explicitly,
    // then synchronize to surface errors raised while the kernel ran.
    CUDA_ERR(cudaGetLastError());
    CUDA_ERR(cudaDeviceSynchronize());
    h_pixels = (uint8_t *)malloc(row_bytes * dims.y);
    if (h_pixels == NULL) {
        fprintf(stderr, "host allocation failed\n");
        cudaFree(pixels);
        return 1;
    }
    memset(h_pixels, 0, row_bytes * dims.y);
    // Both the destination pitch and the copy width are in bytes.
    CUDA_ERR(cudaMemcpy2D(h_pixels, row_bytes, pixels, pitch, row_bytes, dims.y, cudaMemcpyDeviceToHost));
    // save_png("out.png", h_pixels, dims.x, dims.y);
    static const uint8_t expected[4] = {0, 255, 0, 255};
    int status = 0;
    for (unsigned long row = 0; row < dims.y && status == 0; row++)
        for (unsigned long col = 0; col < dims.x && status == 0; col++)
            for (int ch = 0; ch < 4; ch++) {
                // The reported value is read from the same index that was
                // tested (the old messages dropped the "* 4" on the row term).
                const uint8_t got = h_pixels[row * row_bytes + col * 4 + ch];
                if (got != expected[ch]) {
                    printf("mismatch %d at %lu,%lu: was: %u should be: %u\n", ch, row, col, (unsigned)got, (unsigned)expected[ch]);
                    status = 1;
                    break;
                }
            }
    // Release resources on the mismatch path too.
    CUDA_ERR(cudaFree(pixels));
    free(h_pixels);
    CUDA_ERR(cudaDeviceReset());
    if (status != 0)
        return 1;
    puts("Success");
    return 0;
}
Note the above code is a complete code you can copy, paste, compile and run.

Related

Crash on cudaMalloc when allocating 2D array

I am trying to build arrays of histograms of unsigned char corresponding to each pixel in an image for the gPb algorithm implementation. I have a crash on a cudaMalloc call which I cannot solve. I have looked through other similar questions and I tested always if the previous operations returned cudaSuccess or not. Here is my code:
First I allocate this structure in constructor of my class CudaImage:
// Allocates the device table holding one histogram-row pointer per padded
// image row (m_Height + 2 * m_Scale entries) and clears it to zero.
// Returns true only if both CUDA calls succeed; the status of the last call
// made is left in m_LastCudaError.
bool CudaImage::create2DHistoArray()
{
    const size_t tableBytes = (m_Height + 2 * m_Scale) * sizeof(unsigned int*);

    // Reserve the pointer table itself.
    m_LastCudaError = cudaMalloc((void**)&m_dHistograms, tableBytes);
    if (m_LastCudaError == cudaSuccess) {
        // Every entry starts out as a null pointer.
        m_LastCudaError = cudaMemset(m_dHistograms, 0, tableBytes);
    }
    return m_LastCudaError == cudaSuccess;
}
then at some point I would call a member function to allocate some of m_dHistograms[i] as follows:
// Allocates and zeroes one histogram row for every index in [start, stop).
// Returns false as soon as a CUDA call fails (status left in m_LastCudaError),
// true otherwise.
bool CudaImage::initializeHistoRange(int start, int stop)
{
for (int i = start; i < stop; ++i) {
// NOTE(review): m_dHistograms was obtained from cudaMalloc, so &m_dHistograms[i] is a
// device address; cudaMalloc stores the new pointer through it from host code.
m_LastCudaError = cudaMalloc((void**)&m_dHistograms[i], 256 * 2 * m_ArcNo * (m_Width + 2 * m_Scale) * sizeof(unsigned int));
if (m_LastCudaError != cudaSuccess) {
return false;
}
//set all pixels in the gradient images to 0
// NOTE(review): m_dHistograms[i] reads device memory on the host to obtain the row pointer.
m_LastCudaError = cudaMemset(m_dHistograms[i], 0, 256 * 2 * m_ArcNo * (m_Width + 2 * m_Scale) * sizeof(unsigned int));
if (m_LastCudaError != cudaSuccess)
return false;
}
return true;
}
The first cudaMalloc in this last function crashes without a single warning. When running with cuda-memcheck I get the following message:
"The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under a host debugger to catch such errors."
Can anyone help ? Another question would be if the array allocation was correctly implemented. I do not want to allocate all memory from the beginning because it will be too much so I allocate in constructor (first function) only the pointers to the rows of the array and then in the application I allocate memory when I need it and free what I do not need.
You are getting segfaults because it is illegal to read or modify the value of m_dHistograms[i] in host code, given it is allocated in device memory. What you need to do is something like this:
bool CudaImage::initializeHistoRange(int start, int stop)
{
for (int i = start; i < stop; ++i) {
// Allocated memory
unsigned int* p;
m_LastCudaError = cudaMalloc((void**)&p, 256 * 2 * m_ArcNo * (m_Width + 2 * m_Scale) * sizeof(unsigned int));
if (m_LastCudaError != cudaSuccess) {
return false;
}
//set all pixels in the gradient images to 0
m_LastCudaError = cudaMemset(p, 0, 256 * 2 * m_ArcNo * (m_Width + 2 * m_Scale) * sizeof(unsigned int));
if (m_LastCudaError != cudaSuccess)
return false;
}
// Transfer address of allocation to device
m_LastCudaError = cudaMemcpy(m_dHistograms + i, &p, sizeof(unsigned int *), cudaMemcpyHostToDevice);
if (m_LastCudaError != cudaSuccess)
return false;
}
return true;
}
[disclaimer: never compiled or run, use at your risk]
Here the allocation address is stored in a host variable which is finally copied to the device array after the allocation and memset operations are done. This incurs the penalty of an additional host to device memory transfer per allocation.
The solution that I found is with the help of this stackoverflow answer. The code is as follows:
// Allocates one device histogram row per index in [start, stop), keeping the
// row pointers in the host mirror m_hHistograms, and uploads the first `stop`
// entries of that mirror into the device table m_dHistograms after every
// allocation.
// Returns false on the first failing CUDA call (status in m_LastCudaError).
bool CudaImage::initializeHistoRange(int start, int stop)
{
    for (int i = start; i < stop; ++i) {
        m_LastCudaError = cudaMalloc((void**)&m_hHistograms[i], 256 * 2 * m_ArcNo * (m_Width + 2 * m_Scale) * sizeof(unsigned int));
        if (m_LastCudaError != cudaSuccess) {
            return false;
        }
        // Capture the copy's own status: the result used to be discarded, so
        // the check below re-tested the cudaMalloc status and could never
        // report a failed upload.
        m_LastCudaError = cudaMemcpy(m_dHistograms, m_hHistograms, stop * sizeof(unsigned int*), cudaMemcpyHostToDevice);
        if (m_LastCudaError != cudaSuccess)
            return false;
    }
    return true;
}
bool CudaImage::create2DHistoArray()
{
m_LastCudaError = cudaMalloc((void**)&m_dHistograms, (m_Height + 2 * m_Scale) * sizeof(unsigned int*));
if (m_LastCudaError != cudaSuccess)
return false;
m_hHistograms = (unsigned int**)malloc((m_Height + 2 * m_Scale) * sizeof(unsigned int*));
return true;
}
That is I am using an additional member in the host member which helps me to create the memory in the device. The code for freeing memory during the algorithm operation is :
// Slides the window of resident histogram rows forward by one step around
// `index`: first allocates the row that enters the window (if it is still
// inside the padded image), then frees the row that left it.
void CudaImage::deleteFromHistoMaps(int index) {
    const auto entering = index + m_Scale + 1;
    const bool enteringInRange = entering < m_Height + 2 * m_Scale;
    //I need some more device memory
    if (enteringInRange) {
        initializeHistoRange(entering, entering + 1);
    }

    //device memory is not needed anymore - free it
    if (index < m_Scale + 1) {
        return;
    }
    const auto leaving = index - m_Scale - 1;
    cudaFree(m_hHistograms[leaving]);
    m_hHistograms[leaving] = nullptr;
}

cudaMemcpy - seems to not work properly

I am trying to copy data from host to device in my GPU greyscale filter program. However, there is some kind of problem because when I try to do so, nothing happens. Probably I have some mistakes in my code but compiler doesn't show any errors. I need to copy variables d_bufferRGB into GPU, process it and return it in d_new_bufferRGB in order to save it with function save_bmp();
EDIT 1: implemented CUDA error checking in main()
It says there is invalid argument in this line cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice)
HERE is the code >>>
#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <iostream>
// Wraps a CUDA runtime call and forwards its status, together with the call
// site, to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA call on stderr and, unless `abort` is false, ends the
// process using the error code as the exit status. Does nothing on success.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Image dimensions in pixels; main passes their addresses to load_bmp, which fills them from the BMP info header.
int width, heigth;
// Byte count taken from the BMP file header in load_bmp (bfSize - bfOffBits).
long size;
// Device copy of `size`; main allocates it and the kernel reads it as its bound.
long *d_size;
// Host pixel buffers: the loaded image and the buffer main copies the result into.
RGBTRIPLE *bufferRGB, *new_bufferRGB;
// Device pixel buffers: kernel input and kernel output.
RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;
// Loads an uncompressed 24-bit BMP into a newly malloc'ed RGBTRIPLE array.
//
//   bufferRGB  out: receives the pixel array (row-major, rows in file order,
//              row padding removed), or NULL on failure
//   width      out: image width in pixels (0 on failure)
//   heigth     out: image height in pixels (0 on failure)
//   file_name  path of the BMP file
//
// Also sets the file-scope `size` to bfSize - bfOffBits, the byte count of the
// stored pixel data including row padding (0 on failure).
// Failures (unreadable file, unsupported layout, out of memory) are reported
// on stderr instead of crashing on a null FILE pointer.
void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    // Define every output up front so all failure paths leave a known state.
    *bufferRGB = NULL;
    *width = 0;
    *heigth = 0;
    size = 0;

    FILE *file = fopen(file_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "load_bmp: cannot open %s\n", file_name);
        return;
    }
    if (fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) != 1 ||
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) != 1)
    {
        fprintf(stderr, "load_bmp: %s is too short to be a BMP file\n", file_name);
        fclose(file);
        return;
    }
    // The pixel loop below reads exactly three bytes per pixel and uses the
    // height as a positive row count.
    if (bmp_info_header.biBitCount != 24 || bmp_info_header.biWidth <= 0 || bmp_info_header.biHeight <= 0)
    {
        fprintf(stderr, "load_bmp: %s is not a supported 24-bit BMP\n", file_name);
        fclose(file);
        return;
    }

    *width = bmp_info_header.biWidth;
    *heigth = bmp_info_header.biHeight;
    size = (bmp_file_header.bfSize - bmp_file_header.bfOffBits);
    std::cout << "velkost nacitanych pixelov je " << size << '\n';

    // 4 bytes per pixel are reserved although only 3 are stored: main copies
    // `size` bytes (padding included) out of this buffer, and the slack keeps
    // that read inside the allocation.
    *bufferRGB = (RGBTRIPLE *)malloc((size_t)*width * (size_t)*heigth * 4);
    if (*bufferRGB == NULL)
    {
        fprintf(stderr, "load_bmp: out of memory\n");
        *width = 0;
        *heigth = 0;
        size = 0;
        fclose(file);
        return;
    }

    // Jump to the start of the pixel data (same position the relative seek
    // from the end of the two headers reached).
    fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET);

    // Each stored row is padded to a multiple of four bytes.
    const int padding = (4 - (3 * *width) % 4) % 4;
    for (int y = 0; y < *heigth; y++)
    {
        for (int x = 0; x < *width; x++)
        {
            RGBTRIPLE &px = (*bufferRGB)[(y * *width + x)];
            px.rgbtBlue = fgetc(file);
            px.rgbtGreen = fgetc(file);
            px.rgbtRed = fgetc(file);
        }
        for (int skipped = 0; skipped < padding; skipped++)
            fgetc(file);
    }
    fclose(file);
}
// Writes `bufferRGB` as a 24-bit BMP named `new_name`, reusing the two header
// structures of the existing file `old_name`.
// Image dimensions come from the file-scope `width` and `heigth`; every row is
// padded with zero bytes to a multiple of four bytes.
// Failures are reported on stderr and abandon the write.
void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    // Borrow the headers from the source image.
    FILE *file = fopen(old_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot open %s\n", old_name);
        return;
    }
    const bool headers_ok =
        fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) == 1 &&
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) == 1;
    fclose(file);
    if (!headers_ok)
    {
        fprintf(stderr, "save_bmp: cannot read BMP headers from %s\n", old_name);
        return;
    }

    const int alligment_x = (4 - (3 * width) % 4) % 4;
    const size_t row_bytes = (size_t)width * 3 + alligment_x;
    const size_t total_bytes = row_bytes * (size_t)heigth;
    unsigned char *to_save = (unsigned char *)malloc(total_bytes ? total_bytes : 1);
    if (to_save == NULL)
    {
        fprintf(stderr, "save_bmp: out of memory\n");
        return;
    }

    // Pack the pixels as B,G,R followed by the per-row padding.
    size_t index = 0;
    for (int y = 0; y < heigth; y++)
    {
        for (int x = 0; x < width; x++)
        {
            const RGBTRIPLE &px = bufferRGB[(y * width + x)];
            to_save[index++] = px.rgbtBlue;
            to_save[index++] = px.rgbtGreen;
            to_save[index++] = px.rgbtRed;
        }
        for (int x = 0; x < alligment_x; x++)
            to_save[index++] = 0;
    }
    // Report the real payload size; sizeof(&to_save) only gave the size of a
    // pointer.
    std::cout << "velkost na ulozenie je " << total_bytes << '\n';

    file = fopen(new_name, "wb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot create %s\n", new_name);
        free(to_save);
        return;
    }
    fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file);
    fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file);
    // Pixel data starts at bfOffBits (same position the relative seek from the
    // end of the headers reached).
    fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET);
    if (total_bytes != 0 && fwrite(to_save, total_bytes, 1, file) != 1)
        fprintf(stderr, "save_bmp: short write to %s\n", new_name);
    fclose(file);
    free(to_save);
}
// Converts an image to grayscale: each thread averages the three channels of
// one pixel and writes that value to all three channels of the output pixel.
//
// Launch layout: 1D grid of 1D blocks; surplus threads are masked by the
// bound check, so the grid may be larger than the image.
//
//   d_bufferRGB      input pixels (device memory)
//   d_new_bufferRGB  output pixels (device memory)
//   d_size           device pointer to the size of the pixel data in BYTES
__global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    // *d_size is a byte count while idx counts pixels, so convert before
    // comparing; testing idx against the byte count let threads run about
    // three times past the last pixel.
    const long pixel_count = *d_size / (long)sizeof(RGBTRIPLE);
    if (idx < pixel_count)
    {
        const RGBTRIPLE src = d_bufferRGB[idx];
        const BYTE grayscale = (BYTE)((src.rgbtRed + src.rgbtGreen + src.rgbtBlue) / 3);
        d_new_bufferRGB[idx].rgbtRed = grayscale;
        d_new_bufferRGB[idx].rgbtGreen = grayscale;
        d_new_bufferRGB[idx].rgbtBlue = grayscale;
    }
}
// Driver of the grayscale filter: allocate device buffers, load test.bmp, upload it,
// run the kernel, download the result and save it as filter_grayscale_GPU.bmp.
int main()
{
// NOTE(review): width and heigth are file-scope ints that load_bmp has not filled in yet,
// so these two allocations are sized from their initial (zero) values.
gpuErrchk(cudaMalloc(&d_new_bufferRGB, width*heigth * 4));
gpuErrchk(cudaMalloc(&d_bufferRGB, width*heigth * 4));
gpuErrchk(cudaMalloc(&d_size, sizeof(size)));
load_bmp(&bufferRGB, &width, &heigth, "test.bmp"); // the *bufferRGB pointer is created and filled here
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(size), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice));
CUDA_filter_grayscale << <32, 512 >> > (d_bufferRGB, d_new_bufferRGB, d_size); //size of kernel dont bother me for now
// NOTE(review): new_bufferRGB is never allocated in this listing, so the copy has no valid destination.
gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, size, cudaMemcpyDeviceToHost));
save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");
}
It's been killing my brain for several days, please help me with this.
So, with significant help from #Robert Crovella, I have finished my code. I also added an extra feature, a dynamically computed kernel launch size, as a free gift for internet users. The code is fully functional for Microsoft BMP version 3 files (one can create such a file in Paint). I tried to upload a sample image, but uploads are limited to 2 MB, which is not enough for a true-color image. When compiling, a null-pointer error is reported, but the program is still built and stored in the project's Debug folder. When you run it with an image in that folder, it works without problems.
The problem with code above are >
1, uninitialised new_bufferRGB
2, the load function did not provide the variables before I used them
3, mistakes in the cudaMemcpy calls
SO, HERE IS THE CODE >>>
#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <iostream>
// Image width and height in pixels, filled in by load_bmp through the pointers main passes to it.
int width, heigth;
// Size in bytes of the stored pixel data (bfSize - bfOffBits), set by load_bmp.
long size;
// Device-side copy of `size`, read by the kernel as its bound.
long *d_size;
// Host buffers: the loaded image and the processed image copied back from the device.
RGBTRIPLE *bufferRGB, *new_bufferRGB;
// Device buffers: the kernel's input and output images.
RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;
// Passes the status of a CUDA runtime call to gpuAssert along with the file
// and line of the call.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Logs a failed CUDA call (error string, file, line) on stderr.
// `abort` is accepted but currently has no effect: the exit call below is
// disabled, so execution continues after the message.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    const bool failed = (code != cudaSuccess);
    if (!failed)
        return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    //if (abort) exit(code);
}
// Reads an uncompressed 24-bit BMP file into a freshly malloc'ed pixel array.
//
//   bufferRGB  out: pixel array in row-major order with the row padding
//              stripped; NULL when loading fails
//   width      out: width in pixels; 0 when loading fails
//   heigth     out: height in pixels; 0 when loading fails
//   file_name  BMP file to read
//
// Side effect: stores bfSize - bfOffBits (pixel-data bytes including row
// padding) in the file-scope `size`, or 0 when loading fails.
// All failures are reported on stderr; the function no longer dereferences a
// null FILE pointer when the file is missing.
void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    // Outputs are well defined on every exit path.
    *bufferRGB = NULL;
    *width = 0;
    *heigth = 0;
    size = 0;

    FILE *file = fopen(file_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "load_bmp: cannot open %s\n", file_name);
        return;
    }
    if (fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) != 1 ||
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) != 1)
    {
        fprintf(stderr, "load_bmp: %s is too short to be a BMP file\n", file_name);
        fclose(file);
        return;
    }
    // Only the layout the reader below understands: three bytes per pixel and
    // a positive row count.
    if (bmp_info_header.biBitCount != 24 || bmp_info_header.biWidth <= 0 || bmp_info_header.biHeight <= 0)
    {
        fprintf(stderr, "load_bmp: %s is not a supported 24-bit BMP\n", file_name);
        fclose(file);
        return;
    }

    *width = bmp_info_header.biWidth;
    *heigth = bmp_info_header.biHeight;
    size = (bmp_file_header.bfSize - bmp_file_header.bfOffBits);
    std::cout << "size of loaded pixels is " << size << '\n';

    // Over-allocated at 4 bytes per pixel: main uploads `size` bytes, which
    // counts the file's row padding, straight from this buffer.
    *bufferRGB = (RGBTRIPLE *)malloc((size_t)*width * (size_t)*heigth * 4);
    if (*bufferRGB == NULL)
    {
        fprintf(stderr, "load_bmp: out of memory\n");
        *width = 0;
        *heigth = 0;
        size = 0;
        fclose(file);
        return;
    }

    // Position on the first pixel byte (equivalent to the former relative
    // seek from the end of the headers).
    fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET);

    // Rows in the file are padded to a multiple of four bytes.
    const int row_padding = (4 - (3 * *width) % 4) % 4;
    for (int y = 0; y < *heigth; y++)
    {
        RGBTRIPLE *row = *bufferRGB + (y * *width);
        for (int x = 0; x < *width; x++)
        {
            row[x].rgbtBlue = fgetc(file);
            row[x].rgbtGreen = fgetc(file);
            row[x].rgbtRed = fgetc(file);
        }
        for (int k = 0; k < row_padding; k++)
            fgetc(file);
    }
    fclose(file);
}
// Saves `bufferRGB` as a 24-bit BMP called `new_name`. The file header and
// info header are copied unchanged from `old_name`; the image dimensions are
// taken from the file-scope `width` and `heigth`. Rows are zero-padded to a
// multiple of four bytes.
// On any I/O or allocation failure a message is printed on stderr and the
// function returns without touching further resources.
void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    // Step 1: fetch the headers from the original image.
    FILE *file = fopen(old_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot open %s\n", old_name);
        return;
    }
    const bool headers_ok =
        fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) == 1 &&
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) == 1;
    fclose(file);
    if (!headers_ok)
    {
        fprintf(stderr, "save_bmp: cannot read BMP headers from %s\n", old_name);
        return;
    }

    // Step 2: serialise the pixels (B,G,R per pixel, padded rows).
    const int alligment_x = (4 - (3 * width) % 4) % 4;
    const size_t total_bytes = ((size_t)width * 3 + alligment_x) * (size_t)heigth;
    unsigned char *to_save = (unsigned char *)malloc(total_bytes ? total_bytes : 1);
    if (to_save == NULL)
    {
        fprintf(stderr, "save_bmp: out of memory\n");
        return;
    }
    unsigned char *out = to_save;
    for (int y = 0; y < heigth; y++)
    {
        const RGBTRIPLE *row = bufferRGB + (y * width);
        for (int x = 0; x < width; x++)
        {
            *out++ = row[x].rgbtBlue;
            *out++ = row[x].rgbtGreen;
            *out++ = row[x].rgbtRed;
        }
        for (int x = 0; x < alligment_x; x++)
            *out++ = 0;
    }

    // Step 3: write headers, then the pixel data at offset bfOffBits.
    file = fopen(new_name, "wb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot create %s\n", new_name);
        free(to_save);
        return;
    }
    fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file);
    fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file);
    fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET);
    if (total_bytes != 0 && fwrite(to_save, total_bytes, 1, file) != 1)
        fprintf(stderr, "save_bmp: short write to %s\n", new_name);
    fclose(file);
    free(to_save);
}
// Grayscale filter, one thread per pixel: the output pixel gets the integer
// mean of the input pixel's red, green and blue channels in all three
// channels.
//
// Launch layout: 1D grid of 1D blocks with at least one thread per pixel;
// threads beyond the last pixel exit through the bound check.
//
//   d_bufferRGB      device pointer to the input pixels
//   d_new_bufferRGB  device pointer to the output pixels
//   d_size           device pointer to the pixel-data size in BYTES
__global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    // The bound arrives as a byte count; divide by the pixel size so the
    // comparison is pixel against pixel. Comparing idx with the raw byte
    // count allowed threads beyond the last pixel to read and write.
    const long last_pixel_excl = *d_size / (long)sizeof(RGBTRIPLE);
    if (idx >= last_pixel_excl)
        return;

    const int sum = d_bufferRGB[idx].rgbtRed + d_bufferRGB[idx].rgbtGreen + d_bufferRGB[idx].rgbtBlue;
    const BYTE grayscale = (BYTE)(sum / 3);
    d_new_bufferRGB[idx].rgbtRed = grayscale;
    d_new_bufferRGB[idx].rgbtGreen = grayscale;
    d_new_bufferRGB[idx].rgbtBlue = grayscale;
}
// Loads test.bmp, converts it to grayscale on the GPU and writes the result to
// filter_grayscale_GPU.bmp. Returns 0 on success, 1 if the image could not be
// loaded or host memory ran out.
int main()
{
    // load to have all variables reachable and loaded
    load_bmp(&bufferRGB, &width, &heigth, "test.bmp");
    if (bufferRGB == NULL || width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "could not load test.bmp\n");
        return 1;
    }
    // initialise buffer for the copy of the processed image from device to host
    new_bufferRGB = (RGBTRIPLE *)malloc(width* heigth * 4);
    if (new_bufferRGB == NULL)
    {
        fprintf(stderr, "out of host memory\n");
        free(bufferRGB);
        return 1;
    }
    // initialising variables on GPU
    gpuErrchk(cudaMalloc(&d_new_bufferRGB, width*heigth * 4));
    gpuErrchk(cudaMalloc(&d_bufferRGB, width*heigth * 4));
    gpuErrchk(cudaMalloc(&d_size, sizeof(size)));
    // copying variables to GPU
    gpuErrchk(cudaMemcpy(d_size, &size, sizeof(size), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice));
    // find out the kernel size, number of threads depends on your GPU max number of threads
    const int numbThreads = 1024;
    // ceil-divide: one extra block when the pixel count is not a multiple of the block size
    const int numbBlocks = (width*heigth + numbThreads - 1) / numbThreads;
    CUDA_filter_grayscale <<<numbBlocks, numbThreads >>> (d_bufferRGB, d_new_bufferRGB, d_size);
    // a launch returns no status; a bad configuration only shows up here
    gpuErrchk(cudaGetLastError());
    //copy result from device to host (this blocking copy also waits for the kernel)
    gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, size, cudaMemcpyDeviceToHost));
    //save result
    save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");
    // release device and host memory
    gpuErrchk(cudaFree(d_size));
    gpuErrchk(cudaFree(d_bufferRGB));
    gpuErrchk(cudaFree(d_new_bufferRGB));
    free(new_bufferRGB);
    free(bufferRGB);
    return 0;
}

Save OpenGL screen pixels to PNG using libpng

I've been using SOIL to save images as BMP, but it turns out that SOIL (or stbi to be more specific) saves ~5MB images (which is about 1366x768 resolution image or more) which is quite insane.
Original BMP saving code (NOTE Everything is done in the render function):
uint8_t *pixels = new uint8_t[w * h * 3];
// copy pixels from screen
glBindTexture(GL_TEXTURE_2D, screenTex);
glCopyTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 0, 0, w, h);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
glReadPixels(0, 0, w, h, GL_RGB, GL_UNSIGNED_BYTE, (GLvoid *)pixels);
// invert pixels (stolen from SOILs source code)
for (int j = 0; j * 2 < h; ++j) {
int x = j * w * 3;
int y = (h - 1 - j) * w * 3;
for (int i = w * 3; i > 0; --i) {
uint8_t tmp = pixels[x];
pixels[x] = pixels[y];
pixels[y] = tmp;
++x;
++y;
}
}
// save the image
int err = SOIL_save_image(fileName, SOIL_SAVE_TYPE_BMP, w, h, 3, pixels);
if (err)
printf("Done\n");
else
printf("Failed\n");
Code for saving PNG:
// Writes a w x h, 8-bit RGB image (3 bytes per pixel, tightly packed) to `filename`
// as a PNG, emitting the rows of `pixels` in reverse order (bottom row first).
// Returns false if a libpng struct, the palette or the output file cannot be
// created, true after the image has been written.
bool save_png_libpng(const char *filename, uint8_t *pixels, int w, int h)
{
png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
if (!png)
return false;
png_infop info = png_create_info_struct(png);
if (!info) {
png_destroy_write_struct(&png, &info);
return false;
}
FILE *fp = fopen(filename, "wb");
if (!fp) {
png_destroy_write_struct(&png, &info);
return false;
}
// NOTE(review): no setjmp(png_jmpbuf(png)) is installed before the libpng calls below - confirm how libpng errors are meant to be handled.
png_init_io(png, fp);
png_set_IHDR(png, info, w, h, 8 /* depth */, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
// NOTE(review): the palette is allocated but its entries are never filled in before png_set_PLTE - confirm it is needed for an RGB image.
png_colorp palette = (png_colorp)png_malloc(png, PNG_MAX_PALETTE_LENGTH * sizeof(png_color));
if (!palette) {
fclose(fp);
png_destroy_write_struct(&png, &info);
return false;
}
png_set_PLTE(png, info, palette, PNG_MAX_PALETTE_LENGTH);
png_write_info(png, info);
png_set_packing(png);
// One pointer per output row.
png_bytepp rows = (png_bytepp)png_malloc(png, h * sizeof(png_bytep));
for (int i = 0; i < h; ++i)
// NOTE(review): for i == 0 this is pixels + h * w * 3, i.e. one full row past the end of the
// buffer; the last valid row starts at (h - 1) * w * 3.
rows[i] = (png_bytep)(pixels + (h - i) * w * 3);
png_write_image(png, rows);
png_write_end(png, info);
png_free(png, palette);
png_destroy_write_struct(&png, &info);
fclose(fp);
// NOTE(review): rows came from png_malloc but is released with delete[] - confirm; png_free would match the allocator.
delete[] rows;
return true;
}
NOTE: I have not changed any of the original code, just replaced SOIL_save_image with save_png.
The code fails in the following line:
png_write_image(png, rows)
in PNG's source code, this function fails at the highlighted line:
/* Writes a whole image: every row pointer in `image` is handed to
 * png_write_row, once per interlace pass. The inner loop reads
 * png_ptr->height entries of `image`, so the array must hold at least that
 * many valid row pointers. Does nothing when png_ptr is NULL.
 */
void PNGAPI
png_write_image(png_structrp png_ptr, png_bytepp image)
{
png_uint_32 i; /* row index */
int pass, num_pass; /* pass variables */
png_bytepp rp; /* points to current row */
if (png_ptr == NULL)
return;
png_debug(1, "in png_write_image");
#ifdef PNG_WRITE_INTERLACING_SUPPORTED
/* Initialize interlace handling. If image is not interlaced,
* this will set pass to 1
*/
num_pass = png_set_interlace_handling(png_ptr);
#else
num_pass = 1;
#endif
/* Loop through passes */
for (pass = 0; pass < num_pass; pass++)
{
/* Loop through image */
for (i = 0, rp = image; i < png_ptr->height; i++, rp++)
{
/* *rp is the caller-supplied pointer for row i; it is passed on unchanged. */
png_write_row(png_ptr, *rp); // HERE
}
}
}
png_write_row then fails here: (The code for png_write_row is quite long to post here, so if you're curious about what happens before this line, you can check out pngwrite.c in png's source code. )
/* Copy user's row into buffer, leaving room for filter byte. */
memcpy(png_ptr->row_buf + 1, row, row_info.rowbytes);
P.S: I was using exactly the same code on MinGW and it was working 100% fine, when I switched to MSVC it started failing. I'm not sure if GCC does something magically here or it's my code's fault, so I would like to know for the sake of learning.
The following line:
rows[i] = (png_bytep)(pixels + (h - i) * w * 3);
unfortunately points past the end of the memory block (pixels) when i is 0, so the following edit fixes it:
rows[i] = (png_bytep)(pixels + (h - i - 1) * w * 3);
Quite trivial but whatever.

Managing a 2D CUDA Array

I'm trying to pass a 2d array to a kernel so that each thread can access index = threadIdx.x + (blockIdx.x * blockDim.x) but I'm having trouble figuring out just how to do this and how to copy the data back over.
// Allocate a pitched 2D int array: num_blocks rows of block_size ints each.
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// NOTE(review): the last argument of cudaMemset2D is a row count; num_blocks * sizeof(int)
// asks for more rows than were allocated above.
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
// NOTE(review): the width argument of cudaMemcpy2D is in bytes; block_size alone copies only
// part of each row of ints.
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
for (num_blocks)
for(block_size)
h_array[block][thread] should be 1
// Sets one element of a pitched 2D int array to 1.
// Launch layout: block index selects the row, thread index selects the
// element within that row. `pitch` is the row stride in bytes.
__global__ void kernel(int *array, int pitch) {
    // Rows are `pitch` bytes apart, so advance in bytes before indexing ints.
    char *rowBase = (char*)array + blockIdx.x * pitch;
    ((int*)rowBase)[threadIdx.x] = 1;
}
What am I doing wrong, here?
Your cudaMemset2D call accesses a larger memory region than the one you previously allocated with cudaMallocPitch. Also, your cudaMemcpy2D call copies only a small portion of that memory.
You should use the function in the following way:
// Pitched 2D int array: num_blocks rows, block_size ints per row.
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// The height argument is a row count: it must be num_blocks, not
// num_blocks * sizeof(int), which is larger than the allocation.
// (The statement terminator used to be inside the trailing comment.)
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
kernel<<<grid_size, block_size>>>(d_array, pitch);
// The width argument is in bytes, hence block_size * sizeof(int).
// NOTE(review): passing `pitch` as the destination pitch assumes the rows of
// h_array are also `pitch` bytes apart - confirm against the host allocation.
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
Here's a complete code that passes a basic test with the errors mentioned by #hidrargyro fixed:
$ cat t236.cu
#include <stdio.h>
// Checks the CUDA runtime's last recorded error (which also clears it). On
// failure prints `msg`, the error string and the call site, then exits with
// status 1. Place it right after the call or kernel launch to be checked.
// The local is named err_ because identifiers containing a double underscore
// are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Writes 1 into one cell of a pitched 2D int array.
// Launch layout: one block per row (blockIdx.x), one thread per column
// (threadIdx.x); `pitch` is the distance between rows in bytes.
__global__ void kernel(int *array, int pitch) {
    // Byte-wise step to the start of this block's row, then index by thread.
    int *cell = (int*)((char*)array + blockIdx.x * pitch) + threadIdx.x;
    *cell = 1;
}
// Test driver: allocates a pitched 256 x 256 int array, clears it, lets the
// kernel set every element to 1, copies it back tightly packed and verifies
// the result. Returns 0 and prints "success" when all elements are 1.
int main(){
    int *d_array, *h_array;
    int block_size = 256;
    int num_blocks = 256;
    int grid_size = num_blocks;
    h_array=(int *)malloc(block_size*num_blocks*sizeof(int));
    if (h_array==0) {printf("malloc fail\n"); return 1;}
    // Only the pitched allocation is made: the plain cudaMalloc that used to
    // precede it was overwritten by cudaMallocPitch and leaked.
    size_t pitch;
    cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
    cudaCheckErrors("cudaMallocPitch fail");
    cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
    cudaCheckErrors("cudaMemset2D fail");
    kernel<<<grid_size, block_size>>>(d_array, pitch);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    // Host rows are tightly packed, so the destination pitch is the row width in bytes.
    cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2D fail");
    int status = 0;
    for (int i = 0; i<num_blocks && status == 0; i++)
        for(int j = 0; j<block_size; j++)
            if (h_array[i*block_size+j] != 1) {
                printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]);
                status = 1;
                break;
            }
    // Release both buffers on the success and the mismatch path.
    cudaFree(d_array);
    cudaCheckErrors("cudaFree fail");
    free(h_array);
    if (status != 0) return 1;
    printf("success\n");
    return 0;
}
$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$

How do take a screenshot correctly with xlib?

I am trying to capture an image of the screen for use in screencasting. Thus I need a fast solution, and cannot rely on shell programs such as import or xwd.
This is the code I have written so far, but it fails and gives me a junk image, which just seems to show fragments of several images with odd colors tossed together.
Any ideas on what I am doing wrong?
#include <X11/Xlib.h>
#include <X11/X.h>
#include <cstdio>
#include <CImg.h>
using namespace cimg_library;
// Captures the root window with XGetImage, repacks the pixels into an RGB byte array
// and saves it as blah.png through CImg.
int main()
{
Display *display = XOpenDisplay(NULL);
Window root = DefaultRootWindow(display);
XWindowAttributes gwa;
// The root window's attributes provide the capture size.
XGetWindowAttributes(display, root, &gwa);
int width = gwa.width;
int height = gwa.height;
XImage *image = XGetImage(display,root, 0,0 , width,height,AllPlanes, ZPixmap);
// Three bytes per pixel.
unsigned char *array = new unsigned char[width * height * 3];
unsigned long red_mask = image->red_mask;
unsigned long green_mask = image->green_mask;
unsigned long blue_mask = image->blue_mask;
for (int x = 0; x < width; x++)
for (int y = 0; y < height ; y++)
{
unsigned long pixel = XGetPixel(image,x,y);
// NOTE(review): the fixed shifts 0 / 8 / 16 assume the masks are 0xFF, 0xFF00 and 0xFF0000 - confirm for the visual in use.
unsigned char blue = pixel & blue_mask;
unsigned char green = (pixel & green_mask) >> 8;
unsigned char red = (pixel & red_mask) >> 16;
// Interleaved layout: R, G and B of one pixel are stored next to each other.
array[(x + width * y) * 3] = red;
array[(x + width * y) * 3+1] = green;
array[(x + width * y) * 3+2] = blue;
}
// NOTE(review): CImg keeps each colour channel as its own contiguous plane (all red values,
// then all green, then all blue), which does not match the interleaved layout written above.
CImg<unsigned char> pic(array,width,height,1,3);
pic.save_png("blah.png");
printf("%ld %ld %ld\n",red_mask>> 16, green_mask>>8, blue_mask);
return 0;
}
You are mistaken about the way array is laid out in memory, as you can find out by declaring pic before the loop and adding this printf to your inner loop:
printf("%ld %ld %u %u %u\n",x,y,pic.offset(x,y,0),pic.offset(x,y,1),pic.offset(x,y,2));
This yields (on my 1920x1200 screen):
0 0 0 2304000 4608000
0 1 1920 2305920 4609920
0 2 3840 2307840 4611840
and so on, indicating that the red/green/blue subimages are kept "together" instead of the three color components of a single pixel being adjacent to each other.
The builtin CImg accessors will make your code work:
pic(x,y,0) = red;
pic(x,y,1) = green;
pic(x,y,2) = blue;
You can use libpng
int code = 0;
FILE *fp;
png_structp png_ptr;
png_infop png_info_ptr;
png_bytep png_row;
// Open file
fp = fopen ("test.png", "wb");
if (fp == NULL){
fprintf (stderr, "Could not open file for writing\n");
code = 1;
}
// Initialize write structure
png_ptr = png_create_write_struct (PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (png_ptr == NULL){
fprintf (stderr, "Could not allocate write struct\n");
code = 1;
}
// Initialize info structure
png_info_ptr = png_create_info_struct (png_ptr);
if (png_info_ptr == NULL){
fprintf (stderr, "Could not allocate info struct\n");
code = 1;
}
// Setup Exception handling
if (setjmp (png_jmpbuf (png_ptr))){
fprintf(stderr, "Error during png creation\n");
code = 1;
}
png_init_io (png_ptr, fp);
// Write header (8 bit colour depth)
png_set_IHDR (png_ptr, png_info_ptr, width, height,
8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
// Set title
char *title = "Screenshot";
if (title != NULL){
png_text title_text;
title_text.compression = PNG_TEXT_COMPRESSION_NONE;
title_text.key = "Title";
title_text.text = title;
png_set_text (png_ptr, png_info_ptr, &title_text, 1);
}
png_write_info (png_ptr, png_info_ptr);
// Allocate memory for one row (3 bytes per pixel - RGB)
png_row = (png_bytep) malloc (3 * width * sizeof (png_byte));
// Write image data
int x, y;
for (y = 0; y < height; y++){
for (x = 0; x < width; x++){
unsigned long pixel = XGetPixel (image, x, y);
unsigned char blue = pixel & blue_mask;
unsigned char green = (pixel & green_mask) >> 8;
unsigned char red = (pixel & red_mask) >> 16;
png_byte *ptr = &(png_row[x*3]);
ptr[0] = red;
ptr[1] = green;
ptr[2] = blue;
}
png_write_row (png_ptr, png_row);
}
// End write
png_write_end (png_ptr, NULL);
// Free
fclose (fp);
if (png_info_ptr != NULL) png_free_data (png_ptr, png_info_ptr, PNG_FREE_ALL, -1);
if (png_ptr != NULL) png_destroy_write_struct (&png_ptr, (png_infopp)NULL);
if (png_row != NULL) free (png_row);
The image has to be stored in memory as R1R2R3R4R5R6......G1G2G3G4G5G6.......B1B2B3B4B5B6.
cimg storage