I am trying to copy data from host to device in my GPU grayscale filter program. However, when I do so, nothing happens. I probably have mistakes in my code, but the compiler doesn't report any errors. I need to copy the image data in bufferRGB to the GPU buffer d_bufferRGB, process it there, and copy the result from d_new_bufferRGB back to the host so that I can save it with the function save_bmp().
EDIT 1: implemented CUDA error checking in main()
It says there is invalid argument in this line cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice)
HERE is the code >>>
#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <iostream>
// Wraps a CUDA runtime call and reports a failure together with its call site.
// The do { } while (0) form makes the macro expand to exactly one statement, so
// it stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
 *
 * @param code   status returned by the CUDA runtime call
 * @param file   source file of the call site (supplied by gpuErrchk)
 * @param line   source line of the call site (supplied by gpuErrchk)
 * @param abort  when true (the default) the process exits with `code` as status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Image dimensions in pixels; main() passes their addresses to load_bmp().
int width, heigth;
// Byte count of the pixel data, assigned in load_bmp() as bfSize - bfOffBits.
long size;
// Device-side copy of `size`; allocated and filled in main(), dereferenced by the kernel.
long *d_size;
// Host pixel buffers: the loaded image and the destination of the processed image.
RGBTRIPLE *bufferRGB, *new_bufferRGB;
// Device pixel buffers: kernel input and kernel output.
RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;
/**
 * Loads the pixels of a 24-bit BMP file into a newly malloc'ed RGBTRIPLE array.
 *
 * @param bufferRGB  out: pixel array indexed as [y * width + x] with the row
 *                   padding of the file removed; NULL on failure. The caller
 *                   owns the array and releases it with free().
 * @param width      out: image width in pixels (0 on failure)
 * @param heigth     out: image height in pixels (0 on failure)
 * @param file_name  path of the BMP file to read
 *
 * Side effect: sets the global `size` to bfSize - bfOffBits, i.e. the number
 * of pixel-data bytes stored in the file including row padding (0 on failure).
 */
void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    *bufferRGB = NULL;
    *width = 0;
    *heigth = 0;
    size = 0;

    FILE *file = fopen(file_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "load_bmp: cannot open %s\n", file_name);
        return;
    }

    if (fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) != 1 ||
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) != 1)
    {
        fprintf(stderr, "load_bmp: cannot read BMP headers of %s\n", file_name);
        fclose(file);
        return;
    }

    const int w = bmp_info_header.biWidth;
    const int h = bmp_info_header.biHeight;
    // The loops below consume exactly three bytes per pixel, so only 24-bit
    // images with positive dimensions can be decoded.
    if (w <= 0 || h <= 0 || bmp_info_header.biBitCount != 24)
    {
        fprintf(stderr, "load_bmp: %s is not a supported 24-bit BMP\n", file_name);
        fclose(file);
        return;
    }

    // 4 bytes per pixel are reserved although a pixel needs 3: main() copies
    // `size` bytes out of this buffer, and `size` also counts the row padding.
    RGBTRIPLE *pixels = (RGBTRIPLE *)malloc((size_t)w * h * 4);
    if (pixels == NULL || fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET) != 0)
    {
        fprintf(stderr, "load_bmp: cannot prepare pixel buffer for %s\n", file_name);
        free(pixels);
        fclose(file);
        return;
    }

    size = (bmp_file_header.bfSize - bmp_file_header.bfOffBits);
    std::cout << "velkost nacitanych pixelov je " << size << '\n';

    // Every row in the file is padded to a multiple of 4 bytes.
    const int padding = (4 - (3 * w) % 4) % 4;
    for (int y = 0; y < h; y++)
    {
        RGBTRIPLE *row = pixels + (size_t)y * w;
        for (int x = 0; x < w; x++)
        {
            row[x].rgbtBlue = fgetc(file);
            row[x].rgbtGreen = fgetc(file);
            row[x].rgbtRed = fgetc(file);
        }
        // Skip the padding bytes so the next row starts at the right offset.
        for (int p = 0; p < padding; p++)
            fgetc(file);
    }

    fclose(file);
    *bufferRGB = pixels;
    *width = w;
    *heigth = h;
}
/**
 * Writes `bufferRGB` as a 24-bit BMP file, reusing the headers of an existing BMP.
 *
 * The image dimensions are taken from the globals `width` and `heigth`.
 *
 * @param bufferRGB  pixels indexed as [y * width + x]
 * @param new_name   path of the file to create
 * @param old_name   path of the BMP whose file and info headers are copied
 */
void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    if (width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "save_bmp: no image dimensions available\n");
        return;
    }

    // Read the headers from the source image and close it right away; the
    // previous version reused the FILE* variable and leaked this handle.
    FILE *source = fopen(old_name, "rb");
    if (source == NULL)
    {
        fprintf(stderr, "save_bmp: cannot open %s\n", old_name);
        return;
    }
    const bool headers_ok =
        fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, source) == 1 &&
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, source) == 1;
    fclose(source);
    if (!headers_ok)
    {
        fprintf(stderr, "save_bmp: cannot read BMP headers of %s\n", old_name);
        return;
    }

    // Rows are padded with zero bytes to a multiple of 4 bytes.
    const int alligment_x = (4 - (3 * width) % 4) % 4;
    const size_t row_bytes = (size_t)width * 3 + alligment_x;
    const size_t total_bytes = row_bytes * heigth;

    unsigned char *to_save = (unsigned char *)malloc(total_bytes);
    if (to_save == NULL)
    {
        fprintf(stderr, "save_bmp: out of memory\n");
        return;
    }

    size_t index = 0;
    for (int y = 0; y < heigth; y++)
    {
        for (int x = 0; x < width; x++)
        {
            const RGBTRIPLE &pixel = bufferRGB[(size_t)y * width + x];
            to_save[index++] = pixel.rgbtBlue;
            to_save[index++] = pixel.rgbtGreen;
            to_save[index++] = pixel.rgbtRed;
        }
        for (int x = 0; x < alligment_x; x++)
            to_save[index++] = 0;
    }
    // Report the number of bytes actually written, not the size of a pointer.
    std::cout << "velkost na ulozenie je " << total_bytes << '\n';

    FILE *target = fopen(new_name, "wb");
    if (target == NULL)
    {
        fprintf(stderr, "save_bmp: cannot create %s\n", new_name);
        free(to_save);
        return;
    }
    fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, target);
    fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, target);
    // Pixel data starts at the offset announced by the copied file header.
    fseek(target, (long)bmp_file_header.bfOffBits, SEEK_SET);
    fwrite(to_save, 1, total_bytes, target);

    fclose(target);
    free(to_save);
}
/**
 * Grayscale filter kernel: each thread handles the element at its flat 1D
 * global index and writes the average of its three channels to every channel
 * of the output element.
 *
 * @param d_bufferRGB      input pixels (device memory)
 * @param d_new_bufferRGB  output pixels (device memory)
 * @param d_size           device pointer to the exclusive upper bound on the
 *                         index; threads at or beyond it do nothing
 */
__global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
{
    const int pixel = blockIdx.x * blockDim.x + threadIdx.x;
    if (!(pixel < *d_size))
        return;

    const RGBTRIPLE *in = d_bufferRGB + pixel;
    // The sum is formed in int, so it cannot overflow a byte before the division.
    const BYTE average = ((in->rgbtRed + in->rgbtGreen + in->rgbtBlue) / 3);

    RGBTRIPLE *out = d_new_bufferRGB + pixel;
    out->rgbtRed = average;
    out->rgbtGreen = average;
    out->rgbtBlue = average;
}
// Entry point of the question's version: allocate device memory, load the image,
// run the grayscale kernel and save the result.
int main()
// NOTE(review): width and heigth are only assigned inside load_bmp(), which is
// called further down, so these two allocations are sized before the image
// dimensions are known.
gpuErrchk(cudaMalloc(&d_new_bufferRGB, width*heigth * 4));
gpuErrchk(cudaMalloc(&d_bufferRGB, width*heigth * 4));
// Device storage for the pixel-data byte count.
gpuErrchk(cudaMalloc(&d_size, sizeof(size)));
load_bmp(&bufferRGB, &width, &heigth, "test.bmp"); // the *bufferRGB pointer is created and filled here
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(size), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice));
CUDA_filter_grayscale << <32, 512 >> > (d_bufferRGB, d_new_bufferRGB, d_size); // fixed launch size; the author is not concerned with the kernel dimensions yet
// NOTE(review): new_bufferRGB is not allocated anywhere in this function before
// it is used as the destination of the copy below.
gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, size, cudaMemcpyDeviceToHost));
save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");
This has been driving me crazy for several days; please help me with it.
So, with significant help from @Robert Crovella, I have finished my code. I also added some extra features, such as computing the kernel launch size dynamically, as a free gift for internet users. The code is fully functional for Microsoft BMP version 3 files (you can create one in Paint). I tried to upload a sample image, but uploads are limited to 2 MB, which is not enough for a true-color image. When compiling, a null-pointer error is reported, but the program is still built and stored in the project's Debug folder. When you run it with an image in that folder, it works without problems.
The problems with the code above were:
1. new_bufferRGB was uninitialized (never allocated).
2. The load function was called too late, so the variables it sets were used before they had values.
3. There were mistakes in the cudaMemcpy calls.
#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <iostream>
// Image dimensions in pixels; main() passes their addresses to load_bmp().
int width, heigth;
// Byte count of the pixel data, assigned in load_bmp() as bfSize - bfOffBits.
long size;
// Device-side copy of `size`; allocated and filled in main(), dereferenced by the kernel.
long *d_size;
// Host pixel buffers: the loaded image and the destination of the processed image.
RGBTRIPLE *bufferRGB, *new_bufferRGB;
// Device pixel buffers: kernel input and kernel output.
RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;
// Passes the status of a CUDA runtime call to gpuAssert together with the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param code   status returned by the CUDA runtime call
 * @param file   source file of the call site
 * @param line   source line of the call site
 * @param abort  accepted for interface compatibility; the exit call that used
 *               it is commented out, so execution continues after the report
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    const bool succeeded = (code == cudaSuccess);
    if (succeeded)
    {
        return;
    }
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    //if (abort) exit(code);
}
/**
 * Loads the pixels of a 24-bit BMP file into a newly malloc'ed RGBTRIPLE array.
 *
 * @param bufferRGB  out: pixel array indexed as [y * width + x] with the row
 *                   padding of the file removed; NULL on failure. The caller
 *                   owns the array and releases it with free().
 * @param width      out: image width in pixels (0 on failure)
 * @param heigth     out: image height in pixels (0 on failure)
 * @param file_name  path of the BMP file to read
 *
 * Side effect: sets the global `size` to bfSize - bfOffBits, i.e. the number
 * of pixel-data bytes stored in the file including row padding (0 on failure).
 */
void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    *bufferRGB = NULL;
    *width = 0;
    *heigth = 0;
    size = 0;

    FILE *file = fopen(file_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "load_bmp: cannot open %s\n", file_name);
        return;
    }

    if (fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) != 1 ||
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) != 1)
    {
        fprintf(stderr, "load_bmp: cannot read BMP headers of %s\n", file_name);
        fclose(file);
        return;
    }

    const int w = bmp_info_header.biWidth;
    const int h = bmp_info_header.biHeight;
    // The loops below consume exactly three bytes per pixel, so only 24-bit
    // images with positive dimensions can be decoded.
    if (w <= 0 || h <= 0 || bmp_info_header.biBitCount != 24)
    {
        fprintf(stderr, "load_bmp: %s is not a supported 24-bit BMP\n", file_name);
        fclose(file);
        return;
    }

    // 4 bytes per pixel are reserved although a pixel needs 3, matching the
    // buffer sizes main() uses for its host and device copies.
    RGBTRIPLE *pixels = (RGBTRIPLE *)malloc((size_t)w * h * 4);
    if (pixels == NULL || fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET) != 0)
    {
        fprintf(stderr, "load_bmp: cannot prepare pixel buffer for %s\n", file_name);
        free(pixels);
        fclose(file);
        return;
    }

    size = (bmp_file_header.bfSize - bmp_file_header.bfOffBits);
    std::cout << "size of loaded pixels is " << size << '\n';

    // Every row in the file is padded to a multiple of 4 bytes.
    const int padding = (4 - (3 * w) % 4) % 4;
    for (int y = 0; y < h; y++)
    {
        RGBTRIPLE *row = pixels + (size_t)y * w;
        for (int x = 0; x < w; x++)
        {
            row[x].rgbtBlue = fgetc(file);
            row[x].rgbtGreen = fgetc(file);
            row[x].rgbtRed = fgetc(file);
        }
        // Skip the padding bytes so the next row starts at the right offset.
        for (int p = 0; p < padding; p++)
            fgetc(file);
    }

    fclose(file);
    *bufferRGB = pixels;
    *width = w;
    *heigth = h;
}
/**
 * Writes `bufferRGB` as a 24-bit BMP file, reusing the headers of an existing BMP.
 *
 * The image dimensions are taken from the globals `width` and `heigth`.
 *
 * @param bufferRGB  pixels indexed as [y * width + x]
 * @param new_name   path of the file to create
 * @param old_name   path of the BMP whose file and info headers are copied
 */
void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    if (width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "save_bmp: no image dimensions available\n");
        return;
    }

    // Read the headers and close the source at once; reusing the FILE*
    // variable for the output file used to leak this handle.
    FILE *source = fopen(old_name, "rb");
    if (source == NULL)
    {
        fprintf(stderr, "save_bmp: cannot open %s\n", old_name);
        return;
    }
    const bool headers_ok =
        fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, source) == 1 &&
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, source) == 1;
    fclose(source);
    if (!headers_ok)
    {
        fprintf(stderr, "save_bmp: cannot read BMP headers of %s\n", old_name);
        return;
    }

    // Rows are padded with zero bytes to a multiple of 4 bytes.
    const int alligment_x = (4 - (3 * width) % 4) % 4;
    const size_t row_bytes = (size_t)width * 3 + alligment_x;
    const size_t total_bytes = row_bytes * heigth;

    unsigned char *to_save = (unsigned char *)malloc(total_bytes);
    if (to_save == NULL)
    {
        fprintf(stderr, "save_bmp: out of memory\n");
        return;
    }

    size_t index = 0;
    for (int y = 0; y < heigth; y++)
    {
        for (int x = 0; x < width; x++)
        {
            const RGBTRIPLE &pixel = bufferRGB[(size_t)y * width + x];
            to_save[index++] = pixel.rgbtBlue;
            to_save[index++] = pixel.rgbtGreen;
            to_save[index++] = pixel.rgbtRed;
        }
        for (int x = 0; x < alligment_x; x++)
            to_save[index++] = 0;
    }

    FILE *target = fopen(new_name, "wb");
    if (target == NULL)
    {
        fprintf(stderr, "save_bmp: cannot create %s\n", new_name);
        free(to_save);
        return;
    }
    fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, target);
    fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, target);
    // Pixel data starts at the offset announced by the copied file header.
    fseek(target, (long)bmp_file_header.bfOffBits, SEEK_SET);
    fwrite(to_save, 1, total_bytes, target);

    fclose(target);
    free(to_save);
}
/**
 * Grayscale filter kernel. Each thread converts the element at its flat 1D
 * global index: the three channels are averaged and the average is stored in
 * all three channels of the output element.
 *
 * @param d_bufferRGB      input pixels (device memory)
 * @param d_new_bufferRGB  output pixels (device memory)
 * @param d_size           device pointer to the exclusive index bound
 */
__global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const bool in_range = idx < *d_size;

    if (in_range)
    {
        // Channel sum is evaluated in int before being narrowed to a byte.
        const int channel_sum = d_bufferRGB[idx].rgbtRed
                              + d_bufferRGB[idx].rgbtGreen
                              + d_bufferRGB[idx].rgbtBlue;
        const BYTE level = (channel_sum / 3);

        d_new_bufferRGB[idx].rgbtRed = level;
        d_new_bufferRGB[idx].rgbtGreen = level;
        d_new_bufferRGB[idx].rgbtBlue = level;
    }
}
/**
 * Loads test.bmp, converts it to grayscale on the GPU and writes the result to
 * filter_grayscale_GPU.bmp.
 *
 * @return 0 on success, 1 when the image could not be loaded or memory is exhausted
 */
int main()
{
    // Load first so that width, heigth and bufferRGB are valid before use.
    load_bmp(&bufferRGB, &width, &heigth, "test.bmp");
    if (bufferRGB == NULL || width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "main: no image data loaded\n");
        return 1;
    }

    // The kernel indexes pixels, so its bound and all copies are expressed in
    // pixels. The global `size` is a byte count that includes row padding and
    // must not be used as the pixel bound.
    const long pixelCount = (long)width * heigth;
    const size_t imageBytes = (size_t)pixelCount * sizeof(RGBTRIPLE);

    // Host buffer that receives the processed image.
    new_bufferRGB = (RGBTRIPLE *)malloc(imageBytes);
    if (new_bufferRGB == NULL)
    {
        fprintf(stderr, "main: out of host memory\n");
        free(bufferRGB);
        return 1;
    }

    // Device buffers.
    gpuErrchk(cudaMalloc(&d_new_bufferRGB, imageBytes));
    gpuErrchk(cudaMalloc(&d_bufferRGB, imageBytes));
    gpuErrchk(cudaMalloc(&d_size, sizeof(pixelCount)));

    // Host -> device.
    gpuErrchk(cudaMemcpy(d_size, &pixelCount, sizeof(pixelCount), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, imageBytes, cudaMemcpyHostToDevice));

    // One thread per pixel; the block count is rounded up so the tail is covered.
    const int numbThreads = 256;
    const int numbBlocks = (int)((pixelCount + numbThreads - 1) / numbThreads);
    CUDA_filter_grayscale<<<numbBlocks, numbThreads>>>(d_bufferRGB, d_new_bufferRGB, d_size);
    // A kernel launch returns no status; a bad configuration shows up here.
    gpuErrchk(cudaGetLastError());

    // Device -> host (blocking, so the kernel has finished when it returns).
    gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, imageBytes, cudaMemcpyDeviceToHost));

    save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");

    gpuErrchk(cudaFree(d_size));
    gpuErrchk(cudaFree(d_bufferRGB));
    gpuErrchk(cudaFree(d_new_bufferRGB));
    free(new_bufferRGB);
    free(bufferRGB);
    return 0;
}
I wrote some simple BMP generation code, but instead of the output I want (red gradually fading to black from left to right) it produces an image with a weird layout that is far from what I expect. I inspected how the header and pixels are laid out in memory, and everything seems all right.
#include <iostream>
#include <fstream>
size_t width = 1000, height = 1000, CPP = 3;
// Creates a tightly packed array of width * height pixels, 3 bytes per pixel.
// Byte 0 of each pixel fades linearly from 255 in the leftmost column towards 0
// at the right edge; bytes 1 and 2 are 0. All rows are identical.
// The caller owns the returned array and releases it with delete[].
// NOTE(review): 24-bit BMP pixel data is stored blue-first, so a writer that
// emits these bytes unchanged shows the ramp in the blue channel rather than
// the intended red - confirm which byte order the consumer expects.
uint8_t* createBitMapI(const size_t& width, const size_t& height)
{
    const uint8_t _color[3] = { 255, 0, 0 };
    uint8_t* bitMap = new uint8_t[width * height * 3];

    // With no rows (or no columns) there is nothing to fill; writing the first
    // row anyway would run past the end of a zero-length allocation.
    if (width == 0 || height == 0)
        return bitMap;

    const size_t rowBytes = width * 3;

    //creating one row (integer column index instead of a float loop counter)
    for (size_t column = 0; column < width; ++column)
    {
        const float fade = 1 - static_cast<float>(column) / width;
        uint8_t* pixel = bitMap + column * 3;
        pixel[0] = static_cast<uint8_t>(_color[0] * fade);
        pixel[1] = 0;
        pixel[2] = 0;
    }

    //copying previously created row to others
    for (size_t row = 1; row < height; ++row)
        memcpy(bitMap + rowBytes * row, bitMap, rowBytes);

    return bitMap;
}
//creates BMP file and writes contents into it
// Writes an uncompressed 24-bit BMP: a 54-byte header followed by the pixel
// rows, each padded with zero bytes to a multiple of 4 bytes.
class BMP
{
public:
    // width, height: image size in pixels.
    // arr: width * height * 3 bytes, tightly packed; rows are written in the
    //      order given and bytes are written unchanged (BMP stores B, G, R and
    //      orders rows bottom-up when the header height is positive).
    // name: path of the file to create.
    BMP(const size_t& width, const size_t& height,
        uint8_t* arr, const char* name)
        : pixels(arr), fullSize(0), width_(width), height_(height)
    {
        const char pad_[3] = { 0, 0, 0 };
        const size_t rowBytes = width * 3;
        // Padding depends on the row length in bytes, not on the pixel count.
        const size_t padding = (4 - rowBytes % 4) % 4;
        fullSize = 54 + (rowBytes + padding) * height;

        writeHeader();

        // Binary mode keeps the stream from translating 0x0A bytes on Windows.
        image.open(name, std::ios::out | std::ios::binary);
        if (!image)
            return;

        image.write((const char*)header, 54);
        for (size_t i = 0; i < height; i++)
        {
            // Row i starts i * rowBytes bytes into the array (3 bytes per pixel).
            image.write((const char*)arr + i * rowBytes, (std::streamsize)rowBytes);
            image.write(pad_, (std::streamsize)padding);
        }
    }

    // Fills the 54-byte BITMAPFILEHEADER + BITMAPINFOHEADER image.
    void writeHeader()
    {
        memset(header, 0, sizeof(header));
        memcpy(header, "BM", 2);
        put32(2, fullSize);           // total file size
        put32(10, 54);                // offset of the pixel data
        put32(14, 40);                // size of the info header
        put32(18, width_);
        put32(22, height_);
        put16(26, 1);                 // colour planes
        put16(28, 24);                // bits per pixel
        put32(34, fullSize - 54);     // size of the pixel data
    }

private:
    // Header fields are 16/32-bit little-endian values. Storing them through a
    // size_t* wrote 8 bytes on 64-bit targets and overwrote the next field.
    void put16(size_t offset, size_t value)
    {
        header[offset] = (uint8_t)(value & 0xFF);
        header[offset + 1] = (uint8_t)((value >> 8) & 0xFF);
    }
    void put32(size_t offset, size_t value)
    {
        put16(offset, value & 0xFFFF);
        put16(offset + 2, (value >> 16) & 0xFFFF);
    }

    std::ofstream image;
    uint8_t header[54];
    uint8_t* pixels;
    size_t fullSize;
    size_t width_;
    size_t height_;
};
int main()
uint8_t* arr = createBitMapI(width, height);
BMP newImage(width, height, arr, "image.bmp");
delete[] arr;
Update: changing image.open(name) to image.open(name, std::ios::binary) gives the second output (output2).
I'm confused about whether an image is stored in row-major or column-major order in the global memory of the device.
I am getting two different output images depending on which of the two orders I use to access the image.
When accessing in row-major order-
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int m = numCols * y + x;
if (x >= numCols || y >= numRows)
//marking column boundaries
if (x <= 2){
d_Image[m].x = 255;
d_Image[m].y = 0;
d_Image[m].z = 0;
else if (x >= numCols-2){
d_Image[m].x = 0;
d_Image[m].y = 0;
d_Image[m].z = 255;
d_Image[m].x = d_sample[m].x;
d_Image[m].y = d_sample[m].y;
d_Image[m].z = d_sample[m].z;
d_Image[m].w = d_sample[m].w;
output using row-major
when accessing in column-major order-
int m = x * numRows + y;
output using col-major
const dim3 blockSize(16,16);
const dim3 gridSize(numCols/16+1, numRows/16+1, 1);
blur << < gridSize, blockSize >> >(d_Image, d_sample, numRows, numCols);
I'm loading and saving the image using opencv.
In the first output, red and blue dots are scattered all over the image. In the second output (column-major), the boundary rows are marked, although I am trying to mark the columns. I'm very confused.
// Launches the blur kernel on the given device buffers; defined after main().
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols);
// Input image (loaded in main()) and output image (created in main()).
cv::Mat sample;
cv::Mat Image;
// Row and column counts of the input image `sample`.
size_t numRows() { return sample.rows; }
size_t numCols() { return sample.cols; }
/**
 * Copies d_sample to d_Image and marks the column borders: columns 0..2 are
 * written as (255, 0, 0) and the last two columns as (0, 0, 255).
 *
 * Expects a 2D launch in which x covers the columns and y covers the rows; the
 * image is addressed row-major (index = y * numCols + x). Threads outside the
 * image return without touching memory. The .w component of the border pixels
 * is left as it was in d_Image.
 *
 * @param d_sample  input pixels, numRows * numCols elements (device memory)
 * @param d_Image   output pixels, same layout (device memory)
 * @param numRows   image height in pixels
 * @param numCols   image width in pixels
 */
__global__ void blur(const uchar4 *d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
    // size_t arithmetic avoids int overflow of the flat index on large images.
    const size_t x = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    const size_t y = threadIdx.y + (size_t)blockDim.y * blockIdx.y;
    if (x >= numCols || y >= numRows)
        return;

    const size_t m = y * numCols + x;

    //marking column boundaries
    if (x <= 2){
        d_Image[m].x = 255;
        d_Image[m].y = 0;
        d_Image[m].z = 0;
    }
    // Same test as x >= numCols - 2, written so that it cannot wrap around
    // when numCols is smaller than 2 (numCols is unsigned).
    else if (x + 2 >= numCols){
        d_Image[m].x = 0;
        d_Image[m].y = 0;
        d_Image[m].z = 255;
    }
    else {
        d_Image[m].x = d_sample[m].x;
        d_Image[m].y = d_sample[m].y;
        d_Image[m].z = d_sample[m].z;
        d_Image[m].w = d_sample[m].w;
    }
}
// Loads sample.jpg, runs the border-marking kernel on the GPU, then shows and
// saves the result.
int main(){
uchar4 *h_sample, *d_sample, *d_Image, *h_Image;
// Not used anywhere in the visible part of this function.
int filter[9];
sample = cv::imread("sample.jpg", CV_LOAD_IMAGE_COLOR);
if (sample.empty()){
std::cout << "error in loading image.";
// Output image with the same dimensions as the input, 4 channels of 8 bits.
Image.create(numRows(), numCols(), CV_8UC4);
if (!sample.isContinuous() || !Image.isContinuous()) {
std::cerr << "Images aren't continuous!! Exiting." << std::endl;
// NOTE(review): the pixel data of `sample` is reinterpreted as uchar4 (4 bytes
// per pixel) here; CV_LOAD_IMAGE_COLOR presumably yields a 3-channel image, so
// the layouts may not match - confirm against the OpenCV documentation.
h_sample = (uchar4*)sample.data;
h_Image = (uchar4*)Image.data;
size_t numPixels = numRows() * numCols();
//allocate memory on device
checkCudaErrors(cudaMalloc((void**)&d_sample, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMalloc((void**)&d_Image, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_sample, 0, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_Image, 0, sizeof(uchar4) * numPixels));
//copy to device
checkCudaErrors(cudaMemcpy(d_sample, h_sample, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
// NOTE(review): helper() declares its last two parameters as (numRows, numCols),
// but the arguments are passed here in the order (numCols(), numRows()).
helper(d_sample, d_Image, numCols(), numRows());
//copy back to host
checkCudaErrors(cudaMemcpy(h_Image, d_Image, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
cv::namedWindow("Image", CV_WINDOW_AUTOSIZE);
cv::imshow("Image", Image);
// NOTE(review): this writes to the same file name that was read as input above.
cv::imwrite("sample.jpg", Image);
return 0;
/**
 * Launches the blur kernel over the whole image and waits for it to finish.
 *
 * Uses 16x16 thread blocks and a grid rounded up so every pixel is covered.
 *
 * @param d_sample  input pixels in device memory
 * @param d_Image   output pixels in device memory
 * @param numRows   image height in pixels
 * @param numCols   image width in pixels
 */
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
    // An empty image would otherwise request a grid with a zero dimension.
    if (numRows == 0 || numCols == 0)
        return;

    const dim3 blockSize(16,16);
    const dim3 gridSize((unsigned int)((numCols + blockSize.x - 1) / blockSize.x),
                        (unsigned int)((numRows + blockSize.y - 1) / blockSize.y),
                        1);
    blur << < gridSize, blockSize >> >(d_sample, d_Image, numRows, numCols);
    // Launch-configuration errors are reported by cudaGetLastError(); errors
    // raised while the kernel runs are returned by the synchronizing call.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
and you call
helper(d_sample, d_Image, numCols(), numRows());
I think you may have switched cols and rows when you call helper...
If this sort of question has been asked I apologize, link me to the thread please!
Anyhow I am new to CUDA (I'm coming from OpenCL) and wanted to try generating an image with it. The relevant CUDA code is:
/**
 * Fills a pitched RGBA image with opaque green, one thread per pixel.
 *
 * Expects a 2D launch whose grid covers width x height threads; threads outside
 * the image return without writing.
 *
 * @param pixels  base of the pitched allocation (4 bytes per pixel)
 * @param pitch   row stride in bytes, as returned by cudaMallocPitch
 * @param width   image width in pixels
 * @param height  image height in pixels
 */
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // Each axis uses its own block dimension, so non-square blocks also work.
    const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    // Rows of a pitched allocation are `pitch` bytes apart, not width * 4.
    uint8_t *row = (uint8_t *)((char *)pixels + y * pitch);
    row[x * 4+0] = 0;
    row[x * 4+1] = 255;
    row[x * 4+2] = 0;
    row[x * 4+3] = 255;
}
// Status of the most recent call checked with CUDA_ERR (the macro assigns to it).
cudaError_t err = cudaSuccess;
// Evaluates a CUDA runtime call; on failure prints the call site and the error
// string, then exits with status -1. The do { } while (0) wrapper makes the
// macro a single statement, and the message no longer claims that every
// failure is an allocation failure.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
// Allocates a pitched 1000x1000 RGBA image on the device, runs the kernel,
// copies the image back and saves it as out.png.
int main(void) {
ulong2 dims = {1000, 1000};
// NOTE(review): block_size is used for both block dimensions below, i.e.
// 500 * 500 threads per block; CUDA limits a block to 1024 threads.
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
// Number of blocks per axis, rounded up when the size is not a multiple of the block.
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
// NOTE(review): no error check follows this launch.
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
// NOTE(review): the fifth argument of cudaMemcpy2D is the row width in bytes;
// dims.x is the width in pixels (4 bytes each).
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
save_png("out.png", h_pixels, dims.x, dims.y);
return 0;
The save_png function is a usual utility function I created for taking a block of data and saving it to a png:
void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr) {
std::cerr << "Failed to create png write struct" << std::endl;
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) {
std::cerr << "Failed to create info_ptr" << std::endl;
png_destroy_write_struct(&png_ptr, NULL);
FILE *fp = fopen(filename, "wb");
if (!fp) {
std::cerr << "Failed to open " << filename << " for writing" << std::endl;
png_destroy_write_struct(&png_ptr, &info_ptr);
if (setjmp(png_jmpbuf(png_ptr))) {
png_destroy_write_struct(&png_ptr, &info_ptr);
std::cerr << "Error from libpng!" << std::endl;
png_init_io(png_ptr, fp);
png_write_info(png_ptr, info_ptr);
png_byte *row_pnts[height];
size_t i;
for (i = 0; i < height; i++) {
row_pnts[i] = buffer + width * 4 * i;
png_write_image(png_ptr, row_pnts);
png_write_end(png_ptr, info_ptr);
png_destroy_write_struct(&png_ptr, &info_ptr);
Anyways the image that's generated is a weird whiteish strip that's speckled with random colored pixels which can be seen here.
Is there something glaring I did wrong? I tried to follow the introduction documentation on the CUDA site. Otherwise can anyone help me out to fix this? Here I'm simply trying to fill the pixels buffer with green pixels.
I am using a MBP retina with an NVIDIA GeForce GT 650M discrete graphics card. I can run and paste the output to print_devices from the cuda sample code if need be.
EDIT: Note no errors or warnings during compilation with the following makefile:
nvcc -c mandlebrot.cu -o mandlebrot.cu.o
nvcc mandlebrot.cu.o -o mandlebrot -lpng
and no errors at runtime.
It's better if you provide a complete code that someone can copy, paste, compile, and run, without adding anything or changing anything, Stripping off the include headers isn't helpful, in my opinion, and making your test code dependent on a png library that others may not have is also not productive, if you want help.
Your error checking on kernel launches is broken. You may want to review proper cuda error checking. If you had proper error checking, or ran your code with cuda-memcheck, you would discover an error 9 on the kernel launch. This is an invalid configuration. If you print out your blocks and threads_per_block variables, you'll see something like this:
blocks: 2, 2
threads: 500, 500
You are in fact setting threads per block to 500,500 here:
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
That is illegal, as you are requesting 500x500 threads per block (i.e. 250000 threads) which exceeds the maximum limit of 1024 threads per block.
So your kernel is not running at all and you're getting garbage.
You can fix this error pretty simply by changing your block_size definition:
unsigned long block_size = 16;
After that there is still an issue, as you've misinterpreted the parameters for cudaMemcpy2D.:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
The documentation states for the 5th parameter:
width - Width of matrix transfer (columns in bytes)
but you've passed the width in elements (groups of 4 bytes) rather than bytes.
This will fix that:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
With the above changes, I was able to get good results with a test version of your code:
#include <stdio.h>
#include <stdint.h>
/**
 * Fills a pitched RGBA image with opaque green, one thread per pixel.
 *
 * Expects a 2D launch whose grid covers width x height threads; threads outside
 * the image return without writing.
 *
 * @param pixels  base of the pitched allocation (4 bytes per pixel)
 * @param pitch   row stride in bytes, as returned by cudaMallocPitch
 * @param width   image width in pixels
 * @param height  image height in pixels
 */
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // Each axis uses its own block dimension, so non-square blocks also work.
    const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    // Rows of a pitched allocation are `pitch` bytes apart, not width * 4.
    uint8_t *row = (uint8_t *)((char *)pixels + y * pitch);
    row[x * 4+0] = 0;
    row[x * 4+1] = 255;
    row[x * 4+2] = 0;
    row[x * 4+3] = 255;
}
// Status of the most recent call checked with CUDA_ERR (the macro assigns to it).
cudaError_t err = cudaSuccess;
// Evaluates a CUDA runtime call; on failure prints the call site and the error
// string, then exits with status -1. The do { } while (0) wrapper makes the
// macro a single statement, and the message no longer claims that every
// failure is an allocation failure.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
/**
 * Fills a 1000x1000 RGBA image on the device, copies it back and verifies that
 * every pixel is (0, 255, 0, 255).
 *
 * @return 0 when all pixels match, 1 on the first mismatch or when host memory
 *         cannot be allocated
 */
int main(void) {
    ulong2 dims = {1000, 1000};
    dim3 threads_per_block(16, 16);
    // Round the block count up so that a partial last block is still launched.
    dim3 blocks((unsigned int)((dims.x + threads_per_block.x - 1) / threads_per_block.x),
                (unsigned int)((dims.y + threads_per_block.y - 1) / threads_per_block.y));

    const size_t row_bytes = dims.x * 4 * sizeof(uint8_t);
    size_t pitch;
    uint8_t *pixels = NULL, *h_pixels = NULL;
    CUDA_ERR(cudaMallocPitch(&pixels, &pitch, row_bytes, dims.y));

    printf("blocks: %u, %u\n", blocks.x, blocks.y);
    printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
    mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
    // A launch returns no status; an invalid configuration is reported here.
    CUDA_ERR(cudaGetLastError());

    h_pixels = (uint8_t *)malloc(row_bytes * dims.y);
    if (h_pixels == NULL) {
        fprintf(stderr, "Failed to allocate host image buffer\n");
        cudaFree(pixels);
        return 1;
    }
    memset(h_pixels, 0, row_bytes * dims.y);
    // Width (5th argument) is given in bytes; the host rows are tightly packed.
    CUDA_ERR(cudaMemcpy2D(h_pixels, row_bytes, pixels, pitch, row_bytes, dims.y, cudaMemcpyDeviceToHost));
    // save_png("out.png", h_pixels, dims.x, dims.y);

    const unsigned int expected[4] = {0, 255, 0, 255};
    int status = 0;
    for (unsigned long row = 0; row < dims.y && status == 0; row++)
        for (unsigned long col = 0; col < dims.x && status == 0; col++)
            for (int channel = 0; channel < 4; channel++) {
                // The same index is used for the test and for the report.
                const unsigned int actual = h_pixels[(row * dims.x * 4) + col * 4 + channel];
                if (actual != expected[channel]) {
                    printf("mismatch %d at %lu,%lu: was: %u should be: %u\n",
                           channel, row, col, actual, expected[channel]);
                    status = 1;
                    break;
                }
            }

    free(h_pixels);
    CUDA_ERR(cudaFree(pixels));
    return status;
}
Note the above code is a complete code you can copy, paste, compile and run.
I want to find a small bmp image inside a bigger bmp image (the bigger one is captured from the screen and is called Sample.bmp; the small one is called Button.bmp). The problem is that when the images are compared, the small image cannot be found anywhere.
the compare code :
for (int i=0;i<SCREEN_WIDTH-width;++i)
for (int j=0;j<SCREEN_HEIGHT-height;++j)
boolean isequal = true;
for(int qqq=i;qqq<i+width;++qqq)
for (int kkk=j;kkk<j+height;++kkk)
if (PI[qqq][kkk]!=NPI[qqq-i][kkk-j]) isequal = false;
if (isequal == false)
qqq = i + width + 1;
kkk = j + height + 1;
if (isequal==true)
MidX = i;
MidY = j;
Note: SCREEN_WIDTH and SCREEN_HEIGHT are the dimensions of the bigger image, and width and height are the dimensions of the smaller one.
Full Code:
void readBMP()
int i;
FILE* f = fopen("Sample.bmp", "rb");
unsigned char info[54];
fread(info, sizeof(unsigned char), 54, f); // read the 54-byte header
// extract image height and width from header
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int size = 3 * width * height;
unsigned char* data = new unsigned char[size]; // allocate 3 bytes per pixel
fread(data, sizeof(unsigned char), size, f); // read the rest of the data at once
for(int qq=0;qq<SCREEN_WIDTH;++qq)
for (int kk=0;kk<SCREEN_HEIGHT;++kk)
PI[qq][kk][0] = data[kk * width + qq];
PI[qq][kk][1] = data[kk * width + qq + 1];
PI[qq][kk][2] = data[kk * width + qq + 2];
void FindImageInScreen(char* FileName)
FILE* f = fopen(FileName, "rb");
unsigned char info[54];
fread(info, sizeof(unsigned char), 54, f); // read the 54-byte header
// extract image height and width from header
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int size = 3 * width * height;
unsigned char* data = new unsigned char[size]; // allocate 3 bytes per pixel
fread(data, sizeof(unsigned char), size, f); // read the rest of the data at once
for(int qq=0;qq<width;++qq)
for (int kk=0;kk<height;++kk)
NPI[qq][kk][0] = data[kk * width + qq];
NPI[qq][kk][1] = data[kk * width + qq + 1];
NPI[qq][kk][2] = data[kk * width + qq + 2];
for (int i=0;i<SCREEN_WIDTH-width;++i)
for (int j=0;j<SCREEN_HEIGHT-height;++j)
boolean isequal = true;
for(int qqq=i;qqq<i+width;++qqq)
for (int kkk=j;kkk<j+height;++kkk)
if (PI[qqq][kkk][0]!=NPI[qqq-i][kkk-j][0]) isequal = false;
if (isequal == false)
qqq = i + width + 1;
kkk = j + height + 1;
if (isequal==true)
MidX = i;
MidY = j;
MidX = -1;
MidY = -1;
definition of arrays (added because of request) , This is before functions execute :
PI = new unsigned int**[SCREEN_WIDTH];
for (int i=0;i<SCREEN_WIDTH;++i)
PI[i] = new unsigned int*[SCREEN_HEIGHT];
for (int i=0;i<SCREEN_WIDTH;++i)
for (int j=0;j<SCREEN_HEIGHT;++j)
PI[i][j] = new unsigned int[3];
NPI = new unsigned int**[SCREEN_WIDTH];
for (int i=0;i<SCREEN_WIDTH;++i)
NPI[i] = new unsigned int*[SCREEN_HEIGHT];
for (int i=0;i<SCREEN_WIDTH;++i)
for (int j=0;j<SCREEN_HEIGHT;++j)
NPI[i][j] = new unsigned int[3];
The first function executes, then the second. Sorry for the messy code; I made thousands of changes trying to make it work!
PI[qq][kk][0] = data[kk * width + qq];
From how PI and NPI are filled in, it appears that they are 3-dimensional arrays (it would help if you included their definition in the code sample). But
if (PI[qqq][kkk]!=NPI[qqq-i][kkk-j]) isequal = false;
which only indexes 2 dimensions of each array. PI[a][b] is the address of the array containing PI[a][b][0..2] and will certainly never equal the address NPI[x][y], so I expect this comparison always reports a mismatch and isequal always ends up false.
Lets get you started. Here is a better LoadBMP.
Yours, among other thing, read the size, and uses SCREEN_HEIGHT anyway.
Using this for loading both images is probably easier.
#include <vector>
#include <cstdio>
#include <string>
using namespace std;
typedef unsigned char UC;

// One pixel with 8-bit red, green and blue channels.
struct RGB { UC r,g,b; };

// Two pixels are equal when all three channels are equal.
bool operator == ( const RGB& p1, const RGB& p2 ) { return p1.r==p2.r && p1.g==p2.g && p1.b==p2.b; }

// Decoded image: `pixels` holds width * height entries, row after row in file order.
struct BMP
{
    int width;
    int height;
    std::vector<RGB> pixels;
    // Pixel at column x of row y.
    RGB& Pix(int x,int y) { return pixels[ y*width + x ]; }
};

// Reads a 32-bit little-endian value from a byte buffer without relying on the
// alignment of the buffer.
static int ReadLE32( const UC* p )
{
    return (int)( (unsigned)p[0] | ((unsigned)p[1] << 8) | ((unsigned)p[2] << 16) | ((unsigned)p[3] << 24) );
}

// Loads a 24-bit BMP file with a 54-byte header into `bmp`.
// On any failure (file missing, truncated, non-positive dimensions) `bmp` is
// left empty: width == height == 0 and no pixels.
void LoadBMP( BMP& bmp, const char* filename )
{
    bmp.width = 0;
    bmp.height = 0;
    bmp.pixels.clear();

    FILE* f = fopen(filename, "rb");
    if ( !f )
        return;

    UC info[54];
    if ( fread(info, 1, 54, f) != 54 ) // read the 54-byte header
    {
        fclose(f);
        return;
    }

    // extract image height and width from header
    const int width = ReadLE32(info+18);
    const int height = ReadLE32(info+22);
    if ( width <= 0 || height <= 0 )
    {
        fclose(f);
        return;
    }

    // scanlines are always multiple of 4, padded with 0-3 bytes
    const size_t scanlinesize = ( (size_t)3*width + 3 ) / 4 * 4;
    std::vector<UC> data( scanlinesize * height );
    const size_t got = fread(&data[0], 1, data.size(), f);
    fclose(f);
    if ( got != data.size() )
        return;

    bmp.width = width;
    bmp.height = height;
    bmp.pixels.reserve( (size_t)width * height );
    for(int yy=0;yy<height;++yy)
    {
        const UC* p = &data[scanlinesize*yy];
        for (int xx=0;xx<width;++xx)
        {
            // The file stores the channels in blue, green, red order.
            RGB rgb;
            rgb.b = *p++;
            rgb.g = *p++;
            rgb.r = *p++;
            // Store the decoded pixel; without this the image stayed empty.
            bmp.pixels.push_back(rgb);
        }
    }
}
My task is to restore an mp3 file which is encoded bit by bit in a PNG file. I have already extracted the right bits from the PNG RGB data (per pixel) into a vector. I'm using C++.
I have to go through the PNG file and read the RGB data of each pixel, which gives me 3 decimal values. From the binary representation of each value I need the least significant bit. The first 11 pixels encode the length of the mp3 in 33 bits. Then I decode all of the binary data from the pixels and put it in a vector;
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <vector>
#include <math.h>
#include <iostream>
#include <fstream>
#define PNG_DEBUG 3
#include <png.h>
/*
 * Prints a printf-style error message followed by a newline to stderr and
 * terminates the program. Callers rely on this function not returning (for
 * example, the file handle is used right after the failed-open check).
 */
void abort_(const char * s, ...)
{
    va_list args;
    va_start(args, s);
    vfprintf(stderr, s, args);
    fprintf(stderr, "\n");
    va_end(args);
    abort();
}
/*
 * Appends the binary digits of n to bin, most significant digit first and
 * without leading zeros (0 is appended as a single 0).
 */
void itob(short n, std::vector<int> &bin)
{
    int d = n;
    if (n > 1)
    {
        d = n % 2;
        // Emit the higher digits first so the result reads left to right.
        itob(n / 2, bin);
    }
    // Store the digit; without this the vector never received any bits.
    bin.push_back(d);
}
/*
 * Converts a sequence of bits (most significant bit first) to an unsigned
 * integer. Elements equal to 1 count as set bits, everything else as 0.
 * Bits that do not fit into an unsigned int are discarded; for the 33-bit
 * length header this gives the same value as the previous 2^32 .. 2^0
 * weighting, without floating-point pow() and for any vector length.
 */
void btoi(unsigned int& n, std::vector<int> bin)
{
    n = 0;
    for (std::size_t i = 0; i < bin.size(); i++)
    {
        n = (n << 1) | (bin[i] == 1 ? 1u : 0u);
    }
}
/* Loop indices shared by the functions below. */
int x, y;
/* Image dimensions, read from the PNG in read_png_file(). */
int width, height;
/* Colour type and bit depth of the image, read in read_png_file(). */
png_byte color_type;
png_byte bit_depth;
/* libpng read and info structures, created in read_png_file(). */
png_structp png_ptr;
png_infop info_ptr;
/* Value returned by png_set_interlace_handling(). */
int number_of_passes;
/* One malloc'ed buffer per image row, filled by png_read_image(). */
png_bytep * row_pointers;
void read_png_file()
unsigned char header[8]; // 8 is the maximum size that can be checked
/* open file and test for it being a png */
FILE *fp = fopen("image.png", "rb");
if (!fp)
abort_("[read_png_file] File %s could not be opened for reading", "image.png");
fread(header, 1, 8, fp);
if (png_sig_cmp(header, 0, 8))
abort_("[read_png_file] File %s is not recognized as a PNG file", "image.png");
/* initialize stuff */
png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr)
abort_("[read_png_file] png_create_read_struct failed");
info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr)
abort_("[read_png_file] png_create_info_struct failed");
png_init_io(png_ptr, fp);
png_set_sig_bytes(png_ptr, 8);
png_read_info(png_ptr, info_ptr);
width = png_get_image_width(png_ptr, info_ptr);
height = png_get_image_height(png_ptr, info_ptr);
color_type = png_get_color_type(png_ptr, info_ptr);
bit_depth = png_get_bit_depth(png_ptr, info_ptr);
number_of_passes = png_set_interlace_handling(png_ptr);
png_read_update_info(png_ptr, info_ptr);
row_pointers = (png_bytep*) malloc(sizeof(png_bytep) * height);
for (y=0; y<height; y++)
row_pointers[y] = (png_byte*) malloc(png_get_rowbytes(png_ptr,info_ptr));
png_read_image(png_ptr, row_pointers);
/*
 * Decodes the data hidden in the image held in row_pointers and writes it to
 * result.mp3. The visible code first collects a length header (mHeader starts
 * at 33), converts it to mSize with btoi(), then walks the pixels again for
 * the payload.
 * NOTE(review): the statements that append bits to the vectors and that
 * decrement mHeader / mSize are not visible in this listing - confirm against
 * the full source before relying on the description above.
 */
void process_file(void)
/* Only 3-channel RGB input is accepted: row[x*3] addressing is used below. */
if (png_get_color_type(png_ptr, info_ptr) == PNG_COLOR_TYPE_RGBA)
abort_("[process_file] input file is PNG_COLOR_TYPE_RGB but must be PNG_COLOR_TYPE_RGB "
"(lacks the alpha channel)");
if (png_get_color_type(png_ptr, info_ptr) != PNG_COLOR_TYPE_RGB)
abort_("[process_file] color_type of input file must be PNG_COLOR_TYPE_RGB (%d) (is %d)",
PNG_COLOR_TYPE_RGBA, png_get_color_type(png_ptr, info_ptr));
printf("width: %d\nheight: %d\n", width, height);
/* mHeader: number of header bits still to read; mSize: decoded payload length. */
int mHeader = 33; unsigned int mSize = 0;
std::vector<int> mSizeByBites;
/* First pass: walk the pixels until the length header is complete. */
for (y=0; y<height; y++) {
png_byte* row = row_pointers[y];
for (x=0; x<width; x++) {
png_byte* ptr = &(row[x*3]);
if(mHeader == 0){ break; }
/* b receives the binary digits of the three channel values in turn. */
std::vector<int> b;
itob(ptr[0], b);
itob(ptr[1], b);
itob(ptr[2], b);
if(mHeader == 0){ break; }
/* Print the collected header bits followed by the decoded length. */
for(int i =0; i<mSizeByBites.size(); i++){
printf("%d", mSizeByBites[i]);
btoi(mSize, mSizeByBites);
printf(" = %i\n", mSize);
std::vector<int> mDataBaBites;
/* Second pass over the pixels for the payload. */
/* NOTE(review): this loop starts again at pixel (0, 0), i.e. at the pixels
   that held the length header - confirm whether they should be skipped. */
for (y=0; y<height; y++) {
png_byte* row = row_pointers[y];
for (x=0; x<width; x++) {
/* NOTE(review): mSize is unsigned, so "<= 0" is only true when it equals 0. */
if(mSize <= 0){ break; }
png_byte* ptr = &(row[x*3]);
std::vector<int> b;
itob(ptr[0], b);
if(mSize <= 0){ break; }
itob(ptr[1], b);
if(mSize <= 0){ break; }
itob(ptr[2], b);
if(mSize <= 0){ break; }
printf("%i\n", mSize);
if(mSize<=0){ break; }
std::ofstream output("result.mp3", std::ios::out | std::ios::binary);
printf("[D] Writing to file start: %li\n", mDataBaBites.size());
/* NOTE(review): mDataBaBites holds one int per element; this writes size()
   bytes of that int storage as-is rather than packing eight bits per byte. */
output.write( (char*)(&mDataBaBites[0]), mDataBaBites.size() );
/* Program entry point. Only the return statement is visible in this listing;
   NOTE(review): no calls to read_png_file() or process_file() appear here. */
int main(int argc, char **argv)
return 0;
Now I have no clue how to write it to a file which I can play as an mp3. I tried converting the bits to hexadecimal.
What is the correct format of an mp3 file? How can I write the bits in the correct format?
Try this:
#include <fstream> //For std::min
std::ofstream mp3File( "restored.mp3", std::ios::out | std::ios::binary );
//Assuming rgbData is a char* with the mp3 data,
//and rgbDataSize is its size in bytes
mp3File.write( rgbData, rgbDataSize );
Update: When we (programmers) say "binary representation" we almost always mean bytes, not bits. From your description of the decoding process, I gather you should compare the 3 RGB components for each pixel and keep the minimum as the decoded byte. To do that:
#include <algorithm>
std::vector<char> mDataBaBites;
for (y=0; y<height; y++) {
png_byte* row = row_pointers[y];
for (x=0; x<width; x++) {
png_byte red = row[x*3];
png_byte green = row[x*3 + 1];
png_byte blue = row[x*3 + 2];
png_byte minByte = std::min( std::min(red,green), blue );
mDataBaBites.push_back( minByte );
mSize -= 3;
if(mSize<=0){ break; }
std::ofstream output("result.mp3", std::ios::out | std::ios::binary);
printf("[D] Writing to file start: %li\n", mDataBaBites.size());
output.write( (char*)(&mDataBaBites[0]), mDataBaBites.size() );
Update 2:
std::ofstream output("result.mp3", std::ios::out | std::ios::binary);
printf("[D] Writing to file start: %lu\n", (unsigned long)mDataBaBites.size());
// Pack every 8 decoded bits into one byte and write that byte. Trailing bits
// that do not fill a whole byte are dropped instead of being read past the end
// of the vector.
// NOTE(review): the first bit of each group becomes the least significant bit;
// reverse the shift (7 - j) if the encoder stored the most significant bit first.
for( std::size_t i=0; i+8<=mDataBaBites.size(); i+=8 ){
    char decodedByte = 0;
    for( int j=0; j<8; j++ )
        decodedByte |= ((mDataBaBites[i+j] & 1) << j);
    // Write the byte that was just assembled, not the first vector element.
    output.write( &decodedByte, 1 );
}
If this doesn't work either, you might want to clarify the decoding process definition (which is its source? is there some formal definition?)