Set Pixel colour in an array - c++

I have an array of pixels stored in a vector as follows:
typedef union RGBA
{
std::uint32_t Colour;
struct
{
std::uint8_t R, G, B, A;
};
} *PRGB;
std::vector<RGBA> Pixels; //My pixels are read into this vector.
I process it using the following two functions. One is for reading, the other is for writing.
The read function takes an array of bytes and flips them and stores them into the struct above. It takes padding into consideration so it works for both 24 and 32 bit bitmaps. The write function flips it back and writes it to an array of bytes.
void ReadPixels(const std::uint8_t* In, RGBA* Out)
{
for (std::size_t I = 0; I < height; ++I)
{
for (std::size_t J = 0; J < width; ++J)
{
Out[(height - 1 - I) * width + J].B = *(In++);
Out[(height - 1 - I) * width + J].G = *(In++);
Out[(height - 1 - I) * width + J].R = *(In++);
Out[(height - 1 - I) * width + J].A = (BitsPerPixel > 24 ? * (In++) : 0xFF);
}
if(BitsPerPixel == 24)
In += (-width * 3) & 3;
}
}
void WritePixels(const RGBA* In, std::uint8_t* Out)
{
for (std::size_t I = 0; I < height; ++I)
{
for (std::size_t J = 0; J < width; ++J)
{
*(Out++) = In[(height - 1 - I) * width + J].B;
*(Out++) = In[(height - 1 - I) * width + J].G;
*(Out++) = In[(height - 1 - I) * width + J].R;
if (BitsPerPixel > 24)
*(Out++) = In[(height - 1 - I) * width + J].A;
}
if(BitsPerPixel == 24)
Out += (-width * 3) & 3;
}
}
The thing is, if I want to change just one pixel in the array, I have to flip and copy the whole image into the vector, change the pixel using:
inline void SetPixel(int X, int Y, std::uint32_t Color)
{
Pixels[Y * width + X].Colour = Color;
}
And then flip it back into the array. Is there a better way to change a single pixel in the array without having to do this every single time?
I tried this formula (so that padding is taken into consideration):
ByteArray[((height - 1 - Y) * width + X) + (Y * ((-width * 3) & 3))] = Color;
But it doesn't work. Any ideas?

Your subscript->index formula looks all wrong.
Perhaps:
int stride = width * BitsPerPixel/8;
stride = ((stride - 1) & ~3) + 4; // round up to multiple of 4 bytes
RGBQUAD& selected_pixel = *reinterpret_cast<RGBQUAD*>(array + stride * (height - 1 - Y)) + X * BitsPerPixel/8);
selected_pixel.R = ...
...

Related

Problems with BMP

Wrote a simple BMP generation code but instead of outputting what I want (red that gradually goes to black (from left to right)) it returns an image with some some weird layout far from what I expect to see. Was inspecting the way header and pixels written in the memory and everything seems alright
Output
#include <iostream>
#include <fstream>
size_t width = 1000, height = 1000, CPP = 3;
//this creates an array of pixels
//supposed to be from red to black from left to right
uint8_t* createBitMapI(const size_t& width, const size_t& height)
{
uint8_t _color[3] = { 255, 0, 0 };
uint8_t* bitMap = new uint8_t[width * height * 3];
//creating one row
for (float i = 0; i < width; i++)
{
*(bitMap + (int)(i * 3)) = (uint8_t)_color[0] * (1 - i / width);
*(bitMap + 1 + (int)(i * 3)) = 0;//(uint8_t)_color[1] * (1 - i / width);
*(bitMap + 2 + (int)(i * 3)) = 0;//(uint8_t)_color[2] * (1 - i / width);
}
//copying previously created row to others
for (size_t i = 1; i < height; i++)
{
memcpy(bitMap + (width * 3) * i, bitMap, width * 3);
}
return bitMap;
}
//creates BMP file and writes contents into it
class BMP
{
public:
BMP(const size_t& width, const size_t& height,
uint8_t* arr, const char* name)
{
const char pad_[3] = { 0, 0, 0 };
char padding = (4 - width % 4) % 4;
fullSize = (width + padding) * height * 3 + 54;
image.open(name);
writeHeader();
image.write((const char*)header, 54);
for (size_t i = 0; i < height; i++)
{
image.write((const char*)arr + (i * width), width * 3);
image.write(pad_, padding);
}
image.close();
}
void writeHeader()
{
memcpy(header, "BM", 2);
*(size_t*)(header + 2) = fullSize;
*(size_t*)(header + 10) = 54;
*(size_t*)(header + 14) = 40;
*(size_t*)(header + 18) = width;
*(size_t*)(header + 22) = height;
*(uint16_t*)(header + 26) = 1;
*(uint16_t*)(header + 28) = 24;
}
private:
std::ofstream image;
uint8_t header[54];
uint8_t* pixels;
size_t fullSize;
};
int main()
{
uint8_t* arr = createBitMapI(width, height);
BMP newImage(width, height, arr, "image.bmp");
delete[] arr;
}
upd: changing image.open(name) to image.open(name, std::ios::binary) gives us output2

Problem of converting bgr to yuv420p with cuda

I need to convert image from bgr to yuv420p and I first use OpenCV to do so.
Mat img = imread("1.bmp");
Mat yuvImg;
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);
The result of it is normal. However,my image is too big and its pixel is almost 6400 * 2000.
I find it costs too much time of converting bgr to yuv420p with opencv api cvtcolor.
Then I decide to convert it myself and speed it with cuda.
Here is code in cpu:
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
I test the code bgr_to_yuv420p(...) and the result is also normal.
Then I speed it up with cuda.
Here is all my code include kernel function and test function.
#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
int main(void)
{
Mat srcImage = imread("1.bmp");
imshow("srcImage", srcImage);
const uint imgheight = srcImage.rows;
const uint imgwidth = srcImage.cols;
Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));
//input and output
uchar3 *d_in;
unsigned char *d_out;
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, srcImage.data, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(nv12Image.data, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
imshow("nv12",nv12Image);
imwrite("cuda.bmp",nv12Image);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
The code with cuda can run but the result is not normal. Y of YUV420p is normal but there is something wrong with U and V. I think the reason is here in __global__ void bgr2yuv420p(...)
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I try a lot but still cannot solve it. And I find little code about converting rgb to yuv420p, More codes are about converting yuv420p to rgb. So I want to know is somebody running into the same question or giving me some advice?
Thanks Robert Crovella.Here is my update-1.
I follow Robert Crovella's advice and change the kernel function like this:
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
I test the new kernel with excitement,but the result is also not normal.
Here is my result image with the updated kernel function.
yuv420p image converted by myself
Then the normal result image converted by opencv api is here.
yuv420p image converted by opencv api
As we can see, the difference between the two images is U and V. I have already changed the index of U and V in kernel function, i.e.
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I think it will work but it does not. Any other advice? Robert Crovella
Edit: The solution is Robert Crovella's latest answer. I have double checked it and it is really perfect.
There are a variety of issues:
the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical. Yes, this matters.
Your CPU code has planar Y,U,V storage. That means Y has its own plane, U has its own plane, and V has its own plane. Your GPU codes is semi planar (NV12) format. That means Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the output of those two codes could never match identically.
IMO, there is no need to drag OpenCV into this.
Your UV offset calculation in the kernel (GPU) code was broken. The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point, it is not correct to use row_num*imgwidth to index by row into the UV planar region. You do not have that many rows in the UV planar region, you only have half as many rows.
In your GPU kernel, you had U,V ordering reversed, you were effectively doing VUVUVUVU...
My recommendation would be to start by harmonizing the calculation differences and storage order/format. The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:
$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we
// are using the same conversion calculations in host and device
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Any time you are having trouble with a CUDA code, I recommend
Proper CUDA error checking
Running your code with cuda-memcheck
EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420sp(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
//kernel function to convert bgr to yuv420sp
__global__ void bgr2yuv420sp(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int u_offset = imgwidth*imgheight+((row_num>>1)*(imgwidth>>1))+(col_num>>1);
d_out[u_offset] = bgr2u(r,g,b);
int v_offset = u_offset+((imgheight>>1)*(imgwidth>>1));
d_out[v_offset] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

How to copy a pixel buffer into a larger one

I’m trying to copy a set of pixels into a larger pixel buffer. Clearly, I’m calculating the coordinates wrong, probably with the strides, but I can’t find what I’m missing. I get a completely messed up result.
So in essence, what I try to achieve is to copy a RGBA pixel array into a larger RGB array (alpha gets discarded), keeping the strides correctly. See the image below for a visual representation of the expected result.
void draw(const uint8_t* tileRGBA, uint8_t* canvasRGB, const int canvasStride,
const int tileWidth, const int tileHeight)
{
for (int y = 0; y < tileHeight; y++)
{
for (int x = 0; x < tileWidth; x++)
{
long tileIndex = (4 * x) + (y * tileWidth);
long canvasIndex = (3 * x) + (y * canvasStride);
canvasRGB[canvasIndex] = tileRGBA[tileIndex];
canvasRGB[canvasIndex + 1] = tileRGBA[tileIndex + 1];
canvasRGB[canvasIndex + 2] = tileRGBA[tileIndex + 2];
}
}
}
uint8_t* test(uint32_t* tileRGBA, const int tileWidth, const int tileHeight)
{
int canvasStride = tileWidth * 5; // 5 is just an arbitrary value for this example, in this case a canvas 5 times the width of the tile
uint8_t* canvasRGB = new uint8_t[canvasStride * tileHeight * 3];
draw((uint8_t*)tileRGBA, canvasRGB, canvasStride, tileWidth, tileHeight);
return canvasRGB;
}
SOLVED: Thanks to the comments of Johnny Mopp. It was a matter of brackets.
This:
long tileIndex = (4 * x) + (y * tileWidth)
long canvasIndex = (3 * x) + (y * canvasStride);
Must be really this:
long tileIndex = 4 * (x + y * tileWidth);
long canvasIndex = 3 * (x + y * canvasStride);
The problem is in your calculation of the indexes.
To get a 2D index (row,col) in a 1D array you would do:
index = ((number_of_columns * row) + col) * sizeof(data)
Where number_of_columns is the intended number of "columns" in the data - in this case the width of the image. And sizeof(data) is the size in bytes of one item in the array - in this case, 4 bytes for RGBA and 3 for RGB. So, as you have determined, it should be:
long tileIndex = 4 * (x + y * tileWidth);
long canvasIndex = 3 * (x + y * canvasStride);
You can do away with the sizeof multiplication if you can represent the data as a single item. For example, in your case, create 2 structs:
struct RGB {
uint8_t r,g,b;
};
struct RGBA {
uint8_t r,g,b,a;
};
Then pass the parameters as arrays of these structs:
void draw(const RGBA* tileRGBA, RGB* canvasRGB, const int canvasStride, const int tileWidth, const int tileHeight)
Then the calculation simplifies to:
long tileIndex = x + y * tileWidth;
long canvasIndex = x + y * canvasStride;

implementing de castlejau algorithm c++

The program runs but the curved line isn't being displayed .
Here is my code and note, I have 4 vertices in an array.
void GLWidget::drawControlPolygon(){
for (int i = 0; i < vertices.size()-1;i++){
drawEdge(vertices[i], vertices[i+1], RGBValue(0,0,0));
}
}
void GLWidget::drawDeCasteljau(float t) {
Point p;
int N_PTS = 4;
p.x = pow((1-t),3)*vertices[0].x+3* t * pow((1 -t), 2) * vertices[1].x + 3 * (1-t)*pow(t,2)*vertices[2].x+ pow (t, 3)*vertices[3].x;
p.y = pow((1-t),3)*vertices[0].y+3* t * pow((1 -t), 2) * vertices[1].y + 3 * (1-t)*pow(t,2)*vertices[2].y+ pow (t, 3)*vertices[3].y;
p.z = pow((1-t),3)*vertices[0].z+3* t * pow((1 -t), 2) * vertices[1].z + 3 * (1-t)*pow(t,2)*vertices[2].z+ pow (t, 3)*vertices[3].z;
int bezPoints[3][3] ;
for (float u = 0.0; u <= 1.0; u += t) {
for (int diag = N_PTS-2; diag >= 0; diag--) {
for (int i = 0; i <= diag; i++) {
int j = diag - i;
bezPoints[i][j] = (1.0-u)*bezPoints[i][j+1] + u*bezPoints[i+1][j];
}
}
// set the pixel for this parameter value
//Set pixel method for theImage object.
// void setPixel(Index row, Index col, Byte red, Byte green, Byte blue, Byte alpha=255);
// void setPixel(Index row, Index col, RGBValue colour, Byte alpha = 255);
theImage.setPixel(bezPoints[0], bezPoints[0][0], RGBValue());
}
}
void GLWidget::drawBezierCurve() {
}
for the full class here is the link to it...
https://www.dropbox.com/s/j6jw51uhz30m3tb/testApp.cc?dl=0
So far the output looks like this
Thanks!

Bitmap 24 to 32 and back

This may be a long post but I really need to know how to Convert between 24 and 32 bit bitmaps. For the sake of the length of this post, I removed the PNG part of my question.
Here goes:
I have a struct like the one below that holds all pixel information:
typedef union RGB
{
uint32_t Color;
struct
{
unsigned char B, G, R, A;
} RGBA;
} *PRGB;
std::vector<RGB> Pixels; //Holds all pixels.
All of the bitmap writing works except when going from 24 to 32 or vice-versa. I don't know what I'm doing wrong or why 24-32 conversions don't work. My bitmap reading and writing code is as follows:
Bitmap(const void* Pointer, int Width, int Height, uint32_t BitsPerPixel) //Constructor initialization here...
{
Pixels.clear();
if (Pointer == nullptr) {throw std::logic_error("Null Pointer Exception. Pointer is NULL.");}
if (Width < 1 || Height < 1) {throw std::invalid_argument("Invalid Arguments. Width and Height cannot equal 0.");}
std::memset(&Info, 0, sizeof(BITMAPINFO));
size = ((width * BitsPerPixel + 31) / 32) * 4 * height;
Info.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
Info.bmiHeader.biWidth = width;
Info.bmiHeader.biHeight = height;
Info.bmiHeader.biPlanes = 1;
Info.bmiHeader.biBitCount = BitsPerPixel;
Info.bmiHeader.biCompression = BI_RGB;
Info.bmiHeader.biSizeImage = size;
bFileHeader.bfType = 0x4D42;
bFileHeader.bfOffBits = sizeof(BITMAPFILEHEADER) + sizeof(Info.bmiHeader);
bFileHeader.bfSize = bFileHeader.bfOffBits + size;
const unsigned char* BuffPos = static_cast<const unsigned char*>(Pointer);
height = (height < 0 ? -height : height);
Pixels.resize(width * height);
for (int I = 0; I < height; I++)
{
for (int J = 0; J < width; J++)
{
Pixels[(height - 1 - I) * width + J].RGBA.B = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.G = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.R = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.A = (Info.bmiHeader.biBitCount > 24 ? *(BuffPos++) : 0);
}
if(Info.bmiHeader.biBitCount == 24)
BuffPos += width % 4;
}
}
bool SaveBitmap(const char* FilePath)
{
std::vector<unsigned char> ImageData(size);
unsigned char* BuffPos = ImageData.data();
for (int I = 0; I < height; ++I)
{
for (int J = 0; J < width; ++J)
{
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.B;
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.G;
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.R;
if (Info.bmiHeader.biBitCount > 24)
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.A;
}
if(Info.bmiHeader.biBitCount == 24)
BuffPos += width % 4;
}
std::fstream hFile(FilePath, std::fstream::out | std::ofstream::binary);
if (!hFile.is_open()) return false;
hFile.write(reinterpret_cast<char*>(&bFileHeader), sizeof(BITMAPFILEHEADER));
hFile.write(reinterpret_cast<char*>(&Info.bmiHeader), sizeof (BITMAPINFOHEADER));
hFile.write(reinterpret_cast<char*>(&ImageData[0]), Size());
hFile.close();
return true;
}
Any idea what the two problems could be? I want it so that if I called Bitmap(24BmpBuff, W, H, 32); It'll save as 32. If I do Bitmap(32BmpBuff, W, H, 24) it'll save as 24 bit. I just can't see it so I'm hoping one of you will.
I also tried making helper functions:
Convert From 24 bit to 32 bit.
void T24To32(std::vector<RGB> &Input, std::vector<RGB> &Output, int Width, int Height)
{
Output.resize(Input.size());
for (int I = 0; I < Height; ++I)
{
for (int J = 0; J < Width; ++J)
{
Output[J].RGBA.B = Input[J].RGBA.B;
Output[J].RGBA.G = Input[J].RGBA.G;
Output[J].RGBA.R = Input[J].RGBA.R;
Output[J].RGBA.A = 0;
}
}
}
Take the unsigned char* of pixels and store them upside down within the struct.
void Pack(int width, int height, int BPP, unsigned char* Input, std::vector<RGB> &Pixels)
{
unsigned char* BuffPos = Input;
height = (height < 0 ? -height : height);
Pixels.resize(width * height);
for (int I = 0; I < height; I++)
{
for (int J = 0; J < width; J++)
{
Pixels[(height - 1 - I) * width + J].RGBA.B = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.G = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.R = *(BuffPos++);
Pixels[(height - 1 - I) * width + J].RGBA.A = (BPP > 24 ? *(BuffPos++) : 0);
}
if(BPP == 24)
BuffPos += width % 4;
}
}
Take the struct of pixels and store them upright in the unsigned char*.
void Unpack(int width, int height, int BPP, std::vector<RGB> Pixels, unsigned char* &Output)
{
unsigned char* BuffPos = Output;
for (int I = 0; I < height; ++I)
{
for (int J = 0; J < width; ++J)
{
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.B;
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.G;
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.R;
if (BPP > 24)
*(BuffPos++) = Pixels[(height - 1 - I) * width + J].RGBA.A;
}
if(BPP == 24)
BuffPos += width % 4;
}
}
I use all of the above like so.. Input image(32 bit):
Code:
void Bitmap32ToBitmap24(int Width, int Height)
{
Bitmap Image("C:/Images/Bitmap32.bmp");
std::vector<unsigned char> Pixels(((Width * 32 + 31) / 32) * 4 * Height); //Array large enough to hold 32 bit bmp.
unsigned char* BuffPos = Pixels.data();
Unpack(Width, Height, 32, Image.Get(), BuffPos); //Fill the array of unsigned char with image pixels being upright
Bitmap BMP(Pixels.data(), Width, Height, 24); //Convert image to 24 bit bmp and save it.
BMP.Save("C:/Images/Output/Bitmap32ToBitmap24.png");
}
Output image (24 bit):
24 to 32 results in:
In all your code snippets
if(Info.bmiHeader.biBitCount == 24)
BuffPos += width % 4;
or
if(BPP == 24)
BuffPos += width % 4;
occur. I assume this should add the padding value to each line. But it isn't the padding, it is the number of pixels per line %4.
The correct adding value is (4 - ((width * 3) % 4)) % 4. The width*3 is the number of bytes in that line. The %4 calculates the number of bytes which are to many for a 4 byte padding, but to fill up to the next higher limes we need 4-this value. This again is 4 if no padding offset is needed -> %4 to avoid that.
A faster way to compute the same value is (-width * 3) & 3. See wiki.