Problem of converting bgr to yuv420p with cuda

Problem of converting bgr to yuv420p with cuda - c++

I need to convert image from bgr to yuv420p and I first use OpenCV to do so.
Mat img = imread("1.bmp");
Mat yuvImg;
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);
The result of it is normal. However，my image is too big and its pixel is almost 6400 * 2000.
I find it costs too much time of converting bgr to yuv420p with opencv api cvtcolor.
Then I decide to convert it myself and speed it with cuda.
Here is code in cpu:
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
I test the code bgr_to_yuv420p(...) and the result is also normal.
Then I speed it up with cuda.
Here is all my code include kernel function and test function.
#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
int main(void)
{
Mat srcImage = imread("1.bmp");
imshow("srcImage", srcImage);
const uint imgheight = srcImage.rows;
const uint imgwidth = srcImage.cols;
Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));
//input and output
uchar3 *d_in;
unsigned char *d_out;
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, srcImage.data, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(nv12Image.data, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
imshow("nv12",nv12Image);
imwrite("cuda.bmp",nv12Image);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
The code with cuda can run but the result is not normal. Y of YUV420p is normal but there is something wrong with U and V. I think the reason is here in __global__ void bgr2yuv420p(...)
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I try a lot but still cannot solve it. And I find little code about converting rgb to yuv420p, More codes are about converting yuv420p to rgb. So I want to know is somebody running into the same question or giving me some advice?
Thanks Robert Crovella.Here is my update-1.
I follow Robert Crovella's advice and change the kernel function like this:
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
I test the new kernel with excitement,but the result is also not normal.
Here is my result image with the updated kernel function.
yuv420p image converted by myself
Then the normal result image converted by opencv api is here.
yuv420p image converted by opencv api
As we can see, the difference between the two images is U and V. I have already changed the index of U and V in kernel function, i.e.
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I think it will work but it does not. Any other advice? Robert Crovella
Edit: The solution is Robert Crovella's latest answer. I have double checked it and it is really perfect.

There are a variety of issues:
the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical. Yes, this matters.
Your CPU code has planar Y,U,V storage. That means Y has its own plane, U has its own plane, and V has its own plane. Your GPU codes is semi planar (NV12) format. That means Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the output of those two codes could never match identically.
IMO, there is no need to drag OpenCV into this.
Your UV offset calculation in the kernel (GPU) code was broken. The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point, it is not correct to use row_num*imgwidth to index by row into the UV planar region. You do not have that many rows in the UV planar region, you only have half as many rows.
In your GPU kernel, you had U,V ordering reversed, you were effectively doing VUVUVUVU...
My recommendation would be to start by harmonizing the calculation differences and storage order/format. The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:
$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we
// are using the same conversion calculations in host and device
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Any time you are having trouble with a CUDA code, I recommend
Proper CUDA error checking
Running your code with cuda-memcheck
EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420sp(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
//kernel function to convert bgr to yuv420sp
__global__ void bgr2yuv420sp(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int u_offset = imgwidth*imgheight+((row_num>>1)*(imgwidth>>1))+(col_num>>1);
d_out[u_offset] = bgr2u(r,g,b);
int v_offset = u_offset+((imgheight>>1)*(imgwidth>>1));
d_out[v_offset] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

Related

Implement bokeh blur with C++

We would like to perform bokeh blur on a image. I have tried to test some code below but could not get Circle of Confusion on bright point.
void bokeh(unsigned char *Input, unsigned char *Output, int Width, int Height, int Stride, int Radius)
{
int Channels = Stride / Width;
int rsq = fmax(1, sqrtf(Radius));
for (int y = 0; y < Height; y++)
{
unsigned char * LinePD = Output + y*Stride;
for (int x = 0; x < Width; x++)
{
unsigned int sum[3] = { 0 };
unsigned int weightsum = 0;
for (int ny = std::max(0, y - Radius); ny < std::min(y + Radius, Height); ny++)
{
const unsigned char * sampleLine = Input + ny*Stride;
for (int nx = std::max(0, x - Radius); nx < std::min(x + Radius, Width); nx++)
{
if (sqrtf(nx - x) + sqrtf(ny - y) < rsq)
{
const unsigned char * sample = sampleLine + nx*Channels;
const unsigned char&R = sample[0];
const unsigned char&G = sample[1];
const unsigned char&B = sample[2];
float weight = sqrtf((unsigned char)((21627 * R + 21627 * G + 21627 * B) >> 16));
for (int c = 0; c < Channels; c++)
{
sum[c] += weight*sample[c];
}
weightsum += weight;
}
}
}
for (int c = 0; c < Channels; c++)
{
LinePD[c] = ClampToByte(sum[c] / weightsum);
}
LinePD += Channels;
}
}
}
The source image is:
The result is:
while I expect effect is which like circular in pictures below
seems that I replace sqrtf(nx - x) + sqrtf(ny - y) < rsq
with
powf(nx - x, 2.0) + powf(ny - y, 2.0) < powf(Radius, 2)
and replace float weight = sqrtf((unsigned char)((21627 * R + 21627 * G + 21627 * B) >> 16));
with
float weight = (R + G + B)*1.0f/3.0f;
I could get bokeh blur effect, so how to set the weight to by brightness?

Converting RGBA binary image into YCbCr but the result is not as expected

I have a RGBA image in binary format(.raw) and I am trying to convert the image into YCbCr using C++. However the converted image when viewed using ffplay give me a green image. What could I be doing wrong? I have a code to reproduce the problem I am facing. The input image looks like this: https://drive.google.com/file/d/1oDswYmmSV0pfNe-u8Do06WWVu2v1B-rg/view?usp=sharing and the snapshot of the converted image is https://drive.google.com/file/d/1G8Rut3CXILqbmlGrFQsnushy2CLKu40w/view?usp=sharing. The input RGBA .raw image can be obtained here: https://drive.google.com/file/d/19JhMjRdibGCgaUsE6DBGAXGTRiT2bmTM/view?usp=sharing
#include <fstream>
#include <iostream>
#include <vector>
#include <array>
typedef unsinged char byte;
int main(){
std::ifstream infile;
std::ofstream outfile;
const unsigned width = 1280;
const unsigned height = 720;
std::vector<std::array<byte, 4>> imageBuffer;
std::vector<std::array<byte, 3>> output;
imageBuffer.resize(width*height);
output.resize(width*height);
infile.open("input.raw", std::ios::binary);
if(infile){
infile.read(reinterpret_cast<char*>(&imageBuffer[0]), width*height*4*sizeof(char));
}
for (unsigned y=0; y<height; ++y){
for(unsigned x=0; x<width; ++x){
byte R, G, B, A;
R = imageBuffer[y*width + x][0];
G = imageBuffer[y*width + x][1];
B = imageBuffer[y*width + x][2];
byte Y, Cb, Cr;
Y = 0.257*R + 0.504*G + 0.098*B + 16;
Cb = -0.148*R - 0.291*G + 0.439*B + 128;
Cr = 0.439*R - 0.368*G - 0.071*B + 128;
output[y*width + x][0] = Y;
output[y*width + x][1] = Cb;
output[y*width + x][2] = Cr;
}
}
std::ofstream os("output444.yuv", std::ios::binary);
if(!os)
return false;
os.write(reinterpret_cast<char*>(&output[0]), 1280*720*3*sizeof(char));}

Your code is fine for YUV_4_4_4 8-bit-Packed.
You can view it with YUView: https://github.com/IENT/YUView/releases
and select the settings:
It will display just fine.
However, if you are seeing it Green or whatever wrong colours, it means the program reading it is expecting a different format. Most likely it is expecting planar format which means you need to write all Y bytes first. Then write Cb bytes, then Cr bytes.
So it'd look like (YCbCr_4_4_4_Planar):
YYYY
YYYY
YYYY
CbCbCbCb
CbCbCbCb
CrCrCrCr
CrCrCrCr
instead of packed which looks like (Your code above = YCbCr_4_4_4_Packed/Interleaved):
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
Below I wrote some code that can handle multiple formats. It'll take a RAW image format and convert it to either:
YUV_4_2_2_PLANAR,
YUV_4_2_2_PACKED,
YUV_4_4_4_PLANAR,
YUV_4_4_4_PACKED,
//
// main.cpp
// RAW-To-YUV-Conversion
//
// Created by Brandon on 2021-08-06.
//
#include <iostream>
#include <fstream>
#include <utility>
#include <memory>
#include <vector>
void RGBToYUV(std::uint8_t R, std::uint8_t G, std::uint8_t B, std::uint8_t& Y, std::uint8_t& U, std::uint8_t& V)
{
Y = 0.257 * R + 0.504 * G + 0.098 * B + 16;
U = -0.148 * R - 0.291 * G + 0.439 * B + 128;
V = 0.439 * R - 0.368 * G - 0.071 * B + 128;
}
//void RGBToYUV(std::uint8_t R, std::uint8_t G, std::uint8_t B, std::uint8_t &Y, std::uint8_t &U, std::uint8_t &V)
//{
// #define RGB2Y(r, g, b) (uint8_t)(((66 * (r) + 129 * (g) + 25 * (b) + 128) >> 8) + 16)
// #define RGB2U(r, g, b) (uint8_t)(((-38 * (r) - 74 * (g) + 112 * (b) + 128) >> 8) + 128)
// #define RGB2V(r, g, b) (uint8_t)(((112 * (r) - 94 * (g) - 18 * (b) + 128) >> 8) + 128)
//
// Y = RGB2Y((int)R, (int)G, (int)B);
// U = RGB2U((int)R, (int)G, (int)B);
// V = RGB2V((int)R, (int)G, (int)B);
//}
enum Format
{
YUV_4_2_2_PLANAR,
YUV_4_2_2_PACKED,
YUV_4_4_4_PLANAR,
YUV_4_4_4_PACKED,
};
class RawImage
{
private:
std::unique_ptr<std::uint8_t> pixels;
std::uint32_t width, height;
std::uint16_t bpp;
public:
RawImage(const char* path, std::uint32_t width, std::uint32_t height);
~RawImage() {}
void SaveYUV(const char* path, Format format);
};
RawImage::RawImage(const char* path, std::uint32_t width, std::uint32_t height) : pixels(nullptr), width(width), height(height), bpp(32)
{
std::ifstream file(path, std::ios::in | std::ios::binary);
if (file)
{
std::size_t size = width * height * 4;
file.seekg(0, std::ios::beg);
pixels.reset(new std::uint8_t[size]);
file.read(reinterpret_cast<char*>(pixels.get()), size);
}
}
void RawImage::SaveYUV(const char* path, Format format)
{
std::ofstream file(path, std::ios::out | std::ios::binary);
if (file)
{
if (format == Format::YUV_4_2_2_PLANAR)
{
std::unique_ptr<std::uint8_t> y_plane{new std::uint8_t[width * height]};
std::unique_ptr<std::uint8_t> u_plane{new std::uint8_t[(width * height) >> 1]};
std::unique_ptr<std::uint8_t> v_plane{new std::uint8_t[(width * height) >> 1]};
std::uint8_t* in = pixels.get();
std::uint8_t* y_plane_ptr = y_plane.get();
std::uint8_t* u_plane_ptr = u_plane.get();
std::uint8_t* v_plane_ptr = v_plane.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; j += 2)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y1 = 0;
std::uint8_t U1 = 0;
std::uint8_t V1 = 0;
std::uint8_t Y2 = 0;
std::uint8_t U2 = 0;
std::uint8_t V2 = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y1, U1, V1);
RGBToYUV(in[in_pos + 4], in[in_pos + 5], in[in_pos + 6], Y2, U2, V2);
std::uint8_t U3 = (U1 + U2 + 1) >> 1;
std::uint8_t V3 = (V1 + V2 + 1) >> 1;
*y_plane_ptr++ = Y1;
*y_plane_ptr++ = Y2;
*u_plane_ptr++ = U3;
*v_plane_ptr++ = V3;
}
}
file.write(reinterpret_cast<char*>(y_plane.get()), width * height);
file.write(reinterpret_cast<char*>(u_plane.get()), (width * height) >> 1);
file.write(reinterpret_cast<char*>(v_plane.get()), (width * height) >> 1);
}
else if (format == Format::YUV_4_2_2_PACKED)
{
std::size_t size = width * height * 2;
std::unique_ptr<std::uint8_t> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; j += 2)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y1 = 0;
std::uint8_t U1 = 0;
std::uint8_t V1 = 0;
std::uint8_t Y2 = 0;
std::uint8_t U2 = 0;
std::uint8_t V2 = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y1, U1, V1);
RGBToYUV(in[in_pos + 4], in[in_pos + 5], in[in_pos + 6], Y2, U2, V2);
std::uint8_t U3 = (U1 + U2 + 1) >> 1;
std::uint8_t V3 = (V1 + V2 + 1) >> 1;
std::size_t out_pos = i * (width * 2) + 2 * j;
out[out_pos + 0] = Y1;
out[out_pos + 1] = U3;
out[out_pos + 2] = Y2;
out[out_pos + 3] = V3;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
else if (format == Format::YUV_4_4_4_PLANAR)
{
std::size_t size = width * height * 3;
std::unique_ptr<std::uint8_t> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; ++j)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y = 0;
std::uint8_t U = 0;
std::uint8_t V = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y, U, V);
std::size_t y_pos = i * width + j;
std::size_t u_pos = y_pos + (width * height);
std::size_t v_pos = y_pos + (width * height * 2);
out[y_pos] = Y;
out[u_pos] = U;
out[v_pos] = V;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
else if (format == Format::YUV_4_4_4_PACKED)
{
std::size_t size = width * height * 3;
std::unique_ptr<std::uint8_t> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; ++j)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y = 0;
std::uint8_t U = 0;
std::uint8_t V = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y, U, V);
std::size_t out_pos = i * (width * 3) + 3 * j;
out[out_pos + 0] = Y;
out[out_pos + 1] = U;
out[out_pos + 2] = V;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
}
}
int main(int argc, const char * argv[]) {
RawImage img{"/Users/brandon/Downloads/input.raw", 1280, 720};
img.SaveYUV("/Users/brandon/Downloads/output.yuv", Format::YUV_4_4_4_PACKED);
return 0;
}

You are overwriting the same byte here:
output[y*width + x][0] = Y;
output[y*width + x][0] = Cb;
output[y*width + x][0] = Cr;

C++ Blur effect on bit map is working but colors are changed

In relation to my previous question BitMap_blur efect, i have succeeded to make the bit map blurred but the problem is the colors of the blurred picture has been changed:
Original photo: https://ibb.co/eFHg8G
Blurred photo: https://ibb.co/mQDShb
The code of the blurring algorytm is the same as in my previous question:
for (xx = 0; xx < bitmapInfoHeader.biWidth; xx++)
{
for (yy = 0; yy <bitmapInfoHeader.biHeight; yy++)
{
avgB = avgG = avgR = 0;
Counter = 0;
for (x = xx; x < bitmapInfoHeader.biWidth && x < xx + blurSize; x++)
{
for (y = yy; y < bitmapInfoHeader.biHeight && y < yy + blurSize; y++)
{
avgB += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 0]; //bitmapimage[x][y];
avgG += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 1];
avgR += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 2];
Counter++;
}
}
avgB = avgB / Counter;
avgG = avgG / Counter;
avgR = avgR / Counter;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 0] = avgB;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 1] = avgG;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 2] = avgR;
}
}
So what am doing wrong here?

It actually looks like size of each line is padded to be multiple of 4 bytes. To get correct byte offset of each line you will need to replace
* bitmapInfoHeader.biWidth * 3
with
* (bitmapInfoHeader.biWidth * 3 + padding_bytes_count)
where
padding_bytes_count =
(
(
bitmapFileHeader.bfSize - bitmapFileHeader.bfOffBits
-
bitmapInfoHeader.biWidth * bitmapInfoHeader.biHeight * 3
)
/
bitmapInfoHeader.biHeight
);
For your tiger image padding_bytes_count should be 2.

Here, I create a semi-portable bitmap reader/writer.. Works on Windows, Linux Mint, MacOS High Sierra. I didn't test other platforms.. but it should work.
It has:
Portability
Load 24-bit bitmaps.
Load 32-bit bitmaps.
Write 24-bit bitmaps.
Write 32-bit bitmaps.
Convert between 24-bit and 32-bit bitmaps.
Convert between 32-bit and 24-bit bitmaps.
It doesn't have:
Support for Alpha Transparency. Alpha transparency has special fields and flags required to be set in the header. I don't feel like writing them in so it won't support it.
Only part of it that doesn't seem very portable would be the #pragma pack..
#include <iostream>
#include <fstream>
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#endif
typedef struct
{
uint8_t r, g, b, a;
} rgb32;
#if !defined(_WIN32) && !defined(_WIN64)
#pragma pack(2)
typedef struct
{
uint16_t bfType;
uint32_t bfSize;
uint16_t bfReserved1;
uint16_t bfReserved2;
uint32_t bfOffBits;
} BITMAPFILEHEADER;
#pragma pack()
#pragma pack(2)
typedef struct
{
uint32_t biSize;
int32_t biWidth;
int32_t biHeight;
uint16_t biPlanes;
uint16_t biBitCount;
uint32_t biCompression;
uint32_t biSizeImage;
int16_t biXPelsPerMeter;
int16_t biYPelsPerMeter;
uint32_t biClrUsed;
uint32_t biClrImportant;
} BITMAPINFOHEADER;
#pragma pack()
#endif
#pragma pack(2)
typedef struct
{
BITMAPFILEHEADER bfh;
BITMAPINFOHEADER bih;
} BMPINFO;
#pragma pack()
class bitmap
{
private:
BMPINFO bmpInfo;
uint8_t* pixels;
public:
bitmap(const char* path);
~bitmap();
void save(const char* path, uint16_t bit_count = 24);
rgb32* getPixel(uint32_t x, uint32_t y) const;
void setPixel(rgb32* pixel, uint32_t x, uint32_t y);
uint32_t getWidth() const;
uint32_t getHeight() const;
uint16_t bitCount() const;
};
bitmap::bitmap(const char* path) : bmpInfo(), pixels(nullptr)
{
std::ifstream file(path, std::ios::in | std::ios::binary);
if (file)
{
file.read(reinterpret_cast<char*>(&bmpInfo.bfh), sizeof(bmpInfo.bfh));
if (bmpInfo.bfh.bfType != 0x4d42)
{
throw std::runtime_error("Invalid format. Only bitmaps are supported.");
}
file.read(reinterpret_cast<char*>(&bmpInfo.bih), sizeof(bmpInfo.bih));
if (bmpInfo.bih.biCompression != 0)
{
std::cerr<<bmpInfo.bih.biCompression<<"\n";
throw std::runtime_error("Invalid bitmap. Only uncompressed bitmaps are supported.");
}
if (bmpInfo.bih.biBitCount != 24 && bmpInfo.bih.biBitCount != 32)
{
throw std::runtime_error("Invalid bitmap. Only 24bit and 32bit bitmaps are supported.");
}
file.seekg(bmpInfo.bfh.bfOffBits, std::ios::beg);
pixels = new uint8_t[bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits];
file.read(reinterpret_cast<char*>(&pixels[0]), bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits);
uint8_t* temp = new uint8_t[bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * sizeof(rgb32)];
uint8_t* in = pixels;
rgb32* out = reinterpret_cast<rgb32*>(temp);
int padding = bmpInfo.bih.biBitCount == 24 ? ((bmpInfo.bih.biSizeImage - bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * 3) / bmpInfo.bih.biHeight) : 0;
for (int i = 0; i < bmpInfo.bih.biHeight; ++i, in += padding)
{
for (int j = 0; j < bmpInfo.bih.biWidth; ++j)
{
out->b = *(in++);
out->g = *(in++);
out->r = *(in++);
out->a = bmpInfo.bih.biBitCount == 32 ? *(in++) : 0xFF;
++out;
}
}
delete[] pixels;
pixels = temp;
}
}
bitmap::~bitmap()
{
delete[] pixels;
}
void bitmap::save(const char* path, uint16_t bit_count)
{
std::ofstream file(path, std::ios::out | std::ios::binary);
if (file)
{
bmpInfo.bih.biBitCount = bit_count;
uint32_t size = ((bmpInfo.bih.biWidth * bmpInfo.bih.biBitCount + 31) / 32) * 4 * bmpInfo.bih.biHeight;
bmpInfo.bfh.bfSize = bmpInfo.bfh.bfOffBits + size;
file.write(reinterpret_cast<char*>(&bmpInfo.bfh), sizeof(bmpInfo.bfh));
file.write(reinterpret_cast<char*>(&bmpInfo.bih), sizeof(bmpInfo.bih));
file.seekp(bmpInfo.bfh.bfOffBits, std::ios::beg);
uint8_t* out = NULL;
rgb32* in = reinterpret_cast<rgb32*>(pixels);
uint8_t* temp = out = new uint8_t[bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * sizeof(rgb32)];
int padding = bmpInfo.bih.biBitCount == 24 ? ((bmpInfo.bih.biSizeImage - bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * 3) / bmpInfo.bih.biHeight) : 0;
for (int i = 0; i < bmpInfo.bih.biHeight; ++i, out += padding)
{
for (int j = 0; j < bmpInfo.bih.biWidth; ++j)
{
*(out++) = in->b;
*(out++) = in->g;
*(out++) = in->r;
if (bmpInfo.bih.biBitCount == 32)
{
*(out++) = in->a;
}
++in;
}
}
file.write(reinterpret_cast<char*>(&temp[0]), size); //bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits
delete[] temp;
}
}
rgb32* bitmap::getPixel(uint32_t x, uint32_t y) const
{
rgb32* temp = reinterpret_cast<rgb32*>(pixels);
return &temp[(bmpInfo.bih.biHeight - 1 - y) * bmpInfo.bih.biWidth + x];
}
void bitmap::setPixel(rgb32* pixel, uint32_t x, uint32_t y)
{
rgb32* temp = reinterpret_cast<rgb32*>(pixels);
memcpy(&temp[(bmpInfo.bih.biHeight - 1 - y) * bmpInfo.bih.biWidth + x], pixel, sizeof(rgb32));
};
uint32_t bitmap::getWidth() const
{
return bmpInfo.bih.biWidth;
}
uint32_t bitmap::getHeight() const
{
return bmpInfo.bih.biHeight;
}
uint16_t bitmap::bitCount() const
{
return bmpInfo.bih.biBitCount;
}
void apply_blur(int x, int y, bitmap* bmp, int blurRadius)
{
double blurValue = 0.111;
int r = 0;
int g = 0 ;
int b = 0;
for (int k = y - blurRadius; k <= blurRadius; ++k)
{
for (int l = x - blurRadius; l <= blurRadius; ++l)
{
rgb32* pixel = bmp->getPixel(l, k);
r += blurValue * pixel->r;
g += blurValue * pixel->g;
b += blurValue * pixel->b;
}
}
rgb32 pixel = *bmp->getPixel(x, y);
pixel.r = r;
pixel.g = g;
pixel.b = b;
bmp->setPixel(&pixel, x, y);
}
int main(int argc, const char * argv[])
{
bitmap bmp{"/Users/brandon/Desktop/tiger.bmp"};
bmp.save("/Users/brandon/Desktop/blurred-tiger-24.bmp");
bmp.save("/Users/brandon/Desktop/blurred-tiger-32.bmp", 32);
return 0;
}
Now all you have to do is add your blur algorithm.. I tried it, but couldn't figure out the blurring part.. I ended up porting an algorithm found here: http://blog.ivank.net/fastest-gaussian-blur.html
void blur(bitmap* bmp, int radius)
{
float rs = ceil(radius * 2.57);
for (int i = 0; i < bmp->getHeight(); ++i)
{
for (int j = 0; j < bmp->getWidth(); ++j)
{
double r = 0, g = 0, b = 0;
double count = 0;
for (int iy = i - rs; iy < i + rs + 1; ++iy)
{
for (int ix = j - rs; ix < j + rs + 1; ++ix)
{
auto x = std::min(static_cast<int>(bmp->getWidth()) - 1, std::max(0, ix));
auto y = std::min(static_cast<int>(bmp->getHeight()) - 1, std::max(0, iy));
auto dsq = ((ix - j) * (ix - j)) + ((iy - i) * (iy - i));
auto wght = std::exp(-dsq / (2.0 * radius * radius)) / (M_PI * 2.0 * radius * radius);
rgb32* pixel = bmp->getPixel(x, y);
r += pixel->r * wght;
g += pixel->g * wght;
b += pixel->b * wght;
count += wght;
}
}
rgb32* pixel = bmp->getPixel(j, i);
pixel->r = std::round(r / count);
pixel->g = std::round(g / count);
pixel->b = std::round(b / count);
}
}
}
int main(int argc, const char * argv[])
{
bitmap bmp{"/Users/brandon/Desktop/tiger.bmp"};
blur(&bmp, 5);
bmp.save("/Users/brandon/Desktop/blurred-tiger.bmp");
return 0;
}
The result becomes:

Since iam only applying the blur effect on 24-bitmaps, I add the padding thing and modified my 3th and 4th loop:
for (x = xx; x < bitmapInfoHeader.biWidth && x < xx + blurSize; **x+=3**)
{
for (y = yy; y < bitmapInfoHeader.biHeight && y < yy + blurSize; **y+=3**)
And it works! the photo still have a weard thin line on the left but i think this is a read/write bitmap problem and i can handle it myself :)
The blurred photo: https://ibb.co/iGp9Cb and another blurred picture: https://ibb.co/jFXUCb
Thank you guys for your answers! it helped alot

SDL2.0 screen nullptr on render of Window

Hey so I'm relatively new to the SDL library and just trying to get to grips with it.
I found a C++ conversion for Minecraft4k but it was based on SDL1.x so I'm trying to convert it to SDL2.0
At present the build is successful, but when it gets to;
plot(x, y, rgbmul(col, fxmul(br, ddist)));
It throws a read access violation exception:
screen was nullptr
This is my code;
// C++ port of Minecraft 4k JS (http://jsdo.it/notch/dB1E)
// By The8BitPimp
// See: the8bitpimp.wordpress.com
#include <SDL.h>
#include <math.h>
#include <windows.h>
#include <tchar.h>
#include "plot.h"
#include "llist.h"
const int w = 320;
const int h = 240;
SDL_Surface *screen = nullptr;
const float math_pi = 3.14159265359f;
static inline float math_sin(float x) {
return sinf(x);
}
static inline float math_cos(float x) {
return cosf(x);
}
// the texture map
int texmap[16 * 16 * 16 * 3];
// the voxel map
char map[64 * 64 * 64];
static inline int random(int max) {
return (rand() ^ (rand() << 16)) % max;
}
static inline void plot(int x, int y, int c) {
int *p = (int*)screen->pixels;
p[y * w + x] = c;
}
static void makeTextures(void) {
// each texture
for (int j = 0; j<16; j++) {
int k = 255 - random(96);
// each pixel in the texture
for (int m = 0; m<16 * 3; m++)
for (int n = 0; n<16; n++) {
int i1 = 0x966C4A;
int i2 = 0;
int i3 = 0;
if (j == 4)
i1 = 0x7F7F7F;
if ((j != 4) || (random(3) == 0))
k = 255 - random(96);
if (j == 1)
{
if (m < (((n * n * 3 + n * 81) >> 2) & 0x3) + 18)
i1 = 0x6AAA40;
else if (m < (((n * n * 3 + n * 81) >> 2) & 0x3) + 19)
k = k * 2 / 3;
}
if (j == 7)
{
i1 = 0x675231;
if ((n > 0) && (n < 15) && (((m > 0) && (m < 15)) || ((m > 32) && (m < 47))))
{
i1 = 0xBC9862;
i2 = n - 7;
i3 = (m & 0xF) - 7;
if (i2 < 0)
i2 = 1 - i2;
if (i3 < 0)
i3 = 1 - i3;
if (i3 > i2)
i2 = i3;
k = 196 - random(32) + i2 % 3 * 32;
}
else if (random(2) == 0)
k = k * (150 - (n & 0x1) * 100) / 100;
}
if (j == 5)
{
i1 = 0xB53A15;
if (((n + m / 4 * 4) % 8 == 0) || (m % 4 == 0))
i1 = 0xBCAFA5;
}
i2 = k;
if (m >= 32)
i2 /= 2;
if (j == 8)
{
i1 = 5298487;
if (random(2) == 0)
{
i1 = 0;
i2 = 255;
}
}
// fixed point colour multiply between i1 and i2
i3 =
((((i1 >> 16) & 0xFF) * i2 / 255) << 16) |
((((i1 >> 8) & 0xFF) * i2 / 255) << 8) |
((i1 & 0xFF) * i2 / 255);
// pack the colour away
texmap[n + m * 16 + j * 256 * 3] = i3;
}
}
}
static void makeMap(void) {
// add random blocks to the map
for (int x = 0; x < 64; x++) {
for (int y = 0; y < 64; y++) {
for (int z = 0; z < 64; z++) {
int i = (z << 12) | (y << 6) | x;
float yd = (y - 32.5) * 0.4;
float zd = (z - 32.5) * 0.4;
map[i] = random(16);
float th = random(256) / 256.0f;
if (th > sqrtf(sqrtf(yd * yd + zd * zd)) - 0.8f)
map[i] = 0;
}
}
}
}
static void init(void) {
makeTextures();
makeMap();
}
// fixed point byte byte multiply
static inline int fxmul(int a, int b) {
return (a*b) >> 8;
}
// fixed point 8bit packed colour multiply
static inline int rgbmul(int a, int b) {
int _r = (((a >> 16) & 0xff) * b) >> 8;
int _g = (((a >> 8) & 0xff) * b) >> 8;
int _b = (((a)& 0xff) * b) >> 8;
return (_r << 16) | (_g << 8) | _b;
}
static void render(void) {
float now = (float)(SDL_GetTicks() % 10000) / 10000.f;
float xRot = math_sin(now * math_pi * 2) * 0.4 + math_pi / 2;
float yRot = math_cos(now * math_pi * 2) * 0.4;
float yCos = math_cos(yRot);
float ySin = math_sin(yRot);
float xCos = math_cos(xRot);
float xSin = math_sin(xRot);
float ox = 32.5 + now * 64.0;
float oy = 32.5;
float oz = 32.5;
// for each column
for (int x = 0; x < w; x++) {
// get the x axis delta
float ___xd = ((float)x - (float)w / 2.f) / (float)h;
// for each row
for (int y = 0; y < h; y++) {
// get the y axis delta
float __yd = ((float)y - (float)h / 2.f) / (float)h;
float __zd = 1;
float ___zd = __zd * yCos + __yd * ySin;
float _yd = __yd * yCos - __zd * ySin;
float _xd = ___xd * xCos + ___zd * xSin;
float _zd = ___zd * xCos - ___xd * xSin;
int col = 0;
int br = 255;
float ddist = 0;
float closest = 32.f;
// for each principle axis x,y,z
for (int d = 0; d < 3; d++) {
float dimLength = _xd;
if (d == 1)
dimLength = _yd;
if (d == 2)
dimLength = _zd;
float ll = 1.0f / (dimLength < 0.f ? -dimLength : dimLength);
float xd = (_xd)* ll;
float yd = (_yd)* ll;
float zd = (_zd)* ll;
float initial = ox - floor(ox);
if (d == 1) initial = oy - floor(oy);
if (d == 2) initial = oz - floor(oz);
if (dimLength > 0) initial = 1 - initial;
float dist = ll * initial;
float xp = ox + xd * initial;
float yp = oy + yd * initial;
float zp = oz + zd * initial;
if (dimLength < 0) {
if (d == 0) xp--;
if (d == 1) yp--;
if (d == 2) zp--;
}
// while we are concidering a ray that is still closer then the best so far
while (dist < closest) {
// quantize to the map grid
int tex = map[(((int)zp & 63) << 12) | (((int)yp & 63) << 6) | ((int)xp & 63)];
// if this voxel has a texture applied
if (tex > 0) {
// find the uv coordinates of the intersection point
int u = ((int)((xp + zp) * 16.f)) & 15;
int v = ((int)(yp * 16.f) & 15) + 16;
// fix uvs for alternate directions?
if (d == 1) {
u = ((int)(xp * 16.f)) & 15;
v = (((int)(zp * 16.f)) & 15);
if (yd < 0)
v += 32;
}
// find the colour at the intersection point
int cc = texmap[u + v * 16 + tex * 256 * 3];
// if the colour is not transparent
if (cc > 0) {
col = cc;
ddist = 255 - ((dist / 32 * 255));
br = 255 * (255 - ((d + 2) % 3) * 50) / 255;
// we now have the closest hit point (also terminates this ray)
closest = dist;
}
}
// advance the ray
xp += xd;
yp += yd;
zp += zd;
dist += ll;
}
}
plot(x, y, rgbmul(col, fxmul(br, ddist)));
}
}
}
int main(int argc, char *argv[]) {
SDL_Init(SDL_INIT_EVERYTHING);
SDL_Window *screen;
screen = SDL_CreateWindow(
"Minecraft4k", // window title
SDL_WINDOWPOS_CENTERED, // initial x position
SDL_WINDOWPOS_CENTERED, // initial y position
320, // width, in pixels
240, // height, in pixels
SDL_WINDOW_OPENGL // flags - see below
);
SDL_Renderer* renderer;
renderer = SDL_CreateRenderer(screen, -1, SDL_RENDERER_ACCELERATED);
if (screen == nullptr) {
return 1;
}
init();
bool running = true;
while (running) {
SDL_Event event;
while (SDL_PollEvent(&event)) {
running &= (event.type != SDL_QUIT);
}
SDL_RenderPresent(renderer);
render();
}
SDL_DestroyWindow(screen);
SDL_Quit();
return 0;
}
When I actually run the code I do get a black screen, but the debugger lands on the line
plot(x, y, rgbmul(col, fxmul(br, ddist)));
in ;
static void render(void)
This is all just "for fun" so any information or guidance is appreciated.

You define screen twice (the first time as a global variable, the second time within your main), but you initialize it only once (within your main).
Because of that, the global variable screen actually is set to nullptr and plot fails trying to use it, as the error message states.

How to optimize YUV to RGB color conversion code

I have written a function to convert an image in YUV420P to RGB but it is taking 30 millisecond to convert an image (size: 1280 x 720) into RGB, but when I am using ffmpeg function ( as this) to convert YUV image into RGB its taking only 2 millisecond for the same image. What is the problem with my code ? How can I optimize the code that I have written ??
My code is given below
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
for (int i = 0; i<origImage->height; i++)
{
for (int j=0; j<origImage->width; j++)
{
float Y = data[i*step + j];
float U = data[ (int)(size + (i/2)*(step/2) + j/2) ];
float V = data[ (int)(size*1.25 + (i/2)*(step/2) + j/2)];
float R = Y + 1.402 * (V - 128);
float G = Y - 0.344 * (U - 128) - 0.714 * (V - 128);
float B = Y + 1.772 * (U - 128);
if (R < 0){ R = 0; } if (G < 0){ G = 0; } if (B < 0){ B = 0; }
if (R > 255 ){ R = 255; } if (G > 255) { G = 255; } if (B > 255) { B = 255; }
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}

Here, try this(should reduce to 25 milliseconds):
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
int stepDb2=step /2;
float sizeMb1d25=size*1.25 ;
int origImagePTheight=origImage->height;
int origImagePTwidth=origImage->width;
for (int i = 0; i<origImagePTheight; i++)
{
float idb2=i/2;
int iStep=i*step;
for (int j=0; j<origImagePTwidth; j++)
{
float variable=idb2*stepDb2 + j/2;
float Y = data[iStep + j];
float U = -128 + data[ (int)(size + variable) ];
float V = -128 + data[ (int)(sizeMb1d25 + variable)];
float R = Y + 1.402 * V ;
float G = Y - 0.344 * U - 0.714 * V;
float B = Y + 1.772 * U;
R= R * !(R<0);
G= G * !(G<0);
B= B * !(B<0);
R=R*(!(R>255)) + 255 * (R>255);
G=G*(!(G>255)) + 255 * (G>255);
B=B*(!(B>255)) + 255 * (B>255);
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Problem of converting bgr to yuv420p with cuda - c++

Related

Implement bokeh blur with C++

Converting RGBA binary image into YCbCr but the result is not as expected

C++ Blur effect on bit map is working but colors are changed

SDL2.0 screen nullptr on render of Window

How to optimize YUV to RGB color conversion code

Categories

Resources