GDIPlus::Bitmap Brightening Image c++ - c++

The aim of the following function is to get the R,G,B values of each pixel from a Bitmap loaded from file and increase them by 10.
void PerformTransformation(Gdiplus::Bitmap* bitmap, LPCTSTR SaveFileName) {
Gdiplus::BitmapData* bitmapData = new Gdiplus::BitmapData;
UINT Width = bitmap->GetWidth();
UINT Height = bitmap->GetHeight();
Gdiplus::Rect rect(0, 0,Width,Height );
bitmap->LockBits(&rect, Gdiplus::ImageLockModeRead, PixelFormat32bppARGB, bitmapData);
byte* pixels = (byte*)bitmapData->Scan0;
INT iStride = abs(bitmapData->Stride);
for (UINT col = 0; col < Width; ++col)
for (UINT row = 0; row < Height; ++row)
{
unsigned int curColor = pixels[row * iStride / 4 + col];
int b = curColor & 0xff;
int g = (curColor & 0xff00) >> 8;
int r = (curColor & 0xff0000) >> 16;
if ((r + 10) > 255) r = 255; else r += 10;
if ((g + 10) > 255) g = 255; else g += 10;
if ((b + 10) > 255) b = 255; else b += 10;
pixels[curColor & 0xff ] = b;
pixels[curColor & 0xff00 >> 8] = g;
pixels[curColor & 0xff0000 >> 16] = r;
}
bitmap->UnlockBits(bitmapData);
CLSID pngClsid;
GetEncoderClsid(L"image/png", &pngClsid);
bitmap->Save(SaveFileName, &pngClsid, NULL);
}
However when checking the save file, the brightness has not increased. I have tried to increase the values to update each R,G,B value to be 100 each but the image remains the same, Seems like i'm not setting the new values correctly.
Can anyone show me what im doing wrong?
EDIT:
After following some guidance i now have the image brightening but only brightening a quarter of the image.
Changed Code
void PerformTransformation(Gdiplus::Bitmap* bitmap, LPCTSTR SaveFileName) {
Gdiplus::BitmapData* bitmapData = new Gdiplus::BitmapData;
UINT Width = bitmap->GetWidth();
UINT Height = bitmap->GetHeight();
Gdiplus::Rect rect(0, 0,Width,Height );
// Lock a 5x3 rectangular portion of the bitmap for reading.
bitmap->LockBits(&rect, Gdiplus::ImageLockModeWrite,
PixelFormat32bppARGB, bitmapData);
byte* Pixels = (byte*)bitmapData->Scan0;
INT stride_bytes_count = abs(bitmapData->Stride);
UINT row_index, col_index;
byte pixel[4];
for (col_index = 0; col_index < Width; ++col_index) {
for (row_index = 0; row_index < Height; ++row_index)
{
unsigned int curColor = Pixels[row_index * stride_bytes_count /
4 + col_index];
int b = curColor & 0xff;
int g = (curColor & 0xff00) >> 8;
int r = (curColor & 0xff0000) >> 16;
if ((r + 10) > 255) r = 255; else r += 10;
if ((g + 10) > 255) g = 255; else g += 10;
if ((b + 10) > 255) b = 255; else b += 10;
pixel[0] = b;
pixel[1] = g;
pixel[2] = r;
Pixels[row_index * stride_bytes_count / 4 + col_index] = *pixel;
}
}
bitmap->UnlockBits(bitmapData);
::DeleteObject(bitmapData);
CLSID pngClsid;
GetEncoderClsid(L"image/png", &pngClsid);
bitmap->Save(SaveFileName, &pngClsid, NULL);
}
};

You never check return codes.
You access bitmap data in reading mode (Gdiplus::ImageLockModeRead)
You are indexing pixel channel values by color value pixels[curColor & 0xff]
You never delete allocated bitmapData object

Related

Problem of converting bgr to yuv420p with cuda

I need to convert image from bgr to yuv420p and I first use OpenCV to do so.
Mat img = imread("1.bmp");
Mat yuvImg;
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);
The result of it is normal. However,my image is too big and its pixel is almost 6400 * 2000.
I find it costs too much time of converting bgr to yuv420p with opencv api cvtcolor.
Then I decide to convert it myself and speed it with cuda.
Here is code in cpu:
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
I test the code bgr_to_yuv420p(...) and the result is also normal.
Then I speed it up with cuda.
Here is all my code include kernel function and test function.
#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
int main(void)
{
Mat srcImage = imread("1.bmp");
imshow("srcImage", srcImage);
const uint imgheight = srcImage.rows;
const uint imgwidth = srcImage.cols;
Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));
//input and output
uchar3 *d_in;
unsigned char *d_out;
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, srcImage.data, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(nv12Image.data, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
imshow("nv12",nv12Image);
imwrite("cuda.bmp",nv12Image);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
The code with cuda can run but the result is not normal. Y of YUV420p is normal but there is something wrong with U and V. I think the reason is here in __global__ void bgr2yuv420p(...)
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I try a lot but still cannot solve it. And I find little code about converting rgb to yuv420p, More codes are about converting yuv420p to rgb. So I want to know is somebody running into the same question or giving me some advice?
Thanks Robert Crovella.Here is my update-1.
I follow Robert Crovella's advice and change the kernel function like this:
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
I test the new kernel with excitement,but the result is also not normal.
Here is my result image with the updated kernel function.
yuv420p image converted by myself
Then the normal result image converted by opencv api is here.
yuv420p image converted by opencv api
As we can see, the difference between the two images is U and V. I have already changed the index of U and V in kernel function, i.e.
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I think it will work but it does not. Any other advice? Robert Crovella
Edit: The solution is Robert Crovella's latest answer. I have double checked it and it is really perfect.
There are a variety of issues:
the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical. Yes, this matters.
Your CPU code has planar Y,U,V storage. That means Y has its own plane, U has its own plane, and V has its own plane. Your GPU codes is semi planar (NV12) format. That means Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the output of those two codes could never match identically.
IMO, there is no need to drag OpenCV into this.
Your UV offset calculation in the kernel (GPU) code was broken. The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point, it is not correct to use row_num*imgwidth to index by row into the UV planar region. You do not have that many rows in the UV planar region, you only have half as many rows.
In your GPU kernel, you had U,V ordering reversed, you were effectively doing VUVUVUVU...
My recommendation would be to start by harmonizing the calculation differences and storage order/format. The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:
$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we
// are using the same conversion calculations in host and device
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Any time you are having trouble with a CUDA code, I recommend
Proper CUDA error checking
Running your code with cuda-memcheck
EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420sp(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
//kernel function to convert bgr to yuv420sp
__global__ void bgr2yuv420sp(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int u_offset = imgwidth*imgheight+((row_num>>1)*(imgwidth>>1))+(col_num>>1);
d_out[u_offset] = bgr2u(r,g,b);
int v_offset = u_offset+((imgheight>>1)*(imgwidth>>1));
d_out[v_offset] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

MediaFoundation + VP8 + Color Formats

My ultimate goal is to add video support to my Virtual Class application. My method is:
Capture frames using Media Foundation
Encode with VP8 using LibVPX
Transmit with UDP
Decode at the receipient site
Show frames in a Window
The first problem arises with my WebCam's supported color encodings. The webcam Media Types contain only MFVideoFormat_NV12. My first debugging attempt saves the receiving image to a bitmap so I can test it is correctly captured (error handling removed):
HRESULT CAP::StartRecord(HWND hh, CComPtr<IMFMediaSource> src)
{
MFCreateSourceReaderFromMediaSource(src, 0, &sr);
CComPtr<IMFMediaType> fmt;
sr->GetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM,&fmt);
LogMediaType(fmt); // Shows: MFVideoFormat_NV12
auto [wi, he] = WidthHeight(fmt);
for (;;)
{
DWORD streamIndex = 0, flags = 0;
LONGLONG llTimeStamp = 0;
CComPtr<IMFSample> pSample;
hr = sr->ReadSample(MF_SOURCE_READER_FIRST_VIDEO_STREAM,0,&streamIndex,&flags,&llTimeStamp,&pSample);
if (FAILED(hr))
break;
if (!pSample)
continue;
CComPtr<IMFMediaBuffer> bu;
pSample->ConvertToContiguousBuffer(&bu);
SaveSampleNV12(bu, wi, he);
}
...
}
SaveSampleNV12 uses code from here to convert NV12 to RGB, then:
void SaveSampleNV12(CComPtr<IMFMediaBuffer> mm, int width32, int height32)
{
DWORD le = 0;
mm->GetCurrentLength(&le);
BYTE *pDatad = NULL;
auto hr = mm->Lock(&pDatad, NULL, NULL);
vector<char> rgb(1000000);
NV12ToRGB((BYTE*)rgb.data(), pDatad, width32, height32);
mm->Unlock();
HANDLE file;
BITMAPFILEHEADER fileHeader;
BITMAPINFOHEADER fileInfo;
DWORD write = 0;
auto df = L"r:\\f.bmp";
file = CreateFile(df.c_str(), GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); //Sets up the new bmp to be written to
int bits = 24;
fileHeader.bfType = 19778; //Sets our type to BM or bmp
fileHeader.bfSize = sizeof(fileHeader.bfOffBits) + sizeof(RGBTRIPLE); //Sets the size equal to the size of the header struct
fileHeader.bfReserved1 = 0; //sets the reserves to 0
fileHeader.bfReserved2 = 0;
fileHeader.bfOffBits = sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER); //Sets offbits equal to the size of file and info header
fileInfo.biSize = sizeof(BITMAPINFOHEADER);
fileInfo.biWidth = width32;
fileInfo.biHeight = height32;
fileInfo.biPlanes = 1;
fileInfo.biBitCount = bits;
fileInfo.biCompression = BI_RGB;
fileInfo.biSizeImage = width32 * height32 * (bits / 8);
fileInfo.biXPelsPerMeter = 0;// 2400;
fileInfo.biYPelsPerMeter = 0;// 2400;
fileInfo.biClrImportant = 0;
fileInfo.biClrUsed = 0;
WriteFile(file, &fileHeader, sizeof(fileHeader), &write, NULL);
WriteFile(file, &fileInfo, sizeof(fileInfo), &write, NULL);
unsigned char* ptrIn = (unsigned char*)rgb.data();
int rgbs = width32 * height32 * (bits / 8);
vector<char> d2(rgbs);
unsigned char* ptrOut = (unsigned char*)d2.data();
for (int i = 0; i < (width32*height32) / 2; ++i)
{
int y0 = ptrIn[0];
int u0 = ptrIn[1];
int y1 = ptrIn[2];
int v0 = ptrIn[3];
ptrIn += 4;
int c = y0 - 16;
int d = u0 - 128;
int e = v0 - 128;
int bb = clip((298 * c + 516 * d + 128) >> 8); // blue
int gg = clip((298 * c - 100 * d - 208 * e + 128) >> 8); // green
int rr = clip((298 * c + 409 * e + 128) >> 8); // red
ptrOut[0] = bb;
ptrOut[1] = gg;
ptrOut[2] = rr;
c = y1 - 16;
ptrOut[3] = clip((298 * c + 516 * d + 128) >> 8); // blue
ptrOut[4] = clip((298 * c - 100 * d - 208 * e + 128) >> 8); // green
ptrOut[5] = clip((298 * c + 409 * e + 128) >> 8); // red
ptrOut += 6;
}
unsigned char* cc = (unsigned char*)d2.data();
WriteFile(file, cc, rgbs, &write, NULL);
CloseHandle(file);
}
This returns a weird full of Pink image. Something I'm doing wrong, but what?
Thanks a lot.
The solution is to use IMFTransform to tranform between various color spaces.

Generate Image from generated byte array in UWP vc++

Reference with this Question & answer by #Decade Moon
How can i use that method for generate image from byte array instead of image file.
i tried like below but nothing works. no image are shown
std::vector<char> data= std::vector<char>(imgx->Height * imgx->Width * 4);
int offset;
for (int row = 0; row < imgx->Height; row++)
{
for (int col = 0; col < imgx->Width; col++)
{
offset = (row * (int)(imgx->Width * 4)) + (col * 4);
data[offset] = 0x58; // Red
data[offset + 1] = 0x58; // Green
data[offset + 2] = 0x58; // Blue
data[offset + 3] = 0x58; // Alpha
}
};
My approach is little bit different from the reply you reffered to, but it works pretty well.
#include <wrl.h>
#include <robuffer.h>
using namespace Windows::UI::Xaml::Media::Imaging;
using namespace Windows::Storage::Streams;
using namespace Microsoft::WRL;
typedef uint8 byte;
byte* GetPointerToPixelData(IBuffer^ pixelBuffer, unsigned int *length)
{
if (length != nullptr)
{
*length = pixelBuffer ->Length;
}
// Query the IBufferByteAccess interface.
ComPtr<IBufferByteAccess> bufferByteAccess;
reinterpret_cast<IInspectable*>(pixelBuffer)->QueryInterface(IID_PPV_ARGS(&bufferByteAccess));
// Retrieve the buffer data.
byte* pixels = nullptr;
bufferByteAccess->Buffer(&pixels);
return pixels;
}
MainPage::MainPage()
{
InitializeComponent();
auto bitmap = ref new WriteableBitmap(50, 50);
image->Source = bitmap;
unsigned int length;
byte* sourcePixels = GetPointerToPixelData(bitmap->PixelBuffer, &length);
const unsigned int width = bitmap->PixelWidth;
const unsigned int height = bitmap->PixelHeight;
create_async([this, width, height, sourcePixels] {
byte* temp = sourcePixels;
// generate RED - BLUE gradient
for(unsigned int k = 0; k < height; k++) {
for (unsigned int i = 0; i < (width * 4); i += 4) {
int pos = k * (width * 4) + (i);
temp[pos] = (byte)(0xFF * k / (float)height); // B
temp[pos + 1] = 0x0; // G
temp[pos + 2] = 0xFF - (byte)(0xFF * k / (float)height); // R
temp[pos + 3] = 0xFF; // A
}
}
});
}

BITMAPDATA issue on 64 bit and 32 bit C++

I am developing an application using c++
I am facing a problem when trying to capture screen. then edit some of its pixels
and save the image
My code works absolutely fine when when i select the platform as Win32
but as soon as i change the platform from Win32 to x64, the code fails
It start giving access violation when trying to access the pixels
I checked that under both platforms, size of int is 4 bytes and imageData.Stride is coming as -5528
when i do (row*stride/4 + col) i get same value on both platforms
imageData.getPixelFormat() returns 139273 which is PixelFormat32bppRGB
under both platforms
I am posting the code below
please help me out, i have done lot of google, but nothing helps
The access violation error comes at this line
UINT curColor = pixels[row * iStride / 4 + col];
when row value is >0
void BitmapToJpg(HBITMAP hbmpImage, int width, int height)
{
p_bmp = Bitmap::FromHBITMAP(hbmpImage, NULL);
CLSID pngClsid;
int result = GetEncoderClsid(L"image/jpeg", &pngClsid);
if (result != -1)
std::cout << "Encoder succeeded" << std::endl;
else
std::cout << "Encoder failed" << std::endl;
//***************************Testing Lockbits********************************//
// successfull results and position is also correct
BitmapData imageData;
Rect rect(0, 0, width, height);
p_bmp->LockBits(
&rect,
ImageLockModeWrite,
p_bmp->GetPixelFormat(),
//PixelFormat24bppRGB,
&imageData);
cout << p_bmp->GetPixelFormat();
UINT* pixels;
pixels = (UINT*)imageData.Scan0;
int iStride = imageData.Stride;
int x = sizeof(int);
byte red = 0;
byte green = 0;
byte blue = 255;
byte alpha = 0;
for (int row = 0; row < height; ++row)
{
for (int col = 0; col < width; ++col)
{
///Some code to get color
UINT curColor = pixels[row * iStride / 4 + col];
int b = curColor & 0xff;
int g = (curColor & 0xff00) >> 8;
int r = (curColor & 0xff0000) >> 16;
int a = (curColor & 0xff000000) >> 24;
//result_pixels[col][row] = RGB(r, g, b);
if (b>15 && b < 25 && g<5 && r>250)
{
//Red found
//Code to change color, generate ARGB from provided RGB values
UINT32 rgb = (alpha << 24) + (red << 16) + (green << 8) + (blue);
curColor = rgb;
b = curColor & 0xff;
g = (curColor & 0xff00) >> 8;
r = (curColor & 0xff0000) >> 16;
a = (curColor & 0xff000000) >> 24;
cout << "Red found" << endl;
pixels[row * iStride / 4 + col]=rgb;
}
}
}
p_bmp->UnlockBits(&imageData);
//*****************************Till Here*************************************//
p_bmp->Save(L"screen.jpg", &pngClsid, NULL);
delete p_bmp;
}

Applying a transparent gradient from left to right in C/C++

I'm trying to create an algorithm in C/C++, which applies a uniform transparent gradient from left to right to a pixel buffer. As seen on the next image:
Next is so far my implementation. But the resulting image is not even close to what I need to achieve. Anyone can spot what I'm doing wrong? Thanks
void alphaGradient(uint32_t* pixelsBuffer, const int width, const int height)
{
const short OPAQUE = 255;
int pixelOffsetY, pixelIndex;
short A, R, G, B;
for (int y = 0; y < height; y++)
{
A = OPAQUE;
pixelOffsetY = y * height;
for (int x = 0; x < width; x++)
{
pixelIndex = pixelOffsetY + x;
A = (int)(OPAQUE - ((OPAQUE * x) / width));
R = (pixelsBuffer[pixelIndex] & 0x00FF0000) >> 16;
G = (pixelsBuffer[pixelIndex] & 0x0000FF00) >> 8;
B = (pixelsBuffer[pixelIndex] & 0x000000FF);
pixelsBuffer[pixelIndex] = (A << 24) + (R << 16) + (G << 8) + B;
}
}
}
I haven't tried this code out but something like this should work :
void alphaGradient(uint32_t* pixelBuffer, const int width, const int height)
{
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
const DWORD src = pixelBuffer[i + j * width];
const DWORD dst = MYBACKGROUNDCOLOR;
const unsigned char src_A = (width - i) * 255 / width;
const unsigned char src_R = (src & 0x00FF0000) >> 16;
const unsigned char src_G = (src & 0x0000FF00) >> 8;
const unsigned char src_B = (src & 0x000000FF);
//const unsigned char dst_Alpha = (src & 0xFF000000) >> 24;
const unsigned char dst_R = (dst & 0x00FF0000) >> 16;
const unsigned char dst_G = (dst & 0x0000FF00) >> 8;
const unsigned char dst_B = (dst & 0x000000FF);
const unsigned char rlt_R = (src_R * src_A + dst_R * (255 - src_A)) / 255;
const unsigned char rlt_G = (src_G * src_A + dst_G * (255 - src_A)) / 255;
const unsigned char rlt_B = (src_B * src_A + dst_B * (255 - src_A)) / 255;
//pixelBuffer[i + j*width] = (DWORD)(((255) << 24) | (((rlt_R)& 0xff) << 16) | (((rlt_G)& 0xff) << 8) | ((rlt_B)& 0xff));
// or if you want to save the transparancy then
//pixelBuffer[i + j*width] = (DWORD)(((src_A) << 24) | (((src_R)& 0xff) << 16) | (((src_G)& 0xff) << 8) | ((src_B)& 0xff));
}
}
}
But personally, I would try to use DirectX or OpenGL for this and write a good PixelShader. It would make this ALOT faster.
As a suggestion, since you only want to modify the alpha channel, you do not need to do anything with the colors. So the following would work too:
char *b((char *) pixelBuffer);
for(int j = 0; j < height; ++j)
{
for(int i = 0; i < width; ++i, b += 4)
{
*b = (width - i) * 255 / width;
}
}
That's it. You could also eliminate the computation for each line by duplicating the data of the first line in the following lines:
// WARNING: code expects height > 0!
char *b((char *) pixelBuffer);
for(int i = 0; i < width; ++i, b += 4)
{
*b = (width - i) * 255 / width;
}
int offset = width * -4;
for(int j = 1; j < height; ++j)
{
for(int i = 0; i < width; ++i, b += 4)
{
*b = b[offset];
}
}
I will leave as an exercise to you to change this double for() loop in a single for() loop, which would make it a little faster yet (because you'd have a single counter (variable b) instead of three).
Note that I do not understand how Mikael's answer would work as he uses the * 255 in the wrong place in his computation of the alpha channel. With integer arithmetic, that's very important. So this should return 0 or 255:
(width - i) / width * 255
because if value < width then value / width == 0. And (width - i) is either width or a value smaller than width...