I am using CUDA 7.0 toolkit on a Windows-7 OS. I am using VS-2013.
I tried to generate the timeline of the vector addition sample program and it worked. But when I follow exactly the same steps to generate a timeline of my own code, nvvp keeps showing the message "Running application to generate timeline". I know that the kernel gets called and everything is working.
cudaDeviceReset() call is also there after finishing everything related to CUDA.
Program: I have changed my original question to provide a minimal working example which can produce the same problem. The following code is not generating a timeline using nvvp irrespective of the place where I put cudaDeviceReset().
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <stdio.h>
using namespace cv;
/**
 * Copies an interleaved image from ptr_source to ptr_dst, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread derives a pixel (x, y) from
 * its block/thread indices; threads that fall outside iw x ih do nothing, so
 * the grid may be larger than the image.
 *
 * @param numChannels number of interleaved bytes per pixel (also the pixel stride)
 * @param iw          image width in pixels
 * @param ih          image height in pixels
 * @param ptr_source  device pointer to the input, at least numChannels*iw*ih bytes
 * @param ptr_dst     device pointer to the output, at least numChannels*iw*ih bytes
 */
__global__ void colorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Calculate our pixel's location
    const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are in the correct boundaries
    if (x < 0 || x >= iw || y < 0 || y >= ih)
        return;

    // Byte offset of this pixel's first channel; size_t avoids int overflow on large images.
    const size_t base = (size_t)numChannels * ((size_t)iw * (size_t)y + (size_t)x);

    // Copy every channel of the pixel. The previous version always copied exactly
    // three bytes, which is only correct when numChannels == 3.
    for (int c = 0; c < numChannels; ++c)
        ptr_dst[base + c] = ptr_source[base + c];
}
// Test driver: builds a 400x400 three-channel image, copies it to the device,
// runs colorTransformation_kernel on it, copies the result back and shows it.
// NOTE(review): the braces of this listing are missing; the statements below
// presumably form the body of the while loop, with `return 0;` after it - confirm.
int main()
// NOTE(review): the loop has no exit condition, so the `return 0;` below (and any
// cleanup placed next to it) looks unreachable.
while (1)
Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
unsigned char *h_src = image.data;
// Total byte count of the image: rows * cols * 3 channels, one byte per channel.
size_t numBytes = image.rows * image.cols * 3;
int numChannels = 3;
unsigned char *dev_src, *dev_dst, *h_dst;
//Allocate memory at device for SOURCE and DESTINATION and get their pointers
// NOTE(review): these allocations are made inside the loop and no cudaFree is
// visible in this listing; the return codes are not checked either.
cudaMalloc((void**)&dev_src, numBytes * sizeof(unsigned char));
cudaMalloc((void**)&dev_dst, numBytes * sizeof(unsigned char));
////Copy the source image to the device i.e. GPU
cudaMemcpy(dev_src, h_src, numBytes * sizeof(unsigned char), cudaMemcpyHostToDevice);
// Grid of 20x20-thread blocks. The factor 3 launches three times as many blocks
// per dimension as there are 20-pixel tiles; the kernel indexes by pixel and
// skips threads outside the image, so the extra blocks do no work.
dim3 numOfBlocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied by 3 because we have 3 channel image now
dim3 numOfThreadsPerBlocks(20, 20);
colorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks >> >(numChannels, image.cols, image.rows, dev_src, dev_dst);
//Get the processed image
Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3);
h_dst = org_dijSDK_img.data;
// Device-to-host copy of the kernel output into the display image's buffer.
cudaMemcpy(h_dst, dev_dst, numBytes * sizeof(unsigned char), cudaMemcpyDeviceToHost);
imshow("Processed dijSDK image", org_dijSDK_img);
return 0;
Very Important Clue: If I comment out the line while(1), and hence run the code only once, then nvvp generates the timeline. But in my original project I cannot get the timeline profile this way, because it contains multi-threading and other stuff due to which there is no image to process during the first run. So I need some way to generate the timeline for code containing an infinite while loop.

The problem in my code was the endless while loop, due to which cudaDeviceReset() was never called. There are two possible solutions for dealing with such situations:
If you are interested to have a look at timeline profiling only then, just comment your while loop and the nvvp would be able to reach the cudaDeviceReset() present at the end of main().
There might be a situation where you must keep a loop inside your program. For example, in my original project, which uses multi-threading, there is no image to process during the first 180 iterations of the while loop. To deal with such situations, replace your while loop with a for loop that runs a limited number of times. For example, the following code helped me get a timeline profile of 4 runs. I am posting only the modified main().
// Prints a diagnostic for a failed CUDA runtime call; returns true when `status` is cudaSuccess.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Profiling driver: processes the test image a fixed number of times, splitting
 * each image into three equal byte ranges that are uploaded, processed and
 * downloaded on three separate CUDA streams.
 *
 * The loop is bounded so the program terminates and reaches cudaDeviceReset().
 *
 * @return 0 on success, 1 if the image could not be read or a CUDA call failed.
 */
int main()
{
    const int kNumRuns = 4;     // number of profiled iterations
    const int kNumStreams = 3;  // the image is split into this many byte ranges

    // Streams must be created before use; an uninitialized cudaStream_t is not a valid handle.
    cudaStream_t streams[kNumStreams];
    int streamsCreated = 0;
    while (streamsCreated < kNumStreams &&
           checkCuda(cudaStreamCreate(&streams[streamsCreated]), "cudaStreamCreate"))
        ++streamsCreated;
    bool ok = (streamsCreated == kNumStreams);

    //while (1)
    for (int i = 0; ok && i < kNumRuns; i++)
    {
        Mat image = imread("DijSDK_test_image.jpg", 1);
        //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
        if (image.empty())
        {
            fprintf(stderr, "Cannot read DijSDK_test_image.jpg\n");
            ok = false;
            break;
        }

        const int numChannels = 3;
        // Width is the column count and height the row count, matching how
        // colorTransformation_kernel is called with (image.cols, image.rows).
        const int iw = image.cols;
        const int ih = image.rows;
        const size_t totalMemSize = (size_t)image.rows * image.cols * numChannels * sizeof(unsigned char);
        const size_t oneThirdMemSize = totalMemSize / kNumStreams;

        //Allocate memory at device for SOURCE and DESTINATION and get their pointers
        unsigned char *dev_src[kNumStreams] = { NULL, NULL, NULL };
        unsigned char *dev_dst[kNumStreams] = { NULL, NULL, NULL };
        for (int k = 0; ok && k < kNumStreams; ++k)
            ok = checkCuda(cudaMalloc((void**)&dev_src[k], oneThirdMemSize), "cudaMalloc(src)") &&
                 checkCuda(cudaMalloc((void**)&dev_dst[k], oneThirdMemSize), "cudaMalloc(dst)");

        if (ok)
        {
            //Get the processed image
            Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
            unsigned char *h_dst = org_dijSDK_img.data;
            //copy new data of image to the host pointer
            unsigned char *h_src = image.data;

            //Copy the source image to the device i.e. GPU
            for (int k = 0; ok && k < kNumStreams; ++k)
                ok = checkCuda(cudaMemcpyAsync(dev_src[k], h_src + k * oneThirdMemSize, oneThirdMemSize,
                                               cudaMemcpyHostToDevice, streams[k]), "cudaMemcpyAsync(H2D)");

            for (int k = 0; ok && k < kNumStreams; ++k)
            {
                callMultiStreamingCudaKernel(dev_src[k], dev_dst[k], numChannels, iw, ih, &streams[k]);
                ok = checkCuda(cudaGetLastError(), "callMultiStreamingCudaKernel");
            }

            //RESULT copy: GPU to CPU
            for (int k = 0; ok && k < kNumStreams; ++k)
                ok = checkCuda(cudaMemcpyAsync(h_dst + k * oneThirdMemSize, dev_dst[k], oneThirdMemSize,
                                               cudaMemcpyDeviceToHost, streams[k]), "cudaMemcpyAsync(D2H)");

            // wait for results: the host buffer must not be read before every stream has finished.
            // All streams are synchronized even after a failure so the buffers can be freed safely.
            for (int k = 0; k < kNumStreams; ++k)
                ok = checkCuda(cudaStreamSynchronize(streams[k]), "cudaStreamSynchronize") && ok;

            if (ok)
                imshow("Processed dijSDK image", org_dijSDK_img);
        }

        // Release this iteration's device buffers (cudaFree accepts NULL).
        for (int k = 0; k < kNumStreams; ++k)
        {
            cudaFree(dev_src[k]);
            cudaFree(dev_dst[k]);
        }
    }

    for (int k = 0; k < streamsCreated; ++k)
        cudaStreamDestroy(streams[k]);

    // Called last, after all CUDA work has completed, as the accompanying text says the profiler needs.
    cudaDeviceReset();
    return ok ? 0 : 1;
}


Better way to convert YUV to RGB in Unity3D with native plugins

I have a Unity3D application that plays videos on the UI.
I need to find a better way to convert a YUV video buffer to an RGB buffer.
My situation is this:
Unity3D with a UI image that renders a video
Gstreamer external process which actually plays the video
A native plugin, called from Unity3D to convert the video YUV buffer to a RGBA one
My C++/Native plugin portion of code related to the YUV->RGBA conversion:
unsigned char * rgba = (unsigned char*) obj->g_RgbaBuff;
unsigned char * yuv = (unsigned char*) obj->g_Buffer;
while ( ta < obj->g_BufferLength)
int ty = (int)yuv[i];
int tu = (int)yuv[i + 1];
int tY2 = (int)yuv[i + 2];
int tv = (int)yuv[i + 3];
int tp1 = (int)(1.164f * (ty - 16));
int tr = Clamp((int)tp1 + 1.596f * (tv - 128));
int tg = Clamp((int)tp1 - 0.813f * (tv - 128) - 0.391f * (tu - 128));
int tb = Clamp((int)tp1 + 2.018f * (tu - 128));
rgba[ta] = tb;
rgba[ta + 1] = tg;
rgba[ta + 2] = tr;
ta += 4;
int tp2 = (int)(1.164f * (tY2 - 16));
int tr2 = Clamp((int)tp2 + 1.596f * (tv - 128));
int tg2 = Clamp((int)tp2 - 0.813f * (tv - 128) - 0.391f * (tu - 128));
int tb2 = Clamp((int)tp2 + 2.018f * (tu - 128));
rgba[ta] = tb2;
rgba[ta + 1] = tg2;
rgba[ta + 2] = tr2;
ta += 4;
This code gets called by Unity3D in a while loop to continuously update the output of my image, which is correctly showing the video.
The thing is, it's really slow. When I open more than one video, my FPS drops from 60 to well below 30 with just three 720p videos.
Is there a way to do this on the GPU? Or a smarter way to do it. Should I approach it in a different way?
To render the buffer to a texture I'm using this code in my native code, there's the rendering being done every frame by using Unity GL.IssuePluginEvent()
static void ModifyTexturePixels(void* textureHandle, int w, int h, void* rgbaBuff)
int textureRowPitch;
glBindTexture(GL_TEXTURE_2D, (GLuint)(size_t)textureHandle);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, rgbaBuff);

OPENCV : CUDA context initialization for different methods

I'm working on a simple c++ program to evaluate the performance of some Opencv GPU methods (cv::cuda).
I am using Opencv 3.1 on Ubuntu 15 (with CUDA 7.5) with a GeForce 770.
I previously read that we need to initialize CUDA environment to avoid slow process at first call. So, I initialize my program with a cv::cuda::getDevice() and setDevice().
Then, I test 2 methods:
cv::cuda::resize() (factor 0.5)
and cv::cuda::meanStdDev.
Initialization takes 400ms. Then, resizing takes 2 or 3 ms, that's OK.
But... meanStdDev takes 476ms!
If I run 2 successive meanStdDev, the second one is much faster (3ms).
I really don't understand why the initialization has an effect on resize() but not on meanStdDev().
I compile OPENCV with -DCUDA_ARCH_BIN=3.0. I try with -DCUDA_ARCH_PTX="" but the problem is still the same.
#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
#include "opencv2/cudawarping.hpp"
#include "opencv2/cudaarithm.hpp"
using namespace std;
int main(int argc, char *argv[])
double t_init_cuda = (double)cv::getTickCount();
int CudaDevice;
cerr<<endl<<"ERROR: NO CudaEnabledDevice"<<endl;
CudaDevice = cv::cuda::getDevice();
t_init_cuda = ((double)cv::getTickCount() - t_init_cuda)/cv::getTickFrequency() * 1000;
cv::Mat src = cv::imread(argv[1], 0);
if (!src.data) exit(1);
cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_dst;
double factor = 0.5;
double t_gpu_resize = cv::getTickCount();
cv::cuda::resize(d_src, d_dst, cv::Size( (int) ((float) (d_src.cols)*factor) , (int) ((float) (d_src.rows)*factor)), 0, 0, CV_INTER_AREA);
t_gpu_resize = ((double)cv::getTickCount() - t_gpu_resize)/cv::getTickFrequency() * 1000;
cout<<endl<<"D_SRC="<<d_src.rows<<"x"<<d_src.cols<<" => D_DST="<<d_dst.rows<<"x"<<d_dst.cols<<endl;
double t_meanstddev = (double)cv::getTickCount();
cv::Scalar mean, stddev;
std::vector<cv::cuda::GpuMat> d_src_split;
cv::cuda::split(d_src, d_src_split);
cv::cuda::meanStdDev (d_src_split[0], mean, stddev);
t_meanstddev = ((double)cv::getTickCount() - t_meanstddev)/cv::getTickFrequency() * 1000.0;
cout<<endl<<"mean="<<mean.val[0]<<" | stddev="<<stddev.val[0]<<endl;
return 0;
When you call the same function twice:
1- The first time, new memory is allocated on the device for the resized image ("according to the OpenCV wiki").
2- The second time, the already-allocated memory is reused, so the call is fast.
Here is that function from the OpenCV source, so you can see why:
// Computes mean and standard deviation of a CV_8UC1 GpuMat through NPP.
//   _src   - input matrix; asserted to be CV_8UC1
//   _dst   - output, obtained as a 1x2 CV_64FC1 matrix (mean first, stddev second)
//   stream - stream used for the input/output helpers, the buffer pool and NPP
void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
{
    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");

    const GpuMat src = getInputMat(_src, stream);
    CV_Assert( src.type() == CV_8UC1 );
    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);

    NppiSize sz;
    sz.width = src.cols;
    sz.height = src.rows;

    // Query the scratch-buffer size; the NPP function name depends on the CUDA
    // version, so exactly one of the two calls must be compiled.
    int bufSize;
#if (CUDA_VERSION <= 4020)
    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif

    BufferPool pool(stream);
    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // <--- this line creates a new GpuMat

    NppStreamHandler h(StreamAccessor::getStream(stream));
    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );

    syncOutput(dst, _dst, stream);
}
this function
// Returns a GpuMat of the requested geometry and type, constructed with this
// pool's allocator_ and then sized via create().
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat pooled(allocator_);
    pooled.create(rows, cols, type);
    return pooled;
}
I hope this will help you.

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
using namespace cimg_library;
/**
 * Ray-casts SPHERES_COUNT spheres into an RGB bitmap, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks. There is no bounds check, so the grid
 * must cover the image exactly; the image width is taken to be
 * blockDim.x * gridDim.x and the height blockDim.y * gridDim.y.
 *
 * @param bitmap device buffer with 3 bytes (r, g, b) per pixel
 * @param s      device-accessible array of SPHERES_COUNT spheres
 */
__global__ void kernel(unsigned char* bitmap, Sphere* s)
{
    // Map threadIdx/blockIdx to pixel position
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Image extent as signed ints. Mixing int with the unsigned built-ins would
    // evaluate `x - width / 2` in unsigned arithmetic and wrap for pixels left
    // of / above the centre.
    const int width  = blockDim.x * gridDim.x;
    const int height = blockDim.y * gridDim.y;
    const int offset = x + y * width;

    // Pixel coordinates relative to the image centre.
    const float ox = (float)(x - width / 2);
    const float oy = (float)(y - height / 2);

    // Background colour, used when no sphere is hit.
    float r = 0.2f, g = 0.2f, b = 0.5f;
    float maxz = -INF;
    for (int i = 0; i < SPHERES_COUNT; i++) {
        float n, t = s[i].hit(ox, oy, &n);
        // Keep the hit with the largest t seen so far and shade it by n.
        if (t > maxz) {
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    bitmap[offset*3] = (int)(r * 255);
    bitmap[offset*3 + 1] = (int)(g * 255);
    bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
//Generate a bitmap from spere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
I cannot understand why...
I know that If remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
The error is not reported, so I thought It may be an out of index error, reported later, but I have An identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
kernel<<<grids, threads>>>(dev_bitmap, s);
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.

Strange acting of CUDA for large amount of threads

I want to prepare my CUDA kernels for working over large amount of particles (much exceeding 65535 which is max value of gridDim). I tried to create a proper thread index mapping working for any <<<numBlocks, threadsPerBlock>>> values.
I wrote this:
// Writes each thread's flattened index into blabla at that same index, for the
// first numElements indices only. `position` is not used by this kernel.
__global__ void step_k(float* position, size_t numElements, unsigned int* blabla)
{
    const unsigned int idx = calculateIndex();
    // Threads whose index lies past the end of the array do nothing.
    if (idx >= numElements)
        return;
    blabla[idx] = idx;
}
// Flattens the calling thread's 3D position in the launch into one unsigned
// index, with x varying fastest, then y, then z.
__device__ unsigned int calculateIndex()
{
    // Position of this thread along each axis of the launch.
    const unsigned int col   = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned int row   = threadIdx.y + blockDim.y * blockIdx.y;
    const unsigned int slice = threadIdx.z + blockDim.z * blockIdx.z;

    // Total thread count along x and y.
    const unsigned int threadsX = blockDim.x * gridDim.x;
    const unsigned int threadsY = blockDim.y * gridDim.y;

    // Same value as threadsX*threadsY*slice + threadsX*row + col (unsigned arithmetic).
    return (slice * threadsY + row) * threadsX + col;
}
and I use it this way:
// Test harness: launches step_k once, copies 256 result values back to the host
// and prints them separated by "; ".
// NOTE(review): no CUDA return code is checked here and no cudaFree for
// d_results is visible in this listing.
void CudaSphFluids::step(void)
//dim3 threadsPerBlock(1024, 1024, 64);
//dim3 numBlocks(65535, 65535, 65535);
dim3 numBlocks(1, 1, 1);
// NOTE(review): 256 x 256 x 1 requests 65536 threads in a single block; the
// accompanying answer states the per-block limit is 512 or 1024 threads.
dim3 threadsPerBlock(256, 256, 1);
// Host-side copy of the results, zero-initialized.
unsigned int result[256] = {};
unsigned int* d_results;
cudaMalloc( (void**) &d_results,sizeof(unsigned int)*256);
// numElements is 256, so only indices below 256 are written by the kernel.
step_k<<<numBlocks, threadsPerBlock>>>(d_position, 256, d_results);
cudaMemcpy(result,d_results,sizeof(unsigned int)*256,cudaMemcpyDeviceToHost);
for(unsigned int t=0; t<256;t++) {
cout<<result[t]<<"; ";
It seems to be ok (incrementing numbers from 0 to 255) for :
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 1, 1);
It works for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 3, 1);
but when I try to run it for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 5, 1);
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 10, 1);
and for larger values like:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 256, 1);
it's getting crazy:
Then I tried to use another mapping from some smart guy's website:
// Returns a flattened thread id for a 3D grid of 3D blocks: blocks are numbered
// x-fastest across the grid, and threads x-fastest inside each block.
__device__ int getGlobalIdx_3D_3D()
{
    // Linear number of this block within the grid.
    const unsigned int blocksPerSlice = gridDim.x * gridDim.y;
    const int blockNumber = blockIdx.z * blocksPerSlice
                          + blockIdx.y * gridDim.x
                          + blockIdx.x;

    // Linear number of this thread within its block.
    const unsigned int threadsPerSlice = blockDim.x * blockDim.y;
    const unsigned int localThread = threadIdx.z * threadsPerSlice
                                   + threadIdx.y * blockDim.x
                                   + threadIdx.x;

    const unsigned int threadsPerBlock = threadsPerSlice * blockDim.z;
    const int globalThread = blockNumber * threadsPerBlock + localThread;
    return globalThread;
}
But unfortunately it doesn't work either (the numbers are different, but also wrong).
Any ideas what causes this strange behaviour?
I use CUDA 6.0 on GeForce GTX 560Ti (sm_21) and VS2012 with NSight.
This is requesting 65536 threads per block:
dim3 threadsPerBlock(256, 256, 1);
That is not acceptable on any current CUDA GPU; they are limited to either 512 or 1024 threads per block.
These are also launching too many threads per block:
dim3 threadsPerBlock(256, 5, 1);
dim3 threadsPerBlock(256, 10, 1);
Start by adding proper cuda error checking to your program. I would suggest doing this on any CUDA code before posting here. You will be more informed, and others will be able to help you better.
Although you don't show your complete kernel, your kernel indexing seems to be set up correctly for 3D indexing. Therefore, it may just be a matter of also modifying this line:
dim3 numBlocks(1, 1, 1);
Which you will probably want to do to get reasonable performance out of the GPU.

Issue with writing YUV image frame in C/C++

I am trying to convert an RGB frame, which is taken from OpenGL glReadPixels(), to a YUV frame, and write the YUV frame to a file (.yuv). Later on I would like to write it to a named_pipe as an input for FFMPEG, but as for now I just want to write it to a file and view the image result using a YUV Image Viewer. So just disregard the "writing to pipe" for now.
After running my code, I encountered the following errors:
The number of frames shown in the YUV Image Viewer software is always 1/3 of the number of frames I declared in my program. When I declare fps as 10, I could only view 3 frames. When I declared fps as 30, I could only view 10 frames. However when I view the file in Text Editor, I could see that I have the correct amount of word "FRAME" printed in the file.
This is the example output that I got: http://www.bobdanani.net/image.yuv
I could not see the correct image, but just some distorted green, blue, yellow, and black pixels.
I read about YUV format from http://wiki.multimedia.cx/index.php?title=YUV4MPEG2 and http://www.fourcc.org/fccyvrgb.php#mikes_answer and http://kylecordes.com/2007/pipe-ffmpeg
Here is what I have tried so far. I know that this conversion approach is quite in-efficient, and I can optimize it later. Now I just want to get this naive approach to work and have the image shown properly.
int frameCounter = 1;
int windowWidth = 0, windowHeight = 0;
unsigned char *yuvBuffer;
unsigned long bufferLength = 0;
unsigned long frameLength = 0;
int fps = 10;
void display(void) {
/* clear the color buffers */
/* DRAW some OPENGL animation, i.e. cube, sphere, etc
if ((frameCounter % fps) == 1){
bufferLength = 0;
windowWidth = glutGet(GLUT_WINDOW_WIDTH);
windowHeight = glutGet (GLUT_WINDOW_HEIGHT);
frameLength = (long) (windowWidth * windowHeight * 1.5 * fps) + 100; // YUV 420 length (width*height*1.5) + header length
yuvBuffer = new unsigned char[frameLength];
frameCounter = (frameCounter % fps) + 1;
if ( (frameCounter % fps) == 1){
snprintf(filename, 100, "out/image-%d.yuv", seq_num);
ofstream out(filename, ios::out | ios::binary);
if(!out) {
cout << "Cannot open file.\n";
out.write (reinterpret_cast<char*> (yuvBuffer), bufferLength);
bufferLength = 0;
delete[] yuvBuffer;
void write_yuv_frame_header (){
char *yuvHeader = new char[100];
sprintf (yuvHeader, "YUV4MPEG2 W%d H%d F%d:1 Ip A0:0 C420mpeg2 XYSCSS=420MPEG2\n", windowWidth, windowHeight, fps);
memcpy ((char*)yuvBuffer + bufferLength, yuvHeader, strlen(yuvHeader));
bufferLength += strlen (yuvHeader);
delete (yuvHeader);
void write_yuv_frame() {
int width = glutGet(GLUT_WINDOW_WIDTH);
int height = glutGet(GLUT_WINDOW_HEIGHT);
memcpy ((void*) (yuvBuffer+bufferLength), (void*) "FRAME\n", 6);
bufferLength +=6;
long length = windowWidth * windowHeight;
long yuv420FrameLength = (float)length * 1.5;
long lengthRGB = length * 3;
unsigned char *rgb = (unsigned char *) malloc(lengthRGB * sizeof(unsigned char));
unsigned char *yuvdest = (unsigned char *) malloc(yuv420FrameLength * sizeof(unsigned char));
glReadPixels(0, 0, windowWidth, windowHeight, GL_RGB, GL_UNSIGNED_BYTE, rgb);
int r, g, b, y, u, v, ypos, upos, vpos;
for (int j = 0; j < windowHeight; ++j){
for (int i = 0; i < windowWidth; ++i){
r = (int)rgb[(j * windowWidth + i) * 3 + 0];
g = (int)rgb[(j * windowWidth + i) * 3 + 1];
b = (int)rgb[(j * windowWidth + i) * 3 + 2];
y = (int)(r * 0.257 + g * 0.504 + b * 0.098) + 16;
u = (int)(r * 0.439 + g * -0.368 + b * -0.071) + 128;
v = (int)(r * -0.148 + g * -0.291 + b * 0.439 + 128);
ypos = j * windowWidth + i;
upos = (j/2) * (windowWidth/2) + i/2 + length;
vpos = (j/2) * (windowWidth/2) + i/2 + length + length/4;
yuvdest[ypos] = y;
yuvdest[upos] = u;
yuvdest[vpos] = v;
memcpy ((void*) (yuvBuffer + bufferLength), (void*)yuvdest, yuv420FrameLength);
bufferLength += yuv420FrameLength;
free (yuvdest);
free (rgb);
This is just the very basic approach, and I can optimize the conversion algorithm later.
Can anyone tell me what is wrong with my approach? My guess is that one of the issues is the outstream.write() call: I cast the unsigned char* data to char*, which might lose data precision, but if I don't cast it to char* I get a compile error. However, this doesn't explain why the output frames are corrupted (or why only 1/3 of the total number of frames is shown).
It looks to me like you have too many bytes per frame for 4:2:0 data. According to the spec you linked to, the number of bytes for a 200x200 pixel 4:2:0 frame should be 200 * 200 * 3 / 2 = 60,000. But you have ~90,000 bytes. Looking at your code, I don't see where you convert from 4:4:4 to 4:2:0. So you have 2 choices - either set the header to 4:4:4, or convert the YCbCr data to 4:2:0 before writing it out.
I compiled your code and surely there is a problem when computing upos and vpos values.
For me this worked (RGB to YUV NV12):
vpos = length + (windowWidth * (j/2)) + (i/2)*2;
upos = vpos + 1;