How to optimize this code? - c++

Profiler says that 50% of total time spends inside this function. How would you optimize it?
It converts BMP color scheme to YUV. Thanks!
Update: platform is ARMV6 (writing for IPhone)
#define Y_FROM_RGB(_r_,_g_,_b_) ( ( 66 * _b_ + 129 * _g_ + 25 * _r_ + 128) >> 8) + 16
#define V_FROM_RGB(_r_,_g_,_b_) ( ( 112 * _b_ - 94 * _g_ - 18 * _r_ + 128) >> 10) + 128
#define U_FROM_RGB(_r_,_g_,_b_) ( ( -38 * _b_ - 74 * _g_ + 112 * _r_ + 128) >> 10) + 128
/*!
* \brief
* Converts 24 bit image to YCrCb image channels
*
* \param source
* Source 24bit image pointer
*
* \param source_width
* Source image width
*
* \param dest_Y
* destination image Y component pointer
*
* \param dest_scan_size_Y
* destination image Y component line size
*
* \param dest_U
* destination image U component pointer
*
* \param dest_scan_size_U
* destination image U component line size
*
* \param dest_V
* destination image V component pointer
*
* \param dest_scan_size_V
* destination image V component line size
*
* \param dest_width
* Destination image width = source_width
*
* \param dest_height
* Destination image height = source image height
*
* Convert 24 bit image (source) with width (source_width)
* to YCrCb image channels (dest_Y, dest_U, dest_V) with size (dest_width)x(dest_height), and line size
* (dest_scan_size_Y, dest_scan_size_U, dest_scan_size_V) (in bytes)
*
*/
void ImageConvert_24_YUV420P(unsigned char * source, int source_width,
unsigned char * dest_Y, int dest_scan_size_Y,
unsigned char * dest_U, int dest_scan_size_U,
unsigned char * dest_V, int dest_scan_size_V,
int dest_width, int dest_height)
{
int source_scan_size = source_width*3;
int half_width = dest_width/2;
//Y loop
for (int y = 0; y < dest_height/2; y ++)
{
//Start of line
unsigned char * source_scan = source;
unsigned char * source_scan_next = source+source_scan_size;
unsigned char * dest_scan_Y = dest_Y;
unsigned char * dest_scan_U = dest_U;
unsigned char * dest_scan_V = dest_V;
//Do all pixels
for (int x = 0; x < half_width; x++)
{
int R = source_scan[0];
int G = source_scan[1];
int B = source_scan[2];
//Y
int Y = Y_FROM_RGB(B, G, R);
*dest_scan_Y = Y;
source_scan += 3;
dest_scan_Y += 1;
int R1 = source_scan[0];
int G1 = source_scan[1];
int B1 = source_scan[2];
//Y
Y = Y_FROM_RGB(B1, G1, R1);
R += (R1 + source_scan_next[0] + source_scan_next[3]);
G += (G1 + source_scan_next[1] + source_scan_next[4]);
B += (B1 + source_scan_next[2] + source_scan_next[5]);
//YCrCb
*dest_scan_Y = Y;
*dest_scan_V = V_FROM_RGB(B, G, R);
*dest_scan_U = U_FROM_RGB(B, G, R);
source_scan += 3;
dest_scan_Y += 1;
dest_scan_U += 1;
dest_scan_V += 1;
source_scan_next += 6;
};
//scroll to next line
source += source_scan_size;
dest_Y += dest_scan_size_Y;
dest_U += dest_scan_size_U;
dest_V += dest_scan_size_V;
//Start of line
source_scan = source;
dest_scan_Y = dest_Y;
//Do all pixels
for (int x = 0; x < half_width; x ++)
{
int R = source_scan[0];
int G = source_scan[1];
int B = source_scan[2];
//Y
int Y = Y_FROM_RGB(B, G, R);
*dest_scan_Y = Y;
source_scan += 3;
dest_scan_Y += 1;
R = source_scan[0];
G = source_scan[1];
B = source_scan[2];
//Y
Y = Y_FROM_RGB(B, G, R);
*dest_scan_Y = Y;
source_scan += 3;
dest_scan_Y += 1;
};
source += source_scan_size;
dest_Y += dest_scan_size_Y;
};
};

Unless I am missing something the follow code seems to be repeated in both loops, so, why not go through this loop once? This may require some changes to your algorithm, but it would improve performance.
for (int x = 0; x < half_width; x ++)
{
int R = source_scan[0];
int G = source_scan[1];
int B = source_scan[2];
//Y
int Y = Y_FROM_RGB(B, G, R);
*dest_scan_Y = Y;
source_scan += 3;
dest_scan_Y += 1;
R = source_scan[0];
G = source_scan[1];
B = source_scan[2];
But, before doing anything, move the two inside loops into separate functions, and then run your profiler, and see if you spend more time in one function than the other.
You have three loops in this function, and you don't know which section is actually where you are spending your time. So determine that before doing any optimization, otherwise you may find that you are fixing the wrong section.

I don't know what platform you are using but you might want to look SIMD
Arm Cotext-A8 has Neon technology that does support SIMD. You should be able to find more information on the ARM website.

Presuming that the memory they point to does not overlap, you should declare your source, dest_Y, dest_U and dest_V pointers with the restrict qualifier, to tell the compiler this and allow it to optimise better.

Related

How to set a value at memory offset from a const void pointer?

I have a pointer returned from a function
rs2::video_frame frame = frames.get_color_frame();
const void* data = frame.get_data();
I know that this pointer is an array of RGB values (i.e. 3 chars) of size frame.get_data_size().
How can I modify certain pixel colors given that
int bpp = frame.get_bytes_per_pixel();
int width = frame.get_width();
int height = frame.get_height();
int offset = (y * width * bpp) + (x * bpp);
int r = offset;
int g = offset + 1;
int b = offset + 2;
// ?data[r] = newRed;
// ?data[g] = newGreen;
// ?data[b] = newBlue;
You would have it easier if you would have an object oriented approach:
struct Pixel {
short red;
short green;
short blue;
};
Let your frame work with an std::vector<Pixel> pixels; which is returned by reference. std::vector<Pixel>& get_data();
pixels[y * width + x].red = newRed;
pixels[y * width + x].green = newGreen;
pixels[y * width + x].blue = newBlue;
If you really have to work with void* then try this
char* data = static_cast<char*>(const_cast<void*>(dataframe.get_data()));
// Since you also const cast it becomes more and more dangerous. You really need know what you are doing.
size_t bpp = frame.get_bytes_per_pixel();
size_t width = frame.get_width();
size_t height = frame.get_height();
size_t offset = (y * width * bpp) + (x * bpp);
size_t r = offset;
size_t g = offset + 1;
size_t b = offset + 2;
*(data + r) = newRed;
*(data + g) = newGreen;
*(data + b) = newBlue;
For bulk updates you can use memset.
See https://godbolt.org/z/xvc1xs for details.

implementing de castlejau algorithm c++

The program runs but the curved line isn't being displayed .
Here is my code and note, I have 4 vertices in an array.
void GLWidget::drawControlPolygon(){
for (int i = 0; i < vertices.size()-1;i++){
drawEdge(vertices[i], vertices[i+1], RGBValue(0,0,0));
}
}
void GLWidget::drawDeCasteljau(float t) {
Point p;
int N_PTS = 4;
p.x = pow((1-t),3)*vertices[0].x+3* t * pow((1 -t), 2) * vertices[1].x + 3 * (1-t)*pow(t,2)*vertices[2].x+ pow (t, 3)*vertices[3].x;
p.y = pow((1-t),3)*vertices[0].y+3* t * pow((1 -t), 2) * vertices[1].y + 3 * (1-t)*pow(t,2)*vertices[2].y+ pow (t, 3)*vertices[3].y;
p.z = pow((1-t),3)*vertices[0].z+3* t * pow((1 -t), 2) * vertices[1].z + 3 * (1-t)*pow(t,2)*vertices[2].z+ pow (t, 3)*vertices[3].z;
int bezPoints[3][3] ;
for (float u = 0.0; u <= 1.0; u += t) {
for (int diag = N_PTS-2; diag >= 0; diag--) {
for (int i = 0; i <= diag; i++) {
int j = diag - i;
bezPoints[i][j] = (1.0-u)*bezPoints[i][j+1] + u*bezPoints[i+1][j];
}
}
// set the pixel for this parameter value
//Set pixel method for theImage object.
// void setPixel(Index row, Index col, Byte red, Byte green, Byte blue, Byte alpha=255);
// void setPixel(Index row, Index col, RGBValue colour, Byte alpha = 255);
theImage.setPixel(bezPoints[0], bezPoints[0][0], RGBValue());
}
}
void GLWidget::drawBezierCurve() {
}
for the full class here is the link to it...
https://www.dropbox.com/s/j6jw51uhz30m3tb/testApp.cc?dl=0
So far the output looks like this
Thanks!

How to optimize YUV to RGB color conversion code

I have written a function to convert an image in YUV420P to RGB but it is taking 30 millisecond to convert an image (size: 1280 x 720) into RGB, but when I am using ffmpeg function ( as this) to convert YUV image into RGB its taking only 2 millisecond for the same image. What is the problem with my code ? How can I optimize the code that I have written ??
My code is given below
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
for (int i = 0; i<origImage->height; i++)
{
for (int j=0; j<origImage->width; j++)
{
float Y = data[i*step + j];
float U = data[ (int)(size + (i/2)*(step/2) + j/2) ];
float V = data[ (int)(size*1.25 + (i/2)*(step/2) + j/2)];
float R = Y + 1.402 * (V - 128);
float G = Y - 0.344 * (U - 128) - 0.714 * (V - 128);
float B = Y + 1.772 * (U - 128);
if (R < 0){ R = 0; } if (G < 0){ G = 0; } if (B < 0){ B = 0; }
if (R > 255 ){ R = 255; } if (G > 255) { G = 255; } if (B > 255) { B = 255; }
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}
Here, try this(should reduce to 25 milliseconds):
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
int stepDb2=step /2;
float sizeMb1d25=size*1.25 ;
int origImagePTheight=origImage->height;
int origImagePTwidth=origImage->width;
for (int i = 0; i<origImagePTheight; i++)
{
float idb2=i/2;
int iStep=i*step;
for (int j=0; j<origImagePTwidth; j++)
{
float variable=idb2*stepDb2 + j/2;
float Y = data[iStep + j];
float U = -128 + data[ (int)(size + variable) ];
float V = -128 + data[ (int)(sizeMb1d25 + variable)];
float R = Y + 1.402 * V ;
float G = Y - 0.344 * U - 0.714 * V;
float B = Y + 1.772 * U;
R= R * !(R<0);
G= G * !(G<0);
B= B * !(B<0);
R=R*(!(R>255)) + 255 * (R>255);
G=G*(!(G>255)) + 255 * (G>255);
B=B*(!(B>255)) + 255 * (B>255);
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}

How to return a matrix structure from a mex function?

I have a struct that defines a 3d array, the size is known:
struct uchar3
{
unsigned char x, y, z;
};
and I want to return it via mex function in order to use it in matlab like a three dimensional array, like an image. How can this be done?
EDIT:
This is apart of the function I use.
foo(uchar3 **imagePtr, Mat Im){
unsigned char *cvPtr = Im.ptr<unsigned char>(0);
for (size_t i = 0; i < Im.rows * Im.cols; ++i) {
(*imagePtr)[i].x = cvPtr[3 * i + 0];
(*imagePtr)[i].y = cvPtr[3 * i + 1];
(*imagePtr)[i].z = cvPtr[3 * i + 2];
}
}
Shai's code:
cv::Mat imageRGB;
cv::cvtColor(OutPutMat, imageRGB, CV_BGR2RGB);
// uc3 is populated here
mwSize sz[3];
sz[0] = imageRGB.rows; // matlab is row first
sz[1] = imageRGB.cols;
sz[2] = 3;
plhs[0] = mxCreateNumericArray( 3, sz, mxDOUBLE_CLASS, // create double array, you can change the type here
mxREAL ); // create real matrix
float *cvPtr = imageRGB.ptr<float>(0);
float* p = (float*)mxGetData(plhs[0]); // get a pointer to actual data
for ( size_t y = 0 ; y < imageRGB.rows ; y++ ) {
for ( size_t x = 0; x < imageRGB.cols ; x++ ) {
int i = y * imageRGB.cols + x; // opencv is col first
p[ x * imageRGB.rows + y ] = cvPtr[3 * i + 0];
p[ imageRGB.cols * imageRGB.rows + x * imageRGB.rows + y ] = cvPtr[3 * i + 1];
p[ 2*imageRGB.cols * imageRGB.rows + x * imageRGB.rows + y ] = cvPtr[3 * i + 2];
}
}
You need to use mxCreateNumericArray
uchar3 uc3;
// uc3 is populated here
mwSize sz[3];
sz[0] = Im.rows; // matlab is row first
sz[1] = Im.cols;
sz[2] = 3;
mxArray* pOut = mxCreateNumericArray( 3, sz, mxDOUBLE_CLASS // create double array, you can change the type here
mxREAL ); // create real matrix
double* p = (double*)mxGetData(pOut); // get a pointer to actual data
for ( size_t y = 0 ; y < Im.rows ; y++ ) {
for ( size_t x = 0; x < Im.cols ; x++ ) {
int i = y * Im.cols + x; // opencv is col first
p[ x * Im.rows + y ] = cvPtr[3 * i + 0];
p[ Im.cols*Im.rows + x * Im.rows + y ] = cvPtr[3 * i + 1];
p[ 2*Im.cols*Im.rows + x * Im.rows + y ] = cvPtr[3 * i + 2];
}
}
// set one of your mexFunction's outputs to pOut
Into your mex function do this :
plhs[0] = valueStruct(Test,Test2);
Where ValueStruct is a function
mxArray* valueStruct(const double& d,const double& d2)
{
mxArray* p = mxCreateStructMatrix(1,1,2,_fieldnames);
if (!p)
mexErrMsgIdAndTxt("error","Allocation error");
mxSetField(p,0,"d",mxArray(d));
mxSetField(p,0,"d2",mxArray(d2));
return p;
}
You can refer to mxCreateStructMatrix documentation for more informations.
And for mxSetField.
For an example, You can refer to mexopencv that create struct with his mxArray class where you can get here.

mjpeg to raw rgb24 with video4linux

I'm writing a c++ webcam viewer using video4linux. I need a RGB24 output (interleaved R8B8G8) for displaying. I'm able to get video input for almost all low-resolution webcam, using YUYV, GREY8 or RGB24. But I need to get input also from high-resolution webcams, that use MJPEG for compression when high framerate is needed.
I'm able to get MJPEG stream using V4L2_PIX_FMT_MJPEG as pixel format, but received framebuffer is compressed.
How can I quickly convert it to RGB24?
Can I use libjpeg for this?
The quickest solution I've found is decode_jpeg_raw from mjpegtools which decode jpeg data to planar YUV420. Then the conversion from yuv420 to rgb24 is done by this function:
inline int clip(int value) {
return (value > 255) ? 255 : (value < 0) ? 0 : value;
}
static void yuv420_to_rgb24(
/* luminance (source) */const uint8_t* const y
/* u chrominance (source) */, const uint8_t* u
/* v chrominance (source) */, const uint8_t* v
/* rgb interleaved (destination) */, uint8_t* const dst
/* jpeg size */, int const size
/* image width */, int const width) {
const int lineSize = width * 3;
uint8_t* r1 = dst;
uint8_t* g1 = r1 + 1;
uint8_t* b1 = r1 + 2;
uint8_t* r2 = r1 + lineSize;
uint8_t* g2 = r2 + 1;
uint8_t* b2 = r2 + 2;
const uint8_t* y1 = y;
const uint8_t* y2 = y + width;
uint8_t* const end = r1 + size;
int c1 = 0;
int c2 = 0;
int e = 0;
int d = 0;
while (r1 != end) {
uint8_t* const lineEnd = r2;
/* line by line */
while (r1 != lineEnd) {
/* first pixel */
c1 = *y1 - 16;
c2 = *y2 - 16;
d = *u - 128;
e = *v - 128;
*r1 = clip(c1 + ((454 * e) >> 8));
*g1 = clip(c1 - ((88 * e + 183 * d) >> 8));
*b1 = clip(c1 + ((359 * d) >> 8));
*r2 = clip(c2 + ((454 * e) >> 8));
*g2 = clip(c2 - ((88 * e + 183 * d) >> 8));
*b2 = clip(c2 + ((359 * d) >> 8));
r1 += 3;
g1 += 3;
b1 += 3;
r2 += 3;
g2 += 3;
b2 += 3;
++y1;
++y2;
/* second pixel */
c1 = *y1 - 16;
c2 = *y2 - 16;
d = *u - 128;
e = *v - 128;
*r1 = clip(c1 + ((454 * e) >> 8));
*g1 = clip(c1 - ((88 * e + 183 * d) >> 8));
*b1 = clip(c1 + ((359 * d) >> 8));
*r2 = clip(c2 + ((454 * e) >> 8));
*g2 = clip(c2 - ((88 * e + 183 * d) >> 8));
*b2 = clip(c2 + ((359 * d) >> 8));
r1 += 3;
g1 += 3;
b1 += 3;
r2 += 3;
g2 += 3;
b2 += 3;
++y1;
++y2;
++u;
++v;
}
r1 += lineSize;
g1 += lineSize;
b1 += lineSize;
r2 += lineSize;
g2 += lineSize;
b2 += lineSize;
y1 += width;
y2 += width;
}
}
Yes you can use libjpeg for this, but usually the output of libjpeg is in YUV420 or YUV422.
You might instead use that code: http://mxhaard.free.fr/spca50x/Download/gspcav1-20071224.tar.gz (check for decoder source, there's a small jpeg decoder that's working well and deals with color conversion directly so the output is in RGB888)