Code optimization of bilinear interpolation - C++

I have implemented the bilinear interpolation algorithm following the MATLAB example at
https://se.mathworks.com/matlabcentral/fileexchange/10772-fast-2-dimensional-interpolation and the unit-square formula from https://en.wikipedia.org/wiki/Bilinear_interpolation.
It would be great to speed up the implementation. Can somebody give me suggestions on how to optimize the code? Please find the relevant part of the method below:
x_loops = floor((X_end - X_11) / pixel_size_mm) + 1;
y_loops = floor((Y_end - Y_11) / pixel_size_mm) + 1;

float** Zi = new float*[x_loops]();
for (int i = 0; i < x_loops; ++i)
    Zi[i] = new float[y_loops]();

n_dx = 1 / (X_12 - X_11);
n_dy = 1 / (Y_21 - Y_11);
Yi = Y_11;

int count = 0;
for (int i = 0; i < y_loops; i++)
{
    Xi = X_11;
    xi = 0;
    yi = 0;
    for (int j = 0; j < x_loops; j++)
    {
        xi = (Xi - X_11) * n_dx;
        yi = (Yi - Y_11) * n_dy;
        Xi += pixel_size_mm;

        fxi = floor(xi);
        fyi = floor(yi);
        dfxi = xi - fxi;
        dfyi = yi - fyi;

        Zi[j][i] = Strain_image[fxi][fyi]         * (1 - dfxi) * (1 - dfyi) +
                   Strain_image[fxi + 1][fyi]     * dfxi       * (1 - dfyi) +
                   Strain_image[fxi][fyi + 1]     * (1 - dfxi) * dfyi +
                   Strain_image[fxi + 1][fyi + 1] * dfxi       * dfyi;
    }
    Yi += pixel_size_mm;
}

iMage(Zi, x_loops, y_loops, fX1, fY1);

for (int i = 0; i < x_loops; ++i)
    delete[] Zi[i];
delete[] Zi;

for (int i = 0; i < number_of_RF_rows; ++i)
    delete[] Strain_image[i];
delete[] Strain_image;
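Not a full answer, but a hedged sketch of the restructuring that usually helps with this pattern, assuming the downstream iMage call can be adapted to a flat row-major buffer (that interface change is an assumption, not part of the question): everything that depends only on the scanline (yi, fyi, dfyi and the y-weights) is hoisted out of the inner loop, floor() becomes an int cast (valid because xi and yi are non-negative here), and the inner loop writes contiguous memory instead of striding across Zi[j][i] rows.

// Sketch only: same unit-square math as above, restructured for speed.
float* Zi = new float[(size_t)x_loops * y_loops]();   // flat, row-major

const float x_step = pixel_size_mm * n_dx;   // xi advances by this per j
const float y_step = pixel_size_mm * n_dy;   // yi advances by this per i
float yi = 0.0f;

for (int i = 0; i < y_loops; i++)
{
    const int   fyi  = (int)yi;        // == floor(yi) since yi >= 0
    const float dfyi = yi - fyi;
    const float wy0  = 1.0f - dfyi;    // y-weights are constant per scanline

    float xi = 0.0f;
    float* out = Zi + (size_t)i * x_loops;   // contiguous output row
    for (int j = 0; j < x_loops; j++)
    {
        const int   fxi  = (int)xi;
        const float dfxi = xi - fxi;

        // unit-square bilinear formula with the y-terms factored out
        out[j] = (Strain_image[fxi][fyi]         * (1 - dfxi) +
                  Strain_image[fxi + 1][fyi]     * dfxi) * wy0 +
                 (Strain_image[fxi][fyi + 1]     * (1 - dfxi) +
                  Strain_image[fxi + 1][fyi + 1] * dfxi) * dfyi;

        xi += x_step;
    }
    yi += y_step;
}

Beyond that, Strain_image[fxi][...] and Strain_image[fxi + 1][...] still chase two pointers per sample; storing Strain_image in one flat array as well would remove that indirection.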

Related

has invalid type for 'reduction' - OpenMP for C++

I have used OpenMP to parallelize my C++ code as below:
int shell_num = 50, grparallel[shell_num], grbot[shell_num];
double p_x, p_y, grp[shell_num];
for (int f = 0; f < shell_num; f++)
{
    grp[f] = 0;
    grparallel[f] = 0;
    grbot[f] = 0;
}
//some code...
#pragma omp parallel for reduction(+ : grp,grparallel,grbot)
for (int i = 0; i < N; i++) { //some code
    for (int j = 0; j < N; j++) {
        if (j == i) continue;
        double delta_x = x[i] - x[j],
               delta_y = y[i] - y[j],
               e_dot_e = e_x[i] * e_x[j] + e_y[i] * e_y[j],
               e_cross_e = e_x[i] * e_y[j] - e_y[i] * e_x[j];
        if (j > i)
        {
            double fasele = sqrt(dist(x[i], y[i], x[j], y[j], L));
            for (int h = 0; h < shell_num; h++) // determine which shell the periodic distance between i and j is in
            {
                if (L * h / 100 < fasele && fasele < L * (h + 1) / 100)
                {
                    grp[h] += e_dot_e;
                    double pdotr = abs(periodic(delta_x, L) * p_x + periodic(delta_y, L) * p_y) / fasele;
                    if (pdotr > 0.9659)
                    {
                        grparallel[h] += 1;
                    }
                    else if (pdotr < 0.2588)
                    {
                        grbot[h] += 1;
                    }
                    break;
                }
            }
        }
    }
}
When I compile the code in the terminal, I get this error:
‘grp’ has invalid type for ‘reduction’
The same error occurs for grparallel and grbot.
How can I remove the error?
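The likely cause is that shell_num is a plain int, which makes grp, grparallel and grbot variable-length arrays, and a VLA is not a valid reduction variable. A minimal sketch of one fix, assuming a compiler with OpenMP 4.5 support (e.g. GCC 6+) and a simplified, hypothetical loop body: make shell_num a compile-time constant and reduce over array sections.

#include <omp.h>

const int shell_num = 50;   // compile-time constant: a VLA has an invalid type for 'reduction'

void shell_counts(const double* fasele, int N, double L, double* grp_out)
{
    double grp[shell_num] = {0.0};

    // OpenMP 4.5 array-section reduction: every thread gets a private copy
    // of the whole array, and the copies are summed element-wise on exit.
    #pragma omp parallel for reduction(+ : grp[:shell_num])
    for (int i = 0; i < N; i++) {
        for (int h = 0; h < shell_num; h++) {
            if (L * h / 100 < fasele[i] && fasele[i] < L * (h + 1) / 100) {
                grp[h] += 1.0;   // simplified stand-in for the question's accumulations
                break;
            }
        }
    }

    for (int h = 0; h < shell_num; h++)
        grp_out[h] = grp[h];
}

With OpenMP 4.5, arrays of complete (fixed) type can also be listed directly, as in reduction(+ : grp, grparallel, grbot); the essential change is removing the variable length.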

meshgrid MATLAB to C++ - different results

Here is the MATLAB code I'm trying to convert to C++, where size(Iorig) == 1334 x 2026:
%% label checkers
Label = zeros(size(Iorig));
Margins = 11;
[X,Y] = meshgrid(1:size(Iorig,2), 1:size(Iorig,1));
k = 1;
for i = 1:4
    for j = 1:6
        rr = rect{i,j};
        x1 = rr(1);
        x2 = rr(1) + rr(3);
        y1 = rr(2);
        y2 = rr(2) + rr(4);
        Label(X >= x1+Margins & X < x2-Margins & Y >= y1+Margins & Y < y2-Margins) = k;
        k = k + 1;
    end
end
I understand that we want to label the rectangles found in the previous step (there are 24 of them), but I don't understand how to convert this line into simple C++ code without allocating huge X and Y buffers that basically just hold... indices.
Thanks for your help. Here is what I started doing:
//label Checkers
List<List<int>^>^ label = gcnew List<List<int>^>();
int margins = 11;
int k = 1;
for (size_t i = 0; i < 4; i++)
{
    for (size_t j = 0; j < 6; j++)
    {
        MacbethCheckerBatchesColor^ rect = autoDetectMacbethResult[i * 6 + j];
        Point^ r = rect->Points[0];
        int x1 = r->X;
        int y1 = r->Y;
        r = rect->Points[2];
        int x2 = r->X;
        int y2 = r->Y;
        for (int h = 0; h < inputImage->HeightLines; h++)
        {
            List<int>^ tempRow = gcnew List<int>();
            for (int w = 0; w < inputImage->WidthColumns; w++)
            {
                if ((w >= x1 + margins) & (w < x2 - margins) & (h >= y1 + margins) & (h < y2 - margins))
                {
                    tempRow->Add(k);
                }
                else
                {
                    tempRow->Add(0);
                }
            }
            label->Add(tempRow);
        }
        k = k + 100; // I tried here many other numbers... same result
    }
}
Here is my result. Can you please help me find my mistake? The rectangles are the same, so I guess I have some other logical mistake.
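Two things stand out. First, the posted loop appends a full image worth of rows to label once per rectangle, so after 24 rectangles label holds 24 stacked images instead of one. Second, the meshgrid comparison only ever selects an axis-aligned sub-rectangle, so in C++ you can skip the X/Y index buffers entirely and loop over just that sub-rectangle. A hedged native-C++ sketch (the Rect struct and labelCheckers signature are hypothetical stand-ins for the question's MacbethCheckerBatchesColor data, and the rectangles are assumed to lie inside the image):

#include <vector>

struct Rect { int x1, y1, x2, y2; };   // hypothetical: corners of one detected checker

// Counterpart of the MATLAB loop: one label image allocated once; for each
// rectangle, write k only into its clipped inner region instead of comparing
// full-size X/Y index grids.
std::vector<std::vector<int>> labelCheckers(const std::vector<Rect>& rects,
                                            int height, int width, int margins = 11)
{
    std::vector<std::vector<int>> label(height, std::vector<int>(width, 0));
    int k = 1;
    for (const Rect& r : rects) {
        for (int h = r.y1 + margins; h < r.y2 - margins; h++)
            for (int w = r.x1 + margins; w < r.x2 - margins; w++)
                label[h][w] = k;
        k++;
    }
    return label;
}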

Can the following C++ code be parallelized, with OpenMP, for better performance?

I have code that does Singular Value Decomposition (SVD) for square matrices. The code does the job; however, it is quite slow, and as the matrix size increases it becomes unbearable. Since I am not familiar with parallel programming, I am asking for advice from experts before I start digging deeper and eventually realize that what I want to achieve is not even possible.
Thank you in advance.
void SVD::decompose() {
    bool flag;
    int i, its, j, jj, k, l, nm;
    double anorm, c, f, g, h, s, scale, x, y, z;
    Row rv1(n);
    g = scale = anorm = 0.0; // Householder reduction to bidiagonal form.
    for (i = 0; i < n; i++) {
        l = i + 2;
        rv1[i] = scale * g;
        g = s = scale = 0.0;
        if (i < m) {
            for (k = i; k < m; k++) scale += abs(u[k][i]);
            if (scale != 0.0) {
                for (k = i; k < m; k++) {
                    u[k][i] /= scale;
                    s += u[k][i] * u[k][i];
                }
                f = u[i][i];
                g = -SIGN(sqrt(s), f);
                h = f * g - s;
                u[i][i] = f - g;
                for (j = l - 1; j < n; j++) {
                    for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
                    f = s / h;
                    for (k = i; k < m; k++) u[k][j] += f * u[k][i];
                }
                for (k = i; k < m; k++) u[k][i] *= scale;
            }
        }
        w[i] = scale * g;
        g = s = scale = 0.0;
        if (i + 1 <= m && i + 1 != n) {
            for (k = l - 1; k < n; k++) scale += abs(u[i][k]);
            if (scale != 0.0) {
                for (k = l - 1; k < n; k++) {
                    u[i][k] /= scale;
                    s += u[i][k] * u[i][k];
                }
                f = u[i][l - 1];
                g = -SIGN(sqrt(s), f);
                h = f * g - s;
                u[i][l - 1] = f - g;
                for (k = l - 1; k < n; k++) rv1[k] = u[i][k] / h;
                for (j = l - 1; j < m; j++) {
                    for (s = 0.0, k = l - 1; k < n; k++) s += u[j][k] * u[i][k];
                    for (k = l - 1; k < n; k++) u[j][k] += s * rv1[k];
                }
                for (k = l - 1; k < n; k++) u[i][k] *= scale;
            }
        }
        anorm = MAX(anorm, (abs(w[i]) + abs(rv1[i])));
    }
    for (i = n - 1; i >= 0; i--) { // Accumulation of right-hand transformations.
        if (i < n - 1) {
            if (g != 0.0) {
                for (j = l; j < n; j++) // Double division to avoid possible underflow.
                    v[j][i] = (u[i][j] / u[i][l]) / g;
                for (j = l; j < n; j++) {
                    for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
                    for (k = l; k < n; k++) v[k][j] += s * v[k][i];
                }
            }
            for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
        }
        v[i][i] = 1.0;
        g = rv1[i];
        l = i;
    }
    for (i = MIN(m, n) - 1; i >= 0; i--) { // Accumulation of left-hand transformations.
        l = i + 1;
        g = w[i];
        for (j = l; j < n; j++) u[i][j] = 0.0;
        if (g != 0.0) {
            g = 1.0 / g;
            for (j = l; j < n; j++) {
                for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
                f = (s / u[i][i]) * g;
                for (k = i; k < m; k++) u[k][j] += f * u[k][i];
            }
            for (j = i; j < m; j++) u[j][i] *= g;
        }
        else for (j = i; j < m; j++) u[j][i] = 0.0;
        ++u[i][i];
    }
    for (k = n - 1; k >= 0; k--) { // Diagonalization of the bidiagonal form: loop over
        for (its = 0; its < 30; its++) { // singular values, and over allowed iterations.
            flag = true;
            for (l = k; l >= 0; l--) { // Test for splitting.
                nm = l - 1;
                if (l == 0 || abs(rv1[l]) <= eps * anorm) {
                    flag = false;
                    break;
                }
                if (abs(w[nm]) <= eps * anorm) break;
            }
            if (flag) {
                c = 0.0; // Cancellation of rv1[l], if l > 0.
                s = 1.0;
                for (i = l; i < k + 1; i++) {
                    f = s * rv1[i];
                    rv1[i] = c * rv1[i];
                    if (abs(f) <= eps * anorm) break;
                    g = w[i];
                    h = pythag(f, g);
                    w[i] = h;
                    h = 1.0 / h;
                    c = g * h;
                    s = -f * h;
                    for (j = 0; j < m; j++) {
                        y = u[j][nm];
                        z = u[j][i];
                        u[j][nm] = y * c + z * s;
                        u[j][i] = z * c - y * s;
                    }
                }
            }
            z = w[k];
            if (l == k) { // Convergence.
                if (z < 0.0) { // Singular value is made nonnegative.
                    w[k] = -z;
                    for (j = 0; j < n; j++) v[j][k] = -v[j][k];
                }
                break;
            }
            x = w[l]; // Shift from bottom 2-by-2 minor.
            nm = k - 1;
            y = w[nm];
            g = rv1[nm];
            h = rv1[k];
            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
            g = pythag(f, 1.0);
            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
            c = s = 1.0; // Next QR transformation:
            for (j = l; j <= nm; j++) {
                i = j + 1;
                g = rv1[i];
                y = w[i];
                h = s * g;
                g = c * g;
                z = pythag(f, h);
                rv1[j] = z;
                c = f / z;
                s = h / z;
                f = x * c + g * s;
                g = g * c - x * s;
                h = y * s;
                y *= c;
                for (jj = 0; jj < n; jj++) {
                    x = v[jj][j];
                    z = v[jj][i];
                    v[jj][j] = x * c + z * s;
                    v[jj][i] = z * c - x * s;
                }
                z = pythag(f, h);
                w[j] = z; // Rotation can be arbitrary if z = 0.
                if (z) {
                    z = 1.0 / z;
                    c = f * z;
                    s = h * z;
                }
                f = c * g + s * y;
                x = c * y - s * g;
                for (jj = 0; jj < m; jj++) {
                    y = u[jj][j];
                    z = u[jj][i];
                    u[jj][j] = y * c + z * s;
                    u[jj][i] = z * c - y * s;
                }
            }
            rv1[l] = 0.0;
            rv1[k] = f;
            w[k] = x;
        }
    }
}
Parts of your code can certainly be parallelized. How much you gain, that is another question.
The easy way would be to use a common math library.
The fun way would be to maybe use OpenMP to do it yourself.
But before you even think about OpenMP, consider rearranging your indices. You tend to loop over the first index a lot, as in for (k = i; k < m; k++) u[k][i] *= scale;. This has a very bad cache hit rate in C++, because u[k][i] is basically u[k * second_index_size + i]. If you swap the indices you get for (k = i; k < m; k++) u[i][k] *= scale;, which makes perfect use of the cache.
You should see quite a speedup by implementing this.
Now for the OpenMP part.
Find out where the hot regions in your code are, for example with a profiler such as the one in Visual Studio. Then you can use OpenMP to parallelize certain for loops, like
#pragma omp parallel for
for (k = i; k < m; k++) u[i][k] *= scale;
What you will gain depends on where the hot regions are and how big your matrices are. Benchmarks will have to show.
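To make that concrete, here is a hedged sketch applied to one of the inner-product loops from decompose() (it assumes the u, i, j, m from the question): because every thread adds into the same scalar, the loop needs a reduction clause rather than a plain parallel for.

double s = 0.0;
// Each thread accumulates a private partial sum; OpenMP adds the partial
// sums into s when the loop ends.
#pragma omp parallel for reduction(+ : s)
for (int k = i; k < m; k++)
    s += u[k][i] * u[k][j];

Whether this pays off depends on m; for small matrices the thread start-up cost can outweigh the loop itself.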

Convert non-contiguous dot product to NEON assembly

Instead of a normal dot product, my application requires a slightly modified version. Here is the original C++ code:
for (int m = 0; m < k; m++) {
    for (int n = 0; n < l; n++) {
        for (int t = 0; t < dims2[2]; t++) {
            for (int dm = 0; dm < dims2[1]; dm++) {
                for (int dn = 0; dn < dims2[0]; dn++) {
                    int ai = (n + dn) + (m + dm) * dims1[0] + t * dims1[0] * dims1[1];
                    int bi = dn + dm * dims2[0] + t * dims2[0] * dims2[1];
                    total += A[ai] * B[bi];
                }
            }
        }
    }
}
Feel free to change the order of the loops. How do I convert this to NEON assembly?
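The question's element type isn't given, so this sketch assumes float data, and it uses NEON intrinsics rather than hand-written assembly (the compiler then handles register allocation and scheduling). The key observation: in the innermost loop, ai and bi both advance by exactly 1 as dn increments, so each (m, n, t, dm) combination is a plain contiguous dot product of length dims2[0].

#include <arm_neon.h>

// Contiguous dot product of length len between a and b (assumed float).
static float dot_neon(const float* a, const float* b, int len)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    int dn = 0;
    for (; dn + 4 <= len; dn += 4)
    {
        // acc += a[dn..dn+3] * b[dn..dn+3], four lanes at a time
        acc = vmlaq_f32(acc, vld1q_f32(a + dn), vld1q_f32(b + dn));
    }
    // horizontal sum of the four accumulator lanes
    float32x2_t sum2 = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    float total = vget_lane_f32(vpadd_f32(sum2, sum2), 0);
    for (; dn < len; dn++)          // scalar tail for len % 4 leftovers
        total += a[dn] * b[dn];
    return total;
}

The outer four loops then stay as they are and call dot_neon(&A[n + (m + dm) * dims1[0] + t * dims1[0] * dims1[1]], &B[dm * dims2[0] + t * dims2[0] * dims2[1]], dims2[0]) in place of the dn loop.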

Laplacian does not give desired output

Here is my code:
int Factor = 3, offset = 0, k, l, p, q;
IplImage* image = cvCreateImage(cvSize(img->width, img->height), img->depth, img->nChannels);
cvCopy(img, image, 0);
long double mean = 0, nTemp = 0, c, sum = 0, n = 0, s = 0, d = 0;
int i = 0, j = 0, krow, kcol;
kernel[0][0] = kernel[0][2] = kernel[2][0] = kernel[2][2] = 0;
kernel[0][1] = kernel[1][0] = kernel[1][2] = kernel[2][1] = 1;
kernel[1][1] = -4;
uchar* temp_ptr = 0;
int rows = image->height, cols = image->width, row, col;
// calculate the mean of image and deviation
for (row = 1; row < rows - 2; row++)
{
    for (col = 1; col < cols - 2; col++)
    {
        nTemp = 0.0;
        for (p = 0, krow = -1; p < 3; krow++, p++)
        {
            for (q = 0, kcol = -1; q < 3; kcol++, q++)
            {
                temp_ptr = &((uchar*)(image->imageData + (image->widthStep * (row + krow))))[(col + kcol) * 3];
                for (int k = 0; k < 3; k++)
                    Pixel[p][q].val[k] = temp_ptr[k];
            }
        }
        for (i = 0; i < 3; i++)
        {
            for (j = 0; j < 3; j++)
            {
                c = (Pixel[i][j].val[0] + Pixel[i][j].val[1] + Pixel[i][j].val[2]) / Factor;
                nTemp += (double)c * kernel[i][j];
            }
        }
        sum += nTemp;
        n++;
    }
}
mean = ((double)sum / n);
for (row = 1; row < rows - 2; row++)
{
    for (col = 1; col < cols - 2; col++)
    {
        nTemp = 0.0;
        for (p = 0, krow = -1; p < 3; krow++, p++)
        {
            for (q = 0, kcol = -1; q < 3; kcol++, q++)
            {
                temp_ptr = &((uchar*)(image->imageData + (image->widthStep * (row + krow))))[(col + kcol) * 3];
                for (int k = 0; k < 3; k++)
                    Pixel[p][q].val[k] = temp_ptr[k];
            }
        }
        for (i = 0; i < 3; i++)
        {
            for (j = 0; j < 3; j++)
            {
                c = (Pixel[i][j].val[0] + Pixel[i][j].val[1] + Pixel[i][j].val[2]) / Factor;
                nTemp += (double)c * kernel[i][j];
            }
        }
        s = (mean - nTemp);
        d += (s * s);
    }
}
d = d / (n - 1);
d = sqrt(d);
d = d * 2;
// Write to image
for (row = 1; row < rows - 2; row++)
{
    for (col = 1; col < cols - 2; col++)
    {
        nTemp = 0.0;
        for (p = 0, krow = -1; p < 3; krow++, p++)
        {
            for (q = 0, kcol = -1; q < 3; kcol++, q++)
            {
                temp_ptr = &((uchar*)(image->imageData + (image->widthStep * (row + krow))))[(col + kcol) * 3];
                for (int k = 0; k < 3; k++)
                    Pixel[p][q].val[k] = temp_ptr[k];
            }
        }
        for (i = 0; i < 3; i++)
        {
            for (j = 0; j < 3; j++)
            {
                c = (Pixel[i][j].val[0] + Pixel[i][j].val[1] + Pixel[i][j].val[2]) / Factor;
                nTemp += (double)c * kernel[i][j];
            }
        }
        temp_ptr = &((uchar*)(image->imageData + (image->widthStep * row)))[col * 3];
        if (nTemp > d)
            temp_ptr[0] = temp_ptr[1] = temp_ptr[2] = 255;
        else
            temp_ptr[0] = temp_ptr[1] = temp_ptr[2] = 0;
    }
}
Where am I going wrong? I have implemented Gaussian filtering in a similar manner; is there anything wrong in my algorithm?
It seems that your code (labeled "Write to image") overwrites the input image during the calculations. This is not good. Create a copy of the image, calculate its pixels, then delete the original image.
I noticed that your code is needlessly complex and inefficient. You don't need to convolve the image before calculating its mean: convolving just multiplies the mean by the sum of the kernel entries.
Also, since your convolution kernel sums to zero, the mean you get after convolving with it will also (almost) be zero. The only non-zero contributions will come from the edges of the image. I rather doubt that's actually what you want to calculate (and if it is, you could save a lot of time by summing only over the edges in the first place).
Third, as pointed out in another answer (I missed this one myself), you're not averaging over all the color channels, you're averaging over the red channel three times. (Besides, you should probably use a weighted average anyway, after applying gamma correction.)
And finally, as anatolyg said, you're overwriting the image data before you're done reading it. There are several ways to fix that, but the easiest is to write your output to a separate buffer.
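A minimal sketch of that last fix in the question's own OpenCV C API (the variable names mirror the question, and the elided middle is the unchanged convolution): allocate a separate output image and write the thresholded values there, so the input is never modified while later 3x3 neighbourhoods still need it.

// Separate output buffer: the "Write to image" pass reads 3x3 neighbourhoods
// from `image` but stores its results into `output`.
IplImage* output = cvCreateImage(cvSize(image->width, image->height),
                                 image->depth, image->nChannels);

// ... compute nTemp for (row, col) from `image` exactly as before ...

temp_ptr = &((uchar*)(output->imageData + output->widthStep * row))[col * 3];
if (nTemp > d)
    temp_ptr[0] = temp_ptr[1] = temp_ptr[2] = 255;
else
    temp_ptr[0] = temp_ptr[1] = temp_ptr[2] = 0;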