Manual vectorization/SSE for a complex problem in C++

Manual vectorization/SSE for a complex problem in C++ - c++

I want to speed up my algorithm, which evaluates an objective function f(x). The problem dimension is 5000. I have already made a lot of improvements to the code, but the calculation time still does not meet my expectations.
Most of the datasets are allocated dynamically as (float*)_mm_malloc(N_h*sizeof(float),16). In the objective function, where "long" for loops are present, I successfully applied _mm_mul_ps, _mm_rcp_ps, _mm_store_ps, etc. to __m128 variables. I also introduced threading (_beginthreadex) to speed up the slowest code.
But there is a part of code which cannot be vectorized easily...
I have attached the most problematic code (the slowest calculation), where I still cannot make an improvement (reminder: this is part of a bigger calculation, but my problem can be seen in it). I am expecting vector calculations, but I get a scalar calculation for each line of code (a lot of MOVSS, MULSS, SUBSS, etc. in the assembly). Could someone give me a hint as to what the problem might be?
I am using MinGW GCC-8.2.0-3 compiler on Windows machine with -O3 -march=native -ffast-math flags.
#include <immintrin.h>
#include "math.h"
// Problem dimension: number of floats in each of the buffers declared below.
#define N_h 5000
// Buffers used by main(). They are only declared in this snippet; the trailing
// comments record how the full application allocates them.
float* x_vec; // allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data0; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data1; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data2; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data3; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
int main()
{
float* q_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xx_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* cP_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xPtr = x_vec;
float* f32Ptr;
float c0;
int n = N_h;
int sum = 0;
while(n > 0)
{
int k=1;
n-=8;
cP_vec[0] = 1;
cP_vec[1] = 1;
cP_vec[2] = 1;
cP_vec[3] = 1;
cP_vec[4] = 1;
cP_vec[5] = 1;
cP_vec[6] = 1;
cP_vec[7] = 1;
//preload of x data shall be done with vector preload, currently it is row-by-row **MOVS**
xx_vec[0] = *xPtr++;
xx_vec[1] = *xPtr++;
xx_vec[2] = *xPtr++;
xx_vec[3] = *xPtr++;
xx_vec[4] = *xPtr++;
xx_vec[5] = *xPtr++;
xx_vec[6] = *xPtr++;
xx_vec[7] = *xPtr++;
c0 = data0[k];
//I am expecting vector subtraction here, but each of the row generates almost same assembly code
q_vec[0] = xx_vec[0] - c0;
q_vec[1] = xx_vec[1] - c0;
q_vec[2] = xx_vec[2] - c0;
q_vec[3] = xx_vec[3] - c0;
q_vec[4] = xx_vec[4] - c0;
q_vec[5] = xx_vec[5] - c0;
q_vec[6] = xx_vec[6] - c0;
q_vec[7] = xx_vec[7] - c0;
//if I create more internal variable for all of the multiplication, does it help?
cP_vec[0] = cP_vec[0] * data1[k] * exp(-pow(q_vec[0], 2.0f) * data2[k]);
cP_vec[1] = cP_vec[1] * data1[k] * exp(-pow(q_vec[1], 2.0f) * data2[k]);
cP_vec[2] = cP_vec[2] * data1[k] * exp(-pow(q_vec[2], 2.0f) * data2[k]);
cP_vec[3] = cP_vec[3] * data1[k] * exp(-pow(q_vec[3], 2.0f) * data2[k]);
cP_vec[4] = cP_vec[4] * data1[k] * exp(-pow(q_vec[4], 2.0f) * data2[k]);
cP_vec[5] = cP_vec[5] * data1[k] * exp(-pow(q_vec[5], 2.0f) * data2[k]);
cP_vec[6] = cP_vec[6] * data1[k] * exp(-pow(q_vec[6], 2.0f) * data2[k]);
cP_vec[7] = cP_vec[7] * data1[k] * exp(-pow(q_vec[7], 2.0f) * data2[k]);
k++;
f32Ptr = &data3[k];
for (int j =1; j <= 5; j++) //the index of this for is defined by a variable in my application, so it is not a constant
{
c0 = data0[k];
//here the subtraction and multiplication is not vectoritzed
q_vec[0] = (xx_vec[0] - c0) * (*f32Ptr);
q_vec[1] = (xx_vec[1] - c0) * (*f32Ptr);
q_vec[2] = (xx_vec[2] - c0) * (*f32Ptr);
q_vec[3] = (xx_vec[3] - c0) * (*f32Ptr);
q_vec[4] = (xx_vec[4] - c0) * (*f32Ptr);
q_vec[5] = (xx_vec[5] - c0) * (*f32Ptr);
q_vec[6] = (xx_vec[6] - c0) * (*f32Ptr);
q_vec[7] = (xx_vec[7] - c0) * (*f32Ptr);
q_vec[0] = (0.5f - 0.5f*erf( q_vec[0] ) );
q_vec[1] = (0.5f - 0.5f*erf( q_vec[1] ) );
q_vec[2] = (0.5f - 0.5f*erf( q_vec[2] ) );
q_vec[3] = (0.5f - 0.5f*erf( q_vec[3] ) );
q_vec[4] = (0.5f - 0.5f*erf( q_vec[4] ) );
q_vec[5] = (0.5f - 0.5f*erf( q_vec[5] ) );
q_vec[6] = (0.5f - 0.5f*erf( q_vec[6] ) );
q_vec[7] = (0.5f - 0.5f*erf( q_vec[7] ) );
//here the multiplication is not vectorized...
cP_vec[0] = cP_vec[0] * q_vec[0];
cP_vec[1] = cP_vec[1] * q_vec[1];
cP_vec[2] = cP_vec[2] * q_vec[2];
cP_vec[3] = cP_vec[3] * q_vec[3];
cP_vec[4] = cP_vec[4] * q_vec[4];
cP_vec[5] = cP_vec[5] * q_vec[5];
cP_vec[6] = cP_vec[6] * q_vec[6];
cP_vec[7] = cP_vec[7] * q_vec[7];
f32Ptr++;
k++;
}
sum += cP_vec[0];
sum += cP_vec[1];
sum += cP_vec[2];
sum += cP_vec[3];
sum += cP_vec[4];
sum += cP_vec[5];
sum += cP_vec[6];
sum += cP_vec[7];
}
return 0;
}
You can see the assembly code on Godbolt:
https://godbolt.org/z/wbkNAk
UPDATE:
I have implemented some SSE calculations. The speedup is approx. x1.10-1.15, which is far below what I expected...
Am I doing something wrong in main()?
#include <immintrin.h>
#include "math.h"
// Problem dimension: number of floats in each of the buffers declared below.
#define N_h 5000
// log2 of the number of entries in ExpVar's lookup table (ExpVar::s).
#define EXP_TABLE_SIZE 10
// Four packed floats, all 1.0.
static const __m128 M128_1 = {1.0, 1.0, 1.0, 1.0};
// Buffers used by main(). They are only declared in this snippet; the trailing
// comments record how the full application allocates them.
float* x_vec; // allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data0; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data1; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data2; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data3; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
// Constants and lookup table consumed by exp_ps().
//
// exp_ps() reads the array members through __m128 / __m128i pointers, which
// requires 16-byte alignment. The first member is therefore declared
// alignas(16); every array before `tbl` is 32 bytes long, so all of them stay
// 16-byte aligned. (The stray `typedef` in front of the struct, which declared
// no alias, has been dropped.)
struct ExpVar {
    enum {
        s = EXP_TABLE_SIZE,  // log2 of the table size
        n = 1 << s,          // number of table entries
        f88 = 0x42b00000 /* 88.0 */
    };
    alignas(16) float minX[8];   // lower clamp for the argument (-88)
    float maxX[8];               // upper clamp for the argument (+88)
    float a[8];                  // n / ln(2)
    float b[8];                  // ln(2) / n
    float f1[8];                 // 1.0f
    unsigned int i127s[8];       // IEEE-754 exponent bias (127) shifted left by s
    unsigned int mask_s[8];      // low s bits set: extracts the table index
    unsigned int i7fffffff[8];   // clears the sign bit of a float
    unsigned int tbl[n];         // mantissa bits of 2^(i/n), i = 0..n-1
    union fi {
        float f;
        unsigned int i;
    };
    // Fills the broadcast constants and the mantissa table.
    ExpVar()
    {
        const float log_2 = ::logf(2.0f);
        for (int i = 0; i < 8; i++) {
            maxX[i] = 88;
            minX[i] = -88;
            a[i] = n / log_2;
            b[i] = log_2 / n;
            f1[i] = 1.0f;
            i127s[i] = 127 << s;
            i7fffffff[i] = 0x7fffffff;
            mask_s[i] = mask(s);
        }
        for (int i = 0; i < n; i++) {
            float y = pow(2.0f, (float)i / n);
            fi fi;
            fi.f = y;
            // Keep only the 23 mantissa bits; exp_ps() supplies the exponent.
            tbl[i] = fi.i & mask(23);
        }
    }
    // Returns a value with the low x bits set. Uses no object state, hence static.
    static inline unsigned int mask(int x)
    {
        return (1U << x) - 1;
    }
};
// Approximates exp(x) for four packed floats using the table in *expVar.
//
// Scheme visible in the code: r = round(x * n / ln2); the low s bits of r
// select a mantissa from expVar->tbl, the remaining bits become the float
// exponent, and the residual t = x - r * ln2 / n is handled by the first-order
// factor (1 + t).
//
// NOTE(review): the expVar arrays are dereferenced as __m128 / __m128i, so
// they must be 16-byte aligned — confirm ExpVar guarantees that.
inline __m128 exp_ps(__m128 x, ExpVar* expVar)
{
// |x| as integer bits; for non-negative floats the integer ordering matches
// the float ordering, so an integer compare against the bits of maxX works.
__m128i limit = _mm_castps_si128(_mm_and_ps(x, *(__m128*)expVar->i7fffffff));
int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *(__m128i*)expVar->maxX));
// Clamp to [minX, maxX] only when at least one lane is out of range.
if (over) {
x = _mm_min_ps(x, _mm_load_ps(expVar->maxX));
x = _mm_max_ps(x, _mm_load_ps(expVar->minX));
}
// r = x * a converted to int32 (a = n / ln2).
__m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *(__m128*)(expVar->a)));
// t = x - r * b (b = ln2 / n), then t = 1 + t.
__m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *(__m128*)(expVar->b)));
t = _mm_add_ps(t, *(__m128*)(expVar->f1));
// v4: table index per lane (low s bits of r).
__m128i v4 = _mm_and_si128(r, *(__m128i*)(expVar->mask_s));
// u4: ((r + (127 << s)) >> s) << 23, i.e. the biased exponent moved into the
// exponent field of a float.
__m128i u4 = _mm_add_epi32(r, *(__m128i*)(expVar->i127s));
u4 = _mm_srli_epi32(u4, expVar->s);
u4 = _mm_slli_epi32(u4, 23);
// Pull the four indices out as scalars; 16-bit extracts at word positions
// 2, 4 and 6 read the low halves of 32-bit lanes 1, 2 and 3.
unsigned int v0, v1, v2, v3;
v0 = _mm_cvtsi128_si32(v4);
v1 = _mm_extract_epi16(v4, 2);
v2 = _mm_extract_epi16(v4, 4);
v3 = _mm_extract_epi16(v4, 6);
// Broadcast each table entry, then merge them into
// [tbl[v0], tbl[v1], tbl[v2], tbl[v3]] with movelh + 64-bit shifts + or.
__m128 t0, t1, t2, t3;
t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v0]));
t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v1]));
t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v2]));
t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v3]));
// [e1, e1, e3, e3] shifted left by 32 within each 64-bit half -> [0, e1, 0, e3]
t1 = _mm_movelh_ps(t1, t3);
t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
// [e0, e0, e2, e2] shifted right by 32 within each 64-bit half -> [e0, 0, e2, 0]
t0 = _mm_movelh_ps(t0, t2);
t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
t0 = _mm_or_ps(t0, t1);
// Combine mantissa and exponent bits into one float per lane.
t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
// Apply the (1 + t) correction.
t = _mm_mul_ps(t, t0);
return t;
}
int main()
{
float* q_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xx_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* cP_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xPtr = x_vec;
float* f32Ptr;
__m128 c0,c1;
__m128* m128Var1;
__m128* m128Var2;
float* f32Ptr1;
float* f32Ptr2;
int n = N_h;
int sum = 0;
ExpVar expVar;
while(n > 0)
{
int k=1;
n-=8;
//cP_vec[0] = 1;
f32Ptr1 = cP_vec;
_mm_store_ps(f32Ptr1,M128_1);
f32Ptr1+=4;
_mm_store_ps(f32Ptr1,M128_1);
//preload x data
//xx_vec[0] = *xPtr++;
f32Ptr1 = xx_vec;
m128Var1 = (__m128*)xPtr;
_mm_store_ps(f32Ptr1,*m128Var1);
m128Var1++;
xPtr+=4;
f32Ptr1+=4;
m128Var1 = (__m128*)xPtr;
_mm_store_ps(f32Ptr1,*m128Var1);
xPtr+=4;
c0 = _mm_set1_ps(data0[k]);
m128Var1 = (__m128*)xx_vec;
f32Ptr1 = q_vec;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
//calc -pow(q_vec[0], 2.0f)
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var1) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var1) );
m128Var1 = (__m128*)q_vec;
*m128Var1 = _mm_xor_ps(*m128Var1, _mm_set1_ps(-0.0));
m128Var1++;
*m128Var1 = _mm_xor_ps(*m128Var1, _mm_set1_ps(-0.0));
//-pow(q_vec[0], 2.0f) * data2[k]
c0 = _mm_set1_ps(data2[k]);
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1 = (__m128*)q_vec;
//calc exp(x)
*m128Var1 = exp_ps(*m128Var1,&expVar);
m128Var1++;
*m128Var1 = exp_ps(*m128Var1,&expVar);
//data1[k] * exp(x)
c0 = _mm_set1_ps(data1[k]);
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
//cP_vec[0] * data1[k] * exp(x)
f32Ptr1 = cP_vec;
m128Var1 = (__m128*)cP_vec;
m128Var2 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var2) );
m128Var1++;
m128Var2++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var2) );
k++;
for (int j =1; j <= 5; j++)
{
c0 = _mm_set1_ps(data0[k]);
c1 = _mm_set1_ps(data3[k]);
m128Var1 = (__m128*)xx_vec;
m128Var2 = (__m128*)q_vec;
f32Ptr1 = q_vec;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var2, c1) );
m128Var1++;
m128Var2++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var2, c1) );
q_vec[0] = (0.5f - 0.5f*erf( q_vec[0] ) );
q_vec[1] = (0.5f - 0.5f*erf( q_vec[1] ) );
q_vec[2] = (0.5f - 0.5f*erf( q_vec[2] ) );
q_vec[3] = (0.5f - 0.5f*erf( q_vec[3] ) );
q_vec[4] = (0.5f - 0.5f*erf( q_vec[4] ) );
q_vec[5] = (0.5f - 0.5f*erf( q_vec[5] ) );
q_vec[6] = (0.5f - 0.5f*erf( q_vec[6] ) );
q_vec[7] = (0.5f - 0.5f*erf( q_vec[7] ) );
cP_vec[0] = cP_vec[0] * q_vec[0];
cP_vec[1] = cP_vec[1] * q_vec[1];
cP_vec[2] = cP_vec[2] * q_vec[2];
cP_vec[3] = cP_vec[3] * q_vec[3];
cP_vec[4] = cP_vec[4] * q_vec[4];
cP_vec[5] = cP_vec[5] * q_vec[5];
cP_vec[6] = cP_vec[6] * q_vec[6];
cP_vec[7] = cP_vec[7] * q_vec[7];
k++;
}
sum += cP_vec[0];
sum += cP_vec[1];
sum += cP_vec[2];
sum += cP_vec[3];
sum += cP_vec[4];
sum += cP_vec[5];
sum += cP_vec[6];
sum += cP_vec[7];
}
return 0;
}
https://godbolt.org/z/N7K6j0

The code is super weird with all those explicit stores into memory instead of plain old variables. I've tried to make it less weird, and added a vectorized erf, which is the main computation. Since I don't know what this code is supposed to do I can't really test it, except for performance, which did get better.
while (n > 0)
{
int k = 1;
n -= 8;
//preload x data
__m128 x_0 = _mm_load_ps(xPtr);
__m128 x_1 = _mm_load_ps(xPtr + 4);
xPtr += 8;
__m128 c0 = _mm_set1_ps(data0[k]);
__m128 q_0 = _mm_sub_ps(x_0, c0);
__m128 q_1 = _mm_sub_ps(x_1, c0);
//pow(q_vec, 2.0f)
__m128 t_0 = _mm_mul_ps(q_0, q_0);
__m128 t_1 = _mm_mul_ps(q_1, q_1);
//-pow(q_vec[0], 2.0f) * data2[k]
__m128 neg_data2k = _mm_xor_ps(_mm_set1_ps(data2[k]), _mm_set1_ps(-0.0));
t_0 = _mm_mul_ps(t_0, neg_data2k);
t_1 = _mm_mul_ps(t_1, neg_data2k);
//exp(-pow(q_vec[0], 2.0f) * data2[k])
t_0 = fast_exp_sse(t_0);
t_1 = fast_exp_sse(t_1);
//cP = data1[k] * exp(...)
c0 = _mm_set1_ps(data1[k]);
__m128 cP_0 = _mm_mul_ps(t_0, c0);
__m128 cP_1 = _mm_mul_ps(t_1, c0);
k++;
for (int j = 1; j <= 5; j++)
{
__m128 data0k = _mm_set1_ps(data0[k]);
__m128 data3k = _mm_set1_ps(data3[k]);
// q = (x - data0k) * data3k;
q_0 = _mm_mul_ps(_mm_sub_ps(x_0, data0k), data3k);
q_1 = _mm_mul_ps(_mm_sub_ps(x_1, data0k), data3k);
// q = 0.5 - 0.5 * erf(q)
__m128 half = _mm_set1_ps(0.5);
q_0 = _mm_sub_ps(half, _mm_mul_ps(half, erf_sse(q_0)));
q_1 = _mm_sub_ps(half, _mm_mul_ps(half, erf_sse(q_1)));
// cP = cP * q;
cP_0 = _mm_mul_ps(cP_0, q_0);
cP_1 = _mm_mul_ps(cP_1, q_1);
k++;
}
__m128 t = _mm_add_ps(cP_0, cP_1);
t = _mm_hadd_ps(t, t);
t = _mm_hadd_ps(t, t);
sum += _mm_cvtss_f32(t);
}
For erf I used:
// Approximates erf(x) for four packed floats.
//
// Uses erf(x) ~= 1 - 1 / (1 + a1*x + a2*x^2 + ... + a6*x^6)^16, which is only
// valid for x >= 0. The polynomial is therefore evaluated on |x| and the sign
// of x is restored afterwards (erf is odd). Without that step negative inputs
// give results far outside [-1, 1] (about -4.9 for x = -3).
__m128 erf_sse(__m128 x)
{
    const __m128 a1 = _mm_set1_ps(0.0705230784);
    const __m128 a2 = _mm_set1_ps(0.0422820123);
    const __m128 a3 = _mm_set1_ps(0.0092705272);
    const __m128 a4 = _mm_set1_ps(0.0001520143);
    const __m128 a5 = _mm_set1_ps(0.0002765672);
    const __m128 a6 = _mm_set1_ps(0.0000430638);
    const __m128 one = _mm_set1_ps(1);

    // Split x into sign bit and magnitude.
    const __m128 sign_mask = _mm_set1_ps(-0.0f);
    const __m128 sign = _mm_and_ps(x, sign_mask);
    const __m128 ax = _mm_andnot_ps(sign_mask, x);

    // Horner evaluation of 1 + a1*ax + ... + a6*ax^6.
    __m128 p = _mm_add_ps(one,
        _mm_mul_ps(ax, _mm_add_ps(a1,
        _mm_mul_ps(ax, _mm_add_ps(a2,
        _mm_mul_ps(ax, _mm_add_ps(a3,
        _mm_mul_ps(ax, _mm_add_ps(a4,
        _mm_mul_ps(ax, _mm_add_ps(a5,
        _mm_mul_ps(ax, a6))))))))))));

    // p^16 by squaring four times.
    p = _mm_mul_ps(p, p);
    p = _mm_mul_ps(p, p);
    p = _mm_mul_ps(p, p);
    p = _mm_mul_ps(p, p);

    // erf(|x|) is non-negative (p >= 1), so OR-ing the sign bit back in negates
    // the result exactly for the lanes where x was negative.
    const __m128 magnitude = _mm_sub_ps(one, _mm_div_ps(one, p));
    return _mm_or_ps(magnitude, sign);
}
I'm not too sure about this one; it's just a formula from Wikipedia transcribed into SSE intrinsics, using Horner's scheme to evaluate the polynomial. There is probably a better way.
For fast_exp_sse the usual combination of exponent extraction and a polynomial approximation. Going through a huge lookup table is a good way to ruin the SIMD gains.

Related

SSE mean filter in c++ and OpenCV

I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.

Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
// Same sliding column-sum scheme as Mean_3_3_scalar, but working on raw row
// pointers (3 interleaved bytes per pixel) instead of Mat::at. Border pixels
// are not written.
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
    for (int row = 1; row < image_in.rows - 1; ++row)
    {
        const uint8_t * const above = image_in.ptr(row - 1);
        const uint8_t * const here = image_in.ptr(row);
        const uint8_t * const below = image_in.ptr(row + 1);
        uint8_t * const dst = image_out.ptr(row);

        // Vertical sums per channel for pixel columns 0 and 1.
        int left[3];
        int mid[3];
        for (int ch = 0; ch < 3; ++ch)
        {
            left[ch] = above[ch] + here[ch] + below[ch];
            mid[ch] = above[3 + ch] + here[3 + ch] + below[3 + ch];
        }

        for (int col = 1; col < image_in.cols - 1; ++col)
        {
            const int base = col * 3; // byte offset of pixel `col`
            for (int ch = 0; ch < 3; ++ch)
            {
                // Vertical sum of the pixel one column to the right.
                const int ahead = base + 3 + ch;
                const int right = above[ahead] + here[ahead] + below[ahead];
                dst[base + ch] = (left[ch] + mid[ch] + right + 4) / 9;
                // Slide the window one column to the right.
                left[ch] = mid[ch];
                mid[ch] = right;
            }
        }
    }
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This is a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
// Loads 16 bytes from src + offset (unaligned) and zero-extends them to
// 16-bit lanes: vh receives bytes 0..7, vl receives bytes 8..15.
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i packed = _mm_loadu_si128((__m128i *)(src + offset));
    // Interleaving with zero widens each byte to an unsigned 16-bit value.
    vh = _mm_unpacklo_epi8(packed, zero);
    vl = _mm_unpackhi_epi8(packed, zero);
}
// Packs two vectors of 16-bit lanes into 16 bytes with unsigned saturation
// (vh becomes bytes 0..7, vl bytes 8..15) and stores them, unaligned, at
// dest + offset.
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
    __m128i * const target = (__m128i *)(dest + offset);
    _mm_storeu_si128(target, _mm_packus_epi16(vh, vl));
}
// Treating v0 followed by v1 as one sequence of sixteen 16-bit lanes, returns
// the eight lanes starting at lane SHIFT, i.e. lanes SHIFT..7 of v0 followed
// by lanes 0..SHIFT-1 of v1 (_mm_alignr_epi8 takes a byte count, hence
// * sizeof(short)).
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
// Treating v0 followed by v1 as one sequence of sixteen 16-bit lanes, returns
// the eight lanes starting at lane 8 - SHIFT, i.e. the last SHIFT lanes of v0
// followed by lanes 0..7-SHIFT of v1.
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
// SSE 3x3 mean filter over 8-bit images with CHANNELS interleaved channels.
//
// For every interior row it first adds rows y-1, y and y+1 vertically as
// 16-bit lanes, 16 bytes at a time, then adds to each lane its neighbours
// CHANNELS lanes to the left and right (the same channel of the adjacent
// pixels), scales the 9-sample sum by ~1/9 with rounding and stores 16 output
// bytes. The vertical sums of the previous, current and next 8-lane groups are
// carried across iterations so each input byte is loaded once per row.
//
// NOTE(review): each iteration loads at x + 16 and stores at x for x up to
// nx0 - 16, where nx0 is rounded UP to a multiple of 16 — this looks like it
// can read and write past the end of a row; confirm the buffers have padding.
// NOTE(review): stores start at x = 0, so the border columns of each processed
// row appear to be written as well — confirm that is acceptable to callers.
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
// Fixed-point 1/9 with 15 fractional bits, for use with _mm_mulhrs_epi16.
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
// vsuml_1: vertical sums of the 8 lanes preceding the current 16 bytes;
// vsumh0 / vsuml0: vertical sums of the current 16 bytes (first / last 8).
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
// Nothing precedes the first block, so its left neighbours are zero.
vsuml_1 = _mm_set1_epi16(0);
// Prime the current block: vertical sum of bytes 0..15 over the three rows.
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
// Vertical sum of the NEXT 16 bytes (needed for right-hand neighbours).
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
// centre + neighbour CHANNELS lanes to the left ...
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
// ... + neighbour CHANNELS lanes to the right
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
// Slide the window: the next block becomes the current one.
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
// Entry point for the SSE mean filter: selects the implementation that matches
// the image's channel count (1 or 3). Any other count throws the string
// literal "Unsupported format.".
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
    const int channels = image_in.channels();
    if (channels == 1)
    {
        Mean_3_3_SSE_Impl<1>(image_in, image_out);
    }
    else if (channels == 3)
    {
        Mean_3_3_SSE_Impl<3>(image_in, image_out);
    }
    else
    {
        throw("Unsupported format.");
    }
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased maintainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.

Closest Two 3D Point between two Line Segment of varied Magnitude in Different Plane(SOLVED)

Let's say we have four 3D points: AB1, AB2, CD1 and CD2. AB1 and AB2 define one line segment, and CD1 and CD2 define another. The two line segments are not in the same plane.
AP is a point on line segment AB1–AB2,
BP is a point on line segment CD1–CD2.
Point1 and Point2 are the two such points that are closest to each other (they give the shortest distance between the two line segments).
Now, how can I find these two points, Point1 and Point2? What method should I use?
Below is only partially solved For full solution please See this answer here... because This function does not work when Two Line is on the same plane...
Thanks to #MBo I have come across Geometry GoldMine of Code and Explanations! They have Many Source Code Contributors! i picked one from there here it is clean and great!
// Finds the shortest connecting segment between the infinite line through
// p1, p2 and the infinite line through p3, p4.
//
// On success returns true; resultSegmentPoint1 is the closest point on line
// p1-p2 and resultSegmentPoint2 the closest point on line p3-p4. The points
// are NOT clamped to the segments [p1, p2] / [p3, p4].
//
// Returns false, with both outputs left at {0, 0, 0}, when no unique answer
// exists: either pair of points coincides (zero-length direction) or the two
// lines are parallel. These checks were commented out before, so such inputs
// divided by zero and still reported success.
bool CalculateLineLineIntersection(Vector3D p1, Vector3D p2, Vector3D p3, Vector3D p4, Vector3D& resultSegmentPoint1, Vector3D& resultSegmentPoint2)
{
    // Algorithm is ported from the C algorithm of
    // Paul Bourke at http://local.wasp.uwa.edu.au/~pbourke/geometry/lineline3d/

    // Lines are treated as parallel when sin^2 of the angle between their
    // directions is at or below this threshold.
    const double kParallelTolerance = 1e-12;

    resultSegmentPoint1 = { 0,0,0 };
    resultSegmentPoint2 = { 0,0,0 };

    Vector3D p13 = VectorMinus(p1, p3);
    Vector3D p43 = VectorMinus(p4, p3); // direction of line p3-p4
    Vector3D p21 = VectorMinus(p2, p1); // direction of line p1-p2

    // Dot products, accumulated in double precision.
    double d1343 = p13.x * (double)p43.x + (double)p13.y * p43.y + (double)p13.z * p43.z;
    double d4321 = p43.x * (double)p21.x + (double)p43.y * p21.y + (double)p43.z * p21.z;
    double d1321 = p13.x * (double)p21.x + (double)p13.y * p21.y + (double)p13.z * p21.z;
    double d4343 = p43.x * (double)p43.x + (double)p43.y * p43.y + (double)p43.z * p43.z;
    double d2121 = p21.x * (double)p21.x + (double)p21.y * p21.y + (double)p21.z * p21.z;

    // denom = |p21|^2 * |p43|^2 * sin^2(angle). It is zero when either
    // direction has zero length or the lines are parallel. The negated
    // comparison also rejects NaN input.
    double denom = d2121 * d4343 - d4321 * d4321;
    if (!(denom > kParallelTolerance * d2121 * d4343)) {
        return false;
    }

    // Parameters along each line of the closest points. denom > 0 implies
    // d4343 > 0, so both divisions are safe.
    double numer = d1343 * d4321 - d1321 * d4343;
    double mua = numer / denom;
    double mub = (d1343 + d4321 * (mua)) / d4343;

    resultSegmentPoint1.x = (float)(p1.x + mua * p21.x);
    resultSegmentPoint1.y = (float)(p1.y + mua * p21.y);
    resultSegmentPoint1.z = (float)(p1.z + mua * p21.z);
    resultSegmentPoint2.x = (float)(p3.x + mub * p43.x);
    resultSegmentPoint2.y = (float)(p3.y + mub * p43.y);
    resultSegmentPoint2.z = (float)(p3.z + mub * p43.z);
    return true;
}
So Far I have Tried All these Below which works only when both Line segments have the same Magnitude...
Link 1
Link 2
I tried Calculating the centroid of both line segments and calculating the nearest Point on Segment From the midpoint. (I know how to calculate the Closest Point line segment from another Point)
But This only works when Both Line segments are of equal length AND each of Both the Linesegment's MidPoint is perpendicular to Each other and the centroid...
NOTE:Visual Geometry Geogbra3D for a visual representation of these Points
NOTE:AB1CD means From Point AB1 to Line CD(not segment)
AB1 = (6.550000, -7.540000, 0.000000 )
AB2 = (4.540000, -3.870000, 6.000000 )
CD1 = (0.000000, 8.000000, 3.530000 )
CD2 = (0.030000, -7.240000, -1.340000 )
PointCD1AB = (3.117523, -1.272742, 10.246199 )
PointCD2AB = (6.318374, -7.117081, 0.691420 )
PointAB1CD = (0.029794, -7.135321, -1.306549 )
PointAB2CD = (0.019807, -2.062110, 0.314614 )
Magntidue of PointCD1AB - P1LineSegmentCD = 11.866340
Magntidue of PointCD2AB - P2LineSegmentCD = 6.609495
Magntidue of PointAB1CD - P1LineSegmentAB = 6.662127
Magntidue of PointAB2CD - P2LineSegmentAB = 9.186399
Magntidue of PointCD1AB - PointAB1CD = 13.318028
Magntidue of PointCD2AB - PointAB2CD = 8.084965
Magntidue of PointCD1AB - PointAB2CD = 10.433375
Magntidue of PointCD2AB - PointAB1CD = 6.598368
Actual Shortest Point are
Point1 = (0.01, 1.59, 1.48 )
Point2 = (-1.23, 1.11, 3.13 )
Magnitude of Point1 And Point2 = 2.1190799890518526
For the Above Data, I used this Below Function
// Diagnostic helper: projects each endpoint of segment AB onto the (unclamped)
// line CD and each endpoint of CD onto the line AB, then prints the four
// projected points (relative to Testing) and eight distances between them and
// the segment endpoints. It does not return a result.
//
// Fix: m4 now measures PointAB2CD against P2LineSegmentAB, as its printed
// label says; it previously used PointAB1CD by mistake.
void NearestPointBetweenTwoLineSegmentOfVariedLength(Vector3D P1LineSegmentAB, Vector3D P2LineSegmentAB, Vector3D P1LineSegmentCD, Vector3D P2LineSegmentCD, Vector3D Testing)
{
    /* float Line1Mag = Magnitude(VectorMinus(P1LineSegmentAB, P2LineSegmentAB));
    float Line2Mag = Magnitude(VectorMinus(P1LineSegmentCD, P2LineSegmentCD));
    P2LineSegmentAB = VectorMinus(P2LineSegmentAB, P1LineSegmentAB);
    P1LineSegmentCD = VectorMinus(P1LineSegmentCD, P1LineSegmentAB);
    P2LineSegmentCD = VectorMinus(P2LineSegmentCD, P1LineSegmentAB);
    P1LineSegmentAB = VectorMinus(P1LineSegmentAB, P1LineSegmentAB);
    Vector3D P1P2UnitDirection = GetUnitVector(P2LineSegmentAB, { 0,0,0 });
    AngleBetweenTwoVectorsWithCommonUnitVectorAngleOfSecondArgument(P1LineSegmentAB, P2LineSegmentAB, P1P2UnitDirection);*/

    // PointXYZ = projection of endpoint X onto the line named by YZ.
    Vector3D PointCD1AB;
    Vector3D PointCD2AB;
    Vector3D PointAB1CD;
    Vector3D PointAB2CD;

    NearestPointOnLineFromPoint(P1LineSegmentCD, P1LineSegmentAB, P2LineSegmentAB, PointCD1AB, false);
    PrintVector3Dfor(VectorMinus(PointCD1AB, Testing), "PointCD1AB", true);
    NearestPointOnLineFromPoint(P2LineSegmentCD, P1LineSegmentAB, P2LineSegmentAB, PointCD2AB, false);
    PrintVector3Dfor(VectorMinus(PointCD2AB, Testing), "PointCD2AB", true);
    NearestPointOnLineFromPoint(P1LineSegmentAB, P1LineSegmentCD, P2LineSegmentCD, PointAB1CD, false);
    PrintVector3Dfor(VectorMinus(PointAB1CD, Testing), "PointAB1CD", true);
    NearestPointOnLineFromPoint(P2LineSegmentAB, P1LineSegmentCD, P2LineSegmentCD, PointAB2CD, false);
    PrintVector3Dfor(VectorMinus(PointAB2CD, Testing), "PointAB2CD", true);

    // Distance from each projected point to the endpoint it was projected from.
    float m1 = Magnitude(VectorMinus(PointCD1AB, P1LineSegmentCD));
    float m2 = Magnitude(VectorMinus(PointCD2AB, P2LineSegmentCD));
    float m3 = Magnitude(VectorMinus(PointAB1CD, P1LineSegmentAB));
    float m4 = Magnitude(VectorMinus(PointAB2CD, P2LineSegmentAB));
    // Distances between projected points on the two lines.
    float m5 = Magnitude(VectorMinus(PointCD1AB, PointAB1CD));
    float m6 = Magnitude(VectorMinus(PointCD2AB, PointAB2CD));
    float m7 = Magnitude(VectorMinus(PointCD1AB, PointAB2CD));
    float m8 = Magnitude(VectorMinus(PointCD2AB, PointAB1CD));

    Printfloatfor(m1, "Magntidue of PointCD1AB - P1LineSegmentCD");
    Printfloatfor(m2, "Magntidue of PointCD2AB - P2LineSegmentCD");
    Printfloatfor(m3, "Magntidue of PointAB1CD - P1LineSegmentAB");
    Printfloatfor(m4, "Magntidue of PointAB2CD - P2LineSegmentAB");
    Printfloatfor(m5, "Magntidue of PointCD1AB - PointAB1CD");
    Printfloatfor(m6, "Magntidue of PointCD2AB - PointAB2CD");
    Printfloatfor(m7, "Magntidue of PointCD1AB - PointAB2CD");
    Printfloatfor(m8, "Magntidue of PointCD2AB - PointAB1CD");
    //NearestPointBetweenTwoLineSegmentOfSameLength1(P1LineSegmentAB, P2LineSegmentAB, P1LineSegmentCD, P2LineSegmentCD);
    //NearestPointBetweenTwoLineSegmentOfSameLength2(P1LineSegmentAB, P2LineSegmentAB, P1LineSegmentCD, P2LineSegmentCD);
    //NearestPointBetweenTwoLineSegmentOfSameLength3(P1LineSegmentAB, P2LineSegmentAB, P1LineSegmentCD, P2LineSegmentCD);
}
// Projects Point onto the line through LineSegmentStart and LineSegmentEnd and
// writes the projected point to ReturnVector.
//
// When ClampTheValue is true the projection is limited to the segment between
// the two endpoints; otherwise it may lie anywhere on the infinite line.
// If the two endpoints coincide there is no direction to project onto, so
// ReturnVector is set to LineSegmentStart (previously this divided by zero).
void NearestPointOnLineFromPoint(Vector3D Point, Vector3D LineSegmentStart, Vector3D LineSegmentEnd, Vector3D& ReturnVector, bool ClampTheValue)
{
    //Get Heading Direction of Capsule from Origin To End
    Vector3D CapsuleHeading = VectorMinus(LineSegmentEnd, LineSegmentStart);
    float MagnitudeOfLineSegment = Magnitude(CapsuleHeading);

    // Degenerate segment: the only point on it is its start. The negated
    // comparison also catches a NaN length.
    if (!(MagnitudeOfLineSegment > 0.0f))
    {
        ReturnVector = LineSegmentStart;
        return;
    }

    // Normalise the direction.
    CapsuleHeading = VectorDivide(CapsuleHeading, MagnitudeOfLineSegment);

    // Signed distance of the projection from the start, along the direction.
    Vector3D Projection = VectorMinus(Point, LineSegmentStart);
    float DotProd = DotProduct(Projection, CapsuleHeading);
    if (ClampTheValue)
    {
        DotProd = Clamp(DotProd, 0.0f, MagnitudeOfLineSegment);
    }
    ReturnVector = VectorAdd(LineSegmentStart, VectorMultiply(CapsuleHeading, DotProd));
}
I have converted this code from C# to C++ and it is not working as intended. I don't know whether the problem is in my conversion or in the code itself.
// Clamps each coordinate of pointToClamp independently to the range spanned by
// the corresponding coordinates of LineStart and LineEnd, i.e. to the
// axis-aligned box that has the two points as opposite corners.
Vector3D ClampPointToLine(Vector3D pointToClamp, Vector3D LineStart, Vector3D LineEnd)
{
    // Clamp `value` to the interval between `a` and `b`, given in either order.
    auto clampAxis = [](double value, double a, double b) -> double
    {
        const bool ascending = (a <= b);
        const double lower = ascending ? a : b;
        const double upper = ascending ? b : a;
        if (value < lower)
        {
            return lower;
        }
        if (value > upper)
        {
            return upper;
        }
        return value;
    };

    Vector3D clampedPoint = {0,0,0};
    clampedPoint.x = clampAxis(pointToClamp.x, LineStart.x, LineEnd.x);
    clampedPoint.y = clampAxis(pointToClamp.y, LineStart.y, LineEnd.y);
    clampedPoint.z = clampAxis(pointToClamp.z, LineStart.z, LineEnd.z);
    return clampedPoint;
}
// Finds the closest points between the line through p1,p2 and the line through
// p3,p4, then clamps each of them into the bounding box of its own segment.
//
// With d1 = p2 - p1 and d2 = p4 - p3 the closest points are
//   i1 = p1 + m * d1   and   i2 = p3 + n * d2,
// where (n, m) solves the 2x3 augmented system built below (the connecting
// vector must be perpendicular to both d1 and d2).
//
// ClosestPointOnLineP1P2 / ClosestPointOnLineP3P4 receive the results. They are
// left untouched when the system has no unique solution (for example when the
// lines are parallel).
void distBetweenLines(Vector3D p1, Vector3D p2, Vector3D p3, Vector3D p4, Vector3D& ClosestPointOnLineP1P2, Vector3D& ClosestPointOnLineP3P4)
{
    const Vector3D d1 = VectorMinus(p2, p1);
    const Vector3D d2 = VectorMinus(p4, p3);

    // Work in double throughout (the original used powf, which truncates to float).
    const double d1x = d1.x, d1y = d1.y, d1z = d1.z;
    const double d2x = d2.x, d2y = d2.y, d2z = d2.z;
    // Offset between the two line origins: p3 - p1.
    const double wx = static_cast<double>(p3.x) - p1.x;
    const double wy = static_cast<double>(p3.y) - p1.y;
    const double wz = static_cast<double>(p3.z) - p1.z;

    const double d1DotD2 = (d1x * d2x) + (d1y * d2y) + (d1z * d2z);

    // Equation 1: (i2 - i1) . d1 = 0
    const double eq1nCoeff = d1DotD2;
    const double eq1mCoeff = -((d1x * d1x) + (d1y * d1y) + (d1z * d1z));
    const double eq1Const = (d1x * wx) + (d1y * wy) + (d1z * wz);
    // Equation 2: (i2 - i1) . d2 = 0. The original subtracted d2.y * p2.y here,
    // which does not match the x and z terms; it has to be p1.y.
    const double eq2nCoeff = (d2x * d2x) + (d2y * d2y) + (d2z * d2z);
    const double eq2mCoeff = -d1DotD2;
    const double eq2Const = (d2x * wx) + (d2y * wy) + (d2z * wz);

    double M[2][3] = { { eq1nCoeff, eq1mCoeff, -eq1Const }, { eq2nCoeff, eq2mCoeff, -eq2Const } };
    const int rowCount = 2;

    // pivoting: make sure every diagonal entry used as a pivot is non-zero
    for (int col = 0; col + 1 < rowCount; col++)
    {
        if (M[col][col] != 0)
            continue;
        // find a row below with a non-zero coefficient in this column
        int swapRow = col + 1;
        while (swapRow < rowCount && M[swapRow][col] == 0)
            swapRow++;
        if (swapRow == rowCount)
        {
            std::cout << "\n the matrix has no unique solution";
            return; // no usable pivot, so the matrix has no unique solution
        }
        // swap the two rows element by element
        for (int i = 0; i < rowCount + 1; i++)
        {
            const double tmp = M[swapRow][i];
            M[swapRow][i] = M[col][i];
            M[col][i] = tmp;
        }
    }

    // elimination
    for (int sourceRow = 0; sourceRow + 1 < rowCount; sourceRow++)
    {
        for (int destRow = sourceRow + 1; destRow < rowCount; destRow++)
        {
            const double df = M[sourceRow][sourceRow];
            const double sf = M[destRow][sourceRow];
            for (int i = 0; i < rowCount + 1; i++)
                M[destRow][i] = M[destRow][i] * df - M[sourceRow][i] * sf;
        }
    }

    // back-insertion
    for (int row = rowCount - 1; row >= 0; row--)
    {
        const double f = M[row][row];
        if (f == 0) return; // singular system (e.g. parallel lines): outputs stay untouched
        for (int i = 0; i < rowCount + 1; i++) M[row][i] /= f;
        for (int destRow = 0; destRow < row; destRow++)
        {
            M[destRow][rowCount] -= M[destRow][row] * M[row][rowCount];
            M[destRow][row] = 0;
        }
    }

    const double n = M[0][2];
    const double m = M[1][2];
    Vector3D i1 = { p1.x + (m * d1.x), p1.y + (m * d1.y), p1.z + (m * d1.z) };
    Vector3D i2 = { p3.x + (n * d2.x), p3.y + (n * d2.y), p3.z + (n * d2.z) };
    ClosestPointOnLineP1P2 = ClampPointToLine(i1, p1, p2);
    ClosestPointOnLineP3P4 = ClampPointToLine(i2, p3, p4);
}

Your problem is to find the shortest connection P1P2 between two line segments AB and CD. Let us define l1 as the line which goes through the points A and B and l2 as the line which goes through C and D.
You can split this problem up into several subtasks:
finding the shortest connection between the lines l1 and l2.
finding the shortest connection from either of the points A, B to segment CD (likewise for C,D to segment AB).
Let's start with the first subtask. The line l1, going through A and B, can be parametrised by a single scalar, say sc,
l1(sc) = u*sc + A
with the direction vector u=(B-A).
As a consequence, we also have l1(0) = A and l1(1) = B. Now, we want to find the minimal distance between this line and another line going through C and D, i.e.
l2(tc) = v*tc + C
with v = D-C. In analogy to the other line, we have l2(0) = C and l2(1) = D. Now, we define
f(sc, tc) = 1/2*|l1(sc)-l2(tc)|^2
which is nothing but half the distance between the two lines squared. If we now want to minimise this function, we need to satisfy
df/dsc = 0 and df/dtc = 0
You'll find that
df/dsc = [u*sc - v*tc + (A-C)]*u and df/dtc = [u*sc - v*tc + (A-C)]*(-v)
Introducing w=A-C and arranging in vectors and matrices yields:
[ u*u -v*u] * [sc] = -[ w*u]
[-u*v v*v] [tc] [-w*v]
m * result = -rhs
The solution of the linear system is result = -m^(-1) * rhs, where m^(-1) is the inverse of m. If sc or tc is less than 0 or greater than 1, the closest point of the lines is outside the segments AB and CD. You might return these values as well.
The second subtask is closely related to this problem, but we minimise
f(sc) = 1/2*|l1(sc)-P|^2 and g(tc) = 1/2*|l2(tc)-P|^2
which directly yields
sc = -(A-P)*u/(u*u) and tc = -(C-P)*v/(v*v)
If sc < 0 we set sc = 0 or if sc > 1 we set sc = 1 (and likewise for tc) in order to get points on the segments.
Here is the implementation, which I took from here and modified it.
First, we define some helpers, i.e. vectors and some basic mathematical relations.
// Minimal fixed-size vector of doubles. It stays an aggregate so it can be
// written as Vector3D{x, y, z}.
template <int dim>
struct Vector
{
  std::array<double, dim> components;
};
using Vector2D = Vector<2>;
using Vector3D = Vector<3>;

// Component-wise difference u - v.
template <int dim>
Vector<dim> operator-(const Vector<dim> &u, const Vector<dim> &v) {
  Vector<dim> diff;
  for (int k = 0; k != dim; ++k)
    diff.components[k] = u.components[k] - v.components[k];
  return diff;
}

// Component-wise sum u + v.
template <int dim>
Vector<dim> operator+(const Vector<dim> &u, const Vector<dim> &v) {
  Vector<dim> sum;
  for (int k = 0; k != dim; ++k)
    sum.components[k] = u.components[k] + v.components[k];
  return sum;
}

// Unary minus: flips the sign of every component.
template <int dim>
Vector<dim> operator-(const Vector<dim> &u) {
  Vector<dim> flipped(u);
  for (double &component : flipped.components)
    component = -component;
  return flipped;
}

// Scalar (dot) product, accumulated from the first component to the last.
template <int dim>
double operator*(const Vector<dim> &u, const Vector<dim> &v) {
  double dot = 0;
  for (int k = 0; k != dim; ++k)
    dot += u.components[k] * v.components[k];
  return dot;
}

// Vector times scalar.
template <int dim>
Vector<dim> operator*(const Vector<dim> &u, const double s) {
  Vector<dim> scaled;
  for (int k = 0; k != dim; ++k)
    scaled.components[k] = u.components[k] * s;
  return scaled;
}

// Scalar times vector; same result as the overload above.
template <int dim>
Vector<dim> operator*(const double s, const Vector<dim> &u) {
  Vector<dim> scaled;
  for (int k = 0; k != dim; ++k)
    scaled.components[k] = u.components[k] * s;
  return scaled;
}

// Streams the vector as "(" followed by each component in a 15 character wide
// field and a closing ")".
template <int dim>
std::ostream& operator<< (std::ostream& out, const Vector<dim> &u) {
  out << "(";
  for (int k = 0; k != dim; ++k)
    out << std::setw(15) << u.components[k] ;
  out << ")";
  return out;
}
This function does the actual work:
// Returns the pair of closest points (P1 on segment AB, P2 on segment CD).
//
// The points are parametrised as P1 = A + sc*(B-A) and P2 = C + tc*(D-C) with
// sc, tc in [0, 1]. Both parameters are kept as numerator/denominator pairs
// (sN/sD and tN/tD) so that the division happens only once, at the end.
//
// Degenerate inputs:
//  * CD collapsed to a point: P2 = C and P1 is the projection of C onto AB,
//    clamped to the segment. Without this case the parallel branch would pin
//    sc to 0 and always answer with A.
//  * (almost) parallel lines: sc starts at 0 and is re-evaluated by the
//    edge handling below whenever tc has to be clamped.
std::pair<Vector3D, Vector3D>
shortest_connection_segment_to_segment(const Vector3D A, const Vector3D B,
                                       const Vector3D C, const Vector3D D)
{
  const Vector3D u = B - A;
  const Vector3D v = D - C;
  const Vector3D w = A - C;
  const double a = u*u; // always >= 0
  const double b = u*v;
  const double c = v*v; // always >= 0
  const double d = u*w;
  const double e = v*w;
  const double det = a*c - b*b; // always >= 0 up to rounding
  const double tol = 1e-15;
  double sc, sN, sD = det; // sc = sN / sD
  double tc, tN, tD = det; // tc = tN / tD

  // compute the line parameters of the two closest points
  if (c < tol) { // CD is (numerically) the single point C
    tN = 0.0;
    tD = 1.0;
    if (a < tol) { // AB is a single point as well
      sN = 0.0;
      sD = 1.0;
    }
    else { // project C onto AB and clamp to the segment
      sD = a;
      sN = -d;
      if (sN < 0.0)
        sN = 0.0;
      else if (sN > a)
        sN = a;
    }
  }
  else if (det < tol) { // the lines are almost parallel
    sN = 0.0; // force using point A on segment AB
    sD = 1.0; // to prevent possible division by 0.0 later
    tN = e;
    tD = c;
  }
  else { // get the closest points on the infinite lines
    sN = (b*e - c*d);
    tN = (a*e - b*d);
    if (sN < 0.0) { // sc < 0 => the s=0 edge is visible
      sN = 0.0; // compute shortest connection of A to segment CD
      tN = e;
      tD = c;
    }
    else if (sN > sD) { // sc > 1 => the s=1 edge is visible
      sN = sD; // compute shortest connection of B to segment CD
      tN = e + b;
      tD = c;
    }
  }

  if (tN < 0.0) { // tc < 0 => the t=0 edge is visible
    tN = 0.0;
    // recompute sc for this edge: shortest connection of C to segment AB
    if (-d < 0.0)
      sN = 0.0;
    else if (-d > a)
      sN = sD;
    else {
      sN = -d;
      sD = a;
    }
  }
  else if (tN > tD) { // tc > 1 => the t=1 edge is visible
    tN = tD;
    // recompute sc for this edge: shortest connection of D to segment AB
    if ((-d + b) < 0.0)
      sN = 0;
    else if ((-d + b) > a)
      sN = sD;
    else {
      sN = (-d + b);
      sD = a;
    }
  }

  // finally do the division to get sc and tc
  sc = (fabs(sN) < tol ? 0.0 : sN / sD);
  tc = (fabs(tN) < tol ? 0.0 : tN / tD);
  const Vector3D P1 = A + (sc * u);
  const Vector3D P2 = C + (tc * v);
  return {P1, P2}; // the two closest points, not their distance
}
Usage:
int main() {
Vector3D A = {-7.54, 6.55, 0 };
Vector3D B = {4.54, -3.87, 6.0 };
Vector3D C = {0.0, 8.0, 3.53 };
Vector3D D = {0.03, -7.24, -1.34 };
auto [P1, P2] = shortest_connection_segment_to_segment (A, B, C, D);
std::cout << "P1 = " << P1 << std::endl;
std::cout << "P2 = " << P2 << std::endl;
return 0;
}
This prints
P1 = ( -1.24635 1.1212 3.12599)
P2 = ( 0.0125125 1.64365 1.49881)
live demo.
Note that this code still requires more testing.

Below is a "compact" version of the code from @StefanKssmr's answer above. This compact version can easily be ported to OpenCL.
Many thanks to @StefanKssmr for posting the correct answer.
// Branch-free variant of the segment/segment closest point computation.
//
// AB1, AB2 - end points of the first segment.
// CD1, CD2 - end points of the second segment.
// resultSegmentPoint1 - receives the closest point on AB1-AB2.
// resultSegmentPoint2 - receives the closest point on CD1-CD2.
//
// Every "if" of the branching algorithm is replaced by a selector that is
// either 0.0f or 1.0f; a value is then updated as
//   value = selector * newValue + (1 - selector) * value.
// sN/sD and tN/tD are the numerators/denominators of the two segment
// parameters; the divisions happen only in the last two statements.
//
// NOTE(review): a second segment of zero length (CD1 == CD2) gets no special
// treatment here; in that case sN stays 0 and AB1 is returned as first point.
void NearestPointBetweenTwoLineSegment(Vector3D AB1, Vector3D AB2, Vector3D CD1, Vector3D CD2, Vector3D& resultSegmentPoint1, Vector3D& resultSegmentPoint2)
{
    Vector3D u = VectorMinus(AB2, AB1);
    Vector3D v = VectorMinus(CD2, CD1);
    Vector3D w = VectorMinus(AB1, CD1);
    double a = DotProduct(u, u); // always >= 0
    double b = DotProduct(u, v);
    double c = DotProduct(v, v); // always >= 0
    double d = DotProduct(u, w);
    double e = DotProduct(v, w);
    double sN, sD = (a * c) - (b * b); // sc = sN / sD, default sD = D >= 0
    double tN, tD = (a * c) - (b * b); // tc = tN / tD, default tD = D >= 0
    float Temp1;
    float Temp2;
    float Temp3; // third selector, needed for the three-way choices in part 2

    // Part 1: closest points on the infinite lines, then the s = 0 / s = 1 edges.
    Temp1 = (sD < 1e-6f) ? 1.0f : 0.0f; // 1 when the lines are almost parallel
    sN = (1.0f - Temp1) * (b * e - c * d);
    sD = ((1.0f - Temp1) * sD) + Temp1;
    tN = (Temp1 * e) + ((1.0f - Temp1) * ((a * e) - (b * d)));
    tD = (Temp1 * c) + ((1.0f - Temp1) * tD);
    Temp2 = (sN < 0.0f) ? 1.0f : 0.0f; // sc < 0 => the s = 0 edge is visible
    Temp2 = Temp2 * (1.0f - Temp1);
    sN = ((1.0f - Temp2) * sN);
    tN = ((1.0f - Temp2) * tN) + (Temp2 * e);
    tD = ((1.0f - Temp2) * tD) + (Temp2 * c);
    Temp2 = ((sN > sD) ? 1.0f : 0.0f) * (1.0f - Temp2); // sc > 1 => the s = 1 edge is visible
    Temp2 = Temp2 * (1.0f - Temp1);
    sN = ((1.0f - Temp2) * sN) + (Temp2 * sD);
    tN = ((1.0f - Temp2) * tN) + (Temp2 * (e + b));
    tD = ((1.0f - Temp2) * tD) + (Temp2 * c);

    // Part 2.1: tc < 0 => the t = 0 edge is visible, recompute sc for point CD1.
    Temp1 = (tN < 0.0f) ? 1.0f : 0.0f;
    tN = tN * (1.0f - Temp1);
    Temp2 = (((-d) < 0.0) ? 1.0f : 0.0f) * Temp1;
    sN = (1.0f - Temp2) * sN;
    Temp3 = ((((-d) > a) ? 1.0f : 0.0f) * (1.0f - Temp2)) * (Temp1);
    sN = (Temp3 * sD) + ((1.0f - Temp3) * (sN));
    Temp2 = (1.0f - Temp3) * (1.0f - Temp2) * (Temp1);
    sN = (Temp2 * (-d)) + ((1.0f - Temp2) * (sN));
    sD = (Temp2 * a) + ((1.0f - Temp2) * (sD));

    // Part 2.2: tc > 1 => the t = 1 edge is visible, recompute sc for point CD2.
    Temp1 = ((tN > tD) ? 1.0f : 0.0f) * (1.0f - Temp1);
    tN = ((1.0f - Temp1) * tN) + (Temp1 * tD);
    Temp2 = (((-d + b) < 0.0) ? 1.0f : 0.0f) * Temp1;
    sN = (1.0f - Temp2) * sN;
    Temp3 = ((((-d + b) > a) ? 1.0f : 0.0f) * (1.0f - Temp2)) * (Temp1);
    sN = (Temp3 * sD) + ((1.0f - Temp3) * (sN));
    Temp2 = (1.0f - Temp3) * (1.0f - Temp2) * (Temp1);
    // The numerator for this edge is (-d + b), the same quantity the two
    // comparisons above test; plain -d belongs to the t = 0 edge only.
    sN = (Temp2 * (-d + b)) + ((1.0f - Temp2) * (sN));
    sD = (Temp2 * a) + ((1.0f - Temp2) * (sD));

    resultSegmentPoint1 = VectorAdd(AB1, VectorMultiply(u, (fabs(sN) < 1e-6f ? 0.0 : sN / sD)));
    resultSegmentPoint2 = VectorAdd(CD1, VectorMultiply(v, (fabs(tN) < 1e-6f ? 0.0 : tN / tD)));
}

Spline Catmull-Rom for image zooming using C++ and opencv

I'm trying to implement spline Catmull-Rom for image zooming using C++ and OpenCV.
I performed two tests, the first is image zooming (X2), and the second image reconstruction (zooming image decimated).
My problem is that some white and black pixels appear in the interpolated image (image1). When I displayed the pixel values, I found that the white pixels have a negative value and the black ones have a value greater than 255. The reconstructed image also appears blurred (image2 and image3).
// Evaluates the Catmull-Rom cubic through p0..p3 at parameter t.
// t = 0 gives p1 and t = 1 gives p2; p0 and p3 only shape the tangents.
// The result can overshoot the range spanned by the four inputs.
float CalCurveInt(float t, float p0, float p1, float p2, float p3)
{
    const float tSquared = t * t;
    const float tCubed = tSquared * t;
    return 0.5f * ((2.0f * p1) +
                   (-p0 + p2) * t +
                   (2.0f * p0 - 5.0f * p1 + 4 * p2 - p3) * tSquared +
                   (-p0 + 3.0f * p1 - 3.0f * p2 + p3) * tCubed);
}
// Enlarges an 8-bit single-channel image by the integer factor `zoom` using
// Catmull-Rom interpolation, first along the rows and then along the columns.
//
// src  - source image, read through at<uchar>().
// zoom - magnification factor; every source pixel yields `zoom` output pixels
//        per pass, sampled at t = 1/zoom, 2/zoom, ..., 1 between the pixel and
//        its successor.
// Returns a new CV_8UC1 image of size (zoom * cols) x (zoom * rows).
//
// The spline overshoots near strong edges, so an interpolated value can fall
// below 0 or above 255. It is stored through saturate_cast<uchar>; a plain
// conversion to uchar is what caused the isolated white and black pixels.
Mat CalcCatmull(Mat &src, int zoom)
{
    //------------------------------------------------------------
    // pass 1: stretch every row horizontally
    const int srcCols = src.cols;
    const int srcRows = src.rows;
    Mat dst(Size(zoom * srcCols, srcRows), CV_8UC1);
    for (int j = 0; j < srcRows; j++)
    {
        int Ptr = 0; // next output column
        for (int i = 0; i < srcCols; i++)
        {
            // the four control columns, repeated at the image border
            const int v1 = (i - 1 < 0) ? 0 : i - 1;
            const int v2 = i;
            const int v3 = (srcCols <= i + 1) ? srcCols - 1 : i + 1;
            const int v4 = (srcCols <= i + 2) ? srcCols - 1 : i + 2;
            // the control values do not depend on the sub-step, read them once
            const float c1 = src.at<uchar>(j, v1);
            const float c2 = src.at<uchar>(j, v2);
            const float c3 = src.at<uchar>(j, v3);
            const float c4 = src.at<uchar>(j, v4);
            for (int step = 1; step <= zoom; step++)
            {
                const float t = static_cast<float>(static_cast<double>(step) / zoom);
                dst.at<uchar>(j, Ptr) = saturate_cast<uchar>(CalCurveInt(t, c1, c2, c3, c4));
                Ptr++;
            }
        }
    }
    //------------------------------------------------
    // pass 2: stretch every column of the intermediate image vertically
    const int midCols = dst.cols;
    const int midRows = dst.rows;
    Mat dest(Size(midCols, zoom * midRows), CV_8UC1);
    for (int i = 0; i < midCols; i++)
    {
        int Ptr = 0; // next output row
        for (int j = 0; j < midRows; j++)
        {
            const int v1 = (j - 1 < 0) ? 0 : j - 1;
            const int v2 = j;
            const int v3 = (midRows <= j + 1) ? midRows - 1 : j + 1;
            const int v4 = (midRows <= j + 2) ? midRows - 1 : j + 2;
            const float c1 = dst.at<uchar>(v1, i);
            const float c2 = dst.at<uchar>(v2, i);
            const float c3 = dst.at<uchar>(v3, i);
            const float c4 = dst.at<uchar>(v4, i);
            for (int step = 1; step <= zoom; step++)
            {
                const float t = static_cast<float>(static_cast<double>(step) / zoom);
                dest.at<uchar>(Ptr, i) = saturate_cast<uchar>(CalCurveInt(t, c1, c2, c3, c4));
                Ptr++;
            }
        }
    }
    return dest;
}
// Decimation / magnification factor used by the demo below.
float zoom = 2.0;

// Demo: shrinks "fruits.png" with pyrDown, enlarges it again with CalcCatmull,
// shows the images and writes the reconstruction to "Image dest.png".
// Returns -1 when the input image cannot be read.
int main()
{
    Mat src = imread("fruits.png", CV_LOAD_IMAGE_GRAYSCALE);
    if (src.empty())
        return -1; // imread yields an empty Mat on failure; nothing to process
    int width = src.cols;
    int hight = src.rows;
    /*Image Decimation*/
    Size srcdSize(int(width / zoom), int(hight / zoom));
    Mat srcd;
    pyrDown(src, srcd, srcdSize);
    imshow("decimation", srcd);
    // CalcCatmull takes an int factor; the float global is converted here.
    Mat dst = CalcCatmull(srcd, zoom);
    imshow("Image Source", src);
    imshow("Image dest", dst);
    imwrite("Image dest.png", dst);
    waitKey(0);
    return 0;
}
Thanks in advance.

My old implementation, seems it worked fine.
#include <iostream>
#include <vector>
#include <stdio.h>
#include <stdarg.h>
#include "opencv2/opencv.hpp"
#include "fstream"
#include "iostream"
using namespace std;
using namespace cv;
//-----------------------------------------------------------------------------------------------------
// Catmull-Rom interpolation between p1 and p2. The outer points p0 and p3 are only needed
// for the tangents at the ends. Parameter t runs from 0 to 1 (0 - we are at p1, 1 - we are at p2).
//-----------------------------------------------------------------------------------------------------
// One coordinate of the curve; tt and ttt are t squared and t cubed.
static float CatmullRomCoordinate(float t, float tt, float ttt, float a, float b, float c, float d)
{
    return 0.5f * ( ( 2.0f * b ) + ( -a + c ) * t +
                    ( 2.0f * a - 5.0f * b + 4 * c - d ) * tt +
                    ( -a + 3.0f * b - 3.0f * c + d ) * ttt );
}

void PointOnCurve(Point2f &out, float t, Point2f p0, Point2f p1, Point2f p2, Point2f p3)
{
    const float tt = t * t;
    const float ttt = tt * t;
    out.x = CatmullRomCoordinate(t, tt, ttt, p0.x, p1.x, p2.x, p3.x);
    out.y = CatmullRomCoordinate(t, tt, ttt, p0.y, p1.y, p2.y, p3.y);
}
//-----------------------------------------------------------------------------------------------------
// interpolation of 4х4 patch
//
// S * S * S * S
// * * * * * * *
// S * S * S * S
// * * * * * * *
// S * S * S * S
// * * * * * * *
// S * S * S * S
//
// S- pixels of source imgage
//
// sequentially take 2 middle columns and computte D.
//
// S * 1 * 2 * S
// * * * * * * *
// S * 1 * 2 * S
// * * D * D * *
// S * 1 * 2 * S
// * * * * * * *
// S * 1 * 2 * S
//
// same for rows and we will have F
//
// S * S * S * S
// * * * * * * *
// 3 * 3 F 3 * 3
// * * D * D * *
// 4 * 4 F 4 * 4
// * * * * * * *
// S * S * S * S
//
// then compute diagonals and after averafing with neihbours will find С
//
// 1 * S * S * 2
// * * * * * * *
// S * 1 F 2 * S
// * * D C D * *
// S * 2 F 1 * S
// * * * * * * *
// 2 * S * S * 1
//-----------------------------------------------------------------------------------------------------
void PointOnSurface(Mat& src,Mat& dst)
{
float t=0.5;
Point2f out;
dst=Mat(3,3,CV_32FC1);
// Угловые точки результата совпадают с точками центральной ячейки исходного патча
dst.at<float>(0,0)=src.at<float>(1,1);
dst.at<float>(2,0)=src.at<float>(2,1);
dst.at<float>(0,2)=src.at<float>(1,2);
dst.at<float>(2,2)=src.at<float>(2,2);
Point2f p0;
Point2f p1;
Point2f p2;
Point2f p3;
p0.x=0;p0.y=src.at<float>(0,1);
p1.x=1;p1.y=src.at<float>(1,1);
p2.x=2;p2.y=src.at<float>(2,1);
p3.x=3;p3.y=src.at<float>(3,1);
PointOnCurve(out,t,p0,p1,p2,p3);
dst.at<float>(1,0)=out.y;
p0.x=0;p0.y=src.at<float>(0,2);
p1.x=1;p1.y=src.at<float>(1,2);
p2.x=2;p2.y=src.at<float>(2,2);
p3.x=3;p3.y=src.at<float>(3,2);
PointOnCurve(out,t,p0,p1,p2,p3);
dst.at<float>(1,2)=out.y;
p0.x=0;p0.y=src.at<float>(1,0);
p1.x=1;p1.y=src.at<float>(1,1);
p2.x=2;p2.y=src.at<float>(1,2);
p3.x=3;p3.y=src.at<float>(1,3);
PointOnCurve(out,t,p0,p1,p2,p3);
dst.at<float>(0,1)=out.y;
p0.x=0;p0.y=src.at<float>(2,0);
p1.x=1;p1.y=src.at<float>(2,1);
p2.x=2;p2.y=src.at<float>(2,2);
p3.x=3;p3.y=src.at<float>(2,3);
PointOnCurve(out,t,p0,p1,p2,p3);
dst.at<float>(2,1)=out.y;
// diagonals
// 1
p0.x=0;p0.y=src.at<float>(0,0);
p1.x=1;p1.y=src.at<float>(1,1);
p2.x=2;p2.y=src.at<float>(2,2);
p3.x=3;p3.y=src.at<float>(3,3);
PointOnCurve(out,t,p0,p1,p2,p3);
float d1=out.y;
// 2
p0.x=0;p0.y=src.at<float>(3,0);
p1.x=1;p1.y=src.at<float>(2,1);
p2.x=2;p2.y=src.at<float>(1,2);
p3.x=3;p3.y=src.at<float>(0,3);
PointOnCurve(out,t,p0,p1,p2,p3);
float d2=out.y;
// averaging
dst.at<float>(1,1)=1.0/6.0*(d1+d2+dst.at<float>(0,1)+dst.at<float>(1,0)+dst.at<float>(1,2)+dst.at<float>(2,1));
}
//-----------------------------------------------------------------------------------------------------
// Doubles the size of an image with Catmull-Rom patches.
//
// src_img - source image; it is converted to CV_32FC1 scaled by 1/255.
//           NOTE(review): PointOnSurface reads the patch with at<float>(), so a
//           single-channel input is assumed - confirm against the callers.
// dstImg  - receives the enlarged float image, (2*cols - 2) x (2*rows - 2).
//
// Every 4x4 window of the border-padded source produces a 3x3 block that is
// written at offset (2*j + 1, 2*i + 1). Row 0 and column 0 of the destination
// are therefore never written; the buffer is zero-initialised so that they
// hold 0 rather than uninitialised memory.
//-----------------------------------------------------------------------------------------------------
void Scale2Times(Mat& src_img,Mat &dstImg)
{
    if (src_img.empty())
    {
        // The final crop below would ask for a negative-sized ROI.
        dstImg.release();
        return;
    }
    Mat imgf, img;
    Mat dst;
    Mat src;
    src_img.convertTo(imgf, CV_32FC1, 1.0/255.0);
    cv::copyMakeBorder(imgf, img, 1, 1, 1, 1, cv::BORDER_REFLECT);
    dstImg = Mat::zeros(src_img.rows*2, src_img.cols*2, CV_32FC1);
    for (int i = 0; i < img.rows-4; i++)
    {
        for (int j = 0; j < img.cols-4; j++)
        {
            img(Rect(j, i, 4, 4)).copyTo(src);
            PointOnSurface(src, dst);
            dst.copyTo(dstImg(Rect(2*j+1, 2*i+1, 3, 3)));
        }
    }
    // drop the two trailing rows/columns that no patch reaches
    dstImg = dstImg(Rect(0, 0, dstImg.cols-2, dstImg.rows-2)).clone();
}
//-----------------------------------------------------------------------------------------------------
// Demo: shows the source image, the Catmull-Rom enlargement produced by
// Scale2Times and, for comparison, cv::resize with bicubic interpolation.
// Returns -1 when the input image cannot be read.
//-----------------------------------------------------------------------------------------------------
int main( int argc, char** argv )
{
    namedWindow("Src");
    namedWindow("cvResize");
    namedWindow("Catmul-Rom");
    Mat Img=imread("C:\\ImagesForTest\\1.tiff",0);
    if (Img.empty())
        return -1; // imread yields an empty Mat on failure
    imshow("Src",Img);
    Mat dstImg;
    Scale2Times(Img,dstImg);
    imshow("Catmul-Rom",dstImg);
    Mat ImgLin(Img.rows*2,Img.cols*2,CV_8UC1);
    // resize(src, dst, dsize, fx, fy, interpolation): the flag is the sixth
    // argument. Passed fourth, INTER_CUBIC was taken as fx and the
    // interpolation silently stayed at the INTER_LINEAR default.
    cv::resize(Img,ImgLin,Size(Img.cols*2,Img.rows*2),0,0,INTER_CUBIC);
    imshow("cvResize",ImgLin);
    waitKey(0);
    //getchar();
    return 0;
}

What's wrong in this SSE2 transposition?

I'm trying to convert this code:
// Scalar reference: advances the running phase by one clamped increment per sample.
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
// The increment is clamped to [0, PI] before it is added, so every sample's
// phase depends on the phase of the sample before it.
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
// Carry the final phase over to the next call.
mPhase = phase;
I want to rewrite it with SSE2 intrinsics to speed up the whole block (which is called often). I'm using MSVC with the fast optimization flag, but its auto-vectorization is very poor. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
// SSE2 attempt: handles two samples per iteration; the two lanes of v_phase are
// meant to hold the phase of sample i (low lane) and sample i+1 (high lane).
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
// "current" terms for samples {0, 1}
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
// "previous" terms: low lane 0, high lane sample 0
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
// Only the "current" terms are clamped; the "previous" terms are added
// afterwards without clamping.
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
// Each lane accumulates independently: the low lane never receives the
// increment that was added to the high lane in the same step.
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
// Prefetch the terms for the next iteration. pB/pC are advanced only by the
// loop header, so "+ 2" addresses the next pair; on the last iteration this
// reads elements blockSize and blockSize + 1.
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
// NOTE(review): _mm_load_pd requires a 16-byte aligned address; pB + 1 is
// 8 bytes away from pB, so it cannot be aligned whenever pB is.
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
// Picks the high lane for an even blockSize, the low lane otherwise.
// m128d_f64 is the MSVC way of reading the lanes of a __m128d.
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;

// Phase accumulator with a scalar reference implementation (Process) and an
// SSE2 implementation (ProcessOptimized) that produce the same phase sequence.
// Both print every intermediate phase to std::cout for comparison.
class Param
{
public:
    alignas(16) double mPhase = 0.0;           // running phase of Process
    alignas(16) double mPhaseOptimized = 0.0;  // running phase of ProcessOptimized
    alignas(16) double mNoteFrequency = 10.0;
    alignas(16) double mHostPitch = 1.0;
    alignas(16) double mRadiansPerSample = 1.0;
    alignas(16) double b[voiceSize][bufferSize];
    alignas(16) double c[voiceSize][bufferSize];

    Param() { }

    // Scalar reference: phase += clamp(rps * (bp0 * b[i] + c[i]), 0, PI) for
    // the first blockSize samples of the given voice.
    inline void Process(int voiceIndex, int blockSize) {
        double *pB = b[voiceIndex];
        double *pC = c[voiceIndex];
        double phase = mPhase;
        double bp0 = mNoteFrequency * mHostPitch;
        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
            // some other code (that will use phase, like sin(phase))
            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
            std::cout << sampleIndex << ": " << phase << std::endl;
        }
        mPhase = phase;
    }

    // SSE2 version of Process, two samples per iteration.
    //
    // The phase is a running (prefix) sum, so the two lanes cannot accumulate
    // independently. Per step, with clamped increments {inc0, inc1} and both
    // lanes of v_base holding the phase reached so far:
    //   v_phase = v_base + {inc0, inc0} + {0, inc1}
    //           = {phase + inc0, (phase + inc0) + inc1}
    // which applies the additions in the same order as the scalar loop. The high
    // lane is then broadcast as the base of the next step.
    //
    // Only indices below blockSize are read; an odd trailing sample is handled
    // with scalar code. Unaligned loads are used, so nothing is assumed about
    // the alignment of the rows.
    inline void ProcessOptimized(int voiceIndex, int blockSize) {
        const double *pB = b[voiceIndex];
        const double *pC = c[voiceIndex];
        const double bp0 = mNoteFrequency * mHostPitch;
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_boundUpper = _mm_set1_pd(PI);
        const __m128d v_bp0 = _mm_set1_pd(bp0);
        const __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
        __m128d v_base = _mm_set1_pd(mPhaseOptimized);

        int sampleIndex = 0;
        for (; sampleIndex + 1 < blockSize; sampleIndex += 2) {
            // increments of the two samples, evaluated like the scalar formula
            __m128d v_inc = _mm_mul_pd(v_bp0, _mm_loadu_pd(pB + sampleIndex));
            v_inc = _mm_add_pd(v_inc, _mm_loadu_pd(pC + sampleIndex));
            v_inc = _mm_mul_pd(v_radiansPerSample, v_inc);
            v_inc = _mm_min_pd(_mm_max_pd(v_inc, v_zero), v_boundUpper);

            // {phase + inc0, phase + inc0}
            __m128d v_phase = _mm_add_pd(v_base, _mm_unpacklo_pd(v_inc, v_inc));
            // add {0, inc1}: the high lane becomes (phase + inc0) + inc1
            v_phase = _mm_add_pd(v_phase, _mm_unpackhi_pd(v_zero, v_inc));
            // some other code (that will use v_phase, like sin(phase))

            double lanes[2];
            _mm_storeu_pd(lanes, v_phase);
            std::cout << sampleIndex << ": " << lanes[0] << std::endl;
            std::cout << sampleIndex + 1 << ": " << lanes[1] << std::endl;

            // the phase of the later sample seeds both lanes of the next step
            v_base = _mm_unpackhi_pd(v_phase, v_phase);
        }

        double phase = _mm_cvtsd_f64(v_base);
        if (sampleIndex < blockSize) {
            // odd blockSize: one sample is left over
            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
            std::cout << sampleIndex << ": " << phase << std::endl;
        }
        mPhaseOptimized = phase;
    }
};
// Owns one Param and forwards block processing to it for every voice.
class MyPlugin
{
public:
    Param mParam1;

    // Fills the b buffer of every voice with a ramp from 0 to 1 and the
    // c buffer with zeros.
    MyPlugin() {
        for (int voice = 0; voice < voiceSize; voice++) {
            for (int sample = 0; sample < bufferSize; sample++) {
                mParam1.b[voice][sample] = (sample / ((double)bufferSize - 1));
                mParam1.c[voice][sample] = 0.0;
            }
        }
    }
    ~MyPlugin() { }

    // Runs the scalar implementation for every voice.
    void Process(int blockSize) {
        for (int voice = 0; voice < voiceSize; voice++)
            mParam1.Process(voice, blockSize);
    }

    // Runs the SSE2 implementation for every voice.
    void ProcessOptimized(int blockSize) {
        for (int voice = 0; voice < voiceSize; voice++)
            mParam1.ProcessOptimized(voice, blockSize);
    }
};
// Processes a single block with both implementations so that the two printed
// phase sequences can be compared.
int main() {
    MyPlugin myPlugin;
    const long long numProcessing = 1; // one block only, just for analysis
    for (long long pass = 0; pass < numProcessing; ++pass) {
        // variable blockSize (i.e. it can vary, being even or odd)
        const int blockSize = 256;
        // process data
        myPlugin.Process(blockSize);
        std::cout << "#########" << std::endl;
        myPlugin.ProcessOptimized(blockSize);
    }
}

(update: this answer was written before the edits that show v_phase being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase at each step. Yeah, there was a // some other code (that will use phase) comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase) to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod on the result to range-reduce into the [0..2Pi) range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1] before multiplying could result in different clamping.
But you've apparently removed the clamping in your simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.

How to store all the pixels within a RotatedRect to another matrix?

I am going to process a region of pixels defined by RotatedRect in OpenCV. Although I know the rectangle center, size, and angle, I am not sure how to store all the x and y of this region to another matrix. I have checked some other posts, some suggest to rotate the image, but this will crop part of the image. Can you please help me out?

Try this (not sure I understand the problem perfectly):
#include "opencv2/opencv.hpp"
#include <vector>
using namespace std;
using namespace cv;
//----------------------------------------------------------
// Samples a win_size window out of an 8-bit image through a 2x3 affine
// matrix, using bilinear interpolation, and writes the samples as floats.
//
// src, src_step, src_size - source pixels, row stride in bytes, image size.
// dst, dst_step, win_size - destination pixels, row stride in bytes, window size.
// matrix - six coefficients A11 A12 A13 A21 A22 A23; destination pixel (x, y)
//          is read from source position (A11*x + A12*y + A13, A21*x + A22*y + A23).
// cn     - number of interleaved channels.
//----------------------------------------------------------
void getQuadrangleSubPix_8u32f_CnR( const uchar* src, size_t src_step, Size src_size,
float* dst, size_t dst_step, Size win_size,
const double *matrix, int cn )
{
int x, y, k;
double A11 = matrix[0], A12 = matrix[1], A13 = matrix[2];
double A21 = matrix[3], A22 = matrix[4], A23 = matrix[5];
// convert both strides from bytes to elements
src_step /= sizeof(src[0]);
dst_step /= sizeof(dst[0]);
for( y = 0; y < win_size.height; y++, dst += dst_step )
{
// source position of the first (xs, ys) and of the last (xe, ye) pixel of this row
double xs = A12*y + A13;
double ys = A22*y + A23;
double xe = A11*(win_size.width-1) + A12*y + A13;
double ye = A21*(win_size.width-1) + A22*y + A23;
// Fast path: both row ends lie far enough inside the image, so the inner loop
// needs no per-pixel bounds checks. The unsigned casts turn negative values
// into large ones, so a single comparison covers both bounds.
if( (unsigned)(cvFloor(xs)-1) < (unsigned)(src_size.width - 3) &&
(unsigned)(cvFloor(ys)-1) < (unsigned)(src_size.height - 3) &&
(unsigned)(cvFloor(xe)-1) < (unsigned)(src_size.width - 3) &&
(unsigned)(cvFloor(ye)-1) < (unsigned)(src_size.height - 3))
{
for( x = 0; x < win_size.width; x++ )
{
int ixs = cvFloor( xs );
int iys = cvFloor( ys );
const uchar *ptr = src + src_step*iys;
// a, b: fractional offsets; w00..w11: weights of the four neighbouring pixels
float a = (float)(xs - ixs), b = (float)(ys - iys), a1 = 1.f - a, b1 = 1.f - b;
float w00 = a1*b1, w01 = a*b1, w10 = a1*b, w11 = a*b;
// step to the source position of the next destination pixel
xs += A11;
ys += A21;
if( cn == 1 )
{
ptr += ixs;
dst[x] = ptr[0]*w00 + ptr[1]*w01 + ptr[src_step]*w10 + ptr[src_step+1]*w11;
}
else if( cn == 3 )
{
ptr += ixs*3;
float t0 = ptr[0]*w00 + ptr[3]*w01 + ptr[src_step]*w10 + ptr[src_step+3]*w11;
float t1 = ptr[1]*w00 + ptr[4]*w01 + ptr[src_step+1]*w10 + ptr[src_step+4]*w11;
float t2 = ptr[2]*w00 + ptr[5]*w01 + ptr[src_step+2]*w10 + ptr[src_step+5]*w11;
dst[x*3] = t0;
dst[x*3+1] = t1;
dst[x*3+2] = t2;
}
else
{
// any other channel count
ptr += ixs*cn;
for( k = 0; k < cn; k++ )
dst[x*cn+k] = ptr[k]*w00 + ptr[k+cn]*w01 +
ptr[src_step+k]*w10 + ptr[src_step+k+cn]*w11;
}
}
}
else
{
// Slow path: the row may leave the image, so every sample is checked and
// out-of-range coordinates are clamped to the nearest border row/column.
for( x = 0; x < win_size.width; x++ )
{
int ixs = cvFloor( xs ), iys = cvFloor( ys );
float a = (float)(xs - ixs), b = (float)(ys - iys), a1 = 1.f - a, b1 = 1.f - b;
float w00 = a1*b1, w01 = a*b1, w10 = a1*b, w11 = a*b;
const uchar *ptr0, *ptr1;
xs += A11; ys += A21;
// ptr0/ptr1: the two source rows used for interpolation; both point to the
// same border row when iys is outside [0, height-2]
if( (unsigned)iys < (unsigned)(src_size.height-1) )
ptr0 = src + src_step*iys, ptr1 = ptr0 + src_step;
else
ptr0 = ptr1 = src + (iys < 0 ? 0 : src_size.height-1)*src_step;
if( (unsigned)ixs < (unsigned)(src_size.width-1) )
{
ptr0 += ixs*cn; ptr1 += ixs*cn;
for( k = 0; k < cn; k++ )
dst[x*cn + k] = ptr0[k]*w00 + ptr0[k+cn]*w01 + ptr1[k]*w10 + ptr1[k+cn]*w11;
}
else
{
// column outside the image: use the border column and blend vertically only
ixs = ixs < 0 ? 0 : src_size.width - 1;
ptr0 += ixs*cn; ptr1 += ixs*cn;
for( k = 0; k < cn; k++ )
dst[x*cn + k] = ptr0[k]*b1 + ptr1[k]*b;
}
}
}
}
}
//----------------------------------------------------------
//
//----------------------------------------------------------
void myGetQuadrangleSubPix(const Mat& src, Mat& dst,Mat& m )
{
CV_Assert( src.channels() == dst.channels() );
cv::Size win_size = dst.size();
double matrix[6];
cv::Mat M(2, 3, CV_64F, matrix);
m.convertTo(M, CV_64F);
double dx = (win_size.width - 1)*0.5;
double dy = (win_size.height - 1)*0.5;
matrix[2] -= matrix[0]*dx + matrix[1]*dy;
matrix[5] -= matrix[3]*dx + matrix[4]*dy;
if( src.depth() == CV_8U && dst.depth() == CV_32F )
getQuadrangleSubPix_8u32f_CnR( src.data, src.step, src.size(),
(float*)dst.data, dst.step, dst.size(),
matrix, src.channels());
else
{
CV_Assert( src.depth() == dst.depth() );
cv::warpAffine(src, dst, M, dst.size(),
cv::INTER_LINEAR + cv::WARP_INVERSE_MAP,
cv::BORDER_REPLICATE);
}
}
//----------------------------------------------------------
// Copies the pixels covered by the rotated rectangle rr out of img into dst.
// dst must already have the wanted size and type. The 2x3 matrix built here
// holds the rotation by rr.angle in its first two columns and the rectangle
// centre in its last column.
//----------------------------------------------------------
void getRotRectImg(cv::RotatedRect rr,Mat &img,Mat& dst)
{
    const float ang = rr.angle*CV_PI/180.0; // degrees to radians
    const double cosAng = cos(ang);
    const double sinAng = sin(ang);
    Mat m(2,3,CV_64FC1);
    // first row: cos, -sin, centre x
    m.at<double>(0,0) = cosAng;
    m.at<double>(0,1) = -sinAng;
    m.at<double>(0,2) = rr.center.x;
    // second row: sin, cos, centre y
    m.at<double>(1,0) = sinAng;
    m.at<double>(1,1) = cosAng;
    m.at<double>(1,2) = rr.center.y;
    myGetQuadrangleSubPix(img,dst,m);
}
//----------------------------------------------------------
// Demo: draws a rotated rectangle onto the image, extracts the pixels it
// covers with getRotRectImg and shows both images.
// Returns -1 when the input image cannot be read.
//----------------------------------------------------------
int main(int argc, char* argv[])
{
    Mat img=imread("D:\\ImagesForTest\\lena.jpg");
    if (img.empty())
        return -1; // imread yields an empty Mat on failure
    // work on floats in [0, 1]
    img.convertTo(img,CV_32FC3,1.0/255.0);
    cv::RotatedRect rr(cv::Point2f(200,200),Size(50,50),-30);
    // outline of the rotated rectangle
    Point2f rect_points[4];
    rr.points( rect_points );
    for( int j = 0; j < 4; j++ )
    {
        line( img, rect_points[j], rect_points[(j+1)%4], Scalar(0,1,0), 1, CV_AA );
    }
    imshow("colImg",img);
    // destination with the size of the rectangle and the type of the source
    Mat dst(rr.size,CV_32FC3);
    getRotRectImg(rr,img,dst);
    imshow("rotImg",dst);
    cv::waitKey(0);
    cv::destroyAllWindows();
    return 0;
}
The result:

Implementation with OpenCV warpAffine.
// Builds the 2x3 CV_32FC1 matrix that rotates about the centre of rr by its
// angle and moves that centre to (width/2, height/2), i.e. to the middle of an
// image of size rr.size.
Mat getAffineTransformForRotatedRect(RotatedRect rr) {
    float theta = rr.angle * M_PI / 180.0;
    // theta += M_PI; // you may want rotate it upsidedown
    float sinT = sin(theta), cosT = cos(theta);
    const float shiftX = rr.size.width/2.0f - cosT * rr.center.x - sinT * rr.center.y;
    const float shiftY = rr.size.height/2.0f - cosT * rr.center.y + sinT * rr.center.x;
    float coeffs[6] = {
        cosT, sinT, shiftX,
        -sinT, cosT, shiftY};
    // The header below only points at the local array, so hand out a deep copy.
    return Mat(2, 3, CV_32FC1, coeffs).clone();
}
// Returns the region of mat covered by the rotated rectangle rr as an upright
// image of size rr.size, resampled with bicubic interpolation.
Mat getRotatedRectImg(const cv::Mat &mat, RotatedRect rr) {
    Mat transform = getAffineTransformForRotatedRect(rr);
    Mat cropped;
    warpAffine(mat, cropped, transform, rr.size, INTER_CUBIC);
    return cropped;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js