Related
I want to speed up my algorithm which is an objective function f(x). The problem dimension is 5000. I have already introduced a lot of improvement in the code, but still the calculation time does not fit to my expectation.
Most of the dataset are allocated dynamically as (float*)_mm_malloc(N_h*sizeof(float),16). In the objective function where "long" for loops are present I applied successfully the _mm_mul_ps, _mm_rcp_ps, _mm_store_ps ... etc on __m128Var variables. And I introduced threading (_beginthreadex) to speed up the most slowest code.
But there is a part of code which cannot be vectorized easily...
I attached the most problematic code (slowest calculation) where I still cannot make an improvement (reminder, this is a part of a code from a bigger calculation, but my problem can be seen with this). I am expecting vector calculations, but I got simple calculation for each row of code (a lot of MOVSS, MULSS, SUBSS...etc in the assembly code). Could someone give me a hint what can be a problem?
I am using MinGW GCC-8.2.0-3 compiler on Windows machine with -O3 -march=native -ffast-math flags.
#include <immintrin.h>
#include "math.h"
#define N_h 5000
float* x_vec; // allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data0; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data1; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data2; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data3; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
int main()
{
float* q_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xx_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* cP_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xPtr = x_vec;
float* f32Ptr;
float c0;
int n = N_h;
int sum = 0;
while(n > 0)
{
int k=1;
n-=8;
cP_vec[0] = 1;
cP_vec[1] = 1;
cP_vec[2] = 1;
cP_vec[3] = 1;
cP_vec[4] = 1;
cP_vec[5] = 1;
cP_vec[6] = 1;
cP_vec[7] = 1;
//preload of x data shall be done with vector preload, currently it is row-by-row **MOVS**
xx_vec[0] = *xPtr++;
xx_vec[1] = *xPtr++;
xx_vec[2] = *xPtr++;
xx_vec[3] = *xPtr++;
xx_vec[4] = *xPtr++;
xx_vec[5] = *xPtr++;
xx_vec[6] = *xPtr++;
xx_vec[7] = *xPtr++;
c0 = data0[k];
//I am expecting vector subtraction here, but each of the row generates almost same assembly code
q_vec[0] = xx_vec[0] - c0;
q_vec[1] = xx_vec[1] - c0;
q_vec[2] = xx_vec[2] - c0;
q_vec[3] = xx_vec[3] - c0;
q_vec[4] = xx_vec[4] - c0;
q_vec[5] = xx_vec[5] - c0;
q_vec[6] = xx_vec[6] - c0;
q_vec[7] = xx_vec[7] - c0;
//if I create more internal variable for all of the multiplication, does it help?
cP_vec[0] = cP_vec[0] * data1[k] * exp(-pow(q_vec[0], 2.0f) * data2[k]);
cP_vec[1] = cP_vec[1] * data1[k] * exp(-pow(q_vec[1], 2.0f) * data2[k]);
cP_vec[2] = cP_vec[2] * data1[k] * exp(-pow(q_vec[2], 2.0f) * data2[k]);
cP_vec[3] = cP_vec[3] * data1[k] * exp(-pow(q_vec[3], 2.0f) * data2[k]);
cP_vec[4] = cP_vec[4] * data1[k] * exp(-pow(q_vec[4], 2.0f) * data2[k]);
cP_vec[5] = cP_vec[5] * data1[k] * exp(-pow(q_vec[5], 2.0f) * data2[k]);
cP_vec[6] = cP_vec[6] * data1[k] * exp(-pow(q_vec[6], 2.0f) * data2[k]);
cP_vec[7] = cP_vec[7] * data1[k] * exp(-pow(q_vec[7], 2.0f) * data2[k]);
k++;
f32Ptr = &data3[k];
for (int j =1; j <= 5; j++) //the index of this for is defined by a variable in my application, so it is not a constant
{
c0 = data0[k];
//here the subtraction and multiplication is not vectoritzed
q_vec[0] = (xx_vec[0] - c0) * (*f32Ptr);
q_vec[1] = (xx_vec[1] - c0) * (*f32Ptr);
q_vec[2] = (xx_vec[2] - c0) * (*f32Ptr);
q_vec[3] = (xx_vec[3] - c0) * (*f32Ptr);
q_vec[4] = (xx_vec[4] - c0) * (*f32Ptr);
q_vec[5] = (xx_vec[5] - c0) * (*f32Ptr);
q_vec[6] = (xx_vec[6] - c0) * (*f32Ptr);
q_vec[7] = (xx_vec[7] - c0) * (*f32Ptr);
q_vec[0] = (0.5f - 0.5f*erf( q_vec[0] ) );
q_vec[1] = (0.5f - 0.5f*erf( q_vec[1] ) );
q_vec[2] = (0.5f - 0.5f*erf( q_vec[2] ) );
q_vec[3] = (0.5f - 0.5f*erf( q_vec[3] ) );
q_vec[4] = (0.5f - 0.5f*erf( q_vec[4] ) );
q_vec[5] = (0.5f - 0.5f*erf( q_vec[5] ) );
q_vec[6] = (0.5f - 0.5f*erf( q_vec[6] ) );
q_vec[7] = (0.5f - 0.5f*erf( q_vec[7] ) );
//here the multiplication is not vectorized...
cP_vec[0] = cP_vec[0] * q_vec[0];
cP_vec[1] = cP_vec[1] * q_vec[1];
cP_vec[2] = cP_vec[2] * q_vec[2];
cP_vec[3] = cP_vec[3] * q_vec[3];
cP_vec[4] = cP_vec[4] * q_vec[4];
cP_vec[5] = cP_vec[5] * q_vec[5];
cP_vec[6] = cP_vec[6] * q_vec[6];
cP_vec[7] = cP_vec[7] * q_vec[7];
f32Ptr++;
k++;
}
sum += cP_vec[0];
sum += cP_vec[1];
sum += cP_vec[2];
sum += cP_vec[3];
sum += cP_vec[4];
sum += cP_vec[5];
sum += cP_vec[6];
sum += cP_vec[7];
}
return 0;
}
You can see the assembly code on Godbolt:
https://godbolt.org/z/wbkNAk
UPDATE:
I have implemented some SSE calculations. The speedup is approx. x1.10-1.15 which is far below as expected...
Am I do something wrong in the main()?
#include <immintrin.h>
#include "math.h"
#define N_h 5000
#define EXP_TABLE_SIZE 10
static const __m128 M128_1 = {1.0, 1.0, 1.0, 1.0};
float* x_vec; // allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data0; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data1; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data2; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
float* data3; //allocated as: (float*)_mm_malloc(N_h*sizeof(float),16);
typedef struct ExpVar {
enum {
s = EXP_TABLE_SIZE,
n = 1 << s,
f88 = 0x42b00000 /* 88.0 */
};
float minX[8];
float maxX[8];
float a[8];
float b[8];
float f1[8];
unsigned int i127s[8];
unsigned int mask_s[8];
unsigned int i7fffffff[8];
unsigned int tbl[n];
union fi {
float f;
unsigned int i;
};
ExpVar()
{
float log_2 = ::logf(2.0f);
for (int i = 0; i < 8; i++) {
maxX[i] = 88;
minX[i] = -88;
a[i] = n / log_2;
b[i] = log_2 / n;
f1[i] = 1.0f;
i127s[i] = 127 << s;
i7fffffff[i] = 0x7fffffff;
mask_s[i] = mask(s);
}
for (int i = 0; i < n; i++) {
float y = pow(2.0f, (float)i / n);
fi fi;
fi.f = y;
tbl[i] = fi.i & mask(23);
}
}
inline unsigned int mask(int x)
{
return (1U << x) - 1;
}
};
inline __m128 exp_ps(__m128 x, ExpVar* expVar)
{
__m128i limit = _mm_castps_si128(_mm_and_ps(x, *(__m128*)expVar->i7fffffff));
int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *(__m128i*)expVar->maxX));
if (over) {
x = _mm_min_ps(x, _mm_load_ps(expVar->maxX));
x = _mm_max_ps(x, _mm_load_ps(expVar->minX));
}
__m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *(__m128*)(expVar->a)));
__m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *(__m128*)(expVar->b)));
t = _mm_add_ps(t, *(__m128*)(expVar->f1));
__m128i v4 = _mm_and_si128(r, *(__m128i*)(expVar->mask_s));
__m128i u4 = _mm_add_epi32(r, *(__m128i*)(expVar->i127s));
u4 = _mm_srli_epi32(u4, expVar->s);
u4 = _mm_slli_epi32(u4, 23);
unsigned int v0, v1, v2, v3;
v0 = _mm_cvtsi128_si32(v4);
v1 = _mm_extract_epi16(v4, 2);
v2 = _mm_extract_epi16(v4, 4);
v3 = _mm_extract_epi16(v4, 6);
__m128 t0, t1, t2, t3;
t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v0]));
t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v1]));
t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v2]));
t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar->tbl[v3]));
t1 = _mm_movelh_ps(t1, t3);
t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
t0 = _mm_movelh_ps(t0, t2);
t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
t0 = _mm_or_ps(t0, t1);
t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
t = _mm_mul_ps(t, t0);
return t;
}
int main()
{
float* q_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xx_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* cP_vec = (float*)_mm_malloc(8*sizeof(float),16);
float* xPtr = x_vec;
float* f32Ptr;
__m128 c0,c1;
__m128* m128Var1;
__m128* m128Var2;
float* f32Ptr1;
float* f32Ptr2;
int n = N_h;
int sum = 0;
ExpVar expVar;
while(n > 0)
{
int k=1;
n-=8;
//cP_vec[0] = 1;
f32Ptr1 = cP_vec;
_mm_store_ps(f32Ptr1,M128_1);
f32Ptr1+=4;
_mm_store_ps(f32Ptr1,M128_1);
//preload x data
//xx_vec[0] = *xPtr++;
f32Ptr1 = xx_vec;
m128Var1 = (__m128*)xPtr;
_mm_store_ps(f32Ptr1,*m128Var1);
m128Var1++;
xPtr+=4;
f32Ptr1+=4;
m128Var1 = (__m128*)xPtr;
_mm_store_ps(f32Ptr1,*m128Var1);
xPtr+=4;
c0 = _mm_set1_ps(data0[k]);
m128Var1 = (__m128*)xx_vec;
f32Ptr1 = q_vec;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
//calc -pow(q_vec[0], 2.0f)
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var1) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var1) );
m128Var1 = (__m128*)q_vec;
*m128Var1 = _mm_xor_ps(*m128Var1, _mm_set1_ps(-0.0));
m128Var1++;
*m128Var1 = _mm_xor_ps(*m128Var1, _mm_set1_ps(-0.0));
//-pow(q_vec[0], 2.0f) * data2[k]
c0 = _mm_set1_ps(data2[k]);
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1 = (__m128*)q_vec;
//calc exp(x)
*m128Var1 = exp_ps(*m128Var1,&expVar);
m128Var1++;
*m128Var1 = exp_ps(*m128Var1,&expVar);
//data1[k] * exp(x)
c0 = _mm_set1_ps(data1[k]);
f32Ptr1 = q_vec;
m128Var1 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
m128Var1++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, c0) );
//cP_vec[0] * data1[k] * exp(x)
f32Ptr1 = cP_vec;
m128Var1 = (__m128*)cP_vec;
m128Var2 = (__m128*)q_vec;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var2) );
m128Var1++;
m128Var2++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var1, *m128Var2) );
k++;
for (int j =1; j <= 5; j++)
{
c0 = _mm_set1_ps(data0[k]);
c1 = _mm_set1_ps(data3[k]);
m128Var1 = (__m128*)xx_vec;
m128Var2 = (__m128*)q_vec;
f32Ptr1 = q_vec;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var2, c1) );
m128Var1++;
m128Var2++;
f32Ptr1+=4;
_mm_store_ps(f32Ptr1, _mm_sub_ps(*m128Var1, c0) );
_mm_store_ps(f32Ptr1, _mm_mul_ps(*m128Var2, c1) );
q_vec[0] = (0.5f - 0.5f*erf( q_vec[0] ) );
q_vec[1] = (0.5f - 0.5f*erf( q_vec[1] ) );
q_vec[2] = (0.5f - 0.5f*erf( q_vec[2] ) );
q_vec[3] = (0.5f - 0.5f*erf( q_vec[3] ) );
q_vec[4] = (0.5f - 0.5f*erf( q_vec[4] ) );
q_vec[5] = (0.5f - 0.5f*erf( q_vec[5] ) );
q_vec[6] = (0.5f - 0.5f*erf( q_vec[6] ) );
q_vec[7] = (0.5f - 0.5f*erf( q_vec[7] ) );
cP_vec[0] = cP_vec[0] * q_vec[0];
cP_vec[1] = cP_vec[1] * q_vec[1];
cP_vec[2] = cP_vec[2] * q_vec[2];
cP_vec[3] = cP_vec[3] * q_vec[3];
cP_vec[4] = cP_vec[4] * q_vec[4];
cP_vec[5] = cP_vec[5] * q_vec[5];
cP_vec[6] = cP_vec[6] * q_vec[6];
cP_vec[7] = cP_vec[7] * q_vec[7];
k++;
}
sum += cP_vec[0];
sum += cP_vec[1];
sum += cP_vec[2];
sum += cP_vec[3];
sum += cP_vec[4];
sum += cP_vec[5];
sum += cP_vec[6];
sum += cP_vec[7];
}
return 0;
}
https://godbolt.org/z/N7K6j0
The code is super weird with all those explicit stores into memory instead of plain old variables. I've tried to make it less weird, and added a vectorized erf, which is the main computation. Since I don't know what this code is supposed to do I can't really test it, except for performance, which did get better.
while (n > 0)
{
int k = 1;
n -= 8;
//preload x data
__m128 x_0 = _mm_load_ps(xPtr);
__m128 x_1 = _mm_load_ps(xPtr + 4);
xPtr += 8;
__m128 c0 = _mm_set1_ps(data0[k]);
__m128 q_0 = _mm_sub_ps(x_0, c0);
__m128 q_1 = _mm_sub_ps(x_1, c0);
//pow(q_vec, 2.0f)
__m128 t_0 = _mm_mul_ps(q_0, q_0);
__m128 t_1 = _mm_mul_ps(q_1, q_1);
//-pow(q_vec[0], 2.0f) * data2[k]
__m128 neg_data2k = _mm_xor_ps(_mm_set1_ps(data2[k]), _mm_set1_ps(-0.0));
t_0 = _mm_mul_ps(t_0, neg_data2k);
t_1 = _mm_mul_ps(t_1, neg_data2k);
//exp(-pow(q_vec[0], 2.0f) * data2[k])
t_0 = fast_exp_sse(t_0);
t_1 = fast_exp_sse(t_1);
//cP = data1[k] * exp(...)
c0 = _mm_set1_ps(data1[k]);
__m128 cP_0 = _mm_mul_ps(t_0, c0);
__m128 cP_1 = _mm_mul_ps(t_1, c0);
k++;
for (int j = 1; j <= 5; j++)
{
__m128 data0k = _mm_set1_ps(data0[k]);
__m128 data3k = _mm_set1_ps(data3[k]);
// q = (x - data0k) * data3k;
q_0 = _mm_mul_ps(_mm_sub_ps(x_0, data0k), data3k);
q_1 = _mm_mul_ps(_mm_sub_ps(x_1, data0k), data3k);
// q = 0.5 - 0.5 * erf(q)
__m128 half = _mm_set1_ps(0.5);
q_0 = _mm_sub_ps(half, _mm_mul_ps(half, erf_sse(q_0)));
q_1 = _mm_sub_ps(half, _mm_mul_ps(half, erf_sse(q_1)));
// cP = cP * q;
cP_0 = _mm_mul_ps(cP_0, q_0);
cP_1 = _mm_mul_ps(cP_1, q_1);
k++;
}
__m128 t = _mm_add_ps(cP_0, cP_1);
t = _mm_hadd_ps(t, t);
t = _mm_hadd_ps(t, t);
sum += _mm_cvtss_f32(t);
}
For erf I used:
__m128 erf_sse(__m128 x)
{
__m128 a1 = _mm_set1_ps(0.0705230784);
__m128 a2 = _mm_set1_ps(0.0422820123);
__m128 a3 = _mm_set1_ps(0.0092705272);
__m128 a4 = _mm_set1_ps(0.0001520143);
__m128 a5 = _mm_set1_ps(0.0002765672);
__m128 a6 = _mm_set1_ps(0.0000430638);
__m128 one = _mm_set1_ps(1);
__m128 p = _mm_add_ps(one,
_mm_mul_ps(x, _mm_add_ps(a1,
_mm_mul_ps(x, _mm_add_ps(a2,
_mm_mul_ps(x, _mm_add_ps(a3,
_mm_mul_ps(x, _mm_add_ps(a4,
_mm_mul_ps(x, _mm_add_ps(a5,
_mm_mul_ps(x, a6))))))))))));
p = _mm_mul_ps(p, p);
p = _mm_mul_ps(p, p);
p = _mm_mul_ps(p, p);
p = _mm_mul_ps(p, p);
return _mm_sub_ps(one, _mm_div_ps(one, p));
}
I'm not too sure about this one, it's just a formula from wikipedia transcribed into SSE instrinsics, using Horner's scheme to evaluate the polynomial. There is probably a better way.
For fast_exp_sse the usual combination of exponent extraction and a polynomial approximation. Going through a huge lookup table is a good way to ruin the SIMD gains.
I'm trying to test and animate some Clifford attractors
Seen here:
http://paulbourke.net/fractals/clifford/
I tried compiling the code taken from here.
http://paulbourke.net/fractals/clifford/paul_richards/main.cpp
The code compiles but no image file is generated.
I'm using:
Visual Studio Code
Version: 1.38.0
Commit: 3db7e09f3b61f915d03bbfa58e258d6eee843f35
Date: 2019-09-03T21:51:09.716Z
Electron: 4.2.10
Chrome: 69.0.3497.128
Node.js: 10.11.0
V8: 6.9.427.31-electron.0
OS: Linux x64 4.15.0-60-generic snap
/*
xn+1 = sin(a yn) + c cos(a xn)
yn+1 = sin(b xn) + d cos(b yn)
*/
#include <iostream>
#include <cmath>
#include <vector>
using namespace std;
// Change params only in this block
namespace {
const int width = 1600;
const int height = 1200;
const int frames = 10000;
const int iters = 10000;
const int skipIters = 10;
double sensitivity = 0.02;
const double minX = -4.0;
const double minY = minX * height / width;
const double maxX = 4.0;
const double maxY = maxX * height / width;
const double minA = acos( 1.6 / 2.0 );
const double maxA = acos( 1.3 / 2.0 );
const double minB = acos( -0.6 / 2.0 );
const double maxB = acos( 1.7 / 2.0 );
const double minC = acos( -1.2 / 2.0 );
const double maxC = acos( 0.5 / 2.0 );
const double minD = acos( 1.6 / 2.0 );
const double maxD = acos( 1.4 / 2.0 );
};
class Color {
public:
double r, g, b;
Color(const double &red = 0, const double &green = 0, const double &blue = 0) : r(red), g(green), b(blue) {
}
Color& operator+=(const Color &rhs) {
r += rhs.r;
g += rhs.g;
b += rhs.b;
return *this;
}
static Color createHue( double h ) {
h *= 6.0;
int hi = static_cast<int>( h );
double hf = h - hi;
switch( hi % 6 ) {
case 0:
return Color( 1.0 , hf, 0.0 );
case 1:
return Color( 1.0 - hf, 1.0, 0.0 );
case 2:
return Color( 0.0 , 1.0, hf );
case 3:
return Color( 0.0, 1.0 - hf, 1.0 );
case 4:
return Color( hf, 0.0, 1.0 );
case 5:
return Color( 1.0, 0.0, 1.0 - hf );
}
return Color();
}
Color operator+(const Color &rhs) const {
return Color(*this) += rhs;
}
};
int main(void) {
vector<Color> image( width * height );
for (int i = 0; i < frames; i++) {
const double p = static_cast<double>(i) / frames;
const double a = cos( minA + p * (maxA - minA) ) * 2.0;
const double b = cos( minB + p * (maxB - minB) ) * 2.0;
const double c = cos( minC + p * (maxC - minC) ) * 2.0;
const double d = cos( minD + p * (maxD - minD) ) * 2.0;
const Color curCol = Color::createHue( p );
double x = 0.0, y = 0.0;
for (int j = 0; j < iters; j++) {
double xn = sin(a * y) + c * cos(a * x);
double yn = sin(b * x) + d * cos(b * y);
x = xn;
y = yn;
if ( j < skipIters )
continue;
int xi = static_cast<int>( (x - minX) * width / (maxX - minX) );
int yi = static_cast<int>( (y - minY) * height / (maxY - minY) );
if ( xi >= 0 && xi < width &&
yi >= 0 && yi < height ) {
image[ xi + yi * width ] += curCol;
}
}
clog << "\r" << i;
}
clog << "\n";
cout
<< "P6\n"
<< width << " " << height << "\n"
<< "255\n";
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
Color &c = image[ x + y * width ];
unsigned char r = static_cast<unsigned char>( (1.0 - exp( -sensitivity * c.r )) * 255.0 );
unsigned char g = static_cast<unsigned char>( (1.0 - exp( -sensitivity * c.g )) * 255.0 );
unsigned char b = static_cast<unsigned char>( (1.0 - exp( -sensitivity * c.b )) * 255.0 );
cout << r << g << b;
}
}
return 0;
}
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 6 years ago.
Improve this question
I am trying to run a code scheme published in the following paper:
http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1001059#s4
Specifically, the implementation of the code is from:
http://www.comp.nus.edu.sg/~rpsysbio/pada/
Following successful compilation of the below code on Mac OS X El Capitan using gcc Fortran, I get an executable file as expected from the code below. However, when I try to execute this file I get
segmentation fault 11.
After some research I think this is a recursion problem causing the stack to overflow, but I have no idea how to solve this. Could someone point me in the right direction please?
#include <stdio.h>
#include <math.h>
#include "./3rdparty/dSFMT-src-2.0/dSFMT.c"
double x1,
x1p,
x1preN;
double x2,
x2p,
x2preN;
double x3,
x3p,
x3preN;
double x4,
x4p,
x4preN;
double k1;
double k2;
double k3;
int x1ctr[100][5][5][5][5][5][5][5];
int x2ctr[100][5][5][5][5][5][5][5];
int x3ctr[100][5][5][5][5][5][5];
int x4ctr[100][5][5][5][5];
double
fx1(double x1)
{
return -(k1 * x1 * x3 - k2 * x2) + k3 * x2;
}
double
fx2(double x2)
{
return (k1 * x1 * x3 - k2 * x2) - k3 * x2;
}
double
fx3(double x3)
{
return -(k1 * x1 * x3 - k2 * x2);
}
double
fx4(double x4)
{
return k3 * x2;
}
int
discretize(double v, double xi[], int length)
{
for (int j = 1; j < length - 1; j++)
if (v < xi[j])
return j - 1;
return length - 2;
}
int
main(int argc, char *argv[])
{
int myid = atoi(argv[1]);
double dt = 0.01,
halfdt = dt / 2.0;
int tps = (int) (10.0 / dt),
t,
i;
int block = tps / 100,
tb;
double halfF1,
halfF2,
F3,
F4;
double x1i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x2i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x3i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x4i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
int x1pre,
x1post,
x1init = 2;
int x2pre,
x2post,
x2init = 0;
int x3pre,
x3post,
x3init = 4;
int x4pre,
x4post,
x4init = 0;
double k1i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
double k2i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
double k3i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
int k1bin;
int k2bin;
int k3bin;
int sampleNo = 1000;
dsfmt_t dsfmt;
int seed = 7018 + myid;
dsfmt_init_gen_rand(&dsfmt, seed);
for (int i = 0; i < sampleNo; i++) {
k1 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
k2 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
k3 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
x1 = x1i[x1init] +
dsfmt_genrand_close_open(&dsfmt) * (x1i[x1init + 1] -
x1i[x1init]);
x2 = x2i[x2init] +
dsfmt_genrand_close_open(&dsfmt) * (x2i[x2init + 1] -
x2i[x2init]);
x3 = x3i[x3init] +
dsfmt_genrand_close_open(&dsfmt) * (x3i[x3init + 1] -
x3i[x3init]);
x4 = x4i[x4init] +
dsfmt_genrand_close_open(&dsfmt) * (x4i[x4init + 1] -
x4i[x4init]);
x1preN = x1;
x2preN = x2;
x3preN = x3;
x4preN = x4;
k1bin = discretize(k1, k1i, 6);
k2bin = discretize(k2, k2i, 6);
k3bin = discretize(k3, k3i, 6);
for (int t = 1; t <= tps; t++) {
// x1
halfF1 = halfdt * fx1(x1);
halfF2 = halfdt * fx1(x1 + halfF1);
F3 = dt * fx1(x1 + halfF2);
F4 = dt * fx1(x1 + F3);
x1p = x1 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x2
halfF1 = halfdt * fx2(x2);
halfF2 = halfdt * fx2(x2 + halfF1);
F3 = dt * fx2(x2 + halfF2);
F4 = dt * fx2(x2 + F3);
x2p = x2 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x3
halfF1 = halfdt * fx3(x3);
halfF2 = halfdt * fx3(x3 + halfF1);
F3 = dt * fx3(x3 + halfF2);
F4 = dt * fx3(x3 + F3);
x3p = x3 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x4
halfF1 = halfdt * fx4(x4);
halfF2 = halfdt * fx4(x4 + halfF1);
F3 = dt * fx4(x4 + halfF2);
F4 = dt * fx4(x4 + F3);
x4p = x4 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
if (t % block == 0) {
tb = t / block - 1;
x1pre = discretize(x1preN, x1i, 6);
x2pre = discretize(x2preN, x2i, 6);
x3pre = discretize(x3preN, x3i, 6);
x4pre = discretize(x4preN, x4i, 6);
x1post = discretize(x1, x1i, 6);
x2post = discretize(x2, x2i, 6);
x3post = discretize(x3, x3i, 6);
x4post = discretize(x4, x4i, 6);
x1ctr[tb][k1bin][k2bin][k3bin][x1pre][x2pre][x3pre]
[x1post]++;
x2ctr[tb][k1bin][k2bin][k3bin][x1pre][x2pre][x3pre]
[x2post]++;
x3ctr[tb][k1bin][k2bin][x1pre][x2pre][x3pre][x3post]++;
x4ctr[tb][k3bin][x2pre][x4pre][x4post]++;
x1preN = x1;
x2preN = x2;
x3preN = x3;
x4preN = x4;
}
x1 = x1p;
x2 = x2p;
x3 = x3p;
x4 = x4p;
}
}
// output
FILE *out;
char buffer[256];
snprintf(buffer, sizeof(buffer), "dummy.txt");
int idx = 0;
for (tb = 0; tb < 100; tb++) {
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx1T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int ki2 = 0; ki2 < 5; ki2++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x1ctr[tb][ki0][ki1][ki2][vi0][vi1]
[vi2][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx,
ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx2T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int ki2 = 0; ki2 < 5; ki2++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x2ctr[tb][ki0][ki1][ki2][vi0][vi1]
[vi2][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx,
ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx3T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x3ctr[tb][ki0][ki1][vi0][vi1][vi2]
[vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx, ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx4T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x4ctr[tb][ki0][vi0][vi1][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx, ctrtmp);
}
idx++;
}
fclose(out);
}
return 0;
}
Perhaps there are others, but I see only two things that can generate a crash.
you get the myid integer from this instruction
int myid = atoi(argv[1]);
But if you call the program without passing the id parameter? argv[1] is NULL. Crash!
Suggestion: define a default id and check argc; something like
int myid = (argc > 1 ? atoi(argv[1]) : defId);
you fopen() the output files but you don't check the success of the opening; so, when you write in the files, like in
fprintf(out, "%d %d\n", idx,
ctrtmp);
in case of failure, in opening the file, out is NULL. Crash!
Suggestion: check the opening of the output files (out != NULL).
p.s.: sorry for my bad English.
I am trying to replicate the following fortran 77 subroutine into c++.
subroutine calcul(nb, m, sd, x_min, n, y_w)
common / speed_1 / v_0
common / dat1 / x_m
common / dat2 / x_l0, x_r0, x_lq, e_l, e_r, a1, a2, b1
common / sor_arr / x(6500), y(6500), z(6500)
common / ef_mat / x_f(35000), y_f(35000)
y_m = y_w - rl_h
y_ma = rl_h
y_mi = -rl_h
x_ma = rb_h
x_mi = -rb_h
do 10 i = m, 1, -1
y_y = z(i) - y_m
if (y_y.gt.y_ma) goto 10
if (y_y.le.y_mi) goto 10
c1_s = y(i) / v_0
x_l = x_l0 + e_l*c1_s**a1
x_r = x_r0 + e_r*c1_s**a2
cl_r = x_r - x_l
c2_s = x(i) + x_min
i1 = 1
i2 = nb
i3 = 1
if (mod(i, 2).eq. 0) goto 20
i1 = nb
i2 = 1
i3 = -1
20 do 30 i = i1, i2, i3
r_m1 = float(i - 1)
x_q = c2_s + r_m1*sd - x_l
x_q = x_q * b1 / c1_r + xl_q
x_x = x_q - x_m
if (x_x.gt.x_ma) goto 30
if (x_x.le.x_mi) goto 30
n = n + 1
y_f(n) = y_y
x_f(n) = x_x
30 continue
10 continue
return
end
here is my translation
#include "stdafx.h"
#include <iostream>
using namespace std;
void calcul(int& nb, int& m, double& sd, double& x_min, int& n, double& y_w);
struct speed
{
double v_0;
};
struct dat1
{
double x_m;
};
struct dat2
{
double x_l0, x_r0, x_lq, e_l, e_r, rl_h, rb_h, a1, a2, b1;
};
struct sor_arr
{
double x[6500], y[6500], z[6500];
};
struct ef_mat
{
double x_f[35000], y_f[35000];
};
int main()
{
int nb1 = 10;
int m1 = 250;
double sd1 = 16;
double x1nmin = 1.e38;
int n1 = 0;
double yw1 = 0;
calcul(nb1, m1, sd1, x1nmin, n1, yw1);
system("PAUSE");
return 0;
}
void calcul(int& nb, int& m, double& sd, double& x_min, int& n, double& y_w)
{
struct speed sp;
struct dat1 d1;
struct dat2 d2;
struct sor_arr s;
struct ef_mat ef;
int i1;
int i2;
int i3;
double y_m;
double y_ma;
double y_mi;
double x_ma;
double x_mi;
double y_y;
double x_x;
double c1_s;
double x_l;
double x_r;
double cl_r;
double c2_s;
double r_m1;
double x_q;
y_m = y_w - d2.rl_h;
y_ma = d2.rl_h;
y_mi = -d2.rl_h;
x_ma = d2.rb_h;
x_mi = -d2.rb_h;
for (int i = m; i > 1; i--)
{
y_y = s.z[i] - y_m;
if (y_y > y_ma)
{
continue;
}
if (y_y <= y_ma)
{
continue;
}
c1_s = s.y[i] / sp.v_0;
x_l = d2.x_l0 + d2.e_l * pow(c1_s, d2.a1);
x_r = d2.x_r0 + d2.e_r * pow(c1_s, d2.a2);
cl_r = x_r - x_l;
c2_s = s.x[i] + x_min;
i1 = 1;
i2 = nb;
i3 = 1;
if (fmod(i,2)== 0)
{
for (int i = i1; i < i2; i+= i3)
{
r_m1 = float(i - 1);
x_q = c2_s + r_m1 * sd - x_l;
x_q = x_q * d2.b1 / cl_r + d2.x_lq;
x_x = x_q - d1.x_m;
if (x_x >= x_ma)
{
continue;
}
if (x_x < x_ma)
{
continue;
}
n = n + 1;
ef.y_f[n] = y_y;
ef.x_f[n] = x_x;
}
i1 = nb;
i2 = 1;
i3 = -1;
}
}
}
The errors are below. Can someone help me and tell me how to fix?
error C4700: uninitialized local variable 'sp' used
error C4700: uninitialized local variable 'd2' used
error C4700: uninitialized local variable 'd1' used
This is somewhat crazy and bold, but what you could try is to initialize your local variables.
For instance:
struct speed sp = {0};
struct dat1 d1 = {0};
struct dat2 d2 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
If you go really crazy, you could even consider using other values than 0 - something that actually makes sense in this algorithm would be awesome!!!
Note: I have not the slightest clue about Fortran, nor did I look into further problems of your code.
I am implementing an image analysis algorithm using openCV and c++, but I found out openCV doesnt have any function for Butterworth Bandpass filter officially.
in my project I have to pass a time series of pixels into the Butterworth 5 order filter and the function will return the filtered time series pixels. Butterworth(pixelseries,order, frequency), if you have any idea to help me of how to start please let me know. Thank you
EDIT :
after getting help, finally I come up with the following code. which can calculate the Numerator Coefficients and Denominator Coefficients, but the problem is that some of the numbers is not as same as matlab results. here is my code:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.14159
double *ComputeLP( int FilterOrder )
{
double *NumCoeffs;
int m;
int i;
NumCoeffs = (double *)calloc( FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
NumCoeffs[0] = 1;
NumCoeffs[1] = FilterOrder;
m = FilterOrder/2;
for( i=2; i <= m; ++i)
{
NumCoeffs[i] =(double) (FilterOrder-i+1)*NumCoeffs[i-1]/i;
NumCoeffs[FilterOrder-i]= NumCoeffs[i];
}
NumCoeffs[FilterOrder-1] = FilterOrder;
NumCoeffs[FilterOrder] = 1;
return NumCoeffs;
}
double *ComputeHP( int FilterOrder )
{
double *NumCoeffs;
int i;
NumCoeffs = ComputeLP(FilterOrder);
if(NumCoeffs == NULL ) return( NULL );
for( i = 0; i <= FilterOrder; ++i)
if( i % 2 ) NumCoeffs[i] = -NumCoeffs[i];
return NumCoeffs;
}
double *TrinomialMultiply( int FilterOrder, double *b, double *c )
{
int i, j;
double *RetVal;
RetVal = (double *)calloc( 4 * FilterOrder, sizeof(double) );
if( RetVal == NULL ) return( NULL );
RetVal[2] = c[0];
RetVal[3] = c[1];
RetVal[0] = b[0];
RetVal[1] = b[1];
for( i = 1; i < FilterOrder; ++i )
{
RetVal[2*(2*i+1)] += c[2*i] * RetVal[2*(2*i-1)] - c[2*i+1] * RetVal[2*(2*i-1)+1];
RetVal[2*(2*i+1)+1] += c[2*i] * RetVal[2*(2*i-1)+1] + c[2*i+1] * RetVal[2*(2*i-1)];
for( j = 2*i; j > 1; --j )
{
RetVal[2*j] += b[2*i] * RetVal[2*(j-1)] - b[2*i+1] * RetVal[2*(j-1)+1] +
c[2*i] * RetVal[2*(j-2)] - c[2*i+1] * RetVal[2*(j-2)+1];
RetVal[2*j+1] += b[2*i] * RetVal[2*(j-1)+1] + b[2*i+1] * RetVal[2*(j-1)] +
c[2*i] * RetVal[2*(j-2)+1] + c[2*i+1] * RetVal[2*(j-2)];
}
RetVal[2] += b[2*i] * RetVal[0] - b[2*i+1] * RetVal[1] + c[2*i];
RetVal[3] += b[2*i] * RetVal[1] + b[2*i+1] * RetVal[0] + c[2*i+1];
RetVal[0] += b[2*i];
RetVal[1] += b[2*i+1];
}
return RetVal;
}
double *ComputeNumCoeffs(int FilterOrder)
{
double *TCoeffs;
double *NumCoeffs;
int i;
NumCoeffs = (double *)calloc( 2*FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
TCoeffs = ComputeHP(FilterOrder);
if( TCoeffs == NULL ) return( NULL );
for( i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
free(TCoeffs);
return NumCoeffs;
}
double *ComputeDenCoeffs( int FilterOrder, double Lcutoff, double Ucutoff )
{
int k; // loop variables
double theta; // PI * (Ucutoff - Lcutoff) / 2.0
double cp; // cosine of phi
double st; // sine of theta
double ct; // cosine of theta
double s2t; // sine of 2*theta
double c2t; // cosine 0f 2*theta
double *RCoeffs; // z^-2 coefficients
double *TCoeffs; // z^-1 coefficients
double *DenomCoeffs; // dk coefficients
double PoleAngle; // pole angle
double SinPoleAngle; // sine of pole angle
double CosPoleAngle; // cosine of pole angle
double a; // workspace variables
cp = cos(PI * (Ucutoff + Lcutoff) / 2.0);
theta = PI * (Ucutoff - Lcutoff) / 2.0;
st = sin(theta);
ct = cos(theta);
s2t = 2.0*st*ct; // sine of 2*theta
c2t = 2.0*ct*ct - 1.0; // cosine of 2*theta
RCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) );
TCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) );
for( k = 0; k < FilterOrder; ++k )
{
PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
SinPoleAngle = sin(PoleAngle);
CosPoleAngle = cos(PoleAngle);
a = 1.0 + s2t*SinPoleAngle;
RCoeffs[2*k] = c2t/a;
RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
}
DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs );
free(TCoeffs);
free(RCoeffs);
DenomCoeffs[1] = DenomCoeffs[0];
DenomCoeffs[0] = 1.0;
for( k = 3; k <= 2*FilterOrder; ++k )
DenomCoeffs[k] = DenomCoeffs[2*k-2];
return DenomCoeffs;
}
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
int i,j;
y[0]=b[0] * x[0];
for (i=1;i<ord+1;i++)
{
y[i]=0.0;
for (j=0;j<i+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<i;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
for (i=ord+1;i<np+1;i++)
{
y[i]=0.0;
for (j=0;j<ord+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<ord;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
}
int main(int argc, char *argv[])
{
//Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
//First value is lower cutoff and second value is higher cutoff
double FrequencyBands[2] = {0.25,0.375};//these values are as a ratio of f/fs, where fs is sampling rate, and f is cutoff frequency
//and therefore should lie in the range [0 1]
//Filter Order
int FiltOrd = 5;
//Pixel Time Series
/*int PixelTimeSeries[N];
int outputSeries[N];
*/
//Create the variables for the numerator and denominator coefficients
double *DenC = 0;
double *NumC = 0;
//Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
NumC = ComputeNumCoeffs(FiltOrd);
for(int k = 0; k<11; k++)
{
printf("NumC is: %lf\n", NumC[k]);
}
//is A in matlab function and the numbers are correct
DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
for(int k = 0; k<11; k++)
{
printf("DenC is: %lf\n", DenC[k]);
}
double y[5];
double x[5]={1,2,3,4,5};
filter(5, DenC, NumC, 5, x, y);
return 1;
}
I get this resutls for my code :
B= 1,0,-5,0,10,0,-10,0,5,0,-1
A= 1.000000000000000, -4.945988709743181, 13.556489496973796, -24.700711850327743,
32.994881546824828, -33.180726698160655, 25.546126213403539, -14.802008410165968,
6.285430089797051, -1.772929809750849, 0.277753012228403
but if I want to test the coefficinets in same frequency band in MATLAB, I get the following results:
>> [B, A]=butter(5, [0.25,0.375])
B = 0.0002, 0, -0.0008, 0, 0.0016, 0, -0.0016, 0, 0.0008, 0, -0.0002
A = 1.0000, -4.9460, 13.5565, -24.7007, 32.9948, -33.1806, 25.5461, -14.8020, 6.2854, -1.7729, 0.2778
I have test this website :http://www.exstrom.com/journal/sigproc/ code, but the result is equal as mine, not matlab. anybody knows why? or how can I get the same result as matlab toolbox?
I know this is a post on an old thread, and I would usually leave this as a comment, but I'm apparently not able to do that.
In any case, for people searching for similar code, I thought I would post the link from where this code originates (it also has C code for other types of Butterworth filter coefficients and some other cool signal processing code).
The code is located here:
http://www.exstrom.com/journal/sigproc/
Additionally, I think there is a piece of code which calculates said scaling factor for you already.
/**********************************************************************
sf_bwbp - calculates the scaling factor for a butterworth bandpass filter.
The scaling factor is what the c coefficients must be multiplied by so
that the filter response has a maximum value of 1.
*/
double sf_bwbp( int n, double f1f, double f2f )
{
int k; // loop variables
double ctt; // cotangent of theta
double sfr, sfi; // real and imaginary parts of the scaling factor
double parg; // pole angle
double sparg; // sine of pole angle
double cparg; // cosine of pole angle
double a, b, c; // workspace variables
ctt = 1.0 / tan(M_PI * (f2f - f1f) / 2.0);
sfr = 1.0;
sfi = 0.0;
for( k = 0; k < n; ++k )
{
parg = M_PI * (double)(2*k+1)/(double)(2*n);
sparg = ctt + sin(parg);
cparg = cos(parg);
a = (sfr + sfi)*(sparg - cparg);
b = sfr * sparg;
c = -sfi * cparg;
sfr = b - c;
sfi = a - b - c;
}
return( 1.0 / sfr );
}
I finally found it.
I just need to implement the following code from matlab source code to c++ . "the_mandrill" were right, I need to add the normalizing constant into the coefficient:
kern = exp(-j*w*(0:length(b)-1));
b = real(b*(kern*den(:))/(kern*b(:)));
EDIT:
and here is the final edition, which the whole code will return numbers exactly equal to MATLAB :
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
double *TCoeffs;
double *NumCoeffs;
std::complex<double> *NormalizedKernel;
double Numbers[11]={0,1,2,3,4,5,6,7,8,9,10};
int i;
NumCoeffs = (double *)calloc( 2*FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
NormalizedKernel = (std::complex<double> *)calloc( 2*FilterOrder+1, sizeof(std::complex<double>) );
if( NormalizedKernel == NULL ) return( NULL );
TCoeffs = ComputeHP(FilterOrder);
if( TCoeffs == NULL ) return( NULL );
for( i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
double cp[2];
double Bw, Wn;
cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
cp[1] = 2*2.0*tan(PI * Ucutoff / 2.0);
Bw = cp[1] - cp[0];
//center frequency
Wn = sqrt(cp[0]*cp[1]);
Wn = 2*atan2(Wn,4);
double kern;
const std::complex<double> result = std::complex<double>(-1,0);
for(int k = 0; k<11; k++)
{
NormalizedKernel[k] = std::exp(-sqrt(result)*Wn*Numbers[k]);
}
double b=0;
double den=0;
for(int d = 0; d<11; d++)
{
b+=real(NormalizedKernel[d]*NumCoeffs[d]);
den+=real(NormalizedKernel[d]*DenC[d]);
}
for(int c = 0; c<11; c++)
{
NumCoeffs[c]=(NumCoeffs[c]*den)/b;
}
free(TCoeffs);
return NumCoeffs;
}
There are code which could be found online implementing butterworth filter. If you use the source code to try to get result matching MATLAB results, there will be the same problem.Basically the result you got from the code hasn't been normalized, and in the source code there is a variable sff in bwhp.c. If you set that to 1, the problem will be easily solved.
I recommend you to use this source code and
the source code and usage could be found here
I added the final edition of function ComputeNumCoeffs to the program and fix "FilterOrder" (k<11 to k<2*FiltOrd+1). Maybe it will save someone's time.
f1=0.5Gz, f2=10Gz, fs=127Gz/2
In MatLab
a={1.000000000000000,-3.329746259105707, 4.180522138699884,-2.365540522960743,0.514875789136976};
b={0.041065495448784, 0.000000000000000,-0.082130990897568, 0.000000000000000,0.041065495448784};
Program:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
#include <complex>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.1415926535897932384626433832795
double *ComputeLP(int FilterOrder)
{
double *NumCoeffs;
int m;
int i;
NumCoeffs = (double *)calloc(FilterOrder+1, sizeof(double));
if(NumCoeffs == NULL) return(NULL);
NumCoeffs[0] = 1;
NumCoeffs[1] = FilterOrder;
m = FilterOrder/2;
for(i=2; i <= m; ++i)
{
NumCoeffs[i] =(double) (FilterOrder-i+1)*NumCoeffs[i-1]/i;
NumCoeffs[FilterOrder-i]= NumCoeffs[i];
}
NumCoeffs[FilterOrder-1] = FilterOrder;
NumCoeffs[FilterOrder] = 1;
return NumCoeffs;
}
double *ComputeHP(int FilterOrder)
{
double *NumCoeffs;
int i;
NumCoeffs = ComputeLP(FilterOrder);
if(NumCoeffs == NULL) return(NULL);
for(i = 0; i <= FilterOrder; ++i)
if(i % 2) NumCoeffs[i] = -NumCoeffs[i];
return NumCoeffs;
}
double *TrinomialMultiply(int FilterOrder, double *b, double *c)
{
int i, j;
double *RetVal;
RetVal = (double *)calloc(4 * FilterOrder, sizeof(double));
if(RetVal == NULL) return(NULL);
RetVal[2] = c[0];
RetVal[3] = c[1];
RetVal[0] = b[0];
RetVal[1] = b[1];
for(i = 1; i < FilterOrder; ++i)
{
RetVal[2*(2*i+1)] += c[2*i] * RetVal[2*(2*i-1)] - c[2*i+1] * RetVal[2*(2*i-1)+1];
RetVal[2*(2*i+1)+1] += c[2*i] * RetVal[2*(2*i-1)+1] + c[2*i+1] * RetVal[2*(2*i-1)];
for(j = 2*i; j > 1; --j)
{
RetVal[2*j] += b[2*i] * RetVal[2*(j-1)] - b[2*i+1] * RetVal[2*(j-1)+1] +
c[2*i] * RetVal[2*(j-2)] - c[2*i+1] * RetVal[2*(j-2)+1];
RetVal[2*j+1] += b[2*i] * RetVal[2*(j-1)+1] + b[2*i+1] * RetVal[2*(j-1)] +
c[2*i] * RetVal[2*(j-2)+1] + c[2*i+1] * RetVal[2*(j-2)];
}
RetVal[2] += b[2*i] * RetVal[0] - b[2*i+1] * RetVal[1] + c[2*i];
RetVal[3] += b[2*i] * RetVal[1] + b[2*i+1] * RetVal[0] + c[2*i+1];
RetVal[0] += b[2*i];
RetVal[1] += b[2*i+1];
}
return RetVal;
}
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
double *TCoeffs;
double *NumCoeffs;
std::complex<double> *NormalizedKernel;
double Numbers[11]={0,1,2,3,4,5,6,7,8,9,10};
int i;
NumCoeffs = (double *)calloc(2*FilterOrder+1, sizeof(double));
if(NumCoeffs == NULL) return(NULL);
NormalizedKernel = (std::complex<double> *)calloc(2*FilterOrder+1, sizeof(std::complex<double>));
if(NormalizedKernel == NULL) return(NULL);
TCoeffs = ComputeHP(FilterOrder);
if(TCoeffs == NULL) return(NULL);
for(i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
double cp[2];
//double Bw;
double Wn;
cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
cp[1] = 2*2.0*tan(PI * Ucutoff/2.0);
//Bw = cp[1] - cp[0];
//center frequency
Wn = sqrt(cp[0]*cp[1]);
Wn = 2*atan2(Wn,4);
//double kern;
const std::complex<double> result = std::complex<double>(-1,0);
for(int k = 0; k<2*FilterOrder+1; k++)
{
NormalizedKernel[k] = std::exp(-sqrt(result)*Wn*Numbers[k]);
}
double b=0;
double den=0;
for(int d = 0; d<2*FilterOrder+1; d++)
{
b+=real(NormalizedKernel[d]*NumCoeffs[d]);
den+=real(NormalizedKernel[d]*DenC[d]);
}
for(int c = 0; c<2*FilterOrder+1; c++)
{
NumCoeffs[c]=(NumCoeffs[c]*den)/b;
}
free(TCoeffs);
return NumCoeffs;
}
double *ComputeDenCoeffs(int FilterOrder, double Lcutoff, double Ucutoff)
{
int k; // loop variables
double theta; // PI * (Ucutoff - Lcutoff)/2.0
double cp; // cosine of phi
double st; // sine of theta
double ct; // cosine of theta
double s2t; // sine of 2*theta
double c2t; // cosine 0f 2*theta
double *RCoeffs; // z^-2 coefficients
double *TCoeffs; // z^-1 coefficients
double *DenomCoeffs; // dk coefficients
double PoleAngle; // pole angle
double SinPoleAngle; // sine of pole angle
double CosPoleAngle; // cosine of pole angle
double a; // workspace variables
cp = cos(PI * (Ucutoff + Lcutoff)/2.0);
theta = PI * (Ucutoff - Lcutoff)/2.0;
st = sin(theta);
ct = cos(theta);
s2t = 2.0*st*ct; // sine of 2*theta
c2t = 2.0*ct*ct - 1.0; // cosine of 2*theta
RCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));
TCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));
for(k = 0; k < FilterOrder; ++k)
{
PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
SinPoleAngle = sin(PoleAngle);
CosPoleAngle = cos(PoleAngle);
a = 1.0 + s2t*SinPoleAngle;
RCoeffs[2*k] = c2t/a;
RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
}
DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs);
free(TCoeffs);
free(RCoeffs);
DenomCoeffs[1] = DenomCoeffs[0];
DenomCoeffs[0] = 1.0;
for(k = 3; k <= 2*FilterOrder; ++k)
DenomCoeffs[k] = DenomCoeffs[2*k-2];
return DenomCoeffs;
}
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
int i,j;
y[0]=b[0] * x[0];
for (i=1;i<ord+1;i++)
{
y[i]=0.0;
for (j=0;j<i+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<i;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
for (i=ord+1;i<np+1;i++)
{
y[i]=0.0;
for (j=0;j<ord+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<ord;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
}
int main(int argc, char *argv[])
{
(void)argc;
(void)argv;
//Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
//First value is lower cutoff and second value is higher cutoff
//f1 = 0.5Gz f2=10Gz
//fs=127Gz
//Kotelnikov/2=Nyquist (127/2)
double FrequencyBands[2] = {0.5/(127.0/2.0),10.0/(127.0/2.0)};//these values are as a ratio of f/fs, where fs is sampling rate, and f is cutoff frequency
//and therefore should lie in the range [0 1]
//Filter Order
int FiltOrd = 2;//5;
//Pixel Time Series
/*int PixelTimeSeries[N];
int outputSeries[N];
*/
//Create the variables for the numerator and denominator coefficients
double *DenC = 0;
double *NumC = 0;
//Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
printf("\n");
//is A in matlab function and the numbers are correct
DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
for(int k = 0; k<2*FiltOrd+1; k++)
{
printf("DenC is: %lf\n", DenC[k]);
}
printf("\n");
NumC = ComputeNumCoeffs(FiltOrd,FrequencyBands[0],FrequencyBands[1],DenC);
for(int k = 0; k<2*FiltOrd+1; k++)
{
printf("NumC is: %lf\n", NumC[k]);
}
double y[5];
double x[5]={1,2,3,4,5};
filter(5, DenC, NumC, 5, x, y);
return 1;
}