What's wrong in this SSE2 transposition?

What's wrong in this SSE2 transposition? - c++

I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}

(update: this answer was written before the edits that show v_phase being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase at each step. Yeah, there was a // some other code (that will use phase) comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase) to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod on the result to range-reduce into the [0..2Pi) range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1] before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.

Related

C++ PSNR implementation not matching opencv

The PSNR values I was getting looked a little weird so i decided to compare with openCV. The asnwers do not match and I can't for the life of me figure out why.
double calc_psnr(char* src, char* ref, uint n_pixels) {
char a, b;
double diff, mse, psnr, ssd = 0;;
double psnr1, psnr2, psnr3;
for (auto i = 0; i < n_pixels; i++) {
a = *src++;
b = *ref++;
diff = double(a) - double(b);
ssd += diff * diff;
}
// 20 * log_10(max_f/sqrt(mse)) = 20*(log_10(255) + (-1/2)*log_10(mse)) =
// 48.1308036 - 10*log_10(mse) =
mse = ssd / double(n_pixels);
if (mse == 0) return 100;
psnr = 20 * log10(255 / sqrt(mse)
// These all give the same answer
//psnr1 = 10 * log10((255 * 255) / mse);;
//psnr2 = 20 * log10(255 / sqrt(mse) + std::numeric_limits<double>::epsilon());
//psnr3 = 48.1308036 - 10 * log10(mse);
return psnr;
(In python)
import opencv
d = numpy.zeros((3,3))
c = numpy.zeros((3,3))
d[0] = 10
cv2.PSNR(c,d)
32.90201615587573
const int size = 3;
char img_a[size * size];
char img_b[size * size];
for (int i = 0; i < size * size; i++) {
img_a[i] = 0;
img_b[i] = 0;
}
img_b[0] = 10;
double psnr_test = calc_psnr(img_a, img_b, size * size);
std::cout << "psnr_test: " << psnr_test << endl;
psnr_test: 37.6732
The error is far worse when computing with a full image. Any ideas what the difference could be due to? I checked the opencv codebase but don't see any obvious differences: https://github.com/opencv/opencv/blob/35f1a90df7e5a9b3b275a74868759efd787a8c70/modules/ts/src/ts_func.cpp
Thanks for any help!

C++ manually-unrolled (conditional-sum) is slower than regular code, was expecting AVX vectorization

test_euclid_ask.h (only need to read 2 functions: euclid_slow, euclid_fast)
#pragma once
#include "included.h"
double
euclid_slow(int n, double* data1, double* data2, int* mask1, int* mask2, const double weight[])
{
double result = 0.0;
double totalWeight = 0;
for (int i = 0; i < n; i++) {
if (mask1[i] && mask2[i]) {
double term = data1[i] - data2[i];
result += weight[i] * term * term;
totalWeight += weight[i];
}
}
if (totalWeight==0) return 0;
return result / totalWeight;
}
double
euclid_fast(int n, double* data1, double* data2, int* mask1, int* mask2, const double weight[])
{
double result = 0.0;
double totalWeight = 0;
double subResult[4] = { 0. };
double subTweight[4] = { 0. };
double subDiff[4] = { 0. };
double subWeight[4] = { 0. };
double subMask[4] = { 0. };
int nstep4 = n - n % 4;
for (int i = 0; i < nstep4; i += 4) {
subMask[0] = mask1[i] && mask2[i];
subMask[1] = mask1[i + 1] && mask2[i + 1];
subMask[2] = mask1[i + 2] && mask2[i + 2];
subMask[3] = mask1[i + 3] && mask2[i + 3];
if (!(subMask[0] || subMask[1] || subMask[2] || subMask[3])) continue;
subDiff[0] = data1[i] - data2[i];
subDiff[1] = data1[i + 1] - data2[i + 1];
subDiff[2] = data1[i + 2] - data2[i + 2];
subDiff[3] = data1[i + 3] - data2[i + 3];
subDiff[0] *= subDiff[0];
subDiff[1] *= subDiff[1];
subDiff[2] *= subDiff[2];
subDiff[3] *= subDiff[3];
subWeight[0] = weight[i] * subMask[0];
subWeight[1] = weight[i + 1] * subMask[1];
subWeight[2] = weight[i + 2] * subMask[2];
subWeight[3] = weight[i + 3] * subMask[3];
subTweight[0] += subWeight[0];
subTweight[1] += subWeight[1];
subTweight[2] += subWeight[2];
subTweight[3] += subWeight[3];
subResult[0] += subWeight[0] * subDiff[0];
subResult[1] += subWeight[1] * subDiff[1];
subResult[2] += subWeight[2] * subDiff[2];
subResult[3] += subWeight[3] * subDiff[3];
}
for (int i = nstep4; i < n; i++) {
if (mask1[i] && mask2[i]) {
double term = data1[i] - data2[i];
result += weight[i] * term * term;
totalWeight += weight[i];
}
}
result += subResult[0] + subResult[1] + subResult[2] + subResult[3];
totalWeight += subTweight[0] + subTweight[1] + subTweight[2] + subTweight[3];
//cout << "end fast\n";
if (!totalWeight) return 0;
return result / totalWeight;
}
void test_euclid_ask()
{
const int MAXN = 10000000, MINN = 1000000;
double* data1, * data2;
int* mask1, * mask2;
double* dataPro1, * dataPro2;
int* maskPro1, * maskPro2;
double *weight, * weightPro;
//***********
data1 = new double[MAXN + MINN + 1];
data2 = new double[MAXN + MINN + 1];
mask1 = new int[MAXN + MINN + 1];
mask2 = new int[MAXN + MINN + 1];
dataPro1 = new double[MAXN + MINN + 1];
dataPro2 = new double[MAXN + MINN + 1];
maskPro1 = new int[MAXN + MINN + 1];
maskPro2 = new int[MAXN + MINN + 1];
// ******
weight = new double[MAXN + MINN + 1];
weightPro = new double[MAXN + MINN + 1];
MyTimer timer;
int n;
double guess1, guess2, tmp, total1 = 0, total2 = 0, prev1 = 0, prev2 = 0;
for (int t = 5000; t < 6000; t++) {
if (t <= 5000) n = t;
else n = MINN + rand() % (MAXN - MINN);
cout << n << "\n";
int index = 0;
for (int i = 0; i < n; i++) {
weight[i] = int64(randomed()) % 100;
data1[i] = int64(randomed()) % 100;
data2[i] = int64(randomed()) % 100;
mask1[i] = rand() % 10;
mask2[i] = rand() % 10;
}
memcpy(weightPro, weight, n * sizeof(double));
memcpy(dataPro1, data1, n * sizeof(double));
memcpy(dataPro2, data2, n * sizeof(double));
memcpy(maskPro1, mask1, n * sizeof(int));
memcpy(maskPro2, mask2, n * sizeof(int));
//****
int tmp = flush_cache(); // do something to ensure the cache does not contain test data
cout << "ignore this " << tmp << "\n";
timer.startCounter();
guess1 = euclid_slow(n, data1, data2, mask1, mask2, weight);
tmp = timer.getCounterMicro();
total1 += tmp;
cout << "time slow = " << tmp << " us\n";
timer.startCounter();
guess2 = euclid_fast(n, dataPro1, dataPro2, maskPro1, maskPro2, weightPro);
tmp = timer.getCounterMicro();
total2 += tmp;
cout << "time fast = " << tmp << " us\n";
bool ok = fabs(guess1 - guess2) <= 0.1;
if (!ok) {
cout << "error at N = " << n << "\n";
exit(-1);
}
cout << "\n";
}
cout << "slow speed = " << (total1 / 1000) << " ms\n";
cout << "fast speed = " << (total2 / 1000) << " ms\n";
}
Basically, the function computes a kind-of Euclidean distance between 2 arrays:
result = sum(weight[i] * (data1[i] - data2[i])^2)
but only in positions where both values are available (mask1[i]==0 means it's ignored, same with mask2). The normal code is in function euclid_slow.
So I tried to improve the code by processing 4 elements at once, hoping that SSE/AVX can speed this up. However, the result stays the same or slower(using g++ -O3 -march=native) or becomes 40% slower (using Visual Studio 2019 compiler, release mode (x64), -O2, AVX2 enabled). I tried both -O2 and -O3, same result.
The compiler made better optimizations than what I wrote. But how can I make it actually faster?
Edit: code to test the programs here

Mac Segmentation fault on Unix Executable File, file compilation is fine [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 6 years ago.
Improve this question
I am trying to run a code scheme published in the following paper:
http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1001059#s4
Specifically, the implementation of the code is from:
http://www.comp.nus.edu.sg/~rpsysbio/pada/
Following successful compilation of the below code on Mac OS X El Capitan using gcc Fortran, I get an executable file as expected from the code below. However, when I try to execute this file I get
segmentation fault 11.
After some research I think this is a recursion problem causing the stack to overflow, but I have no idea how to solve this. Could someone point me in the right direction please?
#include <stdio.h>
#include <math.h>
#include "./3rdparty/dSFMT-src-2.0/dSFMT.c"
double x1,
x1p,
x1preN;
double x2,
x2p,
x2preN;
double x3,
x3p,
x3preN;
double x4,
x4p,
x4preN;
double k1;
double k2;
double k3;
int x1ctr[100][5][5][5][5][5][5][5];
int x2ctr[100][5][5][5][5][5][5][5];
int x3ctr[100][5][5][5][5][5][5];
int x4ctr[100][5][5][5][5];
double
fx1(double x1)
{
return -(k1 * x1 * x3 - k2 * x2) + k3 * x2;
}
double
fx2(double x2)
{
return (k1 * x1 * x3 - k2 * x2) - k3 * x2;
}
double
fx3(double x3)
{
return -(k1 * x1 * x3 - k2 * x2);
}
double
fx4(double x4)
{
return k3 * x2;
}
int
discretize(double v, double xi[], int length)
{
for (int j = 1; j < length - 1; j++)
if (v < xi[j])
return j - 1;
return length - 2;
}
int
main(int argc, char *argv[])
{
int myid = atoi(argv[1]);
double dt = 0.01,
halfdt = dt / 2.0;
int tps = (int) (10.0 / dt),
t,
i;
int block = tps / 100,
tb;
double halfF1,
halfF2,
F3,
F4;
double x1i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x2i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x3i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
double x4i[] = { 0.0, 3.0, 6.0, 9.0, 12.0, 15.0 };
int x1pre,
x1post,
x1init = 2;
int x2pre,
x2post,
x2init = 0;
int x3pre,
x3post,
x3init = 4;
int x4pre,
x4post,
x4init = 0;
double k1i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
double k2i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
double k3i[] =
{ 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 1.0 };
int k1bin;
int k2bin;
int k3bin;
int sampleNo = 1000;
dsfmt_t dsfmt;
int seed = 7018 + myid;
dsfmt_init_gen_rand(&dsfmt, seed);
for (int i = 0; i < sampleNo; i++) {
k1 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
k2 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
k3 = 0.0 + dsfmt_genrand_close_open(&dsfmt) * 1.0;
x1 = x1i[x1init] +
dsfmt_genrand_close_open(&dsfmt) * (x1i[x1init + 1] -
x1i[x1init]);
x2 = x2i[x2init] +
dsfmt_genrand_close_open(&dsfmt) * (x2i[x2init + 1] -
x2i[x2init]);
x3 = x3i[x3init] +
dsfmt_genrand_close_open(&dsfmt) * (x3i[x3init + 1] -
x3i[x3init]);
x4 = x4i[x4init] +
dsfmt_genrand_close_open(&dsfmt) * (x4i[x4init + 1] -
x4i[x4init]);
x1preN = x1;
x2preN = x2;
x3preN = x3;
x4preN = x4;
k1bin = discretize(k1, k1i, 6);
k2bin = discretize(k2, k2i, 6);
k3bin = discretize(k3, k3i, 6);
for (int t = 1; t <= tps; t++) {
// x1
halfF1 = halfdt * fx1(x1);
halfF2 = halfdt * fx1(x1 + halfF1);
F3 = dt * fx1(x1 + halfF2);
F4 = dt * fx1(x1 + F3);
x1p = x1 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x2
halfF1 = halfdt * fx2(x2);
halfF2 = halfdt * fx2(x2 + halfF1);
F3 = dt * fx2(x2 + halfF2);
F4 = dt * fx2(x2 + F3);
x2p = x2 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x3
halfF1 = halfdt * fx3(x3);
halfF2 = halfdt * fx3(x3 + halfF1);
F3 = dt * fx3(x3 + halfF2);
F4 = dt * fx3(x3 + F3);
x3p = x3 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
// x4
halfF1 = halfdt * fx4(x4);
halfF2 = halfdt * fx4(x4 + halfF1);
F3 = dt * fx4(x4 + halfF2);
F4 = dt * fx4(x4 + F3);
x4p = x4 + (2 * halfF1 + 4 * halfF2 + 2 * F3 + F4) / 6.0;
if (t % block == 0) {
tb = t / block - 1;
x1pre = discretize(x1preN, x1i, 6);
x2pre = discretize(x2preN, x2i, 6);
x3pre = discretize(x3preN, x3i, 6);
x4pre = discretize(x4preN, x4i, 6);
x1post = discretize(x1, x1i, 6);
x2post = discretize(x2, x2i, 6);
x3post = discretize(x3, x3i, 6);
x4post = discretize(x4, x4i, 6);
x1ctr[tb][k1bin][k2bin][k3bin][x1pre][x2pre][x3pre]
[x1post]++;
x2ctr[tb][k1bin][k2bin][k3bin][x1pre][x2pre][x3pre]
[x2post]++;
x3ctr[tb][k1bin][k2bin][x1pre][x2pre][x3pre][x3post]++;
x4ctr[tb][k3bin][x2pre][x4pre][x4post]++;
x1preN = x1;
x2preN = x2;
x3preN = x3;
x4preN = x4;
}
x1 = x1p;
x2 = x2p;
x3 = x3p;
x4 = x4p;
}
}
// output
FILE *out;
char buffer[256];
snprintf(buffer, sizeof(buffer), "dummy.txt");
int idx = 0;
for (tb = 0; tb < 100; tb++) {
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx1T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int ki2 = 0; ki2 < 5; ki2++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x1ctr[tb][ki0][ki1][ki2][vi0][vi1]
[vi2][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx,
ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx2T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int ki2 = 0; ki2 < 5; ki2++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x2ctr[tb][ki0][ki1][ki2][vi0][vi1]
[vi2][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx,
ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx3T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int ki1 = 0; ki1 < 5; ki1++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi2 = 0; vi2 < 5; vi2++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x3ctr[tb][ki0][ki1][vi0][vi1][vi2]
[vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx, ctrtmp);
}
idx++;
}
fclose(out);
snprintf(buffer, sizeof(buffer),
"./models/toy/batct/toyCTx4T%d_%d.txt", tb, myid);
out = fopen(buffer, "w");
idx = 0;
for (int ki0 = 0; ki0 < 5; ki0++)
for (int vi0 = 0; vi0 < 5; vi0++)
for (int vi1 = 0; vi1 < 5; vi1++)
for (int vi = 0; vi < 5; vi++) {
int ctrtmp =
(x4ctr[tb][ki0][vi0][vi1][vi]);
if (ctrtmp > 0) {
fprintf(out, "%d %d\n", idx, ctrtmp);
}
idx++;
}
fclose(out);
}
return 0;
}

Perhaps there are others, but I see only two things that can generate a crash.
you get the myid integer from this instruction
int myid = atoi(argv[1]);
But if you call the program without passing the id parameter? argv[1] is NULL. Crash!
Suggestion: define a default id and check argc; something like
int myid = (argc > 1 ? atoi(argv[1]) : defId);
you fopen() the output files but you don't check the success of the opening; so, when you write in the files, like in
fprintf(out, "%d %d\n", idx,
ctrtmp);
in case of failure, in opening the file, out is NULL. Crash!
Suggestion: check the opening of the output files (out != NULL).
p.s.: sorry for my bad English.

Pointers in C/C++ compiles but gives segfault error

Here's a code snipped that I have for a larger program
double *pos_x_h[224];
double *pos_y_h[224];
const double A = 1;
const int N = 224;
double d_0;
double alpha;
void initialize(double nu, int rows = 16, int columns = 14) {
double d = 1 / double(columns);
d_0 = d * (1 - pow(2.0, nu - 8));
alpha = d - d_0;
double dx = d;
double dy = d * sqrt(3.0) / 2;
for (int j = 0; j < rows; j++) {
for (int i = 0; i < columns; i++) {
int n = i + j * columns;
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
*pos_y_h[n] = j * dy;
}
}
}
int main(int argc, char *argv[]) {
double nu=7.5;
int rows=16;
int columns=14;
initialize(nu);
return 0;
}
The code compiles but it is gives a seg fault error. Can't see why that's the case. Am I going over array_size?

There doesn't seem to be any point in utilizing pos_x_h and pos_y_h as pointer arrays.
Change this:
double *pos_x_h[224];
double *pos_y_h[224];
To this:
double pos_x_h[224];
double pos_y_h[224];
And this:
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
*pos_y_h[n] = j * dy;
To this:
pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
pos_y_h[n] = j * dy;
If you really insist on utilizing pointer arrays, then you can use this (in addition to the above):
double *pos_x_h_ptr[224];
double *pos_y_h_ptr[224];
for (int n=0; n<224; n++)
{
pos_x_h_ptr[n] = pos_x_h+n;
pos_y_h_ptr[n] = pos_y_h+n;
}

double *pos_x_h[224];
double *pos_y_h[224];
are arrays of pointers, but you use them wihtout allocating memory
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
*pos_y_h[n] = j * dy;
probably something like that
pos_x_h[n] = malloc(sizeof(double));
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
pos_y_h[n] = malloc(sizeof(double));
*pos_y_h[n] = j * dy;
if you need to alocate memory outside the initialize function (why would you? it is init function) you can do it in main
int i = 0;
for(;i< 224;++i)
{
pos_x_h[i] = malloc(sizeof(double));
pos_y_h[i] = malloc(sizeof(double));
}

My Particle Swarm Optimization code generates different answers in C++ and MATLAB

I have written a global version of Particle Swarm Optimization algorithm in C++.
I tried to write it exactly as same as my MATLAB PSO code that have written before, but this code generates different and so worst answers.
The MATLAB code is:
clear all;
numofdims = 30;
numofparticles = 50;
c1 = 2;
c2 = 2;
numofiterations = 1000;
V = zeros(50, 30);
initialpop = V;
Vmin = zeros(30, 1);
Vmax = Vmin;
Xmax = ones(30, 1) * 100;
Xmin = -Xmax;
pbestfits = zeros(50, 1);
worsts = zeros(50, 1);
bests = zeros(50, 1);
meanfits = zeros(50, 1);
pbests = zeros(50, 30);
initialpop = Xmin + (Xmax - Xmin) .* rand(numofparticles, numofdims);
X = initialpop;
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
gbestfit = minfit;
gbest = X(minfitidx, :);
for i = 1:numofdims
Vmax(i) = 0.2 * (Xmax(i) - Xmin(i));
Vmin(i) = -Vmax(i);
end
for t = 1:1000
w = 0.9 - 0.7 * (t / numofiterations);
for i = 1:numofparticles
if(fitnesses(i) < pbestfits(i))
pbestfits(i) = fitnesses(i);
pbests(i, :) = X(i, :);
end
end
for i = 1:numofparticles
for j = 1:numofdims
V(i, j) = min(max((w * V(i, j) + rand * c1 * (pbests(i, j) - X(i, j))...
+ rand * c2 * (gbest(j) - X(i, j))), Vmin(j)), Vmax(j));
X(i, j) = min(max((X(i, j) + V(i, j)), Xmin(j)), Xmax(j));
end
end
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
if(minfit < gbestfit)
gbestfit = minfit;
gbest = X(minfitidx, :);
end
worsts(t) = max(fitnesses);
bests(t) = gbestfit;
meanfits(t) = mean(fitnesses);
end
In which, testfunc1 is:
function [out] = testfunc1(R)
out = sum(R .^ 2, 2);
end
The C++ code is:
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#define rand_01 ((float)rand() / (float)RAND_MAX)
const int numofdims = 30;
const int numofparticles = 50;
using namespace std;
void fitnessfunc(float X[numofparticles][numofdims], float fitnesses[numofparticles])
{
memset(fitnesses, 0, sizeof (float) * numofparticles);
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
fitnesses[i] += (pow(X[i][j], 2));
}
}
}
float mean(float inputval[], int vallength)
{
int addvalue = 0;
for(int i = 0; i < vallength; i++)
{
addvalue += inputval[i];
}
return (float)(addvalue / vallength);
}
void PSO(int numofiterations, float c1, float c2,
float Xmin[numofdims], float Xmax[numofdims], float initialpop[numofparticles][numofdims],
float worsts[], float meanfits[], float bests[], float *gbestfit, float gbest[numofdims])
{
float V[numofparticles][numofdims] = {0};
float X[numofparticles][numofdims];
float Vmax[numofdims];
float Vmin[numofdims];
float pbests[numofparticles][numofdims];
float pbestfits[numofparticles];
float fitnesses[numofparticles];
float w;
float minfit;
int minfitidx;
memcpy(X, initialpop, sizeof(float) * numofparticles * numofdims);
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
for(int i = 0; i < numofdims; i++)
{
Vmax[i] = 0.2 * (Xmax[i] - Xmin[i]);
Vmin[i] = -Vmax[i];
}
for(int t = 0; t < 1000; t++)
{
w = 0.9 - 0.7 * (float) (t / numofiterations);
for(int i = 0; i < numofparticles; i++)
{
if(fitnesses[i] < pbestfits[i])
{
pbestfits[i] = fitnesses[i];
memcpy(pbests[i], X[i], sizeof(float) * numofdims);
}
}
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
V[i][j] = min(max((w * V[i][j] + rand_01 * c1 * (pbests[i][j] - X[i][j])
+ rand_01 * c2 * (gbest[j] - X[i][j])), Vmin[j]), Vmax[j]);
X[i][j] = min(max((X[i][j] + V[i][j]), Xmin[j]), Xmax[j]);
}
}
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
if(minfit < *gbestfit)
{
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
}
worsts[t] = *max_element(fitnesses, fitnesses + numofparticles);
bests[t] = *gbestfit;
meanfits[t] = mean(fitnesses, numofparticles);
}
}
int main()
{
time_t t;
srand((unsigned) time(&t));
float xmin[30], xmax[30];
float initpop[50][30];
float worsts[1000], bests[1000];
float meanfits[1000];
float gbestfit;
float gbest[30];
for(int i = 0; i < 30; i++)
{
xmax[i] = 100;
xmin[i] = -100;
}
for(int i = 0; i < 50; i++)
for(int j = 0; j < 30; j++)
{
initpop[i][j] = rand() % (100 + 100 + 1) - 100;
}
PSO(1000, 2, 2, xmin, xmax, initpop, worsts, meanfits, bests, &gbestfit, gbest);
cout<<"fitness: "<<gbestfit<<endl;
return 0;
}
I have debugged two codes many times but can not find the difference which makes answers different.
It is making me crazy!
May you help me please?
Update:
Please consider that, the function mean is just used for reporting some information and is not used in the optimization procedure.

You've got integer division in the following line
w = 0.9 - 0.7 * (float) (t / numofiterations);
w will be 0.2 for every iteration, change it to
w = 0.9 - 0.7 * t / numofiterations;
The first multiplication will automatically promote t to a double the division should then promote numof iterations to a double.
The parenthesis means it will be done first and therefore not be promoted as wo integers is involved in the division.

This could be a mistake in function mean:
return (float)(addvalue / vallength);
This is integer division, so the result is truncated down, then cast to float. It is unlikely this is what you want.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js