SSE addition producing garbage

SSE addition producing garbage - c++

I am trying to compare SSE float[4] addition to standard float[4] addition. I tried this:
#include <iostream>
#include <vector>
struct Point4
{
Point4()
{
data[0] = 0;
data[1] = 0;
data[2] = 0;
data[3] = 0;
}
float data[4];
};
static float SumOfDifferences(const Point4& a, const Point4& b)
{
// This function only returns the sum of the sum of the components
float sumValues = 0.0f;
for(unsigned int i = 0; i < 4; ++i)
{
sumValues += a.data[i] + b.data[i];
}
return sumValues;
}
void Standard()
{
Point4 a;
a.data[0] = 1;
a.data[1] = 2;
a.data[2] = 3;
a.data[3] = 4;
Point4 b;
b.data[0] = 1;
b.data[1] = 6;
b.data[2] = 3;
b.data[3] = 5;
float total = 0.0f;
for(unsigned int i = 0; i < 1e6; ++i)
{
total += SumOfDifferences(a, b);
}
std::cout << "total: " << total << std::endl;
}
void Vectorized()
{
typedef int v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
v4sf a;
float* aPointer = (float*)&a;
aPointer[0] = 1; aPointer[1] = 2; aPointer[2] = 3; aPointer[3] = 4;
v4sf b;
float* bPointer = (float*)&b;
bPointer[0] = 1; bPointer[1] = 2; bPointer[2] = 3; bPointer[3] = 4;
float total = 0.0f;
v4sf result;
float* resultPointer = (float*)&result;
for(unsigned int i = 0; i < 1e6; ++i)
{
result = a + b; // Vectorized operation
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer[component];
}
}
std::cout << "total: " << total << std::endl;
}
int main()
{
// Standard();
Vectorized();
return 0;
}
but the output is 'inf' for the Vectorized() function. When I stepped through with a debugger, the values of 'result' seem to be garbage (i'd expect them to be (0, 4, 0, 1) ). Where am I going wrong here?

Try typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
I get 2e+07 as the result.

Related

C++ neural network implemented from scratch cannot get above 50% on MNIST

So I have implemented a fully connected one hidden layer neural network in C++ using Eigen for matrix multiplication. It uses minibatch gradient descent.
However, my model cannot get above 50% accuracy on mnist. I have tried learning rates from between 0.0001 and 10. The model does overfit on training sizes < 100 (with ~90% accuracy which is still pretty bad), albeit extremely slowly.
What might be causing this low accuracy and extremely slow learning? My main concern is that the backpropagation is incorrect. Furthermore, I would prefer not to add any other optimization techniques (learning rate schedule, regularization, etc.).
Feed forward and backprop code:
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
Full program code:
#include <iostream>
#include <fstream>
#include <math.h>
#include <cstdlib>
#include <Eigen/Dense>
#include <vector>
#include <string>
using namespace Eigen;
#define N 30
#define epsilon 0.7
#define epoch 1000
//sizes
const int minibatch_size = 10;
const int training_size = 10000;
const int val_size = 10;
unsigned int num, magic, rows, cols;
//images
unsigned int image[training_size][28][28];
unsigned int val_image[val_size][28][28];
//labels
unsigned int label[training_size];
unsigned int val_label[val_size];
//inputs
MatrixXd X(784, training_size);
MatrixXd Y = MatrixXd::Zero(10, training_size);
//minibatch
MatrixXd mbX(784, minibatch_size);
MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);
//validation
MatrixXd Xv(784, val_size);
MatrixXd Yv = MatrixXd::Zero(10, val_size);
//Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
unsigned int in(std::ifstream& icin, unsigned int size) {
unsigned int ans = 0;
for (int i = 0; i < size; i++) {
unsigned char x;
icin.read((char*)&x, 1);
unsigned int temp = x;
ans <<= 8;
ans += temp;
}
return ans;
}
void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
std::ifstream icin;
//training data
icin.open(ipath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < training_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
image[i][x][y] = in(icin, 1);
X(val, i) = image[i][x][y]/255;
val++;
}
}
}
icin.close();
//training labels
icin.open(lpath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < training_size; i++) {
label[i] = in(icin, 1);
Y(label[i], i) = 1;
}
icin.close();
//validation data
icin.open(ipath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < val_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
val_image[i][x][y] = in(icin, 1);
Xv(val, i) = val_image[i][x][y]/255;
val++;
}
}
}
icin.close();
//validation labels
icin.open(lpath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < val_size; i++) {
val_label[i] = in(icin, 1);
Yv(val_label[i], i) = 1;
}
icin.close();
}
//Neural Network calculations
MatrixXd sigmoid(MatrixXd m) {
m *= -1;
return (1/(1 + m.array().exp())).matrix();
}
MatrixXd sigmoid_derivative(MatrixXd m) {
return (sigmoid(m).array() * (1 - sigmoid(m).array())).matrix();
}
//Initialize weights and biases
//hidden layer
VectorXd b1 = MatrixXd::Zero(N, 1);
MatrixXd w1 = MatrixXd::Random(N, 784);
//output
VectorXd b2 = MatrixXd::Zero(10, 1);
MatrixXd w2 = MatrixXd::Random(10, N);
//Initialize intermediate values
MatrixXd z1, z2, a1, a2, z1v, z2v, a1v, a2v;
MatrixXd ones = MatrixXd::Constant(minibatch_size, 1, 1);
int main() {
input("C:\\Users\\Aaron\\Documents\\Test\\train-images-idx3-ubyte\\train-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\train-labels-idx1-ubyte\\train-labels.idx1-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-labels-idx1-ubyte\\t10k-labels.idx1-ubyte");
std::cout << "Finished Image Processing" << std::endl;
//std::cout << w1 << std::endl;
std::vector<double> val_ac;
std::vector<double> c;
std::vector<int> order;
for (int i = 0; i < training_size; i++) {
order.push_back(i);
}
for (int i = 0; i < epoch; i++) {
//feed forward
std::random_shuffle(order.begin(), order.end());
for (int j = 0; j < training_size/minibatch_size; j++) {
for (int k = 0; k < minibatch_size; k++) {
int index = order[j * minibatch_size + k];
mbX.col(k) = X.col(index);
mbY.col(k) = Y.col(index);
}
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
//std::cout << err << std::endl;
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
//std::cout << err << std::endl;
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
}
//validation
z1 = (w1 * X).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
double cost = 1/((double) training_size) * ((a2 - Y).array() * (a2 - Y).array()).matrix().sum();
c.push_back(cost);
int correct = 0;
for (int i = 0; i < training_size; i++) {
double maxP = -1;
int na;
for (int j = 0; j < 10; j++) {
if (a2(j, i) > maxP) {
maxP = a2(j, i);
na = j;
}
}
if (na == label[i]) correct++;
}
val_ac.push_back(((double) correct) / ((double) training_size));
std::cout << "Finished Epoch " << i + 1 << std::endl;
std::cout << "Cost: " << cost << std::endl;
std::cout << "Accuracy: " << ((double) correct) / ((double) training_size) << std::endl;
}
//plot accuracy
FILE * gp = _popen("gnuplot", "w");
fprintf(gp, "set terminal wxt size 600,400 \n");
fprintf(gp, "set grid \n");
fprintf(gp, "set title '%s' \n", "NN");
fprintf(gp, "plot '-' w line, '-' w lines \n");
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, c[i]);
}
fprintf(gp, "e\n");
//validation accuracy
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, val_ac[i]);
}
fprintf(gp, "e\n");
fflush(gp);
system("pause");
_pclose(gp);
return 0;
}
UPD
Here is a graph of the accuracy on the training dataset (green) and the loss (purple)
https://i.stack.imgur.com/Ya2yR.png
Here is a graph of the loss for the training data and validation data:
https://imgur.com/a/4gmFCrk
The loss of the validation data is increasing past a certain point, which shows signs of overfitting. However, the accuracy still remains abysmal even on the training data.

unsigned int val_image[val_size][28][28];
Xv(val, i) = val_image[i][x][y]/255;
Can you try again with Xv(val, i) = val_image[i][x][y] / 255.0;
There too:
X(val, i) = image[i][x][y]/255;
With the code as written, Xv is 0 very often, and 1, when the image as value 255. With a floating point division, you'll get value between 0.0 and 1.0.
You'll need to check your code for other places where you may be dividing integers.
N.b.: In C++, 240/255 is 0.

C++ Code segmentation fault only in vscode

My C++ code (shown below) works on this site:
GDB Online but not in Visual Studio, where it crashes at
iterations[imag_times][real_times] = i % (iter / 2);
when imag_times is 1 and real_times is 0 with the exception being Exception has occurred. Segmentation fault
I have installed GDB version 7.6.1.
My Question: Does anybody know how to fix that and why this is happening?
#include <iostream>
using namespace std;
int main()
{
// initialization
const double real_min = -1;
const double real_max = 1;
const double imag_min = -1;
const double imag_max = 1;
const int iter = 30;
const double real_offs = 0.01;
const double imag_offs = 0.01;
double z_real = 0;
double z_imag = 0;
double c_real = real_min;
double c_imag = imag_max;
int real_times = 0;
int imag_times = 0;
int** iterations = new int*[1];
iterations[0] = new int;
int i = 0;
// start
while(c_imag >= imag_min)
{
iterations = (int**)realloc(iterations, sizeof(int*) * (imag_times + 1));
real_times = 0;
c_real = real_min;
while(c_real <= real_max)
{
iterations[imag_times] = (int*)realloc(iterations[imag_times], sizeof(int) * (real_times + 1));
z_real = 0;
z_imag = 0;
for(i = 0; i < iter; i++)
{
double z_imag2 = z_imag * z_imag;
z_imag = 2 * z_real * z_imag + c_imag;
z_real = z_real * z_real - z_imag2 + c_real;
if(z_real * z_real + z_imag * z_imag > 4)
{
break;
}
}
iterations[imag_times][real_times] = i % (iter / 2);
real_times++;
c_real = real_min + real_offs * real_times;
}
imag_times++;
c_imag = imag_max - imag_offs * imag_times;
}
// output
for(int i = 0; i < imag_times; i++)
{
for(int j = 0; j < real_times; j++)
{
cout << iterations[i][j];
cout << ",";
}
cout << "\n";
}
cout << "done";
std::cin.get(); // pause so the program doesnt exit instantly
return 0;
}
Thanks in advance!

Gradient descent converging towards the wrong value

I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far :
#include <iostream>
double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46 };
double m, p;
int n = sizeof(X)/sizeof(X[0]);
int main(void) {
double alpha = 0.00004; // 0.00007;
m = (Y[1] - Y[0]) / (X[1] - X[0]);
p = Y[0] - m * X[0];
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
}
return 0;
}
double Loss_function(void) {
double res = 0;
double tmp;
for (int i = 0; i < n; i++) {
tmp = Y[i] - m * X[i] - p;
res += tmp * tmp;
}
return res / 2.0 / (double)n;
}
void gradientStep(double alpha) {
double pg = 0, mg = 0;
for (int i = 0; i < n; i++) {
pg += Y[i] - m * X[i] - p;
mg += X[i] * (Y[i] - m * X[i] - p);
}
p += alpha * pg / n;
m += alpha * mg / n;
}
This code converges towards m = 2.79822, p = -382.666, and an error of 102.88. But if I use my calculator to find out the correct linear regression model, I find that the correct values of m and p should respectively be 1.601 and -191.1.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm

This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach.
The following code does not work properly (and I have no plans to work on it further), but should put on track anyone who's confronted to the same problem as me :
#include <vector>
#include <iostream>
typedef std::vector<double> vect;
std::vector<double> y, omega(2, 0), omega2(2, 0);;
std::vector<std::vector<double>> X;
int n = 10;
int main(void) {
/* Initialize x so that each members contains (1, x_i) */
/* Initialize x so that each members contains y_i */
double alpha = 0.00001;
display();
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
display();
}
return 0;
}
double f_function(const std::vector<double> &x) {
double c;
for (unsigned int i = 0; i < omega.size(); i++) {
c += omega[i] * x[i];
}
return c;
}
void gradientStep(double alpha) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < X[0].size(); j++) {
omega2[j] -= alpha/(double)n * (f_function(X[i]) - y[i]) * X[i][j];
}
}
omega = omega2;
}
void display(void) {
double res = 0, tmp = 0;
for (int i = 0; i < n; i++) {
tmp = y[i] - f_function(X[i]);
res += tmp * tmp; // Loss functionn
}
std::cout << "omega = ";
for (unsigned int i = 0; i < omega.size(); i++) {
std::cout << "[" << omega[i] << "] ";
}
std::cout << "\tError : " << res * .5/(double)n << std::endl;
}

C++ compilation error: "Double array redeclared as different kind of symbol"

When I try to compile the following code, I get the following errors:
hmm.cpp:16:29: error: ‘double gamma [3000][4]’ redeclared as different kind of symbol
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:266:1: error: previous declaration of >‘double gamma(double)’
hmm.cpp: In function ‘double updateModel(int&, int, int, double, double, int, double*, >double ()[4], double ()[5005], double*)’:
hmm.cpp:67:11: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:67:14: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:67:18: error: assignment of function ‘double gamma(double)’
hmm.cpp:67:18: error: cannot convert ‘int’ to ‘double(double)throw ()’ in assignment
hmm.cpp:69:12: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:69:15: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:69:46: error: invalid operands of types ‘double(double)throw ()’ and ‘double’ to >binary ‘operator+’
hmm.cpp:69:46: error: in evaluation of ‘operator+=(double(double)throw (), double)’
I get similar errors everytime gamma is used in the code.
Code follows:
#include <iostream>
#include <fstream>
#include <cstring>
#include <cstdlib>
#include <cmath>
//double atof(const char* str)
using namespace std;
#define MAXT 3000
#define MAXSTATE 4
#define MAXRANGE 5005
#define maxString 52
#define maxResult 405
double alpha[MAXT][MAXSTATE];
double beta[MAXT][MAXSTATE];
double gamma [MAXT][MAXSTATE];
double delta[MAXT][MAXSTATE];
double psi[MAXT][MAXSTATE];//Ψ
double xi[MAXT][MAXSTATE][MAXSTATE];
inline int getIndex(const double& value,const double& min,const double&
max,const int& k)
{
int ret;
//ret = int((value - min)*((max-min)/k)); // [possible error 1]
ret = (k - 1)*(value - min) / (max-min);
return ret;
}
// all the matrix start from 1 to max
// oMin is the minimal value of O
double updateModel(int& q,int tWindow, int oRange, double oMin, double oMax, int
stateNum, double _o[MAXT],double _A[MAXSTATE][MAXSTATE],double _B[MAXSTATE][MAXRANGE],double _Pi[MAXSTATE])
{
double p;
/* calculate lambda */
// alpha
for(int s=1;s<=stateNum;s++)
alpha[1][s] = _Pi[s]*_B[s][getIndex(_o[1], oMin, oMax, oRange)];
for(int t=2;t<=tWindow;t++)
{
for(int s=1;s<=stateNum;s++)
{
alpha[t][s] = 0;
for(int j=1;j<=stateNum;j++)
alpha[t][s] += alpha[t-1][j] * _A[j][s] * _B[s][getIndex(_o[t], oMin, oMax, oRange)];
}
}
// p
p = 0;
for(int i=1;i<=stateNum;i++)
p+=alpha[tWindow][i];
//beta
for(int s = 1; s <= stateNum; s++)
beta[tWindow][s] = 1;
for(int t = tWindow - 1; t >= 1; t--)
{
for(int s = 1; s <= stateNum; s++)
{
beta[t][s] = 0;
for(int j=1;j<=stateNum;j++)
beta[t][s] += beta[t + 1][j] * _A[j][s] * _B[s][getIndex(_o[t + 1], oMin, oMax, oRange)];
}
}
//gamma
for (int t = 1; t <= tWindow; t ++){
for (int i = 1; i <= stateNum; i ++){
gamma[t][i] = 0;
for (int s = 1; s <= stateNum; s ++){
gamma[t][i] += (alpha[t][s] * beta[t][s]);
}
gamma[t][i] = alpha[t][i] * beta[t][i] / gamma[t][i];
}
}
//delta, psi
for (int i = 1; i <= stateNum; i ++){
delta[1][i] = _Pi[i] * _B[i][getIndex(_o[1], oMin, oMax, oRange)];
psi[1][i] = 0;
}
for (int t = 2; t <= tWindow; t ++){
for (int i = 1; i <= stateNum; i ++){
int k = 1;
delta[t][1] = delta[t - 1][1] * _A[1][i] * _B[i][getIndex(_o[t], oMin, oMax, oRange)];
for (int j = 2; j <= stateNum; j ++)
{
if ((delta[t - 1][j] * _A[j][i]) > (delta[t - 1][k] *
_A[k][i]) )
{
delta[t][i] = delta[t - 1][j] * _A[j][i] *
_B[i][getIndex(_o[t], oMin, oMax, oRange)];
k = j;
}
}
psi[t][i] = k;
}
}
int k = 1;
double p_star = delta[tWindow][1];
for (int i = 1; i <= stateNum - 1; i ++)
{
if (delta[tWindow][i + 1] > delta[tWindow][k])
{
p_star = delta[tWindow][i + 1];
k = i + 1;
}
}
int q_star = k;
//xi
for (int t = 1; t <= tWindow - 1; t ++)
{
for (int i = 1; i <= stateNum; i ++)
{
for (int j = 1; j <= stateNum; j ++)
{
xi[t][i][j] = 0;
for (int s1 = 1; s1 <= stateNum; s1 ++)
{
for (int s2 = 1; s2 <= stateNum; s2 ++)
{
xi[t][i][j] = xi[t][i][j] + beta[t + 1][s2]
* _B[s2][getIndex(_o[t + 1], oMin, oMax, oRange)] * _A[s1][s2] * alpha [t][s1];
}
}
xi[t][i][j] = beta[t + 1][j] * _B[j][getIndex(_o[t + 1],
oMin, oMax, oRange)] * _A[i][j] * alpha [t][i] / xi[t][i][j];
}
}
}
//update
for (int i = 1; i <= stateNum; i ++)
{
_Pi[i] = gamma[1][i];
for (int j = 1; j <= stateNum; j ++)
{
double numerator = 0;
double denominator = 0;
for (int t = 1; t <= tWindow - 1; t ++)
{
numerator += xi[t][i][j];
denominator += gamma[t][i];
}
_A[i][j] = numerator / denominator;
}
double tmp,detmp;
for(int k=1; k<=oRange; k++)
{
tmp = 0;
detmp = 0;
for(int t=1; t<=tWindow; t++)
{
if(getIndex(_o[t], oMin, oMax, oRange) == k ) tmp+=gamma[t][i];
detmp+=gamma[t][i];
}
_B[i][k] = tmp/detmp;
}
}
q = q_star;
return p;
}
//double _A[maxState][maxState],double _B[maxState][MAXRANGE],double _Pi[maxState]
void converge(int& q, double previousP,double threshold, int tWindow, int
maxRange, double oMin, double oMax, int stateNum, double _o[MAXT],double _A[MAXSTATE][MAXSTATE],double _B[MAXSTATE][MAXRANGE],double _Pi[MAXSTATE])
{
double currentP = updateModel(q, tWindow,maxRange,oMin,oMax,stateNum, _o,
_A,_B,_Pi);
while(fabs(currentP-previousP)>threshold)
{
previousP = currentP;
currentP = updateModel(q, tWindow,maxRange,oMin,oMax,stateNum, _o,
_A,_B,_Pi);
}
}
int main()
{
ifstream fin1("..\\data\\input.txt");
ifstream fin2("..\\data\\input2.txt");
ofstream fout("..\\data\\output.txt");
double result[maxResult];
double _o[MAXT];
double _A[MAXSTATE][MAXSTATE];
double _B[MAXSTATE][MAXRANGE];
double _Pi[MAXSTATE];
int oRange;
int nState;
double oMin;
double oMax;
int tWindow;
/*
#####################################################################
Begin- Input data
*/
string tnum;
char tmps[maxString];
double t;
int cnt1, cnt2;
int cnttmp;
/* Get the num of input1 and input2 */
if(!fin1.eof())
{
getline(fin1,tnum);
strcpy(tmps,tnum.c_str());
t = atof(tmps);
cnt1 = int(t);
}
if(!fin2.eof())
{
getline(fin2,tnum);
strcpy(tmps,tnum.c_str());
t = atof(tmps);
cnt2 = int(t);
}
/* Get the real data of input1 and input2 */
cnttmp = 1;
oMin = oMax = 0;
while(!fin1.eof())
{
getline(fin1,tnum);
strcpy(tmps,tnum.c_str());
t = atof(tmps);
_o[cnttmp++] = t;
if(oMin > t) oMin = t;
if(oMax < t) oMax = t;
// printf("1: %lf\n",t);
}
//printf("oMin = %lf, oMax = %lf\n",oMin, oMax);
while(!fin2.eof())
{
getline(fin2,tnum);
strcpy(tmps,tnum.c_str());
t = atof(tmps);
_o[cnttmp++] = t;
//printf("2: %lf\n",t);
}
/*
End- Input data
#####################################################################
*/
/*
Parameters to set:
int oRange;
int tWindow;
*/
int maxRange = 5000;
tWindow = 70;
nState = 3;
double previousP = 0;
double threshold = 1e-8;
// [To do]
for(int i=1;i<=nState;i++)
for(int j=1;j<=nState;j++)
_A[i][j] = (1.0)/ nState;
for(int i=1;i<=nState;i++)
for(int j=1;j<=maxRange;j++)
_B[i][j] = (1.0)/maxRange;
for(int i=1;i<=nState;i++)
_Pi[i] = (1.0)/nState;
/*
#####################################################################
Begin- Process data
*/
int q_star;
converge(q_star,previousP,threshold, tWindow, maxRange, oMin, oMax, 3,
_o,_A,_B,_Pi);
int bestIndex = 1; // the index of O(T+1)
int tmp;
int choice;
double predictValue,currentValue;
double bestValue;
for(int k=1;k<=cnt2;k++) // cnt2 Real Data
{
currentValue = _o[cnt1+k-1];
bestValue = 0;
for(int i=1;i<=maxRange;i++)
{
//tmp = getIndex(_o[cnt1+k], oMin, oMax, maxRange);
if(_B[q_star][i] > bestValue)
{
bestValue = _B[q_star][i];
bestIndex = i;
}
}
predictValue = oMin + (oMax - oMin) * (bestIndex-1) /(maxRange-1);
//index --> value
converge(q_star,previousP,threshold, tWindow, maxRange, oMin, oMax,
3, _o+k,_A,_B,_Pi);
if(predictValue > currentValue) choice = 1;
else choice = -1;
result[k] = choice * (_o[cnt1+k] - _o[cnt1+k-1]);
}
/*
End- Process data
#####################################################################
*/
/*
#####################################################################
Begin- Output data
*/
for(int i=1;i<=cnt2;i++)
fout << result[i] << endl;
/*
End- Output data
#####################################################################
*/
fin1.close();
fin2.close();
fout.close();
return 0;
}
Could someone tell me how to fix this error?
Thank you.

The error message is pretty clear:
mathcalls.h:266:1: error: previous declaration of >‘double gamma(double)’
There is a function double gamma(double) that you get when importing cmath.
Change the name of your array.

Your variable gamma conflicts with a symbol defined in mathcalls.h, a prototype for the gamma function.

Simple SSE loop slower than non-SSE version

I am trying to compare SSE float[4] addition to standard float[4] addition. As a demo I compute the sum of the summed components, with and without SSE:
#include <iostream>
#include <vector>
struct Point4
{
Point4()
{
data[0] = 0;
data[1] = 0;
data[2] = 0;
data[3] = 0;
}
float data[4];
};
void Standard()
{
Point4 a;
a.data[0] = 1.0f;
a.data[1] = 2.0f;
a.data[2] = 3.0f;
a.data[3] = 4.0f;
Point4 b;
b.data[0] = 1.0f;
b.data[1] = 6.0f;
b.data[2] = 3.0f;
b.data[3] = 5.0f;
float total = 0.0f;
for(unsigned int i = 0; i < 1e9; ++i)
{
for(unsigned int component = 0; component < 4; ++component)
{
total += a.data[component] + b.data[component];
}
}
std::cout << "total: " << total << std::endl;
}
void Vectorized()
{
typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));
v4sf a;
float* aPointer = (float*)&a;
aPointer[0] = 1.0f; aPointer[1] = 2.0f; aPointer[2] = 3.0f; aPointer[3] = 4.0f;
v4sf b;
float* bPointer = (float*)&b;
bPointer[0] = 1.0f; bPointer[1] = 6.0f; bPointer[2] = 3.0f; bPointer[3] = 5.0f;
v4sf result;
float* resultPointer = (float*)&result;
resultPointer[0] = 0.0f;
resultPointer[1] = 0.0f;
resultPointer[2] = 0.0f;
resultPointer[3] = 0.0f;
for(unsigned int i = 0; i < 1e9; ++i)
{
result += a + b; // Vectorized operation
}
// Sum the components of the result (this is done with the "total += " in the Standard() loop
float total = 0.0f;
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer[component];
}
std::cout << "total: " << total << std::endl;
}
int main()
{
// Standard();
Vectorized();
return 0;
}
However, the code seems to be faster (~.2 seconds) with the standard method than with the vectorized (~.4 seconds) method. Is it because of the for loop to sum the v4sf values? Is there a better operation I can use to time the difference between these two techniques and still compare the output to make sure there were no differences between the two?

Then reason your version is slower as SSE is that you have to unpack from an SSE register to a scalar register 4 times every iteration, which has more of an overhead than what you gain from the vectorized addition. Look at the disassembly and you should get a clearer picture.
I think what you want to do is the following (which is faster with SSE):
for(unsigned int i = 0; i < 1e6; ++i)
{
result += a + b; // Vectorized operation
}
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer[component];
}
Also the following might be even faster:
for(unsigned int i = 0; i < 1e6/4; ++i)
{
result0 += a + b; // Vectorized operation
result1 += a + b; // Vectorized operation
result2 += a + b; // Vectorized operation
result3 += a + b; // Vectorized operation
}
// Sum the components of the result (this is done with the "total += " in the Standard() loop
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer0[component];
total += resultPointer1[component];
total += resultPointer2[component];
total += resultPointer3[component];
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

SSE addition producing garbage - c++

Try typedef float v4sf attribute (( vector_size(4*sizeof(float)) )); I get 2e+07 as the result.

Related

C++ neural network implemented from scratch cannot get above 50% on MNIST

C++ Code segmentation fault only in vscode

Gradient descent converging towards the wrong value

C++ compilation error: "Double array redeclared as different kind of symbol"

Simple SSE loop slower than non-SSE version

Categories

Resources

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

SSE addition producing garbage - c++

Try typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) )); I get 2e+07 as the result.

Related

C++ neural network implemented from scratch cannot get above 50% on MNIST

C++ Code segmentation fault only in vscode

Gradient descent converging towards the wrong value

C++ compilation error: "Double array redeclared as different kind of symbol"

Simple SSE loop slower than non-SSE version

Categories

Resources

Try typedef float v4sf attribute (( vector_size(4*sizeof(float)) )); I get 2e+07 as the result.