AVX calculation precision - c++

I wrote a program to display the mandelbrot set. To speed it up, I used AVX (really AVX2) instructions through the <immintrin.h> header.
The problem is: The result of the AVX computation (with double precision) has artifacts, and it differs to the result when computed using "normal" doubles.
In detail, there is a function getIterationCount which calculates the number of iterations until the mandelbrot sequence exceeds 4, or assumes the point is included in the set if the sequences does not exceed 4 during the first N steps.
The code looks like this:
#include "stdafx.h"
#include <iostream>
#include <complex>
#include <immintrin.h>
class MandelbrotSet {
public:
int getIterationCount(const std::complex<double>, const int) const noexcept;
__m256i getIterationCount(__m256d cReal, __m256d cIm, unsigned maxIterations) const noexcept;
};
inline int MandelbrotSet::getIterationCount(const std::complex<double> c, const int maxIterations) const noexcept
{
double currentReal = 0;
double currentIm = 0;
double realSquare;
double imSquare;
for (int i = 0; i < maxIterations; ++i) {
realSquare = currentReal * currentReal;
imSquare = currentIm * currentIm;
currentIm = 2 * currentReal * currentIm + c.imag();
currentReal = realSquare - imSquare + c.real();
if (realSquare + imSquare >= 4) {
return i;
}
}
return -1;
}
const __m256i negone = _mm256_set_epi64x(-1, -1, -1, -1);
const __m256i one = _mm256_set_epi64x(1, 1, 1, 1);
const __m256d two = _mm256_set_pd(2, 2, 2, 2);
const __m256d four = _mm256_set_pd(4, 4, 4, 4);
//calculates for i = 0,1,2,3
//output[i] = if ctrl[i] == 0b11...1 then onTrue[i] else onFalse[i]
inline __m256i _mm256_select_si256(__m256i onTrue, __m256i onFalse, __m256i ctrl) {
return _mm256_or_si256(_mm256_and_si256(onTrue, ctrl), _mm256_and_si256(onFalse, _mm256_xor_si256(negone, ctrl)));
}
inline __m256i MandelbrotSet::getIterationCount(__m256d cReal, __m256d cIm, unsigned maxIterations) const noexcept {
__m256i result = _mm256_set_epi64x(0, 0, 0, 0);
__m256d currentReal = _mm256_set_pd(0, 0, 0, 0);
__m256d currentIm = _mm256_set_pd(0, 0, 0, 0);
__m256d realSquare;
__m256d imSquare;
for (unsigned i = 0; i <= maxIterations; ++i)
{
realSquare = _mm256_mul_pd(currentReal, currentReal);
imSquare = _mm256_mul_pd(currentIm, currentIm);
currentIm = _mm256_mul_pd(currentIm, two);
currentIm = _mm256_fmadd_pd(currentIm, currentReal, cIm);
currentReal = _mm256_sub_pd(realSquare, imSquare);
currentReal = _mm256_add_pd(currentReal, cReal);
__m256i isSmaller = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(realSquare, imSquare), four, _CMP_LE_OS));
result = _mm256_select_si256(_mm256_add_epi64(one, result), result, isSmaller);
//if (i % 10 == 0 && !isSmaller.m256i_i64[0] && !isSmaller.m256i_i64[1] && !isSmaller.m256i_i64[2] && !isSmaller.m256i_i64[3]) return result;
}
return result;
}
using namespace std;
int main() {
MandelbrotSet m;
std::complex<double> point(-0.14203954214360026, 1);
__m256i result_avx = m.getIterationCount(_mm256_set_pd(-0.14203954214360026, -0.13995837669094691, -0.13787721123829355, -0.13579604578563975),
_mm256_set_pd(1, 1, 1, 1), 2681);
int result_normal = m.getIterationCount(point, 2681);
cout << "Normal: " << result_normal << ", AVX: " << result_avx.m256i_i64[0] << ", at point " << point << endl;
return 0;
}
When I run this code, I get the following result:
(The point -0.14203954214360026 + i is chosen intentionally, because both methods return the same/almost the same value in most points)
Normal: 13, AVX: 20, at point (-0.14204,1)
A difference of 1 might be acceptable, but a difference of 7 seems quite big, since both methods use double precision.
Have AVX instructions a lower precision than "normal" instruction? If not, why do both results differ so much?
I use MS Visual Studio 2017, MS Visual C++ 2017 15.6 v14.13 141 and my computer has a i7-7700K Processor. The Project is compiled for x64. The result is the same if it is compiler with no or full optimization.
The rendered results look like this:
AVX:
Normal
The values of realSquare and imSquare during the loop are as follows:
0, 0, 0
1, 0.0201752, 1
2, 1.25858, 0.512543
3, 0.364813, 0.367639
4, 0.0209861, 0.0715851
5, 0.0371096, 0.850972
6, 0.913748, 0.415495
7, 0.126888, 0.0539759
8, 0.00477863, 0.696364
9, 0.69493, 0.782567
10, 0.0527514, 0.225526
11, 0.0991077, 1.48388
12, 2.33115, 0.0542994
13, 4.5574, 0.0831971
In the AVX loop the values are:
0, 0, 0
1, 0.0184406, 1
2, 1.24848, 0.530578
3, 0.338851, 0.394109
4, 0.0365017, 0.0724287
5, 0.0294888, 0.804905
6, 0.830307, 0.478687
7, 0.04658, 0.0680608
8, 0.024736, 0.78746
9, 0.807339, 0.519651
10, 0.0230712, 0.0872787
11, 0.0400014, 0.828561
12, 0.854433, 0.404359
13, 0.0987707, 0.0308286
14, 0.00460416, 0.791455
15, 0.851277, 0.773114
16, 0.00332154, 0.387519
17, 0.270393, 1.14866
18, 1.02832, 0.0131355
19, 0.773319, 1.51892
20, 0.776852, 10.0336

Reversing the order of the arguments passed to _mm256_set_pd solves the problem.
If you inspect the value of cReal in the debugger you'll see that the first element is set to -0.13579604578563975 not -0.14203954214360026.

Related

Changes required when porting from 128-bit SIMD to 256-bit AVX? [duplicate]

This question already has answers here:
How to solve the 32-byte-alignment issue for AVX load/store operations?
(3 answers)
Fastest way to do horizontal SSE vector sum (or other reduction)
(5 answers)
Closed 1 year ago.
I recently followed a post/ blog to find the least member of an array and it used 128 bit vector instructions. I followed the post and it ran fine, until I decided to write the same for 256 bit instruction set.
The code is as follows -
#include<iostream>
#include<random>
//#include <Eigen/Dense>
#include <immintrin.h>
#include <cstdlib>
#include <vector>
float min128_sse(float *a, int n) {
float res;
__m128 *simdVector = (__m128*) a;
__m128 maxval = _mm_set1_ps(UINT32_MAX);
for (int i = 0; i < n / 4; i++) {
maxval = _mm_min_ps(maxval, simdVector[i]);
}
maxval = _mm_min_ps(maxval, _mm_shuffle_ps(maxval, maxval, 0x93));
_mm_store_ss(&res, maxval);
return res;
}
float min256_sse(float *a, int n) {
float res;
__m256* simdVector = (__m256*) a;
__m256 minVal = _mm256_set1_ps(UINT32_MAX);
for (int i = 0; i < n / 8; i++) {
minVal = _mm256_min_ps(minVal, simdVector[i]);
}
minVal = _mm256_min_ps(minVal, _mm256_shuffle_ps(minVal, minVal, 0x93));
res = minVal[0];
std::cout<<res<<std::endl;
return res;
}
int main()
{
std::vector<float> givenVector{1.0, 2.0, 3.0, 4.0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, -1, -2, -3, -4, -5, -6, -7, -8};
std::cout<<min128_sse(givenVector.data(), givenVector.size())<<std::endl;
std::cout<<min256_sse(givenVector.data(), givenVector.size())<<std::endl;
}
The code runs into segmentation fault in the following line -
minVal = _mm256_min_ps(minVal, simdVector[i])
From my basic understanding of SIMD instructions, _mm256_min_ps would operate on 256 bits at once as opposed to 128 bits in _mm_min_ps. If I am not encountering a segmentation fault in in my 128 bit version, I should not be facing it in the 256 bit version as well. The only changes that would be required would be the range of the for loop.
However the segmentation fault comes into picture even at i=0. I suppose there is a gap in my understanding of SIMD instructions. Can someone please highlight it.
Can someone also point out why the Segmentation fault is being thrown.
TIA

boost odeint gives very different values from Python3 scipy

I am trying to integrate a very simple ODE using boost odeint.
For some cases, the values are the same (or very similar) to Python's scipy odeint function.
But for other initial conditions, the values are vastly different.
The function is: d(uhat) / dt = - alpha^2 * kappa^2 * uhat
where alpha is 1.0, and kappa is a constant depending on the case (see values below).
I have tried several different ODE solvers from boost, and none seem to work.
Update: The code below is now working.
In the code below, the first case gives nearly identical results, the 2nd case is kind of trivial (but reassuring), and the 3rd case gives erroneous answers in the C++ version.
Here is the C++ version:
#include <boost/numeric/odeint.hpp>
#include <cstdlib>
#include <iostream>
typedef boost::numeric::odeint::runge_kutta_dopri5<double> Stepper_Type;
struct ResultsObserver
{
std::ostream& m_out;
ResultsObserver( std::ostream &out ) : m_out( out ) { }
void operator()(const State_Type& x , double t ) const
{
m_out << t << " : " << x << std::endl;
}
};
// The rhs: d_uhat_dt = - alpha^2 * kappa^2 * uhat
class Eq {
public:
Eq(double alpha, double kappa)
: m_constant(-1.0 * alpha * alpha * kappa * kappa) {}
void operator()(double uhat, double& d_uhat_dt, const double t) const
{
d_uhat_dt = m_constant * uhat;
}
private:
double m_constant;
};
void integrate(double kappa, double initValue)
{
const unsigned numTimeIncrements = 100;
const double dt = 0.1;
const double alpha = 1.0;
double uhat = initValue; //Init condition
std::vector<double> uhats; //Results vector
Eq rhs(alpha, kappa); //The RHS of the ODE
//This is what I was doing that did not work
//
//boost::numeric::odeint::runge_kutta_dopri5<double> stepper;
//for(unsigned step = 0; step < numTimeIncrements; ++step) {
// uhats.push_back(uhat);
// stepper.do_step(rhs, uhat, step*dt, dt);
//}
//This works
integrate_const(
boost::numeric::odeint::make_dense_output<Stepper_Type>( 1E-12, 1E-6 ),
rhs, uhat, startTime, endTime, dt, ResultsObserver(std::cout)
);
std::cout << "kappa = " << kappa << ", initial value = " << initValue << std::endl;
for(auto val : uhats)
std::cout << val << std::endl;
std::cout << "---" << std::endl << std::endl;
}
int main() {
const double kappa1 = 0.062831853071796;
const double initValue1 = -187.097241230045967;
integrate(kappa1, initValue1);
const double kappa2 = 28.274333882308138;
const double initValue2 = 0.000000000000;
integrate(kappa2, initValue2);
const double kappa3 = 28.337165735379934;
const double initValue3 = -0.091204068895190;
integrate(kappa3, initValue3);
return EXIT_SUCCESS;
}
and the corresponding Python3 version:
enter code here
#!/usr/bin/env python3
import numpy as np
from scipy.integrate import odeint
def Eq(uhat, t, kappa, a):
d_uhat = -a**2 * kappa**2 * uhat
return d_uhat
def integrate(kappa, initValue):
dt = 0.1
t = np.arange(0,10,dt)
a = 1.0
print("kappa = " + str(kappa))
print("initValue = " + str(initValue))
uhats = odeint(Eq, initValue, t, args=(kappa,a))
print(uhats)
print("---")
print()
kappa1 = 0.062831853071796
initValue1 = -187.097241230045967
integrate(kappa1, initValue1)
kappa2 = 28.274333882308138
initValue2 = 0.000000000000
integrate(kappa2, initValue2)
kappa3 = 28.337165735379934
initValue3 = -0.091204068895190
integrate(kappa3, initValue3)
With boost::odeint you should use the integrate... interface functions.
What happens in your code using do_step is that you use the dopri5 method as a fixed-step method with your provided step size. In connection with the large coefficient L=kappa^2 of about 800, the product L*dt=80 is far outside the stability region of the method, the boundary is between values of 2 to 3.5. Divergence or oscillating divergence is the expected result.
What should happen and what is implemented in the scipy odeint, ode-dopri5 or solve_ivp-RK45 methods is a method with adaptive step size. Internally the optimal step size for the error tolerances is chosen, and in each internal step an interpolation polynomial is determined. The output or observed values are computed with this interpolator, also called "dense output" if the interpolation function is returned from the integrator. One result of the error control is that the step size is always inside the stability region, for stiff problems with medium error tolerances on or close to the boundary of this region.
This has all the hallmarks of precision issues.
Simply replacing double with long double gives:
Live On Compiler Explorer
#include <boost/numeric/odeint.hpp>
#include <boost/multiprecision/cpp_bin_float.hpp>
#include <boost/multiprecision/cpp_dec_float.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <iostream>
using Value = long double;
//using Value = boost::multiprecision::number<
//boost::multiprecision::backends::cpp_bin_float<100>,
//boost::multiprecision::et_off>;
// The rhs: d_uhat_dt = - alpha^2 * kappa^2 * uhat
class Eq {
public:
Eq(Value alpha, Value kappa)
: m_constant(-1.0 * alpha * alpha * kappa * kappa)
{
}
void operator()(Value uhat, Value& d_uhat_dt, const Value) const
{
d_uhat_dt = m_constant * uhat;
}
private:
Value m_constant;
};
void integrate(Value const kappa, Value const initValue)
{
const unsigned numTimeIncrements = 100;
const Value dt = 0.1;
const Value alpha = 1.0;
Value uhat = initValue; // Init condition
std::vector<Value> uhats; // Results vector
Eq rhs(alpha, kappa); // The RHS of the ODE
boost::numeric::odeint::runge_kutta_dopri5<Value> stepper;
for (unsigned step = 0; step < numTimeIncrements; ++step) {
uhats.push_back(uhat);
auto&& stepdt = step * dt;
stepper.do_step(rhs, uhat, stepdt, dt);
}
fmt::print("kappa = {}, initial value = {}\n{}\n---\n", kappa, initValue,
uhats);
}
int main() {
for (auto [kappa, initValue] :
{
std::pair //
{ 0.062831853071796 , -187.097241230045967 },
{28.274333882308138 , 0.000000000000 },
{28.337165735379934 , -0.091204068895190 },
}) //
{
integrate(kappa, initValue);
}
}
Prints
kappa = 0.0628319, initial value = -187.097
[-187.097, -187.023, -186.95, -186.876, -186.802, -186.728, -186.655, -186.581, -186.507, -186.434, -186.36, -186.287, -186.213, -186.139, -186.066, -185.993, -185.919, -185.846, -185.772, -185.699, -185.626, -185.553, -185.479, -185.406, -185.333, -185.26, -185.187, -185.114, -185.04, -184.967, -184.894, -184.821, -184.748, -184.676, -184.603, -184.53, -184.457, -184.384, -184.311, -184.239, -184.166, -184.093, -184.021, -183.948, -183.875, -183.803, -183.73, -183.658, -183.585, -183.513, -183.44, -183.368, -183.296, -183.223, -183.151, -183.079, -183.006, -182.934, -182.862, -182.79, -182.718, -182.645, -182.573, -182.501, -182.429, -182.357, -182.285, -182.213, -182.141, -182.069, -181.998, -181.926, -181.854, -181.782, -181.71, -181.639, -181.567, -181.495, -181.424, -181.352, -181.281, -181.209, -181.137, -181.066, -180.994, -180.923, -180.852, -180.78, -180.709, -180.638, -180.566, -180.495, -180.424, -180.353, -180.281, -180.21, -180.139, -180.068, -179.997, -179.926]
---
kappa = 28.2743, initial value = 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
---
kappa = 28.3372, initial value = -0.0912041
[-0.0912041, -38364100, -1.61375e+16, -6.78809e+24, -2.85534e+33, -1.20107e+42, -5.0522e+50, -2.12516e+59, -8.93928e+67, -3.76022e+76, -1.5817e+85, -6.65327e+93, -2.79864e+102, -1.17722e+111, -4.95186e+119, -2.08295e+128, -8.76174e+136, -3.68554e+145, -1.55029e+154, -6.52114e+162, -2.74306e+171, -1.15384e+180, -4.85352e+188, -2.04159e+197, -8.58774e+205, -3.61235e+214, -1.5195e+223, -6.39163e+231, -2.68858e+240, -1.13092e+249, -4.75713e+257, -2.00104e+266, -8.41718e+274, -3.54061e+283, -1.48932e+292, -6.26469e+300, -2.63518e+309, -1.10846e+318, -4.66265e+326, -1.9613e+335, -8.25002e+343, -3.47029e+352, -1.45975e+361, -6.14028e+369, -2.58285e+378, -1.08645e+387, -4.57005e+395, -1.92235e+404, -8.08618e+412, -3.40137e+421, -1.43075e+430, -6.01833e+438, -2.53155e+447, -1.06487e+456, -4.47929e+464, -1.88417e+473, -7.92559e+481, -3.33382e+490, -1.40234e+499, -5.89881e+507, -2.48128e+516, -1.04373e+525, -4.39033e+533, -1.84675e+542, -7.76818e+550, -3.26761e+559, -1.37449e+568, -5.78166e+576, -2.432e+585, -1.023e+594, -4.30314e+602, -1.81008e+611, -7.61391e+619, -3.20272e+628, -1.34719e+637, -5.66684e+645, -2.3837e+654, -1.00268e+663, -4.21768e+671, -1.77413e+680, -7.4627e+688, -3.13911e+697, -1.32044e+706, -5.55429e+714, -2.33636e+723, -9.82768e+731, -4.13392e+740, -1.73889e+749, -7.31449e+757, -3.07677e+766, -1.29421e+775, -5.44399e+783, -2.28996e+792, -9.6325e+800, -4.05182e+809, -1.70436e+818, -7.16922e+826, -3.01567e+835, -1.26851e+844, -5.33587e+852]
---
As you can see, I tried some simple takes to get it to use Boost Multiprecision, but that didn't immediately work and may require someone with actual understanding of the maths / IDEINT.

Writing to File in Python, Reading From File with Arduino

I'm currently using Python 2.7 to pull pixel information from a series of bitmaps, and writing 24 bits of information to a file (with an arbitrary extension, ".bfs", to make it easy to find down the pipeline), 8 bits for position x, 8 bits for position y, 16 bits for color.
from PIL import Image
import struct
filename = raw_input('Please choose destination filename: ')
file_in = [0]*27
im = [0]*27
for i in range(1,27):
file_in[i] = str(i)+".bmp"
im[i] = Image.open(file_in[i])
file_out = open(filename+".bfs", 'w')
readable_out = open(filename+".txt", 'w')
for q in range(1,27):
pix = im[q].load()
width, height = im[q].size
for y in range (height):
for x in range (width):
rgb = pix[x,y]
red = rgb[0]
green = rgb[1]
blue = rgb[2]
Uint16_val = (((31*(red+4))/255)<<11) | (((63*(green+2))/255)<<5) | ((31*(blue+4))/255)
hex_16 = int('%.4x'%Uint16_val, 16)
print(str(x)+", "+str(y)+", "+str(hex_16)+"\n")
readable_out.write(str(x)+", "+str(y)+", "+str(hex_16)+"\n")
file_out.write(struct.pack('<1B', x))
file_out.write(struct.pack('<1B', y))
file_out.write(struct.pack('<1H', hex_16))
On the PC side everything is coming out clean how I expect (this is copied from a .txt file that I output and format to make it easier to read):
0, 0, 40208
1, 0, 33544
2, 0, 33544
3, 0, 39952
4, 0, 39944
5, 0, 33544
6, 0, 39688
7, 0, 39952
8, 0, 39944
9, 0, 33544
10, 0, 33800
11, 0, 39952
12, 0, 39952
13, 0, 33544
14, 0, 33800
15, 0, 48400
From here I'm taking the .bfs file and loading it onto an SD card for an Arduino Uno to read from. The Arduino code is supposed to read from the SD card, and output the x, y, and color values to a TFT LCD. Here is the Arduino Code:
#include <Adafruit_GFX.h> // Core graphics library
#include <Adafruit_ST7735.h> // Hardware-specific library
#include <SPI.h>
#include <SD.h>
#define TFT_CS 10 // Chip select line for TFT display
#define TFT_RST 9 // Reset line for TFT (or see below...)
#define TFT_DC 8 // Data/command line for TFT
#define SD_CS 4 // Chip select line for SD card
Adafruit_ST7735 tft = Adafruit_ST7735(TFT_CS, TFT_DC, TFT_RST);
void setup(void) {
Serial.begin(9600);
tft.initR(INITR_144GREENTAB);
Serial.print("Initializing SD card...");
if (!SD.begin(SD_CS)) {
Serial.println("failed!");
return;
}
Serial.println("OK!");
tft.fillScreen(0x0000);
}
uint32_t pos = 0;
uint8_t x,y;
uint8_t buffpix[3];
uint16_t c;
void loop() {
bfsDraw("image.bfs");
}
#define BUFFPIXEL 20
void bfsDraw(char *filename) {
File bfsFile;
int w, h, row, col;
uint8_t x,y;
uint16_t c;
uint32_t pos = 0, startTime = millis();
if((0 >= tft.width()) || (0 >= tft.height())) return;
if ((bfsFile = SD.open(filename)) == NULL) {
Serial.print("File not found");
return;
}
w = 128;
h = 128;
tft.setAddrWindow(0, 0, 0+w-1, 0+h-1);
for (row=0; row<h; row++) {
for (col=0; col<w; col++) {
x = bfsFile.read();
Serial.print(x);
Serial.print(", ");
y = bfsFile.read();
Serial.print(y);
Serial.print(", ");
c = read16(bfsFile);
Serial.print(c);
Serial.print(" ");
Serial.println(" ");
tft.drawPixel(x,y,c);
}
}
}
uint8_t read8(File f) {
uint16_t result;
((uint8_t *)&result)[0] = f.read();
return result;
}
uint16_t read16(File f) {
uint16_t result;
((uint8_t *)&result)[0] = f.read();
((uint8_t *)&result)[1] = f.read();
return result;
}
I have some print statements around the code that reads from the card before sending out to the TFT, and instead of matching the file that (I think) I wrote it outputs like this:
0, 0, 40208
1, 0, 33544
2, 0, 33544
3, 0, 39952
4, 0, 39944
5, 0, 33544
6, 0, 39688
7, 0, 39952
8, 0, 39944
9, 0, 33544
13, 10, 2048
132, 11, 4096
156, 12, 4096
As you can see the reading from the Arduino starts out matching the writing of the Python script, but after 9 the "X" byte has shifted into the middle instead of the leading position. My question, is what is causing this shift, after x = 9? is this a little endian versus big endian issue?
Thanks for your help!
You opened your file in text mode, not binary mode. On Windows, that means that every newline character (byte value 10) that you write gets converted into carriage return + linefeed (byte values 13, 10). Use 'wb' for the mode when opening the .bfs file.
Note that writing the coordinates of each pixel into the file is insane - you're doubling the size of the file for absolutely no benefit. You can easily recreate the coordinates as you're reading the file - in fact you're ALREADY DOING SO, in the form of the row and col variables!

Running error of SSE2 code in VS2013

I have the following SIMD code trying to run in vs2013. It can be well compiled but cannot run. Anyone knows why?
#include <cstdio>
#include <xmmintrin.h>
int main()
{
const size_t num = 7;
float a[num] = { 1, 2, 3, 4, 5, 6, 7 };
float b[num] = { 1, -1, -2, 1, -3, -2, 5 };
float c[num];
__m128 A, B, C;
A = _mm_load_ps(&a[0]); // <== crash here.
B = _mm_load_ps(&b[0]);
C = _mm_add_ps(A, B);
_mm_store_ps(&c[0], C);
return 0;
}
The address being loaded from or stored to using these intrinsics needs to be 16 byte aligned (divisible by 16). See
https://msdn.microsoft.com/en-us/library/zzd50xxt(v=vs.90).aspx
You should declare the variables a,b and c like this:
__declspec(align(16)) float a[num] = { 1, 2, 3, 4, 5, 6, 7 };

Not declared in this scope - Arduino

I'm having a problem when I try this code I've made:
int ledStart = 30;
boolean commonHigh = true;
void setup() {
Serial.begin(115200);
SetTimer(0, 0, 10); // 10 seconds
StartTimer();
for (int i =0;i<9;++i) {
pinMode (i, OUTPUT);
}
pinMode(9, INPUT);
}
int counter = 0;
bool go_by_switch = true;
int last_input_value = LOW;
void loop() {
// put your main code here, to run repeatedly:
number++;
delay(1000);
if(number>9)
number=0; // If number is bigger than 9, then number is 0
}
// 0 6
// pins A B C D E F G
int ledpins[] = {12, 10, 7, 4, 2, 13, 8};
int pincnt = 7;
int number = 0;
int sevenseg[10][7] = {
// A, B, C, D, E, F, G
{1, 1, 1, 1, 1, 1, 0}, // A-F shall light. G shall not light.
{0, 1, 1, 0, 0, 0, 0}, // A shall not light. B and C shall light.
/*0*/
/*1*/
/*2*/
/*3*/
/*4*/
/*5*/
/*6*/
/*7*/
/*8*/
{1, 1, 1, 1, 1, 1, 1, 1}
if(go_by_switch) {
int switch_input_value = digitalRead(9);
if(last_input_value == LOW && switch_input_value == HIGH) {
counter = (counter + 1) % 10;
}
last_input_value = switch_input_value;
}
else {
delay(500);
counter = (counter + 1) % 10;
}
writeNumber(counter);
}
for (int p=0; p<pincnt; p++) {
pinMode (ledpins[P], OUTPUT);
//It will count from 0 to smaller than 7. {12, 10, 7, 4, 2, 13, 8}; It will count from 0 to smaller than 7.
// 0 1 2 3 4 5 6
digitalWrite(ledpins[P], LOW);
}
for (int x=0; x<pincnt; x++); { //x is smaller than 7. The point is to bring out one of the patterns that will show on the display
if (sevenseg[number][x]) // sevenseg = 7-segment display
digitalWrite (ledpins[x], HIGH); // If it is 1, then there will be light.
else
digitalWrite (ledpins[x], LOW); // If it is 0, then there will not be light.
// A
//F B
// G
//E C
// D
The error message I get is:
_28.10.2015.ino: In function 'void setup()':
_28.10.2015.ino:7:20: error: 'SetTimer' was not declared in this scope
_28.10.2015.ino:8:14: error: 'StartTimer' was not declared in this scope
_28.10.2015.ino: In function 'void loop()':
_28.10.2015.ino:22:1: error: 'number' was not declared in this scope
_28.10.2015.ino: At global scope:
_28.10.2015.ino:52:1: error: expected '}' before 'if'
_28.10.2015.ino:52:1: error: too many initializers for 'int [7]'
_28.10.2015.ino:52:1: error: expected ',' or ';' before 'if'
Feil ved kompilering.
(Feil ved kompilering=Errors at compile(Norwegian)
The problem is that you are not declaring these functions that you are getting errors, neither the "number" variable.
You need to declare them, like:
int number;
void StartTimer( )
{
// function code;
}
Or include a ".h" that contain these functions, like #Neil Locketz said.
There are quite a few issues with this code.
One of the first things that I notice is that you close out your loop() function with }, then you proceed to write more code that doesn't belong to any function at all.
Also, as #Raul points out, you define an array sevenseg[][], but you do not end the statement with a semicolon.
Your last for() loop is missing its closing brace, }.
Your last for() loop has a semicolon before the opening brace. It shouldn't be there.
You use the variable number in your loop() function, but you define what number is after you use it. You have to define a variable before you use it.
You call SetTimer() and StartTimer() in your setup() function, but those functions are not defined. That's because either 1, you have not included the library where those functions are defined or 2, you did not define those functions yourself. If your issue is 1, then I assume you intended to use #include <SimpleTimer.h>. Note that you also have to install that library. The instructions on how to download it and add it to your Arduino libraries are here. Finally, you have to create a timer object like this: SimpleTimer timer; and then you can call the function like this, timer.SetTimer(your-parameters-here);.
There are probably other things that I have missed, but that should give you a starting point. It looks like you have created a lot of code without testing to see if any of it worked. I would recommend taking this a step at a time... code one logical block and see if it works before you move on to coding your next idea. It may seem like it takes more time but, in the end, it is usually a much faster way to program.
Another suggestion that I would make is to define variables within the function in which you use them. Making all of your variables "global" like you have done is not a good way to write code. For example:
void loop()
{
static int number = 0;
number++;
delay(1000);
if (number > 9)
{
number = 0;
}
}
Note the use of the keyword static. This will ensure that the value stored in number will not go away when the function ends. In other words, the value will still be there the next time the loop() function is called.
Finally, if I had to guess at what you were trying to accomplish, I would think your code should look a little more like this. It appears as though you were trying out different things so I left a number of code snippets in there from your original code that don't actually do anything:
void setup() {
Serial.begin(115200);
for (int i = 0; i < 9; ++i)
{
pinMode (i, OUTPUT);
}
pinMode(9, INPUT);
}
void loop() {
static int counter = 0;
static int last_input_value = LOW;
static bool go_by_switch = true;
if(go_by_switch)
{
int switch_input_value = digitalRead(9);
if(last_input_value == LOW && switch_input_value == HIGH)
{
counter = (counter + 1) % 10;
}
last_input_value = switch_input_value;
}
else
{
delay(500);
counter = (counter + 1) % 10;
}
writeNumber(counter);
}
void writeNumber (int count)
{
#define PIN_COUNT 7
#define NUM_OF_SEGMENTS 7
#define NUM_OF_NUMBERS 10
// 0 6
// pins A B C D E F G
static const int ledpins[PIN_COUNT] = {12, 10, 7, 4, 2, 13, 8};
static const int sevenseg[NUM_OF_NUMBERS][NUM_OF_SEGMENTS] =
{
// A B C D E F G
{1, 1, 1, 1, 1, 1, 0}, //0
{0, 1, 1, 0, 0, 0, 0}, //1
{1, 1, 0, 1, 1, 0, 1}, //2
{1, 1, 1, 1, 0, 0, 1}, //3
{0, 1, 1, 0, 0, 1, 1}, //4
{1, 0, 1, 1, 0, 1, 1}, //5
{1, 0, 1, 1, 1, 1, 1}, //6
{1, 1, 1, 0, 0, 0, 0}, //7
{1, 1, 1, 1, 1, 1, 1}, //8
{1, 1, 1, 1, 0, 1, 1}, //9
};
static int number = 0;
int i;
number++;
delay(1000);
if(number >= NUM_OF_NUMBERS)
{
number = 0;
}
/* Clear all segments of the 7-segment display. */
for (i = 0; i < PIN_COUNT; i++)
{
pinMode (ledpins[i], OUTPUT);
digitalWrite(ledpins[i], LOW);
}
/* Set the 7-segment display with the current number. */
for (i = 0; i < PIN_COUNT; i++)
{
if (sevenseg[number][i]) // sevenseg = 7-segment display
digitalWrite (ledpins[i], HIGH); // If it is 1, then there will be light.
else
digitalWrite (ledpins[i], LOW); // If it is 0, then there will not be light.
}
}