Bug in OpenGL? Identical float computation leads to different results - opengl

The "minimal example" is unfortunately quite long, because the bug disappears if I drop too much seemingly irrelevant code.
Here is the C++ code:
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <math.h>
#include <sstream>
#include <string>
#include <time.h>
#include <vector>
//OpenGL includes
#include <GL/glew.h> //OpenGL Extension Wrangler
#include <GL/freeglut.h> //Free OpenGL Utility Toolkit (includes gl.h and glu.h)
// Writes the exit prompt to stdout and then waits until one character
// (normally the Enter key) has been read from stdin.
void fPressEnter(){
    static const char Prompt[] = "Press enter to exit.";
    std::cout << Prompt;
    std::cin.get();
}
void fCompileShaderFromString(const std::string& Source, const GLuint ShaderId){
const char* Code = Source.c_str();
glShaderSource(ShaderId, 1, &Code, NULL);
glCompileShader(ShaderId);
int Status;
glGetShaderiv(ShaderId, GL_COMPILE_STATUS, &Status);
if(Status == GL_FALSE){
int Length = 0;
glGetShaderiv(ShaderId, GL_INFO_LOG_LENGTH, &Length);
if(Length > 0){
char* Log = new char[Length];
int Written = 0;
glGetShaderInfoLog(ShaderId, Length, &Written, Log);
std::cout << Log << std::endl;
delete[] Log;
}
}
}
// Reads a whole text file and compiles its contents into the given shader object.
// If the file cannot be opened a message is printed to stdout and the shader is
// left uncompiled (previously an empty source was compiled without any diagnostic).
//   FileName - path of the GLSL source file.
//   ShaderId - shader object previously created with glCreateShader.
void fCompileShaderFromFile(const char* FileName, const GLuint ShaderId){
    std::ifstream InFile(FileName, std::ios::in);
    if(!InFile.is_open()){
        std::cout << "Could not open shader file '" << FileName << "'." << std::endl;
        return;
    }
    // Stream the complete file into memory in one go.
    std::ostringstream Code;
    Code << InFile.rdbuf();
    fCompileShaderFromString(Code.str(), ShaderId);
}
void fLinkProgram(const GLuint ProgramId){
glLinkProgram(ProgramId);
int Status;
glGetProgramiv(ProgramId, GL_LINK_STATUS, &Status);
if(Status == GL_FALSE){
int Length = 0;
glGetProgramiv(ProgramId, GL_INFO_LOG_LENGTH, &Length);
if(Length > 0){
char* Log = new char[Length];
int Written = 0;
glGetProgramInfoLog(ProgramId, Length, &Written, Log);
std::cout << Log << std::endl;
delete[] Log;
}
}
}
int main(){
//OpenGL setup
//Glut
int argc = 1;
char* argv[1] = {(char*) ""};
glutInit(&argc, argv);
glutInitWindowPosition((glutGet(GLUT_SCREEN_WIDTH) - 256) / 2, (glutGet(GLUT_SCREEN_HEIGHT) - 256) / 2);
glutInitWindowSize(256, 256);
glutInitDisplayMode(GLUT_RGB);
glutCreateWindow("Bug");
glutHideWindow();
//Glew
GLenum glew_ok = glewInit();
if(glew_ok != GLEW_OK){fprintf(stderr, "Glew error: '%s'\n", glewGetErrorString(glew_ok)); fPressEnter(); return(1);}
//Main program
//Auxiliary variables
GLuint ShaderId, ProgramId, BufferId;
float* BufferPointer;
float* Data = new float[32 * 32 * 2];
float dsup, esup;
//Compile shader program and create buffer in graphics card memory
ShaderId = glCreateShader(GL_COMPUTE_SHADER);
fCompileShaderFromFile("Shader.txt", ShaderId);
ProgramId = glCreateProgram();
glAttachShader(ProgramId, ShaderId);
fLinkProgram(ProgramId);
glDetachShader(ProgramId, ShaderId);
glDeleteShader(ShaderId);
glUseProgram(ProgramId);
glGenBuffers(1, &BufferId);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BufferId);
glBufferData(GL_SHADER_STORAGE_BUFFER, 32 * 32 * 2 * sizeof(float), NULL, GL_STREAM_READ);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
//Actual computation
std::cout << "Starting computation" << std::endl << std::endl << std::flush;
dsup = 0.f;
esup = 0.f;
glUniform1i(0, 0);
glUniform1i(1, 0);
glDispatchCompute(4, 4, 1);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
glFinish();
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BufferId);
BufferPointer = (float*) glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
memcpy(Data, BufferPointer, 32 * 32 * 2 * sizeof(float));
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
for(int z2 = 0; z2 < 32; z2++) for(int z3 = 0; z3 < 32; z3++){
if(Data[z2 * 32 * 2 + z3 * 2 + 0] > dsup) dsup = Data[z2 * 32 * 2 + z3 * 2 + 0];
if(Data[z2 * 32 * 2 + z3 * 2 + 1] > esup) esup = Data[z2 * 32 * 2 + z3 * 2 + 1];
}
std::cout << std::setprecision(8) << dsup << ", " << std::setprecision(8) << esup << std::endl;
//Cleanup
delete[] Data;
glDeleteBuffers(1, &BufferId);
glDeleteProgram(ProgramId);
fPressEnter();
return(0);
}
and here is the shader:
#version 430 core
// Compile-time constants: mg and mb are the divisors that turn the invocation
// indices into fg and fb; my bounds the innermost zy loop and scales fatmy.
const uint mg = 2048;
const uint mb = 512;
const uint my = 128;
// 8x8 invocations per work group.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
// Storage buffer at binding 0: two floats (dsup, esup) per (x, y) invocation.
layout(binding = 0) buffer OutputBlock{
float Data[32][32][2];
} Output;
// Offsets added to gl_GlobalInvocationID.x / .y before they are converted to fg / fb.
layout(location = 0) uniform int goffset;
layout(location = 1) uniform int boffset;
// Returns (cos(2*pi*x), sin(2*pi*x)); 6.2831853 is the literal used for 2*pi.
vec2 fe(float x){return(vec2(cos(6.2831853e0f * x), sin(6.2831853e0f * x)));}
// Complex product of A and B, with .x holding the real part and .y the imaginary part.
vec2 fComplexTimes(const vec2 A, const vec2 B){return(vec2(A.x * B.x - A.y * B.y, A.x * B.y + A.y * B.x));}
// Multiplies A by i: (x, y) -> (-y, x).
vec2 fComplexTimesI(const vec2 A){return(vec2(-A.y, A.x));}
// Multiplies A by -i: (x, y) -> (y, -x).
vec2 fComplexTimesMinusI(const vec2 A){return(vec2(A.y, -A.x));}
// Results of fDESupTaylor_a10_w3_d2(); the shader's main() copies them into Output.Data.
float dsup, esup;
// Computes dsup and esup for this invocation's (fg, fb) pair.
// For each q in 0..9, dsum/esum accumulate over n the maximum over zy of dabs/eabs;
// dsup/esup end up as the maximum of those sums over q.
// NOTE: only the first n-loop (n <= 1031) evaluates dabs. In the second loop
// (1032 <= n <= 10402) dsummandsup stays 0, so dsum only receives contributions from the first loop.
void fDESupTaylor_a10_w3_d2(){
const vec2 one = vec2(1.f, 0.f);
uint q, zg, zb, zy, n;
float fq, fb, fg, fys, fymb, fmg, fmb, fatmy;
float dsum, esum, dsummandsup, esummandsup, dabs, eabs;
float t;
vec2 td, te;
vec2 Di_0_, Di_0_y, Di_0_yy, Di_1_, Di_1_g, Di_1_b, Di_1_y, Di_1_gg, Di_1_gb, Di_1_gy, Di_1_bg, Di_1_bb, Di_1_by, Di_1_yg, Di_1_yb, Di_1_yy, Di_2_, Di_2_g, Di_2_b, Di_2_y, Di_2_gg, Di_2_gb, Di_2_gy, Di_2_bg, Di_2_bb, Di_2_by, Di_2_yg, Di_2_yb, Di_2_yy;
vec2 Ei_0_, Ei_0_y, Ei_0_yy, Ei_1_, Ei_1_g, Ei_1_b, Ei_1_y, Ei_1_gg, Ei_1_gb, Ei_1_gy, Ei_1_bg, Ei_1_bb, Ei_1_by, Ei_1_yg, Ei_1_yb, Ei_1_yy, Ei_2_, Ei_2_g, Ei_2_b, Ei_2_y, Ei_2_gg, Ei_2_gb, Ei_2_gy, Ei_2_bg, Ei_2_bb, Ei_2_by, Ei_2_yg, Ei_2_yb, Ei_2_yy;
vec2 D_2_, D_2_g, D_2_b, D_2_y, D_2_gg, D_2_gb, D_2_gy, D_2_bg, D_2_bb, D_2_by, D_2_yg, D_2_yb, D_2_yy, D_3_, D_3_g, D_3_b, D_3_y, D_3_gg, D_3_gb, D_3_gy, D_3_bg, D_3_bb, D_3_by, D_3_yg, D_3_yb, D_3_yy;
vec2 E_2_, E_2_g, E_2_b, E_2_y, E_2_gg, E_2_gb, E_2_gy, E_2_bg, E_2_bb, E_2_by, E_2_yg, E_2_yb, E_2_yy, E_3_, E_3_g, E_3_b, E_3_y, E_3_gg, E_3_gb, E_3_gy, E_3_bg, E_3_bb, E_3_by, E_3_yg, E_3_yb, E_3_yy;
float R_g, R_b, R_y, R_gg, R_gb, R_gy, R_bg, R_bb, R_by, R_yg, R_yb, R_yy;
// Divisors applied to the single-suffix (R_g, R_b, R_y) and double-suffix (R_gg .. R_yy) terms of dabs/eabs.
fmg = float(mg);
fmb = float(mb);
fatmy = 10.f * float(my);
R_g = 2.f * fmg;
R_b = 2.f * fmb;
R_y = 2.f * fatmy;
R_gg = 8.f * pow(fmg, 2.f);
R_gb = 8.f * fmg * fmb;
R_gy = 8.f * fmg * fatmy;
R_bg = 8.f * fmg * fmb;
R_bb = 8.f * pow(fmb, 2.f);
R_by = 8.f * fmb * fatmy;
R_yg = 8.f * fmg * fatmy;
R_yb = 8.f * fmb * fatmy;
R_yy = 8.f * pow(fatmy, 2.f);
dsup = 0.f;
esup = 0.f;
// Map the global invocation id (plus the host-supplied offsets) to fg and fb.
zg = goffset + gl_GlobalInvocationID.x;
fg = float(zg) / fmg;
zb = boffset + gl_GlobalInvocationID.y;
fb = float(zb) / fmb;
for(q = 0; q < 10; q++){
fq = float(q);
dsum = 0.f;
esum = 0.f;
// First range of n: both the D_3_* terms (dabs) and the E_3_* terms (eabs) are evaluated.
for(n = 0; n <= 1031; n++){
dsummandsup = 0.f;
esummandsup = 0.f;
fys = float(n) / 10.f;
for(zy = 0; zy <= my; zy++){
fymb = fys + float(zy) / fatmy - fb;
// Factor 0 (Di_0_*, Ei_0_*), evaluated at t = fb + fymb.
t = fb + fymb;
Di_0_ = one + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);
Ei_0_ = fe(10.f * t);
td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_0_y = 6.2831853e0f * td; Ei_0_y = 6.2831853e1f * te;
td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_0_yy = 3.9478418e1f * td; Ei_0_yy = 3.9478418e3f * te;
// Factor 1 (Di_1_*, Ei_1_*), evaluated at t = fg + fb + 9.9019514e-2 * fymb.
t = fg + fb + 9.9019514e-2f * fymb;
Di_1_ = one + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);
Ei_1_ = fe(10.f * t);
td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_1_g = 6.2831853e0f * td; Ei_1_g = 6.2831853e1f * te;
Di_1_b = 5.6610274e0f * td; Ei_1_b = 5.6610274e1f * te;
Di_1_y = 6.2215795e-1f * td; Ei_1_y = 6.2215795e0f * te;
td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_1_gg = 3.9478418e1f * td; Ei_1_gg = 3.9478418e3f * te;
Di_1_gb = 3.5569284e1f * td; Ei_1_gb = 3.5569284e3f * te;
Di_1_gy = 3.9091337e0f * td; Ei_1_gy = 3.9091337e2f * te;
Di_1_bg = 3.5569284e1f * td; Ei_1_bg = 3.5569284e3f * te;
Di_1_bb = 3.2047231e1f * td; Ei_1_bb = 3.2047231e3f * te;
Di_1_by = 3.5220532e0f * td; Ei_1_by = 3.5220532e2f * te;
Di_1_yg = 3.9091337e0f * td; Ei_1_yg = 3.9091337e2f * te;
Di_1_yb = 3.5220532e0f * td; Ei_1_yb = 3.5220532e2f * te;
Di_1_yy = 3.8708052e-1f * td; Ei_1_yy = 3.8708052e1f * te;
// Factor 2 (Di_2_*, Ei_2_*), evaluated at t = -10 * fg + fb + 9.8048641e-3 * fymb.
t = -10.f * fg + fb + 9.8048641e-3f * fymb;
Di_2_ = one + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);
Ei_2_ = fe(10.f * t);
td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_2_g = -6.2831853e1f * td; Ei_2_g = -6.2831853e2f * te;
Di_2_b = 6.2215795e0f * td; Ei_2_b = 6.2215795e1f * te;
Di_2_y = 6.1605778e-2f * td; Ei_2_y = 6.1605778e-1f * te;
td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_2_gg = 3.9478418e3f * td; Ei_2_gg = 3.9478418e5f * te;
Di_2_gb = -3.9091337e2f * td; Ei_2_gb = -3.9091337e4f * te;
Di_2_gy = -3.8708052e0f * td; Ei_2_gy = -3.8708052e2f * te;
Di_2_bg = -3.9091337e2f * td; Ei_2_bg = -3.9091337e4f * te;
Di_2_bb = 3.8708052e1f * td; Ei_2_bb = 3.8708052e3f * te;
Di_2_by = 3.8328525e-1f * td; Ei_2_by = 3.8328525e1f * te;
Di_2_yg = -3.8708052e0f * td; Ei_2_yg = -3.8708052e2f * te;
Di_2_yb = 3.8328525e-1f * td; Ei_2_yb = 3.8328525e1f * te;
Di_2_yy = 3.7952719e-3f * td; Ei_2_yy = 3.7952719e-1f * te;
// Combine factors 0 and 1 into D_2_* / E_2_*.
D_2_ = fComplexTimes(Di_0_, Di_1_) + Ei_0_;
D_2_g = fComplexTimes(Di_0_, Di_1_g);
D_2_b = fComplexTimes(Di_0_, Di_1_b);
D_2_y = fComplexTimes(Di_0_y, Di_1_) + fComplexTimes(Di_0_, Di_1_y) + Ei_0_y;
D_2_gg = fComplexTimes(Di_0_, Di_1_gg);
D_2_gb = fComplexTimes(Di_0_, Di_1_gb);
D_2_gy = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_gy);
D_2_bg = fComplexTimes(Di_0_, Di_1_bg);
D_2_bb = fComplexTimes(Di_0_, Di_1_bb);
D_2_by = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_by);
D_2_yg = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_yg);
D_2_yb = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_yb);
D_2_yy = fComplexTimes(Di_0_yy, Di_1_) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_, Di_1_yy) + Ei_0_yy;
E_2_ = fComplexTimes(Di_0_, Ei_1_);
E_2_g = fComplexTimes(Di_0_, Ei_1_g);
E_2_b = fComplexTimes(Di_0_, Ei_1_b);
E_2_y = fComplexTimes(Di_0_y, Ei_1_) + fComplexTimes(Di_0_, Ei_1_y);
E_2_gg = fComplexTimes(Di_0_, Ei_1_gg);
E_2_gb = fComplexTimes(Di_0_, Ei_1_gb);
E_2_gy = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_gy);
E_2_bg = fComplexTimes(Di_0_, Ei_1_bg);
E_2_bb = fComplexTimes(Di_0_, Ei_1_bb);
E_2_by = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_by);
E_2_yg = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_yg);
E_2_yb = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_yb);
E_2_yy = fComplexTimes(Di_0_yy, Ei_1_) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_, Ei_1_yy);
// Combine D_2_* / E_2_* with factor 2 into D_3_*.
D_3_ = fComplexTimes(D_2_, Di_2_) + E_2_;
D_3_g = fComplexTimes(D_2_g, Di_2_) + fComplexTimes(D_2_, Di_2_g) + E_2_g;
D_3_b = fComplexTimes(D_2_b, Di_2_) + fComplexTimes(D_2_, Di_2_b) + E_2_b;
D_3_y = fComplexTimes(D_2_y, Di_2_) + fComplexTimes(D_2_, Di_2_y) + E_2_y;
D_3_gg = fComplexTimes(D_2_gg, Di_2_) + fComplexTimes(D_2_g, Di_2_g) + fComplexTimes(D_2_g, Di_2_g) + fComplexTimes(D_2_, Di_2_gg) + E_2_gg;
D_3_gb = fComplexTimes(D_2_gb, Di_2_) + fComplexTimes(D_2_g, Di_2_b) + fComplexTimes(D_2_b, Di_2_g) + fComplexTimes(D_2_, Di_2_gb) + E_2_gb;
D_3_gy = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;
D_3_bg = fComplexTimes(D_2_bg, Di_2_) + fComplexTimes(D_2_b, Di_2_g) + fComplexTimes(D_2_g, Di_2_b) + fComplexTimes(D_2_, Di_2_bg) + E_2_bg;
D_3_bb = fComplexTimes(D_2_bb, Di_2_) + fComplexTimes(D_2_b, Di_2_b) + fComplexTimes(D_2_b, Di_2_b) + fComplexTimes(D_2_, Di_2_bb) + E_2_bb;
D_3_by = fComplexTimes(D_2_by, Di_2_) + fComplexTimes(D_2_b, Di_2_y) + fComplexTimes(D_2_y, Di_2_b) + fComplexTimes(D_2_, Di_2_by) + E_2_by;
//The bug occurs here: Use one of the two following semantically identical lines to get different results
D_3_yg = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;
//D_3_yg = D_3_gy;
D_3_yb = fComplexTimes(D_2_yb, Di_2_) + fComplexTimes(D_2_y, Di_2_b) + fComplexTimes(D_2_b, Di_2_y) + fComplexTimes(D_2_, Di_2_yb) + E_2_yb;
D_3_yy = fComplexTimes(D_2_yy, Di_2_) + fComplexTimes(D_2_y, Di_2_y) + fComplexTimes(D_2_y, Di_2_y) + fComplexTimes(D_2_, Di_2_yy) + E_2_yy;
// dabs: sum of the lengths of all D_3_* terms, each divided by its R_* divisor.
dabs = length(D_3_) + length(D_3_g) / R_g + length(D_3_b) / R_b + length(D_3_y) / R_y + length(D_3_gg) / R_gg + length(D_3_gb) / R_gb + length(D_3_gy) / R_gy + length(D_3_bg) / R_bg + length(D_3_bb) / R_bb + length(D_3_by) / R_by + length(D_3_yg) / R_yg + length(D_3_yb) / R_yb + length(D_3_yy) / R_yy;
dsummandsup = max(dsummandsup, dabs);
E_3_ = fComplexTimes(D_2_, Ei_2_);
E_3_g = fComplexTimes(D_2_g, Ei_2_) + fComplexTimes(D_2_, Ei_2_g);
E_3_b = fComplexTimes(D_2_b, Ei_2_) + fComplexTimes(D_2_, Ei_2_b);
E_3_y = fComplexTimes(D_2_y, Ei_2_) + fComplexTimes(D_2_, Ei_2_y);
E_3_gg = fComplexTimes(D_2_gg, Ei_2_) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gg);
E_3_gb = fComplexTimes(D_2_gb, Ei_2_) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gb);
E_3_gy = fComplexTimes(D_2_gy, Ei_2_) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gy);
E_3_bg = fComplexTimes(D_2_bg, Ei_2_) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bg);
E_3_bb = fComplexTimes(D_2_bb, Ei_2_) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bb);
E_3_by = fComplexTimes(D_2_by, Ei_2_) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_, Ei_2_by);
E_3_yg = fComplexTimes(D_2_yg, Ei_2_) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yg);
E_3_yb = fComplexTimes(D_2_yb, Ei_2_) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yb);
E_3_yy = fComplexTimes(D_2_yy, Ei_2_) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yy);
// eabs: sum of the lengths of all E_3_* terms, each divided by its R_* divisor.
eabs = length(E_3_) + length(E_3_g) / R_g + length(E_3_b) / R_b + length(E_3_y) / R_y + length(E_3_gg) / R_gg + length(E_3_gb) / R_gb + length(E_3_gy) / R_gy + length(E_3_bg) / R_bg + length(E_3_bb) / R_bb + length(E_3_by) / R_by + length(E_3_yg) / R_yg + length(E_3_yb) / R_yb + length(E_3_yy) / R_yy;
esummandsup = max(esummandsup, eabs);
}
dsum += dsummandsup;
esum += esummandsup;
}
// Second range of n: only the E_3_* terms (eabs) are evaluated; no D_3_* terms are computed here.
for(n = 1032; n <= 10402; n++){
dsummandsup = 0.f;
esummandsup = 0.f;
fys = float(n) / 10.f;
for(zy = 0; zy <= my; zy++){
fymb = fys + float(zy) / fatmy - fb;
t = fb + fymb;
Di_0_ = one + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);
Ei_0_ = fe(10.f * t);
td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_0_y = 6.2831853e0f * td; Ei_0_y = 6.2831853e1f * te;
td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_0_yy = 3.9478418e1f * td; Ei_0_yy = 3.9478418e3f * te;
t = fg + fb + 9.9019514e-2f * fymb;
Di_1_ = one + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);
Ei_1_ = fe(10.f * t);
td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_1_g = 6.2831853e0f * td; Ei_1_g = 6.2831853e1f * te;
Di_1_b = 5.6610274e0f * td; Ei_1_b = 5.6610274e1f * te;
Di_1_y = 6.2215795e-1f * td; Ei_1_y = 6.2215795e0f * te;
td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_1_gg = 3.9478418e1f * td; Ei_1_gg = 3.9478418e3f * te;
Di_1_gb = 3.5569284e1f * td; Ei_1_gb = 3.5569284e3f * te;
Di_1_gy = 3.9091337e0f * td; Ei_1_gy = 3.9091337e2f * te;
Di_1_bg = 3.5569284e1f * td; Ei_1_bg = 3.5569284e3f * te;
Di_1_bb = 3.2047231e1f * td; Ei_1_bb = 3.2047231e3f * te;
Di_1_by = 3.5220532e0f * td; Ei_1_by = 3.5220532e2f * te;
Di_1_yg = 3.9091337e0f * td; Ei_1_yg = 3.9091337e2f * te;
Di_1_yb = 3.5220532e0f * td; Ei_1_yb = 3.5220532e2f * te;
Di_1_yy = 3.8708052e-1f * td; Ei_1_yy = 3.8708052e1f * te;
t = -10.f * fg + fb + 9.8048641e-3f * fymb;
Di_2_ = one + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);
Ei_2_ = fe(10.f * t);
td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fComplexTimesI(td);
te = fe(10.f * t); te = fComplexTimesI(te);
Di_2_g = -6.2831853e1f * td; Ei_2_g = -6.2831853e2f * te;
Di_2_b = 6.2215795e0f * td; Ei_2_b = 6.2215795e1f * te;
Di_2_y = 6.1605778e-2f * td; Ei_2_y = 6.1605778e-1f * te;
td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;
te = fe(10.f * t); te = -te;
Di_2_gg = 3.9478418e3f * td; Ei_2_gg = 3.9478418e5f * te;
Di_2_gb = -3.9091337e2f * td; Ei_2_gb = -3.9091337e4f * te;
Di_2_gy = -3.8708052e0f * td; Ei_2_gy = -3.8708052e2f * te;
Di_2_bg = -3.9091337e2f * td; Ei_2_bg = -3.9091337e4f * te;
Di_2_bb = 3.8708052e1f * td; Ei_2_bb = 3.8708052e3f * te;
Di_2_by = 3.8328525e-1f * td; Ei_2_by = 3.8328525e1f * te;
Di_2_yg = -3.8708052e0f * td; Ei_2_yg = -3.8708052e2f * te;
Di_2_yb = 3.8328525e-1f * td; Ei_2_yb = 3.8328525e1f * te;
Di_2_yy = 3.7952719e-3f * td; Ei_2_yy = 3.7952719e-1f * te;
D_2_ = fComplexTimes(Di_0_, Di_1_) + Ei_0_;
D_2_g = fComplexTimes(Di_0_, Di_1_g);
D_2_b = fComplexTimes(Di_0_, Di_1_b);
D_2_y = fComplexTimes(Di_0_y, Di_1_) + fComplexTimes(Di_0_, Di_1_y) + Ei_0_y;
D_2_gg = fComplexTimes(Di_0_, Di_1_gg);
D_2_gb = fComplexTimes(Di_0_, Di_1_gb);
D_2_gy = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_gy);
D_2_bg = fComplexTimes(Di_0_, Di_1_bg);
D_2_bb = fComplexTimes(Di_0_, Di_1_bb);
D_2_by = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_by);
D_2_yg = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_yg);
D_2_yb = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_yb);
D_2_yy = fComplexTimes(Di_0_yy, Di_1_) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_, Di_1_yy) + Ei_0_yy;
E_2_ = fComplexTimes(Di_0_, Ei_1_);
E_2_g = fComplexTimes(Di_0_, Ei_1_g);
E_2_b = fComplexTimes(Di_0_, Ei_1_b);
E_2_y = fComplexTimes(Di_0_y, Ei_1_) + fComplexTimes(Di_0_, Ei_1_y);
E_2_gg = fComplexTimes(Di_0_, Ei_1_gg);
E_2_gb = fComplexTimes(Di_0_, Ei_1_gb);
E_2_gy = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_gy);
E_2_bg = fComplexTimes(Di_0_, Ei_1_bg);
E_2_bb = fComplexTimes(Di_0_, Ei_1_bb);
E_2_by = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_by);
E_2_yg = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_yg);
E_2_yb = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_yb);
E_2_yy = fComplexTimes(Di_0_yy, Ei_1_) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_, Ei_1_yy);
E_3_ = fComplexTimes(D_2_, Ei_2_);
E_3_g = fComplexTimes(D_2_g, Ei_2_) + fComplexTimes(D_2_, Ei_2_g);
E_3_b = fComplexTimes(D_2_b, Ei_2_) + fComplexTimes(D_2_, Ei_2_b);
E_3_y = fComplexTimes(D_2_y, Ei_2_) + fComplexTimes(D_2_, Ei_2_y);
E_3_gg = fComplexTimes(D_2_gg, Ei_2_) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gg);
E_3_gb = fComplexTimes(D_2_gb, Ei_2_) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gb);
E_3_gy = fComplexTimes(D_2_gy, Ei_2_) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gy);
E_3_bg = fComplexTimes(D_2_bg, Ei_2_) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bg);
E_3_bb = fComplexTimes(D_2_bb, Ei_2_) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bb);
E_3_by = fComplexTimes(D_2_by, Ei_2_) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_, Ei_2_by);
E_3_yg = fComplexTimes(D_2_yg, Ei_2_) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yg);
E_3_yb = fComplexTimes(D_2_yb, Ei_2_) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yb);
E_3_yy = fComplexTimes(D_2_yy, Ei_2_) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yy);
eabs = length(E_3_) + length(E_3_g) / R_g + length(E_3_b) / R_b + length(E_3_y) / R_y + length(E_3_gg) / R_gg + length(E_3_gb) / R_gb + length(E_3_gy) / R_gy + length(E_3_bg) / R_bg + length(E_3_bb) / R_bb + length(E_3_by) / R_by + length(E_3_yg) / R_yg + length(E_3_yb) / R_yb + length(E_3_yy) / R_yy;
esummandsup = max(esummandsup, eabs);
}
dsum += dsummandsup;
esum += esummandsup;
}
// Keep the largest per-q sums.
dsup = max(dsup, dsum);
esup = max(esup, esum);
}
}
// Entry point: runs the computation, then stores this invocation's dsup and esup in
// Output.Data[x][y][0] and Output.Data[x][y][1], indexed by the global invocation id.
void main(void){
fDESupTaylor_a10_w3_d2();
Output.Data[gl_GlobalInvocationID.x][gl_GlobalInvocationID.y][0] = dsup;
Output.Data[gl_GlobalInvocationID.x][gl_GlobalInvocationID.y][1] = esup;
}
If I run the above program and use one of the two identical lines in the shader which I marked with a comment I get different results for the value esup (second of the printed values). If I use the first line
D_3_yg = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;
I get esup = 85930.453 and if I use
D_3_yg = D_3_gy;
(note that this should give the same result) I get esup = 85928.164.
Also, If I change the lines
zg = goffset + gl_GlobalInvocationID.x;
fg = float(zg) / fmg;
zb = boffset + gl_GlobalInvocationID.y;
fb = float(zb) / fmb;
to
zg = gl_GlobalInvocationID.x;
fg = float(zg) / fmg;
zb = gl_GlobalInvocationID.y;
fb = float(zb) / fmb;
(note that goffset and boffset are set to 0 in C++) I get yet another result (esup = 85929.844 and esup = 85929.742 when I use the first or second lines respectively).
Another really strange aspect is that the "full bug" only seems to occur if the constant my is set to 128. For other values of my such as 64, 127, 129, or 256 changing the two lines does not change the value of esup but removing goffset and boffset still does.
Also, the value of D_3_yg (which is set by the two different lines) should actually play no role for the computation of esup but only for dsup. But dsup stays constant and esup changes.
I'm using an Nvidia Quadro M2000M and Visual Studio 2012 to compile. Any ideas what the problem could be?
Note that the program needs about 20 seconds to run and the screen freezes during that time. In Windows you need to increase your TdrDelay to, say, 60 (seconds)
https://learn.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys
as Windows kills GPU computations which take longer than 2 seconds by default.

Your shader does a lot of floating-point arithmetic. Each line operates on previous results, so the rounding error accumulates.
Also, the GPU may do calculations in higher precision, but each time you store the result it gets truncated. Sometimes the compiler can optimize it, sometimes it can't.
You should read
What Every Programmer Should Know About Floating-Point Arithmetic
or
What Every Computer Scientist Should Know About Floating-Point Arithmetic
Problems increase for some special, badly conditioned cases. A simple geometric analogy is determining the point of intersection between two almost parallel lines: the result can vary a great deal.
You should try to simplify those long calculations. Using double-precision types (available since OpenGL 4.0) may help, but perhaps not enough.

Related

How can I "join" quadratic or cubic splines?

I have two functions that calculate a point on a spline, one quadratic and one cubic:
// Plain 2D point / vector.
struct vec2 {float x, y;};

// Evaluates the quadratic Bezier curve with control points a, b, c at parameter t.
// B(t) = (1-t)^2 a + 2(1-t)t b + t^2 c; t = 0 yields a and t = 1 yields c.
vec2 spline_quadratic(const vec2 & a, const vec2 & b, const vec2 & c, float t) {
    const float s = 1 - t;
    const float wa = s * s;
    const float wb = 2 * s * t;
    const float wc = t * t;
    return {
        wa * a.x + wb * b.x + wc * c.x,
        wa * a.y + wb * b.y + wc * c.y
    };
}

// Evaluates the cubic Bezier curve with control points a, b, c, d at parameter t.
// B(t) = (1-t)^3 a + 3(1-t)^2 t b + 3(1-t) t^2 c + t^3 d; t = 0 yields a and t = 1 yields d.
vec2 spline_cubic(const vec2 & a, const vec2 & b, const vec2 & c, const vec2 & d, float t){
    const float s = 1 - t;
    const float wa = s * s * s;
    const float wb = 3 * s * s * t;
    const float wc = 3 * s * t * t;
    const float wd = t * t * t;
    return {
        wa * a.x + wb * b.x + wc * c.x + wd * d.x,
        wa * a.y + wb * b.y + wc * c.y + wd * d.y
    };
}
Is it possible to join several curves of an array of points?
I'm looking to make a function that has this signature:
// Samples one quadratic curve per consecutive triple points[i], points[i+1], points[i+2]
// and returns all samples in order. Each triple contributes `segments` samples at
// t = 0, 1/segments, ..., (segments-1)/segments.
// Returns an empty vector when fewer than 3 points are given or segments <= 0.
// NOTE(review): with this sliding window, one curve ends at points[i+2] while the next
// starts at points[i+1], so adjacent pieces do not share end points. Confirm whether a
// stride of 2 (shared end points) or a B-spline basis is what is actually wanted.
vector<vec2> spline_join(vector<vec2> & points, int segments = 16){
    vector<vec2> spline_points;
    // Guard first: points.size() - 2 would wrap around for fewer than 2 points.
    if(points.size() < 3 || segments <= 0) return spline_points;

    const size_t windows = points.size() - 2;
    spline_points.reserve(windows * static_cast<size_t>(segments));
    for(size_t i = 0; i < windows; ++i){
        for(int div = 0; div < segments; ++div){
            const float t = float(div) / float(segments);
            spline_points.push_back(spline_quadratic(points[i], points[i + 1], points[i + 2], t));
        }
    }
    return spline_points;
}
I've read that it requires interpolation, but I'm not sure... What would the code look like? I've searched and I can't find relevant question and answers...
I've seen there are libraries, but I'm looking for a shorter implementation.
Edit: I've tried the question and answer here and apparently this is what I want:
Joining B-Spline segments in OpenGL / C++
The code is not really clean but after some cleaning, it does work.
I've cleaned this answer Joining B-Spline segments in OpenGL / C++
This is not an Hermite spline, an hermite spline passes through the points, a B-spline does not.
Here is what worked and the result
// Uniform cubic B-spline basis weight for the first control point: (1 - u)^3 / 6.
float B0(float u) {
    // Evaluated in double, as the pow()-based version was, but with plain
    // multiplications instead of a general-purpose pow() call.
    const double v = 1 - u;
    return float(v * v * v / 6.0);
}
// Uniform cubic B-spline basis weight for the second control point:
// (3u^3 - 6u^2 + 4) / 6.
float B1(float u) {
    // Plain multiplications in double replace the general-purpose pow() calls.
    const double x = u;
    const double x2 = x * x;
    return float((3 * (x2 * x) - 6 * x2 + 4) / 6.0);
}
// Uniform cubic B-spline basis weight for the third control point:
// (-3u^3 + 3u^2 + 3u + 1) / 6.
float B2(float u) {
    // Plain multiplications in double replace the general-purpose pow() calls.
    const double x = u;
    const double x2 = x * x;
    return float((-3 * (x2 * x) + 3 * x2 + 3 * u + 1) / 6.0);
}
// Uniform cubic B-spline basis weight for the fourth control point: u^3 / 6.
float B3(float u) {
    // Plain multiplications in double replace the general-purpose pow() call.
    const double x = u;
    return float(x * x * x / 6.0);
}
// Samples a cubic B-spline through the control polygon `points`.
// Every window of four consecutive control points yields MAX_STEPS + 1 samples
// (u = 0 .. 1 inclusive), appended in order. Returns an empty vector when fewer
// than four points are supplied.
vector<Vec2> computeBSpline(vector<Vec2>& points) {
    vector<Vec2> result;
    const int MAX_STEPS = 100;
    const int NUM_OF_POINTS = static_cast<int>(points.size());
    if (NUM_OF_POINTS < 4)
    {
        return result;
    }
    // One allocation up front instead of repeated growth while pushing.
    result.reserve(static_cast<size_t>(NUM_OF_POINTS - 3) * (MAX_STEPS + 1));
    for (int i = 0; i < NUM_OF_POINTS - 3; i++)
    {
        for (int j = 0; j <= MAX_STEPS; j++)
        {
            const float u = float(j) / float(MAX_STEPS);
            // Evaluate each basis weight once and reuse it for both coordinates.
            const float w0 = B0(u);
            const float w1 = B1(u);
            const float w2 = B2(u);
            const float w3 = B3(u);
            float Qx =
                w0 * points[i].x
                + w1 * points[i + 1].x
                + w2 * points[i + 2].x
                + w3 * points[i + 3].x;
            float Qy =
                w0 * points[i].y
                + w1 * points[i + 1].y
                + w2 * points[i + 2].y
                + w3 * points[i + 3].y;
            result.push_back({ Qx, Qy });
        }
    }
    return result;
}

C++ code runs fine in Visual Studio (windows) but gives a segmentation fault in CodeLite (Linux)

My code compiles and runs without error in Visual Studio; however, I need it to run in CodeLite on Linux, where the same code gives me a segmentation fault.
For reference this is my code:
#include <string>
#include <iostream>
#include <cmath>
#include <fstream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <tuple>
using namespace std;
namespace {

/// Angles and angular velocities of the two arms. The same layout is reused
/// for the time derivative of the state (the RK4 slopes).
struct PendulumState {
    double theta1;
    double theta2;
    double omega1;
    double omega2;
};

/// Masses, arm lengths and gravitational acceleration of the double pendulum.
struct PendulumParams {
    double m1;
    double m2;
    double l1;
    double l2;
    double g;
};

/// Right-hand side of the equations of motion: returns d(state)/dt at `s`.
PendulumState pendulumDerivative(const PendulumState& s, const PendulumParams& p) {
    const double delta = s.theta1 - s.theta2;
    const double denom = 2 * p.m1 + p.m2 - p.m2 * std::cos(2 * s.theta1 - 2 * s.theta2);
    const double omega1Sq = s.omega1 * s.omega1;
    const double omega2Sq = s.omega2 * s.omega2;

    PendulumState d;
    d.theta1 = s.omega1;
    d.theta2 = s.omega2;
    d.omega1 = (-p.g * (2 * p.m1 + p.m2) * std::sin(s.theta1)
                - p.m2 * p.g * std::sin(s.theta1 - 2 * s.theta2)
                - 2 * std::sin(delta) * p.m2 * (omega2Sq * p.l2 + omega1Sq * p.l1 * std::cos(delta)))
               / (p.l1 * denom);
    d.omega2 = (2 * std::sin(delta)
                * (omega1Sq * p.l1 * (p.m1 + p.m2)
                   + p.g * (p.m1 + p.m2) * std::cos(s.theta1)
                   + omega2Sq * p.l2 * p.m2 * std::cos(delta)))
               / (p.l2 * denom);
    return d;
}

/// Returns base + scale * slope, component by component.
PendulumState offsetState(const PendulumState& base, const PendulumState& slope, const double scale) {
    PendulumState out;
    out.theta1 = base.theta1 + scale * slope.theta1;
    out.theta2 = base.theta2 + scale * slope.theta2;
    out.omega1 = base.omega1 + scale * slope.omega1;
    out.omega2 = base.omega2 + scale * slope.omega2;
    return out;
}

}  // namespace

/// Integrates the double pendulum with the classical 4th-order Runge-Kutta scheme.
///
/// Reads whitespace-separated numbers from "parameters.txt" in the working
/// directory, in this order: m1, m2, l1, l2, initial theta1, initial theta2,
/// step size, end time. Both angular velocities start at 0 and g is 9.81.
///
/// @return (theta1 samples, theta2 samples, time samples), each holding
///         int(end time / step size) entries; all three are empty when that
///         count is 0.
/// @throws std::runtime_error if the file cannot be opened, holds fewer than
///         8 numbers, or describes an unusable time grid (non-positive or NaN
///         step size, negative end time, or too many steps for an int).
std::tuple<std::vector<double>, std::vector<double>, std::vector<double>> RK4() {
    using Index = std::vector<double>::size_type;

    std::ifstream fin("parameters.txt");
    if (!fin) {
        throw std::runtime_error("RK4: cannot open parameters.txt");
    }
    std::vector<double> data;
    data.reserve(8);
    double element;
    while (fin >> element) {
        data.push_back(element);
    }
    // Indexing past the end of `data` is undefined behaviour (it may appear to
    // work on one platform and crash on another), so fail loudly instead.
    if (data.size() < 8) {
        throw std::runtime_error("RK4: parameters.txt must contain at least 8 numeric values");
    }

    const PendulumParams params{data[0], data[1], data[2], data[3], 9.81};
    const double stepSize = data[6];
    const double tEnd = data[7];
    // Written with negations so that NaN inputs are rejected as well.
    if (!(stepSize > 0.0) || !(tEnd >= 0.0)) {
        throw std::runtime_error("RK4: step size must be positive and end time non-negative");
    }
    const double stepCount = tEnd / stepSize;
    if (!(stepCount < static_cast<double>(std::numeric_limits<int>::max()))) {
        throw std::runtime_error("RK4: end time / step size is too large");
    }
    const Index count = static_cast<Index>(static_cast<int>(stepCount));

    std::vector<double> y1(count);
    std::vector<double> y2(count);
    std::vector<double> t(count);
    if (count == 0) {
        return std::make_tuple(std::move(y1), std::move(y2), std::move(t));
    }

    for (Index i = 0; i < count; ++i) {
        t[i] = static_cast<double>(i) * stepSize;
    }

    PendulumState state{data[4], data[5], 0.0, 0.0};
    y1[0] = state.theta1;
    y2[0] = state.theta2;

    for (Index i = 0; i + 1 < count; ++i) {
        // Each stage evaluates the derivative at the state advanced along the
        // previous stage's slope; in particular the theta slope of a stage is
        // the *advanced omega*, i.e. omega + scale * dOmega of the prior stage.
        const PendulumState k1 = pendulumDerivative(state, params);
        const PendulumState k2 = pendulumDerivative(offsetState(state, k1, 0.5 * stepSize), params);
        const PendulumState k3 = pendulumDerivative(offsetState(state, k2, 0.5 * stepSize), params);
        const PendulumState k4 = pendulumDerivative(offsetState(state, k3, stepSize), params);

        const double weight = stepSize / 6.0;
        state.theta1 += weight * (k1.theta1 + 2 * k2.theta1 + 2 * k3.theta1 + k4.theta1);
        state.theta2 += weight * (k1.theta2 + 2 * k2.theta2 + 2 * k3.theta2 + k4.theta2);
        state.omega1 += weight * (k1.omega1 + 2 * k2.omega1 + 2 * k3.omega1 + k4.omega1);
        state.omega2 += weight * (k1.omega2 + 2 * k2.omega2 + 2 * k3.omega2 + k4.omega2);

        y1[i + 1] = state.theta1;
        y2[i + 1] = state.theta2;
    }
    return std::make_tuple(std::move(y1), std::move(y2), std::move(t));
}
/// Runs the double-pendulum simulation and writes the Cartesian positions of
/// both masses over time to "output.txt".
///
/// Reads the arm lengths (3rd and 4th number) from "parameters.txt", obtains
/// theta1, theta2 and t from RK4(), converts the angles to x/y coordinates and
/// writes five labelled series (t, x_1, y_1, x_2, y_2), each on its own line.
///
/// @return 0 on success, 1 if the parameters cannot be read, the integration
///         fails, or the output file cannot be opened.
int main() {
    // Load the parameter file; indexing a too-short vector would be undefined
    // behaviour, so the element count is checked before anything is read.
    std::ifstream fin("parameters.txt");
    std::vector<double> data;
    data.reserve(8);
    double element;
    while (fin >> element) {
        data.push_back(element);
    }
    if (data.size() < 8) {
        std::cerr << "parameters.txt is missing or holds fewer than 8 numeric values\n";
        return 1;
    }
    const double l1 = data[2];
    const double l2 = data[3];

    // Get y1 (theta1), y2 (theta2) and t from the integrator.
    std::vector<double> y1;
    std::vector<double> y2;
    std::vector<double> t;
    try {
        std::tie(y1, y2, t) = RK4();
    } catch (const std::exception& error) {
        std::cerr << "RK4 failed: " << error.what() << '\n';
        return 1;
    }

    // Convert the angles to Cartesian positions of the two masses.
    const std::vector<double>::size_type range = y1.size();
    std::vector<double> x_1(range), y_1(range), x_2(range), y_2(range);
    for (std::vector<double>::size_type i = 0; i < range; ++i) {
        x_1[i] = std::sin(y1[i]) * l1;
        y_1[i] = -std::cos(y1[i]) * l1;
        x_2[i] = std::sin(y1[i]) * l1 + std::sin(y2[i]) * l2;
        y_2[i] = -std::cos(y1[i]) * l1 - std::cos(y2[i]) * l2;
    }

    // Write the x,y positions at each time t to output.txt.
    std::ofstream myfile("output.txt");
    if (!myfile.is_open()) {
        std::cout << "Unable to open file";
        return 1;
    }
    // One label line followed by one line of space-separated values. The line
    // break after the values goes to the file (not to the console) so that
    // consecutive series do not run together.
    const auto writeSeries = [&myfile](const char* label, const std::vector<double>& values) {
        myfile << label << '\n';
        for (const double value : values) {
            myfile << value << " ";
        }
        myfile << '\n';
    };
    writeSeries("t: ", t);
    writeSeries("x_1: ", x_1);
    writeSeries("y_1: ", y_1);
    writeSeries("x_2: ", x_2);
    writeSeries("y_2: ", y_2);
    myfile.close();
    return 0;
}
In both cases "parameters.txt" is in the working directory. Why does the operating system/compiler I use affect the outcome? What is the problem?

Inverse of Cumulative Normal Distribution Function with parameters

I want to implement an equivalent of MATLAB's icdf function in C++. I have already found this useful post: https://www.johndcook.com/blog/cpp_phi_inverse/, but I want it to take optional mu and sigma parameters, as in MATLAB.
What am I supposed to change?
Inspired from https://gist.github.com/kmpm/1211922/6b7fcd0155b23c3dc71e6f4969f2c48785371292:
/// Quantile function (inverse CDF) of the normal distribution with mean `mu`
/// and standard deviation `sigma`, evaluated at probability `p`.
///
/// The standard-normal quantile is computed from rational approximations
/// (one for the central region, two for the tails) and then mapped to
/// mu + sigma * quantile.
///
/// @throws std::invalid_argument if p <= 0 or p >= 1.
double inverse_of_normal_cdf(const double p, const double mu, const double sigma)
{
    // Polynomial coefficients, highest power first.
    static constexpr double kCentralNum[] = {
        2509.0809287301226727, 33430.575583588128105, 67265.770927008700853,
        45921.953931549871457, 13731.693765509461125, 1971.5909503065514427,
        133.14166789178437745, 3.387132872796366608};
    static constexpr double kCentralDen[] = {
        5226.495278852854561, 28729.085735721942674, 39307.89580009271061,
        21213.794301586595867, 5394.1960214247511077, 687.1870074920579083,
        42.313330701600911252, 1};
    static constexpr double kMidNum[] = {
        7.7454501427834140764e-4, .0227238449892691845833, .24178072517745061177,
        1.27045825245236838258, 3.64784832476320460504, 5.7694972214606914055,
        4.6303378461565452959, 1.42343711074968357734};
    static constexpr double kMidDen[] = {
        1.05075007164441684324e-9, 5.475938084995344946e-4, .0151986665636164571966,
        .14810397642748007459, .68976733498510000455, 1.6763848301838038494,
        2.05319162663775882187, 1};
    static constexpr double kTailNum[] = {
        2.01033439929228813265e-7, 2.71155556874348757815e-5, .0012426609473880784386,
        .026532189526576123093, .29656057182850489123, 1.7848265399172913358,
        5.4637849111641143699, 6.6579046435011037772};
    static constexpr double kTailDen[] = {
        2.04426310338993978564e-15, 1.4215117583164458887e-7, 1.8463183175100546818e-5,
        7.868691311456132591e-4, .0148753612908506148525, .13692988092273580531,
        .59983220655588793769, 1};

    // Horner evaluation of a polynomial given highest-power-first coefficients.
    const auto horner = [](const auto& coeffs, const double x) {
        double acc = 0.0;
        for (const double c : coeffs) {
            acc = acc * x + c;
        }
        return acc;
    };

    if (p <= 0.0 || p >= 1.0)
    {
        std::stringstream message;
        message << "Invalid input argument (" << p
                << "); must be larger than 0 but less than 1.";
        throw std::invalid_argument(message.str());
    }

    const double q = p - 0.5;

    // Central region: rational function in r = 0.180625 - q^2, scaled by q.
    if (std::abs(q) <= .425) {
        const double r = .180625 - q * q;
        const double central = q * horner(kCentralNum, r) / horner(kCentralDen, r);
        return mu + sigma * central;
    }

    // Tails: work with the smaller of p and 1 - p, then restore the sign.
    const double tailProb = (q > 0) ? 1 - p : p;
    double r = std::sqrt(-std::log(tailProb));
    double val;
    if (r <= 5)
    {
        r += -1.6;
        val = horner(kMidNum, r) / horner(kMidDen, r);
    }
    else { /* very close to 0 or 1 */
        r += -5;
        val = horner(kTailNum, r) / horner(kTailDen, r);
    }
    if (q < 0.0) {
        val = -val;
    }
    return mu + sigma * val;
}

OpenGl rotate custom implementation

I'm trying to write my own implementation of OpenGL's glRotatef(angle,x,y,z) function.
I wrote the rotation matrix, but when I use it, the effect is not the same as with the original function. Here is my code:
// Multiplies the current OpenGL matrix by a rotation of `angle` degrees about
// the axis (x, y, z), mirroring glRotatef.
//
// The rotation-matrix formula below is only valid for a unit-length axis, so
// the axis is normalized first (glRotatef does the same for its caller).
// A zero-length axis defines no rotation; in that case the current matrix is
// left untouched.
void mglRotate(float angle, float x, float y, float z)
{
const float length = sqrt(x*x + y*y + z*z);
if (length == 0.0f)
{
return;
}
x /= length;
y /= length;
z /= length;
const float angle_rad = angle * (PI/180.0f);
const float c = cos(angle_rad);
const float s = sin(angle_rad);
const float t = 1 - c;
// Column-major layout, as glMultMatrixf expects: each row below is one
// column of the rotation matrix.
const float m[16] = {
c+x*x*t,y*x*t+z*s,z*x*t-y*s,0,
x*y*t-z*s,c+y*y*t,z*y*t+x*s,0,
x*z*t+y*s,y*z*t-x*s,z*z*t+c,0,
0,0,0,1
};
glMultMatrixf(m);
}
Where is my mistake?
There is a library glm, that does exactly the same thing as old openGL functions. You can compare your implementation with implementation in glm and figure it out :)
// Returns `m` post-multiplied by a rotation of `angle` (converted with
// radians()) about the normalized direction of `v`.
template <typename T>
GLM_FUNC_QUALIFIER detail::tmat4x4<T> rotate
(
    detail::tmat4x4<T> const & m,
    T const & angle,
    detail::tvec3<T> const & v
)
{
    T theta = radians(angle);
    T cosTheta = cos(theta);
    T sinTheta = sin(theta);

    // Unit rotation axis and the same axis scaled by (1 - cos).
    detail::tvec3<T> unit = normalize(v);
    detail::tvec3<T> scaled = (T(1) - cosTheta) * unit;

    // Upper-left 3x3 of the rotation, filled column by column.
    detail::tmat4x4<T> rot(detail::tmat4x4<T>::null);

    rot[0][0] = cosTheta + scaled[0] * unit[0];
    rot[0][1] = 0 + scaled[0] * unit[1] + sinTheta * unit[2];
    rot[0][2] = 0 + scaled[0] * unit[2] - sinTheta * unit[1];

    rot[1][0] = 0 + scaled[1] * unit[0] - sinTheta * unit[2];
    rot[1][1] = cosTheta + scaled[1] * unit[1];
    rot[1][2] = 0 + scaled[1] * unit[2] + sinTheta * unit[0];

    rot[2][0] = 0 + scaled[2] * unit[0] + sinTheta * unit[1];
    rot[2][1] = 0 + scaled[2] * unit[1] - sinTheta * unit[0];
    rot[2][2] = cosTheta + scaled[2] * unit[2];

    // Combine the first three columns of m with the rotation; the fourth
    // column of m is carried over unchanged.
    detail::tmat4x4<T> combined(detail::tmat4x4<T>::null);
    combined[0] = m[0] * rot[0][0] + m[1] * rot[0][1] + m[2] * rot[0][2];
    combined[1] = m[0] * rot[1][0] + m[1] * rot[1][1] + m[2] * rot[1][2];
    combined[2] = m[0] * rot[2][0] + m[1] * rot[2][1] + m[2] * rot[2][2];
    combined[3] = m[3];
    return combined;
}
The one thing that seems wrong to me in your code is that you don't normalize the axis.

Rotation of a point about the z-axis

I have 3 vectors in 3D space. Let's call them xaxis, yaxis, and zaxis. These vectors are centered about an arbitrary point somewhere in 3D space. I am interested in rotating the xaxis and yaxis vectors about the zaxis vector a number of degrees θ.
For the following code with values being arbitrary and unimportant:
// Inputs of the problem: the three axis vectors, the point they are centered
// about, and the rotation angle theta.
double xaxis[3], yaxis[3], zaxis[3], point[3], theta;
How would I go about rotating xaxis and yaxis about the zaxis by theta degrees?
Future Note: These attempts do not work. See my answer for the proper solution, which was found with the help of BlueRaja-DannyPflughoeft
My attempt at matrix-based rotation:
// Attempt 1: build one 3x3 matrix per coordinate axis, multiply them together
// and apply the product to xaxis and yaxis.
// NOTE(review): zaxis is never read in this fragment, so the result cannot
// depend on the direction of the zaxis vector.
// NOTE(review): cos/sin take radians, while the question describes theta in
// degrees - confirm that theta has been converted.
double rx[3][3];
double ry[3][3];
double rz[3][3];
double r[3][3];
// rx: rotation by theta that leaves the first (x) coordinate unchanged.
rx[0][0] = 1;
rx[0][1] = 0;
rx[0][2] = 0;
rx[1][0] = 0;
rx[1][1] = cos(theta);
rx[1][2] = sin(theta);
rx[2][0] = 0;
rx[2][1] = -1.0 * sin(theta);
rx[2][2] = cos(theta);
// ry: rotation by theta that leaves the second (y) coordinate unchanged.
ry[0][0] = cos(theta);
ry[0][1] = 0;
ry[0][2] = -1.0 * sin(theta);
ry[1][0] = 0;
ry[1][1] = 1;
ry[1][2] = 0;
ry[2][0] = sin(theta);
ry[2][1] = 0;
ry[2][2] = cos(theta);
//No rotation wanted on the zaxis
// With an angle of 0 the entries below evaluate to the identity matrix.
rz[0][0] = cos(0);
rz[0][1] = sin(0);
rz[0][2] = 0;
rz[1][0] = -1.0 * sin(0);
rz[1][1] = cos(0);
rz[1][2] = 0;
rz[2][0] = 0;
rz[2][1] = 0;
rz[2][2] = 1;
vtkMath::Multiply3x3(rx, ry, r); //Multiplies rx by ry and stores into r
vtkMath::Multiply3x3(r, rz, r); //Multiplies r by rz and stores into r
vtkMath::Multiply3x3(r, xaxis, xaxis);//multiplies a 3x3 by a 3x1
vtkMath::Multiply3x3(r, yaxis, yaxis);//multiplies a 3x3 by a 3x1
This attempt only worked when the plane was in the x-y plane:
// Attempt 2: 2D rotation of the first two components of xaxis and yaxis by
// theta. The third component of each vector is left untouched.
double x, y;
// Copy the components first so both outputs are computed from the old values.
x = xaxis[0];
y = xaxis[1];
xaxis[0] = x * cos(theta) - y * sin(theta);
xaxis[1] = x * sin(theta) + y * cos(theta);
x = yaxis[0];
y = yaxis[1];
yaxis[0] = x * cos(theta) - y * sin(theta);
yaxis[1] = x * sin(theta) + y * cos(theta);
Using the axis-angle approach given by BlueRaja-DannyPflughoeft:
// Attempt 3: axis-angle matrix Q, built here from the components of xaxis
// (i.e. xaxis itself is used as the rotation axis).
double c = cos(theta);
double s = sin(theta);
double C = 1.0 - c;
double Q[3][3];
Q[0][0] = xaxis[0] * xaxis[0] * C + c;
Q[0][1] = xaxis[1] * xaxis[0] * C + xaxis[2] * s;
Q[0][2] = xaxis[2] * xaxis[0] * C - xaxis[1] * s;
Q[1][0] = xaxis[1] * xaxis[0] * C - xaxis[2] * s;
Q[1][1] = xaxis[1] * xaxis[1] * C + c;
Q[1][2] = xaxis[2] * xaxis[1] * C + xaxis[0] * s;
// NOTE(review): the pattern of the other entries suggests the first factor
// here should be xaxis[0], not xaxis[1] - confirm.
Q[2][0] = xaxis[1] * xaxis[2] * C + xaxis[1] * s;
Q[2][1] = xaxis[2] * xaxis[1] * C - xaxis[0] * s;
Q[2][2] = xaxis[2] * xaxis[2] * C + c;
// Differences of the mirrored off-diagonal entries of Q.
double x = Q[2][1] - Q[1][2], y = Q[0][2] - Q[2][0], z = Q[1][0] - Q[0][1];
// r is the length of (x, y, z); it is only used by the commented-out lines below.
double r = sqrt(x * x + y * y + z * z);
//xaxis[0] /= r;
//xaxis[1] /= r;
//xaxis[2] /= r;
// xaxis is overwritten with (x, y, z); no vector is multiplied by Q here.
xaxis[0] = x;// ?
xaxis[1] = y;
xaxis[2] = z;
Thanks to BlueRaja - Danny Pflughoeft:
// Axis-angle matrix Q for a rotation by theta, built from the components of
// zaxis, followed by an attempt to apply Q to xaxis and yaxis.
// NOTE(review): presumably zaxis is expected to be unit length - confirm.
double c = cos(theta);
double s = sin(theta);
double C = 1.0 - c;
double Q[3][3];
Q[0][0] = zaxis[0] * zaxis[0] * C + c;
Q[0][1] = zaxis[1] * zaxis[0] * C + zaxis[2] * s;
Q[0][2] = zaxis[2] * zaxis[0] * C - zaxis[1] * s;
Q[1][0] = zaxis[1] * zaxis[0] * C - zaxis[2] * s;
Q[1][1] = zaxis[1] * zaxis[1] * C + c;
Q[1][2] = zaxis[2] * zaxis[1] * C + zaxis[0] * s;
Q[2][0] = zaxis[0] * zaxis[2] * C + zaxis[1] * s;
Q[2][1] = zaxis[2] * zaxis[1] * C - zaxis[0] * s;
Q[2][2] = zaxis[2] * zaxis[2] * C + c;
// NOTE(review): every line below multiplies a single component (e.g. xaxis[0])
// by all three entries of one row of Q, so each component is merely scaled by
// that row's sum. A matrix-vector product would pair row entry j with
// component j of the vector.
xaxis[0] = xaxis[0] * Q[0][0] + xaxis[0] * Q[0][1] + xaxis[0] * Q[0][2];
xaxis[1] = xaxis[1] * Q[1][0] + xaxis[1] * Q[1][1] + xaxis[1] * Q[1][2];
xaxis[2] = xaxis[2] * Q[2][0] + xaxis[2] * Q[2][1] + xaxis[2] * Q[2][2]; // Multiply a 3x3 by 3x1 and store it as the new rotated axis
yaxis[0] = yaxis[0] * Q[0][0] + yaxis[0] * Q[0][1] + yaxis[0] * Q[0][2];
yaxis[1] = yaxis[1] * Q[1][0] + yaxis[1] * Q[1][1] + yaxis[1] * Q[1][2];
yaxis[2] = yaxis[2] * Q[2][0] + yaxis[2] * Q[2][1] + yaxis[2] * Q[2][2]; // Multiply a 3x3 by 3x1 and store it as the new rotated axis
I see that the following matrix multiplication is wrong!
As written above, each line can be factored by a single component, for example xaxis[0]:
// The first statement, and the same statement with xaxis[0] factored out:
// the component is only scaled by the sum of row 0 of Q.
xaxis[0] = xaxis[0] * Q[0][0] + xaxis[0] * Q[0][1] + xaxis[0] * Q[0][2];
xaxis[0] = xaxis[0] * (Q[0][0] + Q[0][1] + Q[0][2]);
This does not look like a matrix multiplication. It should be:
// Matrix-vector product Q * v: output component i is the dot product of row i
// of Q with the whole input vector. The results go into separate arrays
// (xaxis1, yaxis1 - declared outside this fragment), so xaxis and yaxis are
// not modified while they are still being read.
xaxis1[0] = xaxis[0] * Q[0][0] + xaxis[1] * Q[0][1] + xaxis[2] * Q[0][2];
xaxis1[1] = xaxis[0] * Q[1][0] + xaxis[1] * Q[1][1] + xaxis[2] * Q[1][2];
xaxis1[2] = xaxis[0] * Q[2][0] + xaxis[1] * Q[2][1] + xaxis[2] * Q[2][2]; // Multiply a 3x3 by 3x1 and store it as the new rotated axis
yaxis1[0] = yaxis[0] * Q[0][0] + yaxis[1] * Q[0][1] + yaxis[2] * Q[0][2];
yaxis1[1] = yaxis[0] * Q[1][0] + yaxis[1] * Q[1][1] + yaxis[2] * Q[1][2];
yaxis1[2] = yaxis[0] * Q[2][0] + yaxis[1] * Q[2][1] + yaxis[2] * Q[2][2]; // Multiply a 3x3 by 3x1 and store it as the new rotated axis