I would like to use dynamic branching to skip unnecessary instructions. Please consider two functions:
float computeFirst(float s)
{
    [branch] if (abs(s) > 1.0)
        return -1.0;

    // a bunch of instructions
    return acos(s); // acos just for example
}
float computeSecond(float s)
{
    [branch] if (abs(s) > 1.0)
    {
        return -1.0;
    }
    else
    {
        // a bunch of instructions
        return acos(s); // acos just for example
    }
}
Are these functions equivalent? Both contain a dynamic branch, but do they behave the same way, and are the unnecessary instructions actually skipped (when all the pixels in a warp take the same branch)?
Using Shader Playground, I found that these two functions compile differently:
// computeFirst
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
mov r0.y, l(-1.000000)
endif
add r0.z, -|v0.x|, l(1.000000)
sqrt r0.z, r0.z
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
lt r1.y, v0.x, -v0.x
and r1.x, r1.y, r1.x
mad r0.z, r0.w, r0.z, r1.x
movc o0.x, r0.x, r0.y, r0.z
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 17 instruction slots used
// computeSecond
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
mov r0.x, l(-1.000000)
else
add r0.y, -|v0.x|, l(1.000000)
sqrt r0.y, r0.y
mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
mad r0.z, r0.z, |v0.x|, l(-0.212114)
mad r0.z, r0.z, |v0.x|, l(1.570729)
mul r0.w, r0.y, r0.z
mad r0.w, r0.w, l(-2.000000), l(3.141593)
lt r1.x, v0.x, -v0.x
and r0.w, r0.w, r1.x
mad r0.x, r0.z, r0.y, r0.w
endif
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 18 instruction slots used
In computeFirst, the dynamic branch looks useless and never seems to let the unnecessary instructions be skipped. Am I misunderstanding something, or are these two compiled versions equivalent?
It also seems that the computeFirst function is better optimized without the [branch] attribute. I replaced the comment line with actual expressions, and now whether the compiler emits a dynamic branch depends on the number of instructions the branch can skip:
float computeFirst(float s)
{
    if (abs(s) > 1.0)
        return -1.0;

    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0);
    s = acos(s) / acos(-1.0); // if you comment out this line, dynamic branching will not be used
    return s;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 1
ge r0.x, l(1.000000), |v0.x|
if_nz r0.x
add r0.x, -|v0.x|, l(1.000000)
sqrt r0.x, r0.x
mad r0.y, |v0.x|, l(-0.018729), l(0.074261)
mad r0.y, r0.y, |v0.x|, l(-0.212114)
mad r0.y, r0.y, |v0.x|, l(1.570729)
mul r0.z, r0.x, r0.y
mad r0.z, r0.z, l(-2.000000), l(3.141593)
lt r0.w, v0.x, -v0.x
and r0.z, r0.w, r0.z
mad r0.x, r0.y, r0.x, r0.z
mul r0.y, r0.x, l(0.318310)
mad r0.z, -r0.x, l(0.318310), l(1.000000)
sqrt r0.z, r0.z
mad r0.x, r0.x, l(-0.005962), l(0.074261)
mad r0.x, r0.x, r0.y, l(-0.212114)
mad r0.x, r0.x, r0.y, l(1.570729)
mul r0.x, r0.z, r0.x
mul r0.y, r0.x, l(0.318310)
mad r0.z, -r0.x, l(0.318310), l(1.000000)
sqrt r0.z, r0.z
mad r0.x, r0.x, l(-0.005962), l(0.074261)
mad r0.x, r0.x, r0.y, l(-0.212114)
mad r0.x, r0.x, r0.y, l(1.570729)
mul r0.x, r0.z, r0.x
mul r0.y, r0.x, l(0.318310)
mad r0.z, -r0.x, l(0.318310), l(1.000000)
sqrt r0.z, r0.z
mad r0.x, r0.x, l(-0.005962), l(0.074261)
mad r0.x, r0.x, r0.y, l(-0.212114)
mad r0.x, r0.x, r0.y, l(1.570729)
mul r0.x, r0.z, r0.x
mul r0.y, r0.x, l(0.318310)
mad r0.z, -r0.x, l(0.318310), l(1.000000)
sqrt r0.z, r0.z
mad r0.x, r0.x, l(-0.005962), l(0.074261)
mad r0.x, r0.x, r0.y, l(-0.212114)
mad r0.x, r0.x, r0.y, l(1.570729)
mul r0.x, r0.z, r0.x
mul r0.x, r0.x, l(0.318310)
else
mov r0.x, l(-1.000000)
endif
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 47 instruction slots used
Can anyone help me understand why the following HLSL doesn't produce a warning or error? I'm defining a type and returning it, but it doesn't match the function's return type. Is this allowed for any particular reason, or would this be a bug to report (to compiler team)?
I would assume it was a bug, but this seems like a pretty strange and obvious thing to go unnoticed. If not, is there a reason something like this would be allowed? The types are the same size, and possibly could be compatible, but I still wouldn't expect this to work.
The version of my dxc compiler is 1.7.2207.
struct vinBake
{
    float4 Position : ATTRIB0; // local position of the vertex
    float4 Color    : ATTRIB1; // color channels
    float3 TexCoord : ATTRIB2; // UV texture coordinates (z value represents texture index, if used)
    float4 Prop     : ATTRIB3; // enhanced logic properties
    float4 Attr     : ATTRIB4; // enhanced logic attributes
};

struct lerpBlit
{
    float4 ClipPos : SV_POSITION; // projected clip-space screen position of vertex
    float4 Diffuse : COLOR0;      // diffuse color
    float3 Tex     : TEXCOORD0;   // tex coords (x,y) + texture array index (z)
};

struct lerpLine
{
    float4 ClipPos   : SV_POSITION; // projected clip-space screen position of vertex
    float4 Diffuse   : COLOR0;      // diffuse color
    float  Factor    : TEXCOORD0;   // factor value of this position (0->1)
    float  Thickness : TEXCOORD1;   // thickness of line
    float  Feather   : TEXCOORD2;   // falloff of line
};

lerpBlit main(vinBake vin)
{
    lerpLine pin;
    pin.ClipPos   = float4(0, 0, 0, 1);
    pin.Diffuse   = float4(1, 1, 1, 1);
    pin.Factor    = 0;
    pin.Thickness = 0;
    pin.Feather   = 0;
    return pin;
}
What you are observing is implicit casting between structure types, and it is not a bug.
Your two structs lerpBlit and lerpLine are structurally identical: they have the same byte size, and if we flatten both structs, every underlying element can be implicitly cast. The floats Factor, Thickness, and Feather combine into float3 Tex, which is just a composite type (an array of 3 floats), while ClipPos and Diffuse map 1-to-1. Flattened, each struct is 11 floats, so the two types are eligible for implicit casting, as in your case.
More than that, structs can be cast explicitly when the right-hand type is at least as large.
For example:
struct lerpBlit
{
    float4 ClipPos : SV_POSITION; // projected clip-space screen position of vertex
    float4 Diffuse : COLOR0;      // diffuse color
    float3 Tex     : TEXCOORD0;   // tex coords (x,y) + texture array index (z)
};

struct lerpLine
{
    float4 ClipPos   : SV_POSITION; // projected clip-space screen position of vertex
    float4 Diffuse   : COLOR0;      // diffuse color
    float  Factor    : TEXCOORD0;   // factor value of this position (0->1)
    float  Thickness : TEXCOORD1;   // thickness of line
    float  Feather   : TEXCOORD2;   // falloff of line
    float  NewVar    : TEXCOORD3;
};

lerpBlit main()
{
    lerpLine pin;
    pin.ClipPos   = float4(0, 0, 0, 1);
    pin.Diffuse   = float4(1, 1, 1, 1);
    pin.Factor    = 0;
    pin.Thickness = 0;
    pin.Feather   = 0;
    pin.NewVar    = 42;
    // return pin;       // Error! No implicit cast is available any more.
    return (lerpBlit)pin; // That's fine: sizeof(lerpLine) >= sizeof(lerpBlit)
}
Additionally, you can dump the DXIL to check how your structs are converted and what the output is:
dxbc2dxil.exe <file_with_dxc.exe_output> /disasm-dxbc
Part of the output for your case is below:
; Output signature:
;
; Name Index InterpMode DynIdx
; -------------------- ----- ---------------------- ------
; SV_Position 0 noperspective
; COLOR 0 linear
; TEXCOORD 0 linear
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 1.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 1, float 1.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 2, float 1.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 3, float 1.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 2, i32 0, i8 0, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 2, i32 0, i8 1, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 2, i32 0, i8 2, float 0.000000e+00), !dbg !83 ; line:31 col:12 ; StoreOutput(outputSigId,rowIndex,colIndex,value)
As for the reasons why it is this way: I assume it is part of the support for composite types, which lets you write code like this:
struct {float x, y, z;} s;
struct S {float r, g, b;} s2;
struct {float r, g, b, a;} s3;
s2 = s;
s2 = (S)s3;
float3 v;
float4 v2;
v = (float3)v2;
I've created a surface of revolution and need to calculate the per vertex normals to pass into my vertex shader. I'm not sure where I'm going wrong:
for (int i = 0, j = 0; i <= c && j <= c; i += 3, j += 9) {
    GLfloat point[] = {vp1[i], vp1[i + 1], vp1[i + 2], 1.0};
    multiply(scale1, point, result);
    vp[i]     = result[0];
    vp[i + 1] = result[1];
    vp[i + 2] = result[2];

    GLfloat pointC[] = {vp1[j],     vp1[j + 1], vp1[j + 2], 1.0};
    GLfloat pointA[] = {vp1[j + 3], vp1[j + 4], vp1[j + 5], 1.0};
    GLfloat pointB[] = {vp1[j + 6], vp1[j + 7], vp1[j + 8], 1.0};

    GLfloat vec1[] = {pointA[0] - pointC[0], pointA[1] - pointC[1], pointA[2] - pointC[2]};
    GLfloat vec2[] = {pointB[0] - pointC[0], pointB[1] - pointC[1], pointB[2] - pointC[2]};

    crossProd(vec1, vec2, normal);
    float mag = sqrt((normal[0] * normal[0]) + (normal[1] * normal[1]) + (normal[2] * normal[2]));
    normals[j + 0] = normal[0] / mag;
    normals[j + 1] = normal[1] / mag;
    normals[j + 2] = normal[2] / mag;
    normals[j + 3] = normal[0] / mag;
    normals[j + 4] = normal[1] / mag;
    normals[j + 5] = normal[2] / mag;
    normals[j + 6] = normal[0] / mag;
    normals[j + 7] = normal[1] / mag;
    normals[j + 8] = normal[2] / mag;
}
This code is adding my vertices to vp and then trying to calculate the normal. My surface definitely doesn't look right when rendered.
I think I'm generating the surface normal from everything I've read so far, but I'm not sure how to turn my triangle soup into per-vertex normals.
Edit: To a commenter below, three successive indices make 1 point. 9 successive indices make a triangle.
Edit 2: How do I convert this into per-vertex normals?
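For reference, the standard recipe for smooth per-vertex normals is: accumulate each triangle's (area-weighted) face normal onto the three vertices it references, then normalize the per-vertex sums. Here is a minimal C++ sketch assuming an indexed mesh; the names (computeVertexNormals, positions, indices) are illustrative, not from the code above:

#include <cmath>
#include <vector>

void computeVertexNormals(const std::vector<float>& positions, // xyz per vertex
                          const std::vector<unsigned>& indices, // 3 indices per triangle
                          std::vector<float>& normals)          // xyz per vertex (output)
{
    normals.assign(positions.size(), 0.0f); // one zeroed xyz triple per vertex

    for (size_t t = 0; t + 2 < indices.size(); t += 3)
    {
        const unsigned ia = indices[t], ib = indices[t + 1], ic = indices[t + 2];
        const float* A = &positions[3 * ia];
        const float* B = &positions[3 * ib];
        const float* C = &positions[3 * ic];

        // Two edge vectors of the triangle.
        const float e1[3] = { B[0] - A[0], B[1] - A[1], B[2] - A[2] };
        const float e2[3] = { C[0] - A[0], C[1] - A[1], C[2] - A[2] };

        // Unnormalized cross product: its length is proportional to the
        // triangle's area, so bigger faces contribute more to the average.
        const float n[3] = { e1[1] * e2[2] - e1[2] * e2[1],
                             e1[2] * e2[0] - e1[0] * e2[2],
                             e1[0] * e2[1] - e1[1] * e2[0] };

        for (unsigned v : { ia, ib, ic })
            for (int k = 0; k < 3; ++k)
                normals[3 * v + k] += n[k];
    }

    // Normalize the accumulated sums once per vertex.
    for (size_t v = 0; v + 2 < normals.size(); v += 3)
    {
        const float len = std::sqrt(normals[v] * normals[v] +
                                    normals[v + 1] * normals[v + 1] +
                                    normals[v + 2] * normals[v + 2]);
        if (len > 0.0f)
            for (int k = 0; k < 3; ++k)
                normals[v + k] /= len;
    }
}

With duplicated (soup) vertices you would first need to weld vertices that share a position, or use the index buffer you draw with; otherwise the averaging has nothing to accumulate across and you get flat shading.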
I'm trying to improve Henry Thasler's GLSL implementation of double-single arithmetic (from his GLSL Mandelbrot demo) to work reliably on NVIDIA graphics on Linux. I've recently learned that since OpenGL 4.0 (§4.7 "The Precise Qualifier" in the spec), or with the GL_ARB_gpu_shader5 extension (spec), we can use the precise qualifier to make calculations follow the exact sequence of arithmetic operations specified in the GLSL source.
But the following attempt appears to not give any improvement:
#version 330
#extension GL_ARB_gpu_shader5 : require
vec2 ds_add(vec2 dsa, vec2 dsb)
{
    precise float t1 = dsa.x + dsb.x;
    precise float e = t1 - dsa.x;
    precise float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y;
    precise vec2 dsc;
    dsc.x = t1 + t2;
    dsc.y = t2 - (dsc.x - t1);
    return dsc;
}
The result is the same as if there were no precise added. I've checked that the algorithm itself is correct: it works as is (even without precise) on Intel Core i7-4765T built-in graphics, and if I hide some variables to inhibit optimizations, then NVidia also gives the correct results. Here's how I inhibit the optimizations:
#version 330
#define hide(x) ((x)*one)
uniform float one=1;
vec2 ds_add(vec2 dsa, vec2 dsb)
{
    float t1 = dsa.x + dsb.x;
    float e = hide(t1) - dsa.x;
    float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y;
    vec2 dsc;
    dsc.x = t1 + t2;
    dsc.y = t2 - (hide(dsc.x) - t1);
    return dsc;
}
So, apparently, I'm using the precise qualifier incorrectly. But what exactly is wrong here?
For reference, I'm using NVidia GeForce GTX 750Ti with binary nvidia driver 390.116. Here's the full C++ test:
#include <cmath>
#include <cassert> // for the assert() checks below
#include <vector>
#include <string>
#include <limits>
#include <iomanip>
#include <iostream>
// glad.h is generated by the following command:
// glad --out-path=. --generator=c --omit-khrplatform --api="gl=3.3" --profile=core --extensions=
#include "glad/glad.h"
#include <GL/freeglut.h>
#include <glm/glm.hpp>
using glm::vec4;
GLuint vao, vbo;
GLuint texFBO;
GLuint program;
GLuint fbo;
int width=1, height=2;
void printShaderOutput(int texW, int texH)
{
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, texFBO);
    std::vector<vec4> data(texW*texH);
    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, data.data());
    std::cout << "a,b,sum,relError(sum),note\n";
    for(int i=0;i<width;++i)
    {
        const auto a=double(data[i+width*0].x)+double(data[i+width*0].y);
        const auto b=double(data[i+width*0].z)+double(data[i+width*0].w);
        const auto sum=double(data[i+width*1].x)+double(data[i+width*1].y);
        const auto trueSum=a+b;
        const auto sumErr=(sum-trueSum)/trueSum;
        std::cout << std::setprecision(std::numeric_limits<double>::max_digits10)
                  << a << ',' << b << ','
                  << sum << ','
                  << std::setprecision(3)
                  << sumErr << ','
                  << (std::abs(sumErr)>1e-14 ? "WARN" : "OK")
                  << '\n';
    }
    std::cout.flush();
}
GLuint makeShader(GLenum type, std::string const& srcStr)
{
    const auto shader=glCreateShader(type);
    const GLint srcLen=srcStr.size();
    const GLchar*const src=srcStr.c_str();
    glShaderSource(shader, 1, &src, &srcLen);
    glCompileShader(shader);
    GLint status=-1;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
    assert(glGetError()==GL_NO_ERROR);
    assert(status);
    return shader;
}
void loadShaders()
{
    program=glCreateProgram();
    const auto vertexShader=makeShader(GL_VERTEX_SHADER, 1+R"(
#version 330
in vec4 vertex;
void main() { gl_Position=vertex; }
)");
    glAttachShader(program, vertexShader);
    const auto fragmentShader=makeShader(GL_FRAGMENT_SHADER, 1+R"(
#version 330
#extension GL_ARB_gpu_shader5 : require
vec2 ds_add(vec2 dsa, vec2 dsb)
{
    precise float t1 = dsa.x + dsb.x;
    precise float e = t1 - dsa.x;
    precise float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y;
    precise vec2 dsc;
    dsc.x = t1 + t2;
    dsc.y = t2 - (dsc.x - t1);
    return dsc;
}
uniform vec2 a, b;
out vec4 color;
void main()
{
    if(gl_FragCoord.y<1) // first row
        color=vec4(a,b);
    else if(gl_FragCoord.y<2) // second row
        color=vec4(ds_add(a,b),0,0);
}
)");
    glAttachShader(program, fragmentShader);
    glLinkProgram(program);
    GLint status=0;
    glGetProgramiv(program, GL_LINK_STATUS, &status);
    assert(glGetError()==GL_NO_ERROR);
    assert(status);
    glDetachShader(program, fragmentShader);
    glDeleteShader(fragmentShader);
    glDetachShader(program, vertexShader);
    glDeleteShader(vertexShader);
}
void setupBuffers()
{
    glGenVertexArrays(1, &vao);
    glBindVertexArray(vao);
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    const GLfloat vertices[]=
    {
        -1, -1,
         1, -1,
        -1,  1,
         1,  1,
    };
    glBufferData(GL_ARRAY_BUFFER, sizeof vertices, vertices, GL_STATIC_DRAW);
    constexpr GLuint attribIndex=0;
    constexpr int coordsPerVertex=2;
    glVertexAttribPointer(attribIndex, coordsPerVertex, GL_FLOAT, false, 0, 0);
    glEnableVertexAttribArray(attribIndex);
    glBindVertexArray(0);
}
bool init()
{
    if(!gladLoadGL())
    {
        std::cerr << "Failed to initialize GLAD\n";
        return false;
    }
    if(!GLAD_GL_VERSION_3_3)
    {
        std::cerr << "OpenGL 3.3 not supported\n";
        return false;
    }
    glGenTextures(1, &texFBO);
    glGenFramebuffers(1,&fbo);
    loadShaders();
    setupBuffers();
    glViewport(0,0,width,height);
    glBindTexture(GL_TEXTURE_2D,texFBO);
    glTexImage2D(GL_TEXTURE_2D,0,GL_RGBA32F,width,height,0,GL_RGBA,GL_UNSIGNED_BYTE,nullptr);
    glBindTexture(GL_TEXTURE_2D,0);
    glBindFramebuffer(GL_FRAMEBUFFER,fbo);
    glFramebufferTexture2D(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_TEXTURE_2D,texFBO,0);
    const auto status=glCheckFramebufferStatus(GL_FRAMEBUFFER);
    assert(status==GL_FRAMEBUFFER_COMPLETE);
    glBindFramebuffer(GL_FRAMEBUFFER,0);
    return true;
}
void display()
{
    const static bool inited=init();
    if(!inited) std::exit(1);
    glBindFramebuffer(GL_FRAMEBUFFER,fbo);
    glUseProgram(program);
#define SPLIT_DOUBLE_TO_FLOATS(x) GLfloat(x),GLfloat(x-GLfloat(x))
    glUniform2f(glGetUniformLocation(program,"a"),SPLIT_DOUBLE_TO_FLOATS(3.1415926535897932));
    glUniform2f(glGetUniformLocation(program,"b"),SPLIT_DOUBLE_TO_FLOATS(2.7182818284590452));
    glUniform1f(glGetUniformLocation(program,"rtWidth"),width);
    glBindVertexArray(vao);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
    glBindVertexArray(0);
    printShaderOutput(width, height);
    std::exit(0);
    glFinish();
}
int main(int argc, char** argv)
{
    glutInitContextVersion(3,3);
    glutInitContextProfile(GLUT_CORE_PROFILE);
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_RGB);
    glutInitWindowSize(width, height);
    glutCreateWindow("Test");
    glutDisplayFunc(display);
    glutMainLoop();
}
I've been able to extract the NVfp5.0 assembly from the GLSL program binaries in the different cases:
Naïve case without hide and without precise:
!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[2] = { program.local[0..1] };
TEMP R0;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[0];
MOV.F result_color0.zw, c[1].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F R0.y, -c[0].x, c[0].x;
ADD.F R0.x, -c[1], c[1];
ADD.F R0.x, R0, R0.y;
ADD.F R0.x, R0, c[0].y;
ADD.F R0.y, R0.x, c[1];
ADD.F R0.x, c[0], c[1];
ADD.F result_color0.x, R0, R0.y;
ADD.F result_color0.y, R0, -R0;
MOV.F result_color0.zw, {0, 0, 0, 0}.x;
ENDIF;
ENDIF;
END
The case with precise (notice that nothing changes except .PREC suffix in the "instructions"):
!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[2] = { program.local[0..1] };
TEMP R0;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[0];
MOV.F result_color0.zw, c[1].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F.PREC R0.y, -c[0].x, c[0].x;
ADD.F.PREC R0.x, -c[1], c[1];
ADD.F.PREC R0.x, R0, R0.y;
ADD.F.PREC R0.x, R0, c[0].y;
ADD.F.PREC R0.y, R0.x, c[1];
ADD.F.PREC R0.x, c[0], c[1];
ADD.F.PREC result_color0.x, R0, R0.y;
ADD.F.PREC result_color0.y, R0, -R0;
MOV.F result_color0.zw, {0, 0, 0, 0}.x;
ENDIF;
ENDIF;
END
The case with hide, which does work, and obviously has a different sequence of arithmetic operations:
!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[3] = { program.local[0..2] };
TEMP R0, R1;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[1];
MOV.F result_color0.zw, c[2].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F R0.x, c[1], c[2];
MAD.F R0.y, R0.x, c[0].x, -c[1].x;
ADD.F R0.z, R0.x, -R0.y;
ADD.F R0.z, -R0, c[1].x;
ADD.F R0.y, -R0, c[2].x;
ADD.F R0.y, R0, R0.z;
ADD.F R0.y, R0, c[1];
ADD.F R0.y, R0, c[2];
ADD.F R1.x, R0, R0.y;
MAD.F R0.x, R1, c[0], -R0;
MOV.F R1.zw, {0, 0, 0, 0}.x;
ADD.F R1.y, R0, -R0.x;
MOV.F result_color0, R1;
ENDIF;
ENDIF;
END
I've never used precise myself, though you may benefit from learning OpenCL or CUDA here.
In any case, your GLSL version is 3.30, which is tied to OpenGL 3.3. The precise qualifier is available there through an extension, but I would always prefer a built-in feature of OpenGL when one exists.
The extension may not be implemented in the same manner as the core feature, so I suggest trying at least GLSL version 4.0, ideally the latest OpenGL / GLSL version.
Sometimes these old extensions can have regressions on newer GPUs if no one is using them.
GPU compilers tend to be liberal with optimizations. You may benefit from inspecting the output of the compiled shader; there may be some way to view the PTX assembly the Nvidia compiler produces for GLSL. With CUDA, you can definitely inspect the assembly output to ensure the operations are not being re-ordered by the compiler.
The spec mentions MAD contraction as the main reason for the qualifier: it forces the compiler not to fuse a multiply and an add into a single MAD instruction. Perhaps little testing was done on pure addition / subtraction with the precise qualifier.
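As a CPU-side illustration of what MAD contraction does to rounding (a hedged C++ sketch, not GLSL; precise exists to forbid exactly this kind of rewrite):

#include <cmath>
#include <cstdio>

int main()
{
    const double a = 1.0 + 1e-8, b = 1.0 - 1e-8, c = -1.0;
    volatile double p = a * b;                // force a*b to round to double first
    const double unfused = p + c;             // two roundings: (a*b), then +c
    const double fused   = std::fma(a, b, c); // one rounding of the exact a*b+c
    std::printf("unfused = %.17g\nfused   = %.17g\n", unfused, fused);
    // The two results differ in the last bits; a compiler that freely
    // contracts a*b+c into an fma silently switches between them.
}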
If hide solves it for you, it's probably best to just call it a day; I doubt the precise qualifier has been thoroughly exercised on the GLSL side. I highly recommend CUDA or OpenCL for this; you can use CL-GL interop if you want to display the texture quickly as well, which isn't terribly painful.
The precise qualifier ensures there is no re-ordering of operations, but it says nothing about optimizations that don't affect the ordering. It seems like AMD simply turns off optimizations when it is used. It's still possible that Nvidia applies optimizations that affect your result but are unrelated to the order of operations, coming instead from specific optimizations applied to addition.
precise float t1 = dsa.x + dsb.x;
precise float e = t1 - dsa.x;
This will probably be folded so that e is computed as simply dsb.x, since in exact arithmetic e = (dsa.x + dsb.x) - dsa.x = dsb.x. The compiler may still be applying optimizations that don't affect the order of operations, as that's all the spec guarantees. I can't think of anything other than re-ordering that would affect this result, but I'm no expert here.
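A hedged C++ sketch of that single step shows why the substitution destroys the algorithm: in float arithmetic e is not dsb.x, it is the part of dsb.x that was actually absorbed into t1.

#include <cstdio>

int main()
{
    // Illustrative values: b is far below the last bit of a.
    const float a = 1.0f, b = 1e-8f;
    const float t1 = a + b;  // rounds to exactly 1.0f; b is lost
    const float e  = t1 - a; // 0.0f in float, although "algebraically" it is b
    std::printf("t1 = %g, e = %g, b = %g\n", t1, e, b);
    // If the compiler folds e to b, the recovered rounding error is wrong.
}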
Another thing to note, based on my cursory reading of the spec, is that the result of ds_add may need to be stored into a precise variable as well for the computation to be precise. The function may be inlined only on Nvidia (their optimizer is more aggressive, at least historically), so I imagine the compiler performs the inlining, and if the result is then stored into a non-precise variable, all the existing precise qualifiers are ignored.
Nothing is wrong with your shader. The ds_add() code just doesn't contain any operations that could be merged at compile time. Usually adds merge with multiplies/divides, but your code has add operations only.
Update:
In this case all of your variables are stored in GPU registers during the calculation. The order of operations on registers doesn't depend on the code or the compiler; it doesn't even depend only on the hardware, but on which operations are currently running in the GPU.
The precision of floating-point operations between registers isn't strictly 32 bits; it is usually higher. The actual precision for GPUs is a trade secret. The actual precision of the x86 FPU is 80 bits, even though the variables are stored in 32-bit memory.
However, GPUs are not designed for very precise calculation. The algorithm's author knows this and implements doubles through pairs of 32-bit floats. If you need to go further in precision, you would have to implement a long double through quads of 32-bit floats. A simple precise doesn't help.
This follows a problem I came across here, where the Entity::draw() call would not display anything because the vertex shader values returned 0 when multiplied with the world-view matrix.
The problem was narrowed down to a faulty constant buffer input. However, after checking the values, I can't seem to pinpoint the problem. I pre-multiply the World, View, and Projection matrices:
mWorld = XMMatrixIdentity();
mView = XMMatrixLookAtLH(Eye, At, Up);
mProjection = XMMatrixPerspectiveFovLH(XM_PIDIV2, 1.0, 0.0f, 1000.0f);
mWVP = mWorld*mView*mProjection;
mWVP
-0.999999940, 0.000000000, 0.000000000, 0.000000000
0.000000000, 0.999999940, 0.000000000, 0.000000000
0.000000000, 0.000000000, -1.00000000, -1.00000000
0.000000000, 0.000000000, 5.00000000, 5.00000000
mWVP enters the constant buffer after being transposed:
WorldCB.mWorldVP = XMMatrixTranspose(mWVP);
DeviceContext->UpdateSubresource(MatrixBuffer, 0, NULL, &WorldCB, 0, 0);
DeviceContext->VSSetConstantBuffers(0, 1, &MatrixBuffer);
XMMatrixTranspose(mWVP);
-0.999999940, 0.000000000, 0.000000000, 0.000000000
0.000000000, 0.999999940, 0.000000000, 0.000000000
0.000000000, 0.000000000, -1.00000000, 5.00000000
0.000000000, 0.000000000, -1.00000000, 5.00000000
Which looks OK, at least to me. Next my shader starts doing its thing, but here's where things get funky. Checking the disassembly shows that with:
output.position = mul(position, WVP);
Vertex Shader:
00000000 dp4 o0.x, v0.xyzw, cb0[0].xyzw
00000001 dp4 o0.y, v0.xyzw, cb0[1].xyzw
00000002 dp4 o0.z, v0.xyzw, cb0[2].xyzw
00000003 dp4 o0.w, v0.xyzw, cb0[3].xyzw
00000004 mov o1.xyzw, v1.xyzw
each of the dp4 multiplications returns 0. If I instead write output.position = position;, the values are correct and the box displays, but without the world transformation.
The full shader file is below:
cbuffer ConstantBuffer : register(b0)
{
    matrix WVP;
}

struct VOut
{
    float4 position : SV_POSITION;
    float4 color : COLOR;
};

VOut VShader(float4 position : POSITION, float4 color : COLOR)
{
    VOut output;
    output.position = mul(position, WVP); // position;
    output.color = color;
    return output;
}

float4 PShader(float4 position : SV_POSITION, float4 color : COLOR) : SV_TARGET
{
    return color;
}
Edit: I also noted that the transpose of the world matrix comes out as zero:
ObjectSpace = m_Scale*m_Rotation*m_Translate;
mWVP = ObjectSpace*direct3D.mView*direct3D.mProjection;
LocalWorld.mWorldVP = XMMatrixTranspose(wWVP);
XMMatrixTranspose(wWVP) comes out:
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
And is likely the problem. Any guesses as to why the transpose of a matrix would equal 0?
The near plane of the perspective projection must be some value larger than zero. If it is zero, then the near plane is exactly where the camera is located, and everything in the scene converges to a single point.
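For example, a hedged sketch of the fix (0.01f is an arbitrary small positive value; pick one that suits your scene's depth range):

// Near plane strictly greater than zero; 0.01f is illustrative.
mProjection = XMMatrixPerspectiveFovLH(XM_PIDIV2, 1.0, 0.01f, 1000.0f);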
I'm getting strange results from the XMVector3AngleBetweenVectors function. Consider this code:
float angle = XMConvertToDegrees(XMVectorGetX(
    XMVector3AngleBetweenVectors(GMathFV(XMFLOAT3(0.0f, 100.0f, 0.0f)),
                                 GMathFV(XMFLOAT3(0.0f, 200.0f, 0.0f)))));
It computes the angle between two 3D vectors, described by XMFLOAT3 structures. GMathFV is a user-defined function which converts XMFLOAT3 to XMVECTOR as follows:
inline XMVECTOR GMathFV(XMFLOAT3& val)
{
    return XMLoadFloat3(&val);
}
Everything else is from the DirectXMath.h library. Here everything is fine and the resulting angle is 0.00000, just as expected.
But for other vectors with a negative y value, for example:
float angle = XMConvertToDegrees(XMVectorGetX(
    XMVector3AngleBetweenVectors(GMathFV(XMFLOAT3(0.0f, -100.0f, 0.0f)),
                                 GMathFV(XMFLOAT3(0.0f, -99.0f, 0.0f)))));
Result is 0.0197823402, which I can hardly call a zero angle.
Please someone help me figure out the problem. Is it negative number precision, too close vector coordinates or maybe something else?
UPD: Amazingly, it gives 0.0197823402 for a(0.0f, 100.0f, 0.0f) × b(0.0f, 99.0f, 0.0f), but 0.000000 for a(0.0f, 101.0f, 0.0f) × b(0.0f, 100.0f, 0.0f).
DirectXMath is designed for 32-bit floating-point math. You're seeing floating-point error escalation. Here's the definition of XMVector3AngleBetweenVectors:
inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2)
{
    XMVECTOR L1 = XMVector3ReciprocalLength(V1);
    XMVECTOR L2 = XMVector3ReciprocalLength(V2);
    XMVECTOR Dot = XMVector3Dot(V1, V2);
    L1 = XMVectorMultiply(L1, L2);
    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACos(CosAngle);
}
In your first example CosAngle equals 1.000000000
In your second example CosAngle equals 0.999999940
XMVectorACos(0.999999940) = 0.000345266977
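For scale: near x = 1 we have acos(x) ≈ sqrt(2(1 − x)), so a cosine just one float ULP below 1 (about 6e-8 away) already maps to an angle of roughly 3.5e-4 radians. A hedged standalone C++ check, independent of DirectXMath:

#include <cmath>
#include <cstdio>

int main()
{
    const float cosAngle = 0.999999940f; // one ULP below 1.0f
    const float rad = std::acos(cosAngle);
    // Prints roughly 0.000346 rad, i.e. about 0.0198 degrees,
    // matching the "non-zero" angle observed in the question.
    std::printf("%.9g rad = %.9g deg\n", rad, rad * 57.29577951f);
}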
The polynomial approximation inside ACos adds a little more error on top of that. In general you should avoid trigonometric inverses whenever possible: they are slow and noisy. Here's the definition so you can get an idea of its size.
inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V)
{
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue); // |V|
    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|)
    // Compute polynomial approximation
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 t0 = _mm_mul_ps(vConstants, x);
    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
}
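If you need small angles to come out accurately, a hedged alternative (my suggestion, not part of DirectXMath) is atan2 of the cross-product length against the dot product, which is well conditioned near both 0 and π:

#include <cmath>
#include <DirectXMath.h>
using namespace DirectX;

// atan2(|a x b|, a . b): the common |a||b| factor cancels inside atan2,
// so no normalization or clamping is needed.
inline float AngleBetweenVectors(FXMVECTOR a, FXMVECTOR b)
{
    const XMVECTOR cross = XMVector3Cross(a, b);
    const float s = XMVectorGetX(XMVector3Length(cross)); // |a x b| = |a||b| sin(angle)
    const float c = XMVectorGetX(XMVector3Dot(a, b));     // a . b   = |a||b| cos(angle)
    return atan2f(s, c); // radians
}

For the (0, -100, 0) vs (0, -99, 0) case above, the cross product is exactly zero, so this returns exactly 0.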