I want to calculate the per-row minimum of a matrix of floats in GLSL in the browser, of about 1000 rows, 4000 columns.
Building on previous answers (see this) I used a for loop. However I would like to use a uniform for the upper bound, which is not possible in WebGL GLSL ES 1.0. This is because the length of the row is defined after the fragment shader, and I'd like to avoid messing with #DEFINEs.
So I found that this workaround - a fixed loop length with an if/break controlled by a uniform - works OK:
// Compile-time upper bound for the loop counter. The real number of
// iterations is decided at run time by the early `break` below.
#define MAX_INT 65536

// Scans one row of the data (row index taken from gl_FragCoord.y) and keeps
// the larger of the running value `m` and each element read via getPoint().
// NOTE(review): the surrounding text asks for a per-row *minimum*, while this
// keeps the larger value starting from 0.0 - confirm which is intended.
void main(void) {
    float m = 0.0;
    float k = -1.0;
    int r = 40; // row length
    for (int i = 0; i < MAX_INT; ++i) {
        // Stop before reading element `r`: valid columns are 0 .. r-1.
        // Testing after the body would also read the first element of the
        // following row.
        if (i >= r) { break; }
        float ndx = floor(gl_FragCoord.y) * float(r) + float(i);
        float a = getPoint(values, dimensions, ndx).x;
        m = m > a ? m : a;
    }
}
Now the question: does this have big drawbacks? Is there something weird I am doing and I'm missing something?
I believe, but am not entirely sure, that the only risk is some driver/gpu will still make the long loop.
As an example imagine this loop
uniform int limit;

// Sums up to three texels, leaving the loop once `i` reaches `limit`
// (the texel at index `limit` is still included).
void main() {
    // GLSL ES 1.0 has no implicit int -> float conversion, so every literal
    // that meets a float is written as a float literal.
    float sum = 0.0;
    for (int i = 0; i < 3; ++i) {
        sum += texture2D(tex, vec2(float(i) / 3.0, 0.0)).r;
        if (i >= limit) {
            break;
        }
    }
    gl_FragColor = vec4(sum);
}
that can be re-written by the driver like this
uniform int limit;

// Branch-free form of the summing loop: every iteration samples the texture,
// and step() zeroes the contribution of indices greater than `limit`.
void main() {
    // GLSL ES 1.0 has no implicit int -> float conversion, so every literal
    // that meets a float is written as a float literal.
    float sum = 0.0;
    for (int i = 0; i < 3; ++i) {
        float temp = texture2D(tex, vec2(float(i) / 3.0, 0.0)).r;
        // step(edge, x) is 1.0 when x >= edge, i.e. when limit >= i.
        sum += temp * step(float(i), float(limit));
    }
    gl_FragColor = vec4(sum);
}
no branches. I don't know if any such drivers/gpus still exist that have no conditionals but the idea of requiring a const integer expression for a loop is so the branches can be removed and/or the loop un-rolled at compile time if the driver/GPU decided to do either.
uniform int limit;

// Fully unrolled, branch-free form of the summing loop: one statement per
// index, each masked by step() so indices greater than `limit` add nothing.
void main() {
    // GLSL ES 1.0 has no implicit int -> float conversion, so every literal
    // that meets a float is written as a float literal.
    float sum = 0.0;
    sum += step(float(0), float(limit)) * texture2D(tex, vec2(float(0) / 3.0, 0.0)).r;
    sum += step(float(1), float(limit)) * texture2D(tex, vec2(float(1) / 3.0, 0.0)).r;
    sum += step(float(2), float(limit)) * texture2D(tex, vec2(float(2) / 3.0, 0.0)).r;
    gl_FragColor = vec4(sum);
}
Also, as an aside, the specific example you have above doesn't output anything so most drivers would turn the entire shader into a no-op.
I'm trying to fix this shader. The effect is a radial blur around a point position, passed from the CPU in an array. The calculation works fine for each point and generates the effect, but as you can see in this picture, for each loop iteration the shader keeps generating samples, and I don't know how to avoid it. I only want the blur for each point in the array.
#version 150
// Texture coordinate of the current fragment (stage input).
in vec2 varyingtexcoord;
// Image that halo() samples along each blur direction.
uniform sampler2DRect tex0;
// Not referenced anywhere in the visible shader.
uniform int size;
// Final scale applied to the accumulated color in main().
float exposure = 0.79;
// Factor by which each successive sample's contribution shrinks in halo().
float decay = 0.9;
// Scales the per-sample step length in halo().
float density = .9;
// Scales every sample's contribution in halo().
float weight = .1;
// Divisor used for the step length in halo(); the loop count itself comes
// from MAX_SAMPLES below.
int samples = 25;
// Fragment output.
out vec4 fragColor;
// Number of samples halo() takes along one direction.
const int MAX_SAMPLES = 25;
// Number of entries in ligthPos.
const int N = 3;
// Blur center positions; main() calls halo() once per entry.
uniform vec2 ligthPos [N];
// Not referenced anywhere in the visible shader.
int a = 1;
// Radial blur of tex0 toward `pos`: starts from the texel at the fragment's
// own coordinate and adds MAX_SAMPLES further texels taken along the line
// toward `pos`, each one scaled by `weight` and by a falloff that is
// multiplied by `decay` after every step.
vec4 halo(vec2 pos){
    vec2 uv = varyingtexcoord;
    // Offset applied per sample.
    vec2 stepUV = (uv - pos.xy) * (1.0 / float(samples) * density);
    // The unblurred texel is part of the returned color.
    vec4 result = texture(tex0, uv);
    float falloff = 1.2;
    for (int n = 0; n < MAX_SAMPLES; n++) {
        uv -= stepUV;
        result += texture(tex0, uv) * (falloff * weight);
        falloff *= decay;
    }
    return result;
}
// Adds up the halo() result for every entry of ligthPos and writes the sum,
// scaled by `exposure`, to fragColor.
void main(){
    vec4 total = vec4(0.0);
    for (int light = 0; light < N; light++) {
        total += halo(ligthPos[light]);
    }
    fragColor = total * exposure;
}
this is what happen:
NOTE: THIS QUESTION HAS BEEN DRASTICALLY EDITED FROM ITS ORIGINAL FORM
I am attempting to create a logarithmic raytracer by implementing an oct tree data structure combined with voxelization to achieve fast ray tracing.
Currently I am having issues with the ray collision detection.
The expected output should be the voxelized stanford dragon with its normal map.
Currently the issue is that some regions are transparent:
The full dragon:
Transparent regions:
From these images it should be clear that the geometry is correct, but the collision checks are wrong.
There are 2 fragment shaders involved in this process:
The voxelizer fragment shader:
#version 430
// Interpolated inputs. main() stores f_pos (scaled by cube_dim) and f_norm
// in the voxel record; f_uv is not read in the visible code.
in vec3 f_pos;
in vec3 f_norm;
in vec2 f_uv;
// Declared output; not written in the visible main().
out vec4 f_color;
// One record per rasterized fragment, appended to `voxels`.
struct Voxel
{
vec4 position;
vec4 normal;
vec4 color;
};
// Octree node: one slot per octant. main() treats a slot value <= 0 as
// "no child yet" (0 = empty, -1 = claimed while its index is being
// allocated); a positive value is an index into `tree`, or into `voxels`
// for the slot written at the last level.
struct Node
{
int children[8];
};
// Voxel storage (leaf data).
layout(std430, binding = 0) buffer voxel_buffer
{
Voxel voxels[];
};
// Counter that hands out slots in `voxels` via atomicAdd.
layout(std430, binding = 1) buffer buffer_index
{
uint index;
};
// Octree node storage.
layout(std430, binding = 2) buffer tree_buffer
{
Node tree[];
};
// Counter that hands out slots in `tree` via atomicAdd.
layout(std430, binding = 3) buffer tree_index
{
uint t_index;
};
// Second declared output; not written in the visible main().
out vec4 fragment_color;
// main() uses log2(voxel_resolution) as the number of tree levels.
uniform int voxel_resolution;
// Scale applied to f_pos; the root corner is vec3(-cube_dim).
uniform int cube_dim;
// Returns the octant number (0..7) of `position`, which is expected to be
// given relative to the node's minimum corner. Each axis contributes one bit
// when its coordinate exceeds cube_dim / 2^level: x -> 4, y -> 2, z -> 1.
int getVIndex(vec3 position, int level)
{
    float extent = cube_dim / pow(2,level);
    int octant = 0;
    if (position.x > extent) { octant += 4; }
    if (position.y > extent) { octant += 2; }
    if (position.z > extent) { octant += 1; }
    return octant;
}
// Records the current fragment as a voxel and links it into the octree:
// walks down from the root, creating missing child nodes on the way, and
// stores the voxel's index in the slot reached at the last level.
void main()
{
// Reserve a unique slot in `voxels` for this fragment.
uint m_index = atomicAdd(index, 1);
voxels[m_index].position = vec4(f_pos*cube_dim,1);
voxels[m_index].normal = vec4(f_norm,1);
// The color field also receives the normal.
voxels[m_index].color = vec4(f_norm,1);
int max_level = int(log2(voxel_resolution));
// Start at the root node, whose minimum corner is (-cube_dim) on all axes.
int node = 0;
vec3 corner = vec3(-cube_dim);
int child;
for(int level=0; level<max_level-1; level++)
{
// Edge length of a child cell at this level.
float size = cube_dim / pow(2,level);
// Minimum corners of the 8 children, ordered like getVIndex's result
// (x selects +4, y selects +2, z selects +1).
vec3 corners[] =
{corner, corner+vec3(0,0,size),
corner+vec3(0,size,0), corner+vec3(0,size,size),
corner+vec3(size,0,0), corner+vec3(size,0,size),
corner+vec3(size,size,0), corner+vec3(size,size,size)};
vec3 offsetPos = (vec3(voxels[m_index].position));
// Octant of the voxel position relative to this node's corner.
child = getVIndex(offsetPos-corner, level);
// Spin (at most 500 rounds) until the child slot holds a positive index.
int mrun = 500;
while ((tree[node].children[child] <= 0) && (mrun > 0)){
mrun--;
// Only the invocation that swaps the slot from 0 to -1 allocates the node.
if( (atomicCompSwap( tree[node].children[child] , 0 , -1) == 0 ))
{
// NOTE(review): this is a plain store, not an atomic one; nothing in this
// block establishes when other invocations see it - confirm.
tree[node].children[child] = int(atomicAdd(t_index, 1));
}
}
// The slot never became positive within the retry budget: drop the fragment.
if(mrun < 1)
discard;
// On the last iteration do not descend, so `node`/`child` still name the
// slot that receives the voxel index below.
if(level==max_level-2)
break;
node = tree[node].children[child];
corner = corners[child];
}
// Overwrites the slot reached above with this voxel's index in `voxels`.
tree[node].children[child] = int(m_index);
}
I understand the logic may not be clear so let me explain:
We start with a 3D position voxels[m_index].position = vec4(f_pos*cube_dim,1); and we know there is a cube with dimensions (-cube_dim,-cube_dim,-cube_dim) to (cube_dim,cube_dim,cube_dim).
So a cube whose diagonals intersect at the origin with side length of 2*cube_dim. That has been divided into multiple little cubes with side length 2*cube_dim/voxel_resolution. Basically this is just a cube subdivided n times to make a cartesian grid.
Using this coordinate we start at the big cube, subdividing it into 8 equal-sized subspaces and detecting which of these subspaces contains the coordinate.
We do this until we find the smallest box containing the position.
The raytracer
#version 430
// Per-fragment coordinate; main() uses its x and y to build the ray.
in vec2 f_coord;
// Fragment output written by main().
out vec4 fragment_color;
// Leaf record; main() reads `normal` when a leaf is reached.
struct Voxel
{
vec4 position;
vec4 normal;
vec4 color;
};
// Octree node: one slot per octant. The traversal only follows slots whose
// value is greater than 0.
struct Node
{
int children[8];
};
// Voxel storage (leaf data).
layout(std430, binding = 0) buffer voxel_buffer
{
Voxel voxels[];
};
// Not referenced in the visible raytracer code.
layout(std430, binding = 1) buffer buffer_index
{
uint index;
};
// Octree node storage.
layout(std430, binding = 2) buffer tree_buffer
{
Node tree[];
};
// Not referenced in the visible raytracer code.
layout(std430, binding = 3) buffer tree_index
{
uint t_index;
};
// Added to the ray origin in main().
uniform vec3 camera_pos;
// main() divides the ray's y component by this value.
uniform float aspect_ratio;
// Root corner is vec3(-cube_dim); declared as float in this shader.
uniform float cube_dim;
// main() uses log2(voxel_resolution) as the number of tree levels.
uniform int voxel_resolution;
// Ray/plane intersection. Returns the parameter t such that
// origin + t * ray lies on the plane through pPoint with normal pNormal.
// The normal is normalized first; no guard is applied when the ray is
// parallel to the plane (zero denominator).
float planeIntersection(vec3 origin, vec3 ray, vec3 pNormal, vec3 pPoint)
{
    vec3 n = normalize(pNormal);
    float numerator = dot(pPoint, n) - dot(n, origin);
    float denominator = dot(ray, n);
    return numerator / denominator;
}
#define EPSILON 0.001
// True when `position` lies inside the axis-aligned cube whose minimum
// corner is `corner` and whose edge length is `size`, with a tolerance of
// EPSILON on every face.
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    vec3 local = position - corner;
    for (int axis = 0; axis < 3; axis++)
    {
        // Written as negated "inside" tests so a NaN coordinate is rejected.
        if (!(local[axis] > -EPSILON)) { return false; }
        if (!(local[axis] < size+EPSILON)) { return false; }
    }
    return true;
}
// Ray/cube intersection for the axis-aligned cube with minimum corner
// `corner0` and edge length `size`. Returns the smallest non-negative ray
// parameter at which the ray meets one of the six face planes at a point
// accepted by inBoxBounds(), or the value of 1.f/0.f when there is none.
// `dir` is normalized first, so the result is in units of the unit direction.
// NOTE(review): callers test the "no hit" result with isinf(), which relies
// on 1.f/0.f evaluating to +infinity on the target driver - confirm.
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
dir = normalize(dir);
// Maximum corner of the cube.
vec3 corner1 = corner0 + vec3(size,size,size);
// Outward face normals: the first three faces pass through corner0, the
// last three through corner1.
vec3 normals[6] =
{ vec3(-1,0,0), vec3(0,-1,0), vec3(0,0,-1), vec3(1,0,0), vec3(0,1,0), vec3(0,0,1) };
float coeffs[6];
for(uint i=0; i<3; i++)
coeffs[i] = planeIntersection(origin, dir, normals[i], corner0);
for(uint i=3; i<6; i++)
coeffs[i] = planeIntersection(origin, dir, normals[i], corner1);
float t = 1.f/0.f;
for(uint i=0; i<6; i++){
// Plane hits behind the ray origin are discarded.
coeffs[i] = coeffs[i] < 0 ? 1.f/0.f : coeffs[i];
// Keep the nearest plane hit whose point actually lies on the cube.
t = inBoxBounds(corner0,size,origin+dir*coeffs[i]) ? min(coeffs[i],t) : t;
}
return t;
}
// Sorts `elements` into ascending order and applies the same permutation to
// `indices` and `vectors`, so entry i of all three arrays keeps describing
// the same item.
//
// The parameters are declared `inout`: GLSL parameters default to `in`,
// which copies the arrays into the function and discards every swap when it
// returns. With `inout` the reordered arrays are copied back to the caller.
void sort(inout float elements[8], inout int indices[8], inout vec3 vectors[8])
{
    for(uint i=0; i<8; i++)
    {
        // Move the smallest remaining key into position i.
        for(uint j=i; j<8; j++)
        {
            if(elements[j] < elements[i])
            {
                float swap = elements[i];
                elements[i] = elements[j];
                elements[j] = swap;

                int iSwap = indices[i];
                indices[i] = indices[j];
                indices[j] = iSwap;

                vec3 vSwap = vectors[i];
                vectors[i] = vectors[j];
                vectors[j] = vSwap;
            }
        }
    }
}
// Returns the octant number (0..7) of `position`, which is expected to be
// given relative to the node's minimum corner. Each axis contributes one bit
// when its coordinate exceeds cube_dim / 2^level: x -> 4, y -> 2, z -> 1.
int getVIndex(vec3 position, int level)
{
    float extent = cube_dim / pow(2,level);
    int octant = 0;
    if (position.x > extent) { octant += 4; }
    if (position.y > extent) { octant += 2; }
    if (position.z > extent) { octant += 1; }
    return octant;
}
// Explicit traversal stack used by push() and main(): three parallel arrays
// holding, per entry, a node index, its tree level and its minimum corner.
// Capacity is 8 entries per level for up to MAX_TREE_HEIGHT levels.
#define MAX_TREE_HEIGHT 11
int nodes[8*MAX_TREE_HEIGHT];
int levels[8*MAX_TREE_HEIGHT];
vec3 positions[8*MAX_TREE_HEIGHT];
// Number of entries currently on the stack (index of the next free slot).
int sp=0;
// Pushes one (node, level, corner) entry onto the traversal stack.
// No capacity check is performed.
void push(int node, int level, vec3 corner)
{
    int slot = sp++;
    nodes[slot] = node;
    levels[slot] = level;
    positions[slot] = corner;
}
// Casts one ray per fragment through the octree using the explicit stack
// (push / sp). Each popped node has its existing children tested against the
// ray; the hit children are pushed so that the nearest one is popped next.
// Traversal stops when a node of the last level is popped or the stack runs
// empty. A last-level hit is shaded with the absolute voxel normal; otherwise
// a gray value that darkens with every intersected node is written.
void main()
{
    // Ray direction from the fragment coordinate; the z component is derived
    // from a 40 degree angle.
    vec3 r = vec3(f_coord.x, f_coord.y, 1.f/tan(radians(40)));
    r.y/=aspect_ratio;
    vec3 dir = r;
    // Ray origin: the same point moved back by the z component, plus the
    // camera position.
    r += vec3(0,0,-1.f/tan(radians(40))) + camera_pos;
    fragment_color = vec4(0);
    int max_level = int(log2(voxel_resolution));
    // Seed the stack with the root node.
    push(0,0,vec3(-cube_dim));
    float tc = 1.f;
    int level=0;
    int node=0;
    do
    {
        // Pop the next node to visit.
        sp--;
        node = nodes[sp];
        level = levels[sp];
        vec3 corner = positions[sp];
        // Edge length of a child cell; the node itself spans size*2.
        float size = cube_dim / pow(2,level);
        vec3 corners[] =
            {corner, corner+vec3(0,0,size),
             corner+vec3(0,size,0), corner+vec3(0,size,size),
             corner+vec3(size,0,0), corner+vec3(size,0,size),
             corner+vec3(size,size,0), corner+vec3(size,size,size)};
        // Darken the fallback color for every node box the ray intersects.
        float t = boxIntersection(r, dir, corner, size*2);
        if(!isinf(t))
            tc *= 0.9f;
        // Hit distance per existing child; missing children count as misses.
        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            if(tree[node].children[child]>0)
                coeffs[child] = boxIntersection(r, dir, corners[child], size);
            else
                coeffs[child] = 1.f/0.f;
        }
        int indices[8] = {0,1,2,3,4,5,6,7};
        sort(coeffs, indices, corners);
        // Push from the last entry to the first so that entry 0 ends up on
        // top of the stack. The counter must be signed: with a uint the test
        // `i>=0` is always true and `i--` at 0 wraps to a huge index.
        for(int i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                     level+1, corners[i]);
            }
        }
    }while(level < (max_level-1) && sp>0);
    if(level==max_level-1)
    {
        // At the last level `node` indexes the voxel buffer.
        fragment_color = abs(voxels[node].normal);
    }
    else
    {
        fragment_color=vec4(tc);
    }
}
In here, we start at the biggest cube, testing intersections with each set of 8 children (the 8 cubes resulting from subdividing a cube). Each time we successfully detect a collision, we move down the tree, until we reach the lowest level which describes the actual geometry and we color the scene based on that.
Debugging and Problem
The important part is that there are 2 buffers, one to store the tree except the leafs, and one to store the leafs.
So in both the voxelization and the ray tracing, the last layer needs to be treated differently.
The issues I have noticed about the transparency are as follows:
It happens only on planes aligned with the cartesian grid
It seems it happens when the ray moves in a negative direction (down
or to the left). (At least that's my impression, but it's not 100%
certain.)
I am not sure what I am doing wrong.
EDIT:
The original issue seems to have been fixed, however the raytracer is still bugged. I have edited the question to reflect the current state of the problem.
The error comes from the sorting function as someone in the comments mentioned although not for the same reasons.
What has happened is that, I thought the sort function would modify the arrays passed to it, but it seems to be copying the data, so it does not return anything.
In other words:
// Quoted here as the version that does not work: the three array parameters
// carry no qualifier, so they are `in` parameters. The swaps below operate on
// the function's own copies and the caller's arrays stay unchanged.
void sort(float elements[8], int indices[8], vec3 vectors[8])
{
for(uint i=0; i<8; i++)
{
for(uint j=i; j<8; j++)
{
// Bring the smaller key to position i, moving the matching entries of
// `indices` and `vectors` along with it.
if((elements[j] < elements[i]))
{
float swap = elements[i];
elements[i] = elements[j];
elements[j] = swap;
int iSwap = indices[i];
indices[i] = indices[j];
indices[j] = iSwap;
vec3 vSwap = vectors[i];
vectors[i] = vectors[j];
vectors[j] = vSwap;
}
}
}
}
Does not return the correct values inside of elements, indices and vectors, so calling this function does nothing but waste computation cycles.
What would be the best way of converting std::vector of Vertices to float*? I have vtx as my original data, which contains two vertices with position, normal and uv and I have std::vector of vertices v with the same position, normal and uv. What I am trying to achieve is getting the same memory layout and data as vtx into vtx2 using std::vector v. I tried copying the memory from v to vtx2 using memcpy but when I print them they are ordered in different way.
#include <iostream>
#include <vector>
using namespace std;
// Three consecutive floats.
struct Vector3
{
float x;
float y;
float z;
};
// Two consecutive floats.
struct Vector2
{
float x;
float y;
};
// One vertex: position, normal, uv - eight floats in total, in the same
// order in which main() fills the raw float array.
struct Vertex
{
Vector3 position;
Vector3 normal;
Vector2 uv;
};
// Builds the same two vertices twice - once as a raw float array (vtx) and
// once as a vector<Vertex> copied into a second float array (vtx2) - and
// prints both so the layouts can be compared.
int main(int argc, char *argv[])
{
const int n = 16;
float* vtx = new float[n];
// Vertex 1
// Position
vtx[0] = 1.0f;
vtx[1] = 2.0f;
vtx[2] = 3.0f;
// Normal
vtx[3] = 0.1f;
vtx[4] = 0.2f;
vtx[5] = 0.3f;
// UV
vtx[6] = 0.0f;
vtx[7] = 1.0f;
// Advance the write pointer to the second vertex.
vtx += 8;
// Vertex 2
// Position
vtx[0] = 4.0f;
vtx[1] = 5.0f;
vtx[2] = 6.0f;
// Normal
vtx[3] = 0.2f;
vtx[4] = 0.3f;
vtx[5] = 0.4f;
// UV
vtx[6] = 0.0f;
vtx[7] = 1.0f;
// vtx now points one past the last of the n floats.
vtx += 8;
// Prints vtx[-n] .. vtx[-1], i.e. the n floats in the order they were written.
for (int i = n; i>0; i--)
{
cout << *(vtx + i * -1) << endl;
}
vector<Vertex> v;
Vertex vt;
// Vertex 1
// Position
Vector3 pos1 = {1.0, 2.0, 3.0};
vt.position = pos1;
// Normal
Vector3 normal1 = {0.1, 0.2, 0.3};
// NOTE(review): this assigns the normal to vt.position, overwriting pos1;
// vt.normal is never assigned in this function.
vt.position = normal1;
// UV
Vector2 uv1 = {0.0, 1.0};
vt.uv = uv1;
v.push_back(vt);
// Vertex 2
// Position
Vector3 pos2 = {4.0, 5.0, 6.0};
vt.position = pos2;
// Normal
Vector3 normal2 = {0.2, 0.3, 0.4};
// NOTE(review): same as above - the normal is stored in vt.position.
vt.position = normal2;
// UV
Vector2 uv2 = {0.0, 1.0};
vt.uv = uv2;
v.push_back(vt);
float* vtx2 = new float[n];
// NOTE(review): no <cstring> include is visible for memcpy.
memcpy(vtx2, &v[0], v.size() * sizeof(Vertex));
// NOTE(review): vtx2 was not advanced, so this reads vtx2[-n] .. vtx2[-1],
// the memory in front of the array rather than the copied data.
for (int i = n; i>0; i--)
{
cout << *(vtx2 + i * -1) << endl;
}
// NOTE(review): vtx was advanced by n elements above, so this is no longer
// the pointer returned by new[].
delete[] vtx;
delete[] vtx2;
return 0;
}
#include <cstring>
#include <iostream>
#include <vector>
#include <cstddef>
using namespace std;
// Three consecutive floats.
struct Vector3
{
float x;
float y;
float z;
};
// Two consecutive floats.
struct Vector2
{
float x;
float y;
};
// One vertex: position, normal, uv. main() prints the offsets of `normal`
// and `uv` and the total size so the actual layout can be checked.
struct Vertex
{
Vector3 position;
Vector3 normal;
Vector2 uv;
};
int main(int argc, char *argv[])
{
const int n = 16;
float* vtx1 = new float[n];
float* vtx = vtx1;
cout << offsetof(Vertex, normal) << " " << offsetof(Vertex, uv) << " " << sizeof(Vertex) << "\n";
// Vertex 1
// Position
vtx[0] = 1.0f;
vtx[1] = 2.0f;
vtx[2] = 3.0f;
// Normal
vtx[3] = 0.1f;
vtx[4] = 0.2f;
vtx[5] = 0.3f;
// UV
vtx[6] = 0.0f;
vtx[7] = 1.0f;
vtx += 8;
// Vertex 2
// Position
vtx[0] = 4.0f;
vtx[1] = 5.0f;
vtx[2] = 6.0f;
// Normal
vtx[3] = 0.2f;
vtx[4] = 0.3f;
vtx[5] = 0.4f;
// UV
vtx[6] = 0.0f;
vtx[7] = 1.0f;
vtx += 8;
for (int i = n; i>0; i--)
{
cout << *(vtx + i * -1) << endl;
}
cout << "\n";
vector<Vertex> v;
Vertex vt;
// Vertex 1
// Position
Vector3 pos1 = {1.0, 2.0, 3.0};
vt.position = pos1;
// Normal
Vector3 normal1 = {0.1, 0.2, 0.3};
vt.normal = normal1;
// UV
Vector2 uv1 = {0.0, 1.0};
vt.uv = uv1;
v.push_back(vt);
// Vertex 2
// Position
Vector3 pos2 = {4.0, 5.0, 6.0};
vt.position = pos2;
// Normal
Vector3 normal2 = {0.2, 0.3, 0.4};
vt.normal = normal2;
// UV
Vector2 uv2 = {0.0, 1.0};
vt.uv = uv2;
v.push_back(vt);
float* vtx2 = new float[n];
vtx = vtx2;
memcpy(vtx, &v[0], n*sizeof(float));
vtx += n;
for (int i = n; i>0; i--)
{
cout << *(vtx + i * -1) << endl;
}
delete[] vtx1;
delete[] vtx2;
return 0;
}
Here is some corrected code with .normal instead of .position, it doesn't delete random memory by deleting vtx and the second print loop is fixed to show the data in the array instead of the 16 bytes of memory preceding it. It also prints the struct size and offsets in the first line. If you don't get 12 24 32 as the first line, your compiler is padding the structs with empty space which is causing your problems. You can use struct Vertex __attribute__((packed)) to prevent this on GCC or clang. Other compilers have different ways of doing it.
There is a bug in your code:
vt.position = normal1
should read
vt.normal = normal1
And similarly for the second vertex in your vector. Upon fixing that you may find the output matches (it does for me), but it may depend on how your compiler is padding structs.
For example, forcing a different alignment on Vector3 using struct Vector3 {...} __attribute__ ((aligned (16))); will generate "corrupted" output.
I have a uniform variable called control_count (the number of control points in a Bezier curve). In the marked part of my code, if I replace the constant 4 with this variable, it just stops working; with the constant 4 it works fine. The variable must have the value 4 in it: I tested it before and after the loop as well, and I marked this in the code too. Could it be an unrolling problem? How do I force the compiler not to do this?
#version 150
// Input primitive type and output primitive type / vertex budget.
layout(lines_adjacency) in;
layout(line_strip, max_vertices = 101) out;
// Color emitted with every vertex.
out vec4 gs_out_col;
// Applied to each curve point before it is emitted.
uniform mat4 MVP;
// Number of control points; main() copies this many input positions.
uniform int control_count;
// Number of curve segments; main() emits tess_count + 1 vertices.
uniform int tess_count;
// Set to control_count - 1 in main(); not read in the visible code.
int degree;
// Declared here, defined outside the visible code.
int binom( int n, int k );
// Evaluates a Bezier curve at tess_count + 1 parameter values and emits one
// vertex per value. The control points are the positions of the input
// vertices; the blend weights are computed with binom() and pow().
void main()
{
degree = control_count - 1;
// Control points and per-point blend weights (room for 10 each).
vec3 b[10];
float B[10];
////////////MARK//////////////////
//control_count must be 4, otherwise it would draw fewer points
// NOTE(review): the input layout is lines_adjacency, which presumably
// provides 4 input vertices - confirm that gl_in[i] is valid for every
// i < control_count.
for(int i = 0; i < control_count; ++i){
b[i] = gl_in[i].gl_Position.xyz;
}
////////////END MARK//////////////////
for(int i = 0; i <= tess_count; ++i){
// Curve parameter for this vertex, from 0 to 1.
float t = i / float(tess_count);
gl_Position = vec4(0);
////////////MARK//////////////////
//here, if I write control_count instead of 4, I don't get what I expect
// The loop bound 4 and the literal 3 in binom()/pow() below are hard-coded
// together; `degree` is not used here.
for(int j = 0; j < 4; ++j){
////////////END MARK//////////////////
B[j] = binom(3, j) * pow(1 - t, 3 - j) * pow(t, j);
// xyz accumulates the weighted control points, w accumulates the weights.
gl_Position += vec4(b[j] * B[j], B[j]);
}
gl_Position = MVP * gl_Position;
////////////MARK//////////////////
//control_count - 4 --> I get red color,
//control_count - 3 --> I get purple,
//so the variable must have the value 4
gs_out_col = vec4(1, 0, control_count - 4, 1);//gl_Position;
////////////END MARK//////////////////
EmitVertex();
}
}
The "good" result using the constant 4:
The "wrong" result using the variable control_count: