I have coded a voxelization based raytracer which is working as expected but is very slow.
Currently the raytracer code is as follows:
#version 430
//normalized position of this fragment, from (-1, -1) to (1, 1)
in vec2 f_coord;
//output color; main() writes the traversal iteration count here
out vec4 fragment_color;
//Per-voxel data stored in voxel_buffer (not read by the code in this shader)
struct Voxel
{
vec4 position;
vec4 normal;
vec4 color;
};
//One octree node
struct Node
{
//children of the current node: indices into tree[]; main() treats values <= 0 as empty
int children[8];
};
layout(std430, binding = 0) buffer voxel_buffer
{
//last layer of the tree, the leaves
Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
//NOTE(review): presumably a counter into voxels[]; not read by this shader - confirm against the voxelization pass
uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
//tree structure; main() starts the traversal at tree[0]
Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
//NOTE(review): presumably a counter into tree[]; not read by this shader - confirm against the voxelization pass
uint t_index;
};
uniform vec3 camera_pos; //position of the camera
uniform float aspect_ratio; //aspect ratio of the window
uniform float cube_dim; //Dimensions of the voxelization cube: main() places the root corner at vec3(-cube_dim) and gives its children side cube_dim
uniform int voxel_resolution; //Side length of the cube in voxels; main() uses log2 of it as the tree height
//tolerance used by inBoxBounds so points lying on a box face still count as inside
#define EPSILON 0.01
// Detect whether a position is inside of the voxel with size size located at corner
// Returns true when position lies inside the axis-aligned cube whose minimum
// corner is `corner` and whose side length is `size`. Each face is widened by
// EPSILON so points sitting on a face are accepted.
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    // Express the point in the cube's own coordinate system.
    vec3 local = position - corner;
    // Component-wise tests; every axis has to pass both of them.
    bvec3 aboveMin = greaterThan(local, vec3(-EPSILON));
    bvec3 belowMax = lessThan(local, vec3(size+EPSILON));
    return all(aboveMin) && all(belowMax);
}
//Get the distance to a box or infinity if the box cannot be hit
// Distance along the ray (origin, dir) to the nearest face of the axis-aligned
// cube with minimum corner corner0 and side length size. Returns infinity when
// no face is hit in front of the origin. dir does not need to be normalized.
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    float unreachable = 1.f/0.f;
    vec3 unitDir = normalize(dir);
    // Ray parameters at which the ray crosses the three "low" and the three "high" bounding planes.
    vec3 lowPlanes = (corner0 - origin)/unitDir;
    vec3 highPlanes = (corner0 + vec3(size,size,size) - origin)/unitDir;
    float candidates[6];
    candidates[0] = lowPlanes.x;
    candidates[1] = lowPlanes.y;
    candidates[2] = lowPlanes.z;
    candidates[3] = highPlanes.x;
    candidates[4] = highPlanes.y;
    candidates[5] = highPlanes.z;
    float nearest = unreachable;
    for(uint plane=0; plane<6; plane++)
    {
        // Planes behind the origin cannot be reached, so they are pushed out to infinity.
        if(candidates[plane] < 0)
        {
            candidates[plane] = unreachable;
        }
        // A plane crossing only counts when the crossing point lies on the cube itself.
        if(inBoxBounds(corner0, size, origin + unitDir*candidates[plane]))
        {
            nearest = min(candidates[plane], nearest);
        }
    }
    return nearest;
}
#define MAX_TREE_HEIGHT 11
// Capacity of the traversal stack. main() pops one entry per iteration and
// pushes every child the ray hits, so the stack grows by several entries per
// tree level, not one. A straight ray crosses at most 4 of a node's 8 octants,
// which bounds the depth by roughly 3 entries per level plus 4; the EPSILON
// tolerance in inBoxBounds can let extra children through on grazing rays,
// which is why push() additionally guards against overflow.
#define STACK_CAPACITY (4*MAX_TREE_HEIGHT)
// Parallel arrays forming one stack of pending nodes: node index, tree level
// and minimum corner of the node's cube. sp is the number of entries in use.
int nodes[STACK_CAPACITY];
int levels[STACK_CAPACITY];
vec3 positions[STACK_CAPACITY];
int sp=0;
// Pushes a node onto the traversal stack. When the stack is already full the
// entry is dropped instead of writing past the end of the arrays.
void push(int node, int level, vec3 corner)
{
    if(sp >= STACK_CAPACITY)
    {
        return;
    }
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}
// Fragment entry point. Builds this pixel's ray, walks the octree depth-first
// visiting nearer children first, and writes the number of traversal
// iterations (divided by 100) as the output color.
void main()
{
    int count = 0; //count the iterations of the algorithm
    float focal = 1.f/tan(radians(40)); //z component of the ray direction
    vec3 r = vec3(f_coord.x, f_coord.y, focal); //direction of the ray
    r.y/=aspect_ratio; //modify the direction based on the windows aspect ratio
    vec3 dir = r;
    r += vec3(0,0,-focal) + camera_pos; //put the ray at the camera position
    fragment_color = vec4(0);
    int max_level = int(log2(voxel_resolution));//height of the tree
    push(0,0,vec3(-cube_dim));//seed the stack with the root node
    //tree variables
    int level=0;
    int node=0;
    vec3 corner;
    do
    {
        //pop from stack
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];
        //set the size of the children of the current node
        float size = cube_dim / pow(2,level);
        //set the corners of the children
        vec3 corners[] =
            {corner, corner+vec3(0,0,size),
            corner+vec3(0, size,0), corner+vec3(0,size,size),
            corner+vec3(size,0,0), corner+vec3(size,0,size),
            corner+vec3(size,size,0), corner+vec3(size,size,size)};
        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            //Test non zero children, zero children are empty and thus should be discarded
            coeffs[child] = tree[node].children[child]>0?
                //Get the distance to the child if it's not empty or infinity if it's empty
                boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
        }
        int indices[8] = {0,1,2,3,4,5,6,7};
        //sort the children from closest to farthest, keeping indices and corners in step
        for(int i=0; i<8; i++)
        {
            for(int j=i+1; j<8; j++)
            {
                if((coeffs[j] < coeffs[i]))
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;
                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;
                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }
        //push to stack, farthest first so the closest child is popped next.
        //The counter must be signed: with a uint, i>=0 is always true and the
        //loop would wrap around and index far outside the arrays.
        for(int i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                    level+1, corners[i]);
            }
        }
        count++;
    }while(level < (max_level-1) && sp>0);
    //set color
    fragment_color = vec4(count)/100;
}
As it may not be fully clear what this does, let me explain.
We check ray-box intersections starting with a big cube. If we hit it we test intersection with the 8 cubes that compose it.
If we hit any of those, we check intersections with the 8 cubes that make up that cube.
In 2D this would look as follows:
In this case we have 4 layers, we check the big box first, then the ones colored in red, then the ones colored in green, and finally the ones colored in blue.
Printing out the number of times the raytracing step executed as a color (which is what the code snippet I have provided does)
results in the following image:
As you can see, most of the time the shader doesn't do more than 100 iterations.
However this shader takes 200 000 microseconds to execute on average in a gtx 1070.
Since the issue is not number of executions, my problem is likely to be on thread execution.
Does anyone know how I could optimize this code?
The biggest bottleneck seems to be the use of a stack.
If I run the same code without pushing to the stack (generating wrong output), there is a 10 fold improvement in runtime
It seems you test most of the voxels in each level of the octree for intersection with the ray, and also sort them (by some distance) in each level.
I propose another approach.
If the ray intersects with the bounding box (level 0 of the octree) it makes it at two faces of the box. Or in a corner or an edge, these are "corner" cases.
Finding the 3D ray-plane intersection can be done like here. Finding if the intersection is inside the face (quad) can be done by testing if the point is inside of one of the two triangles of the face, like here.
Get the farthest intersection I0 from the camera. Also let r be a unit vector of the ray in the direction I0 toward the camera.
Find the deepest voxel for I0 coordinates. This is the farthest voxel from the camera.
Now we want the exit-coordinates I0e for the ray in that voxel, through another face. While you could do the calculations again for all 6 faces, if your voxels are X,Y,Z aligned and you define the ray in the same coordinate system as the octree, then the calculations simplify a lot.
Apply a little displacement (e.g. 1/1000 of the smallest voxel size) to I0e along the unit vector r of the ray: I1 = I0e + r/1000. Find the voxel that contains I1. This is the next voxel in the sorted list of voxel-ray intersections.
Repeat finding I1e then I2 then I2e then I3 etc. until the bounding box is exited. The list of crossed voxels is sorted.
Working with the octree can be optimized depending on how you store its info: All possible nodes or just used. Nodes with data or just "pointers" to another container with the data. This is matter for another question.
The first thing that stands out is your box intersection function. Have a look at inigo quilez' procedural box function for a much faster version. Since your boxsize is uniform in all axes and you don't need outNormal, you can get an even lighter version. In essence, use maths instead of the brute force approach that tests each box plane.
Also, try to avoid temporary storage where possible. For example, the corners array could be computed on demand for each octree box. Of course, with the above suggestion, these will be changed to box centers.
Since nodes, levels and positions are always accessed together, try co-locating them in a new single struct and access them as a single unit.
Will look more later...
Thread execution on a GPU may be massively parallel, but that doesn’t mean that all threads run independently from one another. Groups of threads execute exactly the same instructions, the only difference is the input data. That means that branches and therefore loops can’t make a thread diverge in execution and therefore also not let them terminate early.
Your example shows the most extreme edge case of this: when there is a high likelihood that, in a group of threads, all the work that's done is relevant to one thread only.
To alleviate this, you should try to reduce the difference in execution length (iterations in your case) for threads in a group (or in total). This can be done by setting a limit on the number of iterations per shader pass and rescheduling only those threads/pixels that need more iterations.
I'm having trouble figuring out how to ensure particles aligned in a square will always be placed in the middle of the screen, regardless of the size of the square. The square is created with:
for(int i=0; i<(int)sqrt(d_MAXPARTICLES); i++) {
for(int j=0; j<(int)sqrt(d_MAXPARTICLES); j++) {
Particle particle;
glm::vec2 d2Pos = glm::vec2(j*0.06, i*0.06) + glm::vec2(-17.0f,-17.0f);
particle.pos = glm::vec3(d2Pos.x,d2Pos.y,-70);
particle.life = 1000.0f;
particle.cameradistance = -1.0f;
particle.r = d_R;
particle.g = d_G;
particle.b = d_B;
particle.a = d_A;
particle.size = d_SIZE;
d_particles_container.push_back(particle);
}
}
the most important part is the glm::vec2(-17.0f, -17.0f) which correctly positions the square in the center of the screen. This looks like:
the problem is that my program supports any number of particles, so only specifying
now my square is off center, but how can I change glm::vec2(-17.0f,-17.0f) to account for different particles?
Do not make position dependent on "i", and "j" indices if you want a fixed position.
glm::vec2 d2Pos = glm::vec2(centerOfScreenX,centerOfScreenY); //much better
But how to compute centerOfSCreen? It depends if you are using a 2D or a 3D camera.
If you use a fixed 2D camera, then center is (Width/2,Height/2).
If you use a moving 3d camera, you need to launch a ray from the center of the screen and get any point on the ray (so you just use X,Y and then set Z as you wish)
Edit:
Now that the question is clearer here is the answer:
int maxParticles = (int)sqrt(d_MAXPARTICLES);
factorx = (i-(float)maxParticles/2.0f)/(float)maxParticles;
factory = (j-(float)maxParticles/2.0f)/(float)maxParticles;
glm::vec2 particleLocaleDelta = glm::vec2(extentX*factorx,extentY*factory)
glm::vec2 d2Pos = glm::vec2(centerOfScreenX,centerOfScreenY)
d2Pos += particleLocaleDelta;
where
extentX,extentY
are the dimensions of the "big square", and the factors are the current scale given by "i" and "j". The code is not optimized; it is just meant to work (assuming you have a 2D camera with world units corresponding to pixel units).
I would like to combine mouse and keyboard inputs with the Oculus Rift to create a smooth experience for the user. The goals are:
Positional movement 100% controlled by the keyboard relative to the direction the person is facing.
Orientation controlled 100% by HMD devices like the Oculus Rift.
Mouse orbit capabilities adding to the orientation of the person using the Oculus Rift. For example, if I am looking left I can still move my mouse to "move" more leftward.
Now, I have 100% working code for when someone doesn't have an Oculus Rift, I just don't know how to combine the orientation and other elements of the Oculus Rift to my already working code to get it 100%.
Anyway, here is my working code for controlling the keyboard and mouse without the Oculus Rift:
Note that all of this code assumes a perspective mode of the camera:
/*
Variables
*/
glm::vec3 DirectionOfWhereCameraIsFacing;  // unit view direction, recomputed every update
glm::vec3 CenterOfWhatIsBeingLookedAt;     // look-at target handed to glm::lookAt
glm::vec3 PositionOfEyesOfPerson;          // eye position, advanced by CameraPositionDelta
glm::vec3 CameraAxis;                      // cross(view direction, up): the axis pitch rotates about
glm::vec3 DirectionOfUpForPerson;          // up vector; also the axis yaw rotates about
glm::quat CameraQuatPitch;                 // rotation by Pitch about CameraAxis
float Pitch;                               // pending pitch angle; scaled by MouseDampingRate each update
float Yaw;                                 // pending yaw angle; scaled by MouseDampingRate each update
float Roll;                                // roll angle; only feeds CameraQuatRoll
float MouseDampingRate;                    // per-update multiplier applied to Yaw and Pitch
float PhysicalMovementDampingRate;         // per-update multiplier applied to CameraPositionDelta
glm::quat CameraQuatYaw;                   // rotation by Yaw about DirectionOfUpForPerson
glm::quat CameraQuatRoll;                  // computed each update but not applied in the update code
glm::quat CameraQuatBothPitchAndYaw;       // normalized product of the pitch and yaw rotations
glm::vec3 CameraPositionDelta;             // pending eye translation, added to the eye position each update
/*
Inside display update function.
*/
// Current view direction, from the eye towards the look-at target.
DirectionOfWhereCameraIsFacing = glm::normalize(CenterOfWhatIsBeingLookedAt - PositionOfEyesOfPerson);
// Sideways axis used for pitch. glm::angleAxis expects a unit-length axis, and
// the cross product is only unit length while the view direction is
// perpendicular to up, so it is normalized explicitly.
CameraAxis = glm::normalize(glm::cross(DirectionOfWhereCameraIsFacing, DirectionOfUpForPerson));
CameraQuatPitch = glm::angleAxis(Pitch, CameraAxis);
// assumes DirectionOfUpForPerson is unit length - TODO confirm
CameraQuatYaw = glm::angleAxis(Yaw, DirectionOfUpForPerson);
// Roll turns about the viewing direction (it previously reused the pitch axis).
// It is computed but not applied below.
CameraQuatRoll = glm::angleAxis(Roll, DirectionOfWhereCameraIsFacing);
// Combine pitch and yaw into a single rotation and apply it to the view direction.
CameraQuatBothPitchAndYaw = glm::cross(CameraQuatPitch, CameraQuatYaw);
CameraQuatBothPitchAndYaw = glm::normalize(CameraQuatBothPitchAndYaw);
DirectionOfWhereCameraIsFacing = glm::rotate(CameraQuatBothPitchAndYaw, DirectionOfWhereCameraIsFacing);
// Move the eye, then place the look-at target one unit ahead of it.
PositionOfEyesOfPerson += CameraPositionDelta;
CenterOfWhatIsBeingLookedAt = PositionOfEyesOfPerson + DirectionOfWhereCameraIsFacing * 1.0f;
// Damp the pending rotation and translation so motion decays over the following updates.
Yaw *= MouseDampingRate;
Pitch *= MouseDampingRate;
CameraPositionDelta = CameraPositionDelta * PhysicalMovementDampingRate;
View = glm::lookAt(PositionOfEyesOfPerson, CenterOfWhatIsBeingLookedAt, DirectionOfUpForPerson);
ProjectionViewMatrix = Projection * View;
The Oculus Rift provides orientation data via their SDK and can be accessed like so:
/*
Variables
*/
ovrMatrix4f OculusRiftProjection;   // projection as returned by ovrMatrix4f_Projection
glm::mat4 Projection;               // the same projection copied into a glm matrix and transposed
OVR::Quatf OculusRiftOrientation;   // conjugate of the predicted head pose orientation
glm::quat CurrentOrientation;       // OculusRiftOrientation copied into a glm quaternion and normalized
/*
Partial Code for retrieving projection and orientation data from Oculus SDK
*/
OculusRiftProjection = ovrMatrix4f_Projection(MainEyeRenderDesc[l_Eye].Desc.Fov, 10.0f, 6000.0f, true);
for (int o = 0; o < 4; o++){
for (int i = 0; i < 4; i++) {
Projection[o][i] = OculusRiftProjection.M[o][i];
}
}
Projection = glm::transpose(Projection);
OculusRiftOrientation = PredictedPose.Orientation.Conj();
CurrentOrientation.w = OculusRiftOrientation.w;
CurrentOrientation.x = OculusRiftOrientation.x;
CurrentOrientation.y = OculusRiftOrientation.y;
CurrentOrientation.z = OculusRiftOrientation.z;
CurrentOrientation = glm::normalize(CurrentOrientation);
After that last line the glm based quaterion "CurrentOrientation" has the correct information which, if plugged straight into an existing MVP matrix structure and sent into OpenGL will allow you to move your head around in the environment without issue.
Now, my problem is how to combine the two parts together successfully.
When I have done this in the past it results in the rotation stuck in place (when you turn your head left you keep rotating left as opposed to just rotating in the amount that you turned) and the fact that I can no longer accurately determine the direction the person is facing so that my position controls work.
So at that point since I can no longer determine what is "forward" my position controls essentially become crap...
How can I successfully achieve my goals?
I've done some work on this by maintaining a 'camera' matrix which represents the position and orientation of the player, and then during rendering, composing that with the most recent orientation data collected from the headset.
I have a single interaction class which is designed to pull input from a variety of sources, including keyboard and joystick (as well as a spacemouse, or a Razer Hydra).
You'll probably find it easier to maintain the state as a single combined matrix like I do, rather than trying to compose a lookat matrix every frame.
If you look at my Rift.cpp base class for developing my examples you'll see that I capture keyboard input and accumulate it in the CameraControl instance. This is accumulated in the instance so that during the applyInteraction call later we can apply movement indicated by the keyboard, along with other inputs:
// Keyboard callback. Offers the key event to the shared CameraControl instance
// first and stops processing the event when the controller reports it handled it.
// (The "..." lines mark code elided from this excerpt.)
void RiftApp::onKey(int key, int scancode, int action, int mods) {
...
// Allow the camera controller to intercept the input
if (CameraControl::instance().onKey(player, key, scancode, action, mods)) {
return;
}
...
}
In my per-frame update code I query any other enabled devices and apply all the inputs to the matrix. Then I update the modelview matrix with the inverse of the player position:
// Per-frame update. Applies the accumulated controller input to `player`, then
// sets the top of the modelview stack to the inverse of `player`.
// (The "..." lines mark code elided from this excerpt.)
void RiftApp::update() {
...
// Fold all pending input into the player matrix
CameraControl::instance().applyInteraction(player);
// The modelview (view) matrix is the inverse of the player's transform
gl::Stacks::modelview().top() = glm::inverse(player);
...
}
Finally, in my rendering code I have the following, which applies the headset orientation:
// Renders the scene once per eye. For each eye the projection/modelview stacks
// are pushed, the head pose and per-eye offset are pre-multiplied onto the
// modelview matrix, and the scene is drawn into that eye's framebuffer.
// (The "..." line marks code elided from this excerpt.)
void RiftApp::draw() {
gl::MatrixStack & mv = gl::Stacks::modelview();
gl::MatrixStack & pr = gl::Stacks::projection();
for_each_eye([&](ovrEyeType eye) {
gl::Stacks::with_push(pr, mv, [&]{
// Begin rendering this eye; the SDK returns the pose to render with
ovrPosef renderPose = ovrHmd_BeginEyeRender(hmd, eye);
// Set up the per-eye modelview matrix
{
// Apply the head pose
glm::mat4 m = Rift::fromOvr(renderPose);
// The inverse of the head pose goes in front of the player's view matrix
mv.preMultiply(glm::inverse(m));
// Apply the per-eye offset
// NOTE(review): `erd` is not declared in this excerpt - presumably the eye render description for `eye`; confirm
glm::vec3 eyeOffset = Rift::fromOvr(erd.ViewAdjust);
mv.preMultiply(glm::translate(glm::mat4(), eyeOffset));
}
// Render the scene to an offscreen buffer
frameBuffers[eye].activate();
renderScene();
frameBuffers[eye].deactivate();
// Hand the rendered texture back to the SDK together with the pose it was rendered with
ovrHmd_EndEyeRender(hmd, eye, renderPose, &eyeTextures[eye].Texture);
});
GL_CHECK_ERROR;
});
...
}
I'm trying to implement skeletal animation in a small program I'm writing. The idea is to calculate the transformation matrix on the CPU every frame by interpolating keyframe data, then feeding this data to my vertex shader which multiplies my vertices by this matrix like this:
// Transform the vertex by the matrix of its first bone (boneIndices.x) and scale the result by that bone's weight.
vec4 v = animationMatrices[int(boneIndices.x)] * gl_Vertex * boneWeights.x;
Where boneWeights and boneIndices are attributes and animationMatrices is a uniform array of transformation matrices updated every frame before drawing. (The idea is to have multiple bones affecting one vertex eventually, but right now I'm testing with one bone per vertex so just taking the weight.x and indices.x is enough).
Now the problem is calculating the transformation matrix for each bone. My transformation matrix for the single joint is good; the problem is that it always takes (0,0,0) as the pivot instead of the joint's own pivot. I took the joint matrices from the COLLADA file, which correctly show my skeleton when I draw them like this:
// Draws the skeleton starting at Root. The current matrix is saved before and
// restored after, so the bone transforms applied by drawBone do not leak out.
public void Draw()
{
GL.PushMatrix();
drawBone(Root);
GL.PopMatrix();
}
// Recursively walks the bone hierarchy. Each bone's rest matrix is multiplied
// onto the current matrix, so children are positioned relative to their parent.
// Only the bone named "Blades" actually emits a point.
private void drawBone(Bone b)
{
    GL.PointSize(50);
    GL.MultMatrix(ref b.restMatrix);

    GL.Begin(BeginMode.Points);
    GL.Color3((byte)0, (byte)50, (byte)0);
    if (b.Name == "Blades")
        GL.Vertex3(0, 0, 0);
    GL.End();

    // Each child is drawn inside its own push/pop so siblings do not affect one another.
    foreach (Bone child in b.Children)
    {
        GL.PushMatrix();
        drawBone(child);
        GL.PopMatrix();
    }
}
So now to calculate the actual matrix I've tried:
// Sandwich the animated transform between the inverse rest matrix and the rest matrix.
Matrix4 jointMatrix = b.restMatrixInv * boneTransform * b.restMatrix;
or according to the collada documentation (this doesn't really make sense to me):
// NOTE(review): restMatrix * restMatrixInv is the identity if the two are true inverses, leaving only boneTransform - confirm against the COLLADA spec.
Matrix4 jointMatrix = b.restMatrix * b.restMatrixInv * boneTransform;
And I know I also have to put the parent matrix in here somewhere, I'm guessing something like this:
// Same sandwich as the single-joint form, followed by the parent's already computed joint matrix.
Matrix4 jointMatrix = b.restMatrixInv * boneTransform * b.restMatrix * b.Parent.jointMatrix;
But at the moment I'm mostly just confused and any push in the right direction would help. I really need to get this order right...
When I load an .obj file in GMax it is positioned in the center of the space (0,0,0).
How can I change this position? Is there any special function?
I don't want to use glTranslatef. Rather I would like the whole pModel to move (the pModel structure to change). I found the function glmScale. Is there anything similar for translating or rotating?
When I load the obj I do something like this:
/* Load the OBJ file into the model structure. */
pModelScaun=glmReadOBJ(filename);
/* NOTE(review): presumably recentres the model on the origin and rescales it to unit size - confirm against the glm OBJ loader. */
glmUnitize(pModelScaun);
/* Generate per-facet normals, then per-vertex normals using a 90 degree angle argument. */
glmFacetNormals(pModelScaun);
glmVertexNormals(pModelScaun,90.0);
and then I use the triangles to determine the light position and the shadow frustum
for (unsigned int i = 0; i < pModelScaun->numtriangles; i++)
{
//compute the light vector (between the center of the current
//triangle and the position of the light (converted to object space)
for (unsigned int j = 0; j < 3; j++)
{
fvIncidentLightDir[j] = (pModelScaun->vertices[3*pModelScaun->triangles[i].vindices[0]+j] +
pModelScaun->vertices[3*pModelScaun->triangles[i].vindices[1]+j] +
pModelScaun->vertices[3*pModelScaun->triangles[i].vindices[2]+j]) / 3.0 - lp[j];
}
Can you point me a way in which I could use the transformation matrices in this situation?
Since you are using the vertices in object space, and have transformed the light into that object space, you can multiply fvIncidentLightDir by your object's transformation matrix to transform it back into world space.