DirectX 11 Compute Shader 5.0 loop - HLSL

I have the following compute shader code for computing depth of field. Strangely, the loop body executes just once, even when g_rayCount is 10. Please take a look at the for loop in the entry point RaycasterCS.
//--------------------------------------------------------------------------------------
// Compute Shader
//-------------------------------------------------------------------------------------
SamplerState SSLinear
{
    Filter = Min_Mag_Linear_Mip_Point;
    AddressU = Border;
    AddressV = Border;
    AddressW = Border;
};
float3 CalculateDoF(uint seedIndex, uint2 fragPos)
{
    // (body omitted in the question)
    return float3(0, 0, 0);
}
[numthreads(RAYCASTER_THREAD_BLOCK_SIZE, RAYCASTER_THREAD_BLOCK_SIZE, 1)]
void RaycasterCS(in uint3 threadID : SV_GroupThreadID, in uint3 groupID : SV_GroupID, in uint3 dispatchThreadID : SV_DispatchThreadID)
{
    uint2 fragPos = groupID.xy * RAYCASTER_THREAD_BLOCK_SIZE + threadID.xy;
    float4 dstColor = g_texFinal[fragPos];
    uint seedIndex = dispatchThreadID.x * dispatchThreadID.y;
    float3 final = float3(0, 0, 0);
    float color = 0;
    [loop][allow_uav_condition]
    for (int i = 0; i < g_rayCount; ++i);
    {
        float3 dof = CalculateDoF(seedIndex, fragPos);
        final += dof;
    }
    final *= 1.0f / ((float) g_rayCount);
    g_texFinalRW[fragPos] = float4(final, 1);
}
//--------------------------------------------------------------------------------------
technique10 Raycaster
{
    pass RaycastDefault
    {
        SetVertexShader(NULL);
        SetGeometryShader(NULL);
        SetPixelShader(NULL);
        SetComputeShader(CompileShader(cs_5_0, RaycasterCS()));
    }
}

Remove the semicolon at the end of the for statement:
for (int i = 0; i < g_rayCount; ++i) // semicolon removed
{
    float3 dof = CalculateDoF(seedIndex, fragPos);
    final += dof;
}
As you can probably see now, the semicolon turned the for statement into an empty loop; the block in braces that followed was then executed exactly once.
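To make the parse explicit, here is how the compiler read the original statement; the braces form an anonymous scope that is not attached to the loop:

for (int i = 0; i < g_rayCount; ++i)
    ;   // empty statement: this is the entire loop body, executed g_rayCount times
{       // unrelated anonymous block, entered exactly once
    float3 dof = CalculateDoF(seedIndex, fragPos);
    final += dof;
}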

Related

I'm experiencing very slow OpenGL compute shader compilation (10+ minutes) when using larger work groups, is there anything I can do to speed it up?

So, I'm encountering a really bizarre (at least to me, as a compute shader noob) phenomenon when I compile my compute shader and check the result with glGetShaderiv(m_shaderID, GL_COMPILE_STATUS, &status). Inexplicably, my compute shader takes much longer to compile when I increase the size of my work groups! When I have one-dimensional work groups, it compiles in less than a second, but when I increase the size of my work groups to 4x1x6, the compute shader takes 10+ minutes to compile! How strange.
For background, I'm trying to implement a light clustering algorithm (essentially the one shown here: http://www.aortiz.me/2018/12/21/CG.html#tiled-shading--forward), and my compute shader is this monster:
// TODO: Figure out optimal tile size, currently using a 16x9x24 subdivision
#define FLT_MAX 3.402823466e+38
#define FLT_MIN 1.175494351e-38
#define DBL_MAX 1.7976931348623158e+308
#define DBL_MIN 2.2250738585072014e-308
layout(local_size_x = 4, local_size_y = 9, local_size_z = 4) in;
// TODO: Change to reflect my light structure
// struct PointLight{
//     vec4 position;
//     vec4 color;
//     uint enabled;
//     float intensity;
//     float range;
// };
// TODO: Pack this more efficiently
struct Light {
    vec4 position;
    vec4 direction;
    vec4 ambientColor;
    vec4 diffuseColor;
    vec4 specularColor;
    vec4 attributes;
    vec4 intensity;
    ivec4 typeIndexAndFlags;
    // uint flags;
};
// Array containing offset and number of lights in a cluster
struct LightGrid {
    uint offset;
    uint count;
};
struct VolumeTileAABB {
    vec4 minPoint;
    vec4 maxPoint;
};
layout(std430, binding = 0) readonly buffer LightBuffer {
    Light data[];
} lightBuffer;
layout(std430, binding = 1) buffer clusterAABB {
    VolumeTileAABB cluster[];
};
layout(std430, binding = 2) buffer screenToView {
    mat4 inverseProjection;
    uvec4 tileSizes;
    uvec2 screenDimensions;
};
// layout(std430, binding = 3) buffer lightSSBO{
//     PointLight pointLight[];
// };
// SSBO of active light indices
layout(std430, binding = 4) buffer lightIndexSSBO {
    uint globalLightIndexList[];
};
layout(std430, binding = 5) buffer lightGridSSBO {
    LightGrid lightGrid[];
};
layout(std430, binding = 6) buffer globalIndexCountSSBO {
    uint globalIndexCount;
};
// Shared variables, shared between all invocations WITHIN A WORK GROUP
// TODO: See if I can use gl_WorkGroupSize for this, gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z
// A group-shared array containing all the lights being evaluated; its size is the thread count
shared Light sharedLights[4*9*4];
uniform mat4 viewMatrix;
bool testSphereAABB(uint light, uint tile);
float sqDistPointAABB(vec3 point, uint tile);
bool testConeAABB(uint light, uint tile);
float getLightRange(uint lightIndex);
bool isEnabled(uint lightIndex);
// Runs in batches of multiple Z slices at once
// In this implementation, 6 batches, since each thread group contains four z slices (24/4 = 6)
// We begin with each thread representing a cluster;
// then, in the light traversal loop, threads switch to representing lights,
// and switch again near the end to represent clusters
// NOTE: "Tiles" here actually means clusters; it's just a legacy name from tiled shading
void main(){
    // Reset every frame
    globalIndexCount = 0; // How many lights are active in this scene
    uint threadCount = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; // Number of threads in a group, same as local_size_x * local_size_y * local_size_z
    uint lightCount = lightBuffer.data.length(); // Total number of lights in the scene
    uint numBatches = uint((lightCount + threadCount - 1) / threadCount); // Number of batches of lights to process, i.e., number of passes
    uint tileIndex = gl_LocalInvocationIndex + gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * gl_WorkGroupID.z;
    // uint tileIndex = gl_GlobalInvocationID; // doesn't work, is uvec3
    // Local thread variables
    uint visibleLightCount = 0;
    uint visibleLightIndices[100]; // Local light index list, to be transferred to the global list
    // Every light is being checked against every cluster in the view frustum
    // TODO: Perform active cluster determination
    // Each individual thread is responsible for loading one light and writing it to shared memory so other threads can read it
    for(uint batch = 0; batch < numBatches; ++batch){
        uint lightIndex = batch * threadCount + gl_LocalInvocationIndex;
        // Prevent overflow by clamping to the last light, which is always null
        lightIndex = min(lightIndex, lightCount);
        // Populate the shared light array
        // NOTE: It is VERY important that lightBuffer.data not be referenced after this point,
        // since that is not thread-safe
        sharedLights[gl_LocalInvocationIndex] = lightBuffer.data[lightIndex];
        barrier(); // Synchronize reads/writes between invocations within a work group
        // Iterate within the current batch of lights
        for(uint light = 0; light < threadCount; ++light){
            if( isEnabled(light)){
                uint lightType = uint(sharedLights[light].typeIndexAndFlags[0]);
                if(lightType == 0){
                    // Point light
                    if( testSphereAABB(light, tileIndex) ){
                        visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                        visibleLightCount += 1;
                    }
                }
                else if(lightType == 1){
                    // Directional light
                    visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                    visibleLightCount += 1;
                }
                else if(lightType == 2){
                    // Spot light
                    if( testConeAABB(light, tileIndex) ){
                        visibleLightIndices[visibleLightCount] = batch * threadCount + light;
                        visibleLightCount += 1;
                    }
                }
            }
        }
    }
    // We want all invocations in the work group to have completed the light tests before continuing
    barrier();
    // Back to every thread representing a cluster
    // Add the light indices to the cluster's light index list
    uint offset = atomicAdd(globalIndexCount, visibleLightCount);
    for(uint i = 0; i < visibleLightCount; ++i){
        globalLightIndexList[offset + i] = visibleLightIndices[i];
    }
    // Update the light grid for each cluster
    lightGrid[tileIndex].offset = offset;
    lightGrid[tileIndex].count = visibleLightCount;
}
// Return whether or not the specified light intersects the specified tile (cluster)
bool testSphereAABB(uint light, uint tile){
    float radius = getLightRange(light);
    vec3 center = vec3(viewMatrix * sharedLights[light].position);
    float squaredDistance = sqDistPointAABB(center, tile);
    return squaredDistance <= (radius * radius);
}
// TODO: Different test for spot lights
// This has been done by using several AABBs for the spot-light cone; that could be a good approach, or even just use one to start.
bool testConeAABB(uint light, uint tile){
    // Light light = lightBuffer.data[lightIndex];
    // float innerAngleCos = light.attributes[0];
    // float outerAngleCos = light.attributes[1];
    // float innerAngle = acos(innerAngleCos);
    // float outerAngle = acos(outerAngleCos);
    // FIXME: Actually do something clever here
    return true;
}
// Get the range of the light at the specified index
float getLightRange(uint lightIndex){
    int lightType = sharedLights[lightIndex].typeIndexAndFlags[0];
    float range;
    if(lightType == 0){
        // Point light: solve the attenuation equation for the distance at which
        // brightness falls below the cutoff
        float brightness = 0.01; // cutoff for end of range
        float c = sharedLights[lightIndex].attributes.x;
        float lin = sharedLights[lightIndex].attributes.y;
        float quad = sharedLights[lightIndex].attributes.z;
        range = (-lin + sqrt(lin*lin - 4.0 * c * quad + (4.0/brightness) * quad)) / (2.0 * quad);
    }
    else if(lightType == 1){
        // Directional light
        range = FLT_MAX;
    }
    else{
        // Spot light
        range = FLT_MAX;
    }
    return range;
}
// Whether the light at the specified index is enabled
bool isEnabled(uint lightIndex){
    uint flags = sharedLights[lightIndex].typeIndexAndFlags[2];
    return (flags & 1) != 0; // test the low bit; the original (flags | 1) != 0 was always true
}
// Get squared distance from a point to the AABB of the specified tile (cluster)
float sqDistPointAABB(vec3 point, uint tile){
    float sqDist = 0.0;
    VolumeTileAABB currentCell = cluster[tile];
    cluster[tile].maxPoint[3] = tile;
    for(int i = 0; i < 3; ++i){
        float v = point[i];
        if(v < currentCell.minPoint[i]){
            sqDist += (currentCell.minPoint[i] - v) * (currentCell.minPoint[i] - v);
        }
        if(v > currentCell.maxPoint[i]){
            sqDist += (v - currentCell.maxPoint[i]) * (v - currentCell.maxPoint[i]);
        }
    }
    return sqDist;
}
Edit: Whoops, lost the bottom part of this!
What I don't understand is why changing the size of the work groups affects compilation time at all. It sort of defeats the point of the algorithm if my work group sizes are too small for the compute shader to run efficiently, so I'm hoping there's something I'm missing.
As a last note, I'd like to avoid using glGetProgramBinary as a solution. Not only because it merely circumvents the issue instead of solving it, but because pre-compiling shaders will not play nicely with the engine's current architecture.
So, I'm figuring that this must be a bug in the compiler, since I've replaced the loop in my sqDistPointAABB function with:
vec3 minPoint = currentCell.minPoint.xyz;
vec3 maxPoint = currentCell.maxPoint.xyz;
vec3 t1 = vec3(lessThan(point, minPoint));
vec3 t2 = vec3(greaterThan(point, maxPoint));
vec3 sqDist = t1 * (minPoint - point) * (minPoint - point) + t2 * (maxPoint - point) * (maxPoint - point);
return sqDist.x + sqDist.y + sqDist.z;
And it compiles just fine now, in less than a second! So strange.

Problem with path tracing shadow rays, spheres all black

So I'm making a ray tracer in OpenGL, fully shader based, and I'm struggling to find the problem with my shadow rays. If I multiply the radiance of the object by the shadow ray's output, it seems like only the "edge" of the sphere is lit up.
I have checked the code multiple times without finding where the problem comes from.
This is what I get (image not included). The shadow ray function:
vec3 TraceShadowRay(vec3 hitPoint, vec3 normal, Object objects[3])
{
    Light pointLight;
    pointLight.position = vec3(0, 80, 0);
    pointLight.intensity = 2;
    Ray ShadowRay;
    ShadowRay.origin = hitPoint + normal * 1e-4;
    ShadowRay.dir = normalize(pointLight.position - ShadowRay.origin);
    ShadowRay.t = 100000;
    //ShadowRay.dir = vec3(0, 1, 0);
    for(int i = 0; i < 3; ++i)
    {
        if(objects[i].type == 0)
        {
            if(interectSphere(objects[i].position, objects[i].radius, ShadowRay))
            {
                return vec3(0);
            }
        }
        if(objects[i].type == 1)
        {
            if(intersectPlane(objects[i].normal, objects[i].position, ShadowRay))
            {
                return vec3(0);
            }
        }
    }
    float AngleNormalShadow = dot(ShadowRay.dir, normal);
    AngleNormalShadow = clamp(AngleNormalShadow, 0.0, 1.0); // clamp returns a value; it does not modify its argument in place
    return GetLight(ShadowRay.origin, pointLight);// * AngleNormalShadow;
}
The GetLight function:
vec3 GetLight(vec3 origin, Light light)
{
    return vec3(1, 1, 1) * light.intensity;
    //float dist = sqrt( ((origin.x - light.position.x) * (origin.x - light.position.x)) + ((origin.y - light.position.y) * (origin.y - light.position.y)));
    //return (vec3(1, 1, 1) * light.intensity) / (4 * M_PI * ((origin - light.position).length * (origin - light.position).length));
}
The interectSphere function:
bool interectSphere(const vec3 center, float radius, inout Ray r)
{
    vec3 o = r.origin;
    vec3 d = r.dir;
    vec3 v = o - center;
    float b = 2 * dot(v, d);
    float c = dot(v, v) - radius*radius;
    float delta = b*b - 4 * c;
    if(delta < 1e-4)
        return false;
    float t1 = (-b - sqrt(delta))/2;
    float t2 = (-b + sqrt(delta))/2;
    if(t1 < t2)
    {
        r.t = t1;
        r.t2 = t2;
    }
    else if(t2 < t1)
    {
        r.t = t2;
        r.t2 = t1;
    }
    r.reflectionNormal = normalize((r.origin + r.dir * r.t) - center);
    return true;
}
The expected result is a nicely shaded sphere, with light coming from the top of the spheres.
Could it be a missing negation? Looks like interectSphere() returns true when there is a collision, but the calling code in TraceShadowRay() bails out when it returns true.
old:

if(interectSphere(objects[i].position, objects[i].radius, ShadowRay))
{
    return vec3(0);
}

new:

if(!interectSphere(objects[i].position, objects[i].radius, ShadowRay))
{
    return vec3(0);
}

Skinning with Assimp.Net and OpenTK

I'm trying to implement skeletal animation using Assimp.Net and OpenTK and have been following this tutorial, but I cannot get it to work.
The model appears fine with identity matrices but is terribly garbled when using the transforms I generate from Assimp.
I suspect the issue is the way I am combining all of the matrices, or that there is a difference in OpenTK that I am not realising. I have made similar adjustments to the tutorial as suggested here: Matrix calculations for gpu skinning
but it is still garbled, just differently. I have also tried converting all Assimp matrices to OpenTK matrices before performing any multiplication. These are the areas of the code related to the matrices; I can provide more if needed:
Matrix Conversion
public static OpenTK.Matrix4 TKMatrix(Assimp.Matrix4x4 input)
{
    // Note: this transposes as it converts. Assimp stores row-major matrices
    // meant for column vectors (translation in the fourth column), while
    // OpenTK expects row vectors (translation in the bottom row).
    return new OpenTK.Matrix4(input.A1, input.B1, input.C1, input.D1,
                              input.A2, input.B2, input.C2, input.D2,
                              input.A3, input.B3, input.C3, input.D3,
                              input.A4, input.B4, input.C4, input.D4);
}
Storing the Global Inverse
public class LoaderMesh
{
    public Scene mScene;
    public Mesh mMesh;
    public OpenTK.Matrix4 GlobalInverseTransform { get; set; }

    public LoaderMesh(Scene aiScene, Mesh aiMesh)
    {
        mScene = aiScene;
        mMesh = aiMesh;
        GlobalInverseTransform = Util.TKMatrix(mScene.RootNode.Transform);
        GlobalInverseTransform.Invert();
    }
Loading the bones
public void LoadBones(List<VBO.Vtx_BoneWeight.Vtx> boneData)
{
    for (uint iBone = 0; iBone < mMesh.BoneCount; ++iBone)
    {
        uint boneIndex = 0;
        String bonename = mMesh.Bones[iBone].Name;
        if (!BoneMapping.ContainsKey(bonename))
        {
            boneIndex = (uint)NumBones;
            NumBones++;
            BoneInfo bi = new BoneInfo();
            BoneInfos.Add(bi);
        }
        else
        {
            boneIndex = BoneMapping[bonename];
        }
        BoneMapping[bonename] = boneIndex;
        BoneInfos[(int)boneIndex].OffsetMatrix = Util.TKMatrix(mMesh.Bones[iBone].OffsetMatrix);
        for (uint iWeight = 0; iWeight < mMesh.Bones[iBone].VertexWeightCount; iWeight++)
        {
            uint VertexID = /*m_Entries[MeshIndex].BaseVertex*/ mMesh.Bones[iBone].VertexWeights[iWeight].VertexID;
            float Weight = mMesh.Bones[iBone].VertexWeights[iWeight].Weight;
            VBO.Vtx_BoneWeight.Vtx vtx = boneData[(int)VertexID];
            VBO.Vtx_BoneWeight.AddWeight(ref vtx, boneIndex, Weight);
            boneData[(int)VertexID] = vtx;
        }
    }
}
Calculating the Transforms
public void ReadNodeHierarchy(float animationTime, Node aiNode, ref OpenTK.Matrix4 parentTransform)
{
    String NodeName = aiNode.Name;
    Animation animation = mScene.Animations[0];
    OpenTK.Matrix4 NodeTransformation = Util.TKMatrix(aiNode.Transform);
    NodeAnimationChannel nodeAnim = FindNodeAnim(animation, NodeName);
    OpenTK.Matrix4 localTransform = OpenTK.Matrix4.Identity;
    if (nodeAnim != null)
    {
        // Interpolate scaling and generate scaling transformation matrix
        Vector3D Scaling = new Vector3D();
        CalcInterpolatedScaling(ref Scaling, animationTime, nodeAnim);
        Console.WriteLine("Scaling: " + Scaling.ToString());
        OpenTK.Matrix4 ScalingM = Util.TKMatrix(Matrix4x4.FromScaling(Scaling));
        // Interpolate rotation and generate rotation transformation matrix
        Quaternion RotationQ = new Quaternion();
        CalcInterpolatedRotation(ref RotationQ, animationTime, nodeAnim);
        Console.WriteLine("Rotation: " + RotationQ.ToString());
        OpenTK.Matrix4 RotationM = Util.TKMatrix(RotationQ.GetMatrix());
        // Interpolate translation and generate translation transformation matrix
        Vector3D Translation = new Vector3D();
        CalcInterpolatedPosition(ref Translation, animationTime, nodeAnim);
        Console.WriteLine("Transform: " + Translation.ToString());
        OpenTK.Matrix4 TranslationM = Util.TKMatrix(Matrix4x4.FromTranslation(Translation));
        // Combine the above transformations
        NodeTransformation = TranslationM * RotationM * ScalingM;
        localTransform = TranslationM * RotationM * ScalingM;
    }
    OpenTK.Matrix4 GlobalTransformation = parentTransform * NodeTransformation;
    OpenTK.Matrix4 parentPass = OpenTK.Matrix4.Identity;
    if (BoneMapping.ContainsKey(NodeName) == true)
    {
        uint BoneIndex = BoneMapping[NodeName];
        //BoneInfos[(int)BoneIndex].FinalTransformation = GlobalInverseTransform * BoneInfos[(int)BoneIndex].OffsetMatrix * GlobalTransformation;
        BoneInfos[(int)BoneIndex].NodeTransformation = parentTransform * Util.TKMatrix(aiNode.Transform) * localTransform;
        parentPass = BoneInfos[(int)BoneIndex].NodeTransformation;
        BoneInfos[(int)BoneIndex].FinalTransformation = GlobalInverseTransform * BoneInfos[(int)BoneIndex].NodeTransformation * BoneInfos[(int)BoneIndex].OffsetMatrix;
    }
    for (uint i = 0; i < aiNode.ChildCount; i++)
    {
        ReadNodeHierarchy(animationTime, aiNode.Children[i], ref parentPass);
    }
}
And this is the vertex shader code:
#version 400
layout(location = 0) in vec4 vert;
layout(location = 1) in vec4 normal;
layout(location = 2) in vec4 texCoord;
layout(location = 3) in vec4 tanCoord;
layout(location = 4) in ivec4 boneIDs;
layout(location = 5) in vec4 boneWeights;
uniform mat4 projectionMtx;
uniform mat4 viewMtx;
uniform mat4 modelMtx;
const int MAX_BONES = 100;
uniform mat4 bones[MAX_BONES];
out vec3 positionFrg_CS;
out vec3 normalFrg_CS;
out vec3 tanCoordFrg_CS;
out vec3 bitCoordFrg_CS;
out vec4 texCoordFrg;
void main()
{
    mat4 BoneTransform = bones[boneIDs[0]] * boneWeights[0];
    BoneTransform += bones[boneIDs[1]] * boneWeights[1];
    BoneTransform += bones[boneIDs[2]] * boneWeights[2];
    BoneTransform += bones[boneIDs[3]] * boneWeights[3];
    gl_Position = projectionMtx * viewMtx * modelMtx * BoneTransform * vert;
}
Is there anything I am doing wrong multiplying the matrices together?
In reply to livin_amuk, I have got this working, at least well enough for my needs, however I fixed this 6 months ago and my memory is vague...
If I remember correctly, my main issue was the bone/vertex indices; I think I messed up the BaseVertex because I was in a rush. Here is my current working LoadBones function.
public void LoadBones(List<VBO.Vtx_BoneWeight.Vtx> boneData, SubMesh mesh)
{
    for (int iBone = 0; iBone < mesh.mMesh.BoneCount; ++iBone)
    {
        uint boneIndex = 0;
        String bonename = mesh.mMesh.Bones[iBone].Name;
        if (!BoneMapping.ContainsKey(bonename))
        {
            boneIndex = (uint)NumBones;
            NumBones++;
            BoneInfo bi = new BoneInfo();
            BoneInfos.Add(bi);
            // Note: I have these two lines inside the if statement; the original tutorial does not. Not sure if it makes a difference.
            BoneMapping[bonename] = boneIndex;
            BoneInfos[(int)boneIndex].OffsetMatrix = AssimpToOpenTK.TKMatrix(mesh.mMesh.Bones[iBone].OffsetMatrix);
        }
        else
        {
            boneIndex = BoneMapping[bonename];
        }
        for (int iWeight = 0; iWeight < mesh.mMesh.Bones[iBone].VertexWeightCount; iWeight++)
        {
            // My question had mesh.BaseVertex commented out. It is important!
            long VertexID = mesh.BaseVertex + mesh.mMesh.Bones[iBone].VertexWeights[iWeight].VertexID;
            float Weight = mesh.mMesh.Bones[iBone].VertexWeights[iWeight].Weight;
            VBO.Vtx_BoneWeight.Vtx vtx = boneData[(int)VertexID];
            VBO.Vtx_BoneWeight.AddWeight(ref vtx, boneIndex, Weight);
            boneData[(int)VertexID] = vtx;
        }
    }
}
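For context, BaseVertex is the running offset of each sub-mesh's first vertex within the combined vertex buffer. A hypothetical sketch of how it might be accumulated while loading the scene (the SubMesh/BaseVertex names follow the code above; the loop itself is illustrative, not from the original project):

// Illustrative only: record where each sub-mesh's vertices start in the
// combined buffer, so bone weights land on the right global vertex IDs.
int runningVertexCount = 0;
foreach (SubMesh mesh in subMeshes)
{
    mesh.BaseVertex = runningVertexCount;
    runningVertexCount += mesh.mMesh.VertexCount;
}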
I also had the transforms backwards. Here is the ReadNodeHierarchy function.
public void ReadNodeHierarchy(float animationTime, Node aiNode, ref OpenTK.Matrix4 parentTransform)
{
    String NodeName = aiNode.Name;
    Animation animation = mScene.Animations[0];
    OpenTK.Matrix4 NodeTransformation = AssimpToOpenTK.TKMatrix(aiNode.Transform);
    NodeAnimationChannel nodeAnim = FindNodeAnim(animation, NodeName);
    if (nodeAnim != null)
    {
        // Interpolate scaling and generate scaling transformation matrix
        Vector3D Scaling = new Vector3D();
        CalcInterpolatedScaling(ref Scaling, animationTime, nodeAnim);
        OpenTK.Matrix4 ScalingM = AssimpToOpenTK.TKMatrix(Matrix4x4.FromScaling(Scaling));
        // Interpolate rotation and generate rotation transformation matrix
        Quaternion RotationQ = new Quaternion();
        CalcInterpolatedRotation(ref RotationQ, animationTime, nodeAnim);
        OpenTK.Matrix4 RotationM = AssimpToOpenTK.TKMatrix(RotationQ.GetMatrix());
        // Interpolate translation and generate translation transformation matrix
        Vector3D Translation = new Vector3D();
        CalcInterpolatedPosition(ref Translation, animationTime, nodeAnim);
        OpenTK.Matrix4 TranslationM = AssimpToOpenTK.TKMatrix(Matrix4x4.FromTranslation(Translation));
        // Combine the above transformations.
        // All the local transform stuff is gone. The order of the transforms is
        // reversed from my question AND the original tutorial.
        NodeTransformation = ScalingM * RotationM * TranslationM;
    }
    // Also reversed.
    OpenTK.Matrix4 GlobalTransformation = NodeTransformation * parentTransform;
    //GlobalTransformation = OpenTK.Matrix4.Identity;
    if (BoneMapping.ContainsKey(NodeName) == true)
    {
        uint BoneIndex = BoneMapping[NodeName];
        // Also, also, reversed.
        BoneInfos[(int)BoneIndex].FinalTransformation = BoneInfos[(int)BoneIndex].OffsetMatrix * GlobalTransformation * GlobalInverseTransform;
    }
    for (int i = 0; i < aiNode.ChildCount; i++)
    {
        ReadNodeHierarchy(animationTime, aiNode.Children[i], ref GlobalTransformation);
    }
}
The Matrix conversion at the top is also correct, as is the Shader code.
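If it helps to see why every multiplication had to be reversed: OpenTK follows the row-vector convention (vectors multiply on the left, translation lives in the bottom row), so composite transforms read left to right, the opposite of the column-vector math in the original C++ tutorial. A minimal standalone sketch, assuming the OpenTK 3.x API:

using OpenTK;

static class MatrixOrderDemo
{
    static void Main()
    {
        // Row-vector convention: v' = v * M, so the leftmost matrix applies first.
        Matrix4 scale = Matrix4.CreateScale(2f);
        Matrix4 translate = Matrix4.CreateTranslation(1f, 0f, 0f);
        Matrix4 scaleThenTranslate = scale * translate; // scale first, then translate
        Vector4 v = new Vector4(1f, 0f, 0f, 1f);
        Vector4 result = Vector4.Transform(v, scaleThenTranslate);
        System.Console.WriteLine(result); // (3, 0, 0, 1): x scaled to 2, then offset by 1
    }
}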

Compute shaders: error in the initialization of textures

I have an image2DArray with 7 slices in my compute shader.
I can write to it with imageStore without problems and also display these textures.
My problem comes with the initialization: I try to initialize my textures but can't. I use a loop for the initialization:
for(int i = 0; i < N; i++){
    imageStore( outputTexture, ivec3(texel, i), vec4(0));
}
When N = 7, nothing is displayed, but when N < 7 everything works and my textures are initialized.
Can someone explain why I can't correctly initialize my image2DArray?
Edit:
Here is how I tested this: I write to all slices of the texture and display them. That works fine, but data from the previous frame persists if I don't initialize the texture. So I initialize all pixels of every slice to 0, but then nothing is displayed anymore when N = 7.
Some code:
#version 430 compatibility
layout(rgba8) coherent uniform image2DArray outputTexture;
...
void main(){
    ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
    ivec2 outSize = imageSize( outputTexture ).xy;
    if( texel.x >= outSize.x || texel.y >= outSize.y )
        return;
    initializeMeshSet( meshSet );
    vec4 pWorld = texelFetch(gBuffer[0], texel, 0);
    pWorld /= pWorld.w;
    vec4 nWorld = texelFetch(gBuffer[1], texel, 0);
    nWorld /= nWorld.w;
    if( length(nWorld.xyz) < 0.1 ){
        for(int i = 0; i < 4; i++){
            imageStore( outputTexture, ivec3(texel, i), vec4(0));
        }
        return;
    }
    if(nbFrame == 0){
        float value = treatment(texel, pWorld, nWorld.xyz, outSize.x);
        imageStore( outputTexture, ivec3(texel, 0), vec4(vec3(value), 1.0));
        imageStore( outputTexture, ivec3(texel, 1), vec4(0.0, 0.0, 0.0, 1.0));
    }
    else if(nbFrame == 1){
        float value = treatment2(texel, pWorld, nWorld.xyz, outSize.x);
        vec3 previousValue = imageLoad(outputTexture, ivec3(texel, 1)).xyz * (nbFrame - 1);
        value += previousValue;
        value /= nbFrame;
        imageStore( outputTexture, ivec3(texel, 1), vec4(vec3(value), 1.0));
    }
}
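No answer is recorded here, but one host-side detail worth double-checking (a guess, not a confirmed diagnosis) is that the texture is bound as a layered image; if the layered flag is GL_FALSE, only a single layer of the array is accessible to imageStore:

// Hypothetical binding sketch: layered = GL_TRUE exposes all slices of the
// 2D array texture to imageStore; GL_FALSE exposes only one selected layer.
glBindImageTexture(0,             // image unit
                   textureID,     // the GL_TEXTURE_2D_ARRAY texture (name assumed)
                   0,             // mip level
                   GL_TRUE,       // layered: bind ALL layers
                   0,             // layer index (ignored when layered == GL_TRUE)
                   GL_READ_WRITE, // the shader both loads and stores
                   GL_RGBA8);     // must match layout(rgba8) in the shader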

GLSL linker error (Sampler needs to be a uniform (global or parameter to main))

We have a GLSL fragment shader; the problem is in this code:
vec4 TFSelection(StrVolumeColorMap volumeColorMap, vec4 textureCoordinate)
{
    vec4 finalColor = vec4(0.0);
    if(volumeColorMap.TransferFunctions[0].numberOfBits == 0)
    {
        return texture(volumeColorMap.TransferFunctions[0].TransferFunctionID, textureCoordinate.x);
    }
    if(textureCoordinate.x == 0)
        return finalColor;
    float deNormalize = textureCoordinate.x * 65535/*255*/;
    for(int i = 0; i < volumeColorMap.TransferFunctions.length(); i++)
    {
        int NormFactor = volumeColorMap.TransferFunctions[i].startBit + volumeColorMap.TransferFunctions[i].numberOfBits;
        float minval = CalculatePower(2, volumeColorMap.TransferFunctions[i].startBit);
        if(deNormalize >= minval)
        {
            float maxval = CalculatePower(2, NormFactor);
            if(deNormalize < maxval)
            {
                //float tempPower = CalculatePower(2, NormFactor);
                float coord = deNormalize / maxval/*tempPower*/;
                return texture(volumeColorMap.TransferFunctions[i].TransferFunctionID, coord);
            }
        }
    }
    return finalColor;
}
When we compile and link the shader, this message is logged:

Sampler needs to be a uniform (global or parameter to main), need to inline function or resolve conditional expression
With a simple change, the shader may link successfully, e.g. changing

float coord = deNormalize / maxval;

to

float coord = deNormalize;
Driver: NVIDIA 320.49
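The error message itself points at the constraint: the GLSL compiler must be able to resolve every texture() call to a specific sampler uniform at compile time, and samplers stored in a struct and reached through function parameters and a loop index can defeat that on some drivers. A minimal workaround sketch with illustrative names (not the original shader's): keep the samplers in a plain uniform array and select them with compile-time-constant indices:

#define NUM_TF 4
uniform sampler1D uTransferFunctions[NUM_TF]; // hypothetical stand-in for the struct members

vec4 sampleTF(int index, float coord)
{
    // Unrolled selection: each texture() call uses a constant array index,
    // which the compiler can always resolve to a specific sampler.
    if (index == 0) return texture(uTransferFunctions[0], coord);
    if (index == 1) return texture(uTransferFunctions[1], coord);
    if (index == 2) return texture(uTransferFunctions[2], coord);
    return texture(uTransferFunctions[3], coord);
}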