How to avoid extra calculations in fragment shader - glsl

im trying to fix this shader. the effects is a radial blur around a point position, passing from the cpu in a array. The calculations works fine for each point and generates de effect, but as you can see in this picture, for each loop the shader keep generate samples, and i dont know how to avoid. i only want the blur for each point in the array
#version 150
in vec2 varyingtexcoord;
uniform sampler2DRect tex0;
uniform int size;
float exposure = 0.79;
float decay = 0.9;
float density = .9;
float weight = .1;
int samples = 25;
out vec4 fragColor;
const int MAX_SAMPLES = 25;
const int N = 3;
uniform vec2 ligthPos [N];
int a = 1;
vec4 halo(vec2 pos){
float illuminationDecay = 1.2;
vec2 texCoord = varyingtexcoord;
vec2 current = pos.xy;
vec2 deltaTextCoord = texCoord - current;
deltaTextCoord *= 1.0 / float(samples) * density;
vec4 color = texture(tex0, texCoord);
for(int i=0; i < MAX_SAMPLES; i++){
texCoord -= deltaTextCoord;
vec4 sample = texture(tex0, texCoord);
sample *= illuminationDecay * weight;
color += sample;
illuminationDecay *= decay;
}
return color;
}
void main(){
vec4 accum = vec4(0.0);
for(int e = 0; e < N;e++){
vec2 current =ligthPos[e];
accum += halo(current);
}
fragColor = (accum) * exposure;
}
this is what happen:

Related

Slower read/write to shared memory in CUDA than in Compute Shader [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 4 months ago.
Improve this question
I am currently comparing the implementation of a n-body simulation in the GPU using CUDA and OpenGL (Compute Shaders) for a project, but I run into a problem using shared memory.
First I implemented the version with no shared memory as follows:
CUDA
#include "helper_math.h"
//...
__device__ float dist2(float3 A, float3 B)
{
float3 C = A - B;
return dot(C, C);
}
__global__ void n_body_vel_calc(float3* positions, float3* velocities,
unsigned numParticles, float mass, float deltaTime)
{
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= numParticles)
return;
const float G = 6.6743e-11f;
float3 cur_position = positions[i];
float3 force = make_float3(0.0f, 0.0f, 0.0f);
for (unsigned j = 0; j < numParticles; ++j)
{
if (i == j)
continue;
float3 neighbor_position = positions[j];
float inv_distance2 = 1.0f / dist2(cur_position, neighbor_position);
float3 direction = normalize(neighbor_position - cur_position);
force += G * mass * mass * inv_distance2 * direction;
}
float3 acceleration = force / mass;
velocities[i] += acceleration * deltaTime;
}
OpenGL
// glBufferStorage(GL_SHADER_STORAGE_BUFFER, ..., ..., ...);
#version 460
layout(local_size_x=128) in;
layout(location = 0) uniform int numParticles;
layout(location = 1) uniform float mass;
layout(location = 2) uniform float dt;
layout(std430, binding=0) buffer pblock { vec3 positions[]; };
layout(std430, binding=1) buffer vblock { vec3 velocities[]; };
float dist2(vec3 A, vec3 B)
{
vec3 C = A - B;
return dot( C, C );
}
void main()
{
int i = int(gl_GlobalInvocationID);
if (i >= numParticles)
return;
const float G = 6.6743e-11f;
vec3 cur_position = positions[i];
vec3 force = vec3(0.0);
for (uint j = 0; j < numParticles; ++j)
{
if (i == j)
continue;
vec3 neighbor_position = positions[j];
float inv_distance2 = 1.0 / dist2(cur_position, neighbor_position);
vec3 direction = normalize(neighbor_position - cur_position);
force += G * mass * mass * inv_distance2 * direction;
}
vec3 acceleration = force / mass;
velocities[i] += acceleration * dt;
}
With the same number of threads per group, number of particles and the same number of times executing the kernel, the CUDA version takes 82 ms and OpengGL takes 70 ms. Weird thing that there speed is much different, but I can attribute that to GLSL having geometric operations optimized somehow.
My problem comes next, when I write the versions with shared memory, which should increase the performance by not reading from global memory multiple times.
CUDA
__global__ void n_body_vel_calc(float3* positions, float3 * velocities, unsigned workgroupSize,
unsigned numParticles, float mass, float deltaTime)
{
// size of array == workgroupSize
extern __shared__ float3 temp_tile[];
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= numParticles)
return;
const float G = 6.6743e-11f;
float3 cur_position = positions[i];
float3 force = make_float3(0.0f, 0.0f, 0.0f);
for (unsigned tile = 0; tile < numParticles; tile += workgroupSize)
{
temp_tile[threadIdx.x] = positions[tile + threadIdx.x];
__syncthreads();
for (unsigned j = 0; j < workgroupSize; ++j)
{
if (i == j || ((tile + j) >= numParticles))
continue;
float3 neighbor_position = temp_tile[j];
float inv_distance2 = 1.0f / dist2(cur_position, neighbor_position);
float3 direction = normalize(neighbor_position - cur_position);
force += G * mass * mass * inv_distance2 * direction;
}
__syncthreads();
}
float3 acceleration = force / mass;
velocities[i] += acceleration * deltaTime;
}
OpenGL
#version 460
layout(local_size_x=128) in;
layout(location = 0) uniform int numParticles;
layout(location = 1) uniform float mass;
layout(location = 2) uniform float dt;
layout(std430, binding=0) buffer pblock { vec3 positions[]; };
layout(std430, binding=1) buffer vblock { vec3 velocities[]; };
// Shared variables
shared vec3 temp_tile[gl_WorkGroupSize.x];
void main()
{
int i = int(gl_GlobalInvocationID);
if (i >= numParticles)
return;
const float G = 6.6743e-11f;
vec3 cur_position = positions[i];
vec3 force = vec3(0.0);
for (uint tile = 0; tile < numParticles; tile += gl_WorkGroupSize.x)
{
temp_tile[gl_LocalInvocationIndex] = positions[tile + gl_LocalInvocationIndex];
groupMemoryBarrier();
barrier();
for (uint j = 0; j < gl_WorkGroupSize.x; ++j)
{
if (i == j || (tile + j) >= numParticles)
continue;
vec3 neighbor_position = temp_tile[j];
float inv_distance2 = 1.0 / dist2(cur_position, neighbor_position);
vec3 direction = normalize(neighbor_position - cur_position);
force += G * mass * mass * inv_distance2 * direction;
}
groupMemoryBarrier();
barrier();
}
vec3 acceleration = force / mass;
velocities[i] += acceleration * dt;
}
My principal problem comes next. With the same parameters as above, the CUDA version increases its execution time to 128 ms (greatly diminishing its performance), and the OpenGL one took 68 (a small improvement over the other version).
I have compiled the CUDA version with the toolkit version 11.7 and 10.0 with MSVC V143 and V142 and the results are more or less the same.
Why the OpenGL implementation is faster with shared memory, but the CUDA one its not? Am I missing something?

Why are my Ray march fragment shader refelction texture lookups slowing my frame rate?

I’ve written a Fragment shader in GLSL, using shader toy.
Link : https://www.shadertoy.com/view/wtGSzy
most of it works, but when I enable texture lookups in the reflection function, the performance drops from 60FPS to 5~FPS.
The code in question is on lines 173 - 176
if(SDFObjectToDraw.texChannelID == 0)
col = texture(iChannel0, uv);
if(SDFObjectToDraw.texChannelID == 1)
col = texture(iChannel1, uv);
This same code can bee seen in my rayMarch function (lines 274-277) and works fine for colouring my objects. It only causes issues in the reflection function.
My question is, why are my texture lookups, in the reflection code, dropping my performance this much and what can I do to improve it?
/**
* Return the normalized direction to march in from the eye point for a single pixel.
*
* fieldOfView: vertical field of view in degrees
* size: resolution of the output image
* fragCoord: the x,y coordinate of the pixel in the output image
*/
vec3 rayDirection(float fieldOfView, vec2 size, vec2 fragCoord) {
vec2 xy = fragCoord - size / 2.0;
float z = size.y / tan(radians(fieldOfView) / 2.0);
return normalize(vec3(xy, -z));
}
float start = 0.0;
vec3 eye = vec3(0,0,5);
int MAX_MARCHING_STEPS = 255;
float EPSILON = 0.00001;
float end = 10.0;
const uint Shpere = 1u;
const uint Box = 2u;
const uint Plane = 4u;
vec3 lightPos = vec3(-10,0,5);
#define M_PI 3.1415926535897932384626433832795
const int SDF_OBJECT_COUNT = 4;
struct SDFObject
{
uint Shape;
vec3 Position;
float Radius;
int texChannelID;
float Ambiant;
float Spec;
float Diff;
vec3 BoxSize;
bool isMirror; //quick hack to get refletions working
};
SDFObject SDFObjects[SDF_OBJECT_COUNT] = SDFObject[SDF_OBJECT_COUNT](
SDFObject(Shpere, vec3(2,0,-3),1.0,0,0.2,0.2,0.8, vec3(0,0,0),true)
,SDFObject(Shpere, vec3(-2,0,-3),1.0,0,0.1,1.0,1.0, vec3(0,0,0),false)
,SDFObject(Box, vec3(0,0,-6),0.2,1,0.2,0.2,0.8, vec3(1.0,0.5,0.5),false)
,SDFObject(Plane, vec3(0,0,0),1.0,1,0.2,0.2,0.8, vec3(0.0,1.0,0.0),false)
);
float shereSDF(vec3 p, SDFObject o)
{
return length(p-o.Position)-o.Radius;
}
float boxSDF(vec3 pointToTest, vec3 boxBoundery, float radius, vec3 boxPos)
{
vec3 q = abs(pointToTest - boxPos) - boxBoundery;
return length(max(q,0.0)) + min(max(q.x, max(q.y,q.z)) ,0.0) -radius;
}
float planeSDF(vec3 p, vec4 n, vec3 Pos)
{
return dot(p-Pos, n.xyz) + n.w;
}
bool IsShadow(vec3 LightPos, vec3 HitPos)
{
bool isShadow = false;
vec3 viewRayDirection = normalize(lightPos- HitPos) ;
float depth = start;
vec3 hitpoint;
for(int i=0; i<MAX_MARCHING_STEPS; i++)
{
hitpoint = (HitPos+ depth * viewRayDirection);
float dist = end;
for(int j =0; j<SDF_OBJECT_COUNT; j++)
{
float distToObjectBeingConsidered;
if(SDFObjects[j].Shape == Shpere)
distToObjectBeingConsidered = shereSDF(hitpoint, SDFObjects[j]);
if(SDFObjects[j].Shape == Box)
distToObjectBeingConsidered = boxSDF(hitpoint, SDFObjects[j].BoxSize , SDFObjects[j].Radius, SDFObjects[j].Position);
if(SDFObjects[j].Shape == Plane)
distToObjectBeingConsidered= planeSDF(hitpoint, vec4(SDFObjects[j].BoxSize, SDFObjects[j].Radius), SDFObjects[j].Position);
if( distToObjectBeingConsidered < dist)
{
dist = distToObjectBeingConsidered;
}
}
if(dist < EPSILON)
{
isShadow = true;
}
depth += dist;
if(depth >= end)
{
isShadow = false;
}
}
return isShadow;
}
vec3 MirrorReflection(vec3 inComingRay, vec3 surfNormal, vec3 HitPos, int objectIndexToIgnore)
{
vec3 returnCol;
vec3 reflectedRay = reflect(inComingRay, surfNormal);
vec3 RayDirection = normalize(reflectedRay) ;
float depth = start;
vec3 hitpoint;
int i;
for(i=0; i<MAX_MARCHING_STEPS; i++)
{
hitpoint = (HitPos+ depth * RayDirection);
SDFObject SDFObjectToDraw;
float dist = end;
for(int j =0; j<SDF_OBJECT_COUNT; j++)
{
float distToObjectBeingConsidered;
if(SDFObjects[j].Shape == Shpere)
distToObjectBeingConsidered = shereSDF(hitpoint, SDFObjects[j]);
if(SDFObjects[j].Shape == Box)
distToObjectBeingConsidered = boxSDF(hitpoint, SDFObjects[j].BoxSize , SDFObjects[j].Radius, SDFObjects[j].Position);
if(SDFObjects[j].Shape == Plane)
distToObjectBeingConsidered= planeSDF(hitpoint, vec4(SDFObjects[j].BoxSize, SDFObjects[j].Radius), SDFObjects[j].Position);
if( distToObjectBeingConsidered < dist && j!= objectIndexToIgnore )// D > 0.0)
{
dist = distToObjectBeingConsidered;
SDFObjectToDraw = SDFObjects[j];
}
}
if(dist < EPSILON)
{
vec3 normal =normalize(hitpoint-SDFObjectToDraw.Position);
float u = 0.5+ (atan(normal.z, normal.x)/(2.0*M_PI));
float v = 0.5+ (asin(normal.y)/(M_PI));
vec2 uv =vec2(u,v);
vec4 col = vec4(0,0.5,0.5,0);
///>>>>>>>>>>>> THESE LINES ARE broken, WHY?
//if(SDFObjectToDraw.texChannelID == 0)
//col = texture(iChannel0, uv);
//if(SDFObjectToDraw.texChannelID == 1)
//col = texture(iChannel1, uv);
vec3 NormalizedDirToLight = normalize(lightPos-SDFObjectToDraw.Position);
float theta = dot(normal,NormalizedDirToLight);
vec3 reflectionOfLight = reflect(NormalizedDirToLight, normal);
vec3 viewDir = normalize(SDFObjectToDraw.Position);
float Spec = dot(reflectionOfLight, viewDir);
if(IsShadow(lightPos, hitpoint))
{
returnCol= (col.xyz*SDFObjectToDraw.Ambiant);
}
else
{
returnCol= (col.xyz*SDFObjectToDraw.Ambiant)
+(col.xyz * max(theta *SDFObjectToDraw.Diff, SDFObjectToDraw.Ambiant));
}
break;
}
depth += dist;
if(depth >= end)
{
//should look up bg texture here but cant be assed right now
returnCol = vec3(1.0,0.0,0.0);
break;
}
}
return returnCol;//*= (vec3(i+1)/vec3(MAX_MARCHING_STEPS));
}
vec3 rayMarch(vec2 fragCoord)
{
vec3 viewRayDirection = rayDirection(45.0, iResolution.xy, fragCoord);
float depth = start;
vec3 hitpoint;
vec3 ReturnColour = vec3(0,0,0);
for(int i=0; i<MAX_MARCHING_STEPS; i++)
{
hitpoint = (eye+ depth * viewRayDirection);
float dist = end;
SDFObject SDFObjectToDraw;
int objectInDexToIgnore=-1;
//find closest objecct to current point
for(int j =0; j<SDF_OBJECT_COUNT; j++)
{
float distToObjectBeingConsidered;
if(SDFObjects[j].Shape == Shpere)
distToObjectBeingConsidered = shereSDF(hitpoint, SDFObjects[j]);
if(SDFObjects[j].Shape == Box)
distToObjectBeingConsidered = boxSDF(hitpoint, SDFObjects[j].BoxSize , SDFObjects[j].Radius, SDFObjects[j].Position);
if(SDFObjects[j].Shape == Plane)
distToObjectBeingConsidered= planeSDF(hitpoint, vec4(SDFObjects[j].BoxSize, SDFObjects[j].Radius), SDFObjects[j].Position);
if( distToObjectBeingConsidered < dist)
{
dist = distToObjectBeingConsidered;
SDFObjectToDraw = SDFObjects[j];
objectInDexToIgnore = j;
}
}
//if we are close enough to an objectoto hit it.
if(dist < EPSILON)
{
vec3 normal =normalize(hitpoint-SDFObjectToDraw.Position);
if(SDFObjectToDraw.isMirror)
{
ReturnColour = MirrorReflection( viewRayDirection, normal, hitpoint, objectInDexToIgnore);
}
else
{
float u = 0.5+ (atan(normal.z, normal.x)/(2.0*M_PI));
float v = 0.5+ (asin(normal.y)/(M_PI));
vec2 uv =vec2(u,v);
vec4 col;
if(SDFObjectToDraw.texChannelID == 0)
col = texture(iChannel0, uv);
if(SDFObjectToDraw.texChannelID == 1)
col = texture(iChannel1, uv);
vec3 NormalizedDirToLight = normalize(lightPos-SDFObjectToDraw.Position);
float theta = dot(normal,NormalizedDirToLight);
vec3 reflectionOfLight = reflect(NormalizedDirToLight, normal);
vec3 viewDir = normalize(SDFObjectToDraw.Position);
float Spec = dot(reflectionOfLight, viewDir);
if(IsShadow(lightPos, hitpoint))
{
ReturnColour= (col.xyz*SDFObjectToDraw.Ambiant);
}
else
{
ReturnColour= (col.xyz*SDFObjectToDraw.Ambiant)
+(col.xyz * max(theta *SDFObjectToDraw.Diff, SDFObjectToDraw.Ambiant));
//+(col.xyz* Spec * SDFObjectToDraw.Spec);
}
}
return ReturnColour;
}
depth += dist;
if(depth >= end)
{
float u = fragCoord.x/ iResolution.x;
float v = fragCoord.y/ iResolution.y;
vec4 col = texture(iChannel2, vec2(u,v));
ReturnColour =col.xyz;
}
}
return ReturnColour;
}
void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
// Normalized pixel coordinates (from 0 to 1)
//vec2 uv = fragCoord/iResolution.xy;
// Time varying pixel color
//vec3 col = 0.5 + 0.5*cos(iTime+uv.xyx+vec3(0,2,4));
// Output to screen
lightPos *= cos(iTime+vec3(1.5,2,2));
//lightPos= vec3(cos(iTime)*2.0,0,0);
vec3 SDFCol= rayMarch(fragCoord);
vec3 col = vec3(0);
//if(SDFVal <=1.0)
// col = vec3(1,0,0);
//col = vec3(SDFVal,0,0);
col = vec3(0.5,0,0);
col = SDFCol;
fragColor = vec4(col,1.0);
}
[...] This same code can bee seen in my rayMarch function (lines 274-277) and works fine for colouring my objects. [...]
The "working" texture lookup is executed in a loop in rayMarch. MAX_MARCHING_STEPS is 255, so the lookup is done at most 255 times.
vec3 rayMarch(vec2 fragCoord)
{
// [...]
for(int i=0; i<MAX_MARCHING_STEPS; i++)
{
// [...]
if(SDFObjectToDraw.texChannelID == 0)
col = texture(iChannel0, uv);
if(SDFObjectToDraw.texChannelID == 1)
col = texture(iChannel1, uv);
// [...]
}
// [...]
}
When you do the lookup in MirrorReflection then the performance breaks down, because it is done in a loop in MirrorReflection and MirrorReflection is called in a loop in rayMarch. In this case the lookup is done up to 255*255 = 65025 times.
~65000 texture lookups for a fragment is far to much and cause the break down of performance.
vec3 MirrorReflection(vec3 inComingRay, vec3 surfNormal, vec3 HitPos, int objectIndexToIgnore)
{
// [...]
for(i=0; i<MAX_MARCHING_STEPS; i++)
{
// [...]
if(SDFObjectToDraw.texChannelID == 0)
col = texture(iChannel0, uv);
if(SDFObjectToDraw.texChannelID == 1)
col = texture(iChannel1, uv);
// [...]
}
// [...]
}
vec3 rayMarch(vec2 fragCoord)
{
// [...]
for(int i=0; i<MAX_MARCHING_STEPS; i++)
{
// [...]
ReturnColour = MirrorReflection(viewRayDirection, normal, hitpoint, objectInDexToIgnore);
// [...]
}
// [...]
}

Depth of field artefacts

I began to implement the depth of field in my application, but I ran into a problem. Artifacts appear in the form of a non-smooth transition between depths.
I'm doing the depth of field in the following way:
With the main scene rendering, I record the blur value in the alpha channel. I do this using this: fragColor.a = clamp(abs(focalDepth + fragPos.z) / focalRange, 0.0, 1.0), where focalDepth = 8, focalRange = 20.
After that I apply a two-step (horizontally and vertically) Gaussian blur with dynamic size and sigma, depending on the blur value (which I previously recorded in the alpha channel)(shader below)
But I have an artifact, where you see a clear transition between the depths.
The whole scene:
And with an increased scale:
My fragment blur shader:
#version 330
precision mediump float;
#define BLOOM_KERNEL_SIZE 8
#define DOF_KERNEL_SIZE 8
/* ^^^ definitions ^^^ */
layout (location = 0) out vec4 bloomFragColor;
layout (location = 1) out vec4 dofFragColor;
in vec2 texCoords;
uniform sampler2D image; // bloom
uniform sampler2D image2; // dof
uniform bool isHorizontal;
uniform float kernel[BLOOM_KERNEL_SIZE];
float dof_kernel[DOF_KERNEL_SIZE];
vec4 tmp;
vec3 bloom_result;
vec3 dof_result;
float fdof;
float dofSigma;
int dofSize;
void makeDofKernel(int size, float sigma) {
size = size * 2 - 1;
float tmpKernel[DOF_KERNEL_SIZE * 2 - 1];
int mean = size / 2;
float sum = 0; // For accumulating the kernel values
for (int x = 0; x < size; x++) {
tmpKernel[x] = exp(-0.5 * pow((x - mean) / sigma, 2.0));
// Accumulate the kernel values
sum += tmpKernel[x];
}
// Normalize the kernel
for (int x = 0; x < size; x++)
tmpKernel[x] /= sum;
// need center and right part
for (int i = 0; i < mean + 1; i++) dof_kernel[i] = tmpKernel[size / 2 + i];
}
void main() {
vec2 texOffset = 1.0 / textureSize(image, 0); // gets size of single texel
tmp = texture(image2, texCoords);
fdof = tmp.a;
dofSize = clamp(int(tmp.a * DOF_KERNEL_SIZE), 1, DOF_KERNEL_SIZE);
if (dofSize % 2 == 0) dofSize++;
makeDofKernel(dofSize, 12.0 * fdof + 1);
bloom_result = texture(image, texCoords).rgb * kernel[0]; // current fragment’s contribution
dof_result = tmp.rgb * dof_kernel[0];
if(isHorizontal) {
for(int i = 1; i < kernel.length(); i++) {
bloom_result += texture(image, texCoords + vec2(texOffset.x * i, 0.0)).rgb * kernel[i];
bloom_result += texture(image, texCoords - vec2(texOffset.x * i, 0.0)).rgb * kernel[i];
}
for(int i = 1; i < dofSize; i++) {
dof_result += texture(image2, texCoords + vec2(texOffset.x * i, 0.0)).rgb * dof_kernel[i];
dof_result += texture(image2, texCoords - vec2(texOffset.x * i, 0.0)).rgb * dof_kernel[i];
}
} else {
for(int i = 1; i < kernel.length(); i++) {
bloom_result += texture(image, texCoords + vec2(0.0, texOffset.y * i)).rgb * kernel[i];
bloom_result += texture(image, texCoords - vec2(0.0, texOffset.y * i)).rgb * kernel[i];
}
for(int i = 1; i < dofSize; i++) {
dof_result += texture(image2, texCoords + vec2(0.0, texOffset.y * i)).rgb * dof_kernel[i];
dof_result += texture(image2, texCoords - vec2(0.0, texOffset.y * i)).rgb * dof_kernel[i];
}
}
bloomFragColor = vec4(bloom_result, 1.0);
dofFragColor = vec4(dof_result, fdof);
}
And the settings for the DOF texture: glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, SCR_W, SCR_H, 0, GL_RGBA, GL_FLOAT, NULL)
Optimization of the shader I'll do later, now I'm very concerned about this artifact. How it can be eliminated? It is desirable not to change the way of realization of the depth of field. But if you know a more productive way - a big request to share it.
I will be grateful for help.
The problem is solved. My mistake was that I changed the size of DOF blur kernel, although I had to change only the sigma. Corrected shader code:
#version 330
precision mediump float;
#define BLOOM_KERNEL_SIZE 8
#define DOF_KERNEL_SIZE 8
/* ^^^ definitions ^^^ */
layout (location = 0) out vec4 bloomFragColor;
layout (location = 1) out vec4 dofFragColor;
in vec2 texCoords;
uniform sampler2D image; // bloom
uniform sampler2D image2; // dof
uniform bool isHorizontal;
uniform float max_sigma = 12.0;
uniform float min_sigma = 0.0001;
uniform float kernel[BLOOM_KERNEL_SIZE];
float dof_kernel[DOF_KERNEL_SIZE];
vec4 tmp;
vec3 bloom_result;
vec3 dof_result;
float fdof;
const int DOF_LCR_SIZE = DOF_KERNEL_SIZE * 2 - 1; // left-center-right (lllcrrr)
const int DOF_MEAN = DOF_LCR_SIZE / 2;
void makeDofKernel(float sigma) {
float sum = 0; // For accumulating the kernel values
for (int x = DOF_MEAN; x < DOF_LCR_SIZE; x++) {
dof_kernel[x - DOF_MEAN] = exp(-0.5 * pow((x - DOF_MEAN) / sigma, 2.0));
// Accumulate the kernel values
sum += dof_kernel[x - DOF_MEAN];
}
sum += sum - dof_kernel[0];
// Normalize the kernel
for (int x = 0; x < DOF_KERNEL_SIZE; x++) dof_kernel[x] /= sum;
}
void main() {
vec2 texOffset = 1.0 / textureSize(image, 0); // gets size of single texel
tmp = texture(image2, texCoords);
fdof = tmp.a;
makeDofKernel(max_sigma * fdof + min_sigma);
bloom_result = texture(image, texCoords).rgb * kernel[0]; // current fragment’s contribution
dof_result = tmp.rgb * dof_kernel[0];
if(isHorizontal) {
for(int i = 1; i < BLOOM_KERNEL_SIZE; i++) {
bloom_result += texture(image, texCoords + vec2(texOffset.x * i, 0.0)).rgb * kernel[i];
bloom_result += texture(image, texCoords - vec2(texOffset.x * i, 0.0)).rgb * kernel[i];
}
for(int i = 1; i < DOF_KERNEL_SIZE; i++) {
dof_result += texture(image2, texCoords + vec2(texOffset.x * i, 0.0)).rgb * dof_kernel[i];
dof_result += texture(image2, texCoords - vec2(texOffset.x * i, 0.0)).rgb * dof_kernel[i];
}
} else {
for(int i = 1; i < BLOOM_KERNEL_SIZE; i++) {
bloom_result += texture(image, texCoords + vec2(0.0, texOffset.y * i)).rgb * kernel[i];
bloom_result += texture(image, texCoords - vec2(0.0, texOffset.y * i)).rgb * kernel[i];
}
for(int i = 1; i < DOF_KERNEL_SIZE; i++) {
dof_result += texture(image2, texCoords + vec2(0.0, texOffset.y * i)).rgb * dof_kernel[i];
dof_result += texture(image2, texCoords - vec2(0.0, texOffset.y * i)).rgb * dof_kernel[i];
}
}
bloomFragColor = vec4(bloom_result, 1.0);
dofFragColor = vec4(dof_result, fdof);
}
Result:

OpenGL "optimizing" uniform variable

I have a uniform variable called control_count (count of the control points in a bezier curve). In the marked part in my code, if I replace the constant 4 with this variable, it's just stops working, if it's 4 it's working fine. The variable must have the value 4 in it, I tested it before and after the loop as well, I marked this in the code too. It should be an unrolling problem? How do I force the compiler not to do this?
#version 150
layout(lines_adjacency) in;
layout(line_strip, max_vertices = 101) out;
out vec4 gs_out_col;
uniform mat4 MVP;
uniform int control_count;
uniform int tess_count;
int degree;
int binom( int n, int k );
void main()
{
degree = control_count - 1;
vec3 b[10];
float B[10];
////////////MARK//////////////////
//control_count must be 4, other ways it'd draw less points
for(int i = 0; i < control_count; ++i){
b[i] = gl_in[i].gl_Position.xyz;
}
////////////END MARK//////////////////
for(int i = 0; i <= tess_count; ++i){
float t = i / float(tess_count);
gl_Position = vec4(0);
////////////MARK//////////////////
//here, if I write control_count instead of 4, I don't get what I expect
for(int j = 0; j < 4; ++j){
////////////END MARK//////////////////
B[j] = binom(3, j) * pow(1 - t, 3 - j) * pow(t, j);
gl_Position += vec4(b[j] * B[j], B[j]);
}
gl_Position = MVP * gl_Position;
////////////MARK//////////////////
//control_count - 4 --> I get red color,
//control_count - 3 --> I get purple,
//so the variable must have the value 4
gs_out_col = vec4(1, 0, control_count - 4, 1);//gl_Position;
////////////END MARK//////////////////
EmitVertex();
}
}
The "good" result using the constant 4:
The "wrong" result using the variable control_count:

Uniform optimization ( - 1 values)

Consider following fragment shader:
uniform PolygonData
{
int count;
float points[1024];
} polygon;
out vec4 outColor;
void main()
{
float j;
for (int i = 0; i < polygon.count; ++i)
{
j = polygon.points[i++];
j = polygon.points[i++];
j = polygon.points[i++];
}
outColor = vec4(1, 1, j, 1);
}
Why is polygon.count and polygon.points optimized out?
#GuyRT
Yes. That was it. From there I got the ingormation that I was exceeding the maximum uniform array size.
This is what i was doing:
Vertex:
in vec3 inPosition;
void main(void)
{
gl_Position = vec4(inPosition, 1.0);
}
Geometry:
layout(lines) in;
layout(triangle_strip) out;
layout(max_vertices = 4) out;
out vec3 worldPos;
uniform mat4 u_proj;
void main()
{
vec4 pos0 = u_proj * gl_in[0].gl_Position;
vec4 pos1 = u_proj * gl_in[1].gl_Position;
//left up
gl_Position.x = pos0.x;
gl_Position.y = pos1.y;
gl_Position.w = pos0.w;
worldPos = gl_Position.xyz;
EmitVertex();
//left down
gl_Position = pos0;
worldPos = gl_Position.xyz;
EmitVertex();
//right up
gl_Position = pos1;
worldPos = gl_Position.xyz;
EmitVertex();
//right down
gl_Position.x = pos1.x;
gl_Position.y = pos0.y;
gl_Position.w = pos1.w;
worldPos = gl_Position.xyz;
EmitVertex();
EndPrimitive();
}
Fragment:
struct PolyData
{
int count;
float points[256];
};
uniform PolyData p;
uniform mat4 u_proj;
in vec3 worldPos;
out vec4 outColor;
void main()
{
float testx = worldPos.x;
float testy = worldPos.y;
bool c = false;
int i;
int j;
for (i = 0, j = p.count - 4; i < p.count; j = i, i = i + 3)
{
vec4 i_vec = u_proj * vec4(p.points[i], p.points[i + 1], 0, 1.0);
vec4 j_vec = u_proj * vec4(p.points[j], p.points[j + 1], 0, 1.0);
if ( (i_vec.y >= testy) != (j_vec.y >= testy) && (testx <= (j_vec.x - i_vec.x) * (testy - i_vec.y) / (j_vec.y - i_vec.y) + i_vec.x))
c = !c;
}
if (c == true)
{
outColor = vec4(1, 1, 0, 1);
}
else
{
outColor = vec4(0, 0, 0, 0);
}
}
Thanks for all the help.