I am rendering a large layered terrain with a texture array that has normal, roughness, diffuse, and AO maps. However, each piece of terrain is too large to texture in detail (128x128), so I repeat each of the textures by scaling the UV and using GL_REPEAT. When I use a scale, the FPS drops from 170 to 100. Nvidia Nsight reports that the texture cache is used ineffectively.
As opposed to:
Here is the important part of the shader:
// Fragment-shader excerpt: blends two terrain materials stored as layers of the
// TerrainAtlas texture array, weighting the first by alpha.r and the second by alpha.g.
// (data, alpha and mat are declared outside this excerpt.)
// Uses cache ineffectively
// vec2 coord = data.Texcoord * 16.0;
vec2 coord = data.Texcoord;
// Diffuse colour: layer 0 for the first material, layer 5 for the second.
mat.Diffuse = (texture(TerrainAtlas, vec3(coord, 0)).rgb * alpha.r + texture(TerrainAtlas, vec3(coord, 5)).rgb * alpha.g);
// Normal: layers 1 and 6 are blended first, then transformed by the TBN matrix.
mat.Normal = data.TBNMatrix * (texture(TerrainAtlas, vec3(coord, 1)).rgb * alpha.r + texture(TerrainAtlas, vec3(coord, 6)).rgb * alpha.g);
mat.Metalness = 0;
// NOTE(review): the first roughness term reads layer 1, the same layer the first
// normal is read from above, whereas the second material uses separate layers
// (6 for normal, 7 for roughness). Possibly a typo - confirm the intended layer.
mat.Roughness = texture(TerrainAtlas, vec3(coord, 1)).r * alpha.r + texture(TerrainAtlas, vec3(coord, 7)).r * alpha.g;
// Ambient occlusion: layers 2 and 8.
mat.AO = texture(TerrainAtlas, vec3(coord, 2)).r * alpha.r + texture(TerrainAtlas, vec3(coord, 8)).r * alpha.g;
// Opacity is the alpha channel of layer 0 (layer 0 is sampled a second time here).
mat.Alpha = texture(TerrainAtlas, vec3(coord, 0)).a;
Is there any way to repeat a texture while using the cache effectively (without doing so manually by increasing texture size)?
I decreased the tileable textures' sizes from 1024x1024 to 256x256 and that seemed to do the trick. It turns out the GPU (GTX 1080 with 2048 KB of L2 cache) ran out of cache space every time the textures repeated.
Related
I use a shader that has a rotation over time option, and it worked great for years,
But after updating Unity (2017.2 to 2018.2) I get this error- "Shader error in 'Custom/NewSurfaceShader': Too many texture interpolators would be used for ForwardBase pass (11 out of max 10) "
and the material using this shader became white.
I don't know what to do, I looked online but everyone has a different problem
Here is my code:
// Transparent Lambert surface shader.
// Albedo is the product of two tinted colour textures, each rotated about a pivot and
// scrolled over time. Opacity is the product of three alpha masks (the first two
// scroll), scaled by _AlphaWeakness.
Shader "Custom/NewSurfaceShader" {
    Properties{
        //Tint
        _Color("Color", Color) = (1,1,1,1)
        //Textures and Alphas
        _TexOne("Texture One (RGB)", 2D) = "white" {}
        _TexTwo("Texture Two (RGB)", 2D) = "white" {}
        _AlphaTexOne("Alpha One (A)", 2D) = "white" {}
        _AlphaTexTwo("Alpha Two(A)", 2D) = "white" {}
        _AlphaTexThree("Alpha Three (A)", 2D) = "white" {}
        _Brightness("Brightness", Range(0,5)) = 1
        _AlphaWeakness("Alpha Weakness", Range(0,10)) = 1
        _ScrollSpeed1X("Scroll Speed Texture One X", Range(-10,10)) = 0
        _ScrollSpeed1Y("Scroll Speed Texture One Y", Range(-10,10)) = 0
        _ScrollSpeed2X("Scroll Speed Texture Two X", Range(-10,10)) = 0
        _ScrollSpeed2Y("Scroll Speed Texture Two Y", Range(-10,10)) = 0
        _ScrollSpeedAlpha1X("Scroll Speed Alpha One X", Range(-10,10)) = 0
        _ScrollSpeedAlpha1Y("Scroll Speed Alpha One Y", Range(-10,10)) = 0
        _ScrollSpeedAlpha2X("Scroll Speed Alpha Two X", Range(-10,10)) = 0
        _ScrollSpeedAlpha2Y("Scroll Speed Alpha Two Y", Range(-10,10)) = 0
        _RotationSpeed1("Rotation Speed Texture 1", Float) = 0.0
        _RotationCenter1("Rotation Center Texture 1", Range(0,1)) = 0.5
        _RotationSpeed2("Rotation Speed Texture 2", Float) = 0.0
        _RotationCenter2("Rotation Center Texture 2", Range(0,1)) = 0.5
        _Speed("Wave Speed", Range(-80, 80)) = 5
        _Freq("Frequency", Range(0, 5)) = 2
        _Amp("Amplitude", Range(-1, 1)) = 1
    }
    SubShader{
        //Default Queues - Background, Geometry, AlphaTest, Transparent, and Overlay
        Tags{ "Queue" = "Transparent" "IgnoreProjector" = "True" "RenderType" = "Transparent" }
        LOD 200

        CGPROGRAM
        #pragma surface surf Lambert alpha:fade vertex:vert
        // The five UV sets in Input, plus the interpolators the surface-shader
        // generator adds on its own, need 11 interpolators in the ForwardBase pass
        // while only 10 are available without an explicit target. Target 3.5 raises
        // that limit so the shader compiles again.
        #pragma target 3.5

        sampler2D _TexOne;
        sampler2D _TexTwo;
        sampler2D _AlphaTexOne;
        sampler2D _AlphaTexTwo;
        sampler2D _AlphaTexThree;
        fixed4 _Color;
        float _ScrollSpeed1X;
        float _ScrollSpeed1Y;
        float _ScrollSpeed2X;
        float _ScrollSpeed2Y;
        float _ScrollSpeedAlpha1X;
        float _ScrollSpeedAlpha1Y;
        float _ScrollSpeedAlpha2X;
        float _ScrollSpeedAlpha2Y;
        float _RotationSpeed1;
        float _RotationCenter1;
        float _RotationSpeed2;
        float _RotationCenter2;
        float _Brightness;
        float _AlphaWeakness;
        // Not read by this shader; kept so existing declarations stay available.
        float _RotationSpeed;
        float _OffsetVal;
        // Only used by the disabled wave code in vert().
        float _Speed;
        float _Freq;
        float _Amp;

        // One UV set per texture; the uv_ prefix ties each member to its sampler.
        struct Input {
            float2 uv_TexOne;
            float2 uv_TexTwo;
            float2 uv_AlphaTexOne;
            float2 uv_AlphaTexTwo;
            float2 uv_AlphaTexThree;
        };

        // Vertex hook. The wave displacement is disabled, so vertices pass through unchanged.
        void vert(inout appdata_full v) {
            // float time = _Time.x * _Speed;
            // float waveValueA = sin(time + v.vertex.x * _Freq) * _Amp;
            // v.vertex.xyz = float3(v.vertex.x, v.vertex.y + waveValueA, v.vertex.z);
            // v.normal = normalize(float3(v.normal.x + waveValueA, v.normal.y, v.normal.z));
        }

        // Rotates uv about the point (center, center) by an angle of speed * _Time.x.
        // _Time is a float4; the previous implicit float4-to-float conversions used
        // its first component, which is written out explicitly here.
        float2 RotateAboutCenter(float2 uv, float center, float speed) {
            float angle = speed * _Time.x;
            float s = sin(angle);
            float c = cos(angle);
            float2x2 rotationMatrix = float2x2(c, -s, s, c);
            return mul(uv - center, rotationMatrix) + center;
        }

        // Offsets uv by -speed * _Time.x on each axis.
        float2 Scroll(float2 uv, float speedX, float speedY) {
            return uv - float2(speedX, speedY) * _Time.x;
        }

        void surf(Input IN, inout SurfaceOutput o) {
            // Colour textures: rotate about their pivot first, then scroll.
            float2 uvOne = Scroll(RotateAboutCenter(IN.uv_TexOne, _RotationCenter1, _RotationSpeed1), _ScrollSpeed1X, _ScrollSpeed1Y);
            float2 uvTwo = Scroll(RotateAboutCenter(IN.uv_TexTwo, _RotationCenter2, _RotationSpeed2), _ScrollSpeed2X, _ScrollSpeed2Y);
            // Alpha masks one and two only scroll; mask three is sampled with its UVs as-is.
            float2 uvAlphaOne = Scroll(IN.uv_AlphaTexOne, _ScrollSpeedAlpha1X, _ScrollSpeedAlpha1Y);
            float2 uvAlphaTwo = Scroll(IN.uv_AlphaTexTwo, _ScrollSpeedAlpha2X, _ScrollSpeedAlpha2Y);

            //Textures
            fixed4 c1 = tex2D(_TexOne, uvOne) * (_Color * _Brightness);
            fixed4 c2 = tex2D(_TexTwo, uvTwo) * (_Color * _Brightness);
            //Alphas
            fixed4 a1 = tex2D(_AlphaTexOne, uvAlphaOne);
            fixed4 a2 = tex2D(_AlphaTexTwo, uvAlphaTwo);
            fixed4 a3 = tex2D(_AlphaTexThree, IN.uv_AlphaTexThree);

            //Assignment
            o.Albedo = (c1.rgb * c2.rgb * 2);
            o.Alpha = ((a1.a * a2.a * 2) * a3.a * 2) * _AlphaWeakness;
        }
        ENDCG
    }
}
Straightforward solution: target your shader for newer platform (3.5 or higher) by adding
#pragma target 3.5 after CGPROGRAM:
CGPROGRAM
// Each #pragma directive starts its own line, after the CGPROGRAM keyword.
#pragma surface surf Lambert alpha:fade vertex:vert
#pragma target 3.5
This is because shader model 3.0 provides a maximum of 10 interpolators, i.e. your Input structure may have at most 10 float fields. Your structure currently has exactly 10 (each float2 counts as 2), but don't forget that the engine may add some internal interpolators of its own behind the scenes that do not come from your input data. That is the case here, and as a result you end up with 11 interpolators.
If you target older platforms, you will need to think about how to optimize your shader, as there are too many fields in the Input structure. For example, do you really need 3 alpha channels? Do you use them all? Maybe remove uv_AlphaTexThree?
I'm porting some old OpenGL 1.2 bitmap font rendering code to modern OpenGL (at least OpenGL 3.2+), and I'm wondering if I can use a GLSL shader to achieve what I've been doing manually.
When I want to draw the string "123", scaled to particular size, I do the following steps with the sprites below.
I draw the sprite to the screen, scaled 2x with GL_NEAREST. However, to get a black outline, I actually draw the sprite several times.
x + 1, y + 0, BLACK
x + 0, y + 1, BLACK
x - 1, y + 0, BLACK
x + 0, y - 1, BLACK
x + 0, y + 0, COLOR (RED)
After the sprites have been drawn to the screen, I copy the screen to a texture, via glCopyTexSubImage2D.
I draw that texture back to the screen, but with GL_LINEAR.
The end result is a more visually appealing form of scaling pixel sprites. When upscaling small pixel sprites to arbitrary dimensions, using just GL_NEAREST (bottom-right) or just GL_LINEAR (bottom-left) gives an effect I don't like. Pixel doubling with GL_NEAREST, and then do the remaining scaling with GL_LINEAR, gives a result that I prefer (top).
I'm pretty sure GLSL can do the black outline (thus saving me from having to do lots of draws), but could it also do the combination of GL_NEAREST and GL_LINEAR scaling?
You could achieve the effect of "2x nearest-neighbour upscaling followed by linear sampling" by pretending to sample a 4-texel neighbourhood from the upscaled texture while in reality sampling them from the original one. Then you'll have to implement bilinear interpolation manually. If you were targeting OpenGL 4+, textureGather() would be useful, though do keep this issue in mind. In my proposed solution below, I'll be using 4 texelFetch() calls, rather than textureGather(), as textureGather() would complicate things quite a bit.
Suppose you have an unscaled texture with black borders around the glyphs already present. Let's assume you have a normalized texture coordinate of vec2 pn = ... into that texture, where pn.x and pn.y are between 0 and 1. The following code should achieve the desired effect, though I haven't tested it:
// Emulates "nearest-neighbour upscale by upscaleFactor, then bilinear sampling" by
// addressing a 2x2 texel neighbourhood of the virtual upscaled texture, fetching the
// matching texels from the original texture, and interpolating manually.
// Inputs: sampler (the original texture) and pn (normalized coordinate).
// Output: clrFinal.
ivec2 origTexSize = textureSize(sampler, 0);
int upscaleFactor = 2;
// Floating point texel coordinate into the upscaled texture.
vec2 ptu = pn * vec2(origTexSize * upscaleFactor);
// Decompose "ptu - 0.5" into the integer and fractional parts.
// floor() is used rather than modf(): modf() returns the FRACTIONAL part and writes
// the integer part to its out parameter, and it truncates toward zero, which would
// give a negative weight for coordinates within half a texel of the low edge.
vec2 ptuCentered = ptu - 0.5;
vec2 ptui = floor(ptuCentered);
vec2 ptuf = ptuCentered - ptui;
// Integer texel coordinates into the upscaled texture.
ivec2 ptu00 = ivec2(ptui);
ivec2 ptu01 = ptu00 + ivec2(0, 1);
ivec2 ptu10 = ptu00 + ivec2(1, 0);
ivec2 ptu11 = ptu00 + ivec2(1, 1);
// Integer texel coordinates into the original texture.
// The clamp keeps edge lookups (including the -1 produced at the low edge) in range.
ivec2 pt00 = clamp(ptu00 / upscaleFactor, ivec2(0), origTexSize - 1);
ivec2 pt01 = clamp(ptu01 / upscaleFactor, ivec2(0), origTexSize - 1);
ivec2 pt10 = clamp(ptu10 / upscaleFactor, ivec2(0), origTexSize - 1);
ivec2 pt11 = clamp(ptu11 / upscaleFactor, ivec2(0), origTexSize - 1);
// Sampled colours.
vec4 clr00 = texelFetch(sampler, pt00, 0);
vec4 clr01 = texelFetch(sampler, pt01, 0);
vec4 clr10 = texelFetch(sampler, pt10, 0);
vec4 clr11 = texelFetch(sampler, pt11, 0);
// Bilinear interpolation: along y within each column, then along x between columns.
vec4 clr0x = mix(clr00, clr01, ptuf.y);
vec4 clr1x = mix(clr10, clr11, ptuf.y);
vec4 clrFinal = mix(clr0x, clr1x, ptuf.x);
I have a height cube map and I want to generate a normal cube map texture from it. My height cube map is just a 2048x2048 image that I load at the beginning of the application for each face of the cube, and I can modify in real time a "maximum height" value which is used as a multiplier when retrieving a pixel from the height map.
Initially I was calculating the normals in the vertex shader, but it gave me bad lighting results so I decided to move the calculations in the fragment shader.
As the height map does not change every frame (only when I modify the "maximum height" value), I want to generate a normal map texture from it, using a compute shader because I don't need any rasterization, but it gives me very poor performance.
With the fragment shader I ran at 200FPS but using the compute shader I run at 40 FPS.
Here is how I bind my images and start the compute work:
// Runs the normal-map compute shader: uploads maxHeight, binds the height map for
// reading and the normal map for writing as images, dispatches the work and then
// issues a memory barrier.
_computeShaderProgram.use();
glUniform1f(_computeShaderProgram.getUniformLocation("maxHeight"), maxHeight);
// Image unit 0: height map, mip level 0, layered binding, read-only, RGBA32F.
glBindImageTexture(
0,
static_cast<GLuint>(heightMap),
0,
GL_TRUE,
0,
GL_READ_ONLY,
GL_RGBA32F
);
// Image unit 1: normal map, mip level 0, layered binding, write-only, RGBA32F.
glBindImageTexture(
1,
static_cast<GLuint>(normalMap),
0,
GL_TRUE,
0,
GL_WRITE_ONLY,
GL_RGBA32F
);
// Start compute work
// I only compute for one face of the cube map
// One work group per 16x16 block in x and y, and a single group in z.
// NOTE(review): the integer division drops any remainder, so a width that is not a
// multiple of 16 would leave the last partial block unprocessed - confirm sizes.
glDispatchCompute(normalMap.getWidth() / 16, normalMap.getWidth() / 16, 1);
// NOTE(review): this bit covers later image load/store access. If the normal map is
// subsequently read through a sampler, GL_TEXTURE_FETCH_BARRIER_BIT is presumably
// the bit that applies - confirm against how normalMap is consumed.
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
And the compute shader:
#version 430 core
#extension GL_ARB_compute_shader : enable
// Generates a normal map from a height cube map with central differences.
// One invocation handles one texel; gl_GlobalInvocationID.xy is the texel within
// the face and .z selects the cube face.
layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
layout(rgba32f, binding = 0) readonly uniform imageCube heightMap;
layout(rgba32f, binding = 1) writeonly uniform imageCube normalMap;
// Scale applied to the stored height (red channel).
uniform float maxHeight;

// Returns the scaled height stored at the given (x, y, face) texel.
float getHeight(ivec3 heightMapCoord) {
    vec4 heightMapValue = imageLoad(heightMap, heightMapCoord);
    return heightMapValue.r * maxHeight;
}

void main() {
    ivec3 textCoord = ivec3(gl_GlobalInvocationID);
    ivec2 faceSize = imageSize(heightMap);

    // Skip invocations that fall outside the face (dispatch rounded up, or size mismatch).
    if (textCoord.x >= faceSize.x || textCoord.y >= faceSize.y) {
        return;
    }

    // Clamp the neighbour coordinates to the face. An out-of-bounds imageLoad yields
    // zero, which would show up as a spurious cliff along every face border.
    // (Neighbours are not fetched from the adjacent cube face.)
    ivec2 lowCoord = max(textCoord.xy - ivec2(1), ivec2(0));
    ivec2 highCoord = min(textCoord.xy + ivec2(1), faceSize - ivec2(1));

    // Calculate height of neighbors
    float leftCubePosHeight = getHeight(ivec3(lowCoord.x, textCoord.y, textCoord.z));
    float rightCubePosHeight = getHeight(ivec3(highCoord.x, textCoord.y, textCoord.z));
    float topCubePosHeight = getHeight(ivec3(textCoord.x, lowCoord.y, textCoord.z));
    float bottomCubePosHeight = getHeight(ivec3(textCoord.x, highCoord.y, textCoord.z));

    // Actual spacing between the two samples: 2 in the interior, 1 on a border where
    // one side was clamped. The max() avoids a zero-length vector on a 1-texel face.
    float spanX = max(float(highCoord.x - lowCoord.x), 1.0);
    float spanY = max(float(highCoord.y - lowCoord.y), 1.0);

    // Calculate normal using central differences method
    vec3 horizontal = vec3(spanX, rightCubePosHeight - leftCubePosHeight, 0.0);
    vec3 vertical = vec3(0.0, bottomCubePosHeight - topCubePosHeight, spanY);
    vec3 normal = normalize(cross(vertical, horizontal));
    imageStore(normalMap, textCoord, vec4(normal, 1.0));
}
I tried with different work groups sizes (width, width / 8, width / 16, width / 32) and local sizes (1, 8, 16, 32) but the performance is always poor, around 40 FPS or 20 FPS for work group with a size of the full width.
I know I can use shared memory for threads in the same work group to prevent fetching the same texture coordinate 4 times but later I will have height map generated procedurally and will be larger than 2048x2048 I think.
What is the difference between the fragment shader and the compute shader that make it so slow ? Am I doing something wrong ?
Are there any other solutions for generating this normal map?
EDIT:
The fps I gave above are not right because I was generating 1/16 of the normal map (when I had 40FPS), and I also used the central differences technique to calculate the normals, which is cheap but does not give good lighting results, so I switched to Sobel technique, which is a little more expensive.
I made some tests to know which technique could give the best performance.
Each frame I generate the normal map (this will not be the case later, but it's just to test the performance). Here are my tests:
CPU side single thread: 1.5FPS
Compute shader with local sizes of 1 and one worker group for each image pixel: 4FPS
Compute shader with local sizes of 16 and one worker group for each 16x16 image pixels block: 11FPS
Fragment shader using framebuffer and MRT with 6 color attachments (one for each face of the normal map): 12.5FPS
This is a little laggy when I modify the max height (which generate the normal map again), but I think it's okay as I won't modify it a lot.
This question is related to Repeating OpenGL-es texture bound to hills in cocos2d 2.0
After reading the answers posted in the above post, I've used the following code for computing the vertices and texture coordinates:
// Emits vertices and texture coordinates for the hill segment running from p0 to p1.
// Each step appends a bottom vertex (x, 0) and a top vertex (x, y) together with their
// texture coordinates. (p0, p1, _segments, _dx, _da and the output arrays and
// counters are defined outside this excerpt.)
CGPoint pt0,pt1;
// Midpoint and half-amplitude of the cosine curve between p0.y and p1.y.
float ymid = (p0.y + p1.y) / 2;
float ampl = (p0.y - p1.y) / 2;
pt0 = p0;
// Whole number of 512-unit texture repeats at the start of this segment; subtracted
// below so the horizontal texture coordinate starts in [0, 1).
// NOTE(review): U_Off is recomputed from p0 each time this code runs. If it runs once
// per segment, U_Off can increase between consecutive segments, making u jump
// backwards at the seam (the dump shows 1.00390625 followed by 0.005859375) - confirm.
float U_Off = floor(pt0.x / 512);
for (int j=1; j<_segments+1; j++)
{
// Next point on the curve: x advances by _dx, y follows the cosine.
pt1.x = p0.x + j*_dx;
pt1.y = ymid + ampl * cosf(_da*j);
// Horizontal texture coordinate of the current point, relative to U_Off.
float xTex0 = pt0.x/512 - U_Off;
// Only pt0 (the previous point) is emitted in each iteration; the final pt1 is not
// emitted by this loop.
_vertices[vertices++]=CGPointMake(pt0.x, 0);
_vertices[vertices++]=CGPointMake(pt0.x, pt0.y);
// v = 1 for the bottom vertex, v = 0 for the top vertex.
_texCoords[texCoords++]=CGPointMake(xTex0, 1.0f);
_texCoords[texCoords++]=CGPointMake(xTex0, 0);
pt0 = pt1;
}
// The end of this segment becomes the start of the next one.
p0 = p1;
But unfortunately, I still get a tear / misalignment in my texture (circled in yellow):
I've attached dumps of the arrays of vertices and texcoords
I'm new to OpenGl, and can't figure out where the miscalculation is. How do I prevent the line (circled in yellow in image) from appearing ?
EDIT: My texture is either 1024x512 or 512x512 depending on the device. I use the following texture parameters:
ccTexParams tp2 = {GL_LINEAR, GL_LINEAR, GL_REPEAT, GL_CLAMP_TO_EDGE};
Most likely the reason is in non-continuous texture coordinates.
In texcoords dump you have the following coordinates:
(CGPoint) 0x34b0b28 = (x=1.00390625, y=0)
(CGPoint) 0x34b0b30 = (x=0.005859375, y=1)
It means that between these two points texture is mapped from 1 to 0 (in reverse direction). You should continue texcoords after 1.00390625 => 1.005859375 => ... Also, your texture must have power-of-two size and must be set up with REPEAT mode.
If your texture is in atlas and you cannot set REPEAT mode, you may try to clamp texcoords to [0; 1] range and place two edge points with x=1 and x=0 in the same position.
And, at last, if your texture doesn't change in x-axis you may set x = 0.5 for all points.
I need to write a function which shall take a sub-rectangle from a 2D texture (non power-of-2) and copy it to a destination sub-rectangle of an output 2D texture, using a shader (no glSubImage or similar).
Also the source and the destination may not have the same size, so I need to use linear filtering (or even mipmap).
/**
 * Copies a sub-rectangle of src_tex into a sub-rectangle of dest_tex using a shader.
 *
 * @param dest_tex    destination texture
 * @param src_tex     source texture
 * @param src_width   width of the source texture
 * @param src_height  height of the source texture
 * @param dest_width  width of the destination texture
 * @param dest_height height of the destination texture
 * @param srcRect     source rectangle in normalized 0-1 coordinates, where 0 and 1
 *                    lie on the centers of the border pixels
 * @param destRect    destination rectangle; presumably in destination pixels, since
 *                    callers divide it by dest_width / dest_height - TODO confirm
 *
 * NOTE(review): the surrounding code uses elements [0],[1] as the minimum x,y corner
 * and [2],[3] as the maximum x,y corner of both rectangles - confirm.
 */
void CopyToTex(GLuint dest_tex,GLuint src_tex,
GLuint src_width,GLuint src_height,
GLuint dest_width,GLuint dest_height,
float srcRect[4],
GLuint destRect[4]);
Here srcRect is in normalized 0-1 coordinates, that is the rectangle [0,1]x[0,1] touch the center of every border pixel of the input texture.
To achieve a good result when the source and destination dimensions don't match, I want to use GL_LINEAR filtering.
I want this function to behave in a coherent manner, i.e. calling it multiple times with many subrects shall produce the same result as one invocation with the union of the subrects; that is the linear sampler should sample the exact center of the input pixel.
Moreover, if the input rectangle fit exactly the destination rectangle an exact copy should occur.
This seems to be particularly hard.
What I've got now is something like this:
//Setup RTT, filtering and program
// Destination rectangle converted to [-1, 1] coordinates: each component is divided
// by the destination size, then scaled by 2 and shifted by -1.
float vertices[4] = {
float(destRect[0]) / dest_width * 2.0 - 1.0,
float(destRect[1]) / dest_height * 2.0 - 1.0,
//etc..
};
// Source rectangle converted to texture coordinates. srcRect * (size - 1) + 0.5 lands
// on a texel center, and dividing by size normalizes it. The range is then widened by
// 0.5 / dest_width (resp. 0.5 / dest_height) on each side.
float texcoords[4] = {
(srcRect[0] * (src_width - 1) + 0.5) / src_width - 0.5 / dest_width,
(srcRect[1] * (src_height - 1) + 0.5) / src_height - 0.5 / dest_height,
(srcRect[2] * (src_width - 1) + 0.5) / src_width + 0.5 / dest_width,
(srcRect[3] * (src_height - 1) + 0.5) / src_height + 0.5 / dest_height,
};
// Draw the quad, pairing each corner with its texture coordinate.
glBegin(GL_QUADS);
glTexCoord2f(texcoords[0], texcoords[1]);
glVertex2f(vertices[0], vertices[1]);
glTexCoord2f(texcoords[2], texcoords[1]);
glVertex2f(vertices[2], vertices[1]);
//etc...
glEnd();
To write this code I followed the information from this page.
This seems to work as intended in some corner cases (exact copy, copying a row or a column of one pixel).
My hardest test case is to perform an exact copy of a 2xN rectangle when both the input and output textures are bigger than 2xN.
I probably have some problem with offsets and scaling (the trivial ones don't work).
Solution:
The `0.5 / dest_width` (and `0.5 / dest_height`) term in the definition of the texcoords was wrong.
An easy way to work around is to completely remove that part.
// Maps the normalized source rectangle (0 and 1 on the border texel centers) to
// texture coordinates: srcRect * (size - 1) selects a texel index, + 0.5 moves to
// that texel's center, and dividing by size normalizes the result.
float texcoords[4] = {
(srcRect[0] * (src_width - 1) + 0.5) / src_width,
(srcRect[1] * (src_height - 1) + 0.5) / src_height,
(srcRect[2] * (src_width - 1) + 0.5) / src_width,
(srcRect[3] * (src_height - 1) + 0.5) / src_height
};
Instead, we draw a smaller quad, by offsetting the vertices by:
// Inset applied to each quad edge, reduced by a small epsilon.
// (epsilon and dest_rect are not defined in this excerpt; dest_rect is presumably the
// destRect parameter - confirm.)
// NOTE(review): if the vertices are in [-1, 1] across the whole destination texture
// (destRect / dest_width * 2 - 1), half a pixel would correspond to 1.0 / dest_width
// rather than 1.0 / (rectangle width) - confirm the intended units.
float dx = 1.0 / (dest_rect[2] - dest_rect[0]) - epsilon;
float dy = 1.0 / (dest_rect[3] - dest_rect[1]) - epsilon;
// assume glTexCoord for every vertex
// Corners in order: (min, min), (max, min), (max, max), (min, max), each moved inward.
glVertex2f(vertices[0] + dx, vertices[1] + dy);
glVertex2f(vertices[2] - dx, vertices[1] + dy);
glVertex2f(vertices[2] - dx, vertices[3] - dy);
glVertex2f(vertices[0] + dx, vertices[3] - dy);
In this way we draw a quad which passes through the exact center of every border pixel.
Since OpenGL may or may not draw the border pixels in this case, we need the epsilons.
I believe that my original solution (don't offset vertex coords) can still work, but need a bit of extra math to compute the right offsets for the texcoords.