Creating and addressing large buffers in OpenGL (on the scale of GBs) - opengl

I was surprised to find my shaders would start reading zeroes out of buffers when addressing higher indices. I'm guessing this has something to do with the precision of the addressing internals in the driver. I never get an out-of-memory error; the shaders just seem to silently stop accessing the buffers. Correct me if I'm wrong, but I believe CUDA supports 64-bit pointers and large amounts of memory just fine.
I've built an MWE (below) where I create a buffer one vec4 shy of 2GB. If I hit or go over 2GB, the shaders don't write anything, not even to the first element. Using image_load_store to write to the buffer in a shader only works up to 512MiB. I have much more luck with bindless graphics, which correctly writes to the entire buffer, but I'm still stuck with a max of 2GB even though I can create a larger buffer, and bindless graphics appears to use 64-bit addressing, so I don't see any reason this limit should exist.
How can I create and use buffers larger than 2GB with OpenGL?
I'm using a GTX Titan (6GB).
//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>
const char* imageSource =
"#version 440\n"
"uniform layout(rgba32f) imageBuffer data;\n"
"uniform float val;\n"
"void main() {\n"
" imageStore(data, gl_VertexID, vec4(val));\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"uniform vec4* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_VertexID] = vec4(val);\n"
" gl_Position = vec4(0.0);\n"
"}\n";
GLuint compile(GLenum type, const char* shaderSrc)
{
GLuint shader = glCreateShader(type);
glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
glCompileShader(shader);
int success = 0;
int loglen = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
GLchar* log = new GLchar[loglen];
glGetShaderInfoLog(shader, loglen, &loglen, log);
if (!success)
{
printf("%s\n", log);
exit(0);
}
GLuint program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
return program;
}
int main(int argc, char** argv)
{
float* check;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
glutCreateWindow("test");
glewInit();
GLsizeiptr bufferSize = 1024 * 1024 * 1024; //1GB
bufferSize *= 2;
bufferSize -= 16;
GLsizeiptr numFloats = bufferSize/sizeof(float);
GLsizeiptr numVec4s = bufferSize/(sizeof(float)*4);
float testVal = 123.123f;
glEnable(GL_RASTERIZER_DISCARD);
float* dat = new float[numFloats];
memset(dat, 0, bufferSize);
//create a buffer with data
GLuint buffer;
glGenBuffers(1, &buffer);
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);
//get a bindless address
GLuint64 address;
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
//make a texture alias for it
GLuint bufferTexture;
glGenTextures(1, &bufferTexture);
glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer); //should be GL_RGBA32F (see update)
glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F); //should be GL_RGBA32F (see update)
//compile the shaders
GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
//initialize buffer
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
//run image_load_store
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), testVal);
glDrawArrays(GL_POINTS, 0, numVec4s);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
for (GLsizeiptr i = 0; i < numFloats; ++i)
{
if (check[i] != testVal)
{
printf("failed image_load_store: dat[%td] = %f (%fMiB)\n", i, check[i], (double)i*sizeof(float)/1024.0/1024.0);
break;
}
}
glUnmapBuffer(GL_TEXTURE_BUFFER);
//initialize buffer
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
//run bindless
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), testVal);
glDrawArrays(GL_POINTS, 0, numVec4s);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
for (GLsizeiptr i = 0; i < numFloats; ++i)
{
if (check[i] != testVal)
{
printf("failed bindless: dat[%td] = %f (%fMiB)\n", i, check[i], (double)i*sizeof(float)/1024.0/1024.0);
break;
}
}
glUnmapBuffer(GL_TEXTURE_BUFFER);
return 0;
}
This is the output I get:
> make && ./a.out
g++ -lGL -lGLEW -lglut main.c
failed image_load_store: dat[134217727] = 0.000000 (511.999996MiB)
UPDATE:
Found a mistake. The GL_R32F internal format should be GL_RGBA32F, which allows image_load_store to reach the ~2GB mark. The program correctly executes with no output until the size reaches 2GB or more, at which point it still fails for both image_load_store and bindless.
GL_MAX_TEXTURE_BUFFER_SIZE is 134217728 for me, which puts the max size at exactly 2GB for RGBA32F. However, my question about going larger than 2GB remains. Sure, I could allocate multiple buffers, but that's a bunch of housekeeping and overhead I'd prefer not to deal with.
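For reference, here is a small sketch of turning that texel limit into a byte limit (the helper name is mine; it assumes a current GL context):
//GL_MAX_TEXTURE_BUFFER_SIZE is expressed in texels; for GL_RGBA32F one texel is a vec4 = 16 bytes
void printTextureBufferLimit()
{
    GLint maxTexels = 0;
    glGetIntegerv(GL_MAX_TEXTURE_BUFFER_SIZE, &maxTexels);
    GLint64 maxBytes = (GLint64)maxTexels * 16;
    printf("max texture buffer: %d texels, %lld bytes (%.2f GiB)\n",
        maxTexels, (long long)maxBytes, (double)maxBytes / (1024.0 * 1024.0 * 1024.0));
}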

You might need to go vendor-specific; for NVIDIA, you have the following extensions available, which allow you to use 64-bit addresses (and buffer sizes) in shaders:
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_buffer_load.txt
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_buffer_store.txt
Basically, they let you start using pointers inside GLSL and pass them in as 64-bit values from the CPU host.
The maximum GPU address usable by shaders is returned by
GLuint64EXT max_shader_buffer_address;
glGetIntegerui64vNV(GL_MAX_SHADER_BUFFER_ADDRESS_NV, &max_shader_buffer_address);
printf("Maximum shader buffer address: %llu\n", (unsigned long long)max_shader_buffer_address);
On my machine with an RTX 3070 it is 18446744073709551615, i.e. the full 64-bit range.

Related

OpenGL debug context warning - "Will use VIDEO memory as the source for buffer object operations"

I'm jumping through the hoops right now to learn OpenGL and I've come across an issue. On my desktop computer with an NVIDIA GTX 780, OpenGL is printing out a warning via the glDebugMessageCallback mechanism:
"Buffer object 1 (bound to _GL_ARRAY_BUFFER_ARB, usage hint is GL_STATIC_DRAW) will use VIDEO memory as the source for buffer object operations."
I'm rendering one cube with a vertex and an index buffer, so this message repeats for every buffer object I'm creating (two of them). However, there is also one final warning at the end, which states:
"Vertex shader in program 3 is being recompiled based on GL State."
I'm still able to render the cube I was rendering before, but the color I had set is now flashing between white and my color. I searched online and found this answer - https://community.khronos.org/t/nvidia-output-debug-error-131185/66033 - which basically said this is nothing but a warning and everything should be fine, but that wouldn't explain why my cube is flashing between white and my color. This same code works fine on my laptop (a 2019 Asus laptop which also has an NVIDIA GTX graphics chip). Has anyone come across this issue before? Here is the relevant code:
const char* vertexShaderCode =
R"HereDoc(
#version 430
in layout(location=0) vec3 position;
in layout(location=1) vec3 color;
out vec3 fragColor;
uniform mat4 transformationMatrix;
void main()
{
vec4 newPos = vec4(position, 1.0) * transformationMatrix;//vector is on the left side because my matrices are row major
gl_Position = newPos;
vec3 changedColors;
changedColors.r += color.r + 0;
changedColors.g += color.g + 0;
changedColors.b += color.b + 0;
fragColor = changedColors;
};
)HereDoc";
const char* fragmentShaderCode =
R"HereDoc(
#version 430
out vec4 color;
in vec3 fragColor;
void main()
{
color = vec4(fragColor, 1.0f);
};
)HereDoc";
void GLAPIENTRY MyOpenGLErrorCallbackFunc(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *userParam)
{
BGZ_CONSOLE("%s type=0x%x %s\n", ( type == GL_DEBUG_TYPE_ERROR ? "** GL ERROR **" : "" ), type, message);
};
void CheckCompileStatus(GLuint shaderID)
{
GLint compileStatus;
glGetShaderiv(shaderID, GL_COMPILE_STATUS, &compileStatus);
if(compileStatus != GL_TRUE)
{
GLint infoLogLength;
glGetShaderiv(shaderID, GL_INFO_LOG_LENGTH, &infoLogLength);
GLchar buffer[512] = {};
GLsizei bufferSize;
glGetShaderInfoLog(shaderID, infoLogLength, &bufferSize, buffer);
BGZ_CONSOLE("%s", buffer);
InvalidCodePath;
};
};
void CheckLinkStatus(GLuint programID)
{
GLint linkStatus;
glGetProgramiv(programID, GL_LINK_STATUS, &linkStatus);
if(linkStatus != GL_TRUE)
{
GLint infoLogLength;
glGetProgramiv(programID, GL_INFO_LOG_LENGTH, &infoLogLength);
GLchar buffer[512] = {};
GLsizei bufferSize;
glGetProgramInfoLog(programID, infoLogLength, &bufferSize, buffer);
BGZ_CONSOLE("%s", buffer);
InvalidCodePath;
};
};
local_func void InstallShaders()
{
GLuint vertexShaderID = glCreateShader(GL_VERTEX_SHADER);
GLuint fragmentShaderID = glCreateShader(GL_FRAGMENT_SHADER);
const char* adapter[1];
adapter[0] = vertexShaderCode;
glShaderSource(vertexShaderID, 1, adapter, 0);
adapter[0] = fragmentShaderCode;
glShaderSource(fragmentShaderID, 1, adapter, 0);
glCompileShader(vertexShaderID);
glCompileShader(fragmentShaderID);
CheckCompileStatus(vertexShaderID);
CheckCompileStatus(fragmentShaderID);
GLuint programID = glCreateProgram();
glAttachShader(programID, vertexShaderID);
glAttachShader(programID, fragmentShaderID);
glLinkProgram(programID);
CheckLinkStatus(programID);
glUseProgram(programID);
};
local_func void
GLInit(int windowWidth, int windowHeight)
{
glEnable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(MyOpenGLErrorCallbackFunc, 0);
glViewport(0, 0, windowWidth, windowHeight);
glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
glEnable(GL_DEPTH_TEST);
glEnable(GL_CULL_FACE);//Defaults to CCW ordering of indices, meaning all indices that, from the viewer's perspective, create triangles in a CW manner represent visible triangles.
glCullFace(GL_BACK);//Culls only back faces (faces facing away from viewer)
InstallShaders();
}
void Draw(Memory_Partition* memPart, s32 id, RunTimeArr<s16> meshIndicies)
{
glDisable(GL_TEXTURE_2D);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 2);
glDrawElements(GL_TRIANGLES, (s32)meshIndicies.length, GL_UNSIGNED_SHORT, 0);
glEnable(GL_TEXTURE_2D);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
};
//This implements a discriminated union for buffering render commands that my game code layer uses.
void RenderViaHardware(Rendering_Info&& renderingInfo, Memory_Partition* platformMemoryPart, int windowWidth, int windowHeight)
{
local_persist bool glIsInitialized { false };
if (NOT glIsInitialized)
{
GLInit(windowWidth, windowHeight);
glClearColor(0.0f, 0.0f, 1.0f, 0.0f);
glIsInitialized = true;
};
u8* currentRenderBufferEntry = renderingInfo.cmdBuffer.baseAddress;
Camera3D camera3d = renderingInfo.camera3d;
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glEnable(GL_TEXTURE_2D);
for (s32 entryNumber = 0; entryNumber < renderingInfo.cmdBuffer.entryCount; ++entryNumber)
{
RenderEntry_Header* entryHeader = (RenderEntry_Header*)currentRenderBufferEntry;
switch (entryHeader->type)
{
case EntryType_InitBuffer:{
RenderEntry_InitBuffer bufferData = *(RenderEntry_InitBuffer*)currentRenderBufferEntry;
ScopedMemory scope{platformMemoryPart};
RunTimeArr<GLfloat> verts{};
InitArr(verts, platformMemoryPart, bufferData.verts.length * 6);
s32 i{};
f32 colorR{1.0f}, colorG{}, colorB{};//Im just hard coding color data right now while I'm learning
for(s32 j{}; j < bufferData.verts.length; ++j)
{
verts.Push(bufferData.verts[j].x);
verts.Push(bufferData.verts[j].y);
verts.Push(bufferData.verts[j].z);
verts.Push(colorR);
verts.Push(colorG);
verts.Push(colorB);
};
u32 vertexArrayID{};
glGenVertexArrays(1, &vertexArrayID);
glBindVertexArray(vertexArrayID);
GLuint bufferID;
glGenBuffers(1, &bufferID);
glBindBuffer(GL_ARRAY_BUFFER, bufferID);
glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * verts.length, verts.elements, GL_DYNAMIC_DRAW);
glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);
glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 6, 0);
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 6, (char*)(sizeof(GLfloat)*3));
GLuint indexBufferID;
glGenBuffers(1, &indexBufferID);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBufferID);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(s16) * bufferData.indicies.length, bufferData.indicies.elements, GL_DYNAMIC_DRAW);
currentRenderBufferEntry += sizeof(RenderEntry_InitBuffer);
}break;
//...other cases for entries which are irrelevant to problem
case EntryType_Geometry: {
ScopedMemory scope{platformMemoryPart};
RenderEntry_Geometry geometryEntry = *(RenderEntry_Geometry*)currentRenderBufferEntry;
//camera transform setup
Mat4x4 xRotMatrix = XRotation(camera3d.rotation.x);
Mat4x4 yRotMatrix = YRotation(camera3d.rotation.y);
Mat4x4 zRotMatrix = ZRotation(camera3d.rotation.z);
Mat4x4 fullRotMatrix = xRotMatrix * yRotMatrix * zRotMatrix;
v3 xAxis = GetColumn(fullRotMatrix, 0);
v3 yAxis = GetColumn(fullRotMatrix, 1);
v3 zAxis = GetColumn(fullRotMatrix, 2);
//Setup full transform matrix
Mat4x4 camTransform = ProduceCameraTransformMatrix(xAxis, yAxis, zAxis, camera3d.worldPos);
Mat4x4 projectionTransform = ProduceProjectionTransformMatrix_UsingFOV(renderingInfo.fov, renderingInfo.aspectRatio, renderingInfo.nearPlane, renderingInfo.farPlane);
Mat4x4 fullTransformMatrix = projectionTransform * camTransform * geometryEntry.worldTransform;
//Send transform matrix to vertex shader
GLint transformMatrixUniformLocation = glGetUniformLocation(3, "transformationMatrix");
glUniformMatrix4fv(transformMatrixUniformLocation, 1, GL_FALSE, &fullTransformMatrix.elem[0][0]);
Draw(platformMemoryPart, geometryEntry.id, geometryEntry.indicies);
currentRenderBufferEntry += sizeof(RenderEntry_Geometry);
}break;
InvalidDefaultCase;
};
}
renderingInfo.cmdBuffer.entryCount = 0;
};
EDIT:
I figured out the issue with the colors not working, which was answered in the comments below. However, I still don't know what these warnings are trying to tell me, or whether they are something I should be looking to fix.
Your variable vec3 changedColors; is uninitialized. Initialize it with vec3 changedColors = vec3(0);. The reason it works on your laptop might be that its graphics driver initializes it to zero by default, while your other graphics driver doesn't.
Regarding the warning (not error): it just tells you that your buffer will be put in video memory, since you're using GL_STATIC_DRAW for it. It's really more of a log message and you can safely ignore it. If you want to get rid of it, you have to filter it away in your callback (the one you passed to glDebugMessageCallback). Your callback has a severity parameter that lets you filter messages of a certain severity.
Or, if you only want to get rid of that specific message, filter on its id value (or suppress it at the source; see the sketch after the example below).
Here's an example taken from blog.nobel-joergensen.com:
void APIENTRY openglCallbackFunction(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar* message, const void* userParam)
{
cout << "---------------------opengl-callback-start------------" << endl;
cout << "message: "<< message << endl;
cout << "type: ";
switch (type) {
case GL_DEBUG_TYPE_ERROR:
cout << "ERROR";
break;
case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR:
cout << "DEPRECATED_BEHAVIOR";
break;
case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR:
cout << "UNDEFINED_BEHAVIOR";
break;
case GL_DEBUG_TYPE_PORTABILITY:
cout << "PORTABILITY";
break;
case GL_DEBUG_TYPE_PERFORMANCE:
cout << "PERFORMANCE";
break;
case GL_DEBUG_TYPE_OTHER:
cout << "OTHER";
break;
}
cout << endl;
cout << "id: " << id << endl;
cout << "severity: ";
switch (severity){
case GL_DEBUG_SEVERITY_LOW:
cout << "LOW";
break;
case GL_DEBUG_SEVERITY_MEDIUM:
cout << "MEDIUM";
break;
case GL_DEBUG_SEVERITY_HIGH:
cout << "HIGH";
break;
}
cout << endl;
cout << "---------------------opengl-callback-end--------------" << endl;
}
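As mentioned above, you can also silence just that one message at the source instead of filtering in the callback. A sketch (the id 131185 and the source/type values are assumptions taken from the NVIDIA message the question reports; when ids are passed, source and type must be specific and severity must be GL_DONT_CARE):
GLuint suppressedIds[] = { 131185 }; //the "will use VIDEO memory" notification id reported by the NVIDIA driver
glDebugMessageControl(GL_DEBUG_SOURCE_API, GL_DEBUG_TYPE_OTHER, GL_DONT_CARE,
    1, suppressedIds, GL_FALSE); //GL_FALSE = stop delivering this message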

Use different shader programs in OpenGL?

I have to use two different shader programs in OpenGL for different objects.
I found that I have to use glUseProgram() to switch between different shader programs, but there isn't much information beyond that.
How does generating and binding VAOs and VBOs work for each shader program (how & when) given that I have two different shader programs I use for different objects?
When you render objects in OpenGL, your code will look like this:
1. Bind the program with glUseProgram and set uniforms with glUniform4fv, glUniformMatrix4fv, etc.
2. Bind the vertex array with glBindVertexArray.
3. Bind any textures you need with glActiveTexture and glBindTexture.
4. Change any other state, e.g. glEnable, glDisable, glBlendFunc.
5. Draw with glDrawArrays or glDrawElements.
6. If you want, reset state back to defaults.
These are all the things that you tend to do in vanilla OpenGL 3 code. You should already have this part working.
If you need to draw multiple objects with different shader programs, you just do the above steps multiple times. State changes can be omitted if you are going to use the same state for multiple programs (except uniforms, which are stored separately for each program). For example, you might use the same VAO, the same textures, the same blending function, et cetera.
There are many tutorials on how OpenGL 3 drawing commands work, if you are looking for more detailed examples.
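For instance, a minimal per-frame sketch of the steps above with two objects and two programs might look like this (the programs, VAOs, uniforms and counts are assumed to have been created during initialization; the names are illustrative):
//object 1
glUseProgram(programA);
glUniformMatrix4fv(glGetUniformLocation(programA, "mvp"), 1, GL_FALSE, mvpA);
glBindVertexArray(vaoA);
glDrawElements(GL_TRIANGLES, countA, GL_UNSIGNED_SHORT, 0);
//object 2: switch programs, then redo whatever state differs
glUseProgram(programB);
glUniformMatrix4fv(glGetUniformLocation(programB, "mvp"), 1, GL_FALSE, mvpB);
glBindVertexArray(vaoB); //could be the same VAO if both objects share vertex data
glDrawArrays(GL_TRIANGLES, 0, countB);
//optionally reset state
glBindVertexArray(0);
glUseProgram(0);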
It is possible to switch program objects without having to specify their arguments all over again. In order for it to work, though, you must pre-assign the values of your vertex attribute array indices before compiling and linking, using glBindAttribLocation, making sure each of your programs uses separate indices. If you don't, then the same VBO may go to both programs. The VBOs being used by both programs must all be in the same VAO, which must be bound while the programs are active and the Draw commands are executed.
Here's an example:
// HELPER FUNCTIONS
// ================
// Read a file to a string.
char *load(const char *fn)
{
int fd = open(fn, O_RDONLY);
assert(fd != -1);
off_t size = lseek(fd, 0, SEEK_END);
assert(size != -1);
off_t res = lseek(fd, 0, SEEK_SET);
assert(res != -1);
size++; // null terminator
char *buf = (char *)malloc(size);
char *p = buf;
for (;;) {
// File has gotten bigger since we started? Fuck that.
assert(p - buf < size);
ssize_t nread = read(fd, (char *)p, 0x10000);
assert(nread != -1);
if (nread == 0) {
*p = '\0';
break;
}
#ifndef NDEBUG
// Null character? Fuck that shit.
void *nullbyte = memchr((char *)p, '\0', nread);
assert(nullbyte == NULL);
#endif
p += nread;
}
int cres = close(fd);
assert(cres == 0);
return buf;
}
// Compile the "type" shader named "filename" and attach it to
// "shader_program".
static void compile_shader(GLenum type, const char *filename,
GLuint shader_program)
{
GLuint shader = glCreateShader(type);
char *source = load(filename);
glShaderSource(shader, 1, &source, 0);
glCompileShader(shader);
#ifndef NDEBUG
GLint success;
glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
if (!success) {
GLint log_length;
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
char *log = (char *)malloc(log_length);
glGetShaderInfoLog(shader, log_length, NULL, log);
fprintf(stderr, "Failed to compile %s:\n%s", filename, log);
abort();
}
#endif
glAttachShader(shader_program, shader);
}
// Return a shader program with vertex shader "vertfile" and the fragment
// shader "fragfile". The program is not linked, in case stuff still needs to
// be added.
GLuint compile_shader_program(const char *vertfile, const char *fragfile)
{
GLuint p = glCreateProgram();
compile_shader(GL_VERTEX_SHADER, vertfile, p);
compile_shader(GL_FRAGMENT_SHADER, fragfile, p);
return p;
}
// . . .
// INIT
// ====
// Make and bind the VAO, shared between both programs.
GLuint vao;
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
// Init one_program.
float square[] = {0.f, 0.f, 1.f, 0.f, 0.f, 1.f, 1.f, 1.f};
GLuint vbo;
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(square), square, GL_STATIC_DRAW);
// Make sure these indices are unique among all VBOs you are using to render.
GLuint attr = 0;
glEnableVertexAttribArray(attr);
glVertexAttribPointer(attr, 2, GL_FLOAT, GL_FALSE, 0, NULL);
GLuint one_program = compile_shader_program("glsl/2d.vert", "glsl/white.frag");
glBindAttribLocation(one_program, attr, "vertex_pos");
glLinkProgram(one_program);
// Set uniforms.
// . . .
// Do the same thing for the_other_program.
float triangle[] = {0.f, 0.f, 1.f, 0.f, 0.f, 1.f};
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(triangle), triangle, GL_STATIC_DRAW);
attr++;
glEnableVertexAttribArray(attr);
glVertexAttribPointer(attr, 2, GL_FLOAT, GL_FALSE, 0, NULL);
GLuint the_other_program =
compile_shader_program("glsl/2d.vert", "glsl/chrome.frag");
glBindAttribLocation(the_other_program, attr, "vertex_pos");
glLinkProgram(the_other_program);
// Set uniforms.
// . . .
// RENDER ONE FRAME
// ================
glUseProgram(one_program);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glUseProgram(the_other_program);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 3);

SFML - Opengl VAO issue giving me an (1282) error

I am getting an error when trying to use VAOs inside of SFML, and I'm not sure if it is SFML or my own OpenGL code.
GLenum err = glewInit();
if (err != GLEW_OK)
{
std::cout << "NOT WORKING" << std::endl;
}
std::vector<sf::Vector3f> g_vertex_buffer_data;
g_vertex_buffer_data.push_back({ -1.0f, -1.0f, 0.0f });
g_vertex_buffer_data.push_back({1.0f, -1.0f, 0.0f});
g_vertex_buffer_data.push_back({ 0.0f, 1.0f, 0.0f });
const char* vertexShaderSource =
"#version 330\n\
in vec4 position;\
void main(void){\
gl_Position = position;\
}";
// compile fragment shader source
const GLchar* fragmentShaderSource =
"#version 330\n\
void main(void) {\
out vec4 fragcolor; fragcolor= vec4(1.0,1.0,1.0,1.0);\
}";
/* Creating Shader */
this->programId = glCreateProgram();
this->vId = glCreateShader(GL_VERTEX_SHADER);
this->fId = glCreateShader(GL_FRAGMENT_SHADER);
/* Get Shader Size */
int vertexShaderLength = strlen(vertexShaderSource);
int fragmentShaderLength = strlen(fragmentShaderSource);
/* Loading and binding shader */
glShaderSource(this->vId, 1, &vertexShaderSource, NULL);
glShaderSource(this->fId, 1, &fragmentShaderSource, NULL);
/* Compile Shaders */
glCompileShader(vId);
glCompileShader(fId);
/* Attach Shaders */
glAttachShader(this->programId, this->vId);
glAttachShader(this->programId, this->fId);
/* Linkg program */
glLinkProgram(this->programId);
/* Use and bind attribute */
glUseProgram(this->programId);
this->positionId = glGetAttribLocation(this->programId, "position");
glUseProgram(0);
/* VAO Time */
glGenVertexArrays(1, &this->vaoId);
glBindVertexArray(this->vaoId);
/* VBO Time assigning to VAO */
glGenBuffers(1, &this->vboId);
glBindBuffer(GL_ARRAY_BUFFER, this->vboId);
glBufferData(GL_ARRAY_BUFFER, g_vertex_buffer_data.size() * sizeof(sf::Vector3f), &g_vertex_buffer_data[0], GL_STATIC_DRAW);
glEnableVertexAttribArray(this->positionId);
glVertexAttribPointer(this->positionId, 2, GL_FLOAT, GL_FALSE, sizeof(sf::Vector3f), 0);
/* Close out bindings */
glBindVertexArray(0);
glBindBuffer(GL_ARRAY_BUFFER, 0);
while(1)
{
glUseProgram(this->programId);
glBindVertexArray(this->vaoId);
glDrawArrays(GL_TRIANGLES, 0, 3);
glBindVertexArray(0);
glUseProgram(0);
gameWindow.glPushStates();
}
The error code I get is: OpenGL error in user code (1282)
I have changed the size() issue that was brought up in the glBufferData() call, but I am still getting the error.
There is at least a problem with the size that is passed to glBufferData:
glBufferData(GL_ARRAY_BUFFER,
sizeof(g_vertex_buffer_data) * sizeof(sf::Vector3f),
g_vertex_buffer_data[0], GL_STATIC_DRAW);
sizeof(g_vertex_buffer_data) is equal to sizeof(std::vector<?>) which is the size of the vector object and not the size of the data contained. Try using
glBufferData(GL_ARRAY_BUFFER,
g_vertex_buffer_data.size() * sizeof(sf::Vector3f),
&g_vertex_buffer_data[0], GL_STATIC_DRAW);
Another thing: In OpenGL 3.3 Core Profile there is no gl_FragColor variable. You will have to define an out variable.
Next: Your vertex shader seems to be empty. You have to write to gl_Position otherwise nothing will be shown.
Possible error codes for glGetAttribLocation include GL_INVALID_OPERATION. The symbolic names don't tell you the numeric value directly, so get the error string with gluErrorString() or look in the header to see which of them 1282 maps to (1282 is 0x0502, i.e. GL_INVALID_OPERATION).
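A small helper that drains the error queue and prints readable names makes this less painful; here is a sketch, assuming GLU is linked for gluErrorString (otherwise map the enum values by hand):
void checkGLErrors(const char* where)
{
    GLenum err;
    while ((err = glGetError()) != GL_NO_ERROR) //drain the whole error queue
        printf("GL error at %s: 0x%04X (%s)\n", where, err, (const char*)gluErrorString(err));
}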
• Check that your shaders compiled without error.
• Check that your program linked without error.
What type does positionId have? All object ids must be of GLuint type.
And by the way, always enable shader compilation/linking error checks; debugging will be much more informative.
I do it this way (OpenGL ES 2.0):
m_nVertexShader = glCreateShader(GL_VERTEX_SHADER);
m_nPixelShader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(m_nVertexShader, 1, &lpszVertexBuffer, NULL);
glShaderSource(m_nPixelShader, 1, &lpszFragmentBuffer, NULL);
glCompileShader(m_nVertexShader);
int iIsOk = 0;
glGetShaderiv(m_nVertexShader, GL_COMPILE_STATUS, &iIsOk);
if(!iIsOk)
{
GLint infoLen = 0;
glGetShaderiv(m_nVertexShader, GL_INFO_LOG_LENGTH, &infoLen);
if(infoLen > 1)
{
char* infoLog = (char*)malloc(sizeof(char) * infoLen);
glGetShaderInfoLog(m_nVertexShader, infoLen, NULL, infoLog);
QMessageBox::warning(this, QString("Error"),
QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
free(infoLog);
}
glDeleteShader(m_nVertexShader);
return;
}
glCompileShader(m_nPixelShader);
glGetShaderiv(m_nPixelShader, GL_COMPILE_STATUS, &iIsOk);
if(!iIsOk)
{
GLint infoLen = 0;
glGetShaderiv(m_nPixelShader, GL_INFO_LOG_LENGTH, &infoLen);
if(infoLen > 1)
{
char* infoLog = (char*)malloc(sizeof(char) * infoLen);
glGetShaderInfoLog(m_nPixelShader, infoLen, NULL, infoLog);
QMessageBox::warning(this, QString("Error"),
QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
free(infoLog);
}
glDeleteShader(m_nPixelShader);
return;
}
m_nProgram = glCreateProgram();
glAttachShader(m_nProgram, m_nVertexShader);
glAttachShader(m_nProgram, m_nPixelShader);
glBindAttribLocation(m_nProgram, 0, "rm_Vertex");
glLinkProgram(m_nProgram);
glGetProgramiv(m_nProgram, GL_LINK_STATUS, &iIsOk);
// Fail to pass status validation
if(!iIsOk)
{
GLint infoLen = 0;
glGetProgramiv(m_nProgram, GL_INFO_LOG_LENGTH, &infoLen);
if(infoLen > 1)
{
char* infoLog = (char*)malloc(sizeof(char) * infoLen);
glGetProgramInfoLog(m_nProgram, infoLen, NULL, infoLog);
QMessageBox::warning(this, QString("Error"),
QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
free(infoLog);
}
glDeleteProgram(m_nProgram);
return;
}
glUseProgram(m_nProgram);
As you are using GLSL 330, you first need to specify the fragment shader's render-target output by calling
glBindFragDataLocation(this->programId, 0, "fragcolor");
Secondly, your fragment shader must look like:
"#version 330
out vec4 fragcolor;
void main(void) {
fragcolor= vec4(1.0,1.0,1.0,1.0);
}
An example of using this kind of shader is in the OpenGL 3.3 + GLSL 1.5 sample.

How do you measure peak memory bandwidth in OpenGL?

Just to get an idea of what kind of speeds I should be expecting, I have been trying to benchmark transfers between global memory and shaders, rather than relying on GPU spec sheets. However, I can't get close to the theoretical maximum. In fact, I'm out by a factor of 50!
I'm using a GTX Titan X, which is said to have 336.5GB/s. Linux x64 driver 352.21.
I found a CUDA benchmark here which gives me ~240–250GB/s (this is more like what I expect).
I'm trying to match exactly what they do with shaders. I've tried vertex shaders, compute shaders, accessing buffer objects via image_load_store and NV_shader_buffer_store, with floats, vec4s, loops inside the shader (with coalesced addressing within the work group) and various methods of timing. I'm stuck at ~7GB/s (see the update below).
Why is GL so much slower? Am I doing something wrong and if so, how should it be done?
Here's my MWE with three methods (1. vertex shader with image_load_store, 2. vertex shader with bindless graphics, 3. compute shader with bindless graphics):
//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>
const char* imageSource =
"#version 440\n"
"uniform layout(r32f) imageBuffer data;\n"
"uniform float val;\n"
"void main() {\n"
" imageStore(data, gl_VertexID, vec4(val, 0.0, 0.0, 0.0));\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_VertexID] = val;\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessComputeSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"layout(local_size_x = 256) in;\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_GlobalInvocationID.x] = val;\n"
"}\n";
GLuint compile(GLenum type, const char* shaderSrc)
{
GLuint shader = glCreateShader(type);
glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
glCompileShader(shader);
int success = 0;
int loglen = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
GLchar* log = new GLchar[loglen];
glGetShaderInfoLog(shader, loglen, &loglen, log);
if (!success)
{
printf("%s\n", log);
exit(0);
}
GLuint program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
return program;
}
GLuint timerQueries[2];
void start()
{
glGenQueries(2, timerQueries);
glQueryCounter(timerQueries[0], GL_TIMESTAMP);
}
float stop()
{
glMemoryBarrier(GL_ALL_BARRIER_BITS);
GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
glQueryCounter(timerQueries[1], GL_TIMESTAMP);
GLint available = 0;
while (!available) //sometimes gets stuck here for whatever reason
glGetQueryObjectiv(timerQueries[1], GL_QUERY_RESULT_AVAILABLE, &available);
GLuint64 a, b;
glGetQueryObjectui64v(timerQueries[0], GL_QUERY_RESULT, &a);
glGetQueryObjectui64v(timerQueries[1], GL_QUERY_RESULT, &b);
glDeleteQueries(2, timerQueries);
return b - a;
}
int main(int argc, char** argv)
{
float* check;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
glutCreateWindow("test");
glewInit();
int bufferSize = 64 * 1024 * 1024; //64MB
int loops = 500;
glEnable(GL_RASTERIZER_DISCARD);
float* dat = new float[bufferSize/sizeof(float)];
memset(dat, 0, bufferSize);
//create a buffer with data
GLuint buffer;
glGenBuffers(1, &buffer);
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);
//get a bindless address
GLuint64 address;
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
//make a texture alias for it
GLuint bufferTexture;
glGenTextures(1, &bufferTexture);
glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);
glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);
//compile the shaders
GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
GLuint bindlessComputeShader = compile(GL_COMPUTE_SHADER, bindlessComputeSource);
//warm-up and check values
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
//glMemoryBarrier(GL_ALL_BARRIER_BITS); //this causes glDispatchCompute to segfault later, so don't uncomment
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f); //glDispatchCompute doesn't actually write anything with bindless graphics
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glFinish();
//time image_load_store
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 imageTime = stop();
printf("image_load_store: %.2fGB/s\n", (float)((bufferSize * (double)loops) / imageTime));
//time bindless
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 bindlessTime = stop();
printf("bindless: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessTime));
//time bindless in a compute shader
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
GLuint64 bindlessComputeTime = stop();
printf("bindless compute: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessComputeTime));
assert(glGetError() == GL_NO_ERROR);
return 0;
}
My output:
image_load_store: 6.66GB/s
bindless: 6.68GB/s
bindless compute: 6.65GB/s
Some notes:
Compute shaders with bindless graphics don't appear to write anything (the commented-out assert fails), or at least the data isn't retrieved with glMapBuffer, even though the speed matches the other methods. Using image_load_store in the compute shader works and gives the same speed as the vertex shaders (though I thought that'd be one too many permutations to post).
Calling glMemoryBarrier(GL_ALL_BARRIER_BITS) before glDispatchCompute causes a crash in the driver.
Commenting out the three glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW) calls, which are used to check the output, raises the speed of the first two tests to 17GB/s, and the compute shader skyrockets to 292GB/s, which is much closer to what I'd like, but this can't be trusted because of point 1.
Sometimes while (!available) hangs for ages (ctrl-c when I get tired of waiting shows it's still in the loop).
For reference, here's the CUDA code:
//http://www.ks.uiuc.edu/Research/vmd/doxygen/CUDABench_8cu-source.html
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define CUERR { cudaError_t err; \
if ((err = cudaGetLastError()) != cudaSuccess) { \
printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
return -1; }}
//
// GPU device global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = src[idx];
}
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = val;
}
typedef float4 datatype;
static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
int i;
int len = 1 << 22; // one thread per data element
int loops = 500;
datatype *src, *dest;
datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
// initialize to zero for starters
float memsettime = 0.0f;
float memcpytime = 0.0f;
*gpumemsetgbsec = 0.0;
*gpumemcpygbsec = 0.0;
// attach to the selected device
cudaError_t rc;
rc = cudaSetDevice(cudadev);
if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
rc = cudaGetLastError(); // query last error and reset error state
if (rc != cudaErrorSetOnActiveProcess)
return -1; // abort and return an error
#else
cudaGetLastError(); // just ignore and reset error state, since older CUDA
// revs don't have a cudaErrorSetOnActiveProcess enum
#endif
}
cudaMalloc((void **) &src, sizeof(datatype)*len);
CUERR
cudaMalloc((void **) &dest, sizeof(datatype)*len);
CUERR
dim3 BSz(256, 1, 1);
dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
// do a warm-up pass
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
CUERR
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
CUERR
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
CUERR
cudaEvent_t start, end;
cudaEventCreate(&start);
cudaEventCreate(&end);
// execute the memset kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
}
CUERR
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memsettime, start, end);
CUERR
// execute the memcpy kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
}
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memcpytime, start, end);
CUERR
cudaEventDestroy(start);
CUERR
cudaEventDestroy(end);
CUERR
*gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
*gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
cudaFree(dest);
cudaFree(src);
CUERR
return 0;
}
int main()
{
double a, b;
cudaglobmembw(0, &a, &b);
printf("%f %f\n", (float)a, (float)b);
return 0;
}
Update:
It seems that the buffer gets made non-resident on my glBufferData calls which were there to check output was being written. As per the extension:
A buffer is also made non-resident implicitly as a result of being respecified via BufferData or being deleted.
...
BufferData is specified to "delete the existing data store",
so the GPU address of that data should become invalid. The buffer is
therefore made non-resident in the current context.
At a guess, OpenGL then streams in the buffer object data each frame and doesn't cache it in video memory. This explains why the compute shader failed the assert; however, there's a slight anomaly in that bindless graphics in the vertex shader still worked when the buffer was not resident, but I'll ignore that for now. I have no idea why a 64MB buffer object wouldn't default to being resident (though perhaps after first use) when there's 12GB available.
So after each call to glBufferData I make it resident again and get the address in case it's changed:
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
I'm now getting 270–290GB/s with the compute shader using either image_load_store or bindless graphics. Now my questions are:
Given the buffer seems to be resident for each test and the compute shader is nice and fast, why are the vertex shader versions still so slow?
Without the bindless graphics extension, how should regular OpenGL users put data into video memory (actually put and not idly suggest that the driver might just like to)?
I'm pretty sure I would have noticed this problem in real world situations, and it's this contrived benchmark that hits a slow path, so how could I trick the driver into making a buffer object resident? Running a compute shader first doesn't change anything.
You are asking the driver to read from your process memory, dat. This causes extensive cache coherency traffic. When the GPU reads that memory, it can't be sure that it is up to date; it might be in the CPU cache, modified, and not yet written back to RAM. This forces the GPU to actually read from the CPU cache, which is far more expensive than bypassing the CPU and reading the RAM. The RAM is often idle during normal operation, because a modern CPU's cache hit rate is typically 95% to 99%; the cache is used continuously.
To achieve maximum performance, you need to let the driver allocate the memory. Normal memory your program uses, like global variables and the heap, is allocated as write-back memory. Driver-allocated memory will usually be write-combining or uncacheable, which eliminates the coherency traffic.
Peak advertised bandwidth numbers will be achieved only without cache coherency overhead.
To let the driver allocate it, use glBufferData with a nullptr for the data.
It isn't all rosy, though, if you manage to coerce the driver into using a write-combining buffer in system memory: CPU reads from such addresses will be very slow. Sequential writes are optimized by the CPU, but random writes will cause the write-combining buffer to flush frequently, hurting performance.
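Following that suggestion, here is a minimal sketch of letting the driver own the data store and filling it afterwards, reusing the buffer, bufferSize and dat names from the question's code:
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW); //no client pointer: the driver allocates the store
glBufferSubData(GL_TEXTURE_BUFFER, 0, bufferSize, dat); //upload separately if the initial contents matter
//or map it and write sequentially (write-combining memory likes sequential writes):
void* ptr = glMapBufferRange(GL_TEXTURE_BUFFER, 0, bufferSize,
    GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
memset(ptr, 0, bufferSize);
glUnmapBuffer(GL_TEXTURE_BUFFER);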

strange opengl rendering stutter

I'm experiencing a strange stutter in my simple OpenGL (via GLFW3) app. Although vsync is enabled (the frame rate is an almost steady 60 fps), the motion of the spinning triangle is not always smooth - it's almost as if some frames are skipped sometimes. I tried looking at the time difference between consecutive calls to glfwSwapBuffers(), but those seem pretty consistent.
Am I doing something wrong? Should I use some kind of motion blur filtering to make it appear smoother?
The code:
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <cfloat>
#include <cassert>
#include <minmax.h>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <Windows.h>
#include <GL/glew.h>
#include <gl/GLU.h>
//#include <GL/GL.h>
#include <GLFW/glfw3.h>
#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>
#ifdef _WIN32
#pragma warning(disable:4996)
#endif
static int swap_interval;
static double frame_rate;
GLuint LoadShaders(const char * vertex_file_path,const char * fragment_file_path){
// Create the shaders
GLuint VertexShaderID = glCreateShader(GL_VERTEX_SHADER);
GLuint FragmentShaderID = glCreateShader(GL_FRAGMENT_SHADER);
// Read the Vertex Shader code from the file
std::string VertexShaderCode;
std::ifstream VertexShaderStream(vertex_file_path, std::ios::in);
if(VertexShaderStream.is_open()){
std::string Line = "";
while(getline(VertexShaderStream, Line))
VertexShaderCode += "\n" + Line;
VertexShaderStream.close();
}else{
printf("Impossible to open %s. Are you in the right directory ? Don't forget to read the FAQ !\n", vertex_file_path);
return 0;
}
// Read the Fragment Shader code from the file
std::string FragmentShaderCode;
std::ifstream FragmentShaderStream(fragment_file_path, std::ios::in);
if(FragmentShaderStream.is_open()){
std::string Line = "";
while(getline(FragmentShaderStream, Line))
FragmentShaderCode += "\n" + Line;
FragmentShaderStream.close();
}
GLint Result = GL_FALSE;
int InfoLogLength;
// Compile Vertex Shader
printf("Compiling shader : %s\n", vertex_file_path);
char const * VertexSourcePointer = VertexShaderCode.c_str();
glShaderSource(VertexShaderID, 1, &VertexSourcePointer , NULL);
glCompileShader(VertexShaderID);
// Check Vertex Shader
glGetShaderiv(VertexShaderID, GL_COMPILE_STATUS, &Result);
if (Result != GL_TRUE)
{
glGetShaderiv(VertexShaderID, GL_INFO_LOG_LENGTH, &InfoLogLength);
if ( InfoLogLength > 0 ){
std::vector<char> VertexShaderErrorMessage(InfoLogLength+1);
glGetShaderInfoLog(VertexShaderID, InfoLogLength, NULL, &VertexShaderErrorMessage[0]);
printf("%s\n", &VertexShaderErrorMessage[0]);
}
}
// Compile Fragment Shader
printf("Compiling shader : %s\n", fragment_file_path);
char const * FragmentSourcePointer = FragmentShaderCode.c_str();
glShaderSource(FragmentShaderID, 1, &FragmentSourcePointer , NULL);
glCompileShader(FragmentShaderID);
// Check Fragment Shader
glGetShaderiv(FragmentShaderID, GL_COMPILE_STATUS, &Result);
if (Result != GL_TRUE)
{
glGetShaderiv(FragmentShaderID, GL_INFO_LOG_LENGTH, &InfoLogLength);
if ( InfoLogLength > 0 ){
std::vector<char> FragmentShaderErrorMessage(InfoLogLength+1);
glGetShaderInfoLog(FragmentShaderID, InfoLogLength, NULL, &FragmentShaderErrorMessage[0]);
printf("%s\n", &FragmentShaderErrorMessage[0]);
}
}
// Link the program
printf("Linking program\n");
GLuint ProgramID = glCreateProgram();
glAttachShader(ProgramID, VertexShaderID);
glAttachShader(ProgramID, FragmentShaderID);
glLinkProgram(ProgramID);
// Check the program
glGetProgramiv(ProgramID, GL_LINK_STATUS, &Result);
if (Result != GL_TRUE)
{
glGetProgramiv(ProgramID, GL_INFO_LOG_LENGTH, &InfoLogLength);
if ( InfoLogLength > 0 ){
std::vector<char> ProgramErrorMessage(InfoLogLength+1);
glGetProgramInfoLog(ProgramID, InfoLogLength, NULL, &ProgramErrorMessage[0]);
printf("%s\n", &ProgramErrorMessage[0]);
}
}
#ifdef _DEBUG
glValidateProgram(ProgramID);
#endif
glDeleteShader(VertexShaderID);
glDeleteShader(FragmentShaderID);
return ProgramID;
}
static void framebuffer_size_callback(GLFWwindow* window, int width, int height)
{
glViewport(0, 0, width, height);
}
static void set_swap_interval(GLFWwindow* window, int interval)
{
swap_interval = interval;
glfwSwapInterval(swap_interval);
}
static void key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
{
if (key == GLFW_KEY_SPACE && action == GLFW_PRESS)
set_swap_interval(window, 1 - swap_interval);
}
static bool init(GLFWwindow** win)
{
if (!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_COMPAT_PROFILE);
// creating a window using the monitor param will open it full screen
const bool useFullScreen = false;
GLFWmonitor* monitor = useFullScreen ? glfwGetPrimaryMonitor() : NULL;
*win = glfwCreateWindow(640, 480, "", monitor, NULL);
if (!(*win))
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(*win);
GLenum glewError = glewInit();
if( glewError != GLEW_OK )
{
printf( "Error initializing GLEW! %s\n", glewGetErrorString( glewError ) );
return false;
}
//Make sure OpenGL 2.1 is supported
if( !GLEW_VERSION_2_1 )
{
printf( "OpenGL 2.1 not supported!\n" );
return false;
}
glfwMakeContextCurrent(*win);
glfwSetFramebufferSizeCallback(*win, framebuffer_size_callback);
glfwSetKeyCallback(*win, key_callback);
// get version info
const GLubyte* renderer = glGetString (GL_RENDERER); // get renderer string
const GLubyte* version = glGetString (GL_VERSION); // version as a string
printf("Renderer: %s\n", renderer);
printf("OpenGL version supported %s\n", version);
return true;
}
std::string string_format(const std::string fmt, ...) {
int size = 100;
std::string str;
va_list ap;
while (1) {
str.resize(size);
va_start(ap, fmt);
int n = vsnprintf((char *)str.c_str(), size, fmt.c_str(), ap);
va_end(ap);
if (n > -1 && n < size) {
str.resize(n);
return str;
}
if (n > -1)
size = n + 1;
else
size *= 2;
}
return str;
}
int main(int argc, char* argv[])
{
srand(9); // constant seed, for deterministic results
unsigned long frame_count = 0;
GLFWwindow* window;
init(&window);
// An array of 3 vectors which represents 3 vertices
static const GLfloat g_vertex_buffer_data[] = {
-1.0f, -1.0f, 0.0f,
1.0f, -1.0f, 0.0f,
0.0f, 1.0f, 0.0f,
};
GLuint vbo;
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
// allocate GPU memory and copy data
glBufferData(GL_ARRAY_BUFFER, sizeof(g_vertex_buffer_data), g_vertex_buffer_data, GL_STATIC_DRAW);
unsigned int vao = 0;
glGenVertexArrays (1, &vao);
glBindVertexArray (vao);
glEnableVertexAttribArray (0);
glBindBuffer (GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer (0, 3, GL_FLOAT, GL_FALSE, 0, 0);
// Create and compile our GLSL program from the shaders
GLuint programID = LoadShaders( "1.vert", "1.frag" );
// Use our shader
glUseProgram(programID);
GLint locPosition = glGetAttribLocation(programID, "vertex");
assert(locPosition != -1);
glm::mat4 world(1.0f);
GLint locWorld = glGetUniformLocation(programID, "gWorld");
assert(locWorld != -1 && "Error getting address (was it optimized out?)!");
glUniformMatrix4fv(locWorld, 1, GL_FALSE, glm::value_ptr(world));
GLenum err = glGetError();
GLint loc = glGetUniformLocation(programID, "time");
assert(loc != -1 && "Error getting uniform address (was it optimized out?)!");
bool isRunning = true;
while (isRunning)
{
static float time = 0.0f;
static float oldTime = 0.0f;
static float fpsLastUpdateTime = 0.0f;
oldTime = time;
time = (float)glfwGetTime();
static std::string fps;
glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glUseProgram (programID);
glUniform1f(loc, time);
glBindVertexArray (vao);
glDrawArrays (GL_TRIANGLES, 0, 3);
glfwSwapBuffers(window);
glfwPollEvents();
isRunning = !glfwWindowShouldClose(window);
float dT = time-oldTime;
if (time-fpsLastUpdateTime > 0.5)
{
static const char* fmt = "frame rate: %.1f frames per second";
glfwSetWindowTitle(window, string_format(fmt, 1.0f/(dT)).c_str());
fpsLastUpdateTime = time;
}
}
glfwDestroyWindow(window);
glfwTerminate();
return 0;
}
////////////////////////////////////////
// 1.frag
////////////////////////////////////////
#version 330 core
// Ouput data
out vec3 color;
void main()
{
// Output color = red
color = vec3(1,0,0);
}
//////////////////////////////////////////////
// 1.vert
//////////////////////////////////////////////
#version 330 core
// Input vertex data, different for all executions of this shader.
in vec3 vertex;
uniform mat4 gWorld;
uniform float time;
void main()
{
gl_Position = gWorld * vec4(vertex, 1.0f);
gl_Position.x += sin(time);
gl_Position.y += cos(time)/2.0f;
gl_Position.w = 1.0;
}
OK. I got home and did more testing.
First I tried to disable the V-Sync, but I couldn't! I had to disable the windows' desktop effects (Aero) to be able to do so, and lo and behold - once Aero was disabled, the stutter disappeared (with V-Sync on).
Then I tested it with V-Sync off, and of course, I got much higher frame rate with the occasional expected tearing.
Then I tested it in full screen. The rendering was smooth with Aero and without it.
I couldn't find anyone else who shares this problem. Do you think it's a GLFW3 bug? A driver/hardware issue (I have a GTS 450 with the latest drivers)?
Thank you all for your answers. I learned a lot, but my problem is still unsolved.
It's a strange interaction problem between the Windows DWM (Desktop Window Manager) composition mode and glfwSwapBuffers(). I haven't gotten down to the root of the problem yet, but you can work around the stuttering by doing one of the following:
go fullscreen
disable DWM window composition (see my answer to Linear movement stutter; a sketch follows this list)
enable multi sampling: glfwWindowHint(GLFW_SAMPLES, 4);
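For the second option, here is a sketch of turning composition off programmatically on Windows 7 (this is my own addition rather than the answerer's code; it needs dwmapi.h/dwmapi.lib, and the call is a no-op on Windows 8 and later, where composition can no longer be disabled):
#include <dwmapi.h>
#pragma comment(lib, "dwmapi.lib")
void disableAeroComposition()
{
    DwmEnableComposition(DWM_EC_DISABLECOMPOSITION); //restore later with DWM_EC_ENABLECOMPOSITION
}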
Without seeing the stutter it is difficult to say what the problem is, but the first impression of your program is OK.
So I guess you observe that once in a while a frame is shown twice, leading to a very small stutter. This usually happens when you try to output 60 frames on a 60Hz monitor with vsync.
In such a setup you must not miss a single vsync period, or you will see a stutter caused by the frame being shown twice.
On the other hand, it is nearly impossible to guarantee this, because the scheduler on Windows platforms schedules threads in roughly 15ms time slices (I don't know the exact value by heart).
So it is possible that a higher-priority thread uses the CPU and your presenting thread is not able to swap the buffers for a new frame in time. When you increase the numbers, e.g. 120 frames on a 120Hz monitor, you will see those stutters even more often.
So I don't know of any way to prevent this on the Windows platform, but if someone else does, I would be happy to hear it too.
It's hard to tell without seeing your problem, but unless we are talking about severe stuttering, it's rarely a rendering issue. The motion/physics in your program is handled by the CPU, and the way you are implementing your animation is solely dependent on the CPU.
What this means is that:
Say you are rotating your triangle by a fixed amount every iteration of your loop. This is very dependent on the time an iteration takes to complete. Things like CPU workload can have a huge impact on what you see on screen (though not necessarily). It doesn't even take heavy CPU load to notice a difference; all it takes is a background process waking up to query for updates. This can result in a 'spike', which could be observed as a tiny pause in your animation flow (due to the small delay the CPU causes in your animation cycle). This can be interpreted as a stutter.
With the above in mind, there are a few ways to solve your issue (though in my opinion it isn't worth the investment for what you are trying to do here). You need to find a way to have consistent animation steps (with a small margin for variation).
This is a great article to explore:
http://gafferongames.com/game-physics/fix-your-timestep/
Ultimately, most of the methods described there will result in a better rendering flow, but not all of them guarantee physics-rendering precision. Without having tried it myself, I would say that one would have to go as far as implementing interpolation in the rendering process to get drawing as smooth as possible.
The main point I wanted to explain is that stuttering is usually caused by the CPU, because it interferes directly with the way you handle physics. Overall, using time to drive your physics and interpolating inside your rendering cycles is definitely a topic worth exploring.
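To make that last point concrete, here is a minimal sketch of the fixed-timestep-plus-interpolation pattern from the article above, adapted to the question's GLFW loop (rotationSpeed and drawTriangle are hypothetical placeholders):
const double dt = 1.0 / 120.0; //fixed simulation step
double accumulator = 0.0;
double previousTime = glfwGetTime();
double angle = 0.0, previousAngle = 0.0; //the state being animated
while (!glfwWindowShouldClose(window))
{
    double now = glfwGetTime();
    accumulator += now - previousTime;
    previousTime = now;
    while (accumulator >= dt) //advance the simulation in fixed steps
    {
        previousAngle = angle;
        angle += rotationSpeed * dt;
        accumulator -= dt;
    }
    double alpha = accumulator / dt; //blend factor between the last two simulation states
    double renderAngle = previousAngle + (angle - previousAngle) * alpha;
    drawTriangle(renderAngle); //hypothetical draw call
    glfwSwapBuffers(window);
    glfwPollEvents();
}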