glDrawElementsInstanced freezes or slows down at 18680 instances - c++

I am developing a C++ program to simulate rain.
I am using OpenGL's instancing feature to render an increasing number of droplets (one instance = one droplet).
The program runs fine when calling glDrawElementsInstanced, until the number of instances reaches 18680. Then it freezes or produces weird behaviour (huge slowdown, incoherent rendering of instances).
My rendering loop:
GLObject GLDrop(*this->_model._drop, *this->_shader); // generate buffer
while (!glfwWindowShouldClose(this->_window))
{
    glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    this->_model._drop->create();
    GLDrop.setDropState();
    glDrawElementsInstanced(GL_TRIANGLE_STRIP, this->_model._drop->getElementsSize(), GL_UNSIGNED_INT, 0, this->_model._drop->getInstances());
    GLDrop.disableDropState();
    glfwSwapBuffers(this->_window);
    glfwPollEvents();
}
My Buffer generating function, called just before the rendering loop:
void GLObject::generateDropBuffers(void)
{
glGenBuffers(1, &this->_vbo);
glBindBuffer(GL_ARRAY_BUFFER, this->_vbo);
glBufferData(GL_ARRAY_BUFFER, this->_module.getVerticesSize() * sizeof(GLfloat), this->_module.getVertices(), GL_STATIC_DRAW);
glGenBuffers(1, &this->_ebo);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->_ebo);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, this->_module.getElementsSize() * sizeof(GLuint), this->_module.getElements(), GL_STATIC_DRAW);
glGenBuffers(1, &this->_pbo);
glBindBuffer(GL_ARRAY_BUFFER, this->_pbo);
glBufferData(GL_ARRAY_BUFFER, this->_module.getMaxInstances() * DIMENSIONS * sizeof(GLfloat), NULL, GL_DYNAMIC_DRAW);
glGenBuffers(1, &this->_cbo);
glBindBuffer(GL_ARRAY_BUFFER, this->_cbo);
glBufferData(GL_ARRAY_BUFFER, this->_module.getMaxInstances() * COLOR_CHANNELS * sizeof(GLfloat), NULL, GL_DYNAMIC_DRAW);
}
Each time Drop::create() is called, a new batch of droplets is created, incrementing the number of instances to be drawn to the screen.
void Drop::create(void)
{
    unsigned int i;
    unsigned int j;
    if (this->_instances < this->_maxInstances - this->_dropBatch)
    {
        for (GLuint drop = 0; drop < this->_dropBatch; ++drop)
        {
            i = this->_instances * DIMENSIONS;
            j = this->_instances * COLOR_CHANNELS;
            this->_positions[i] = rand() % this->_model._vertexCol * UNIT;
            this->_positions[i + 1] = this->_model._top + 3.0f * UNIT;
            this->_positions[i + 2] = rand() % this->_model._vertexRow * UNIT;
            this->_colors[j] = 0.0f;
            this->_colors[j + 1] = 0.0f;
            this->_colors[j + 2] = 1.0f;
            this->_colors[j + 3] = 1.0f;
            this->_instances += 1;
        }
    }
}
My buffer binding function:
void GLObject::setDropState(void)
{
GLuint instances = this->_module.getInstances() - this->_module.getBatchSize();
GLuint posOffset = instances * DIMENSIONS;
GLuint colorOffset = instances * COLOR_CHANNELS;
GLuint posSize = this->_module.getBatchSize() * DIMENSIONS * sizeof(GLfloat);
GLuint colorSize = this->_module.getBatchSize() * COLOR_CHANNELS * sizeof(GLfloat);
glBindBuffer(GL_ARRAY_BUFFER, this->_vbo);
glVertexAttribPointer(this->_shader.getAPosition(), DIMENSIONS, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(this->_shader.getAPosition());
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->_ebo);
glBindBuffer(GL_ARRAY_BUFFER, this->_pbo);
glBufferSubData(GL_ARRAY_BUFFER, posOffset * sizeof(GLfloat), posSize, this->_module.getPositions() + posOffset);
glVertexAttribPointer(this->_shader.getAInstance(), DIMENSIONS, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(this->_shader.getAInstance());
glBindBuffer(GL_ARRAY_BUFFER, this->_cbo);
glBufferSubData(GL_ARRAY_BUFFER, colorOffset * sizeof(GLfloat), colorSize, this->_module.getColors() + colorOffset);
glVertexAttribPointer(this->_shader.getAColors(), COLOR_CHANNELS, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(this->_shader.getAColors());
glVertexAttribDivisor(this->_shader.getAPosition(), 0);
glVertexAttribDivisor(this->_shader.getAInstance(), 1);
glVertexAttribDivisor(this->_shader.getAColors(), 1);
}
Tested on:
NVIDIA GeForce GT 330M 512 MB, MacBook Pro
NVIDIA GeForce 9400M 256 MB, MacBook Pro
I don't know what the problem could be. Is there a limitation on the number of instances I can draw in one call? (I highly doubt that is the case.)
I am using buffers way larger than I need, to make sure it is not a memory access issue.
Could it be a memory alignment issue?
The OpenGL manual says this about alignment, but I can't quite understand it:
Clients must align data elements consistent with the requirements of
the client platform, with an additional base-level requirement that an
offset within a buffer to a datum comprising N bytes be a multiple of
N.
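If I understand it correctly, it just means that a datum of N bytes must sit at a byte offset that is a multiple of N. A tiny made-up example for GLfloat data (the numbers here are purely illustrative):
// GLfloat is 4 bytes, so a GLfloat datum must start at an offset that is a multiple of 4.
GLintptr alignedOffset = 128; // multiple of 4 -> fine for GLfloat data
GLintptr oddOffset     = 7;   // not a multiple of 4 -> violates the rule for GLfloat data
// My offsets are always instances * DIMENSIONS * sizeof(GLfloat), i.e. multiples of 4.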
Any help would be welcome.
My scene, when 18860 instances of droplets are drawn, before the crash
EDIT:
While searching on Google I found a similar case here: https://www.opengl.org/discussion_boards/showthread.php/181142-glDrawArraysInstanced-max-number-of-instances
The author of that thread has the same problem as me, but with exactly double the number of instances (I guess because his GPU has 512 MB of memory rather than 256 MB like mine). Moreover, I found that I get different behaviours depending on how many instances I add each loop.
For example:
If I add 100 instances per loop, my program freezes.
If I add 200 or more instances per loop, my program slows down to 4 fps.
At the bottom of the link I posted above, the author explains in detail the reason for this behaviour. Apparently the difference comes down to whether or not a specific interval of instance counts is skipped.
If I add 100 instances per loop I fall into the death-gap (so it freezes), but if I add more than 100 I jump over the death-gap (so a huge slowdown).
Any idea about this strange MacBook "bug"?

I have a similar problem with glDrawElementsInstancedEXT (iPhone 5). I tried different meshes and found that the FPS drops by a factor of hundreds once the total number of indices (the number of instances multiplied by the number of indices in the mesh) exceeds approximately 4 million. I would add that drawing with glDrawElements slows down in a similar manner in this case.
And there is another problem.
If the number of instances exceeds 65535, what glDrawElementsInstancedEXT draws depends on the number of instances drawn in the very first call of glDrawElementsInstancedEXT:
if the number of instances in the very first call is less than 65536, then the following calls draw only the first 65535 instances;
if the number of instances in the very first call exceeds 65535, then the following calls draw the full number of instances.
It looks like a bug in a driver.
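Concretely, the behaviour looks something like this (a sketch of what I observe, not a fix; indexCount is just an illustrative name for the mesh's index count):
// Case A: the very first instanced call uses more than 65535 instances...
glDrawElementsInstancedEXT(GL_TRIANGLES, indexCount, GL_UNSIGNED_SHORT, 0, 70000);
// ...then later calls draw their full instance count:
glDrawElementsInstancedEXT(GL_TRIANGLES, indexCount, GL_UNSIGNED_SHORT, 0, 100000); // all 100000 drawn
// Case B: if the very first call had used fewer than 65536 instances,
// every later call would draw only the first 65535 instances.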

Related

Why is OpenGL immediate mode faster than core?

I am using the following library to render text in OpenGL: fontstash. I have another header file which adds support for OpenGL 3.0+. The question is: why is the core-profile render implementation so much slower than the immediate-mode one?
Here is the render code with immediate mode:
static void glfons__renderDraw(void* userPtr, const float* verts, const float* tcoords, const unsigned int* colors, int nverts)
{
GLFONScontext* gl = (GLFONScontext*)userPtr;
if (gl->tex == 0) return;
glBindTexture(GL_TEXTURE_2D, gl->tex);
glEnable(GL_TEXTURE_2D);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
glEnableClientState(GL_COLOR_ARRAY);
glVertexPointer(2, GL_FLOAT, sizeof(float)*2, verts);
glTexCoordPointer(2, GL_FLOAT, sizeof(float)*2, tcoords);
glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(unsigned int), colors);
glDrawArrays(GL_TRIANGLES, 0, nverts);
glDisable(GL_TEXTURE_2D);
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_TEXTURE_COORD_ARRAY);
glDisableClientState(GL_COLOR_ARRAY);
}
Here is the core profile render code:
static void gl3fons__renderDraw(void* userPtr, const float* verts, const float* tcoords, const unsigned int* colors, int nverts)
{
GLFONScontext* gl = (GLFONScontext*)userPtr;
if (gl->tex == 0) return;
if (gl->shader == 0) return;
if (gl->vao == 0) return;
if (gl->vbo == 0) return;
// init shader
glUseProgram(gl->shader);
// init texture
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, gl->tex);
glUniform1i(gl->texture_uniform, 0);
// init our projection matrix
glUniformMatrix4fv(gl->projMat_uniform, 1, false, gl->projMat);
// bind our vao
glBindVertexArray(gl->vao);
// setup our buffer
glBindBuffer(GL_ARRAY_BUFFER, gl->vbo);
glBufferData(GL_ARRAY_BUFFER, (2 * sizeof(float) * 2 * nverts) + (sizeof(int) * nverts), NULL, GL_DYNAMIC_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(float) * 2 * nverts, verts);
glBufferSubData(GL_ARRAY_BUFFER, sizeof(float) * 2 * nverts, sizeof(float) * 2 * nverts, tcoords);
glBufferSubData(GL_ARRAY_BUFFER, 2 * sizeof(float) * 2 * nverts, sizeof(int) * nverts, colors);
// setup our attributes
glEnableVertexAttribArray(0);
glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(1);
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(float) * 2, (void *) (sizeof(float) * 2 * nverts));
glEnableVertexAttribArray(2);
glVertexAttribPointer(2, 4, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(int), (void *) (2 * sizeof(float) * 2 * nverts));
glDrawArrays(GL_TRIANGLES, 0, nverts);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindVertexArray(0);
glUseProgram(0);
}
I made a small test for each implementation and the results show that immediate mode is significantly faster than core.
Both tests fill the screen with AAA... and I log the time it took to do this for each frame. This is the loop:
// Create GL stash for 512x512 texture, our coordinate system has zero at top-left.
struct FONScontext* fs = glfonsCreate(512, 512, FONS_ZERO_TOPLEFT);
// Add font to stash.
int fontNormal = fonsAddFont(fs, "sans", "fontstash/example/DroidSerif-Regular.ttf");
// Render some text
float dx = 10, dy = 10;
unsigned int white = glfonsRGBA(255,255,255,255);
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
fonsSetFont(fs, fontNormal);
fonsSetSize(fs, 20.0f);
fonsSetColor(fs, white);
for(int i = 0; i < 90; i++){
    for( int j = 0; j < 190; j++){
        dx += 10;
        fonsDrawText(fs, dx, dy, "A", NULL);
    }
    dy += 10;
    dx = 10;
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> time_span = t2 - t1;
std::cout<<"Time to render: "<<time_span.count()<<"ms"<<std::endl;
And the results show more than a 400 ms difference between the two:
Core profile (left) vs Immediate mode (right)
What should be changed in order to speed up performance?
I don't know exactly what gl is in this program, but it's pretty clear that, every time you want to render a piece of text, you perform the following operations:
Allocate storage for a buffer, reallocating whatever storage had been created from the last time this was called.
Perform three separate uploads of data to that buffer.
These are not good ways to stream vertex data to the GPU. There are specific techniques for doing this well, but this is not one of them. In particular, the fact that you are constantly reallocating the same buffer is going to kill performance.
The most effective way to deal with this is to have a single buffer with a fixed amount of storage. It gets allocated exactly once and never again. Ideally, whatever API you're getting vertex data from would provide it in an interleaved format, so that you would only need to perform one upload rather than three. But apparently Fontstash is not so generous.
In any case, the main idea is to avoid reallocation and synchronization. The latter means never trying to write over data that has been written to recently. So your buffer needs to be sufficiently large to hold twice the number of font vertices you ever expect to render. Essentially, you double-buffer the vertex data: writing to one set of data while the other set is being read from.
So at the beginning of the frame, you figure out what the byte offset to where you want to render will be. This will either be the start of the buffer or half-way through it. Then, for each blob of text, you write vertex data to this offset and increment the offset accordingly.
And to avoid having to change VAO state, you should interleave the vertex data manually. Instead of uploading three arrays, you should interleave the vertices so that you're effectively making one gigantic array of vertices. So you never need to call glVertexAttribPointer in the middle of this function; you just use the parameters to glDraw* to draw the part of the array you want.
This also means you only need one glBufferSubData call. But if you have access to persistent mapped buffers, you don't even need that, since you can just write to the memory directly while using the other portion of it. Though if you use persistent mapping, you will need to use a fence sync object when you switch buffer regions to make sure that you're not writing to vertex data that is still being read by the GPU.
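A rough sketch of that idea, assuming an interleaved vertex struct and a double-buffered region inside one fixed-size VBO (the names vao, vbo, frameIndex, interleavedVerts and the capacity are all illustrative, not Fontstash's API):
// Hypothetical interleaved layout: one upload per text blob instead of three.
struct TextVertex {
    float  pos[2];
    float  uv[2];
    GLuint rgba;   // packed color, normalized in the shader
};

// --- done once at startup ---
const int kMaxVertsPerFrame = 200000;                       // illustrative capacity
glBindVertexArray(vao);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, 2 * kMaxVertsPerFrame * sizeof(TextVertex),
             NULL, GL_DYNAMIC_DRAW);                        // allocate once, never reallocate
glEnableVertexAttribArray(0);
glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(TextVertex),
                      (void*)offsetof(TextVertex, pos));    // offsetof needs <cstddef>
glEnableVertexAttribArray(1);
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(TextVertex),
                      (void*)offsetof(TextVertex, uv));
glEnableVertexAttribArray(2);
glVertexAttribPointer(2, 4, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(TextVertex),
                      (void*)offsetof(TextVertex, rgba));

// --- each frame ---
int baseVertex = (frameIndex & 1) ? kMaxVertsPerFrame : 0;  // write one half while the GPU may read the other
int usedVerts  = 0;

// --- for each blob of text ---
glBufferSubData(GL_ARRAY_BUFFER,
                (baseVertex + usedVerts) * sizeof(TextVertex),
                nverts * sizeof(TextVertex), interleavedVerts);
glDrawArrays(GL_TRIANGLES, baseVertex + usedVerts, nverts);  // no attrib pointer changes needed
usedVerts += nverts;
With persistent mapping (GL_MAP_PERSISTENT_BIT), the glBufferSubData call can be replaced by writing into the mapped pointer directly, guarded by a glFenceSync/glClientWaitSync pair when switching halves, as described above.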

How come no cube is drawn on my screen with this code in a GLFW window?

I have a bunch of code (copied from various tutorials) that is supposed to draw a random color-changing cube that the camera shifts around every second or so (controlled by a variable, not using timers yet). It worked in the past, before I moved my code out of my main function and into distinct classes, but now I can't see anything in the main window other than a blank background. I cannot pinpoint any particular issue here, as I am getting no errors or exceptions, and my own personally defined code checks out; when I debugged, every variable had a value I expected, and the shaders I use (in string form) worked in the past before I reorganized my code. I can print out the vertices of the cube in the same scope as the glDrawArrays() function as well, and they have the correct values too. Basically, I have no idea what is wrong with my code that is causing nothing to be drawn.
My best guess is that I called - or forgot to call - some OpenGL function improperly with the wrong data in one of the three methods of my Model class. In my program, I create a Model object (after GLFW and glad are initialized, which then calls the Model constructor), update it every once in a while (timing doesn't matter) through the update() function, then draw it to my screen every time my main loop runs, through the draw() function.
Possible locations of code faults:
Model::Model(std::vector<GLfloat> vertexBufferData, std::vector<GLfloat> colorBufferData) {
mVertexBufferData = vertexBufferData;
mColorBufferData = colorBufferData;
// Generate 1 buffer, put the resulting identifier in vertexbuffer
glGenBuffers(1, &VBO);
// The following commands will talk about our 'vertexbuffer' buffer
glBindBuffer(GL_ARRAY_BUFFER, VBO);
// Give our vertices to OpenGL.
glBufferData(GL_ARRAY_BUFFER, sizeof(mVertexBufferData), &mVertexBufferData.front(), GL_STATIC_DRAW);
glGenBuffers(1, &CBO);
glBindBuffer(GL_ARRAY_BUFFER, CBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(mColorBufferData), &mColorBufferData.front(), GL_STATIC_DRAW);
// Create and compile our GLSL program from the shaders
programID = loadShaders(zachos::DATA_DEF);
glUseProgram(programID);
}
void Model::update() {
for (int v = 0; v < 12 * 3; v++) {
mColorBufferData[3 * v + 0] = (float)std::rand() / RAND_MAX;
mColorBufferData[3 * v + 1] = (float)std::rand() / RAND_MAX;
mColorBufferData[3 * v + 2] = (float)std::rand() / RAND_MAX;
}
glBufferData(GL_ARRAY_BUFFER, sizeof(mColorBufferData), &mColorBufferData.front(), GL_STATIC_DRAW);
}
void Model::draw() {
// Setup some 3D stuff
glm::mat4 mvp = Mainframe::projection * Mainframe::view * model;
GLuint MatrixID = glGetUniformLocation(programID, "MVP");
glUniformMatrix4fv(MatrixID, 1, GL_FALSE, &mvp[0][0]);
glEnableVertexAttribArray(0);
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glVertexAttribPointer(
0, // attribute 0. No particular reason for 0, but must match the layout in the shader.
3, // size
GL_FLOAT, // type
GL_FALSE, // normalized?
0, // stride
(void*)0 // array buffer offset
);
glEnableVertexAttribArray(1);
glBindBuffer(GL_ARRAY_BUFFER, CBO);
glVertexAttribPointer(
1, // attribute. No particular reason for 1, but must match the layout in the shader.
3, // size
GL_FLOAT, // type
GL_FALSE, // normalized?
0, // stride
(void*)0 // array buffer offset
);
// Draw the array
glDrawArrays(GL_TRIANGLES, 0, mVertexBufferData.size() / 3);
glDisableVertexAttribArray(0);
glDisableVertexAttribArray(1);
};
My question is simple, how come my program won't draw a cube on my screen? Is the issue within these three functions or elsewhere? I can provide more general information about the drawing process if needed, though I believe the code I provided is enough, since I literally just call model.draw().
sizeof(std::vector) will usually just be 24 bytes (since the struct typically contains 3 pointers). So basically both of your buffers have 6 floats loaded into them, which is not enough vertices for a single triangle, let alone a cube!
You should instead be calling size() on the vector when loading the data into the vertex buffers.
glBufferData(GL_ARRAY_BUFFER,
mVertexBufferData.size() * sizeof(float), ///< this!
mVertexBufferData.data(), ///< prefer calling data() here!
GL_STATIC_DRAW);
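Applied to the constructor above, both uploads would then look something like this (a sketch; the same change is needed anywhere sizeof(mVertexBufferData) or sizeof(mColorBufferData) is currently used):
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, mVertexBufferData.size() * sizeof(GLfloat),
             mVertexBufferData.data(), GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, CBO);
glBufferData(GL_ARRAY_BUFFER, mColorBufferData.size() * sizeof(GLfloat),
             mColorBufferData.data(), GL_STATIC_DRAW);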

glBufferSubData setting data from 0 rather than supplied offset

I don't have a whole lot to preface this with, other than that this is being used in a particle system that uses transform feedback. There are two VBOs that I'm ping-ponging between across render loops. Here's the initial buffer setup:
glGenBuffers(1, &p_vbo_r); glBindBuffer(GL_ARRAY_BUFFER, p_vbo_r);
glBufferData(GL_ARRAY_BUFFER, (MAX_PARTICLES*2) * sizeof(particle), &pp[0], GL_STREAM_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glGenBuffers(1, &p_vbo_w); glBindBuffer(GL_ARRAY_BUFFER, p_vbo_w);
glBufferData(GL_ARRAY_BUFFER, (MAX_PARTICLES*2) * sizeof(particle), &pp[0], GL_STREAM_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
MAX_PARTICLES is set to 262144. particle is a struct that contains 2 glm::vec3 and 2 GLfloat, totaling 32 bytes, so my current setup creates buffers that are slightly over 16 MB each. pp is an array containing dummy particle data, since straight 0s don't stick out as much when using gDebugger.
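For reference, the layout I mean is roughly this (a sketch; the field names are illustrative):
// 2 * glm::vec3 (24 bytes) + 2 * GLfloat (8 bytes) = 32 bytes, no padding expected
struct particle
{
    glm::vec3 position;
    glm::vec3 velocity;
    GLfloat   life;
    GLfloat   size;
};
// (MAX_PARTICLES * 2) * sizeof(particle) then comes to roughly 16 MB per VBO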
glBindBuffer(GL_ARRAY_BUFFER, p_vbo_r);
glBufferSubData(GL_ARRAY_BUFFER, total_part*sizeof(particle), pp.size() * sizeof(particle), &pp[0]);
glBindBuffer(GL_ARRAY_BUFFER, p_vbo_w);
glBufferSubData(GL_ARRAY_BUFFER, total_part*sizeof(particle), pp.size() * sizeof(particle), &pp[0]);
p.buf_start = total_part;
total_part += pp.size();
This happens every time a unique emitter is added to the particle system. All possible emitters in a given scene are added before any rendering is done(currently) and this actually works as intended! It stores the offset used when the emitter was added into p.buf_start when it's done and adds to the running offset.
Now, curiously enough, using the same setup later during rendering/execution causes the glBufferSubData command to start from 0 rather than the supplied offset, like so:
glBindBuffer(GL_ARRAY_BUFFER, p_vbo_r);
glBufferSubData(GL_ARRAY_BUFFER, p_sys[i].buf_start * sizeof(particle), pp.size() * sizeof(particle), &pp[0]);
glBindBuffer(GL_ARRAY_BUFFER, p_vbo_w);
glBufferSubData(GL_ARRAY_BUFFER, p_sys[i].buf_start * sizeof(particle), pp.size() * sizeof(particle), &pp[0]);
This is done in a loop that checks whether the buffer needs resetting and, if so, resets it. Any actor may request a reset at any time after rendering, and the buffer runs the above code after the frame is finished but before the next frame starts rendering. The first emitter listed works as intended, but any emitter afterwards always overwrites the VBOs from 0 and not from the given offset. Am I unable to overwrite ping-ponging buffers involved in transform feedback? It's been a little tricky trying to figure this one out. Thanks!
EDIT: I've also tried GL_DYNAMIC_DRAW/GL_STATIC_DRAW for the initial buffer setup to no avail.

C++/OpenGL Random Segmentation Fault

So I'm writing a chunk-based, procedurally generated terrain game and am running into two errors.
Basically, the way it works is that it generates chunks around the player's position, once per game loop:
for (int i = RENDER_RADIUS; i >= 0; i--)
{
    for (int j = RENDER_RADIUS; j >= 0; j--)
    {
        terr.renderChunk(glm::ivec2(c->getXOff() + i, c->getZOff() + j), cubeShader);
        terr.renderChunk(glm::ivec2(c->getXOff() - i, c->getZOff() - j), cubeShader);
        terr.renderChunk(glm::ivec2(c->getXOff() - i, c->getZOff() + j), cubeShader);
        terr.renderChunk(glm::ivec2(c->getXOff() + i, c->getZOff() - j), cubeShader);
    }
}
In terr.renderChunk I am using an unordered_map that uses the chunk's position as the key and the chunk as the value. If the unordered_map doesn't find the chunk, then the position gets added to terr.updateList.
Then, back in the game loop:
if (!terr.updateList.empty())
{
    terr.updateChunk(terr.updateList[terr.updateList.size()-1]);
    terr.world[terr.updateList[terr.updateList.size()-1]]->render(cubeShader);
    terr.updateList.pop_back();
}
In a separate line, I'm ensuring that the player's current chunk is loaded as well.
To generate a chunk's VBO I add the indices to the chunk's vector of points and then build it like so:
glGenVertexArrays(1, &this->VAO);
glBindVertexArray(this->VAO);
// vertice VBO
glGenBuffers(1, &this->VBO_VERT);
glBindBuffer(GL_ARRAY_BUFFER, this->VBO_VERT);
glBufferData(GL_ARRAY_BUFFER, this->points.size() * sizeof(glm::vec3), &this->points[0][0], GL_STATIC_DRAW);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 0, (void*)0);
glEnableVertexAttribArray(0);
// texture coords
glGenBuffers(1, &this->VBO_UV);
glBindBuffer(GL_ARRAY_BUFFER, this->VBO_UV);
glBufferData(GL_ARRAY_BUFFER, this->uvs.size() * sizeof(glm::vec2), &this->uvs[0][0], GL_STATIC_DRAW);
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 0, (void*)(0));
glEnableVertexAttribArray(1);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindVertexArray(0);
Now, it all generally works, but I randomly get a segmentation fault and have debugged it down to my render function:
void Chunk::render(Shader shader)
{
shader.setMat4("transform", offsetMatrix);
shader.setFloat("transparency", 1.0f);
glBindVertexArray(VAO);
cout << "size " << points.size() << endl;
glDrawArrays(GL_TRIANGLES, 0, points.size()); //RIGHT HERE CAUSES THE SEGFAULTS
cout << "TEST2" << endl;
}
The segmentation fault seems to happen randomly; however, I believe it doesn't happen on new chunks but rather when going back over old ones.
My question is: is there anything specific to OpenGL/C++ that I'm unaware of that could be causing this?
The other error I'm getting, which may be related but which I've debugged less, is chunk rendering errors like the one shown, where random terrain is rendered, yet when I walk into it, collision still works as if the terrain were where it should be.
I realize this is a long question, but any support is really appreciated!
Switching your render call to renderChunk looks like it would be a safer alternative. I'm not sure whether it would fix the segfault, but from what I can see it's a safer and not much slower bet.

Use of glDrawElements I Don't Understand

I am studying the source code of an open source project and they have a use of the function glDrawElements which I don't understand. While I am a programmer, I am quite new to the GL API, so I would appreciate it if someone could tell me how this works.
Let's start with the drawing part. The code looks like this:
for (int i = 0; i < numObjs; i++) {
    glDrawElements(GL_TRIANGLES, vboIndexSize(i), GL_UNSIGNED_INT, (void*)(UPTR)vboIndexOffset(i));
}
vboIndexSize(i) returns the number of indices for the current object, and vboIndexOffset returns the offset in bytes, in a flat memory array in which the vertex data AND the indices of the objects are stored.
The part I don't understand is the (void*)(UPTR)vboIndexOffset(i). I have looked at the code many times: the function vboIndexOffset returns an int32, and UPTR also casts the returned value to an int32. So how can you cast an int32 to a void* and expect this to work? But let's assume I made a mistake there and that it actually returns a pointer to this variable instead. The 4th argument of the glDrawElements call is an offset in bytes within a memory block. Here is how the data is actually stored on the GPU:
int ofs = m_vertices.getSize();
for (int i = 0; i < numObj; i++)
{
    obj[i].ofsInVBO = ofs;
    obj[i].sizeInVBO = obj[i].indices->getSize() * 3;
    ofs += obj[i].indices->getNumBytes();
}
vbo.resizeDiscard(ofs);
memcpy(vbo.getMutablePtr(), vertices.getPtr(), vertices.getSize());
for (int i = 0; i < numObj; i++)
{
    memcpy(
        m_vbo.getMutablePtr(obj[i].ofsInVBO),
        obj[i].indices->getPtr(),
        obj[i].indices->getNumBytes());
}
So all they do is calculate the number of bytes needed to store the vertex data, then add to this the number of bytes needed to store the indices of all the objects we want to draw. Then they allocate memory of that size and copy the data into it: first the vertex data and then the indices. Once this is done, they push it to the GPU using:
glGenBuffers(1, &glBuffer);
glBindBuffer(GL_ARRAY_BUFFER, glBuffer);
checkSize(size, sizeof(GLsizeiptr) * 8 - 1, "glBufferData");
glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)size, data, GL_STATIC_DRAW);
What's interesting is that they store everything in the GL_ARRAY_BUFFER. They never store the vertex data in a GL_ARRAY_BUFFER and the indices in a separate GL_ELEMENT_ARRAY_BUFFER.
But to go back to the code where the drawing is done, they first do the usual stuff to declare the vertex attributes. For each attribute:
glBindBuffer(GL_ARRAY_BUFFER, glBuffer);
glEnableVertexAttribArray(loc);
glVertexAttribPointer(loc, size, type, GL_FALSE, stride, pointer);
This makes sense and is just standard. And then the code I already mentioned:
for (int i = 0; i < numObjs; i++) {
    glDrawElements(GL_TRIANGLES, vboIndexSize(i), GL_UNSIGNED_INT, (void*)(UPTR)vboIndexOffset(i));
}
So the question: even if (UPTR) actually returns a pointer to the variable (the code doesn't indicate this, but I may be mistaken, it's a large project), I didn't know it was possible to store all the vertex and index data in the same memory block using GL_ARRAY_BUFFER, and then to use glDrawElements with the 4th argument being the offset, within this memory block, to the first element of the index list for the current object. I thought you needed to use GL_ARRAY_BUFFER and GL_ELEMENT_ARRAY_BUFFER to declare the vertex data and the indices separately. I didn't think you could declare all the data in one go using GL_ARRAY_BUFFER, and I can't get it to work on my side anyway.
Has anyone seen this working before? I haven't had a chance to get it working yet, and I wonder if someone could tell me whether there is something specific I need to be aware of to get it to work. I tested with a simple triangle with position, normal and texture coordinate data, so I have 8 * 3 floats for the vertex data and an array of 3 integers for the indices (0, 1, 2). I then copy everything into a memory block, initialize glBufferData with it, and try to draw the triangle with:
int n = 96; // offset in bytes into the memory block, first int in the index list
glDrawElements(GL_TRIANGLES, 3, GL_UNSIGNED_INT, (void*)(&n));
It doesn't crash but I can't see the triangle.
EDIT:
Adding the code that doesn't seem to work for me (it crashes).
float vertices[] = {
0, 1, 0, // Vertex 1 (X, Y, Z)
2, -1, 0, // Vertex 2 (X, Y, Z)
-1, -1, 0, // Vertex 3 (X, Y, Z)
3, 1, 0,
};
U8 *ptr = (U8*)malloc(4 * 3 * sizeof(float) + 6 * sizeof(unsigned int));
memcpy(ptr, vertices, 4 * 3 * sizeof(float));
unsigned int indices[6] = { 0, 1, 2, 0, 3, 1 };
memcpy(ptr + 4 * 3 * sizeof(float), indices, 6 * sizeof(unsigned int));
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, 4 * 3 * sizeof(float) + 6 * sizeof(unsigned int), ptr, GL_STATIC_DRAW);
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 0, NULL);
glEnableVertexAttribArray(0);
free(ptr);
Then when it comes to draw:
glBindVertexArray(vao);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
// see stackoverflow.com/questions/8283714/what-is-the-result-of-null-int/
typedef void (*TFPTR_DrawElements)(GLenum, GLsizei, GLenum, uintptr_t);
TFPTR_DrawElements myGlDrawElements = (TFPTR_DrawElements)glDrawElements;
myGlDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, uintptr_t(4 * 3 * sizeof(float)));
This crashes the app.
see answer below for solution
This is due to OpenGL re-using fixed-function pipeline calls. When you bind a GL_ARRAY_BUFFER VBO, a subsequent call to glVertexAttribPointer expects an offset into the VBO (in bytes), which is then cast to a (void *). The GL_ARRAY_BUFFER binding remains in effect until another buffer is bound, just as the GL_ELEMENT_ARRAY_BUFFER binding remains in effect until another 'index' buffer is bound.
You can encapsulate the buffer binding and attribute pointer (offset) states using a Vertex Array Object.
The address in your example isn't valid. Cast offsets with: (void *) n
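In other words, pass the byte offset itself, cast to a pointer, rather than the address of a local variable holding it; for the 96-byte offset in your example, something like:
// the offset of the first index within the buffer, passed as a fake pointer
glDrawElements(GL_TRIANGLES, 3, GL_UNSIGNED_INT, (void*)(uintptr_t)96);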
Thanks for the answers. After doing some research on the web, though, I think that first you should be using glGenVertexArrays. It seems that this is THE standard now for OpenGL 4.x, so rather than calling glVertexAttribPointer before drawing the geometry, it seems best practice is to create a VAO when the data is pushed to the GPU buffers.
I was actually able to combine the vertex data and the indices within the SAME buffer (a GL_ARRAY_BUFFER) and then draw the primitive using glDrawElements (see below). The standard way, however, is to push the vertex data to a GL_ARRAY_BUFFER and the indices to a GL_ELEMENT_ARRAY_BUFFER separately. So if that's the standard way of doing it, it's probably better not to try to be too clever and just use these functions.
Example:
glGenBuffers(1, &vbo);
// push the data using GL_ARRAY_BUFFER
glGenBuffers(1, &vio);
// push the indices using GL_ELEMENT_ARRAY_BUFFER
...
glGenVertexArrays(1, &vao);
// do calls to glVertexAttribPointer
...
Please correct me if I am wrong, but that seems the correct (and only) way to go.
EDIT:
However, it is actually possible to "pack" the vertex data and the indices together into a GL_ARRAY_BUFFER, as long as a call to glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vbo) is done prior to calling glDrawElements.
Working code (compared with code in original post):
float vertices[] = {
0, 1, 0, // Vertex 1 (X, Y, Z)
2, -1, 0, // Vertex 2 (X, Y, Z)
-1, -1, 0, // Vertex 3 (X, Y, Z)
3, 1, 0,
};
U8 *ptr = (U8*)malloc(4 * 3 * sizeof(float) + 6 * sizeof(unsigned int));
memcpy(ptr, vertices, 4 * 3 * sizeof(float));
unsigned int indices[6] = { 0, 1, 2, 0, 3, 1 };
memcpy(ptr + 4 * 3 * sizeof(float), indices, 6 * sizeof(unsigned int));
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, 4 * 3 * sizeof(float) + 6 * sizeof(unsigned int), ptr, GL_STATIC_DRAW);
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 0, NULL);
glEnableVertexAttribArray(0);
free(ptr);
Then when it comes to draw:
glBindVertexArray(vao);
glBindBuffer(GL_ARRAY_BUFFER, vbo); // << THIS IS ACTUALLY NOT NECESSARY
// VVVV THIS WILL MAKE IT WORK VVVV
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vbo);
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// see stackoverflow.com/questions/8283714/what-is-the-result-of-null-int/
typedef void (*TFPTR_DrawElements)(GLenum, GLsizei, GLenum, uintptr_t);
TFPTR_DrawElements myGlDrawElements = (TFPTR_DrawElements)glDrawElements;
myGlDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, uintptr_t(4 * 3 * sizeof(float)));