Different vertex/index formats in a single buffer - C++

In my scene, I have several static models that never change. Some models only have float3 vertices and 16-bit indices; some are more complex, with colors and normals in their vertices and 32-bit indices.
Q1. Can I combine them all into a single vertex buffer and a single index buffer, and draw like this:
// Model #1 has only position and 16-bit indices
const int model1verts = 20, model1indices = 100;
UINT stride1 = sizeof( Vector3 ), offset1 = 0;
context->IASetVertexBuffers( 0, 1, &vb, &stride1, &offset1 );
context->IASetIndexBuffer( ib, DXGI_FORMAT_R16_UINT, 0 );
context->DrawIndexed( model1indices, 0, 0 );
// Set another shader + input layout here
// Model #2 has positions + normals, and 32-bit indices
const int model2verts = 200, model2indices = 500;
UINT stride2 = sizeof( Vector3 ) * 2, offset2 = stride1 * model1verts;
context->IASetVertexBuffers( 0, 1, &vb, &stride2, &offset2 );
context->IASetIndexBuffer( ib, DXGI_FORMAT_R32_UINT, model1indices * sizeof( uint16_t ) );
context->DrawIndexed( model2indices, 0, 0 );
// 20 more models to follow, all from the same buffers
Q2. AFAIK, GPUs love aligned data. When calling IASetVertexBuffers/IASetIndexBuffer, should those offsets be a multiple of 4 or 16 bytes? The documentation doesn't say.
Q3. Should I do this at all? Will it save resources compared to 20-100 smaller buffers, each with its own format?

Related

Weird compute shader latency

I'm trying to do frustum culling via a compute shader. For that I have a pair of buffers for instanced vertex attributes, and a pair of buffers for indirect draw commands. My compute shader checks whether the instance coordinates from the first buffer are within the bounding volume (referencing the first draw buffer for counts), uses subgroupBallot and bitCount to find the offset within the subgroup, adds the results from the other subgroups and a global offset, and finally stores the result in the second buffer. The global offset is stored in the second indirect draw buffer.
The problem is that, under load, the frustum may be a few (>1) frames late relative to the moving camera, with wide bands of disappeared objects at the edges. It seems weird to me because culling and rendering are done within the same command buffer.
When taking a capture in RenderDoc, taking a screenshot with Alt+PrintScreen, or pausing the render/present thread, things snap back to how they should be.
My only guess is that the compute shader from a past frame continues to execute even when the new frame starts to be drawn, though this should not happen because of the pipeline barriers.
Shader code:
#version 460
#extension GL_KHR_shader_subgroup_ballot : require
struct drawData {
    uint indexCount;
    uint instanceCount;
    uint firstIndex;
    uint vertexOffset;
    uint firstInstance;
};
struct instanceData {
    float x, y, z;
    float a, b, c, d;
};
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
layout(set = 0, binding = 0) uniform A
{
    mat4 cam;
    vec4 camPos;
    vec4 l;
    vec4 t;
    vec4 r;
    vec4 b;
};
layout(set = 0, binding = 1) buffer B
{
    uint count;
    drawData data[];
} Draw[2];
layout(set = 0, binding = 2) buffer C
{
    instanceData data[];
} Instance[2];
shared uint offsetsM[32];
void main()
{
    const uint gID = gl_LocalInvocationID.x;
    const uint lID = gl_SubgroupInvocationID;
    const uint patchSize = gl_WorkGroupSize.x;
    Draw[1].data[0] = Draw[0].data[0]; // copy data like index count
    Draw[1].count = Draw[0].count;
    uint offsetG = 0; // accumulating offset within the end buffer
    uint loops = Draw[0].data[0].instanceCount / patchSize; // constant loop count
    for (uint i = 0; i < loops; ++i) {
        uint posa = i * patchSize + gID; // runs better this way for some reason
        vec3 pos = camPos.xyz - vec3(Instance[0].data[posa].x, Instance[0].data[posa].y, Instance[0].data[posa].z); // position relative to camera
        mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);
        vec4 dist = pos * lrtb + Model.data[0].rad; // dot products and radius tolerance
        bool Pass = posa < Draw[0].data[0].instanceCount && // is real
            (dot(pos, pos) < l.w * l.w) && // not too far
            all(greaterThan(dist, vec4(0))); // within view frustum
        subgroupBarrier(); // no idea what is best here; this is what works
        uvec4 actives = subgroupBallot(Pass); // count passed instances
        if (subgroupElect())
            offsetsM[gl_SubgroupID] = bitCount(actives).x + bitCount(actives).y;
        barrier();
        uint offsetL = bitCount(actives & gl_SubgroupLtMask).x + bitCount(actives & gl_SubgroupLtMask).y; // offset within subgroup
        uint ii = 0;
        if (Pass) {
            for (; ii < gl_SubgroupID; ++ii)
                offsetG += offsetsM[ii]; // offsets before this subgroup
            Instance[1].data[offsetG + offsetL] = Instance[0].data[posa];
            for (; ii < gl_NumSubgroups; ++ii)
                offsetG += offsetsM[ii]; // offsets after this subgroup
        }
        else
            for (; ii < gl_NumSubgroups; ++ii)
                offsetG += offsetsM[ii]; // same but no data copying
    }
    if (gID == 0)
        Draw[1].data[0].instanceCount = offsetG;
}
For the render pass after the compute pass I have these dependencies:
{//1
deps[1].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[1].dstSubpass = 0;
deps[1].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[1].dstStageMask = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
deps[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[1].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
deps[1].dependencyFlags = 0;
}
{//2
deps[2].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[2].dstSubpass = 0;
deps[2].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[2].dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
deps[2].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[2].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
deps[2].dependencyFlags = 0;
}
The command buffer (fully reused as-is, one per swapchain image) is:
vkBeginCommandBuffer(cmd, &begInfo);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
0, 1, &descs[1], 0, 0);
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
vkCmdDispatch(cmd, 1, 1, 1);
VkBufferMemoryBarrier bufMemBar[2];
{//mem bars
{//0 indirect
bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
bufMemBar[0].buffer = bufferIndirect;
bufMemBar[0].offset = 0;
bufMemBar[0].size = -1;
}
{//1 vertex instance
bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
bufMemBar[1].buffer = bufferInstance;
bufMemBar[1].offset = 0;
bufMemBar[1].size = -1;
}
}
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, 0, 0, 0, 1, &bufMemBar[0], 0, 0);
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT , 0, 0, 0, 1, &bufMemBar[1], 0, 0);
VkRenderPassBeginInfo passBegInfo;
passBegInfo.renderPass = pass;
passBegInfo.framebuffer = chain.frames[i];
passBegInfo.renderArea = {{0, 0}, chain.dim};
VkClearValue clears[2]{{0},{0}};
passBegInfo.clearValueCount = 2;
passBegInfo.pClearValues = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, 0);
vkCmdBindPipeline (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
VkBuffer buffersVertex[2]{bufferVertexProto, bufferInstance};
VkDeviceSize offsetsVertex[2]{0, 0};
vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
vkCmdBindIndexBuffer (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
bufferIndirect, 0,
count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);
vkEndCommandBuffer(cmd);
Rendering and presentation are synchronised with two semaphores: imageAvailable and renderFinished. The frustum calculation happens in the right order on the CPU. Validation layers are enabled.
The problem was that I lacked host synchronisation. Indeed, even within the same command buffer, there are no host synchronisation guarantees (and that makes sense, since it enables us to use events).
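As a minimal sketch of what that host synchronisation could look like, assuming a per-frame fence (frameFence, uboMapped, cameraData and submitInfo are placeholder names, not from the code above):
// Wait until the GPU has finished the previous submission that reads this data,
// then the host may safely overwrite the mapped memory and resubmit.
vkWaitForFences(device, 1, &frameFence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &frameFence);
memcpy(uboMapped, &cameraData, sizeof(cameraData)); // host write, now ordered against the GPU
vkQueueSubmit(queue, 1, &submitInfo, frameFence); // fence signals when this frame's work completes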

DirectX - Writing to 3D Texture Causing Display Driver Failure

I'm testing writing to 2D and 3D textures in compute shaders, outputting a gradient noise texture consisting of 32-bit floats. Writing to a 2D texture works fine, but writing to a 3D texture doesn't. Are there additional considerations to make when creating a 3D texture compared to a 2D texture?
Here is how I'm defining the 3D texture:
HRESULT BaseComputeShader::CreateTexture3D(UINT width, UINT height, UINT depth, DXGI_FORMAT format, ID3D11Texture3D** texture)
{
D3D11_TEXTURE3D_DESC textureDesc;
ZeroMemory(&textureDesc, sizeof(textureDesc));
textureDesc.Width = width;
textureDesc.Height = height;
textureDesc.Depth = depth;
textureDesc.MipLevels = 1;
textureDesc.Format = format;
textureDesc.Usage = D3D11_USAGE_DEFAULT;
textureDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
textureDesc.CPUAccessFlags = 0;
textureDesc.MiscFlags = 0;
return renderer->CreateTexture3D(&textureDesc, 0, texture);
}
HRESULT BaseComputeShader::CreateTexture3DUAV(UINT depth, DXGI_FORMAT format, ID3D11Texture3D** texture, ID3D11UnorderedAccessView** unorderedAccessView)
{
D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc;
ZeroMemory(&uavDesc, sizeof(uavDesc));
uavDesc.Format = format;
uavDesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE3D;
uavDesc.Texture3D.MipSlice = 0;
uavDesc.Texture3D.FirstWSlice = 0;
uavDesc.Texture3D.WSize = depth;
return renderer->CreateUnorderedAccessView(*texture, &uavDesc, unorderedAccessView);
}
HRESULT BaseComputeShader::CreateTexture3DSRV(DXGI_FORMAT format, ID3D11Texture3D** texture, ID3D11ShaderResourceView** shaderResourceView)
{
D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc;
ZeroMemory(&srvDesc, sizeof(srvDesc));
srvDesc.Format = format;
srvDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D;
srvDesc.Texture3D.MostDetailedMip = 0;
srvDesc.Texture3D.MipLevels = 1;
return renderer->CreateShaderResourceView(*texture, &srvDesc, shaderResourceView);
}
And how I'm writing to it in the compute shader:
// The texture we're writing to
RWTexture3D<float> outputTexture : register(u0);
[numthreads(8, 8, 8)]
void main(uint3 DTid : SV_DispatchThreadID)
{
float noiseValue = 0.0f;
float value = 0.0f;
float localAmplitude = amplitude;
float localFrequency = frequency;
// Loop for the number of octaves, running the noise function as many times as desired (8 is usually sufficient)
for (int k = 0; k < octaves; k++)
{
noiseValue = noise(float3(DTid.x * localFrequency, DTid.y * localFrequency, DTid.z * localFrequency)) * localAmplitude;
value += noiseValue;
// Calculate a new amplitude based on the input persistence/gain value
// amplitudeLoop will get smaller as the number of layers (i.e. k) increases
localAmplitude *= persistence;
// Calculate a new frequency based on a lacunarity value of 2.0
// This gives us 2^k as the frequency
// i.e. Frequency at k = 4 will be f * 2^4 as we have looped 4 times
localFrequency *= 2.0f;
}
// Output value to the 3D index in the texture given by the thread ID
outputTexture[DTid.xyz] = value;
}
And finally, how I'm running the shader:
// Set the shader
deviceContext->CSSetShader(computeShader, nullptr, 0);
// Set the shader's buffers and views
deviceContext->CSSetConstantBuffers(0, 1, &cBuffer);
deviceContext->CSSetUnorderedAccessViews(0, 1, &textureUAV, nullptr);
// Launch the shader
deviceContext->Dispatch(512, 512, 512);
// Reset the shader now we're done
deviceContext->CSSetShader(nullptr, nullptr, 0);
// Reset the shader views
ID3D11UnorderedAccessView* ppUAViewnullptr[1] = { nullptr };
deviceContext->CSSetUnorderedAccessViews(0, 1, ppUAViewnullptr, nullptr);
// Create the shader resource view for access in other shaders
HRESULT result = CreateTexture3DSRV(DXGI_FORMAT_R32_FLOAT, &texture, &textureSRV);
if (result != S_OK)
{
MessageBox(NULL, L"Failed to create texture SRV after compute shader execution", L"Failed", MB_OK);
exit(0);
}
My bad, simple mistake. Compute shader threads are limited in number. In a compute shader you're limited to a total of 1024 threads per thread group, and the dispatch call cannot dispatch more than 65535 thread groups. The HLSL compiler will catch the former issue, but the Visual C++ compiler will not catch the latter.
If you create a texture of 512 * 512 * 512 (which seems to be what you are trying to achieve), your dispatch needs to be divided into groups:
deviceContext->Dispatch(512 / 8, 512 / 8, 512 / 8);
In your previous case, the dispatch was:
512*8 * 512*8 * 512*8 = 68719476736 threads
which very likely triggered timeout detection and crashed the driver.
Also, the limit of 65535 is per dimension, so in your case you are completely safe to run this.
And one last thing: you can create both the shader resource view and the unordered access view right after creating your 3D texture (before the dispatch call).
This is generally recommended to avoid mixing context code with resource-creation code.
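For illustration, a rough sketch of that ordering using the helper methods above (assumed to run inside a BaseComputeShader method, error checks omitted):
// Create the texture and both views up front, before any context or dispatch work.
ID3D11Texture3D* texture = nullptr;
ID3D11UnorderedAccessView* textureUAV = nullptr;
ID3D11ShaderResourceView* textureSRV = nullptr;
CreateTexture3D(512, 512, 512, DXGI_FORMAT_R32_FLOAT, &texture);
CreateTexture3DUAV(512, DXGI_FORMAT_R32_FLOAT, &texture, &textureUAV);
CreateTexture3DSRV(DXGI_FORMAT_R32_FLOAT, &texture, &textureSRV);
// ...then CSSetShader / CSSetUnorderedAccessViews / Dispatch as before.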
On resource creation, your check is not valid either:
if (result != S_OK)
The HRESULT success condition is >= 0, so use the built-in macro instead, e.g. for the error branch:
if (FAILED(result))

Shader storage block name issue

Something weird is happening with my shader storage blocks.
I have 2 SSBs:
#version 450 core
out vec4 out_color;
layout (binding = 0, std430) buffer A_SSB
{
float a_data[];
};
layout (binding = 1, std430) buffer B_SSB
{
float b_data[];
};
void main()
{
a_data[0] = 0.0f;
a_data[1] = 1.0f;
a_data[2] = 2.0f;
a_data[3] = 3.0f;
b_data[0] = 90.0f;
b_data[1] = 81.0f;
b_data[2] = 72.0f;
b_data[3] = 63.0f;
out_color = vec4(0.0f, 0.8f, 1.0f, 1.0f);
}
This works well, but if I swap the SSB names like this:
layout (binding = 0, std430) buffer B_SSB
{
float a_data[];
};
layout (binding = 1, std430) buffer A_SSB
{
float b_data[];
};
the SSB indices are swapped even though they are hardcoded, and data that should be written to a_data is written to b_data and vice versa.
Both SSBs are 250 MB large; the maximum size is more than 2 GB. It seems that the indices are allocated alphabetically, but this shouldn't happen. I'm binding the buffers like this:
glCreateBuffers(1, &a_ssb);
glNamedBufferStorage(a_ssb, 7187400 * 9 * sizeof(float), nullptr, GL_MAP_READ_BIT);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_ssb);
glShaderStorageBlockBinding(test_prog, 0, 0);
glCreateBuffers(1, &b_ssb);
glNamedBufferStorage(b_ssb, 7187400 * 9 * sizeof(float), nullptr, GL_MAP_READ_BIT);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_ssb);
glShaderStorageBlockBinding(test_prog, 1, 1);
Is this a bug or my fault? Also, I would like to ask why I'm getting the error "lvalue in array access too complex or possible array index out of bounds" when I assign values in a for loop:
for(unsigned int i = 0; i < 4; ++i)
a_data[i] = float(i);
glShaderStorageBlockBinding(test_prog, 0, 0);
This is your problem.
You assigned the binding index in the shader. You do not need to assign it again.
Your problem comes from the fact that you assigned it incorrectly.
The second parameter to this function is the index of the block you are assigning a binding index to. The only way to get a correct index is to query it via Program Introspection APIs. The block index is the resource index, queried through this call:
auto block_index = glGetProgramResourceIndex(test_prog, GL_SHADER_STORAGE_BLOCK, "A_SSB");
It just so happened, in your original code, that the shader compiler assigned A_SSB's resource index to 0 and B_SSB's resource index to 1. This assignment was probably arbitrarily done based on their names. Thus, when you changed the names on them, the resource indices didn't change. So A_SSB was still resource index 0, but your shader assigned it binding index 1. Which was fine...
Until your C++ code overrode that assignment with your glShaderStorageBlockBinding(test_prog, 0, 0). That assigned resource index 0 (A_SSB) to binding index 0.
You should either set the binding index in the shader or in C++ code. Not in both.
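In other words, with layout(binding = 0/1) already in the shader, the C++ side only needs the glBindBufferBase calls; a sketch based on the code above, with the glShaderStorageBlockBinding lines simply removed:
// The shader's layout(binding = N) qualifiers already map each block to a binding point,
// so binding the buffers to those points is all that is required.
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_ssb);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_ssb);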

Multiple quads in one vbo

I am working on a Minecraft-ish game, and I've been working a little more with VBOs. However, when drawing multiple faces in a single VBO I seem to have a bit of an issue.
Here is my VBO generation code:
glGenBuffers(1, &VBO);
glBindBuffer(GL_ARRAY_BUFFER, verts);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, verts * 9 * sizeof(GLfloat), NULL, GL_STATIC_DRAW);
void* ptr = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_WRITE);
GLfloat*model = (GLfloat*)ptr;
GLfloat*tex = ((GLfloat*)ptr) + verts * 6;
GLfloat*color = ((GLfloat*)ptr) + verts * 3;
int p = 0;
int k = p * 3;
for (int mcy = 0; mcy < 5; mcy++) {
for (int mcx = 0; mcx < 5; mcx++) {
double addonX = mcx*32.0;
double addonY = mcy*32.0;
int addonx = mcx * 32;
int addony = mcy * 32;
if (!(hill.get(addonX, addonY)*400.0 > 100 && hill.get(32 + addonX, addonY)*400.0 > 100 && hill.get(addonX, 32 + addonY)*400.0 > 100 && hill.get(32 + addonX, 32 + addonY)*400.0 > 100)) {
draw = true;
int biome1 = BiomeToColor(GetBiome(x, y, addonX, addonY), hill.get(addonX, addonY)*400.0);
int biome2 = BiomeToColor(GetBiome(x, y, 32 + addonX, addonY), hill.get(32 + addonX, addonY)*400.0);
int biome3 = BiomeToColor(GetBiome(x, y, addonX, 32 + addonY), hill.get(addonX, 32 + addonY)*400.0);
int biome4 = BiomeToColor(GetBiome(x, y, 32 + addonX, 32 + addonY), hill.get(32 + addonY, 32 + addonY)*400.0);
model[k] = addonx+ 32;
model[k + 1] = addony;
model[k + 2] = hill.get(addonX + 32, addonY)*400.0;
color[k] = BiomeColors[biome2].r;
color[k + 1] = BiomeColors[biome2].g;
color[k + 2] = BiomeColors[biome2].b;
p++;
k = p * 3;
model[k] = addonx + 32;
model[k + 1] = addony + 32;
model[k + 2] = hill.get(addonX + 32, addonY + 32)*400.0;
color[k] = BiomeColors[biome4].r;
color[k + 1] = BiomeColors[biome4].g;
color[k + 2] = BiomeColors[biome4].b;
p++;
k = p * 3;
model[k] = addonx;
model[k + 1] = addony + 32;
model[k + 2] = hill.get(addonX, addonY + 32)*400.0;
color[k] = BiomeColors[biome3].r;
color[k + 1] = BiomeColors[biome3].g;
color[k + 2] = BiomeColors[biome3].b;
p++;
k = p * 3;
model[k] = addony;
model[k + 1] = addony;
model[k + 2] = hill.get(addonX, addonY)*400.0;
color[k] = BiomeColors[biome1].r;
color[k + 1] = BiomeColors[biome1].g;
color[k + 2] = BiomeColors[biome1].b;
p++;
k = p * 3;
}
}
}
glUnmapBuffer(GL_ARRAY_BUFFER);
glBindBuffer(GL_ARRAY_BUFFER, 0);
And here's the code I use to draw the vbo:
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glVertexPointer(3, GL_FLOAT, 0, 0);
glTexCoordPointer(3, GL_FLOAT, 0, (char*)NULL + verts * 6 * sizeof(GLfloat));
glColorPointer(3, GL_FLOAT, 0, (char*)NULL + verts * 3 * sizeof(GLfloat));
glDrawArrays(GL_QUADS, 0, VBO);
glBindBuffer(GL_ARRAY_BUFFER, 0);
Here's the result I want (using a single quad in every VBO): [screenshot in original post]
And here is the result I get with multiple quads in every VBO: [screenshot in original post]
So why do I want to draw multiple quads in a single VBO?
One word: performance. If you compare the two images, the thing that really pops out (well, apart from the bug in the second image) is the framerate counter. I want to make this game into a big thing, so every FPS matters to me.
EDIT:
Omg, I'm so stupid:
model[k] = addony;
A very simple mistake, but so devastating.
It just proves how such small things can break the game.
It all works now.
glDrawArrays(GL_QUADS, 0, VBO);
There are a few problems with this call:
The third parameter of glDrawArrays is the count of things you are drawing, so what you are actually saying is:
"Draw quads from my buffer, starting at 0, until VBO, and then stop."
What you should be saying is:
"Draw quads from my buffer, starting at 0, until the buffer length, and then stop."
So now it looks like this:
glDrawArrays(GL_QUADS, 0, verts);
'VBO' in your code is the ID of the buffer that you want to use.
Think of it like a pointer whose number you know, or rather a user with an ID.
GL_QUADS is not good; use GL_TRIANGLES. There are many problems with GL_QUADS later, especially on mobile phones and other platforms, and laying your data out as triangles is much, much nicer.
You shouldn't be drawing with GL_QUADS for multiple reasons; see the sketch below.
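As a small illustration of the triangle layout (a hypothetical sketch, not the asker's data; quadIndices describes one quad's corners reused as two triangles):
// One quad with corners 0,1,2,3 expressed as two triangles: 0-1-2 and 2-3-0.
GLuint quadIndices[6] = { 0, 1, 2, 2, 3, 0 };
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, quadIndices); // client-side index array, legacy-style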
Why are you not using VAOs? Are you using an older version of OpenGL that doesn't have VAOs? Otherwise I would suggest using a VAO here on top of the VBO, so you don't need to set the attribute pointers for each draw call.
glBindBuffer(GL_ARRAY_BUFFER, verts);
What you are trying to do here is bind a VBO with id 'verts' as the current VBO.
'So why do I want to draw multiple quads in a single vbo? One word: performance'
Have you tried drawing multiple quads using instancing?
That is, sending a model matrix for each of the shapes so that you modify their positions and shapes in the shader and not in the buffer. This way you can draw one VBO over and over again, just slightly transformed, with a single draw call (see the sketch after the link).
http://learnopengl.com/#!Advanced-OpenGL/Instancing
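A rough sketch of the instancing idea in modern GL (assumes a separate per-instance attribute buffer; instanceVBO, vertsPerQuad and instanceCount are placeholder names):
// Per-instance data (e.g. an offset or color) advances once per instance, not per vertex.
glBindBuffer(GL_ARRAY_BUFFER, instanceVBO);
glEnableVertexAttribArray(2);
glVertexAttribPointer(2, 3, GL_FLOAT, GL_FALSE, 0, (void*)0);
glVertexAttribDivisor(2, 1); // attribute 2 advances once per instance
// Draw the same quad geometry instanceCount times in a single call.
glDrawArraysInstanced(GL_TRIANGLES, 0, vertsPerQuad, instanceCount);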
Just out of curiosity, why did you decide to use:
glMapBuffer(GL_ARRAY_BUFFER, GL_READ_WRITE);
instead of buffering your data in the glBufferData call?
If you need to upload the data later, you can use glBufferSubData.
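For example (a sketch only; quadOffsetBytes, quadSizeBytes and quadData are placeholders): allocate the whole buffer once, then update just the ranges that change:
// Allocate storage up front without mapping...
glBufferData(GL_ARRAY_BUFFER, verts * 9 * sizeof(GLfloat), NULL, GL_STATIC_DRAW);
// ...then upload (or later re-upload) only the range that changed.
glBufferSubData(GL_ARRAY_BUFFER, quadOffsetBytes, quadSizeBytes, quadData);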
Honestly, though, I think your performance problems stem from a range of factors.
I would personally use glBufferData instead of mapping the data, and when I need to update it at run time rather than during loading I would use glBufferSubData.
I would upload the colors to the shader and draw multiples of the SAME VBO again and again with a different model matrix and colors, allowing me to instance it.
However, you shouldn't need to do that.
What I would recommend is laying the data out as triangles with colors and drawing the whole ground as a mesh, which you seem to have tried to do. Your problem was most likely caused by the glDrawArrays length being set to the VBO's ID.
In this case I would build a VBO using glBufferData with the size of a chunk, then use glBufferSubData for each of the quads with colors etc., and once done I would draw it multiple times alongside different chunks.
I think it would be useful for you to study more OpenGL theory.

Constant buffer members access the same memory

I'm using a constant buffer to pass data to my shaders at every frame, and I'm running into an issue where the values of some of the members of the buffer point to the same memory.
When I use the Visual Studio 2012 debugging tools, it looks like the data is being set in the buffer more or less correctly:
0 [0x00000000-0x00000003] | +0
1 [0x00000004-0x00000007] | +1
2 [0x00000008-0x0000000b] | +1
3 [0x0000000c-0x0000000f] | +1
4 [0x00000010-0x00000013] | +0.78539819
5 [0x00000014-0x00000017] | +1.1760513
6 [0x00000018-0x0000001b] | +0
7 [0x0000001c-0x0000001f] | +1
The problem is that when I debug the shader, sunAngle and phaseFunctionResult both have the same value, specifically 0.78539819, which should be the value of sunAngle only. It does change to 1.1760513 if I swap the order of the two floats, but both will still be the same. I thought I'd packed everything together correctly, but am I missing how to define exactly which constants live in each part of the buffer?
Here's the C++ structure I'm using:
struct SunData {
DirectX::XMFLOAT4 sunPosition;
float sunAngle;
float phaseFunctionResult;
};
And the shader buffer looks like this:
// updated as the sun moves through the sky
cbuffer sunDependent : register( b1 )
{
float4 sunPosition;
float sunAngle; // theta
float phaseFunctionResult; // F( theta, g )
}
Here's the code I'm using to initialize the buffer:
XMVECTOR pos = XMVectorSet( 0, 1, 1, 1 );
XMStoreFloat3( &_sunPosition, pos );
XMStoreFloat4( &_sun.sunPosition, pos );
_sun.sunAngle = XMVectorGetX(
XMVector3AngleBetweenVectors( pos, XMVectorSet( 0, 1, 0, 0 ) )
);
_sun.phaseFunctionResult = _planet.phaseFunction( _sun.sunAngle );
// Fill in a buffer description.
D3D11_BUFFER_DESC cbDesc;
cbDesc.ByteWidth = sizeof( SunData ) + 8;
cbDesc.Usage = D3D11_USAGE_DYNAMIC;
cbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
cbDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
cbDesc.MiscFlags = 0;
cbDesc.StructureByteStride = 0;
// Fill in the subresource data.
D3D11_SUBRESOURCE_DATA data;
data.pSysMem = &_sun;
data.SysMemPitch = 0;
data.SysMemSlicePitch = 0;
// Create the buffer.
ID3D11Buffer *constantBuffer = nullptr;
HRESULT hr = _d3dDevice->CreateBuffer(
&cbDesc,
&data,
&constantBuffer
);
assert( SUCCEEDED( hr ) );
// Set the buffer.
_d3dDeviceContext->VSSetConstantBuffers( 1, 1, &constantBuffer );
_d3dDeviceContext->PSSetConstantBuffers( 1, 1, &constantBuffer );
Release( constantBuffer );
And here's the pixel shader that's using the values:
float4 main( in ATMOS_PS_INPUT input ) : SV_TARGET
{
float R = sunAngle * sunPosition.x * sunIntensity.x
* attenuationCoefficient.x
* phaseFunctionResult;
return float4( R, 1, 1, 1 );
}
It looks like a padding issue, like in the linked question.
All constant buffers should be sized to be divisible by sizeof(four-component vector) (see the documentation).
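One way to satisfy that, assuming padding really is the issue here, is to pad the CPU-side struct to a multiple of 16 bytes and drop the manual "+ 8" (a sketch, not the asker's final code):
struct SunData {
    DirectX::XMFLOAT4 sunPosition; // 16 bytes
    float sunAngle; // 4 bytes
    float phaseFunctionResult; // 4 bytes
    float padding[2]; // 8 bytes of explicit padding, 32 bytes total
};
// ...
cbDesc.ByteWidth = sizeof( SunData ); // already a multiple of 16, no "+ 8" needed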