Related
I want to transfer a lot of images loaded by stb_image to fragment shader by texture buffer object in OpenGL.
I first merge all the image data (every image file is in RGB format), and use one unsigned char pointer named data to point to them, this means all image data are put together in the format of RGBRGB...
Then, I used the following code to send all image data to fragment shader:
unsigned tbo, tex;
glGenBuffers(1, &tbo);
glBindBuffer(GL_TEXTURE_BUFFER, tbo);
glGenTextures(1, &tex);
glBindTexture(GL_TEXTURE_BUFFER, tex);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R8UI, tbo);
glBindTexture(GL_TEXTURE_BUFFER, 0);
[...]
glBindBuffer(GL_TEXTURE_BUFFER, tbo);
glBufferData(GL_TEXTURE_BUFFER, len * sizeof(unsigned char),
NULL, GL_DYNAMIC_DRAW); // len is the length of data
glBufferSubData(GL_TEXTURE_BUFFER, 0, len * sizeof(unsigned char), data);
and in rendering loop:
shader.use();
glBindBuffer(GL_TEXTURE_BUFFER, tbo);
glBindTexture(GL_TEXTURE_BUFFER, tex);
[...]
in my fragment shader:
layout (binding = 0) uniform samplerBuffer image_texture_buffer;
out vec4 FragColor;
[...]
void main(){
FragColor = texelFetch(image_texture_buffer, 0);
}
[...]
But FragColor is always black.
Edited on 2021/4/3:
I used this function to merge all the image data:
void merge_data(unsigned char *res, int lena, unsigned char *data, int lenb) {
//cout << lena << ' ' << lenb << endl;
res = (unsigned char *)realloc(res, lena + lenb);
//strcat_s((char *)res, sizeof data, (char *)data);
for (int i = lena, j = 0; j < lenb; ++i, ++j) res[i] = data[j];
}
In addition, someone told me that the video memory data must be aligned with even numbers, which means that I must transmit the data to the shader in the format of RGBARGBA..., Is this the effect?
Just to get an idea of what kind of speeds I should be expecting I have been trying to benchmark transfer between global memory and shaders, rather than relying on GPU spec sheets. However I can't get close to the theoretical maximum. In fact I'm out by a factor of 50!.
I'm using a GTX Titan X, which is said to have 336.5GB/s. Linux x64 driver 352.21.
I found a CUDA benchmark here which gives me ~240–250GB/s (this is more what I expect).
I'm trying to match exactly what they do with shaders. I've tried vertex shaders, compute shaders, accessing buffer objects via image_load_store and NV_shader_buffer_store, with floats, vec4s, loops inside the shader (with coalesced addressing within the work group) and various methods of timing. I'm stuck at ~7GB/s (see the update below).
Why is GL so much slower? Am I doing something wrong and if so, how should it be done?
Here's my MWE with three methods (1. vertex shader with image_load_store, 2. vertex shader with bindless graphics, 3. compute shader with bindless graphics):
//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>
const char* imageSource =
"#version 440\n"
"uniform layout(r32f) imageBuffer data;\n"
"uniform float val;\n"
"void main() {\n"
" imageStore(data, gl_VertexID, vec4(val, 0.0, 0.0, 0.0));\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_VertexID] = val;\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessComputeSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"layout(local_size_x = 256) in;\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_GlobalInvocationID.x] = val;\n"
"}\n";
GLuint compile(GLenum type, const char* shaderSrc)
{
GLuint shader = glCreateShader(type);
glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
glCompileShader(shader);
int success = 0;
int loglen = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
GLchar* log = new GLchar[loglen];
glGetShaderInfoLog(shader, loglen, &loglen, log);
if (!success)
{
printf("%s\n", log);
exit(0);
}
GLuint program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
return program;
}
GLuint timerQueries[2];
void start()
{
glGenQueries(2, timerQueries);
glQueryCounter(timerQueries[0], GL_TIMESTAMP);
}
float stop()
{
glMemoryBarrier(GL_ALL_BARRIER_BITS);
GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
glQueryCounter(timerQueries[1], GL_TIMESTAMP);
GLint available = 0;
while (!available) //sometimes gets stuck here for whatever reason
glGetQueryObjectiv(timerQueries[1], GL_QUERY_RESULT_AVAILABLE, &available);
GLuint64 a, b;
glGetQueryObjectui64v(timerQueries[0], GL_QUERY_RESULT, &a);
glGetQueryObjectui64v(timerQueries[1], GL_QUERY_RESULT, &b);
glDeleteQueries(2, timerQueries);
return b - a;
}
int main(int argc, char** argv)
{
float* check;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
glutCreateWindow("test");
glewInit();
int bufferSize = 64 * 1024 * 1024; //64MB
int loops = 500;
glEnable(GL_RASTERIZER_DISCARD);
float* dat = new float[bufferSize/sizeof(float)];
memset(dat, 0, bufferSize);
//create a buffer with data
GLuint buffer;
glGenBuffers(1, &buffer);
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);
//get a bindless address
GLuint64 address;
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
//make a texture alias for it
GLuint bufferTexture;
glGenTextures(1, &bufferTexture);
glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);
glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);
//compile the shaders
GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
GLuint bindlessComputeShader = compile(GL_COMPUTE_SHADER, bindlessComputeSource);
//warm-up and check values
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
//glMemoryBarrier(GL_ALL_BARRIER_BITS); //this causes glDispatchCompute to segfault later, so don't uncomment
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f); //glDispatchCompute doesn't actually write anything with bindless graphics
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glFinish();
//time image_load_store
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 imageTime = stop();
printf("image_load_store: %.2fGB/s\n", (float)((bufferSize * (double)loops) / imageTime));
//time bindless
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 bindlessTime = stop();
printf("bindless: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessTime));
//time bindless in a compute shader
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
GLuint64 bindlessComputeTime = stop();
printf("bindless compute: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessComputeTime));
assert(glGetError() == GL_NO_ERROR);
return 0;
}
My output:
image_load_store: 6.66GB/s
bindless: 6.68GB/s
bindless compute: 6.65GB/s
Some notes:
Compute shaders with bindless graphics don't appear to write anything (the commented out assert fails), or at least the data isn't retrieved with glMapBuffer even though the speed matches the other methods. Using image_load_store in the compute shader works and gives the same speed the vertex shaders (though I thought that'd be one too many permutations to post).
Calling glMemoryBarrier(GL_ALL_BARRIER_BITS) before glDispatchCompute causes a crash in the driver.
Commenting out the three glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);, which are used to check the output, raises the speed of the first two tests to 17GB/s and the compute shader skyrockets to 292GB/s which is much closer to what I'd like but this can't be trusted because of point 1.
Sometimes while (!available) hangs for ages (ctrl-c when I get tired of waiting shows its still in the loop).
For reference, here's the CUDA code:
//http://www.ks.uiuc.edu/Research/vmd/doxygen/CUDABench_8cu-source.html
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define CUERR { cudaError_t err; \
if ((err = cudaGetLastError()) != cudaSuccess) { \
printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
return -1; }}
//
// GPU device global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = src[idx];
}
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = val;
}
typedef float4 datatype;
static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
int i;
int len = 1 << 22; // one thread per data element
int loops = 500;
datatype *src, *dest;
datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
// initialize to zero for starters
float memsettime = 0.0f;
float memcpytime = 0.0f;
*gpumemsetgbsec = 0.0;
*gpumemcpygbsec = 0.0;
// attach to the selected device
cudaError_t rc;
rc = cudaSetDevice(cudadev);
if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
rc = cudaGetLastError(); // query last error and reset error state
if (rc != cudaErrorSetOnActiveProcess)
return -1; // abort and return an error
#else
cudaGetLastError(); // just ignore and reset error state, since older CUDA
// revs don't have a cudaErrorSetOnActiveProcess enum
#endif
}
cudaMalloc((void **) &src, sizeof(datatype)*len);
CUERR
cudaMalloc((void **) &dest, sizeof(datatype)*len);
CUERR
dim3 BSz(256, 1, 1);
dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
// do a warm-up pass
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
CUERR
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
CUERR
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
CUERR
cudaEvent_t start, end;
cudaEventCreate(&start);
cudaEventCreate(&end);
// execute the memset kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
}
CUERR
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memsettime, start, end);
CUERR
// execute the memcpy kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
}
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memcpytime, start, end);
CUERR
cudaEventDestroy(start);
CUERR
cudaEventDestroy(end);
CUERR
*gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
*gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
cudaFree(dest);
cudaFree(src);
CUERR
return 0;
}
int main()
{
double a, b;
cudaglobmembw(0, &a, &b);
printf("%f %f\n", (float)a, (float)b);
return 0;
}
Update:
It seems that the buffer gets made non-resident on my glBufferData calls which were there to check output was being written. As per the extension:
A buffer is also made non-resident implicitly as a result of being respecified via BufferData or being deleted.
...
BufferData is specified to "delete the existing data store",
so the GPU address of that data should become invalid. The buffer is
therefore made non-resident in the current context.
At a guess, OpenGL then streams in the buffer object data each frame and doesn't cache it in video memory. This explains why the compute shader failed the assert, however there's a slight anomaly that bindless graphics in the vertex shader still worked when not resident, but I'll ignore that for now. I have no idea why a 64MB buffer object wouldn't default to being resident (though perhaps after first use) when there's 12GB available.
So after each call to glBufferData I make it resident again and get the address in case its changed:
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
I'm now getting 270–290GB/s with the compute shader using either image_load_store or bindless graphics. Now my question includes:
Given the buffer seems to be resident for each test and the compute shader is nice and fast, why are the vertex shader versions still so slow?
Without the bindless graphics extension, how should regular OpenGL users put data into video memory (actually put and not idly suggest that the driver might just like to)?
I'm pretty sure I would have noticed this problem in real world situations, and it's this contrived benchmark that hits a slow path, so how could I trick the driver into making a buffer object resident? Running a compute shader first doesn't change anything.
You are asking the driver to read from your process memory, dat. This causes extensive cache coherency traffic. When the GPU reads that memory, it can't be sure that it is up to date, it might be in the CPU cache, modified, and not written back to RAM yet. This causes the GPU to actually have to read from the CPU cache, which is far more expensive than bypassing the CPU and reading the RAM. The RAM is often idle during normal operation, because a modern CPU's hit rate is typically 95% to 99%. The cache is used continuously.
To achieve maximum performance, you need to let the driver allocate the memory. Normal memory your program uses, like global variables and the heap are allocated in writeback memory. Driver allocated memory will usually be allocated as write combining or uncacheable, which eliminates the coherency traffic.
Peak advertised bandwidth numbers will be achieved only without cache coherency overhead.
To let the driver allocate it, use glBufferData with a nullptr for the data.
It isn't all rosy though, if you manage to coerce the driver into using a system memory write combining buffer. CPU reads to such addresses will be very slow. Sequential writes are optimized by the CPU, but random writes will cause the write combining buffer to flush frequently, hurting performance.
ok so this is the code I use to put the vertex array and buffers onto the graphics card.
void Mesh3D::InitializeBuffers(GLuint program) {
float largestPos = findLargestPosition(&col->vectorGeometry);
std::cout << "largestPos is: ";
std::cout << largestPos;
std::cout << "\n";
int loc;
vaos = new GLuint[nVectorGeometry];
glGenVertexArrays(nVectorGeometry, vaos);//The vertex array object stores the array of vertice indexes used to find the vertex in the buffer vertex buffer object..
vbosPosition = new GLuint[nVectorGeometry];
vbosNormal = new GLuint[nVectorGeometry];
vbosColor = new GLuint[nVectorGeometry];
glGenBuffers(nVectorGeometry, vbosPosition);
glGenBuffers(nVectorGeometry, vbosNormal);
glGenBuffers(nVectorGeometry, vbosColor);
// Configure VBOs to hold positions and normals for each geometry
for (int i = 0; i<nVectorGeometry; i++)//i used to iterate through the individual
{
glBindVertexArray(vaos[i]);
int size = col->vectorGeometry[i].map["POSITION"].size / 4;
float* scaledData = (float*)malloc(size*sizeof(float));
float* p = (float*)col->vectorGeometry[i].map["POSITION"].data;
for (int j = 0; j < size; j++)
{
*(scaledData+j)=*(p+j)/largestPos;
if (*(scaledData + j) < -0.5)
{
std::cout << "scaledData[j] is: ";
std::cout << *(scaledData + j);
std::cout << "\n";
}
}
// Set vertex coordinate data
glBindBuffer(GL_ARRAY_BUFFER, vbosPosition[i]);
glBufferData(GL_ARRAY_BUFFER, col->vectorGeometry[i].map["POSITION"].size, scaledData, GL_STATIC_DRAW);
free(scaledData);
loc = glGetAttribLocation(program, "in_coords");//get a GLuint for the attribute and put it into GLuint loc.
glVertexAttribPointer(loc, col->vectorGeometry[i].map["POSITION"].stride, col->vectorGeometry[i].map["POSITION"].type, GL_FALSE, 0, 0);//glVertexAttribPointer — loc specifies the index of the generic vertex attribute to be modified.
glEnableVertexAttribArray(0);
#ifdef Testing_Mesh3D
PrintGLVertex(vbosPosition[i], col->vectorGeometry[i].map["POSITION"].size / 4);
#endif // Set normal vector data
glBindBuffer(GL_ARRAY_BUFFER, vbosNormal[i]);
glBufferData(GL_ARRAY_BUFFER, col->vectorGeometry[i].map["NORMAL"].size, col->vectorGeometry[i].map["NORMAL"].data, GL_STATIC_DRAW);
loc = glGetAttribLocation(program, "in_normals");
glVertexAttribPointer(loc, col->vectorGeometry[i].map["NORMAL"].stride, col->vectorGeometry[i].map["NORMAL"].type, GL_FALSE, 0, 0);
glEnableVertexAttribArray(0);
Material* material = col->mapGeometryUrlToMaterial2Effect[col->vectorGeometry[i].id];
if (material->effect1.size() > 0)
{
std::cout << "size of effect is: ";
std::cout << material->effect1.size();
std::cout << "\n";
Effect* effect1 = material->effect1[0];
if (effect1->type == enumEffectTypes::color)
{
glBindBuffer(GL_ARRAY_BUFFER, vbosColor[i]);
Color color = effect1->color;
//initialize a buffer array for the color with the color for all the geometry repeated over and over.
int vectorColorsSize = col->vectorGeometry[i].map["POSITION"].size;
float* vectorColors = (float*)malloc(vectorColorsSize*sizeof(float));
for (int j = 0; j <vectorColorsSize; j=j+4)
{
float* f = vectorColors + j;
for (int k = 0; k < color.stride; k++)
{
*(f + k) = *(color.values + k);
}
}
glBufferData(GL_ARRAY_BUFFER, vectorColorsSize*sizeof(float), vectorColors, GL_STATIC_DRAW);
loc = glGetAttribLocation(program, "in_colors");
glVertexAttribPointer(loc, color.stride , color.type, GL_FALSE, 0, 0);
}
else
{
}
}
glEnableVertexAttribArray(1);
}
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindVertexArray(0);
}
I know for a fact the array containing the floating point 4 float color data is fully initialized and contains a the same color repeated over and over for each vertex. However for some reason I am still only seeing a black color being drawn instead of the actual color the objects should be.
This is my vertex shader and fragment shader
#pragma once
#ifndef Included_shaders
#define Included_shaders
#include<stdio.h>
#include<iostream>
static std::string shaderVert = "#version 330\n"
"in vec3 in_coords;\n"
"in vec3 in_normals;\n"
"in vec4 in_colors; \n"//added by me
"out vec3 vertex_normal;\n"
"out vec4 vertex_color;\n"
"void main(void) {\n"
"vertex_normal = in_normals;\n"
"vertex_color = in_colors;\n"//added by me
"gl_Position = vec4(in_coords, 1.0);\n"
"}\n";
static std::string shaderFrag = "#version 330\n"
"in vec3 vertex_normal;\n"
"in vec4 vertex_color;\n"
"out vec4 output_color;\n"
"layout(std140) uniform LightParameters{\n"
"vec4 diffuse_intensity;\n"
"vec4 ambient_intensity;\n"
"vec4 light_direction;\n"
"};\n"
"uniform vec4 diffuse_color;\n"
"void main() {\n"
"/* Compute cosine of angle of incidence */\n"
"float cos_incidence = dot(vertex_normal, light_direction.xyz);\n"
"cos_incidence = clamp(cos_incidence, 0, 1);\n"
"/* Compute Blinn term */\n"
"vec3 view_direction = vec3(0, 0, 1);\n"
"vec3 half_angle = normalize(light_direction.xyz + view_direction);\n"
"float blinn_term = dot(vertex_normal, half_angle);\n"
"blinn_term = clamp(blinn_term, 0, 1);\n"
"blinn_term = pow(blinn_term, 1.0);\n"
"output_color=vertex_color\n;"
"}\n";
#endif
This is the function i call to draw
void Mesh3D::DrawToParent()
{
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// Draw elements of each mesh in the vector
for (int i = 0; i<nVectorGeometry; i++)
{
glBindVertexArray(vaos[i]);
glDrawElements(col->vectorGeometry[i].primitive/*This is 4 for GL_Triangles*/, col->vectorGeometry[i].index_count,
GL_UNSIGNED_SHORT, col->vectorGeometry[i].indices);
}
glBindVertexArray(0);
glutSwapBuffers();
}
I would really appreciate it if someone could figure out where I am going wrong. I am rather new to drawing 3d graphics with opengl but wanted to create my own classes to draw complicated 3d scenes. I have the objects vertices being drawn correctly but now im trying to add the color which I also read in from the COLLADA file.
I believe the problem is that you glEnableVertexAttribArray(0); twice and glEnableVertexAttribArray(1); once. Instead you should enable the vertex attribute locations that you used to point to your data.
So, after each glVertexAttribPointer call glEnableVertexAttribArray(loc);
Appendix: All the necessary steps for using a VBO (taken from some of my code) its in Go, but the functions work exactly the same.
Vbos[2].Bind(gl.ARRAY_BUFFER)
gl.BufferData(gl.ARRAY_BUFFER, size*4, texCoord, gl.STREAM_DRAW)
self.texcoordloc.AttribPointer(4, gl.FLOAT, false, 0, nil)
self.texcoordloc.EnableArray()
defer self.texcoordloc.DisableArray()
I want to render a Quad via VAO, IBO and VBO but nothing is drawn. I'm using glDrawRangeElements in OS X OpenGL 3.2 Core context. The screen is completely black without any error. GLFW3 is used to create window and context.
Window opening/Context creation code
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
_mainWindow = glfwCreateWindow(width, height, title, monitor, NULL);
if(_mainWindow == NULL)
{
return false;
}
_mainWindowWidth = width;
_mainWindowHeight = height;
glfwSetKeyCallback(_mainWindow, _onKeyEvent);
glfwMakeContextCurrent(_mainWindow);
glewExperimental = GL_TRUE;
glewInit();
_openGLVersion = reinterpret_cast<const char*>(glGetString(GL_VERSION));
Shader sources (compiled successfully). Custom FragColor is binded.
_vertexShaderSource =
"#version 150 core\n"
"in vec3 in_Position;\n"
"in vec3 in_Normal;\n"
"in vec2 in_TexCoord;\n"
"uniform mat4 ModelViewProjection;\n"
"void main()\n"
"{\n"
" gl_Position = ModelViewProjection * vec4(in_Position, 1.0);\n"
"}\n";
_fragmentShaderSource =
"#version 150 core\n"
"out vec4 FColor;"
"void main()\n"
"{\n"
" FColor = vec4(1.0, 1.0, 1.0, 1.0);\n"
"}\n";
Vertices
Vertex* vertices = new Vertex[6];
vertices[0].nz = 1.0f;
vertices[1].nz = 1.0f;
vertices[2].nz = 1.0f;
vertices[3].nz = 1.0f;
vertices[4].nz = 1.0f;
vertices[5].nz = 1.0f;
vertices[1].y = height;
vertices[1].v0 = 1.0f;
vertices[2].x = width;
vertices[2].y = height;
vertices[2].u0 = 1.0f;
vertices[2].v0 = 1.0f;
vertices[4].x = width;
vertices[4].u0 = 1.0f;
vertices[5].x = width;
vertices[5].y = height;
vertices[5].u0 = 1.0f;
vertices[5].v0 = 1.0f;
_mesh->setVertices(vertices, 6);
vertices = NULL;
Uint16* indices = new Uint16[6];
indices[0] = 0;
indices[1] = 2;
indices[2] = 3;
indices[3] = 0;
indices[4] = 1;
indices[5] = 3;
_mesh->setIndices(indices);
indices = NULL;
Buffer update (checked the data, it seems to be correct)
// Update VBO
if(_vertexBufferObjectID == 0)
{
glGenBuffers(1, &_vertexBufferObjectID);
}
glBindBuffer(GL_ARRAY_BUFFER, _vertexBufferObjectID);
float* data = new float[_vertexCount * sizeof(Vertex) / sizeof(float)];
Uint64 begin = 0;
for(Uint32 i = 0; i < _vertexCount; i++)
{
begin = i * 8;
data[begin] = _vertices[i].x;
data[begin + 1] = _vertices[i].y;
data[begin + 2] = _vertices[i].z;
data[begin + 3] = _vertices[i].nx;
data[begin + 4] = _vertices[i].ny;
data[begin + 5] = _vertices[i].nz;
data[begin + 6] = _vertices[i].u0;
data[begin + 7] = _vertices[i].v0;
}
glBufferData(GL_ARRAY_BUFFER, sizeof(Vertex) * _vertexCount, &data[0], GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
delete[] data;
data = NULL;
// Update IBO
if(_indexBufferObjectID == 0)
{
glGenBuffers(1, &_indexBufferObjectID);
}
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, _indexBufferObjectID);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(Uint16) * _vertexCount, &_indices[0], GL_STATIC_DRAW);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
// Update VAO
if(_vertexArrayObjectID == 0)
{
glGenVertexArrays(1, &_vertexArrayObjectID);
}
glBindVertexArray(_vertexArrayObjectID);
glBindBuffer(GL_ARRAY_BUFFER, _vertexBufferObjectID);
// Vertices
glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);
glEnableVertexAttribArray(2);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, sizeof(Vertex), ((char*)NULL + (0)));
// Normals
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, sizeof(Vertex), ((char*)NULL + (12)));
// TexCoords
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(Vertex), ((char*)NULL + (24)));
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, _indexBufferObjectID);
glBindVertexArray(0);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
Rendering code
glUseProgram(material->_programID);
GLuint mvp = glGetUniformLocation(material->_programID, "ModelViewProjection");
glUniformMatrix4fv(mvp, 1, false, glm::value_ptr(camera_mvp));
glBindVertexArray(node->_mesh->_vertexArrayObjectID);
// Draw
glDrawRangeElements(GL_TRIANGLES, 0, 3, node->_mesh->_vertexCount, GL_UNSIGNED_SHORT, NULL);
glBindVertexArray(0);
Note:
camera_mvp is Orthographic
0.00333333_0___________0 0
0__________0.00333333__0 0
0__________0__________-1 0
599________599_________0 1
Program Linking
_programID = glCreateProgram();
glAttachShader(_programID, vertex_shader);
glAttachShader(_programID, fragment_shader);
glLinkProgram(_programID);
glGetProgramiv(_programID, GL_LINK_STATUS, &result);
glGetProgramiv(_programID, GL_INFO_LOG_LENGTH, &loglen);
if(loglen > 0)
{
char* log = new char[loglen];
glGetProgramInfoLog(_programID, loglen, 0, log);
_lastInfoLog = log;
delete log;
log = NULL;
}
if(result == GL_FALSE)
{
glDeleteProgram(_programID);
glDeleteShader(vertex_shader);
glDeleteShader(fragment_shader);
return false;
}
glUseProgram(_programID);
glBindAttribLocation(_programID, 0, "in_Position");
glBindAttribLocation(_programID, 1, "in_Normal");
glBindAttribLocation(_programID, 2, "in_TexCoord");
glBindFragDataLocation(_programID, 0, "FColor");
glUseProgram(0);
glDeleteShader(vertex_shader);
glDeleteShader(fragment_shader);
You don't have layout in #version 150 so you have to use glGetAttribLocation() to ask OpenGL where it put your attributes and adjust the first parameter of your glVertexAttribPointer() calls appropriately.
Make sure you bind your program before calling glGetAttribLocation().
I am enrolled in shaders course and interested in computer vision and image processing. I was wondering how can I mix GLSL shaders knowledge with image processing? What do I gain if I implement image processing algorithms with GLSL?
Case study: real time box blur on CPU vs GPU fragment shader
I have implemented a simple box blur https://en.wikipedia.org/wiki/Box_blur algorithm on CPU and GPU fragment shader to see which was faster:
demo video https://www.youtube.com/watch?v=MRhAljmHq-o
source code: https://github.com/cirosantilli/cpp-cheat/blob/cfd18700827dfcff6080a938793b44b790f0b7e7/opengl/glfw_webcam_image_process.c
My camera refresh rate capped FPS to 30, so I measured how wide the box could be and still keep 30 FPS.
On a Lenovo T430 (2012), NVIDIA NVS5400, Ubuntu 16.04 with image dimensions 960x540, the maximum widths were:
GPU: 23
CPU: 5
Since the computation is quadratic, the speedup was:
( 23 / 5 ) ^ 2 = 21.16
faster on GPU than CPU!
Not all algorithms are faster on the GPU. For example, operations that act on single pictures like swapping RGB reach 30FPS on the CPU, so it is useless to add the complexity of GPU programming to it.
Like any other CPU vs GPU speedup question, it all comes down if you have enough work per byte transferred to the GPU, and benchmarking is the best thing you can do. In general, quadratic algorithms or worse are a good bet for the GPU. See also: What do the terms "CPU bound" and "I/O bound" mean?
Main part of the code (just clone from GitHub):
#include "common.h"
#include "../v4l2/common_v4l2.h"
static const GLuint WIDTH = 640;
static const GLuint HEIGHT = 480;
static const GLfloat vertices[] = {
/* xy uv */
-1.0, 1.0, 0.0, 1.0,
0.0, 1.0, 0.0, 0.0,
0.0, -1.0, 1.0, 0.0,
-1.0, -1.0, 1.0, 1.0,
};
static const GLuint indices[] = {
0, 1, 2,
0, 2, 3,
};
static const GLchar *vertex_shader_source =
"#version 330 core\n"
"in vec2 coord2d;\n"
"in vec2 vertexUv;\n"
"out vec2 fragmentUv;\n"
"void main() {\n"
" gl_Position = vec4(coord2d, 0, 1);\n"
" fragmentUv = vertexUv;\n"
"}\n";
static const GLchar *fragment_shader_source =
"#version 330 core\n"
"in vec2 fragmentUv;\n"
"out vec3 color;\n"
"uniform sampler2D myTextureSampler;\n"
"void main() {\n"
" color = texture(myTextureSampler, fragmentUv.yx).rgb;\n"
"}\n";
static const GLchar *vertex_shader_source2 =
"#version 330 core\n"
"in vec2 coord2d;\n"
"in vec2 vertexUv;\n"
"out vec2 fragmentUv;\n"
"void main() {\n"
" gl_Position = vec4(coord2d + vec2(1.0, 0.0), 0, 1);\n"
" fragmentUv = vertexUv;\n"
"}\n";
static const GLchar *fragment_shader_source2 =
"#version 330 core\n"
"in vec2 fragmentUv;\n"
"out vec3 color;\n"
"uniform sampler2D myTextureSampler;\n"
"// pixel Delta. How large a pixel is in 0.0 to 1.0 that textures use.\n"
"uniform vec2 pixD;\n"
"void main() {\n"
/*"// Identity\n"*/
/*" color = texture(myTextureSampler, fragmentUv.yx ).rgb;\n"*/
/*"// Inverter\n"*/
/*" color = 1.0 - texture(myTextureSampler, fragmentUv.yx ).rgb;\n"*/
/*"// Swapper\n"*/
/*" color = texture(myTextureSampler, fragmentUv.yx ).gbr;\n"*/
/*"// Double vision ortho.\n"*/
/*" color = ("*/
/*" texture(myTextureSampler, fragmentUv.yx ).rgb +\n"*/
/*" texture(myTextureSampler, fragmentUv.xy ).rgb\n"*/
/*" ) / 2.0;\n"*/
/*"// Multi-me.\n"*/
/*" color = texture(myTextureSampler, 4.0 * fragmentUv.yx ).rgb;\n"*/
/*"// Horizontal linear blur.\n"*/
/*" int blur_width = 21;\n"*/
/*" int blur_width_half = blur_width / 2;\n"*/
/*" color = vec3(0.0, 0.0, 0.0);\n"*/
/*" for (int i = -blur_width_half; i <= blur_width_half; ++i) {\n"*/
/*" color += texture(myTextureSampler, vec2(fragmentUv.y + i * pixD.x, fragmentUv.x)).rgb;\n"*/
/*" }\n"*/
/*" color /= blur_width;\n"*/
/*"// Square linear blur.\n"*/
" int blur_width = 23;\n"
" int blur_width_half = blur_width / 2;\n"
" color = vec3(0.0, 0.0, 0.0);\n"
" for (int i = -blur_width_half; i <= blur_width_half; ++i) {\n"
" for (int j = -blur_width_half; j <= blur_width_half; ++j) {\n"
" color += texture(\n"
" myTextureSampler, fragmentUv.yx + ivec2(i, j) * pixD\n"
" ).rgb;\n"
" }\n"
" }\n"
" color /= (blur_width * blur_width);\n"
"}\n";
int main(int argc, char **argv) {
CommonV4l2 common_v4l2;
GLFWwindow *window;
GLint
coord2d_location,
myTextureSampler_location,
vertexUv_location,
coord2d_location2,
pixD_location2,
myTextureSampler_location2,
vertexUv_location2
;
GLuint
ebo,
program,
program2,
texture,
vbo,
vao,
vao2
;
unsigned int
cpu,
width,
height
;
uint8_t *image;
float *image2 = NULL;
/*uint8_t *image2 = NULL;*/
if (argc > 1) {
width = strtol(argv[1], NULL, 10);
} else {
width = WIDTH;
}
if (argc > 2) {
height = strtol(argv[2], NULL, 10);
} else {
height = HEIGHT;
}
if (argc > 3) {
cpu = (argv[3][0] == '1');
} else {
cpu = 0;
}
/* Window system. */
glfwInit();
glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
window = glfwCreateWindow(2 * width, height, __FILE__, NULL, NULL);
glfwMakeContextCurrent(window);
glewInit();
CommonV4l2_init(&common_v4l2, COMMON_V4L2_DEVICE, width, height);
/* Shader setup. */
program = common_get_shader_program(vertex_shader_source, fragment_shader_source);
coord2d_location = glGetAttribLocation(program, "coord2d");
vertexUv_location = glGetAttribLocation(program, "vertexUv");
myTextureSampler_location = glGetUniformLocation(program, "myTextureSampler");
/* Shader setup 2. */
const GLchar *fs;
if (cpu) {
fs = fragment_shader_source;
} else {
fs = fragment_shader_source2;
}
program2 = common_get_shader_program(vertex_shader_source2, fs);
coord2d_location2 = glGetAttribLocation(program2, "coord2d");
vertexUv_location2 = glGetAttribLocation(program2, "vertexUv");
myTextureSampler_location2 = glGetUniformLocation(program2, "myTextureSampler");
pixD_location2 = glGetUniformLocation(program2, "pixD");
/* Create vbo. */
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
/* Create ebo. */
glGenBuffers(1, &ebo);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
/* vao. */
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer(coord2d_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices[0]), (GLvoid*)0);
glEnableVertexAttribArray(coord2d_location);
glVertexAttribPointer(vertexUv_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), (GLvoid*)(2 * sizeof(vertices[0])));
glEnableVertexAttribArray(vertexUv_location);
glBindVertexArray(0);
/* vao2. */
glGenVertexArrays(1, &vao2);
glBindVertexArray(vao2);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer(coord2d_location2, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices[0]), (GLvoid*)0);
glEnableVertexAttribArray(coord2d_location2);
glVertexAttribPointer(vertexUv_location2, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), (GLvoid*)(2 * sizeof(vertices[0])));
glEnableVertexAttribArray(vertexUv_location2);
glBindVertexArray(0);
/* Texture buffer. */
glGenTextures(1, &texture);
glBindTexture(GL_TEXTURE_2D, texture);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
/* Constant state. */
glViewport(0, 0, 2 * width, height);
glClearColor(1.0f, 1.0f, 1.0f, 1.0f);
glActiveTexture(GL_TEXTURE0);
/* Main loop. */
common_fps_init();
do {
/* Blocks until an image is available, thus capping FPS to that.
* 30FPS is common in cheap webcams. */
CommonV4l2_updateImage(&common_v4l2);
image = CommonV4l2_getImage(&common_v4l2);
glClear(GL_COLOR_BUFFER_BIT);
/* Original. */
glTexImage2D(
GL_TEXTURE_2D, 0, GL_RGB, width, height,
0, GL_RGB, GL_UNSIGNED_BYTE, image
);
glUseProgram(program);
glUniform1i(myTextureSampler_location, 0);
glBindVertexArray(vao);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
glBindVertexArray(0);
/* Optional CPU modification to compare with GPU shader speed. */
if (cpu) {
image2 = realloc(image2, 3 * width * height * sizeof(image2[0]));
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
size_t index = 3 * (i * width + j);
/* Inverter. */
/*image2[index + 0] = 1.0 - (image[index + 0] / 255.0);*/
/*image2[index + 1] = 1.0 - (image[index + 1] / 255.0);*/
/*image2[index + 2] = 1.0 - (image[index + 2] / 255.0);*/
/* Swapper. */
/*image2[index + 0] = image[index + 1] / 255.0;*/
/*image2[index + 1] = image[index + 2] / 255.0;*/
/*image2[index + 2] = image[index + 0] / 255.0;*/
/* Square linear blur. */
int blur_width = 5;
int blur_width_half = blur_width / 2;
int blur_width2 = (blur_width * blur_width);
image2[index + 0] = 0.0;
image2[index + 1] = 0.0;
image2[index + 2] = 0.0;
for (int k = -blur_width_half; k <= blur_width_half; ++k) {
for (int l = -blur_width_half; l <= blur_width_half; ++l) {
int i2 = i + k;
int j2 = j + l;
// Out of bounds is black. TODO: do module to match shader exactly.
if (i2 > 0 && i2 < (int)height && j2 > 0 && j2 < (int)width) {
unsigned int srcIndex = index + 3 * (k * width + l);
image2[index + 0] += image[srcIndex + 0];
image2[index + 1] += image[srcIndex + 1];
image2[index + 2] += image[srcIndex + 2];
}
}
}
image2[index + 0] /= (blur_width2 * 255.0);
image2[index + 1] /= (blur_width2 * 255.0);
image2[index + 2] /= (blur_width2 * 255.0);
}
}
glTexImage2D(
GL_TEXTURE_2D, 0, GL_RGB, width, height,
0, GL_RGB, GL_FLOAT, image2
);
}
/* Modified. */
glUseProgram(program2);
glUniform1i(myTextureSampler_location2, 0);
glUniform2f(pixD_location2, 1.0 / width, 1.0 / height);
glBindVertexArray(vao2);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
glBindVertexArray(0);
glfwSwapBuffers(window);
glfwPollEvents();
common_fps_print();
} while (!glfwWindowShouldClose(window));
/* Cleanup. */
if (cpu) {
free(image2);
}
CommonV4l2_deinit(&common_v4l2);
glDeleteBuffers(1, &vbo);
glDeleteVertexArrays(1, &vao);
glDeleteTextures(1, &texture);
glDeleteProgram(program);
glfwTerminate();
return EXIT_SUCCESS;
}
The first obvious answer is that you gain parallelism. Now, why using GLSL rather than, say CUDA which is more flexible ? GLSL doesn't require you to have an NVIDIA graphics card, so it's a much more portable solution (you'd still have the option of OpenCL though).
What can you gain with parallelism ? Most of the time, you can treat pixels independantly. For instance, increasing the contrast of an image usually requires you to loop over all pixels and apply an affine transform of the pixel values. If each pixel is handled by a separate thread, then you don't need to do this loop anymore : you just raterize a quad, and apply a pixel shader that reads a texture at the current rasterized point, and ouput to the render target (or the screen) the transformed pixel value.
The drawback is that your data need to reside on the GPU : you'll need to transfer all your images to the GPU which can take some time, and can make the speedup gained with the parallelization useless. As such, GPU implementations are often done either when the operations to be made are compute intensive, or when the whole pipeline can remain on the GPU (for instance, if the goal is to only display the modified image on screen, you save the need to transfer back the image on the CPU).
OpenGL 4.3 (announced at SIGGRAPH 2012) supports Compute shaders. If you are doing strictly graphics work, and already using OpenGL, it might be easier to use this than OpenCL / OpenGL interop (or CUDA / OpenGL interop).
Here is what Khronos has to say about when to use 4.3 Compute shaders versus OpenCL: Link to PDF; see slide 5.