how does GLM handle translation - c++

The OpenGL maths library(GLM) uses the following algorithm to compute the translation matrix:
//taken from source code
template<typename T, qualifier Q>
GLM_FUNC_QUALIFIER mat<4, 4, T, Q> translate(mat<4, 4, T, Q> const& m, vec<3, T, Q> const& v)
mat<4, 4, T, Q> Result(m);
Result[3] = m[0] * v[0] + m[1] * v[1] + m[2] * v[2] + m[3];
return Result;
(Here the vector v is a 3-dimensional vector and the matrix m is a 4x4 matrix; since we're using homogeneous coordinates, v is implicitly treated as the 4-dimensional vector (v[0], v[1], v[2], 1).)
The following is from Linear Algebra Theory:
Let m have the entries:
Now, suppose the matrix m gives some linear transformation, and is also a transformation matrix, and we'd like to add a translation of X, Y, and Z in the X, Y and Z dimensions respectively, if I'm not mistaken, the way we'd do that is by forming a composite matrix:
which gives something like:
Now, I'm not getting what this GLM function of translate does, because it does something like:
And the matrix with added transformation of translation, i.e. m becomes:
Now, these two matrices are not equal and hence they would result in different transformations, so I'm confused to which matrix does the actual translation and which is the correct one or if there is any other idea hidden behind the algorithm?
Note: Before reading the answer note that in column-major representation of a matrix, you access the entries of your matrix using: matrix[column-index][row-index].
The source code with which I perform transformation:
#include <iostream>
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <cmath>
#include <string.h>
#include "glm/glm.hpp"
#include "glm/gtc/matrix_transform.hpp"
#include "glm/gtc/type_ptr.hpp"
// Window Dimensions
const GLint WIDTH=800, HEIGHT=600;
GLuint VAO, VBO, shader;
GLint uniformModel {};
GLint uniformModelRot {};
GLfloat triOffset {};
float triMaxOffset = 0.7f;
bool direction = true;
const float toRadians = 3.14159265f/180.0f;
// vertex shader
static const char* vShader =
"#version 330\n"
"layout (location = 0) in vec3 pos;\n"
"uniform mat4 model;\n"
"void main(){\n"
" gl_Position = model * vec4(0.5*pos, 1.0);\n"
// fragment shader
static const char* fShader = ""
"#version 330\n"
"out vec4 color;\n"
"uniform mat4 model;\n"
"void main(){\n"
" color = model *vec4(1.0, 1.0, 0.0, 1.0);\n"
void AddShader(GLuint theProgram, const char* ShaderCode, GLenum shaderType, std::string info){
std::cerr <<"INFO: Adding "<<info<<" Shader"<<std::endl;
GLuint theShader = glCreateShader(shaderType);
const GLchar* theCode[1];
theCode[0] = ShaderCode;
GLint codeLength[1];
codeLength[0] = strlen(ShaderCode);
glShaderSource(theShader, 1, theCode, codeLength);
GLint result =0;
GLchar eLog[1024] ={0};
glGetShaderiv(theShader, GL_COMPILE_STATUS, &result);
glGetShaderInfoLog(shader, sizeof(eLog), NULL, eLog);
std::cerr<<"Error compiling program"<<std::endl;
glAttachShader(theProgram, theShader);
void CompileShader(){
shader = glCreateProgram();
std::cerr<<"Error creating shader"<<std::endl;
AddShader(shader, vShader, GL_VERTEX_SHADER, "vertex");
AddShader(shader, fShader, GL_FRAGMENT_SHADER, "fragment");
GLint result =0;
GLchar eLog[1024] ={0};
glGetProgramiv(shader, GL_LINK_STATUS, &result);
glGetProgramInfoLog(shader, sizeof(eLog), NULL, eLog);
std::cerr<<"Error linking program"<<std::endl;
glGetProgramiv(shader, GL_VALIDATE_STATUS, &result);
glGetProgramInfoLog(shader, sizeof(eLog), NULL, eLog);
std::cerr<<"Error Validating program"<<std::endl;
uniformModel = glGetUniformLocation(shader,"model");
void CreateTriangles(){
GLfloat vertices[]={
-1.0f, -1.0f, 0.0f,
1.0f, -1.0f, 0.0f,
0.0f, 1.0f, 0.0f
glGenVertexArrays(1, &VAO);
glGenBuffers(1, &VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat)*9,vertices, GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
int main(){
//initialize GLFW
std::cerr << "GLFW initialization failed!" << std::endl;
return 1;
//Setup GLFW window properties
//openGL version
// core profile = no backward compatibility
//allow forward compatibility
GLFWwindow *mainWindow = glfwCreateWindow(WIDTH, HEIGHT, "TEST WINDOW", NULL, NULL);
std::cerr << "GLFW Window creation failed" << std::endl;
return 1;
// get Buffer size information
int bufferWidth, bufferHeight;
glfwGetFramebufferSize(mainWindow, &bufferWidth, &bufferHeight);
// set context for GLEW to use
// allow modern extension features
std::cerr << "GLEW initialization failed" << std::endl;
return 1;
// setup viewport size
glViewport(0, 0, bufferWidth, bufferHeight);
// get and handle user input events
glClearColor(1.0f, 0.0f, 0.0f, 1.0);
triOffset += 0.05f;
triOffset -= 0.05f;
if(abs(triOffset) >= triMaxOffset){
direction = !direction;
glm::mat4 modelMatrix(1.0f);
modelMatrix = glm::translate(modelMatrix, glm::vec3(triOffset, 0.0f, 0.0f));
glUniformMatrix4fv(uniformModel, 1, GL_FALSE,glm::value_ptr(modelMatrix));
// swap buffers
return 0;

OpenGL Mathematics (GLM) is based on the OpenGL Shading Language (GLSL). What glm::translate actually does is to set up a translation matrix and multiply the input matrix by the translation. It computes m*t in the meaning of GLSL Vector and Matrix Operations:
mat<4, 4, T, Q> Result(m);
Result[3] = m[0] * v[0] + m[1] * v[1] + m[2] * v[2] + m[3];
(In the following Result is substituted by R)
Note, m[0] * v[0] multiplies each component of the column m[0] by the scalar v[0]. The result is the vector (m[0][0]*v[0], m[0][1]*v[0], m[0][2]*v[0], m[0][3]*v[0]).
So R[3] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2] + m[3] is the same as
R[3][0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0]
R[3][1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2] + m[3][1]
R[3][2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2]
R[3][3] = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3]
glm::translate actually calculates:
vh = (v[0], v[1], v[2], 1)
R = m
R[3][0] = dot( (m[0][0], m[1][0], m[2][0], m[3][0]), vh )
R[3][1] = dot( (m[0][1], m[1][1], m[2][1], m[3][1]), vh )
R[3][2] = dot( (m[0][2], m[1][2], m[2][2], m[3][2]), vh )
R[3][3] = dot( (m[0][3], m[1][3], m[2][3], m[3][3]), vh )
The code above computes the Dot product of the rows from m, by vh. vh is the 4th column of the translation t. Note the translation matrix t is defined as:
c0 c1 c2 c3
r0: 1 0 0 v[0]
r1: 0 1 0 v[1]
r2: 0 0 1 v[2]
r3: 0 0 0 1
A concatenation of 4x4 matrices (R = m*t) is the Dot product of the rows of m and the columns of t and can be expressed as:
(See OpenGL Shading Language 4.60 Specification - 5.10. Vector and Matrix Operations)
for i from 0 to 3
for j from 0 to 3
R[i][j] = dot( (m[0][j], m[1][j], m[2][j], m[3][j]), t[i] )
Where dot(a, b) == a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3],
(m[0][j], m[1][j], m[2][j], m[3][j]) is the j-th row of m and
t[i] is i-th column of t.
For glm::translate it is sufficient to copy R[0], R[1] and R[2] from m[0], m[1] and m[2].
e.g. for (i=0, j=0):
R[0][0] = dot( (m[0][0], m[1][0], m[2][0], m[3][0]), t[0] )
R[0][0] = dot( (m[0][0], m[1][0], m[2][0], m[3][0]), (1, 0, 0, 0) )
R[0][0] = m[0][0] * 1 + m[1][0] * 0 + m[2][0] * 0 + m[3][0] * 0
R[0][0] = m[0][0]
GLM matrices (as OpenGL matrices) are stored in column major order. If you investigate matrices in the debugger that may lead to confusions.
If you have the matrix
c0 c1 c2 c3
r0: Xx Yx Zx Tx
r1: Xy Yy Zy Ty
r2: Xz Yz Zz Tz
r3: 0 0 0 1
then the memory image of a 4*4 OpenGL matrix looks like this:
Xx, Xy, Xz, 0, Yx, Yy, Yz, 0, Zx, Zy, Zz, 0, Tx, Ty, Tz, 1
If you investigate it in a debugger, it may look like:
[ [ Xx, Xy, Xz, 0 ],
[ Yx, Yy, Yz, 0 ],
[ Zx, Zy, Zz, 0 ],
[ Tx, Ty, Tz, 1 ] ]

The technical details of how the math is done are magnificently explained in @Rabbid76's answer, but if anyone would like to understand why m*t is computed instead of t*m, then here's the answer:
Computing the matrix tm like this:
Here, you're taking the standard basis as the basis vectors for the linear combination, so essentially you're transforming in world-space coordinates.
But doing it the other way around and computing mt means you're now taking m[0], m[1] and m[2] as the basis vectors, so you're transforming in the local space given by that basis; since this is essentially a model matrix, we just call it model space.
That is probably one way to view it if you're only considering translation, but what if you're handling composite transformations like below:
Here the model matrix is M(initialized to identity at first), T is the translation matrix, R the rotation matrix and others are straightforward above.
So the transformation sequence that happens in the above code is:
and say this is applied to the vector v=[x, y, z, 1], the vector undergoes first a rotation, then a translation and then only the model transformation is done, if it helps, you may see it like this:


Multiplication in OpenGL vertex shader using column-major matrix does not draw triangle as expected

When I use a custom column major matrix in my code, and pass it to the vertex shader, the triangle is not drawn as expected, but when I use a row major matrix, it draws the triangle in its correct position.
I googled it and found some answers related to this question:
Like this and this, but I could not understand what I'm doing wrong.
If I'm not mistaken, a row-major matrix is:
{ 0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
Tx, Ty, Tz, w}
So, using this row-major matrix, the multiplication order should be: v' = v*M.
And a column-major matrix is:
{ 0, 4, 8, Tx,
1, 5, 9, Ty,
2, 6, 10, Tz,
3, 7, 11, w}
Using this column-major matrix, the multiplication order should be: v' = M*v.
Where Tx, Ty, and Tz hold the translation values for x, y and z, respectively.
Having said that, I will focus on what I think I'm having trouble with, in order to have a more compact question, but I will post an example code in the end, using GLFW and GLAD(<glad/gl.h>)
This is my vertex shader:
#version 330 core
layout (location = 0) in vec3 aPos;
uniform mat4 transform;
void main()
gl_Position = transform * vec4(aPos, 1.0);
These are my Mat4 struct and its functions:
// 4x4 matrix held as 16 contiguous floats.
typedef struct Mat4
{
    float data[16];
} Mat4;

// Return Mat4 identity matrix
// All elements are zero except indices 0, 5, 10 and 15, which are 1.
Mat4 mat4_identity()
{
    Mat4 m = {0};
    m.data[0] = 1.0f;
    m.data[5] = 1.0f;
    m.data[10] = 1.0f;
    m.data[15] = 1.0f;
    return m;
}

// Translate Mat4 using row-major order
// Returns a fresh identity matrix with x, y, z added to elements 12, 13, 14.
// The input matrix `a` is not read.
Mat4 mat4_row_translation(Mat4 a, float x, float y, float z)
{
    (void)a;
    Mat4 m = mat4_identity();
    m.data[12] += x;
    m.data[13] += y;
    m.data[14] += z;
    return m;
}

// Translate Mat4 using column-major order
// Returns a fresh identity matrix with x, y, z added to elements 3, 7, 11.
// The input matrix `a` is not read.
Mat4 mat4_column_translation(Mat4 a, float x, float y, float z)
{
    (void)a;
    Mat4 m = mat4_identity();
    m.data[3] += x;
    m.data[7] += y;
    m.data[11] += z;
    return m;
}
This is my update_triangle function where I translate the matrix:
// Build the translation with the helper that writes elements 3, 7, 11.
Mat4 trans = mat4_identity();
trans = mat4_column_translation(trans, 0.5f, 0.5f, 0.0f);
unsigned int transformLoc = glGetUniformLocation(shader, "transform");
// GL_FALSE: the 16 floats are uploaded as-is, without transposition.
glUniformMatrix4fv(transformLoc, 1, GL_FALSE, trans.data);
Note that I'm passing GL_FALSE in glUniformMatrix4fv, which tells OpenGL that the matrix is already in a column-major order.
However, when running the program, I do not get a triangle 0.5f up and 0.5f right, I get this:
Weird triangle translation
But when I use a row-major matrix and change the multiplication order in the vertex shader(v' = v*M), I get the result that I was expecting.
The vertex shader:
#version 330 core
layout (location = 0) in vec3 aPos;
uniform mat4 transform;
void main()
gl_Position = vec4(aPos, 1.0) * transform;
The update_triangle function:
// Build the translation with the helper that writes elements 12, 13, 14.
Mat4 trans = mat4_identity();
trans = mat4_row_translation(trans, 0.5f, 0.5f, 0.0f);
unsigned int transformLoc = glGetUniformLocation(shader, "transform");
// GL_TRUE: OpenGL transposes the 16 floats while uploading them.
glUniformMatrix4fv(transformLoc, 1, GL_TRUE, trans.data);
Note that I'm passing GL_TRUE in glUniformMatrix4fv, which tells OpenGL that the matrix is not in a column-major order.
The result:
Triangle drawn as expected
Here is the code in a single file, it needs to be compiled with GLFW and glad/gl.c.
Comment [0] and Comment [1] are just to help with which lines to comment out together, for example: if you comment out a line marked "// Comment [0]", you need to comment out the other lines marked "// Comment [0]" as well.
But in the Vertex Shader, both matrices use the same line to be drawn correct(which is why I don't understand).
If you are on linux, you can compile with: g++ -o ex example.cpp gl.c -lglfw && ./ex
(You will need to download gl.c from Glad generator)
#include <glad/gl.h>
#include <GLFW/glfw3.h>
#include <stdio.h>
#include <stdlib.h>
// Mat4 structure
// 4x4 matrix held as 16 contiguous floats.
typedef struct Mat4
{
    float data[16];
} Mat4;

// Global flag read by update_triangle (`if(c == 0)`) to gate its debug print.
int c = 0;

// Return Mat4 identity matrix
// All elements are zero except indices 0, 5, 10 and 15, which are 1.
Mat4 mat4_identity()
{
    Mat4 m = {0};
    m.data[0] = 1.0f;
    m.data[5] = 1.0f;
    m.data[10] = 1.0f;
    m.data[15] = 1.0f;
    return m;
}

// Translate Mat4 using row-major order
// Returns a fresh identity matrix with x, y, z added to elements 12, 13, 14.
// The input matrix `a` is not read.
Mat4 mat4_row_translation(Mat4 a, float x, float y, float z)
{
    (void)a;
    Mat4 m = mat4_identity();
    m.data[12] += x;
    m.data[13] += y;
    m.data[14] += z;
    return m;
}

// Translate Mat4 using column-major order
// Returns a fresh identity matrix with x, y, z added to elements 3, 7, 11.
// The input matrix `a` is not read.
Mat4 mat4_column_translation(Mat4 a, float x, float y, float z)
{
    (void)a;
    Mat4 m = mat4_identity();
    m.data[3] += x;
    m.data[7] += y;
    m.data[11] += z;
    return m;
}
GLFWwindow *glfw_window;
// Window functions
int init_glfw(const char *window_title, int x, int y, int width, int height);
void framebuffer_size_callback(GLFWwindow* window, int width, int height);
void processInput();
// Shader functions
static unsigned int compile_shader(unsigned int type, const char *source);
static unsigned int create_shader(const char *vertex_shader, const char *fragment_shader);
// Triangle functions
void init_triangle();
void draw_triangle();
void update_triangle();
unsigned int shader = -1;
unsigned int vao = -1;
unsigned int vbo = -1;
float vertices[] = {
-0.5f, -0.5f, 0.0f, // left
0.5f, -0.5f, 0.0f, // right
0.0f, 0.5f, 0.0f // top
const char *vshader = "#version 330 core\n"
"layout (location = 0) in vec3 aPos;\n"
"uniform mat4 transform;\n"
"void main()\n"
// " gl_Position = vec4(aPos, 1.0) * transform;\n" // Comment [0] -> Inverted for column-major
" gl_Position = transform * vec4(aPos, 1.0);\n" // Comment [1] -> Inverted for column-major
const char *fshader = "#version 330 core\n"
"out vec4 FragColor;\n"
"void main()\n"
" FragColor = vec4(1.0f, 0.5f, 0.2f, 1.0f);\n"
int main()
int result = init_glfw("LearnOpenGL", 0, 0, 800, 600);
if(result != 0)
return result;
while (!glfwWindowShouldClose(glfw_window))
// input
// Update triangle vertices
glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
// Draw triangle example
// glfw: swap buffers and poll IO events (keys pressed/released, mouse moved etc.)
// glfw: terminate, clearing all previously allocated GLFW resources.
return 0;
// My confusion is here
void update_triangle()
Mat4 trans = mat4_identity();
trans = mat4_column_translation(trans, 0.5f, 0.5f, 0.0f); // Comment [0]
// trans = mat4_row_translation(trans, 0.5f, 0.5f, 0.0f); // Comment [1]
// Print Mat4
if(c == 0)
// TODO: Remove this
printf("==== Trans: ====\n");
for(int i = 1; i <= 16; i++)
printf("%.2f, ",[i-1]);
if(i % 4 == 0 && i != 0)
unsigned int transformLoc = glGetUniformLocation(shader, "transform");
glUniformMatrix4fv(transformLoc, 1, GL_FALSE,; // Comment [0]
// glUniformMatrix4fv(transformLoc, 1, GL_TRUE,; // Comment [1]
// Window functions
int init_glfw(const char *window_title, int x, int y, int width, int height)
// glfw: initialize and configure
// ------------------------------
#ifdef __APPLE__
// glfw window creation
// --------------------
glfw_window = glfwCreateWindow(width, height, window_title, NULL, NULL);
if (glfw_window == NULL)
printf("Failed to create GLFW window\n");
return -1;
glfwSetFramebufferSizeCallback(glfw_window, framebuffer_size_callback);
// glad: load all OpenGL function pointers
// ---------------------------------------
int version = gladLoadGL(glfwGetProcAddress);
printf("Current GL loaded: %d.%d\n", GLAD_VERSION_MAJOR(version), GLAD_VERSION_MINOR(version));
return 0;
void framebuffer_size_callback(GLFWwindow* window, int width, int height)
glViewport(0, 0, width, height);
void processInput()
if(glfwGetKey(glfw_window, GLFW_KEY_ESCAPE) == GLFW_PRESS)
glfwSetWindowShouldClose(glfw_window, true);
/* Default Compilation for Shader */
static unsigned int compile_shader(unsigned int type, const char *source)
unsigned int id = glCreateShader(type);
glShaderSource(id, 1, &source, NULL);
int result;
glGetShaderiv(id, GL_COMPILE_STATUS, &result);
int length;
glGetShaderiv(id, GL_INFO_LOG_LENGTH, &length);
char* msg = (char*) alloca(length * sizeof(char));
glGetShaderInfoLog(id, length, &length, msg);
printf("Vertex / Fragment Shader Failed:\n %s", msg);
return 0;
return id;
static unsigned int create_shader(const char *vertex_shader, const char *fragment_shader)
unsigned int program = glCreateProgram();
unsigned int vs = compile_shader(GL_VERTEX_SHADER, vertex_shader);
unsigned int fs = compile_shader(GL_FRAGMENT_SHADER, fragment_shader);
glAttachShader(program, vs);
glAttachShader(program, fs);
return program;
// Triangle functions
void init_triangle()
shader = create_shader(vshader, fshader);
printf("shader=%d", shader);
glGenVertexArrays(1, &vao);
printf("vao=%d", vao);
glGenBuffers(1, &vbo);
printf("vbo=%d\n", vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo); // Using this vbo
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), NULL);
void draw_triangle()
glDrawArrays(GL_TRIANGLES, 0, 3);
This is my first question in this forum, so please let me know if there is anything missing.
So many people use row-major or transposed matrices, that they forget that matrices are not naturally oriented that way. So they see a translation matrix as this:
1 0 0 0
0 1 0 0
0 0 1 0
x y z 1
This is a transposed translation matrix. That is not what a normal translation matrix looks like. The translation goes in the 4th column, not the fourth row. Sometimes, you even see this in textbooks, which is utter garbage.
It's easy to know whether a matrix in an array is row or column-major. If it's row-major, then the translation is stored in the 3, 7, and 11th indices. If it's column-major, then the translation is stored in the 12, 13, and 14th indices. Zero-base indices of course.
Your confusion stems from believing that you're using column-major matrices when you're in fact using row-major ones.
The statement that row vs. column major is a notational convention only is entirely true. The mechanics of matrix multiplication and matrix/vector multiplication are the same regardless of the convention.
What changes is the meaning of the results.
A 4x4 matrix after all is just a 4x4 grid of numbers. It doesn't have to refer to a change of coordinate system. However, once you assign meaning to a particular matrix, you now need to know what is stored in it and how to use it.
Take the translation matrix I showed you above. That's a valid matrix. You could store that matrix in a float[16] in one of two ways:
float row_major_t[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, x, y, z, 1};
float column_major_t[16] = {1, 0, 0, x, 0, 1, 0, y, 0, 0, 1, z, 0, 0, 0, 1};
However, I said that this translation matrix is wrong, because the translation is in the wrong place. I specifically said that it is transposed relative to the standard convention for how to build translation matrices, which ought to look like this:
1 0 0 x
0 1 0 y
0 0 1 z
0 0 0 1
Let's look at how these are stored:
float row_major[16] = {1, 0, 0, x, 0, 1, 0, y, 0, 0, 1, z, 0, 0, 0, 1};
float column_major[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, x, y, z, 1};
Notice that column_major is exactly the same as row_major_t. So, if we take a proper translation matrix, and store it as column-major, it is the same as transposing that matrix and storing it as row-major.
That is what is meant by being only a notational convention. There are really two sets of conventions: memory storage and transposition. Memory storage is column vs row major, while transposition is normal vs. transposed.
If you have a matrix that was generated in row-major order, you can get the same effect by transposing the column-major equivalent of that matrix. And vice-versa.
Matrix multiplication can only be done one way: given two matrices, in a specific order, you multiply certain values together and store the results. Now, A*B != B*A, but the actual source code for A*B is the same as the code for B*A. They both run the same code to compute the output.
The matrix multiplication code does not care whether the matrices happen to be stored in column-major or row-major order.
The same cannot be said for vector/matrix multiplication. And here's why.
Vector/matrix multiplication is a falsehood; it cannot be done. However, you can multiply a matrix by another matrix. So if you pretend a vector is a matrix, then you can effectively do vector/matrix multiplication, simply by doing matrix/matrix multiplication.
A 4D vector can be considered a column-vector or a row-vector. That is, a 4D vector can be thought of as a 4x1 matrix (remember: in matrix notation, the row count comes first) or a 1x4 matrix.
But here's the thing: Given two matrices A and B, A*B is only defined if the number of columns of A is the same as the number of rows of B. Therefore, if A is our 4x4 matrix, B must be a matrix with 4 rows in it. Therefore, you cannot perform A*x, where x is a row-vector. Similarly, you cannot perform x*A where x is a column-vector.
Because of this, most matrix math libraries make this assumption: if you multiply a vector times a matrix, you really mean to do the multiplication that actually works, not the one that makes no sense.
Let us define, for any 4D vector x, the following. C shall be the column-vector matrix form of x, and R shall be the row-vector matrix form of x. Given this, for any 4x4 matrix A, A*C represents matrix multiplying A by the column-vector x. And R*A represents matrix multiplying the row-vector x by A.
But if we look at this using strict matrix math, we see that these are not equivalent. R*A cannot be the same as A*C. This is because a row-vector is not the same thing as a column-vector. They're not the same matrix, so they do not produce the same results.
However, they are related in one way. It is true that R != C. However, it is also true that R = C^T, where ^T is the transpose operation. The two matrices are transposes of each other.
Here's a funny fact. Since vectors are treated as matrices, they too have a column vs. row-major storage question. The problem is that they both look the same. The array of floats is the same, so you can't tell the difference between R and C just by looking at the data. The only way to tell the difference is by how they are used.
If you have any two matrices A and B, and A is stored as row-major and B as column-major, multiplying them is completely meaningless. You get nonsense as a result. Well, not really. Mathematically, what you get is the equivalent of doing A^T*B. Or A*B^T; they're mathematically identical.
Therefore, matrix multiplication only makes sense if the two matrices (and remember: vector/matrix multiplication is just matrix multiplication) are stored in the same major ordering.
So, is a vector column-major or row-major? It is both and neither, as stated before. It is column major only when it is used as a column matrix, and it is row major when it is used as a row matrix.
Therefore, if you have a matrix A which is column major, x*A means... nothing. Well, again, it means x*A^T, but that's not what you really wanted. Similarly, A*x does transposed multiplication if A is row-major.
Therefore, the order of vector/matrix multiplication does change, depending on your major ordering of the data (and whether you're using transposed matrices).

How to implement interactive rotation operations in a decent way

Recently, I want to achieve interactive rotation operations as can be done in meshlab:
Basically, it achieves rotation of three degrees of freedom. I visualize these operations as following codes with the help of GLFW:
static void mouse_move_callback(GLFWwindow* window, double xpos, double ypos){
//perform rotation operations only if keeping the right mouse key pressed
if(glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_RIGHT) == GLFW_RELEASE) {
g_clr_right_mouse = true;
/*clear mouse state once transferred from release state
to pressed state to prevent from a instant flicker*/
g_lastX = xpos;
g_lastY = ypos;
g_clr_right_mouse = false;
float xoffset = xpos - g_lastX; //let movement from down to top positive
float yoffset = g_lastY - ypos;
g_lastX = xpos;
g_lastY = ypos;
//do counterclockwise rotation around x-asis with movement in y direction
glm::mat4 r1 = glm::rotate(glm::mat4(), glm::radians(-yoffset * 0.5f), glm::vec3(1.0f,0.0f,0.0f));
//do counterclockwise rotation around y-asis with movement in x direction
glm::mat4 r2 = glm::rotate(glm::mat4(), glm::radians( xoffset * 0.5f), glm::vec3(0.0f,1.0f,0.0f));
glm::mat4 tmp = r2 * r1 * g_model;
for(int i=0; i<3; i++)
g_model[i] = tmp[i];
return ;
These codes are located here, and the whole project can be found here which can be downloaded and built. Finally, it performs as follows:
However, my implementation can only achieve rotation operations of 2 DOF, I add a keyboard callback to achieve rotation around the z axis:
void keyboard_callback(GLFWwindow* window, int key, int scancode, int action, int mod){
if(glfwGetKey(window, GLFW_KEY_LEFT) == GLFW_PRESS){
glm::mat4 r3 = glm::rotate(glm::mat4(), glm::radians(3.0f), glm::vec3(0,0,1.0f));
glm::mat4 tmp = r3 * g_model;
for(int i=0; i<3; i++)
g_model[i] = tmp[i];
}else if(glfwGetKey(window, GLFW_KEY_RIGHT) == GLFW_PRESS){
glm::mat4 r3 = glm::rotate(glm::mat4(), glm::radians(-3.0f), glm::vec3(0,0,1.0f));
glm::mat4 tmp = r3 * g_model;
for(int i=0; i<3; i++)
g_model[i] = tmp[i];
So my question is how to decently achieve interactive rotation operations of 3 DOF only with mouse movement?
When dragging the mouse, the object must be rotated around an axis that is perpendicular to the direction of movement of the mouse. The pivot is the origin of the model.
Rotate the mouse movement vector by 90 ° in the XY plane of the view. Since this is a vector in view space, the vector must be transformed from view space into world space. The matrix that transforms a vector from view space to world space is the inverse matrix of the upper left 3x3 of the view matrix:
vec2 drag_start;
vec2 drag_end;
glm::mat3 to_world = glm::inverse(glm::mat3(view_matrix));
glm::vec2 drag_vec = glm::vec2(drag_end.x - drag_start.x, drag_start.y - drag_end.y);
glm::vec3 axis_vec = glm::normalize(to_world * glm::vec3(-drag_vec.y, drag_vec.x, 0));
Create a rotation matrix around the axis. The angle depends on the length of the vector (height is the height of the viewport in pixels):
GLfloat angle = glm::length(drag_vec) / height / 2 * M_PI;
drag_rotation = glm::rotate(glm::mat4(1.0f), angle, axis_vec);
Compute a rotation matrix while dragging the mouse. Concatenate the rotation matrix and the model matrix after the drag ends:
glm::mat4 view_matrix(1.0f);
glm::mat4 model_rotation(1.0f);
glm::mat4 drag_rotation(1.0f);
glm::vec2 drag_start(0.0f);
bool drag = false;
void mouse_button_callback(GLFWwindow* window, int button, int action, int mods)
if (action == GLFW_PRESS)
drag = true;
double xpos, ypos;
glfwGetCursorPos(window, &xpos, &ypos);
drag_start = glm::vec2(xpos, ypos);
else if (action == GLFW_RELEASE)
drag = false;
model_rotation = drag_rotation * model_rotation;
drag_rotation = glm::mat4(1.0f);
// GLFW cursor-position callback.
// While a drag is active, recomputes drag_rotation from the vector between
// drag_start and the current cursor position. Does nothing when no drag is active.
void cursor_position_callback(GLFWwindow* window, double xpos, double ypos)
{
    if (!drag)
        return;

    // Window y grows downwards, so the y component is flipped.
    glm::vec2 drag_vec = glm::vec2(xpos - drag_start.x, drag_start.y - ypos);
    const float drag_len = glm::length(drag_vec);

    // A zero-length drag has no rotation axis (normalizing it would yield NaN),
    // and a zero viewport height would divide by zero: treat both as "no rotation".
    if (drag_len == 0.0f || height <= 0)
    {
        drag_rotation = glm::mat4(1.0f);
        return;
    }

    // Rotate the drag vector by 90 degrees in the view's XY plane and bring it
    // from view space into world space to obtain the rotation axis.
    glm::mat3 to_world = glm::inverse(glm::mat3(view_matrix));
    glm::vec3 axis_vec = glm::normalize(to_world * glm::vec3(-drag_vec.y, drag_vec.x, 0));

    // Angle is proportional to the drag length relative to the viewport height.
    GLfloat angle = static_cast<GLfloat>(drag_len / height / 2 * M_PI);
    drag_rotation = glm::rotate(glm::mat4(1.0f), angle, axis_vec);
}
The model matrix is the concatenation of drag_rotation and model_rotation:
glm::mat4 model = drag_rotation * model_rotation;
See also Orbit
Complete example:
#include <GL/glew.h>
#include <GL/gl.h>
#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>
#include <glm/gtc/type_ptr.hpp>
#include <GLFW/glfw3.h>
#include <vector>
#include <string>
#include <stdexcept>
#include <iostream>
#include <cmath>
#include <math.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
std::string sh_vert = R"(
#version 460 core
layout (location = 0) in vec4 a_position;
layout (location = 1) in vec3 a_uvw;
out vec3 v_uvw;
layout (location = 0) uniform mat4 u_projection;
layout (location = 1) uniform mat4 u_view;
layout (location = 2) uniform mat4 u_model;
void main()
v_uvw = a_uvw;
gl_Position = u_projection * u_view * u_model * a_position;
std::string sh_frag = R"(
#version 460 core
out vec4 frag_color;
in vec3 v_uvw;
vec3 HUEtoRGB(in float H)
float R = abs(H * 6.0 - 3.0) - 1.0;
float G = 2.0 - abs(H * 6.0 - 2.0);
float B = 2.0 - abs(H * 6.0 - 4.0);
return clamp(vec3(R, G, B), 0.0, 1.0);
void main()
frag_color = vec4(HUEtoRGB(v_uvw.z), 1.0);
class ShaderProgram
GLuint programObject;
static ShaderProgram newProgram(const std::string& vsh, const std::string& fsh);
GLuint compileShader(const std::string& sourceCode, GLenum shaderType);
void linkProgram(std::vector<GLuint> shObjs);
void compileStatus(GLuint shader);
void linkStatus();
class VertexArrayObject
GLuint vaoObject = 0;
GLsizei noOfVertices = 0;
GLsizei noOfIndices = 0;
static VertexArrayObject newCube();
static VertexArrayObject newCircles();
static VertexArrayObject newVAO(const std::vector<GLfloat>& varray, const std::vector<GLuint>& iarray);
int width = 800, height = 600;
glm::mat4 view_matrix(1.0f);
glm::mat4 model_rotation(1.0f);
glm::mat4 drag_rotation(1.0f);
glm::vec2 drag_start(0.0f);
bool drag = false;
void mouse_button_callback(GLFWwindow* window, int button, int action, int mods)
if (action == GLFW_PRESS)
drag = true;
double xpos, ypos;
glfwGetCursorPos(window, &xpos, &ypos);
drag_start = glm::vec2(xpos, ypos);
else if (action == GLFW_RELEASE)
drag = false;
model_rotation = drag_rotation * model_rotation;
drag_rotation = glm::mat4(1.0f);
// GLFW cursor-position callback.
// While a drag is active, recomputes drag_rotation from the vector between
// drag_start and the current cursor position. Does nothing when no drag is active.
void cursor_position_callback(GLFWwindow* window, double xpos, double ypos)
{
    if (!drag)
        return;

    // Window y grows downwards, so the y component is flipped.
    glm::vec2 drag_vec = glm::vec2(xpos - drag_start.x, drag_start.y - ypos);
    const float drag_len = glm::length(drag_vec);

    // A zero-length drag has no rotation axis (normalizing it would yield NaN),
    // and a zero framebuffer height would divide by zero: treat both as "no rotation".
    if (drag_len == 0.0f || height <= 0)
    {
        drag_rotation = glm::mat4(1.0f);
        return;
    }

    // Rotate the drag vector by 90 degrees in the view's XY plane and bring it
    // from view space into world space to obtain the rotation axis.
    glm::mat3 to_world = glm::inverse(glm::mat3(view_matrix));
    glm::vec3 axis_vec = glm::normalize(to_world * glm::vec3(-drag_vec.y, drag_vec.x, 0));

    // Angle is proportional to the drag length relative to the viewport height.
    GLfloat angle = static_cast<GLfloat>(drag_len / height / 2 * M_PI);
    drag_rotation = glm::rotate(glm::mat4(1.0f), angle, axis_vec);
}
int main(void)
if (glfwInit() == GLFW_FALSE)
throw std::runtime_error( "error initializing glfw" );
glfwWindowHint(GLFW_SAMPLES, 8);
GLFWwindow * window = glfwCreateWindow(width, height, "OGL window", nullptr, nullptr);
if (window == nullptr)
throw std::runtime_error( "error initializing window" );
glfwSetMouseButtonCallback(window, mouse_button_callback);
glfwSetCursorPosCallback(window, cursor_position_callback);
if ( glewInit() != GLEW_OK )
throw std::runtime_error( "error initializing glew" );
auto progam = ShaderProgram::newProgram(sh_vert, sh_frag);
auto cube = VertexArrayObject::newCube();
auto circles = VertexArrayObject::newCircles();
glEnable( GL_DEPTH_TEST );
glClearColor(0.1f, 0.3f, 0.2f, 0.0f);
view_matrix = glm::lookAt(glm::vec3(0.0f, 0.0f, 7.0f), glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 1.0f, 0.0f));
glUniformMatrix4fv(1, 1, GL_FALSE, glm::value_ptr(view_matrix));
while (!glfwWindowShouldClose(window))
glfwGetFramebufferSize(window, &width, &height);
float ascpect = (float)width / (float)height;
glm::mat4 project = glm::perspective(glm::radians(60.0f), ascpect, 0.1f, 20.0f);
glUniformMatrix4fv(0, 1, GL_FALSE, glm::value_ptr(project));
glm::mat4 model = drag_rotation * model_rotation;
glViewport(0, 0, width, height);
glUniformMatrix4fv(2, 1, GL_FALSE, glm::value_ptr(model));
glDrawElements(GL_TRIANGLES, cube.noOfIndices, GL_UNSIGNED_INT, nullptr);
glUniformMatrix4fv(2, 1, GL_FALSE, glm::value_ptr(glm::scale(model, glm::vec3(2.5f))));
glDrawElements(GL_LINES, circles.noOfIndices, GL_UNSIGNED_INT, nullptr);
return 0;
// Builds a ShaderProgram from vertex and fragment shader source code.
//
// vsh / fsh: GLSL source text of the vertex and fragment stage.
// Returns the program object wrapper; compile/link problems are reported by
// compileStatus()/linkStatus() rather than thrown.
//
// NOTE(review): the listing lost its braces and the bodies of both loops; the
// sequence below (check compile status, link, delete the shader objects) is a
// reconstruction -- confirm against the original source.
ShaderProgram ShaderProgram::newProgram(const std::string& vsh, const std::string& fsh)
{
    ShaderProgram program;
    const std::vector<GLuint> shObjs
    {
        program.compileShader(vsh, GL_VERTEX_SHADER),
        program.compileShader(fsh, GL_FRAGMENT_SHADER),
    };

    // Report compile errors of every stage before attempting to link.
    for (auto shObj : shObjs)
        program.compileStatus(shObj);

    program.linkProgram(shObjs);

    // The shader objects are no longer needed once the program has been linked.
    for (auto shObj : shObjs)
        glDeleteShader(shObj);

    return program;
}
// Creates a shader object of the given stage, uploads its source and compiles it.
//
// sourceCode: GLSL source text.
// shaderType: stage enum passed straight to glCreateShader (e.g. GL_VERTEX_SHADER).
// Returns the GL name of the shader object. The compile result is not checked
// here; use compileStatus() on the returned name.
GLuint ShaderProgram::compileShader(const std::string& sourceCode, GLenum shaderType)
{
    const GLuint shaderObj = glCreateShader(shaderType);
    const char* srcCodePtr = sourceCode.c_str();
    // A null length array tells GL the single source string is NUL-terminated.
    glShaderSource(shaderObj, 1, &srcCodePtr, nullptr);
    // The listing only uploaded the source; without this call the object is never compiled.
    glCompileShader(shaderObj);
    return shaderObj;
}
void ShaderProgram::linkProgram(std::vector<GLuint> shObjs)
programObject = glCreateProgram();
for (auto shObj : shObjs)
glAttachShader(programObject, shObj);
// Prints the info log of `shader` to std::cout when its compilation failed.
// Does nothing when the shader compiled successfully.
void ShaderProgram::compileStatus(GLuint shader)
{
    GLint status = GL_TRUE;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
    if (status == GL_FALSE)
    {
        GLint logLen = 0;
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &logLen);
        // One extra zeroed byte keeps the buffer non-empty and NUL-terminated
        // even if the driver reports a log length of 0.
        std::vector< char >log(static_cast<size_t>(logLen) + 1, '\0');
        GLsizei written = 0;
        glGetShaderInfoLog(shader, logLen, &written, log.data());
        std::cout << "compile error:" << std::endl << log.data() << std::endl;
    }
}
void ShaderProgram::linkStatus()
GLint status = GL_TRUE;
glGetProgramiv(programObject, GL_LINK_STATUS, &status);
if (status == GL_FALSE)
GLint logLen;
glGetProgramiv(programObject, GL_INFO_LOG_LENGTH, &logLen);
std::vector< char >log(logLen);
GLsizei written;
glGetProgramInfoLog(programObject, logLen, &written,;
std::cout << "link error:" << std::endl << << std::endl;
// Builds the geometry of a cube spanning [-1, 1] on every axis.
//
// Each of the 6 faces gets its own 4 vertices. A vertex is 6 floats: the xyz
// position followed by a second 3-component attribute (0, 0, faceIndex / 6).
// Every face is emitted as two triangles (6 indices).
// Returns the VAO created by newVAO() from these arrays.
VertexArrayObject VertexArrayObject::newCube()
{
    static const std::vector<GLfloat> vertices{ -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1 };
    static const std::vector<size_t> faces{ 0, 1, 2, 3, 1, 5, 6, 2, 5, 4, 7, 6, 4, 0, 3, 7, 3, 2, 6, 7, 1, 0, 4, 5 };

    const size_t faceCount = faces.size() / 4;

    std::vector<GLfloat> varray;
    std::vector<GLuint> iarray;
    varray.reserve(faceCount * 4 * 6);
    iarray.reserve(faceCount * 6);

    for (size_t si = 0; si < faceCount; si++)
    {
        // Same value for all 4 corners of a face: si / faceCount.
        const GLfloat faceId = static_cast<GLfloat>(si) * 4.0f / static_cast<GLfloat>(faces.size());

        for (size_t qi = 0; qi < 4; qi++)
        {
            const auto corner = vertices.begin() + faces[si * 4 + qi] * 3;
            varray.insert(varray.end(), corner, corner + 3);
            varray.insert(varray.end(), { 0.0f, 0.0f, faceId });
        }

        // Split the quad (0,1,2,3) into the triangles (0,1,2) and (0,2,3).
        const GLuint base = static_cast<GLuint>(4 * si);
        iarray.insert(iarray.end(), { base, base + 1, base + 2, base, base + 2, base + 3 });
    }

    return newVAO(varray, iarray);
}
// Builds three unit circles, one in each coordinate plane (YZ, ZX, XY), as line
// segments.
//
// For every angle step three vertices are appended (one per circle), each made
// of 6 floats: the xyz position followed by (0, 0, circleIndex / 3). Vertex k
// of circle ci therefore has index k * 3 + ci.
// Returns the VAO created by newVAO() from these arrays.
VertexArrayObject VertexArrayObject::newCircles()
{
    const GLuint noC = 360;

    std::vector<GLfloat> varray;
    std::vector<GLuint> iarray;

    // i runs up to and including noC, so the last point repeats the first one
    // (angle 2*pi); the vertex and index counts are kept as in the listing.
    for (GLuint i = 0; i <= noC; i++)
    {
        GLfloat angle = static_cast<GLfloat>(i * 2 * M_PI / noC);
        GLfloat c = cos(angle), s = sin(angle);
        varray.insert(varray.end(), {
            0, c, s, 0, 0, 0,
            s, 0, c, 0, 0, 1.0f / 3.0f,
            c, s, 0, 0, 0, 2.0f / 3.0f });
    }

    for (GLuint ci = 0; ci < 3; ci++)
    {
        for (GLuint i = 0; i <= noC; i++)
        {
            // One line segment from point i to its successor, wrapping at noC.
            iarray.insert(iarray.end(), { i * 3 + ci, ((i + 1) % noC) * 3 + ci });
        }
    }

    return newVAO(varray, iarray);
}
// Creates a vertex array object from interleaved vertex data and optional indices.
//
// varray: 6 floats per vertex -- attribute 0 (3 floats) followed by attribute 1 (3 floats).
// iarray: element indices; when empty no element buffer is created.
// Returns a VertexArrayObject with vaoObject, noOfVertices and noOfIndices set.
//
// NOTE(review): the listing lost the data() arguments as well as the VAO
// bind/unbind and attribute-enable calls; they are reconstructed here -- confirm
// against the original source.
VertexArrayObject VertexArrayObject::newVAO(const std::vector<GLfloat>& varray, const std::vector<GLuint>& iarray)
{
    VertexArrayObject vao;
    vao.noOfIndices = static_cast<GLsizei>(iarray.size());
    vao.noOfVertices = static_cast<GLsizei>(varray.size() / 6);

    GLuint bufferObjects[2];
    glGenBuffers(2, bufferObjects);
    glGenVertexArrays(1, &vao.vaoObject);
    glBindVertexArray(vao.vaoObject);

    const GLsizei stride = 6 * sizeof(GLfloat);
    glBindBuffer(GL_ARRAY_BUFFER, bufferObjects[0]);
    glBufferData(GL_ARRAY_BUFFER, varray.size() * sizeof(GLfloat), varray.data(), GL_STATIC_DRAW);
    glEnableVertexAttribArray(0);
    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, stride, nullptr);
    glEnableVertexAttribArray(1);
    glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, stride, reinterpret_cast<void*>(3 * sizeof(GLfloat)));

    if (vao.noOfIndices > 0)
    {
        // The element buffer binding is recorded in the currently bound VAO.
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, bufferObjects[1]);
        glBufferData(GL_ELEMENT_ARRAY_BUFFER, iarray.size() * sizeof(GLuint), iarray.data(), GL_STATIC_DRAW);
    }

    // Unbind the VAO before deleting the buffer names: the VAO keeps the
    // buffers alive, and deleting them while the VAO is still bound would
    // detach them from it.
    glBindVertexArray(0);
    glDeleteBuffers(2, bufferObjects);

    return vao;
}
You kinda need to draw the ball to make it intuitive.
On mouse down, you put an anchor on the ball directly under the mouse pointer. If the click is outside the ball, then you use the closest point on the ball.
As the mouse moves, you rotate the ball so that the anchor point follows the shortest path so that it remains directly under the mouse pointer. If the mouse pointer is off the ball, then the closest point on the ball is used.
Maybe this will help.

Instancing with OpenGL 3.3 seems very slow

I wrote a minimal code sample in C++ that renders 10000 colored
quads on the screen. I am using "instancing", so I only update
the model matrix for each quad each frame. The data of the 6 vertices
is stored in an individual VBO and is reused all the time.
The projection matrix (orthographic) is injected once at program start
via a uniform. The model matrix is calculated on the CPU with the GLM library.
I measured the rendering time and got an average of only 52 FPS.
I think this is far too low, but I cannot find the mistake/bottleneck in my little sample program.
After some analysis it seems that the 3 calculations done with GLM
are very slow. Am I doing something wrong here? For example, if
I remove the rotation calculation, I get a boost of 10 FPS!
Maybe you can help me find out what I can do better here and how
I can optimize my sample. It is important to me that each quad is individually configurable at runtime, so I decided to use instancing.
Moving the matrix calculations to the GPU seems to be another option, but I am really confused about why the CPU has so much trouble calculating the 10000
model matrices! OK, my CPU is very slow (Athlon 2 Core-Duo M300, GPU is ATI Mobility Radeon 4100), but it should do this task in no measurable time, shouldn't it?
Here is a minimal, fully working, compilable example (if you have GLFW and GLM).
Maybe someone has some time and can help me out here :)
#define GLM_FORCE_SSE2
#include "glew.h"
#include "glfw3.h"
#include "glm.hpp"
#include "glm/gtc/matrix_transform.hpp"
#include <conio.h>
#include <cstdlib>
#include <iostream>
#include <ctime>
// Builds the instancing shader program from the embedded GLSL sources.
//
// The vertex shader transforms each vertex by the per-instance modelMatrix and
// the projectionMatrix uniform and forwards gl_Position as the color; the
// fragment shader writes that color with an alpha of 0.5.
// Returns the GL name of the program object. Compile and link results are not
// checked and the program is not made current here.
//
// NOTE(review): the listing lost the closing lines of both shader strings and
// the compile/link calls; they are reconstructed below -- confirm against the
// original source.
GLuint buildShader()
{
    std::string strVSCode =
        "#version 330 core\n"
        "in vec3 vertexPosition;\n"
        "in mat4 modelMatrix;\n"
        "uniform mat4 projectionMatrix;\n"
        "out vec4 m_color;\n"
        "void main() {\n"
        " vec4 vecVertex = vec4(vertexPosition, 1);\n"
        " gl_Position = projectionMatrix * modelMatrix * vecVertex;\n"
        " m_color = gl_Position;\n"
        "}\n";

    std::string strFSCode = "#version 330 core\n"
        "out vec4 frag_colour;\n"
        "in vec4 m_color;\n"
        "void main() {\n"
        " frag_colour = vec4(m_color.x, m_color.y, m_color.z, 0.5f);\n"
        "}\n";

    GLuint gluiVertexShaderId = glCreateShader(GL_VERTEX_SHADER);
    char const * VertexSourcePointer = strVSCode.c_str();
    glShaderSource(gluiVertexShaderId, 1, &VertexSourcePointer, NULL);
    glCompileShader(gluiVertexShaderId);

    GLuint gluiFragmentShaderId = glCreateShader(GL_FRAGMENT_SHADER);
    char const * FragmentSourcePointer = strFSCode.c_str();
    glShaderSource(gluiFragmentShaderId, 1, &FragmentSourcePointer, NULL);
    glCompileShader(gluiFragmentShaderId);

    GLuint gluiProgramId = glCreateProgram();
    glAttachShader(gluiProgramId, gluiVertexShaderId);
    glAttachShader(gluiProgramId, gluiFragmentShaderId);
    glLinkProgram(gluiProgramId);

    // The shader objects are only flagged for deletion while attached; the
    // linked program stays usable.
    glDeleteShader(gluiVertexShaderId);
    glDeleteShader(gluiFragmentShaderId);

    return gluiProgramId;
}
struct Sprite
glm::vec3 position, dimension;
float speed, rotation, rx, ry;
// Position-only vertex: three tightly packed floats, uploaded as-is to the vertex VBO.
struct Vertex
{
    float x = 0.0f;
    float y = 0.0f;
    float z = 0.0f;

    // Needed so that arrays such as `Vertex VertexBufferData[6];` can be declared.
    Vertex() = default;
    Vertex(float x, float y, float z) : x(x), y(y), z(z) {}
};
// Entry point of the instancing benchmark: opens a full-screen GLFW window,
// uploads one quad plus a per-instance model-matrix buffer, then animates and
// draws iActInstanceCount quads per frame while printing an FPS figure.
// NOTE(review): the braces of this listing are not visible here, and no calls
// to glfwInit/glewInit, to make the context current, to bind the VAO or
// program, or to clear/swap buffers appear below -- confirm against the
// original listing.
int main(int arc, char **argv)
// GLFW init
int displayResWith = 1366; //modify this here
int displayResHeight = 768; //modify this here
glfwWindowHint(GLFW_RED_BITS, 8);
glfwWindowHint(GLFW_GREEN_BITS, 8);
glfwWindowHint(GLFW_BLUE_BITS, 8);
glfwWindowHint(GLFW_ALPHA_BITS, 8);
glfwWindowHint(GLFW_DEPTH_BITS, 32);
// NOTE(review): 32 stencil bits is an unusual request (8 is typical) -- confirm it is intended.
glfwWindowHint(GLFW_STENCIL_BITS, 32);
// Passing the primary monitor requests a full-screen window.
GLFWwindow* window = glfwCreateWindow(displayResWith, displayResHeight,"Instancing", glfwGetPrimaryMonitor(),NULL);
int width, height;
glfwGetFramebufferSize(window, &width, &height);
//GLEW init
glewExperimental = GL_TRUE;
const GLubyte* renderer = glGetString(GL_RENDERER);
const GLubyte* version = glGetString(GL_VERSION);
std::cout << "Renderer: " << renderer << std::endl;
std::cout << "OpenGL supported version: " << version << std::endl;
//OpenGL init
// glClearColor clamps its arguments to [0, 1], so 255.0f behaves like 1.0f (opaque white).
glClearColor(255.0f, 255.0f, 255.0f, 255.0f);
GLuint programID = buildShader();
//VBO vertexBuffer
GLuint vertexBuffer;
glGenBuffers(1, &vertexBuffer);
glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
// Unit quad centered on the origin, as two triangles (6 vertices).
Vertex VertexBufferData[6];
VertexBufferData[0] = Vertex(-0.5f, 0.5f, 0.0f); //top left
VertexBufferData[1] = Vertex(-0.5f, -0.5f, 0.0f); //bottom left
VertexBufferData[2] = Vertex(0.5f, -0.5f, 0.0f); //bottom right
VertexBufferData[3] = VertexBufferData[2]; //bottom right
VertexBufferData[4] = Vertex(0.5f, 0.5f, 0.0f); //top right
VertexBufferData[5] = VertexBufferData[0]; //top left
glBufferData(GL_ARRAY_BUFFER, sizeof(Vertex)*6, VertexBufferData, GL_STATIC_DRAW);
//VBO instanceBuffer
GLuint instanceBuffer;
glGenBuffers(1, &instanceBuffer);
glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
// CPU-side staging array and GPU storage are both sized for the maximum
// instance count (30000), although only iActInstanceCount matrices are used.
int iMaxInstanceCount = 30000;
glm::mat4 *ptrInstanceBufferData = new glm::mat4[iMaxInstanceCount];
glBufferData(GL_ARRAY_BUFFER, iMaxInstanceCount * sizeof(glm::mat4), NULL, GL_STREAM_DRAW);
//VAO - Start
GLuint vertexArrayObject;
glGenVertexArrays(1, &vertexArrayObject);
//For VBO vertexbuffer
glEnableVertexAttribArray(glGetAttribLocation(programID, "vertexPosition"));
glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
// NOTE(review): the next line ends in a comma and looks like the remains of a
// glVertexAttribPointer(...) call whose other lines are missing from this listing.
glGetAttribLocation(programID, "vertexPosition"),
// NOTE(review): attribute index 0 is hard-coded here, while the calls above query the location.
glVertexAttribDivisor(0, 0);
//For VBO instanceBuffer
// A mat4 attribute occupies four consecutive locations, one vec4 column each.
int pos = glGetAttribLocation(programID, "modelMatrix");
int pos1 = pos + 0;
int pos2 = pos + 1;
int pos3 = pos + 2;
int pos4 = pos + 3;
glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
// Stride is one whole matrix (16 floats); column c starts at float offset 4 * c.
glVertexAttribPointer(pos1, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(0));
glVertexAttribPointer(pos2, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 4));
glVertexAttribPointer(pos3, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 8));
glVertexAttribPointer(pos4, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 12));
// Divisor 1: the matrix columns advance once per instance instead of once per vertex.
glVertexAttribDivisor(pos1, 1);
glVertexAttribDivisor(pos2, 1);
glVertexAttribDivisor(pos3, 1);
glVertexAttribDivisor(pos4, 1);
glBindVertexArray(0); //VAO - End
//Matrix vars
// NOTE(review): Identity is only default-constructed; whether glm::mat4's default
// constructor yields the identity matrix depends on the GLM version/configuration -- confirm.
glm::mat4 Projection, Rotating, Scaling, Translation, Identity;
glm::vec3 ZRotateVec(0.0f, 0.0f, 1.0f);
//Calc projection-matrix and put shader (uniform)
Projection = glm::ortho(0.0f, (float)width, 0.0f, (float)height, 0.0f, 1.0f);
glUniformMatrix4fv(glGetUniformLocation(programID, "projectionMatrix"), 1, GL_FALSE, &Projection[0][0]);
//Creating sprites
std::srand(static_cast<unsigned int>(std::time(0)));
int iActInstanceCount = 10000;
Sprite *ptrSprites = new Sprite[iActInstanceCount];
for (int i = 0; i < iActInstanceCount; ++i)
ptrSprites[i].dimension = glm::vec3(16, 16, 1.0f);
// Random position inside the framebuffer; z is drawn from (-1, 0].
ptrSprites[i].position = glm::vec3(std::rand()%(width-32),std::rand()%(height-32),-1.0f *((std::rand()%256)/256.0f));
ptrSprites[i].rotation = rand() % 360 + 0.0f;
// rx / ry are direction flags stored as floats: 1 = moving in +x / +y, 0 = moving in -x / -y.
ptrSprites[i].rx = static_cast<float>(std::rand() % 2);
ptrSprites[i].ry = static_cast<float>(std::rand() % 2);
ptrSprites[i].speed = (std::rand() % 100) + 1.0f;
// Redundant guard: the expression above already yields a value of at least 1.0f.
if (ptrSprites[i].speed < 1.0f) ptrSprites[i].speed = 1.0f;
//FPS init
double fFramesRendered = 0.0f;
double fFrameMeasurementStart = 0.0f;
double fFPS = 0.0f;
double fCurrentTime = 0.0f;
//Main-loop (also renderloop)
while (!glfwWindowShouldClose(window))
if (glfwGetKey(window, GLFW_KEY_ESCAPE)== GLFW_PRESS)
glfwSetWindowShouldClose(window, GL_TRUE);
// Seconds elapsed since the previous frame.
const double fNewTime = glfwGetTime();
double fDeltaTime = fNewTime - fCurrentTime;
fCurrentTime = fNewTime;
for (int i = 0; i < iActInstanceCount; ++i)
// The same per-frame step is used for both the rotation and the movement.
float fSpeed = ptrSprites[i].speed * static_cast<float>(fDeltaTime);
ptrSprites[i].rotation += fSpeed;
if (ptrSprites[i].rotation >= 360.0f) ptrSprites[i].rotation = 0.0f;
if (ptrSprites[i].rx == 1) ptrSprites[i].position.x = ptrSprites[i].position.x + fSpeed;
if (ptrSprites[i].rx == 0) ptrSprites[i].position.x = ptrSprites[i].position.x - fSpeed;
if (ptrSprites[i].ry == 1) ptrSprites[i].position.y = ptrSprites[i].position.y + fSpeed;
if (ptrSprites[i].ry == 0) ptrSprites[i].position.y = ptrSprites[i].position.y - fSpeed;
// Reverse direction when a sprite reaches an edge of the framebuffer.
if (ptrSprites[i].position.x <= 0) ptrSprites[i].rx = 1;
if (ptrSprites[i].position.x + ptrSprites[i].dimension.x >= width) ptrSprites[i].rx = 0;
if (ptrSprites[i].position.y <= 0) ptrSprites[i].ry = 1;
if (ptrSprites[i].position.y + ptrSprites[i].dimension.y >= height) ptrSprites[i].ry = 0;
//matrix-calculations (saved in local buffer)
// Model matrix = translate(to the sprite's center) * scale(dimension) * rotate(about Z).
// NOTE(review): rotation is passed through unchanged; whether glm::rotate expects
// degrees or radians depends on the GLM version/configuration -- confirm.
Translation = glm::translate(Identity, ptrSprites[i].position + glm::vec3(ptrSprites[i].dimension.x / 2.0f, ptrSprites[i].dimension.y / 2.0f, 0.0f));
Scaling = glm::scale(Translation, ptrSprites[i].dimension);
ptrInstanceBufferData[i] = glm::rotate(Scaling, ptrSprites[i].rotation, ZRotateVec);
// Re-specify the buffer storage, then upload only the matrices actually in use.
glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
glBufferData(GL_ARRAY_BUFFER, iMaxInstanceCount * sizeof(glm::mat4), NULL, GL_STREAM_DRAW); // Buffer orphaning
glBufferSubData(GL_ARRAY_BUFFER, 0, iActInstanceCount * sizeof(glm::mat4), ptrInstanceBufferData);
glDrawArraysInstanced(GL_TRIANGLES, 0, 6, iActInstanceCount);
// Report once at least one second has passed since the last report.
// NOTE(review): the expression below multiplies the elapsed seconds by the frame
// count; frames per second would be fFramesRendered divided by the elapsed seconds.
// No increment of fFramesRendered is visible in this listing either.
if ((fCurrentTime*1000.0f) >= (fFrameMeasurementStart*1000.0f) + 1000.0f)
fFPS = ((fCurrentTime*1000.0f) - (fFrameMeasurementStart*1000.0f)) / 1000.0f * fFramesRendered;
fFrameMeasurementStart = fCurrentTime;
fFramesRendered = 0;
std::cout << "FPS: " << fFPS << std::endl;
//Termination and cleanup
// NOTE(review): ptrInstanceBufferData and ptrSprites were allocated with new[]
// and no matching delete[] is visible in this listing.
glDeleteBuffers(1, &vertexBuffer);
glDeleteBuffers(1, &instanceBuffer);
glDeleteVertexArrays(1, &vertexArrayObject);
// Waits for a key press; the key code becomes the process exit status.
return _getch();
Well, after testing it on my machine, it is definitely CPU limited, so nothing you do with OGL is going to make much difference. I get about ~300fps with GCC on at least -O1, but only ~80 with -O0. My CPU is very fast (i7 2600k, 4.7ghz), but my GPU is rather slow (GT 520). I'm also on Ubuntu.
Some quick ideas for things that might speed it up a little:
Put the vertex positions in an array in the vertex shader and use gl_VertexID to access them
Use radians for angles, as otherwise GLM has to convert them
None of these are likely to make much of any impact, really. Just make sure your compiler is set up right, and there probably isn't much more to do.

How to ripple on a sphere

I'm trying to implement a program that turns a cube into a sphere based on key presses, and ripples whenever it's clicked. I managed to implement the cube-to-sphere-and-back part, but I have completely no idea where to start on the rippling. I've looked at tons of sources online, I get the math, but I have no idea how to implement it on my vertex shader. Can anyone help me with my dilemma? Thank you!
Here's my cpp, vsh, and fsh:
I'm using GLSL, OpenGL 4.4.0
Here's my code for the vertex shader:
#version 120
attribute vec3 pos;
varying vec4 out_color;
uniform float t;
float PI = 3.14159265357;
int factor = 2; //for determining colors
int num_colors; // = factor * 3 (because RGB)
float currang = 0;
float angfac;
vec4 calculate( float a )
//this is just to calculate for the color
// Vertex shader entry point: colors the vertex by its angle around the Y axis
// and rotates the position by t about the X and Y axes before projecting it.
void main() {
    num_colors = factor*3;
    angfac = 2*PI/num_colors;
    // Angle of the vertex in the XZ plane, shifted into [0, 2*PI].
    float ang = atan( pos.z, pos.x )+PI;
    out_color = calculate(ang);

    mat3 rotateX = mat3(
        vec3( 1, 0, 0),
        vec3( 0, cos(t), sin(t)),
        vec3( 0, -sin(t), cos(t))
    );
    mat3 rotateY = mat3(
        vec3( cos(t), 0, -sin(t)),
        vec3( 0, 1, 0),
        vec3( sin(t), 0, cos(t))
    );
    // A rotation about Z leaves the z coordinate untouched, so the last column
    // must be (0, 0, 1); the listing had cos(t) there, which would scale z.
    // rotateZ is defined for completeness but not applied below.
    mat3 rotateZ = mat3(
        vec3( cos(t), sin(t), 0),
        vec3(-sin(t), cos(t), 0),
        vec3( 0, 0, 1)
    );

    // NOTE(review): the operand in front of "*rotateY" was lost in the listing;
    // pos.xyz is a reconstruction -- confirm against the original shader.
    gl_Position = gl_ModelViewProjectionMatrix * vec4((pos.xyz*rotateY*rotateX) , 1.0 );
}
and here's parts of my cpp file:
//usual include statements
using namespace std;
// Vertex attribute slots used with glVertexAttribPointer.
enum { ATTRIB_POS };
// GL name of the shader program (0 = none yet).
GLuint mainProgram = 0;
// I use this to indicate the position of the vertices
struct Vtx {
    GLfloat x, y, z;
};
const GLfloat PI = 3.14159265357;
// Number of grid points along one edge of the cube.
const int sideLength = 10;
// Points on the surface of the cube grid: all points minus the interior ones.
const size_t nVertices = (sideLength*sideLength*sideLength)-((sideLength-2)*(sideLength-2)*(sideLength-2));
Vtx cube[nVertices];   // vertices currently drawn (moved by transform())
Vtx sphere[nVertices]; // target positions on the sphere
Vtx diff[nVertices];   // per-vertex step added/subtracted by transform()
const double TIME_SPEED = 0.01;
int mI = 4*(sideLength-1);
const int sLCubed = sideLength*sideLength*sideLength;
// NOTE(review): declared as int but passed to glDrawElements with
// GL_UNSIGNED_INT; all stored values must therefore be non-negative.
int indices[nVertices*nVertices];
GLfloat originX = 0.0f; //offset
GLfloat originY = 0.0f; //offset
bool loadShaderSource(GLuint shader, const char *path) {...}
void checkShaderStatus(GLuint shader) {...}
bool initShader() {...}
//in this part of the code, I instantiate an array of indices to be used by glDrawElements()
void transform(int fac)
//move from cube to sphere and back by adding/subtracting values and updating cube[].xyz
//moveSpeed = diff[]/speedFac
//fac is to determine direction (going to sphere or going to cube; going to sphere is plus, going back to cube is minus)
for( int i = 0; i<nVertices; i++ )
cube[i].x += fac*diff[i].x;
cube[i].y += fac*diff[i].y;
cube[i].z += fac*diff[i].z;
void initCube() {...} //computation for the vertices of the cube depending on sideLength
void initSphere() {...} //computation for the vertices of the sphere based on the vertices of the cube
void toSphere() {...} //changes the values of the array of vertices of the cube to those of the sphere
void initDiff() {...} //computes for the difference of the values of the vertices of the sphere and the vertices of the cube for the slow transformation
// Entry point: sets up the window and shader, then redraws the cube/sphere
// mesh every frame while polling the keys that morph or reset it.
// NOTE(review): the window-less glfw* calls below (glfwSetWindowTitle(title),
// glfwEnable, glfwGetKey(key), glfwGetWindowParam) match the GLFW 2.x API.
// Several closing braces and the buffer-swap call are not visible in this listing.
int main() {
//error checking (GLEW, OpenGL versions, etc)
glfwSetWindowTitle("CS177 Final Project");
glfwEnable( GLFW_STICKY_KEYS );
glfwSwapInterval( 1 );
if ( !initShader() ) {
return -1;
// The attribute pointer refers directly to the cube[] array in client memory.
// NOTE(review): presumably no VBO is bound, so edits made by transform() are
// picked up by the next draw call -- confirm.
glVertexAttribPointer(ATTRIB_POS, 3, GL_FLOAT, GL_FALSE, sizeof(Vtx), cube);
GLuint UNIF_T = glGetUniformLocation(mainProgram, "t");
float t = 0;
glUniform1f(UNIF_T, t);
// Additive blending: source and destination colors are summed.
glBlendFunc(GL_ONE, GL_ONE);
do {
int width, height;
glfwGetWindowSize( &width, &height );
glViewport( 0, 0, width, height );
// NOTE(review): t is uploaded every frame, but no statement changing t is visible in this listing.
glUniform1f(UNIF_T, t);
// DEL / INSERT morph one step towards the cube / sphere; HOME / END jump to either shape.
if (glfwGetKey(GLFW_KEY_DEL)) transform(-1);
if (glfwGetKey(GLFW_KEY_INSERT)) transform( 1 );
if (glfwGetKey(GLFW_KEY_HOME)) initCube();
if (glfwGetKey(GLFW_KEY_END)) toSphere();
// Draws nVertices*nVertices indices, i.e. the full size of the indices[] array.
glDrawElements( GL_TRIANGLES, nVertices*nVertices, GL_UNSIGNED_INT, indices);
} while ( glfwGetKey(GLFW_KEY_ESC) != GLFW_PRESS &&
glfwGetWindowParam(GLFW_OPENED) );
return 0;

Image Processing with GLSL shaders?

I am enrolled in a shaders course and am interested in computer vision and image processing. I was wondering how I can combine my knowledge of GLSL shaders with image processing. What do I gain if I implement image processing algorithms in GLSL?
Case study: real time box blur on CPU vs GPU fragment shader
I have implemented a simple box blur algorithm on CPU and GPU fragment shader to see which was faster:
demo video
source code:
My camera refresh rate capped FPS to 30, so I measured how wide the box could be and still keep 30 FPS.
On a Lenovo T430 (2012), NVIDIA NVS5400, Ubuntu 16.04 with image dimensions 960x540, the maximum widths were:
GPU: 23
CPU: 5
Since the computation is quadratic, the speedup was:
( 23 / 5 ) ^ 2 = 21.16
faster on GPU than CPU!
Not all algorithms are faster on the GPU. For example, operations that act on single pictures like swapping RGB reach 30FPS on the CPU, so it is useless to add the complexity of GPU programming to it.
Like any other CPU vs GPU speedup question, it all comes down to whether you have enough work per byte transferred to the GPU, and benchmarking is the best thing you can do. In general, quadratic algorithms or worse are a good bet for the GPU. See also: What do the terms "CPU bound" and "I/O bound" mean?
Main part of the code (just clone from GitHub):
#include "common.h"
#include "../v4l2/common_v4l2.h"
static const GLuint WIDTH = 640;
static const GLuint HEIGHT = 480;
static const GLfloat vertices[] = {
/* xy uv */
-1.0, 1.0, 0.0, 1.0,
0.0, 1.0, 0.0, 0.0,
0.0, -1.0, 1.0, 0.0,
-1.0, -1.0, 1.0, 1.0,
static const GLuint indices[] = {
0, 1, 2,
0, 2, 3,
static const GLchar *vertex_shader_source =
"#version 330 core\n"
"in vec2 coord2d;\n"
"in vec2 vertexUv;\n"
"out vec2 fragmentUv;\n"
"void main() {\n"
" gl_Position = vec4(coord2d, 0, 1);\n"
" fragmentUv = vertexUv;\n"
static const GLchar *fragment_shader_source =
"#version 330 core\n"
"in vec2 fragmentUv;\n"
"out vec3 color;\n"
"uniform sampler2D myTextureSampler;\n"
"void main() {\n"
" color = texture(myTextureSampler, fragmentUv.yx).rgb;\n"
static const GLchar *vertex_shader_source2 =
"#version 330 core\n"
"in vec2 coord2d;\n"
"in vec2 vertexUv;\n"
"out vec2 fragmentUv;\n"
"void main() {\n"
" gl_Position = vec4(coord2d + vec2(1.0, 0.0), 0, 1);\n"
" fragmentUv = vertexUv;\n"
static const GLchar *fragment_shader_source2 =
"#version 330 core\n"
"in vec2 fragmentUv;\n"
"out vec3 color;\n"
"uniform sampler2D myTextureSampler;\n"
"// pixel Delta. How large a pixel is in 0.0 to 1.0 that textures use.\n"
"uniform vec2 pixD;\n"
"void main() {\n"
/*"// Identity\n"*/
/*" color = texture(myTextureSampler, fragmentUv.yx ).rgb;\n"*/
/*"// Inverter\n"*/
/*" color = 1.0 - texture(myTextureSampler, fragmentUv.yx ).rgb;\n"*/
/*"// Swapper\n"*/
/*" color = texture(myTextureSampler, fragmentUv.yx ).gbr;\n"*/
/*"// Double vision ortho.\n"*/
/*" color = ("*/
/*" texture(myTextureSampler, fragmentUv.yx ).rgb +\n"*/
/*" texture(myTextureSampler, fragmentUv.xy ).rgb\n"*/
/*" ) / 2.0;\n"*/
/*"// Multi-me.\n"*/
/*" color = texture(myTextureSampler, 4.0 * fragmentUv.yx ).rgb;\n"*/
/*"// Horizontal linear blur.\n"*/
/*" int blur_width = 21;\n"*/
/*" int blur_width_half = blur_width / 2;\n"*/
/*" color = vec3(0.0, 0.0, 0.0);\n"*/
/*" for (int i = -blur_width_half; i <= blur_width_half; ++i) {\n"*/
/*" color += texture(myTextureSampler, vec2(fragmentUv.y + i * pixD.x, fragmentUv.x)).rgb;\n"*/
/*" }\n"*/
/*" color /= blur_width;\n"*/
/*"// Square linear blur.\n"*/
" int blur_width = 23;\n"
" int blur_width_half = blur_width / 2;\n"
" color = vec3(0.0, 0.0, 0.0);\n"
" for (int i = -blur_width_half; i <= blur_width_half; ++i) {\n"
" for (int j = -blur_width_half; j <= blur_width_half; ++j) {\n"
" color += texture(\n"
" myTextureSampler, fragmentUv.yx + ivec2(i, j) * pixD\n"
" ).rgb;\n"
" }\n"
" }\n"
" color /= (blur_width * blur_width);\n"
/* Shows the webcam image twice, side by side: unmodified on the left and
 * processed on the right. The processing is a box blur done either on the CPU
 * (argv[3] starts with '1') or in the fragment shader (default).
 *
 * Usage: prog [width [height [cpu]]]
 *
 * NOTE(review): this listing is incomplete -- closing braces, most variable
 * declarations, the glTexImage2D call heads, VAO/EBO binds, glUseProgram,
 * buffer swaps and part of the cleanup are not visible. Those gaps are left
 * as found; only the CPU blur bounds check and source index are corrected. */
int main(int argc, char **argv) {
CommonV4l2 common_v4l2;
GLFWwindow *window;
/* NOTE(review): the declarators of the next line (and the declarations of
 * program, the *_location variables, vbo, ebo, vao, vao2, texture, width,
 * height and cpu) are missing from the listing. */
unsigned int
uint8_t *image;
float *image2 = NULL;
/*uint8_t *image2 = NULL;*/
/* Optional arguments: width, height, and whether to blur on the CPU. */
if (argc > 1) {
width = strtol(argv[1], NULL, 10);
} else {
width = WIDTH;
if (argc > 2) {
height = strtol(argv[2], NULL, 10);
} else {
height = HEIGHT;
if (argc > 3) {
cpu = (argv[3][0] == '1');
} else {
cpu = 0;
/* Window system. */
/* The window is twice the image width: original on the left, result on the right. */
window = glfwCreateWindow(2 * width, height, __FILE__, NULL, NULL);
CommonV4l2_init(&common_v4l2, COMMON_V4L2_DEVICE, width, height);
/* Shader setup. */
program = common_get_shader_program(vertex_shader_source, fragment_shader_source);
coord2d_location = glGetAttribLocation(program, "coord2d");
vertexUv_location = glGetAttribLocation(program, "vertexUv");
myTextureSampler_location = glGetUniformLocation(program, "myTextureSampler");
/* Shader setup 2. */
/* In CPU mode the right half uses the pass-through fragment shader, because
 * the image has already been processed before upload. */
const GLchar *fs;
if (cpu) {
fs = fragment_shader_source;
} else {
fs = fragment_shader_source2;
program2 = common_get_shader_program(vertex_shader_source2, fs);
coord2d_location2 = glGetAttribLocation(program2, "coord2d");
vertexUv_location2 = glGetAttribLocation(program2, "vertexUv");
myTextureSampler_location2 = glGetUniformLocation(program2, "myTextureSampler");
pixD_location2 = glGetUniformLocation(program2, "pixD");
/* Create vbo. */
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
/* Create ebo. */
glGenBuffers(1, &ebo);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
/* vao. */
/* Interleaved layout: 2 floats of position followed by 2 floats of uv. */
glGenVertexArrays(1, &vao);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer(coord2d_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices[0]), (GLvoid*)0);
glVertexAttribPointer(vertexUv_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), (GLvoid*)(2 * sizeof(vertices[0])));
/* vao2. */
glGenVertexArrays(1, &vao2);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer(coord2d_location2, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices[0]), (GLvoid*)0);
glVertexAttribPointer(vertexUv_location2, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), (GLvoid*)(2 * sizeof(vertices[0])));
/* Texture buffer. */
glGenTextures(1, &texture);
glBindTexture(GL_TEXTURE_2D, texture);
/* Constant state. */
glViewport(0, 0, 2 * width, height);
glClearColor(1.0f, 1.0f, 1.0f, 1.0f);
/* Main loop. */
do {
/* Blocks until an image is available, thus capping FPS to that.
 * 30FPS is common in cheap webcams. */
image = CommonV4l2_getImage(&common_v4l2);
/* Original. */
/* NOTE(review): the head and tail of this texture upload call are missing from the listing. */
GL_TEXTURE_2D, 0, GL_RGB, width, height,
glUniform1i(myTextureSampler_location, 0);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
/* Optional CPU modification to compare with GPU shader speed. */
if (cpu) {
/* image2 holds 3 floats (RGB in [0, 1]) per pixel. */
image2 = realloc(image2, 3 * width * height * sizeof(image2[0]));
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
size_t index = 3 * (i * width + j);
/* Inverter. */
/*image2[index + 0] = 1.0 - (image[index + 0] / 255.0);*/
/*image2[index + 1] = 1.0 - (image[index + 1] / 255.0);*/
/*image2[index + 2] = 1.0 - (image[index + 2] / 255.0);*/
/* Swapper. */
/*image2[index + 0] = image[index + 1] / 255.0;*/
/*image2[index + 1] = image[index + 2] / 255.0;*/
/*image2[index + 2] = image[index + 0] / 255.0;*/
/* Square linear blur. */
int blur_width = 5;
int blur_width_half = blur_width / 2;
int blur_width2 = (blur_width * blur_width);
image2[index + 0] = 0.0;
image2[index + 1] = 0.0;
image2[index + 2] = 0.0;
for (int k = -blur_width_half; k <= blur_width_half; ++k) {
for (int l = -blur_width_half; l <= blur_width_half; ++l) {
int i2 = i + k;
int j2 = j + l;
// Out of bounds is black. TODO: do modulo to match shader exactly.
/* Row 0 and column 0 are valid pixels, so the lower bounds are inclusive. */
if (i2 >= 0 && i2 < (int)height && j2 >= 0 && j2 < (int)width) {
/* Computed from the (now known non-negative) neighbor coordinates instead
 * of relying on unsigned wraparound of a negative offset. */
size_t srcIndex = 3 * ((size_t)i2 * width + (size_t)j2);
image2[index + 0] += image[srcIndex + 0];
image2[index + 1] += image[srcIndex + 1];
image2[index + 2] += image[srcIndex + 2];
/* Average over the full window and map 0..255 to 0..1. */
image2[index + 0] /= (blur_width2 * 255.0);
image2[index + 1] /= (blur_width2 * 255.0);
image2[index + 2] /= (blur_width2 * 255.0);
/* NOTE(review): the head of this texture upload call is missing from the listing. */
GL_TEXTURE_2D, 0, GL_RGB, width, height,
0, GL_RGB, GL_FLOAT, image2
/* Modified. */
glUniform1i(myTextureSampler_location2, 0);
/* Size of one texel in texture coordinates, used by the blur shader. */
glUniform2f(pixD_location2, 1.0 / width, 1.0 / height);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
} while (!glfwWindowShouldClose(window));
/* Cleanup. */
/* NOTE(review): the body of this `if` (presumably releasing image2) is missing from the listing. */
if (cpu) {
glDeleteBuffers(1, &vbo);
glDeleteVertexArrays(1, &vao);
glDeleteTextures(1, &texture);
The first obvious answer is that you gain parallelism. Now, why use GLSL rather than, say, CUDA, which is more flexible? GLSL doesn't require you to have an NVIDIA graphics card, so it's a much more portable solution (you'd still have the option of OpenCL, though).
What can you gain with parallelism? Most of the time, you can treat pixels independently. For instance, increasing the contrast of an image usually requires you to loop over all pixels and apply an affine transform to the pixel values. If each pixel is handled by a separate thread, then you don't need this loop anymore: you just rasterize a quad and apply a pixel shader that reads a texture at the current rasterized point and outputs the transformed pixel value to the render target (or the screen).
The drawback is that your data need to reside on the GPU : you'll need to transfer all your images to the GPU which can take some time, and can make the speedup gained with the parallelization useless. As such, GPU implementations are often done either when the operations to be made are compute intensive, or when the whole pipeline can remain on the GPU (for instance, if the goal is to only display the modified image on screen, you save the need to transfer back the image on the CPU).
OpenGL 4.3 (announced at SIGGRAPH 2012) supports Compute shaders. If you are doing strictly graphics work, and already using OpenGL, it might be easier to use this than OpenCL / OpenGL interop (or CUDA / OpenGL interop).
Here is what Khronos has to say about when to use 4.3 Compute shaders versus OpenCL: Link to PDF; see slide 5.