How to destroy CUDA graphics datatypes - c++

I create an OpenGL texture/CUDA surface pair from some RGB data with a function. The cudaSurfaceObject_t can be used in a CUDA kernel for GPU-accelerated image processing, and the GLuint can be used to render the results of the CUDA kernel. The function is provided in the program below:
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
void createTextureSurfacePair(int width, int height, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut) {
// Create the OpenGL texture that will be displayed with GLAD and GLFW
glGenTextures(1, &textureOut);
// Bind to our texture handle
glBindTexture(GL_TEXTURE_2D, textureOut);
// Set texture interpolation methods for minification and magnification
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Set texture clamping method
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
// Create the texture and its attributes
glTexImage2D(GL_TEXTURE_2D, // Type of texture
0, // Pyramid level (for mip-mapping) - 0 is the top level
GL_RGBA, // Internal color format to convert to
width, // Image width i.e. 640 for Kinect in standard mode
height, // Image height i.e. 480 for Kinect in standard mode
0, // Border width in pixels (can either be 1 or 0)
GL_BGR, // Input image format (i.e. GL_RGB, GL_RGBA, GL_BGR etc.)
GL_UNSIGNED_BYTE, // Image data type.
data); // The actual image data itself
//Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
//are synchronized with OpenGL textures, the surfaces will be of the same type.
//They won't know or care about their data types though, for they are all just byte arrays
//at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
//uses it as an appropriate type. You will see that the update_surface kernel (defined
//above) treats each pixel as four unsigned bytes along the X-axis: one for red, green, blue,
//and alpha respectively.
//Create the CUDA array and texture reference
cudaArray* bitmap_d;
//Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created, and its address is placed in cudaTextureID.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
cudaGraphicsGLRegisterImage(&graphicsResourceOut, textureOut, GL_TEXTURE_2D,
cudaGraphicsRegisterFlagsNone);
cudaCheckError();
//Map graphics resources for access by CUDA.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
cudaGraphicsMapResources(1, &graphicsResourceOut, 0);
cudaCheckError();
//Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
cudaGraphicsSubResourceGetMappedArray(&bitmap_d, graphicsResourceOut, 0, 0);
cudaCheckError();
//Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
//This one will tell CUDA how we want the bitmap_surface to be configured.
//Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
struct cudaResourceDesc resDesc;
//Clear it with 0s so that some flags aren't arbitrarily left at 1s
memset(&resDesc, 0, sizeof(resDesc));
//Set the resource type to be an array for convenient processing in the CUDA kernel.
//List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
resDesc.resType = cudaResourceTypeArray;
//Bind the new descriptor with the bitmap created earlier.
resDesc.res.array.array = bitmap_d;
//Create a new CUDA surface ID reference.
//This is really just an unsigned long long.
//Docuentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
surfaceOut = 0;
//Create the surface with the given description. That surface ID is placed in bitmap_surface.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
cudaCreateSurfaceObject(&surfaceOut, &resDesc);
cudaCheckError();
}
void initGL() {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
//glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
GLFWwindow* currentGLFWWindow = glfwCreateWindow(1280, 720, "Raytracing in One Weekend", NULL, NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
}
int main() {
initGL();
int size = 500;
uint8_t* data = new uint8_t[size * size * 3]; //dummy 100x100 RGB image
cudaSurfaceObject_t a;
cudaGraphicsResource_t b;
GLuint c;
for (int i = 0; i < 10000; i++) {
/*------ATTEMPT TO CREATE CUDA SURFACE AND OPENGL TEXTURE------------*/
createTextureSurfacePair(size, size, data, c, b, a);
/*------ATTEMPT TO DESTROY CUDA SURFACE AND OPENGL TEXTURE------------*/
//Destroy surface
cudaDestroySurfaceObject(a);
//Destroy graphics resource
cudaGraphicsUnmapResources(1, &b);
//Destroy texture
glDeleteTextures(1, &c);
if (i % 100 == 0) printf("Iteration %d\n", i);
}
}
There appears to be a memory leak in this program, as it causes the dedicated GPU memory to rapidly increase until the program crashes. What haven't I destroyed in the main function?

When I add the following line:
cudaGraphicsUnregisterResource(b);
after this line in your code:
cudaGraphicsUnmapResources(1, &b);
Your program runs to completion for me (i.e. it runs for the indicated 10000 loops) without throwing any errors. It also runs without errors when running under cuda-memcheck.
This function is the "destroyer" for cudaGraphicsGLRegisterImage. You can get some sense that this might be the case by:
studying various CUDA sample codes (such as simpleGLES, postProcessGL, imageDenoisingGL, bilateralFilter, and several others) which use CUDA/OpenGL interop.
referring to the runtime API docs for cudaGraphicsGLRegisterImage, and noting that at the bottom of the function description it lists:
See also:
cudaGraphicsUnregisterResource, cudaGraphicsMapResources, cudaGraphicsSubResourceGetMappedArray, cuGraphicsGLRegisterImage

Related

How to Process RGB Data from the CPU on an NVIDIA GPU and Visualize the Data with an OpenGL Texture

I'm hoping to create a simple computer vision library in C++/CUDA C++ that allows me to do the following:
Grab some RGB data from the host memory. This data will come in a BGR byte array, 8 bits per channel per pixel.
Process that data in a CUDA kernel.
Write the output of that kernel back into some host memory.
Render the output in an OpenGL texture for easy viewing.
These functions would go inside a class like so:
class Processor{
public:
setInput(const byte* data, int imageWidth, int imageHeight);
void processData();
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(byte* destination);
}
setInput() is going to be called with every frame of a video (hundreds or thousands of images of the same dimensions).
How can I write the Processor class so that setInput() can efficiently update an instance's internal CUDA array and processData() can synchronize the CUDA array with the OpenGL texture?
Below is my attempt at implementing such a class, contained in one CUDA C++ file along with a simple test. (Requires GLFW and GLAD.) With this implementation, I can provide some input image data, run a CUDA kernel that produces an output image, and visualize both with OpenGL textures. But it's extremely inefficient because every time setInput() is called, two OpenGL textures and two CUDA surface objects need to be created. And if more than one image is processed, two OpenGL textures and two CUDA surface objects also have to be destroyed.
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
/** Macro for checking if CUDA has problems */
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
/*Window dimensions*/
const int windowWidth = 1280, windowHeight = 720;
/*Window address*/
GLFWwindow* currentGLFWWindow = 0;
/**
* A simple image processing kernel that copies the inverted data from the input surface to the output surface.
*/
__global__ void kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height) {
//Get the pixel index
unsigned int xPx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int yPx = threadIdx.y + blockIdx.y * blockDim.y;
//Don't do any computation if this thread is outside of the surface bounds.
if (xPx >= width || yPx >= height) return;
//Copy the contents of input to output.
uchar4 pixel = { 255,128,0,255 };
//Read a pixel from the input. Disable to default to the flat orange color above
surf2Dread<uchar4>(&pixel, input, xPx * sizeof(uchar4), yPx, cudaBoundaryModeClamp);
//Invert the color
pixel.x = ~pixel.x;
pixel.y = ~pixel.y;
pixel.z = ~pixel.z;
//Write the new pixel color to the
surf2Dwrite(pixel, output, xPx * sizeof(uchar4), yPx);
}
class Processor {
public:
void setInput( uint8_t* const data, int imageWidth, int imageHeight);
void processData();
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(uint8_t* destination);
private:
/**
* #brief True if the textures and surfaces are initialized.
*
* Prevents memory leaks
*/
bool surfacesInitialized = false;
/**
* #brief The width and height of a texture/surface pair.
*
*/
struct ImgDim { int width, height; };
/**
* #brief Creates a CUDA surface object, CUDA resource, and OpenGL texture from some data.
*/
void createTextureSurfacePair(const ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut);
/**
* #brief Destroys every CUDA surface object, CUDA resource, and OpenGL texture created by this instance.
*/
void destroyEverything();
/**
* #brief The dimensions of an image and its corresponding texture.
*
*/
ImgDim imageInputDimensions, imageOutputDimensions;
/**
* #brief A CUDA surface that can be read to, written from, or synchronized with a Mat or
* OpenGL texture
*
*/
cudaSurfaceObject_t d_imageInputTexture = 0, d_imageOutputTexture = 0;
/**
* #brief A CUDA resource that's bound to an array in CUDA memory
*/
cudaGraphicsResource_t d_imageInputGraphicsResource, d_imageOutputGraphicsResource;
/**
* #brief A renderable OpenGL texture that is synchronized with the CUDA data
* #see d_imageInputTexture, d_imageOutputTexture
*/
GLuint imageInputTexture = 0, imageOutputTexture = 0;
/** Returns true if nothing can be rendered */
bool empty() { return imageInputTexture == 0; }
};
void Processor::setInput(uint8_t* const data, int imageWidth, int imageHeight)
{
//Same-size images don't need texture regeneration, so skip that.
if (imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
/*
Possible shortcut: we know the input is the same size as the texture and CUDA surface object.
So instead of destroying the surface and texture, why not just overwrite them?
That's what I try to do in the following block, but because "data" is BGR and the texture
is RGBA, the channels get all messed up.
*/
/*
//Use the input surface's CUDAResourceDesc to gain access to the surface data array
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
cudaGetSurfaceObjectResourceDesc(&resDesc, d_imageInputTexture);
cudaCheckError();
//Copy the data from the input array to the surface
cudaMemcpyToArray(resDesc.res.array.array, 0, 0, input.data, imageInputDimensions.width * imageInputDimensions.height * 3, cudaMemcpyHostToDevice);
cudaCheckError();
//Set status flags
surfacesInitialized = true;
return;
*/
}
//Clear everything that originally existed in the texture/surface
destroyEverything();
//Get the size of the image and place it here.
imageInputDimensions.width = imageWidth;
imageInputDimensions.height = imageHeight;
imageOutputDimensions.width = imageWidth;
imageOutputDimensions.height = imageHeight;
//Create the input surface/texture pair
createTextureSurfacePair(imageInputDimensions, data, imageInputTexture, d_imageInputGraphicsResource, d_imageInputTexture);
//Create the output surface/texture pair
uint8_t* outData = new uint8_t[imageOutputDimensions.width * imageOutputDimensions.height * 3];
createTextureSurfacePair(imageOutputDimensions, outData, imageOutputTexture, d_imageOutputGraphicsResource, d_imageOutputTexture);
delete outData;
//Set status flags
surfacesInitialized = true;
}
void Processor::processData()
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock), imageInputDimensions.height);
kernel <<<blocks, threadsPerBlock >>> (d_imageInputTexture, d_imageOutputTexture, imageInputDimensions.width, imageInputDimensions.height);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
GLuint Processor::getInputTexture()
{
return imageInputTexture;
}
GLuint Processor::getOutputTexture()
{
return imageOutputTexture;
}
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut) {
// Create the OpenGL texture that will be displayed with GLAD and GLFW
glGenTextures(1, &textureOut);
// Bind to our texture handle
glBindTexture(GL_TEXTURE_2D, textureOut);
// Set texture interpolation methods for minification and magnification
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Set texture clamping method
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
// Create the texture and its attributes
glTexImage2D(GL_TEXTURE_2D, // Type of texture
0, // Pyramid level (for mip-mapping) - 0 is the top level
GL_RGBA, // Internal color format to convert to
dimensions.width, // Image width i.e. 640 for Kinect in standard mode
dimensions.height, // Image height i.e. 480 for Kinect in standard mode
0, // Border width in pixels (can either be 1 or 0)
GL_BGR, // Input image format (i.e. GL_RGB, GL_RGBA, GL_BGR etc.)
GL_UNSIGNED_BYTE, // Image data type.
data); // The actual image data itself
//Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
//are synchronized with OpenGL textures, the surfaces will be of the same type.
//They won't know or care about their data types though, for they are all just byte arrays
//at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
//uses it as an appropriate type. You will see that the update_surface kernel (defined
//above) treats each pixel as four unsigned bytes along the X-axis: one for red, green, blue,
//and alpha respectively.
//Create the CUDA array and texture reference
cudaArray* bitmap_d;
//Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created, and its address is placed in cudaTextureID.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
cudaGraphicsGLRegisterImage(&graphicsResourceOut, textureOut, GL_TEXTURE_2D,
cudaGraphicsRegisterFlagsNone);
cudaCheckError();
//Map graphics resources for access by CUDA.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
cudaGraphicsMapResources(1, &graphicsResourceOut, 0);
cudaCheckError();
//Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
cudaGraphicsSubResourceGetMappedArray(&bitmap_d, graphicsResourceOut, 0, 0);
cudaCheckError();
//Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
//This one will tell CUDA how we want the bitmap_surface to be configured.
//Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
struct cudaResourceDesc resDesc;
//Clear it with 0s so that some flags aren't arbitrarily left at 1s
memset(&resDesc, 0, sizeof(resDesc));
//Set the resource type to be an array for convenient processing in the CUDA kernel.
//List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
resDesc.resType = cudaResourceTypeArray;
//Bind the new descriptor with the bitmap created earlier.
resDesc.res.array.array = bitmap_d;
//Create a new CUDA surface ID reference.
//This is really just an unsigned long long.
//Docuentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
surfaceOut = 0;
//Create the surface with the given description. That surface ID is placed in bitmap_surface.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
cudaCreateSurfaceObject(&surfaceOut, &resDesc);
cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1, &d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1, &d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1, &imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1, &imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL() {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
//glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth, windowHeight, "Output image (OpenGL + GLFW)", NULL, NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0, 0, windowWidth, windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0, windowWidth, windowHeight, 0.0, 0.0, 100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/** Renders the textures on the GLFW window and requests GLFW to update */
void showTextures(GLuint top, GLuint bottom) {
// Clear color and depth buffers
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
glBindTexture(GL_TEXTURE_2D, top);
/* Draw top quad */
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0, 0); glVertex2i(0, 0);
glTexCoord2i(0, 1); glVertex2i(0, windowHeight/2);
glTexCoord2i(1, 1); glVertex2i(windowWidth, windowHeight / 2);
glTexCoord2i(1, 0); glVertex2i(windowWidth, 0);
glEnd();
glDisable(GL_TEXTURE_2D);
/* Draw top quad */
glBindTexture(GL_TEXTURE_2D, bottom);
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0, 0); glVertex2i(0, windowHeight / 2);
glTexCoord2i(0, 1); glVertex2i(0, windowHeight);
glTexCoord2i(1, 1); glVertex2i(windowWidth, windowHeight);
glTexCoord2i(1, 0); glVertex2i(windowWidth, windowHeight / 2);
glEnd();
glDisable(GL_TEXTURE_2D);
glfwSwapBuffers(currentGLFWWindow);
glfwPollEvents();
}
int main() {
initGL();
int imageWidth = windowWidth;
int imageHeight = windowHeight / 2;
uint8_t* imageData = new uint8_t[imageWidth * imageHeight * 3];
Processor p;
while (!glfwWindowShouldClose(currentGLFWWindow))
{
//Process the image here
p.setInput(imageData, imageWidth, imageHeight);
p.processData();
showTextures(p.getInputTexture(), p.getOutputTexture());
}
}
TL;DR: I can see at least 2 ways forward here, either convert your data to 4 byte pixels (somehow) and use cudaMemcpy2DToArray, or allow the CUDA kernel to take in raw data (instead of using a surface as input). I'll try to demonstrate both, although I don't wish to put in a large effort at polishing this, so really just demonstrating ideas.
This answer is working off the code you provided in an edit which is not your latest. However in the subsequent edits, mainly you seem to be just ripping out OpenCV, which I would normally applaud. However, since I've worked off your edit that had OpenCV in it, I've elected to use an OpenCV "test case" of my own.
Using 4 byte-per-pixel data, and cudaMemcpy2DToArray: This seems to adhere most closely to what you have demonstrated, albeit commented-out. The idea is we will access the input data by copying it to the CUDA array (acquired from the interop mechanism) directly. As you had previously pointed out, cudaMemcpyToArray is deprecated, so we won't use that. Furthermore, our data format (bytes per pixel) has to match what is in the array. I think there are a number of ways to solve this, depending on your overall pipeline, but the approach I show here isn't efficient, it's just to demonstrate that the method is "workable". If there is a way to use 4 byte per pixel data in your pipeline, however, you may be able to get rid of the "inefficiency" here. To use this method, compile the code with the -DUSE_1 switch.
Input of the data through the kernel. We can skip the inefficiency of the first case by just allowing the kernel to do the 3-byte to 4-byte conversion of data on the fly. Either way, there is a copy of data from host to device, but this method doesn't require 4 byte per pixel input data.
Here is code demonstrating both options:
//nvcc -arch=sm_35 -o t19 glad/src/glad.c t19.cu -lGL -lGLU -I./glad/include -lglfw -std=c++11 -lopencv_core -lopencv_highgui -lopencv_imgcodecs -Wno-deprecated-gpu-targets
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
#include <opencv2/highgui.hpp>
/** Macro for checking if CUDA has problems */
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
/*Window dimensions*/
//const int windowWidth = 1280, windowHeight = 720;
/*Window address*/
GLFWwindow* currentGLFWWindow = 0;
/**
* A simple image processing kernel that copies the inverted data from the input surface to the output surface.
*/
__global__ void kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height, uint8_t *data) {
//Get the pixel index
unsigned int xPx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int yPx = threadIdx.y + blockIdx.y * blockDim.y;
//Don't do any computation if this thread is outside of the surface bounds.
if (xPx >= width || yPx >= height) return;
//Copy the contents of input to output.
#ifdef USE_1
uchar4 pixel = { 255,128,0,255 };
//Read a pixel from the input. Disable to default to the flat orange color above
surf2Dread<uchar4>(&pixel, input, xPx * sizeof(uchar4), yPx, cudaBoundaryModeClamp);
#else
uchar4 pixel;
pixel.x = data[(xPx+yPx*width)*3 + 0];
pixel.y = data[(xPx+yPx*width)*3 + 1];
pixel.z = data[(xPx+yPx*width)*3 + 2];
pixel.w = 255;
surf2Dwrite(pixel, input, xPx * sizeof(uchar4), yPx);
#endif
//Invert the color
pixel.x = ~pixel.x;
pixel.y = ~pixel.y;
pixel.z = ~pixel.z;
//Write the new pixel color to the
surf2Dwrite(pixel, output, xPx * sizeof(uchar4), yPx);
}
class Processor {
public:
void setInput( uint8_t* const data, int imageWidth, int imageHeight);
void processData(uint8_t *data, uint8_t *d_data);
GLuint getInputTexture();
GLuint getOutputTexture();
void writeOutputTo(uint8_t* destination);
private:
/**
* #brief True if the textures and surfaces are initialized.
*
* Prevents memory leaks
*/
bool surfacesInitialized = false;
/**
* #brief The width and height of a texture/surface pair.
*
*/
struct ImgDim { int width, height; };
/**
* #brief Creates a CUDA surface object, CUDA resource, and OpenGL texture from some data.
*/
void createTextureSurfacePair(const ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut);
/**
* #brief Destroys every CUDA surface object, CUDA resource, and OpenGL texture created by this instance.
*/
void destroyEverything();
/**
* #brief The dimensions of an image and its corresponding texture.
*
*/
ImgDim imageInputDimensions, imageOutputDimensions;
/**
* #brief A CUDA surface that can be read to, written from, or synchronized with a Mat or
* OpenGL texture
*
*/
cudaSurfaceObject_t d_imageInputTexture = 0, d_imageOutputTexture = 0;
/**
* #brief A CUDA resource that's bound to an array in CUDA memory
*/
cudaGraphicsResource_t d_imageInputGraphicsResource, d_imageOutputGraphicsResource;
/**
* #brief A renderable OpenGL texture that is synchronized with the CUDA data
* #see d_imageInputTexture, d_imageOutputTexture
*/
GLuint imageInputTexture = 0, imageOutputTexture = 0;
/** Returns true if nothing can be rendered */
bool empty() { return imageInputTexture == 0; }
};
void Processor::setInput(uint8_t* const data, int imageWidth, int imageHeight)
{
//Same-size images don't need texture regeneration, so skip that.
if (imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
/*
Possible shortcut: we know the input is the same size as the texture and CUDA surface object.
So instead of destroying the surface and texture, why not just overwrite them?
That's what I try to do in the following block, but because "data" is BGR and the texture
is RGBA, the channels get all messed up.
*/
//Use the input surface's CUDAResourceDesc to gain access to the surface data array
#ifdef USE_1
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
cudaGetSurfaceObjectResourceDesc(&resDesc, d_imageInputTexture);
cudaCheckError();
uint8_t *data4 = new uint8_t[imageInputDimensions.width*imageInputDimensions.height*4];
for (int i = 0; i < imageInputDimensions.width*imageInputDimensions.height; i++){
data4[i*4+0] = data[i*3+0];
data4[i*4+1] = data[i*3+1];
data4[i*4+2] = data[i*3+2];
data4[i*4+3] = 255;}
//Copy the data from the input array to the surface
// cudaMemcpyToArray(resDesc.res.array.array, 0, 0, data, imageInputDimensions.width * imageInputDimensions.height * 3, cudaMemcpyHostToDevice);
cudaMemcpy2DToArray(resDesc.res.array.array, 0, 0, data4, imageInputDimensions.width*4, imageInputDimensions.width*4, imageInputDimensions.height, cudaMemcpyHostToDevice);
cudaCheckError();
delete[] data4;
#endif
//Set status flags
surfacesInitialized = true;
return;
}
//Clear everything that originally existed in the texture/surface
destroyEverything();
//Get the size of the image and place it here.
imageInputDimensions.width = imageWidth;
imageInputDimensions.height = imageHeight;
imageOutputDimensions.width = imageWidth;
imageOutputDimensions.height = imageHeight;
//Create the input surface/texture pair
createTextureSurfacePair(imageInputDimensions, data, imageInputTexture, d_imageInputGraphicsResource, d_imageInputTexture);
//Create the output surface/texture pair
uint8_t* outData = new uint8_t[imageOutputDimensions.width * imageOutputDimensions.height * 3];
createTextureSurfacePair(imageOutputDimensions, outData, imageOutputTexture, d_imageOutputGraphicsResource, d_imageOutputTexture);
delete outData;
//Set status flags
surfacesInitialized = true;
}
void Processor::processData(uint8_t *data, uint8_t *d_data)
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock), imageInputDimensions.height);
#ifndef USE_1
cudaMemcpy(d_data, data, imageInputDimensions.width*imageInputDimensions.height*3, cudaMemcpyHostToDevice);
#endif
kernel <<<blocks, threadsPerBlock >>> (d_imageInputTexture, d_imageOutputTexture, imageInputDimensions.width, imageInputDimensions.height, d_data);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
GLuint Processor::getInputTexture()
{
return imageInputTexture;
}
GLuint Processor::getOutputTexture()
{
return imageOutputTexture;
}
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut) {
// Create the OpenGL texture that will be displayed with GLAD and GLFW
glGenTextures(1, &textureOut);
// Bind to our texture handle
glBindTexture(GL_TEXTURE_2D, textureOut);
// Set texture interpolation methods for minification and magnification
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Set texture clamping method
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
// Create the texture and its attributes
glTexImage2D(GL_TEXTURE_2D, // Type of texture
0, // Pyramid level (for mip-mapping) - 0 is the top level
GL_RGBA, // Internal color format to convert to
dimensions.width, // Image width i.e. 640 for Kinect in standard mode
dimensions.height, // Image height i.e. 480 for Kinect in standard mode
0, // Border width in pixels (can either be 1 or 0)
GL_BGR, // Input image format (i.e. GL_RGB, GL_RGBA, GL_BGR etc.)
GL_UNSIGNED_BYTE, // Image data type.
data); // The actual image data itself
//Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
//are synchronized with OpenGL textures, the surfaces will be of the same type.
//They won't know or care about their data types though, for they are all just byte arrays
//at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
//uses it as an appropriate type. You will see that the update_surface kernel (defined
//above) treats each pixel as four unsigned bytes along the X-axis: one for red, green, blue,
//and alpha respectively.
//Create the CUDA array and texture reference
cudaArray* bitmap_d;
//Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created, and its address is placed in cudaTextureID.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
cudaGraphicsGLRegisterImage(&graphicsResourceOut, textureOut, GL_TEXTURE_2D,
cudaGraphicsRegisterFlagsNone);
cudaCheckError();
//Map graphics resources for access by CUDA.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
cudaGraphicsMapResources(1, &graphicsResourceOut, 0);
cudaCheckError();
//Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
cudaGraphicsSubResourceGetMappedArray(&bitmap_d, graphicsResourceOut, 0, 0);
cudaCheckError();
//Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
//This one will tell CUDA how we want the bitmap_surface to be configured.
//Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
struct cudaResourceDesc resDesc;
//Clear it with 0s so that some flags aren't arbitrarily left at 1s
memset(&resDesc, 0, sizeof(resDesc));
//Set the resource type to be an array for convenient processing in the CUDA kernel.
//List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
resDesc.resType = cudaResourceTypeArray;
//Bind the new descriptor with the bitmap created earlier.
resDesc.res.array.array = bitmap_d;
//Create a new CUDA surface ID reference.
//This is really just an unsigned long long.
//Docuentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
surfaceOut = 0;
//Create the surface with the given description. That surface ID is placed in bitmap_surface.
//Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
cudaCreateSurfaceObject(&surfaceOut, &resDesc);
cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1, &d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1, &d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1, &imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1, &imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL(int windowWidth, int windowHeight) {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
//const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
//glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth, windowHeight, "Output image (OpenGL + GLFW)", NULL, NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0, 0, windowWidth, windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0, windowWidth, windowHeight, 0.0, 0.0, 100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/** Renders the textures on the GLFW window and requests GLFW to update */
void showTextures(GLuint top, GLuint bottom, int windowWidth, int windowHeight) {
// Clear color and depth buffers
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
glBindTexture(GL_TEXTURE_2D, top);
/* Draw top quad */
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0, 0); glVertex2i(0, 0);
glTexCoord2i(0, 1); glVertex2i(0, windowHeight/2);
glTexCoord2i(1, 1); glVertex2i(windowWidth, windowHeight / 2);
glTexCoord2i(1, 0); glVertex2i(windowWidth, 0);
glEnd();
glDisable(GL_TEXTURE_2D);
/* Draw bottom quad */
glBindTexture(GL_TEXTURE_2D, bottom);
glEnable(GL_TEXTURE_2D);
glBegin(GL_QUADS);
glTexCoord2i(0, 0); glVertex2i(0, windowHeight / 2);
glTexCoord2i(0, 1); glVertex2i(0, windowHeight);
glTexCoord2i(1, 1); glVertex2i(windowWidth, windowHeight);
glTexCoord2i(1, 0); glVertex2i(windowWidth, windowHeight / 2);
glEnd();
glDisable(GL_TEXTURE_2D);
glfwSwapBuffers(currentGLFWWindow);
glfwPollEvents();
}
int main() {
using namespace cv;
using namespace std;
// initGL();
std::string filename = "./lena.pgm";
Mat image;
image = imread(filename, CV_LOAD_IMAGE_COLOR); // Read the file
if(! image.data ) // Check for invalid input
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
int windoww = 1280;
int windowh = 720;
initGL(windoww,windowh);
uint8_t *d_data;
cudaMalloc(&d_data, image.cols*image.rows*3);
Processor p;
for (int i = 0; i < image.cols; i++)
{
image.data[i*3+0] = 0;
image.data[i*3+1] = 0;
image.data[i*3+2] = 0;
//Process the image here
p.setInput(image.data, image.cols, image.rows);
p.processData(image.data, d_data);
showTextures(p.getInputTexture(), p.getOutputTexture(), windoww, windowh);
}
}
Notes:
The compilation command is given in the comment in the first line
I created a "video" of sorts using a single image. The "video" will show the image with a black or white line moving horizontally from left to right in the top pixel row of the image. The input image is lena.pgm which can be found in the CUDA samples (for example, at /usr/local/cuda-10.1/samples/3_Imaging/SobelFilter/data/lena.pgm).
It looks to me like you are "sharing" resources between OpenGL and CUDA. This doesn't look like the right map/unmap sequence to me, but it seems to be working, and it doesn't seem to be the focus of your question. I haven't spent any time investigating. I may have missed something.
I'm not suggesting this code is defect free or suitable for any particular purpose. It is mostly your code. I've modified it slightly to demonstrate some ideas described in the text.
There shouldn't be any visual difference in the output whether you compile with -DUSE_1 or not.
This is an useful feature that came across first in (https://www.3dgep.com/opengl-interoperability-with-cuda/), and I have improved upon it to use latest CUDA APIs and flow. You can refer to these 2 functions in cudammf.
https://github.com/prabindh/cudammf/blob/5f93358784fcbaae7eea0850424c59d2ed057dab/cuda_postproces.cu#L119
https://github.com/prabindh/cudammf/blob/5f93358784fcbaae7eea0850424c59d2ed057dab/decoder3.cpp#L507
Basic working is as below:
Create a regular GL texture (GLTextureId). Map it for CUDA access, via cudaGraphicsGLRegisterImage
Do some CUDA processing, and result is in a CUDA buffer
USe cudaMemcpyToArray to transfer between the above 2 device memories
If your output is coming from a Nvidia codec output, you should also refer to the AppDecGL sample in the Nvidia Video SDK (https://developer.nvidia.com/nvidia-video-codec-sdk).

OpenGL integer texture raising GL_INVALID_VALUE

I have an Nvidia GTX 970, with the latest (441.66) driver for Win 10 x64 (18362 build), which is obviously fully OpenGL 4.6 compliant, and currently compiling an app with VS2017.
My problem is, that I seem to be unable to use any other texture type then GL_UNSIGNED_BYTE. I'm currently trying to set up a single channel, unsigned integer (32 bit) texture, but however I try to allocate the texture, OpenGL immediately raises the GL_INVALID_VALUE error, and the shader's result turns all black.
So far I tryed allocating immutably:
glTexStorage2D(GL_TEXTURE_2D, 0, GL_R32UI, 3072, 2048);
And mutably:
glTexImage2D(GL_TEXTURE_2D, 0, GL_R32UI, 3072, 2048, 0, GL_RED, GL_UNSIGNED_INT, textureData);
I tried signed int too, the same thing. I also checked with NSight VS edition, for UINT 2D textures, my max resolution is 16384x16384, so that's not the issue. Also, according to NSight, uint textures are fully supported by the OpenGL driver.
What am I missing here?
Minimum reproducable version:
#include <iostream>
#include <GL/glew.h>
#include <SDL.h>
void OGLErrorCheck()
{
const GLenum errorCode = glGetError();
if (errorCode != GL_NO_ERROR)
{
const GLubyte* const errorString = gluErrorString(errorCode);
std::cout << errorString;
}
}
int main(int argc, char* argv[])
{
glewInit();
glTexImage2D(GL_TEXTURE_2D, 0, GL_R32UI, 3072, 2048, 0, GL_RED, GL_UNSIGNED_INT, nullptr);
OGLErrorCheck();
getchar();
return 0;
}
This yields GL_INVALID_OPERATION.
This is linked with the latest SDL and GLEW, both free software, available for download at https://www.libsdl.org/ and http://glew.sourceforge.net/ respectively.
From the specs about glTexStorage2D:
void glTexStorage2D(
GLenum target,
GLsizei levels,
GLenum internalformat,
GLsizei width,
GLsizei height
);
[…]
GL_INVALID_VALUE is generated if width, height or levels are less than 1
And the value for levels you pass to glTexStorage2D is 0.
First of all you have to create an OpenGL Context. e.g.:
(See also Using OpenGL With SDL)
if (SDL_Init(SDL_INIT_VIDEO) < 0)
return 0;
SDL_Window *window = SDL_CreateWindow("ogl wnd", 0, 0, width, height, SDL_WINDOW_OPENGL);
if (window == nullptr)
return 0;
SDL_GLContext context = SDL_GL_CreateContext(window);
if (glewInit() != GLEW_OK)
return 0;
Then you have to generate an texture name by glGenTextures:
GLuint tobj;
glGenTextures(1, &tobj);
After that you've to bind the named texture to a texturing target by glBindTexture:
glBindTexture(GL_TEXTURE_2D, tobj);
Finally you can specify the two-dimensional texture image by glTexImage2D
glTexImage2D(GL_TEXTURE_2D, 0, GL_R32UI, 3072, 2048, 0, GL_RED_INTEGER, GL_UNSIGNED_INT, nullptr);
Note, the texture format has to be GL_RED_INTEGER rather than GL_RED, because the source texture image has to be interpreted as integral data, rather than normalized floating point data. The format and type parameter specify the format of the source data. The internalformat parameter specifies the format of the target texture image.
Set the texture parameters by glTexParameter ( this can be done before glTexImage2D, too):
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
If you do not generate mipmaps (by glGenerateMipmap), then setting the GL_TEXTURE_MIN_FILTER is important. Since the default filter is GL_NEAREST_MIPMAP_LINEAR the texture would be mipmap incomplete, if you don not change the minifying function to GL_NEAREST or GL_LINEAR.
And mutably: glTexImage2D(GL_TEXTURE_2D, 0, GL_R32UI, 3072, 2048, 0, GL_RED, GL_UNSIGNED_INT, textureData);
I tried signed int too, the same thing. I also checked with NSight VS edition, for UINT 2D textures, my max resolution is 16384x16384, so that's not the issue. Also, according to NSight, uint textures are fully supported by the OpenGL driver.
What am I missing here?
For unormalized integer texture formatss, the format parameter of glTex[Sub]Image is not allowed to be just GL_RED, you have to use GL_RED_INTEGER.The format,type combination GL_RED, GL_UNSIGNED_INT is for specifying normalized fixed point or floating point formats only.

issues with mixing glGetTexImage and imageStore on nvidia opengl

I wrote some code, too long to paste here, that renders into a 3D 1 component float texture via a fragment shader that uses bindless imageLoad and imageStore.
That code is definitely working.
I then needed to work around some GLSL compiler bugs, so wanted to read the 3D texture above back to the host via glGetTexImage. Yes, I did do a glMemoryBarrierEXT(GL_ALL_BARRIER_BITS).
I did check the texture info via glGetTexLevelparameteriv() and everything I see matches. I did check for OpenGL errors, and have none.
Sadly, though, glGetTexImage never seems to read what was written by the fragment shader. Instead, it only returns the fake values I put in when I called glTexImage3D() to create the texture.
Is that expected behavior? The documentation implies otherwise.
If glGetTexImage actually works that way, how can I read back the data in that 3D texture (resident on the device?) Clearly the driver can do that as it does when the texture is made non-resident. Surely there's a simple way to do this simple thing...
I was asking if glGetTexImage was supposed to work that way or not. Here's the code:
void Bindless3DArray::dump_array(Array3D<float> &out)
{
bool was_mapped = m_image_mapped;
if (was_mapped)
unmap_array(); // unmap array so it's accessible to opengl
out.resize(m_depth, m_height, m_width);
glBindTexture(GL_TEXTURE_3D, m_textureid); // from glGenTextures()
#if 0
int w,h,d;
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_WIDTH, &w);
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_HEIGHT, &h);
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_DEPTH, &d);
int internal_format;
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_INTERNAL_FORMAT, &internal_format);
int data_type_r, data_type_g;
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_RED_TYPE, &data_type_r);
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_GREEN_TYPE, &data_type_g);
int size_r, size_g;
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_RED_SIZE, &size_r);
glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_GREEN_SIZE, &size_g);
#endif
glGetTexImage(GL_TEXTURE_3D, 0, GL_RED, GL_FLOAT, &out(0,0,0));
glBindTexture(GL_TEXTURE_3D, 0);
CHECK_GLERROR();
if (was_mapped)
map_array_to_cuda(); // restore state
}
Here's the code that creates the bindless array:
void Bindless3DArray::allocate(int w, int h, int d, ElementType t)
{
if (!m_textureid)
glGenTextures(1, &m_textureid);
m_type = t;
m_width = w;
m_height = h;
m_depth = d;
glBindTexture(GL_TEXTURE_3D, m_textureid);
CHECK_GLERROR();
glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAX_LEVEL, 0); // ensure only 1 miplevel is allocated
CHECK_GLERROR();
Array3D<float> foo(d, h, w);
// DEBUG -- glGetTexImage returns THIS data, not what's on device
for (int z=0; z<m_depth; ++z)
for (int y=0; y<m_height; ++y)
for (int x=0; x<m_width; ++x)
foo(z,y,x) = 3.14159;
//-- Texture creation
if (t == ElementInteger)
glTexImage3D(GL_TEXTURE_3D, 0, GL_R32UI, w, h, d, 0, GL_RED_INTEGER, GL_INT, 0);
else if (t == ElementFloat)
glTexImage3D(GL_TEXTURE_3D, 0, GL_R32F, w, h, d, 0, GL_RED, GL_FLOAT, &foo(0,0,0));
else
throw "Invalid type for Bindless3DArray";
CHECK_GLERROR();
m_handle = glGetImageHandleNV(m_textureid, 0, true, 0, (t == ElementInteger) ? GL_R32UI : GL_R32F);
glMakeImageHandleResidentNV(m_handle, GL_READ_WRITE);
CHECK_GLERROR();
#ifdef USE_CUDA
checkCuda(cudaGraphicsGLRegisterImage(&m_image_resource, m_textureid, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsSurfaceLoadStore));
#endif
}
I allocate the array, render to it via an OpenGL fragment program, and then I call dump_array() to read the data back. Sadly, I only get what I loaded in the allocate call.
The render program looks like
void App::clear_deepz()
{
deepz_clear_program.bind();
deepz_clear_program.setUniformValue("sentinel", SENTINEL);
deepz_clear_program.setUniformValue("deepz", deepz_array.handle());
deepz_clear_program.setUniformValue("sem", semaphore_array.handle());
run_program();
glMemoryBarrierEXT(GL_ALL_BARRIER_BITS);
// glMemoryBarrierEXT(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
// glMemoryBarrierEXT(GL_SHADER_GLOBAL_ACCESS_BARRIER_BIT_NV);
deepz_clear_program.release();
}
and the fragment program is:
#version 420\n
in vec4 gl_FragCoord;
uniform float sentinel;
coherent uniform layout(size1x32) image3D deepz;
coherent uniform layout(size1x32) uimage3D sem;
void main(void)
{
ivec3 coords = ivec3(gl_FragCoord.x, gl_FragCoord.y, 0);
imageStore(deepz, coords, vec4(sentinel));
imageStore(sem, coords, ivec4(0));
discard; // don't write to FBO at all
}
discard; // don't write to FBO at all
That's not what discard means. Oh, it does mean that. But it also means that all Image Load/Store writes will be discarded too. Indeed, odds are, the compiler will see that statement and just do nothing for the entire fragment shader.
If you want to just execute the fragment shader, you can employ the GL 4.3 feature (available on your NVIDIA hardware) of having an empty framebuffer object. Or you could use a compute shader. If you can't use GL 4.3 yet, then use a write mask to turn off all color writes.
As Nicol mentions above, if you want side effects only of image load and store, the proper way is to use an empty frame buffer object.
The bug of mixing glGetTexImage() and bindless textures was in fact a driver bug, and has been fixed as of driver version 335.23. I filed the bug and have confirmed my code is now working properly.
Note I am using empty frame buffer objects in the code, and don't use "discard" any more.

OpenGL renders texture all white

I'm attempting to render a .png image as a texture. However, all that is being rendered is a white square.
I give my texture a unique int ID called texID, read the pixeldata into a buffer 'image' (declared in the .h file). I load my pixelbuffer, do all of my OpenGL stuff and bind that pixelbuffer to a texture for OpenGL. I then draw it all using glDrawElements.
Also I initialize the texture with a size of 32x32 when its contructor is called, therefore i doubt it is related to a power of two size issue.
Can anybody see any mistakes in my OpenGL GL_TEXTURE_2D setup that might give me a block white square.
#include "Texture.h"
Texture::Texture(int width, int height, string filename)
{
const char* fnPtr = filename.c_str(); //our image loader accepts a ptr to a char, not a string
printf(fnPtr);
w = width; //give our texture a width and height, the reason that we need to pass in the width and height values manually
h = height;//UPDATE, these MUST be P.O.T.
unsigned error = lodepng::decode(image,w,h,fnPtr);//lodepng's decode function will load the pixel data into image vector
//display any errors with the texture
if(error)
{
cout << "\ndecoder error " << error << ": " << lodepng_error_text(error) <<endl;
}
for(int i = 0; i<image.size(); i++)
{
printf("%i,", image.at(i));
}
printf("\nImage size is %i", image.size());
//image now contains our pixeldata. All ready for OpenGL to do its thing
//let's get this texture up in the video memory
texGLInit();
}
void Texture::texGLInit()
{
//WHERE YOU LEFT OFF: glGenTextures isn't assigning an ID to textures. it stays at zero the whole time
//i believe this is why it's been rendering white
glGenTextures(1, &textures);
printf("\ntexture = %u", textures);
glBindTexture(GL_TEXTURE_2D, textures);//evrything we're about to do is about this texture
glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
//glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
//glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
//glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
//glDisable(GL_COLOR_MATERIAL);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8,w,h,0, GL_RGBA, GL_UNSIGNED_BYTE, &image);
//we COULD free the image vectors memory right about now.
}
void Texture::draw(point centerPoint, point dimensions)
{
glEnable(GL_TEXTURE_2D);
printf("\nDrawing block at (%f, %f)",centerPoint.x, centerPoint.y);
glBindTexture(GL_TEXTURE_2D, textures);//bind the texture
//create a quick vertex array for the primitive we're going to bind the texture to
printf("TexID = %u",textures);
GLfloat vArray[8] =
{
centerPoint.x-(dimensions.x/2), centerPoint.y-(dimensions.y/2),//bottom left i0
centerPoint.x-(dimensions.x/2), centerPoint.y+(dimensions.y/2),//top left i1
centerPoint.x+(dimensions.x/2), centerPoint.y+(dimensions.y/2),//top right i2
centerPoint.x+(dimensions.x/2), centerPoint.y-(dimensions.y/2)//bottom right i3
};
//create a quick texture array (we COULD create this on the heap rather than creating/destoying every cycle)
GLfloat tArray[8] =
{
0.0f,0.0f, //0
0.0f,1.0f, //1
1.0f,1.0f, //2
1.0f,0.0f //3
};
//and finally.. the index array...remember, we draw in triangles....(and we'll go CW)
GLubyte iArray[6] =
{
0,1,2,
0,2,3
};
//Activate arrays
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
//Give openGL a pointer to our vArray and tArray
glVertexPointer(2, GL_FLOAT, 0, &vArray[0]);
glTexCoordPointer(2, GL_FLOAT, 0, &tArray[0]);
//Draw it all
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_BYTE, &iArray[0]);
//glDrawArrays(GL_TRIANGLES,0,6);
//Disable the vertex arrays
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_TEXTURE_COORD_ARRAY);
glDisable(GL_TEXTURE_2D);
//done!
/*glBegin(GL_QUADS);
glTexCoord2f(0.0f,0.0f);
glVertex2f(centerPoint.x-(dimensions.x/2), centerPoint.y-(dimensions.y/2));
glTexCoord2f(0.0f,1.0f);
glVertex2f(centerPoint.x-(dimensions.x/2), centerPoint.y+(dimensions.y/2));
glTexCoord2f(1.0f,1.0f);
glVertex2f(centerPoint.x+(dimensions.x/2), centerPoint.y+(dimensions.y/2));
glTexCoord2f(1.0f,0.0f);
glVertex2f(centerPoint.x+(dimensions.x/2), centerPoint.y-(dimensions.y/2));
glEnd();*/
}
Texture::Texture(void)
{
}
Texture::~Texture(void)
{
}
I'll also include the main class' init, where I do a bit more OGL setup before this.
void init(void)
{
printf("\n......Hello Guy. \n....\nInitilising");
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0,XSize,0,YSize);
glEnable(GL_TEXTURE_2D);
myBlock = new Block(0,0,offset);
glClearColor(0,0.4,0.7,1);
glLineWidth(2); // Width of the drawing line
glMatrixMode(GL_MODELVIEW);
glDisable(GL_DEPTH_TEST);
printf("\nInitialisation Complete");
}
Update: adding in the main function where I first setup my OpenGL window.
int main(int argc, char** argv)
{
glutInit(&argc, argv); // GLUT Initialization
glutInitDisplayMode(GLUT_RGBA|GLUT_DOUBLE); // Initializing the Display mode
glutInitWindowSize(800,600); // Define the window size
glutCreateWindow("Gem Miners"); // Create the window, with caption.
printf("\n========== McLeanTech Systems =========\nBecoming Sentient\n...\n...\n....\nKILL\nHUMAN\nRACE \n");
init(); // All OpenGL initialization
//-- Callback functions ---------------------
glutDisplayFunc(display);
glutKeyboardFunc(mykey);
glutSpecialFunc(processSpecialKeys);
glutSpecialUpFunc(processSpecialUpKeys);
//glutMouseFunc(mymouse);
glutMainLoop(); // Loop waiting for event
}
Here's the usual checklist for whenever textures come out white:
OpenGL context created and being bound to current thread when attemting to load texture?
Allocated texture ID using glGenTextures?
Are the parameters format and internal format to glTex[Sub]Image… valid OpenGL tokens allowed as input for this function?
Is mipmapping being used?
YES: Supply all mipmap layers – optimally set glTexParameteri GL_TEXTURE_BASE_LEVEL and GL_TEXTURE_MAX_LEVEL, as well as GL_TEXTURE_MIN_LOD and GL_TEXTURE_MAX_LOG.
NO: Turn off mipmap filtering by setting glTexParameteri GL_TEXTURE_MIN_FILTER to GL_NEAREST or GL_LINEAR.

Problems with GL_LUMINANCE and ATI

I'm trying to use luminance textures on my ATI graphics card.
The problem: I'm not being able to correctly retrieve data from my GPU. Whenever I try to read it (using glReadPixels), all it gives me is an 'all-ones' array (1.0, 1.0, 1.0...).
You can test it with this code:
#include <stdio.h>
#include <stdlib.h>
#include <GL/glew.h>
#include <GL/glut.h>
static int arraySize = 64;
static int textureSize = 8;
//static GLenum textureTarget = GL_TEXTURE_2D;
//static GLenum textureFormat = GL_RGBA;
//static GLenum textureInternalFormat = GL_RGBA_FLOAT32_ATI;
static GLenum textureTarget = GL_TEXTURE_RECTANGLE_ARB;
static GLenum textureFormat = GL_LUMINANCE;
static GLenum textureInternalFormat = GL_LUMINANCE_FLOAT32_ATI;
int main(int argc, char** argv)
{
// create test data and fill arbitrarily
float* data = new float[arraySize];
float* result = new float[arraySize];
for (int i = 0; i < arraySize; i++)
{
data[i] = i + 1.0;
}
// set up glut to get valid GL context and
// get extension entry points
glutInit (&argc, argv);
glutCreateWindow("TEST1");
glewInit();
// viewport transform for 1:1 pixel=texel=data mapping
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0.0, textureSize, 0.0, textureSize);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glViewport(0, 0, textureSize, textureSize);
// create FBO and bind it (that is, use offscreen render target)
GLuint fboId;
glGenFramebuffersEXT(1, &fboId);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fboId);
// create texture
GLuint textureId;
glGenTextures (1, &textureId);
glBindTexture(textureTarget, textureId);
// set texture parameters
glTexParameteri(textureTarget, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(textureTarget, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(textureTarget, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(textureTarget, GL_TEXTURE_WRAP_T, GL_CLAMP);
// define texture with floating point format
glTexImage2D(textureTarget, 0, textureInternalFormat, textureSize, textureSize, 0, textureFormat, GL_FLOAT, 0);
// attach texture
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, textureTarget, textureId, 0);
// transfer data to texture
//glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
//glRasterPos2i(0, 0);
//glDrawPixels(textureSize, textureSize, textureFormat, GL_FLOAT, data);
glBindTexture(textureTarget, textureId);
glTexSubImage2D(textureTarget, 0, 0, 0, textureSize, textureSize, textureFormat, GL_FLOAT, data);
// and read back
glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
glReadPixels(0, 0, textureSize, textureSize, textureFormat, GL_FLOAT, result);
// print out results
printf("**********************\n");
printf("Data before roundtrip:\n");
printf("**********************\n");
for (int i = 0; i < arraySize; i++)
{
printf("%f, ", data[i]);
}
printf("\n\n\n");
printf("**********************\n");
printf("Data after roundtrip:\n");
printf("**********************\n");
for (int i = 0; i < arraySize; i++)
{
printf("%f, ", result[i]);
}
printf("\n");
// clean up
delete[] data;
delete[] result;
glDeleteFramebuffersEXT (1, &fboId);
glDeleteTextures (1, &textureId);
system("pause");
return 0;
}
I also read somewhere on the internet that ATI cards don't support luminance yet. Does anyone know if this is true?
This has nothing to do with luminance values; the problem is with you reading floating point values.
In order to read floating-point data back properly via glReadPixels, you first need to set the color clamping mode. Since you're obviously not using OpenGL 3.0+, you should be looking at the ARB_color_buffer_float extension. In that extension is glClampColorARB, which works pretty much like the core 3.0 verison.
here's what I found out:
1) If you use GL_LUMINANCE as texture format (and GL_LUMINANCE_FLOAT32_ATI GL_LUMINANCE32F_ARB or GL_RGBA_FLOAT32_ATI as internal format), the glClampColor(..) (or glClampColorARB(..)) doesn't seem to work at all.
I was only able to see the values getting actively clamped/not clamped if I set the texture format to GL_RGBA. I don't understand why this happens, since the only glClampColor(..) limitation I heard of is that it works exclusively with floating-point buffers, which all chosen internal formats seems to be.
2) If you use GL_LUMINANCE (again, with GL_LUMINANCE_FLOAT32_ATI, GL_LUMINANCE32F_ARB or GL_RGBA_FLOAT32_ATI as internal format), it looks like you must "correct" your output buffer dividing each of its elements by 3. I guess this happens because when you use glTexImage2D(..) with GL_LUMINANCE it internally replicates each array component three times and when you read GL_LUMINANCE values with glReadPixel(..) it calculates its values from the sum of the RGB components (thus, three times what you have given as input). But again, it stills give you clamped values.
3) Finally, if you use GL_RED as texture format (instead of GL_LUMINANCE), you don't need to pack your input buffer and you get your output buffer properly. The values are not clamped and you don't need to call glClampColor(..) at all.
So, I guess I'll stick with GL_RED, because in the end what I wanted was an easy way to send and collect floating-point values from my "kernels" without having to worry about offsetting array indexes or anything like this.