Related
I'm hoping to create a simple computer vision library in C++/CUDA C++ that allows me to do the following:
Grab some RGB data from the host memory. This data will come in a BGR byte array, 8 bits per channel per pixel.
Process that data in a CUDA kernel.
Write the output of that kernel back into some host memory.
Render the output in an OpenGL texture for easy viewing.
These functions would go inside a class like so:
class Processor{
public:
    // Supplies one frame of BGR input (8 bits per channel) with the given dimensions.
    void setInput(const byte* data, int imageWidth, int imageHeight);
    // Runs the CUDA kernel on the current input.
    void processData();
    // OpenGL textures showing the input and the kernel's output.
    GLuint getInputTexture();
    GLuint getOutputTexture();
    // Copies the kernel's output back into host memory.
    void writeOutputTo(byte* destination);
};
setInput() is going to be called with every frame of a video (hundreds or thousands of images of the same dimensions).
How can I write the Processor class so that setInput() can efficiently update an instance's internal CUDA array and processData() can synchronize the CUDA array with the OpenGL texture?
Below is my attempt at implementing such a class, contained in one CUDA C++ file along with a simple test. (Requires GLFW and GLAD.) With this implementation, I can provide some input image data, run a CUDA kernel that produces an output image, and visualize both with OpenGL textures. But it's extremely inefficient because every time setInput() is called, two OpenGL textures and two CUDA surface objects need to be created. And if more than one image is processed, two OpenGL textures and two CUDA surface objects also have to be destroyed.
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
/**
 * Macro for checking if CUDA has problems.
 *
 * Fetches (and clears) the last CUDA runtime error; if there is one, prints it
 * together with the file and line of the check and terminates the program.
 * The body is wrapped in do { } while (0) so that `cudaCheckError();` behaves
 * as a single statement, even inside an unbraced if/else.
 */
#define cudaCheckError() do { \
        cudaError_t cudaCheckError_err_ = cudaGetLastError(); \
        if (cudaCheckError_err_ != cudaSuccess) { \
            printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaCheckError_err_)); \
            exit(1); \
        } \
    } while (0)
/* Size of the GLFW window, in pixels */
const int windowWidth = 1280;
const int windowHeight = 720;
/* Handle of the GLFW window; stays null until initGL() has created it */
GLFWwindow* currentGLFWWindow = nullptr;
/**
 * Image-processing kernel: reads every pixel of the input surface, inverts its
 * colour channels and stores the result in the output surface.
 *
 * Each thread handles the single pixel at its global (x, y) index; threads that
 * fall outside width x height do nothing. Both surfaces are accessed as uchar4
 * pixels, and the alpha channel is passed through unchanged.
 */
__global__ void kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height) {
    // Global pixel coordinates of this thread
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Only threads that map onto a real pixel do any work
    if (col < width && row < height) {
        // Surface x-coordinates are given in bytes, not in pixels
        const size_t xBytes = col * sizeof(uchar4);
        // Flat orange fallback; overwritten by the surface read below
        uchar4 colour = { 255,128,0,255 };
        surf2Dread<uchar4>(&colour, input, xBytes, row, cudaBoundaryModeClamp);
        // Invert red, green and blue
        colour.x = ~colour.x;
        colour.y = ~colour.y;
        colour.z = ~colour.z;
        // Store the inverted pixel in the output surface
        surf2Dwrite(colour, output, xBytes, row);
    }
}
/**
 * @brief Owns a pair of OpenGL textures (input and output image) together with the
 *        CUDA graphics resources and surface objects that give kernels access to them.
 *
 * Every member has a well-defined initial value so that setInput() can safely
 * compare the stored dimensions on its very first call.
 */
class Processor {
public:
    /** Uploads a BGR image (3 bytes per pixel) and (re)creates the texture/surface pairs. */
    void setInput( uint8_t* const data, int imageWidth, int imageHeight);
    /** Runs the kernel that turns the input surface into the output surface. */
    void processData();
    /** OpenGL name of the input texture (0 if none exists). */
    GLuint getInputTexture();
    /** OpenGL name of the output texture (0 if none exists). */
    GLuint getOutputTexture();
    /** Not implemented yet. */
    void writeOutputTo(uint8_t* destination);
private:
    /**
     * @brief True if the textures and surfaces are initialized.
     *
     * Prevents memory leaks
     */
    bool surfacesInitialized = false;
    /**
     * @brief The width and height of a texture/surface pair.
     *
     * Defaults to 0 x 0, i.e. "no image yet".
     */
    struct ImgDim { int width = 0; int height = 0; };
    /**
     * @brief Creates a CUDA surface object, CUDA resource, and OpenGL texture from some data.
     */
    void createTextureSurfacePair(const ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut);
    /**
     * @brief Destroys every CUDA surface object, CUDA resource, and OpenGL texture created by this instance.
     */
    void destroyEverything();
    /**
     * @brief The dimensions of an image and its corresponding texture.
     */
    ImgDim imageInputDimensions, imageOutputDimensions;
    /**
     * @brief CUDA surface objects that kernels read from and write to; each one is
     *        backed by the array of the corresponding OpenGL texture.
     */
    cudaSurfaceObject_t d_imageInputTexture = 0, d_imageOutputTexture = 0;
    /**
     * @brief The CUDA graphics resources obtained by registering the OpenGL textures.
     */
    cudaGraphicsResource_t d_imageInputGraphicsResource = nullptr, d_imageOutputGraphicsResource = nullptr;
    /**
     * @brief A renderable OpenGL texture that is synchronized with the CUDA data
     * @see d_imageInputTexture, d_imageOutputTexture
     */
    GLuint imageInputTexture = 0, imageOutputTexture = 0;
    /** Returns true if nothing can be rendered */
    bool empty() { return imageInputTexture == 0; }
};
/**
 * Replaces the current input image.
 *
 * @param data        Host pointer to tightly packed BGR pixels, 3 bytes per pixel.
 * @param imageWidth  Image width in pixels (must be positive).
 * @param imageHeight Image height in pixels (must be positive).
 *
 * Destroys any existing texture/surface pairs and creates new ones for the
 * input and for an equally sized, zero-filled output.
 */
void Processor::setInput(uint8_t* const data, int imageWidth, int imageHeight)
{
    //Reject sizes that cannot describe an image before touching any resource.
    if (imageWidth <= 0 || imageHeight <= 0) {
        std::cerr << "Processor::setInput: invalid image size " << imageWidth << "x" << imageHeight << std::endl;
        return;
    }
    //Same-size images don't need texture regeneration, so skip that.
    //The stored dimensions are only meaningful once the surfaces exist.
    if (surfacesInitialized && imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
        //Possible shortcut: we know the input is the same size as the texture and CUDA surface object.
        //So instead of destroying the surface and texture, why not just overwrite them?
        //That's what I try to do in the following block, but because "data" is BGR and the texture
        //is RGBA, the channels get all messed up.
        //
        ////Use the input surface's CUDAResourceDesc to gain access to the surface data array
        //struct cudaResourceDesc resDesc;
        //memset(&resDesc, 0, sizeof(resDesc));
        //cudaGetSurfaceObjectResourceDesc(&resDesc, d_imageInputTexture);
        //cudaCheckError();
        ////Copy the data from the input array to the surface
        //cudaMemcpyToArray(resDesc.res.array.array, 0, 0, input.data, imageInputDimensions.width * imageInputDimensions.height * 3, cudaMemcpyHostToDevice);
        //cudaCheckError();
        ////Set status flags
        //surfacesInitialized = true;
        //return;
    }
    //Clear everything that originally existed in the texture/surface
    destroyEverything();
    //Get the size of the image and place it here.
    imageInputDimensions.width = imageWidth;
    imageInputDimensions.height = imageHeight;
    imageOutputDimensions.width = imageWidth;
    imageOutputDimensions.height = imageHeight;
    //Create the input surface/texture pair
    createTextureSurfacePair(imageInputDimensions, data, imageInputTexture, d_imageInputGraphicsResource, d_imageInputTexture);
    //Create the output surface/texture pair from a zero-filled (black) placeholder image.
    //The size is computed in size_t so that large images cannot overflow int.
    const size_t outBytes = static_cast<size_t>(imageOutputDimensions.width) * imageOutputDimensions.height * 3;
    uint8_t* outData = new uint8_t[outBytes]();
    createTextureSurfacePair(imageOutputDimensions, outData, imageOutputTexture, d_imageOutputGraphicsResource, d_imageOutputTexture);
    //Memory from new[] must be released with delete[]
    delete[] outData;
    //Set status flags
    surfacesInitialized = true;
}
void Processor::processData()
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock), imageInputDimensions.height);
kernel <<<blocks, threadsPerBlock >>> (d_imageInputTexture, d_imageOutputTexture, imageInputDimensions.width, imageInputDimensions.height);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
/** Returns the OpenGL name of the texture holding the input image. */
GLuint Processor::getInputTexture()
{
    const GLuint inputHandle = imageInputTexture;
    return inputHandle;
}
/** Returns the OpenGL name of the texture holding the processed image. */
GLuint Processor::getOutputTexture()
{
    const GLuint outputHandle = imageOutputTexture;
    return outputHandle;
}
/**
 * Placeholder: presumably meant to copy the processed image into the host
 * buffer at `destination`. The body is empty, so calling it currently has no
 * effect and `destination` is left untouched.
 */
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
/**
 * Creates an RGBA OpenGL texture from BGR host data, registers and maps it with
 * CUDA, and creates a CUDA surface object on top of the texture's array.
 *
 * @param dimensions          Width and height of the image in pixels.
 * @param data                Host pointer to tightly packed BGR pixels (3 bytes each).
 * @param textureOut          Receives the name of the new OpenGL texture.
 * @param graphicsResourceOut Receives the CUDA graphics resource of the texture.
 *                            It is left mapped when this function returns.
 * @param surfaceOut          Receives the CUDA surface object bound to the texture.
 */
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut) {
    // Create the OpenGL texture that will be displayed with GLAD and GLFW
    glGenTextures(1, &textureOut);
    // Bind to our texture handle
    glBindTexture(GL_TEXTURE_2D, textureOut);
    // Set texture interpolation methods for minification and magnification
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    // Set texture clamping method
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
    // The source rows are tightly packed 3-byte pixels, so a row is only 4-byte aligned
    // when the width is a multiple of 4. Read with 1-byte row alignment, otherwise
    // OpenGL skews the image and reads past the end of the buffer for other widths.
    GLint previousUnpackAlignment = 4;
    glGetIntegerv(GL_UNPACK_ALIGNMENT, &previousUnpackAlignment);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    // Create the texture and its attributes
    glTexImage2D(GL_TEXTURE_2D, // Type of texture
                 0,                 // Pyramid level (for mip-mapping) - 0 is the top level
                 GL_RGBA,           // Internal color format to convert to
                 dimensions.width,  // Image width i.e. 640 for Kinect in standard mode
                 dimensions.height, // Image height i.e. 480 for Kinect in standard mode
                 0,                 // Border width in pixels (can either be 1 or 0)
                 GL_BGR,            // Input image format (i.e. GL_RGB, GL_RGBA, GL_BGR etc.)
                 GL_UNSIGNED_BYTE,  // Image data type.
                 data);             // The actual image data itself
    // Put the pixel-store state back the way the caller had it
    glPixelStorei(GL_UNPACK_ALIGNMENT, previousUnpackAlignment);
    //Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
    //are synchronized with OpenGL textures, the surfaces will be of the same type.
    //They won't know or care about their data types though, for they are all just byte arrays
    //at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
    //uses it as an appropriate type. The kernel in this file treats each pixel as four
    //unsigned bytes along the X-axis: one for red, green, blue, and alpha respectively.
    //The CUDA array that backs the texture
    cudaArray* bitmap_d = nullptr;
    //Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created,
    //and its address is placed in graphicsResourceOut. The SurfaceLoadStore flag announces that
    //the resource will be bound to a surface, which is exactly what happens below.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
    cudaGraphicsGLRegisterImage(&graphicsResourceOut, textureOut, GL_TEXTURE_2D,
                                cudaGraphicsRegisterFlagsSurfaceLoadStore);
    cudaCheckError();
    //Map graphics resources for access by CUDA.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
    cudaGraphicsMapResources(1, &graphicsResourceOut, 0);
    cudaCheckError();
    //Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
    cudaGraphicsSubResourceGetMappedArray(&bitmap_d, graphicsResourceOut, 0, 0);
    cudaCheckError();
    //Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
    //This one will tell CUDA how we want the surface to be configured.
    //Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
    struct cudaResourceDesc resDesc;
    //Clear it with 0s so that some flags aren't arbitrarily left at 1s
    memset(&resDesc, 0, sizeof(resDesc));
    //Set the resource type to be an array for convenient processing in the CUDA kernel.
    //List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
    resDesc.resType = cudaResourceTypeArray;
    //Bind the new descriptor with the bitmap created earlier.
    resDesc.res.array.array = bitmap_d;
    //Start from a null surface handle.
    //This is really just an unsigned long long.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
    surfaceOut = 0;
    //Create the surface with the given description. That surface ID is placed in surfaceOut.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
    cudaCreateSurfaceObject(&surfaceOut, &resDesc);
    cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1, &d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1, &d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1, &imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1, &imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL() {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
//glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth, windowHeight, "Output image (OpenGL + GLFW)", NULL, NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0, 0, windowWidth, windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0, windowWidth, windowHeight, 0.0, 0.0, 100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/**
 * Draws `top` across the upper half of the window and `bottom` across the lower
 * half, then swaps the GLFW buffers and polls for window events.
 */
void showTextures(GLuint top, GLuint bottom) {
    // Start from a clean frame
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
    const GLuint bands[2] = { top, bottom };
    const int halfHeight = windowHeight / 2;
    for (int band = 0; band < 2; ++band) {
        // Vertical extent of this band: upper half first, lower half second
        const int yStart = band * halfHeight;
        const int yEnd = (band == 0) ? halfHeight : windowHeight;
        glBindTexture(GL_TEXTURE_2D, bands[band]);
        glEnable(GL_TEXTURE_2D);
        // One textured quad spanning the full window width
        glBegin(GL_QUADS);
        glTexCoord2i(0, 0); glVertex2i(0, yStart);
        glTexCoord2i(0, 1); glVertex2i(0, yEnd);
        glTexCoord2i(1, 1); glVertex2i(windowWidth, yEnd);
        glTexCoord2i(1, 0); glVertex2i(windowWidth, yStart);
        glEnd();
        glDisable(GL_TEXTURE_2D);
    }
    // Present the frame and let GLFW process pending events
    glfwSwapBuffers(currentGLFWWindow);
    glfwPollEvents();
}
int main() {
initGL();
int imageWidth = windowWidth;
int imageHeight = windowHeight / 2;
uint8_t* imageData = new uint8_t[imageWidth * imageHeight * 3];
Processor p;
while (!glfwWindowShouldClose(currentGLFWWindow))
{
//Process the image here
p.setInput(imageData, imageWidth, imageHeight);
p.processData();
showTextures(p.getInputTexture(), p.getOutputTexture());
}
}
TL;DR: I can see at least 2 ways forward here, either convert your data to 4 byte pixels (somehow) and use cudaMemcpy2DToArray, or allow the CUDA kernel to take in raw data (instead of using a surface as input). I'll try to demonstrate both, although I don't wish to put in a large effort at polishing this, so really just demonstrating ideas.
This answer is working off the code you provided in an edit which is not your latest. However in the subsequent edits, mainly you seem to be just ripping out OpenCV, which I would normally applaud. However, since I've worked off your edit that had OpenCV in it, I've elected to use an OpenCV "test case" of my own.
Using 4 byte-per-pixel data, and cudaMemcpy2DToArray: This seems to adhere most closely to what you have demonstrated, albeit commented-out. The idea is we will access the input data by copying it to the CUDA array (acquired from the interop mechanism) directly. As you had previously pointed out, cudaMemcpyToArray is deprecated, so we won't use that. Furthermore, our data format (bytes per pixel) has to match what is in the array. I think there are a number of ways to solve this, depending on your overall pipeline, but the approach I show here isn't efficient, it's just to demonstrate that the method is "workable". If there is a way to use 4 byte per pixel data in your pipeline, however, you may be able to get rid of the "inefficiency" here. To use this method, compile the code with the -DUSE_1 switch.
Input of the data through the kernel. We can skip the inefficiency of the first case by just allowing the kernel to do the 3-byte to 4-byte conversion of data on the fly. Either way, there is a copy of data from host to device, but this method doesn't require 4 byte per pixel input data.
Here is code demonstrating both options:
//nvcc -arch=sm_35 -o t19 glad/src/glad.c t19.cu -lGL -lGLU -I./glad/include -lglfw -std=c++11 -lopencv_core -lopencv_highgui -lopencv_imgcodecs -Wno-deprecated-gpu-targets
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cudaGL.h>
#include <cuda_gl_interop.h>
#include <iostream>
#include <opencv2/highgui.hpp>
/**
 * Macro for checking if CUDA has problems.
 *
 * Fetches (and clears) the last CUDA runtime error; if there is one, prints it
 * together with the file and line of the check and terminates the program.
 * The body is wrapped in do { } while (0) so that `cudaCheckError();` behaves
 * as a single statement, even inside an unbraced if/else.
 */
#define cudaCheckError() do { \
        cudaError_t cudaCheckError_err_ = cudaGetLastError(); \
        if (cudaCheckError_err_ != cudaSuccess) { \
            printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaCheckError_err_)); \
            exit(1); \
        } \
    } while (0)
/* Window dimensions are passed to initGL() and showTextures() as arguments */
//const int windowWidth = 1280, windowHeight = 720;
/* Handle of the GLFW window; stays null until initGL() has created it */
GLFWwindow* currentGLFWWindow = nullptr;
/**
 * A simple image processing kernel that writes the inverted input image to the
 * output surface.
 *
 * Each thread handles the single pixel at its global (x, y) index; threads
 * outside width x height do nothing. Both surfaces hold uchar4 pixels with
 * x = red, y = green, z = blue, w = alpha.
 *
 * With USE_1 defined the pixel is read from the input surface and `data` is unused.
 * Otherwise the pixel is taken from `data`, a device buffer of width*height tightly
 * packed 3-byte BGR pixels, converted to RGBA and also stored in the input surface
 * so that the input texture shows the current frame.
 */
__global__ void kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height, uint8_t *data) {
    //Get the pixel index
    unsigned int xPx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int yPx = threadIdx.y + blockIdx.y * blockDim.y;
    //Don't do any computation if this thread is outside of the surface bounds.
    if (xPx >= width || yPx >= height) return;
#ifdef USE_1
    uchar4 pixel = { 255,128,0,255 };
    //Read a pixel from the input. Disable to default to the flat orange color above
    surf2Dread<uchar4>(&pixel, input, xPx * sizeof(uchar4), yPx, cudaBoundaryModeClamp);
#else
    //Byte offset of this pixel in the packed buffer, computed in size_t to avoid 32-bit overflow
    const size_t base = (static_cast<size_t>(yPx) * width + xPx) * 3;
    uchar4 pixel;
    //The buffer is B,G,R but the surface is R,G,B,A: swap the first and third channel
    pixel.x = data[base + 2];
    pixel.y = data[base + 1];
    pixel.z = data[base + 0];
    pixel.w = 255;
    surf2Dwrite(pixel, input, xPx * sizeof(uchar4), yPx);
#endif
    //Invert the color
    pixel.x = ~pixel.x;
    pixel.y = ~pixel.y;
    pixel.z = ~pixel.z;
    //Write the new pixel color to the output surface
    surf2Dwrite(pixel, output, xPx * sizeof(uchar4), yPx);
}
/**
 * @brief Owns a pair of OpenGL textures (input and output image) together with the
 *        CUDA graphics resources and surface objects that give kernels access to them.
 *
 * Every member has a well-defined initial value so that setInput() can safely
 * compare the stored dimensions on its very first call.
 */
class Processor {
public:
    /** Uploads a BGR image (3 bytes per pixel); creates the texture/surface pairs when the size changes. */
    void setInput( uint8_t* const data, int imageWidth, int imageHeight);
    /** Runs the kernel; `data` is the host image and `d_data` a device buffer of the same size. */
    void processData(uint8_t *data, uint8_t *d_data);
    /** OpenGL name of the input texture (0 if none exists). */
    GLuint getInputTexture();
    /** OpenGL name of the output texture (0 if none exists). */
    GLuint getOutputTexture();
    /** Not implemented yet. */
    void writeOutputTo(uint8_t* destination);
private:
    /**
     * @brief True if the textures and surfaces are initialized.
     *
     * Prevents memory leaks
     */
    bool surfacesInitialized = false;
    /**
     * @brief The width and height of a texture/surface pair.
     *
     * Defaults to 0 x 0, i.e. "no image yet".
     */
    struct ImgDim { int width = 0; int height = 0; };
    /**
     * @brief Creates a CUDA surface object, CUDA resource, and OpenGL texture from some data.
     */
    void createTextureSurfacePair(const ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut);
    /**
     * @brief Destroys every CUDA surface object, CUDA resource, and OpenGL texture created by this instance.
     */
    void destroyEverything();
    /**
     * @brief The dimensions of an image and its corresponding texture.
     */
    ImgDim imageInputDimensions, imageOutputDimensions;
    /**
     * @brief CUDA surface objects that kernels read from and write to; each one is
     *        backed by the array of the corresponding OpenGL texture.
     */
    cudaSurfaceObject_t d_imageInputTexture = 0, d_imageOutputTexture = 0;
    /**
     * @brief The CUDA graphics resources obtained by registering the OpenGL textures.
     */
    cudaGraphicsResource_t d_imageInputGraphicsResource = nullptr, d_imageOutputGraphicsResource = nullptr;
    /**
     * @brief A renderable OpenGL texture that is synchronized with the CUDA data
     * @see d_imageInputTexture, d_imageOutputTexture
     */
    GLuint imageInputTexture = 0, imageOutputTexture = 0;
    /** Returns true if nothing can be rendered */
    bool empty() { return imageInputTexture == 0; }
};
/**
 * Supplies the next input frame.
 *
 * @param data        Host pointer to tightly packed BGR pixels, 3 bytes per pixel.
 * @param imageWidth  Image width in pixels (must be positive).
 * @param imageHeight Image height in pixels (must be positive).
 *
 * If surfaces of the same size already exist they are reused: with USE_1 the
 * frame is converted to RGBA on the host and copied into the input texture's
 * CUDA array; without USE_1 nothing is copied here because the kernel ingests
 * the raw data itself. Otherwise everything is destroyed and recreated for the
 * new size.
 */
void Processor::setInput(uint8_t* const data, int imageWidth, int imageHeight)
{
    //Reject sizes that cannot describe an image before touching any resource.
    if (imageWidth <= 0 || imageHeight <= 0) {
        std::cerr << "Processor::setInput: invalid image size " << imageWidth << "x" << imageHeight << std::endl;
        return;
    }
    //Same-size images don't need texture regeneration, so skip that.
    //The stored dimensions are only meaningful once the surfaces exist.
    if (surfacesInitialized && imageHeight == imageInputDimensions.height && imageWidth == imageInputDimensions.width) {
        //Shortcut: the input is the same size as the texture and CUDA surface object,
        //so instead of destroying the surface and texture, just overwrite them.
#ifdef USE_1
        //Use the input surface's CUDAResourceDesc to gain access to the surface data array
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        cudaGetSurfaceObjectResourceDesc(&resDesc, d_imageInputTexture);
        cudaCheckError();
        //Expand the 3-byte BGR pixels into the 4-byte RGBA layout of the texture.
        //The first and third channel are swapped so that colours match the texture.
        const size_t pixelCount = static_cast<size_t>(imageInputDimensions.width) * imageInputDimensions.height;
        uint8_t *data4 = new uint8_t[pixelCount * 4];
        for (size_t i = 0; i < pixelCount; i++) {
            data4[i*4+0] = data[i*3+2];
            data4[i*4+1] = data[i*3+1];
            data4[i*4+2] = data[i*3+0];
            data4[i*4+3] = 255;
        }
        //Copy the converted data to the array behind the surface (row pitch and row width are both width*4 bytes)
        const size_t rowBytes = static_cast<size_t>(imageInputDimensions.width) * 4;
        cudaMemcpy2DToArray(resDesc.res.array.array, 0, 0, data4, rowBytes, rowBytes, imageInputDimensions.height, cudaMemcpyHostToDevice);
        cudaCheckError();
        delete[] data4;
#endif
        return;
    }
    //Clear everything that originally existed in the texture/surface
    destroyEverything();
    //Get the size of the image and place it here.
    imageInputDimensions.width = imageWidth;
    imageInputDimensions.height = imageHeight;
    imageOutputDimensions.width = imageWidth;
    imageOutputDimensions.height = imageHeight;
    //Create the input surface/texture pair
    createTextureSurfacePair(imageInputDimensions, data, imageInputTexture, d_imageInputGraphicsResource, d_imageInputTexture);
    //Create the output surface/texture pair from a zero-filled (black) placeholder image.
    //The size is computed in size_t so that large images cannot overflow int.
    const size_t outBytes = static_cast<size_t>(imageOutputDimensions.width) * imageOutputDimensions.height * 3;
    uint8_t* outData = new uint8_t[outBytes]();
    createTextureSurfacePair(imageOutputDimensions, outData, imageOutputTexture, d_imageOutputGraphicsResource, d_imageOutputTexture);
    //Memory from new[] must be released with delete[]
    delete[] outData;
    //Set status flags
    surfacesInitialized = true;
}
void Processor::processData(uint8_t *data, uint8_t *d_data)
{
const int threadsPerBlock = 128;
//Call the algorithm
//Set the number of blocks to call the kernel with.
dim3 blocks((unsigned int)ceil((float)imageInputDimensions.width / threadsPerBlock), imageInputDimensions.height);
#ifndef USE_1
cudaMemcpy(d_data, data, imageInputDimensions.width*imageInputDimensions.height*3, cudaMemcpyHostToDevice);
#endif
kernel <<<blocks, threadsPerBlock >>> (d_imageInputTexture, d_imageOutputTexture, imageInputDimensions.width, imageInputDimensions.height, d_data);
//Sync the surface with the texture
cudaDeviceSynchronize();
cudaCheckError();
}
/** Returns the OpenGL name of the texture holding the input image. */
GLuint Processor::getInputTexture()
{
    const GLuint inputHandle = imageInputTexture;
    return inputHandle;
}
/** Returns the OpenGL name of the texture holding the processed image. */
GLuint Processor::getOutputTexture()
{
    const GLuint outputHandle = imageOutputTexture;
    return outputHandle;
}
/**
 * Placeholder: presumably meant to copy the processed image into the host
 * buffer at `destination`. The body is empty, so calling it currently has no
 * effect and `destination` is left untouched.
 */
void Processor::writeOutputTo(uint8_t* destination)
{
//Haven't figured this out yet
}
/**
 * Creates an RGBA OpenGL texture from BGR host data, registers and maps it with
 * CUDA, and creates a CUDA surface object on top of the texture's array.
 *
 * @param dimensions          Width and height of the image in pixels.
 * @param data                Host pointer to tightly packed BGR pixels (3 bytes each).
 * @param textureOut          Receives the name of the new OpenGL texture.
 * @param graphicsResourceOut Receives the CUDA graphics resource of the texture.
 *                            It is left mapped when this function returns.
 * @param surfaceOut          Receives the CUDA surface object bound to the texture.
 */
void Processor::createTextureSurfacePair(const Processor::ImgDim& dimensions, uint8_t* const data, GLuint& textureOut, cudaGraphicsResource_t& graphicsResourceOut, cudaSurfaceObject_t& surfaceOut) {
    // Create the OpenGL texture that will be displayed with GLAD and GLFW
    glGenTextures(1, &textureOut);
    // Bind to our texture handle
    glBindTexture(GL_TEXTURE_2D, textureOut);
    // Set texture interpolation methods for minification and magnification
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    // Set texture clamping method
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
    // The source rows are tightly packed 3-byte pixels, so a row is only 4-byte aligned
    // when the width is a multiple of 4. Read with 1-byte row alignment, otherwise
    // OpenGL skews the image and reads past the end of the buffer for other widths.
    GLint previousUnpackAlignment = 4;
    glGetIntegerv(GL_UNPACK_ALIGNMENT, &previousUnpackAlignment);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    // Create the texture and its attributes
    glTexImage2D(GL_TEXTURE_2D, // Type of texture
                 0,                 // Pyramid level (for mip-mapping) - 0 is the top level
                 GL_RGBA,           // Internal color format to convert to
                 dimensions.width,  // Image width i.e. 640 for Kinect in standard mode
                 dimensions.height, // Image height i.e. 480 for Kinect in standard mode
                 0,                 // Border width in pixels (can either be 1 or 0)
                 GL_BGR,            // Input image format (i.e. GL_RGB, GL_RGBA, GL_BGR etc.)
                 GL_UNSIGNED_BYTE,  // Image data type.
                 data);             // The actual image data itself
    // Put the pixel-store state back the way the caller had it
    glPixelStorei(GL_UNPACK_ALIGNMENT, previousUnpackAlignment);
    //Note that the type of this texture is an RGBA UNSIGNED_BYTE type. When CUDA surfaces
    //are synchronized with OpenGL textures, the surfaces will be of the same type.
    //They won't know or care about their data types though, for they are all just byte arrays
    //at heart. So be careful to ensure that any CUDA kernel that handles a CUDA surface
    //uses it as an appropriate type. The kernel in this file treats each pixel as four
    //unsigned bytes along the X-axis: one for red, green, blue, and alpha respectively.
    //The CUDA array that backs the texture
    cudaArray* bitmap_d = nullptr;
    //Register the GL texture with the CUDA graphics library. A new cudaGraphicsResource is created,
    //and its address is placed in graphicsResourceOut. The SurfaceLoadStore flag announces that
    //the resource will be bound to a surface, which is exactly what happens below.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html#group__CUDART__OPENGL_1g80d12187ae7590807c7676697d9fe03d
    cudaGraphicsGLRegisterImage(&graphicsResourceOut, textureOut, GL_TEXTURE_2D,
                                cudaGraphicsRegisterFlagsSurfaceLoadStore);
    cudaCheckError();
    //Map graphics resources for access by CUDA.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1gad8fbe74d02adefb8e7efb4971ee6322
    cudaGraphicsMapResources(1, &graphicsResourceOut, 0);
    cudaCheckError();
    //Get the location of the array of pixels that was mapped by the previous function and place that address in bitmap_d
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP_1g0dd6b5f024dfdcff5c28a08ef9958031
    cudaGraphicsSubResourceGetMappedArray(&bitmap_d, graphicsResourceOut, 0, 0);
    cudaCheckError();
    //Create a CUDA resource descriptor. This is used to get and set attributes of CUDA resources.
    //This one will tell CUDA how we want the surface to be configured.
    //Documentation for the struct: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaResourceDesc.html#structcudaResourceDesc
    struct cudaResourceDesc resDesc;
    //Clear it with 0s so that some flags aren't arbitrarily left at 1s
    memset(&resDesc, 0, sizeof(resDesc));
    //Set the resource type to be an array for convenient processing in the CUDA kernel.
    //List of resTypes: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g067b774c0e639817a00a972c8e2c203c
    resDesc.resType = cudaResourceTypeArray;
    //Bind the new descriptor with the bitmap created earlier.
    resDesc.res.array.array = bitmap_d;
    //Start from a null surface handle.
    //This is really just an unsigned long long.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1gbe57cf2ccbe7f9d696f18808dd634c0a
    surfaceOut = 0;
    //Create the surface with the given description. That surface ID is placed in surfaceOut.
    //Documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__SURFACE__OBJECT.html#group__CUDART__SURFACE__OBJECT_1g958899474ab2c5f40d233b524d6c5a01
    cudaCreateSurfaceObject(&surfaceOut, &resDesc);
    cudaCheckError();
}
void Processor::destroyEverything()
{
if (surfacesInitialized) {
//Input image CUDA surface
cudaDestroySurfaceObject(d_imageInputTexture);
cudaGraphicsUnmapResources(1, &d_imageInputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageInputGraphicsResource);
d_imageInputTexture = 0;
//Output image CUDA surface
cudaDestroySurfaceObject(d_imageOutputTexture);
cudaGraphicsUnmapResources(1, &d_imageOutputGraphicsResource);
cudaGraphicsUnregisterResource(d_imageOutputGraphicsResource);
d_imageOutputTexture = 0;
//Input image GL texture
glDeleteTextures(1, &imageInputTexture);
imageInputTexture = 0;
//Output image GL texture
glDeleteTextures(1, &imageOutputTexture);
imageOutputTexture = 0;
surfacesInitialized = false;
}
}
/** A way to initialize OpenGL with GLFW and GLAD */
void initGL(int windowWidth, int windowHeight) {
// Setup window
if (!glfwInit())
return;
// Decide GL+GLSL versions
#if __APPLE__
// GL 3.2 + GLSL 150
const char* glsl_version = "#version 150";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // Required on Mac
#else
// GL 3.0 + GLSL 130
//const char* glsl_version = "#version 130";
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
//glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); // 3.2+ only
//glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // 3.0+ only
#endif
// Create window with graphics context
currentGLFWWindow = glfwCreateWindow(windowWidth, windowHeight, "Output image (OpenGL + GLFW)", NULL, NULL);
if (currentGLFWWindow == NULL)
return;
glfwMakeContextCurrent(currentGLFWWindow);
glfwSwapInterval(3); // Enable vsync
if (!gladLoadGL()) {
// GLAD failed
printf( "GLAD failed to initialize :(" );
return;
}
//Change GL settings
glViewport(0, 0, windowWidth, windowHeight); // use a screen size of WIDTH x HEIGHT
glMatrixMode(GL_PROJECTION); // Make a simple 2D projection on the entire window
glLoadIdentity();
glOrtho(0.0, windowWidth, windowHeight, 0.0, 0.0, 100.0);
glMatrixMode(GL_MODELVIEW); // Set the matrix mode to object modeling
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClearDepth(0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear the window
}
/**
 * Draws `top` across the upper half of a windowWidth x windowHeight window and
 * `bottom` across the lower half, then swaps the GLFW buffers and polls for
 * window events.
 */
void showTextures(GLuint top, GLuint bottom, int windowWidth, int windowHeight) {
    // Start from a clean frame
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glMatrixMode(GL_MODELVIEW); // Operate on model-view matrix
    const GLuint bands[2] = { top, bottom };
    const int halfHeight = windowHeight / 2;
    for (int band = 0; band < 2; ++band) {
        // Vertical extent of this band: upper half first, lower half second
        const int yStart = band * halfHeight;
        const int yEnd = (band == 0) ? halfHeight : windowHeight;
        glBindTexture(GL_TEXTURE_2D, bands[band]);
        glEnable(GL_TEXTURE_2D);
        // One textured quad spanning the full window width
        glBegin(GL_QUADS);
        glTexCoord2i(0, 0); glVertex2i(0, yStart);
        glTexCoord2i(0, 1); glVertex2i(0, yEnd);
        glTexCoord2i(1, 1); glVertex2i(windowWidth, yEnd);
        glTexCoord2i(1, 0); glVertex2i(windowWidth, yStart);
        glEnd();
        glDisable(GL_TEXTURE_2D);
    }
    // Present the frame and let GLFW process pending events
    glfwSwapBuffers(currentGLFWWindow);
    glfwPollEvents();
}
int main() {
using namespace cv;
using namespace std;
// initGL();
std::string filename = "./lena.pgm";
Mat image;
image = imread(filename, CV_LOAD_IMAGE_COLOR); // Read the file
if(! image.data ) // Check for invalid input
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
int windoww = 1280;
int windowh = 720;
initGL(windoww,windowh);
uint8_t *d_data;
cudaMalloc(&d_data, image.cols*image.rows*3);
Processor p;
for (int i = 0; i < image.cols; i++)
{
image.data[i*3+0] = 0;
image.data[i*3+1] = 0;
image.data[i*3+2] = 0;
//Process the image here
p.setInput(image.data, image.cols, image.rows);
p.processData(image.data, d_data);
showTextures(p.getInputTexture(), p.getOutputTexture(), windoww, windowh);
}
}
Notes:
The compilation command is given in the comment in the first line
I created a "video" of sorts using a single image. The "video" will show the image with a black or white line moving horizontally from left to right in the top pixel row of the image. The input image is lena.pgm which can be found in the CUDA samples (for example, at /usr/local/cuda-10.1/samples/3_Imaging/SobelFilter/data/lena.pgm).
It looks to me like you are "sharing" resources between OpenGL and CUDA. This doesn't look like the right map/unmap sequence to me, but it seems to be working, and it doesn't seem to be the focus of your question. I haven't spent any time investigating. I may have missed something.
I'm not suggesting this code is defect free or suitable for any particular purpose. It is mostly your code. I've modified it slightly to demonstrate some ideas described in the text.
There shouldn't be any visual difference in the output whether you compile with -DUSE_1 or not.
This is a useful feature that I first came across in (https://www.3dgep.com/opengl-interoperability-with-cuda/), and I have improved upon it to use the latest CUDA APIs and flow. You can refer to these 2 functions in cudammf.
https://github.com/prabindh/cudammf/blob/5f93358784fcbaae7eea0850424c59d2ed057dab/cuda_postproces.cu#L119
https://github.com/prabindh/cudammf/blob/5f93358784fcbaae7eea0850424c59d2ed057dab/decoder3.cpp#L507
Basic working is as below:
Create a regular GL texture (GLTextureId). Map it for CUDA access, via cudaGraphicsGLRegisterImage
Do some CUDA processing, and result is in a CUDA buffer
Use cudaMemcpyToArray to transfer between the above 2 device memories
If your output is coming from a Nvidia codec output, you should also refer to the AppDecGL sample in the Nvidia Video SDK (https://developer.nvidia.com/nvidia-video-codec-sdk).
I'm attempting to render to a texture for the purpose of shadow mapping in DirectX11. I've set up and bound a separate render target to draw to. Problem is, after calling OMSetRenderTargets it's still rendering to the previously bound render target.
The graphics diagnostics event list shows that OMSetRenderTargets is being called, setting "obj:30" as the render target view. However, the following DrawIndexed call shows the render target as "obj:17", which is the previously bound render target.
Event List
Draw Call
I have the DirectX debug layer enabled, however it does not show any errors or warning messages. I've also ensured that the texture is not bound as a shader resource when the draw call happens but no luck there either.
These are both called by the following function
// Draws every object in ActiveScene.Objects into RenderTarget using the given
// vertex and pixel shaders.
//
// The shared DepthStencilView is cleared (depth 1.0, stencil 0) and bound
// together with RenderTarget. For each object the per-object constant buffer
// is refreshed with its transform and normal matrix, its vertex/index buffers
// are bound, and an indexed draw is issued.
void GraphicsHandler::DrawSceneToRenderTarget(ID3D11RenderTargetView* RenderTarget, ID3D11VertexShader* WithVertexShader, ID3D11PixelShader* WithPixelShader)
{
    const unsigned int Stride = sizeof(Vertex);
    const unsigned int StartOffset = 0;

    // Pipeline state shared by all objects in this pass.
    DeviceContext->ClearDepthStencilView(DepthStencilView, D3D11_CLEAR_DEPTH | D3D11_CLEAR_STENCIL, 1.0f, 0.0f);
    DeviceContext->VSSetShader(WithVertexShader, nullptr, 0);
    DeviceContext->PSSetShader(WithPixelShader, nullptr, 0);
    DeviceContext->OMSetRenderTargets(1, &RenderTarget, DepthStencilView); //Render target set here

    for (auto& SceneObject : ActiveScene.Objects)
    {
        // Individual factors of the object transform; they are combined below
        // in the order rotation, translation, scaling.
        const auto Rotation = XMMatrixRotationRollPitchYaw(SceneObject->Rotator.X, SceneObject->Rotator.Y, SceneObject->Rotator.Z);
        const auto Translation = XMMatrixTranslation(SceneObject->Position.X, SceneObject->Position.Y, SceneObject->Position.Z);
        const auto Scaling = XMMatrixScaling(SceneObject->Scale.X, SceneObject->Scale.Y, SceneObject->Scale.Z);

        ObjectInfo PerObject;
        PerObject.ObjectTransform = XMMatrixIdentity();
        PerObject.ObjectTransform *= Rotation;
        PerObject.ObjectTransform *= Translation;
        PerObject.ObjectTransform *= Scaling;
        // Normal matrix is the transposed inverse of the object transform.
        PerObject.NormalMatrix = XMMatrixTranspose(XMMatrixInverse(nullptr, PerObject.ObjectTransform));

        // Upload per-object constants.
        DeviceContext->UpdateSubresource(ObjectBuffer, 0, nullptr, &PerObject, 0, 0);
        DeviceContext->UpdateSubresource(MaterialBuffer, 0, nullptr, &SceneObject->Mat, 0, 0);

        // Bind geometry (16-bit indices) and the object constant buffer.
        DeviceContext->IASetVertexBuffers(0, 1, &SceneObject->VertexBuffer, &Stride, &StartOffset);
        DeviceContext->IASetIndexBuffer(SceneObject->IndexBuffer, DXGI_FORMAT_R16_UINT, 0);
        DeviceContext->VSSetConstantBuffers(0, 1, &ObjectBuffer);
        //DeviceContext->PSSetConstantBuffers(0, 1, &MaterialBuffer);

        DeviceContext->DrawIndexed(SceneObject->Indices.size(), 0, 0); //Draw called here
    }
}
with the problematic calls to that being in the following two functions
// Renders the scene from the scene camera into SceneDepthRTV using the depth
// pixel shader.
//
// NOTE(review): SceneDepthSRV is bound as a pixel-shader resource immediately
// before rendering into SceneDepthRTV; if both views refer to the same
// texture this is a read/write hazard - confirm against where the views are
// created.
void GraphicsHandler::RenderSceneDepth()
{
    DeviceContext->RSSetState(RasterizerState);
    DeviceContext->PSSetShaderResources(0, 1, &SceneDepthSRV);

    // Bind the camera data to a named reference before taking its address.
    // Taking the address of the call expression directly is ill-formed if the
    // getter returns by value; a const reference works for either return kind
    // and keeps a temporary alive for the rest of this scope.
    const auto& CameraInfo = ActiveScene.SceneCamera.GetCameraVSInfo();
    DeviceContext->UpdateSubresource(CameraBuffer, 0, nullptr, &CameraInfo, 0, 0);
    DeviceContext->VSSetConstantBuffers(1, 1, &CameraBuffer);

    DeviceContext->ClearRenderTargetView(SceneDepthRTV, Colors::Black);
    DrawSceneToRenderTarget(SceneDepthRTV, VertexShader, DepthShader);
}
// Renders the depth of the scene as seen from the light that owns SM.
//
// If SM.ParentLight is a DirectionalLight, a single pass is rendered into
// SM.RTVs[0] with the light-POV camera facing the light direction. Otherwise
// six passes are rendered, one per cube-map face, into SM.RTVs[0..5].
void GraphicsHandler::RenderShadowMap(ShadowMap& SM)
{
    //Clear shader resources, as the texture can't be bound as input and output
    ID3D11ShaderResourceView* NullResources[2] = { nullptr, nullptr };
    DeviceContext->PSSetShaderResources(0, 2, NullResources);
    DeviceContext->RSSetState(SMRasterizerState); //Need to render back faces only
    ID3D11SamplerState* Samplers[2] = { SamplerState, ShadowSamplerState };
    DeviceContext->PSSetSamplers(0, 2, Samplers);

    //If the light is a directional source, render a directional shadow map
    DirectionalLight* DirLight = dynamic_cast<DirectionalLight*>(SM.ParentLight);
    if (DirLight)
    {
        ID3D11RenderTargetView* RTV = SM.RTVs[0];
        SM.LightPovCamera.ForwardDirection = DirLight->Direction;
        DeviceContext->ClearRenderTargetView(RTV, Colors::Black);

        // Named reference instead of taking the address of the call expression:
        // valid whether the getter returns a reference or a value.
        const auto& LightInfo = SM.LightPovCamera.GetCameraVSInfo();
        DeviceContext->UpdateSubresource(LightPovBuffer, 0, nullptr, &LightInfo, 0, 0);
        DeviceContext->VSSetConstantBuffers(1, 1, &LightPovBuffer);
        DrawSceneToRenderTarget(RTV, VertexShader, DepthShader);
    }
    //Otherwise, render to each face of the texturecube
    else
    {
        for (int N = 0; N < 6; N++)
        {
            DeviceContext->ClearRenderTargetView(SM.RTVs[N], Colors::Black);
            Camera POVCam = SM.GetCameraForCubemapFace(N);

            // Same address-of-temporary fix as in the directional branch.
            const auto& FaceInfo = POVCam.GetCameraVSInfo();
            DeviceContext->UpdateSubresource(LightPovBuffer, 0, nullptr, &FaceInfo, 0, 0);
            DeviceContext->VSSetConstantBuffers(1, 1, &LightPovBuffer);
            DrawSceneToRenderTarget(SM.RTVs[N], VertexShader, DepthShader);
        }
    }
}
Whoops, my mistake: the debug layer actually wasn't enabled. The error was caused by the render target having different dimensions to the depth stencil view. Apologies!
I am trying to draw a bunch of points on the screen. I'm using CUDA to generate the data (position and color), and OpenGL to draw it. I am trying to get CUDA to update a VBO and then OpenGL to draw it, but I get a blank screen. I am not sure if CUDA is not able to update the buffer, or that the buffer is not drawing properly. My GPU is a GTX 1080, and I'm trying to use OpenGL 4.0. Colors are specified by CUDA as well. If my problem is that I need a shader, how do I add that, but also still specify the color through CUDA?
UPDATE: The problem seems to be OpenGL. I updated the code to draw a single triangle, so here is a new question to add: why is my VBO not being rendered?
Here is the code:
GPUmain.cuh:
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <curand.h>
#include <GL/glew.h>
#include <SDL_opengl.h>
#include <cuda_gl_interop.h>
// Turns a byte offset `i` into a pointer value by adding it to a null char
// pointer; used below to pass an offset where a pointer argument is expected.
#define BUFFER_OFFSET(i) ((char *)NULL + (i))
// One vertex record: three position components followed by four colour
// components, in the order x, y, z, r, g, b, a.
struct ver {
    // position
    GLuint x;
    GLuint y;
    GLuint z;
    // colour (red, green, blue, alpha)
    GLubyte r;
    GLubyte g;
    GLubyte b;
    GLubyte a;
};
// Static-only holder for the particle state shared between CUDA and OpenGL.
class GPU {
public:
    // --- lifecycle ---
    static void init(int w, int h);   // records w*h as nParticles and creates the VBO
    static void free();               // releases the CUDA/GL resources and vertex storage

    // --- per-frame work ---
    static void compute();
    static void render();
    static void GPUmain();            // compute() followed by render()

    // --- shared state ---
    static int nParticles;                     // set to w*h by init()
    static GLuint vboid;                       // OpenGL vertex buffer object name
    static cudaGraphicsResource *CGR;          // CUDA graphics-resource handle
    static thrust::device_vector<ver> rverts;  // collection of vertices to be simulated and rendered
};
GPUmain.cu:
#include "GPUmain.cuh"
// Copies one vertex per thread from vv to vb.
//
// Each thread handles the element at its flat 1D global index. No bounds
// check is performed, so the launch must not cover more threads than both
// arrays have elements.
//
// vv: source vertices
// vb: destination vertices
__global__ void uploadVerts(ver *vv, ver *vb) {
    const int id = threadIdx.x + (blockDim.x * blockIdx.x);
    // A whole-struct assignment already copies all seven fields; the former
    // field-by-field stores that followed it only repeated the same writes.
    vb[id] = vv[id];
}
// Fills v with an opaque red grid of points, one vertex per thread.
//
// The flat 1D global thread index i is mapped to a row-major grid cell
// (x = i % width, y = i / width) at z = 1. No bounds check is performed, so
// the launch must not cover more threads than v has elements.
//
// NOTE(review): the row width is fixed at 1080; the original used 1080 for the
// column but 1920 for the row, which cannot describe one grid. 1080 is chosen
// because the sample triangle coordinates in this file (x up to 1024, y up to
// 1674) fit a 1080-wide, 1920-tall area - confirm against the real window size.
__global__ void genGrid(ver *v) {
    const int gridWidth = 1080;
    const int i = threadIdx.x + (blockDim.x * blockIdx.x);

    // Integer arithmetic throughout: the fields are unsigned integers, so the
    // former round trip through float added nothing.
    v[i].x = i % gridWidth;
    v[i].y = i / gridWidth;
    v[i].z = 1;

    v[i].r = 255;
    v[i].g = 0;
    v[i].b = 0;
    // Fully opaque: with GL_SRC_ALPHA / GL_ONE_MINUS_SRC_ALPHA blending an
    // alpha of 0 would make every point invisible.
    v[i].a = 255;
}
// Out-of-class definitions of the static members declared in class GPU.
int GPU::nParticles;
GLuint GPU::vboid;
cudaGraphicsResource *GPU::CGR;
//collection of vertices to be simulated and rendered
// NOTE(review): this vector has static storage duration, so it is constructed
// before main() and destroyed after it - confirm that is safe with respect to
// CUDA runtime start-up/tear-down (GPU::free() empties it explicitly).
thrust::device_vector<ver> GPU::rverts;
void GPU::init(int w, int h)
{
nParticles = w * h;
/*rverts.resize(nParticles, ver{0,0,0,0,0,0,0});
genGrid<<<nParticles/1024,1024>>>(thrust::raw_pointer_cast(&rverts[0]));*/
ver e[3] = {
ver{1024,200,2,255,0,0,255},
ver{499,288,173,0,255,0,255},
ver{462,1674,8,0,0,255,255}
};
glGenBuffers(1,&vboid);
glBindBuffer(GL_ARRAY_BUFFER,vboid);
glBufferData(GL_ARRAY_BUFFER,3*sizeof(ver),e,GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
/*cudaGraphicsGLRegisterBuffer(&CGR,vboid,cudaGraphicsMapFlagsWriteDiscard);*/
}
// Per-frame simulation step. Currently empty: no work is performed.
void GPU::compute()
{
}
void GPU::render()
{
/*ver *verts;
size_t size;
cudaGraphicsMapResources(1, &CGR, 0);
cudaGraphicsResourceGetMappedPointer((void**)&verts, &size, CGR);
uploadVerts<<<nParticles/1024, 1024>>>(thrust::raw_pointer_cast(&rverts[0]), verts);
cudaGraphicsUnmapResources(1, &CGR, 0);
cudaDeviceSynchronize();*/
glClearColor(0, 0, 0, 0); // we clear the screen with black (else, frames would overlay...)
glClear(GL_COLOR_BUFFER_BIT); // clear the buffer
glBindBuffer(GL_ARRAY_BUFFER, vboid);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_COLOR_ARRAY);
glVertexPointer(3, GL_INT, 4 * sizeof(GLubyte), 0);
glColorPointer(4, GL_BYTE, 3 * sizeof(GLuint), BUFFER_OFFSET(3 * sizeof(GLuint)));
glDrawArrays(GL_TRIANGLES, 0, 3);
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_COLOR_ARRAY);
glBindBuffer(GL_ARRAY_BUFFER, 0);
}
// Runs one frame: the compute step followed by the render step.
void GPU::GPUmain()
{
compute();
render();
}
// Releases the CUDA graphics resource (if one was registered), deletes the
// VBO, and frees the device vertex storage. Safe to call more than once.
void GPU::free()
{
    // Only unregister a resource that was actually registered: CGR stays null
    // while the cudaGraphicsGLRegisterBuffer call in init() is disabled, and
    // unregistering a null handle is an error.
    if (CGR != nullptr) {
        cudaGraphicsUnregisterResource(CGR);
        CGR = nullptr;
    }

    // Delete the buffer and forget its name so a stale id is never reused.
    glDeleteBuffers(1, &vboid);
    vboid = 0;
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // Swapping with an empty vector releases the device allocation, not just
    // the elements.
    thrust::device_vector<ver>().swap(rverts);
}
The relevant (that contain OpenGL code) parts of window.cpp:
bool Window::init()
{
//initialize SDL
if (SDL_Init(SDL_INIT_EVERYTHING) != 0) {
log << "Failed to initialize SDL!\n";
return false;
}
//set window atributes
SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 0);
SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 8);
SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
//creat window
window = SDL_CreateWindow(
name.c_str(),
SDL_WINDOWPOS_CENTERED,
SDL_WINDOWPOS_CENTERED,
width,
height,
SDL_WINDOW_OPENGL
);
//create opengl context in the window
glcontext = SDL_GL_CreateContext(window);
SDL_GL_SetSwapInterval(1);
//check if the window was created
if (window == nullptr) {
log << "Failed to create window!\n";
return false;
}
//turn on experimental features
glewExperimental = GL_TRUE;
//initiallize glew
if (glewInit() != GLEW_OK) {
log << "Failed to Init GLEW";
return false;
}
//set drawing parameters
glViewport(0, 0, width, height);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0, width, 0, height, 0, 255);
glPointSize(1);
glEnable(GL_BLEND); // Allow Transparency
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); // how transparency acts
std::cout << sizeof(ver);
GPU::init(width, height);
return true;
}
// Draws one frame via GPU::render() and presents it by swapping the window's
// buffers.
void Window::renderFrame()
{
GPU::render();
SDL_GL_SwapWindow(window); //swap buffers
}
If you use the fixed-function attributes and client-side capabilities, then you have to use a compatibility profile context.
See Fixed Function Pipeline and Legacy OpenGL.
If you want to use a core profile, then you have to use a Vertex Array Object and a Shader. Otherwise, change the profile mask from core to compatibility:
SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_COMPATIBILITY);
The following geometry
ver e[3] = {
// x y z r g b a
ver{1024, 200, 2, 255, 0, 0, 255},
ver{ 499, 288, 173, 0, 255, 0, 255},
ver{462, 1674, 8, 0, 0, 255, 255}
};
is clipped by the near plane of the orthographic projection. Note, in view space the z-axis points out of the viewport.
Change the orthographic projection (or invert the z coordinates of the geometry):
glOrtho(0, width, 0, height, 0, 255);
glOrtho(0, width, 0, height, -255, 0);
The stride parameter of glVertexPointer respectively glColorPointer is the offset between consecutive attributes. So it has to be sizeof(ver).
The type of the color attributes is GL_UNSIGNED_BYTE rather than GL_BYTE:
glVertexPointer(3, GL_INT, 4 * sizeof(GLubyte), 0);
glColorPointer(4, GL_BYTE, 3 * sizeof(GLuint), BUFFER_OFFSET(3 * sizeof(GLuint)));
glVertexPointer(3, GL_INT, sizeof(ver), 0);
glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(ver), BUFFER_OFFSET(3 * sizeof(GLuint)));
I have a WinForms application with a panel (500x500 pixels) that I want to render something in. At this point I am just trying to fill it in with a specific color. I want to use OpenGL/CUDA interop to do this.
I got the panel configured to be the region to render stuff in, however when I run my code, the panel just gets filled with the glClear(..) color, and nothing assigned by the kernel is displayed. It sort of worked this morning (inconsistently), and in my attempt to sort out the SwapBuffers() mess, I think I screwed it up.
Here is the pixel format initialization for OpenGL. It seems to work fine, I have the two buffers as I expected, and the context is correct:
// Describes the requested pixel format: double-buffered RGBA window surface
// with OpenGL support, 16-bit colour and a 16-bit depth buffer.
static PIXELFORMATDESCRIPTOR pfd=
{
sizeof(PIXELFORMATDESCRIPTOR), // Size Of This Pixel Format Descriptor
1, // Version Number
PFD_DRAW_TO_WINDOW | // Format Must Support Window
PFD_SUPPORT_OPENGL | // Format Must Support OpenGL
PFD_DOUBLEBUFFER, // Must Support Double Buffering
PFD_TYPE_RGBA, // Request An RGBA Format
16, // Select Our Color Depth
0, 0, 0, 0, 0, 0, // Color Bits Ignored
0, // No Alpha Buffer
0, // Shift Bit Ignored
0, // No Accumulation Buffer
0, 0, 0, 0, // Accumulation Bits Ignored
16, // 16Bit Z-Buffer (Depth Buffer)
0, // No Stencil Buffer
0, // No Auxiliary Buffer
PFD_MAIN_PLANE, // Main Drawing Layer
0, // Reserved
0, 0, 0 // Layer Masks Ignored
};
GLint iPixelFormat;
// get the device context's best, available pixel format match
// NOTE(review): the two pixel-format calls use `hdc` while context creation
// below uses `m_hDC` - confirm both name the same device context.
if((iPixelFormat = ChoosePixelFormat(hdc, &pfd)) == 0)
{
MessageBox::Show("ChoosePixelFormat Failed");
return 0;
}
// make that match the device context's current pixel format
if(SetPixelFormat(hdc, iPixelFormat, &pfd) == FALSE)
{
MessageBox::Show("SetPixelFormat Failed");
return 0;
}
// create the OpenGL rendering context for the device context
if((m_hglrc = wglCreateContext(m_hDC)) == NULL)
{
MessageBox::Show("wglCreateContext Failed");
return 0;
}
// make the new rendering context current on this thread
if((wglMakeCurrent(m_hDC, m_hglrc)) == NULL)
{
MessageBox::Show("wglMakeCurrent Failed");
return 0;
}
After this is done, I set up the ViewPort as such:
glViewport(0,0,iWidth,iHeight); // Reset The Current Viewport
glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix
glLoadIdentity(); // Reset The Modelview Matrix
glEnable(GL_DEPTH_TEST); // Turn On Depth Testing
Then I set up the clear color and do a clear:
glClearColor(1.0f, 0.0f, 0.0f, 1.0f); // opaque red
glClear(GL_COLOR_BUFFER_BIT| GL_DEPTH_BUFFER_BIT); // clear colour and depth buffers
Now I set up the CUDA/OpenGL interop:
// Pick a CUDA device matching compute capability 1.0 or better and select it
// for OpenGL interop.
cudaDeviceProp prop; int dev;
memset(&prop, 0, sizeof(cudaDeviceProp));
prop.major = 1; prop.minor = 0;
checkCudaErrors(cudaChooseDevice(&dev, &prop));
checkCudaErrors(cudaGLSetGLDevice(dev));
// Load the buffer-object entry points at run time.
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
GLuint bufferID;
cudaGraphicsResource * resourceID;
// Create a pixel-unpack buffer of fWidth*fHeight*4 bytes (no initial data)
// and register it with CUDA. It is left bound to GL_PIXEL_UNPACK_BUFFER_ARB.
glGenBuffers(1, &bufferID);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferID);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, fWidth*fHeight*4, NULL, GL_DYNAMIC_DRAW_ARB);
checkCudaErrors(cudaGraphicsGLRegisterBuffer( &resourceID, bufferID, cudaGraphicsMapFlagsNone ));
Now I try to call my kernel (which just paints each pixel a specific color) and have that displayed.
uchar4* devPtr;
size_t size;
// First clear the back buffer:
glClearColor(1.0f, 0.5f, 0.0f, 0.0f); // orange
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// Map the shared buffer for CUDA, let the kernel write into it, then unmap it
// so OpenGL may use it again.
checkCudaErrors(cudaGraphicsMapResources(1, &resourceID, NULL));
checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resourceID));
animate(devPtr); // This will call the kernel and do a sync (see later)
checkCudaErrors(cudaGraphicsUnmapResources(1, &resourceID, NULL));
// NOTE(review): no OpenGL call between the unmap and the swap reads from the
// shared buffer, so only the clear colour can reach the screen here.
// Swap buffers to bring back buffer forward:
SwapBuffers(m_hDC);
At this point I expect to see the kernel colors on the screen, but no! I see orange, which is the clear color that I just set.
Here is the call to the kernel:
// Launches animKernel on dispPtr and blocks until it has finished.
//
// dispPtr: device pointer to the pixel buffer the kernel writes into.
// The launch configuration (`blocks`, `threads`) and the image dimensions
// (`envdim`) are taken from the enclosing scope.
void animate(uchar4* dispPtr)
{
    // Make sure earlier device work is complete before launching.
    checkCudaErrors(cudaDeviceSynchronize());
    animKernel<<<blocks, threads>>>(dispPtr, envdim);
    // A kernel launch returns no status itself; an invalid launch
    // configuration is only reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel and surface any error raised while it ran.
    checkCudaErrors(cudaDeviceSynchronize());
}
Here envdim is just the dimensions (so 500x500). The kernel itself:
// Sets the x, y and z components of every pixel inside the image to 0
// (black); the w component is left untouched.
//
// Expects a 2D launch: each thread handles the pixel at its global (x, y)
// index, and threads that fall outside matdim.x by matdim.y do nothing.
//
// optr:   pixel buffer, indexed row-major with a row length of matdim.x
// matdim: image dimensions (x = width, y = height)
__global__ void animKernel(uchar4 *optr, dim3 matdim)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    const int pixelIndex = col + row * matdim.x;

    // Threads past the right or bottom edge have no pixel to write.
    if (col >= matdim.x || row >= matdim.y)
    {
        return;
    }

    // BLACK:
    optr[pixelIndex].x = 0;
    optr[pixelIndex].y = 0;
    optr[pixelIndex].z = 0;
}
Things I've done:
The value returned by cudaGraphicsResourceGetMappedPointer's size is 1000000, which corresponds to the 500x500 matrix of uchar4, so that's good.
Each kernel printed the value and location that it was writing to, and that seemed ok.
Played with the alpha value for the clear color, but that doesn't seem to do anything (yet?)
Ran the animate() function several times. Don't know why I thought that would help, but I tried it...
So I guess I'm missing something, but I'm going kind of crazy looking for it. Any advice? Help?
It's another one of those questions I answer myself! Hmph, as I figured, it was a one line issue. The problem resides in the rendering call itself.
The configuration is fine, the one issue I have with the code above is:
I never called glDrawPixels(), which is necessary in order for the OpenGL driver to copy the shared buffer (GL_PIXEL_UNPACK_BUFFER_ARB) source to the display buffer. The correct rendering sequence is then:
uchar4* devPtr;
size_t size;
// First clear the back buffer:
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// Map the shared buffer for CUDA, let the kernel write into it, then unmap it
// so OpenGL may use it again.
checkCudaErrors(cudaGraphicsMapResources(1, &resourceID, NULL));
checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resourceID));
animate(devPtr); // This will call the kernel and do a sync (see later)
checkCudaErrors(cudaGraphicsUnmapResources(1, &resourceID, NULL));
// This is necessary to copy the shared buffer to display
// (the final argument 0 is an offset into the bound unpack buffer, not a
// client-memory pointer)
glDrawPixels(fWidth, fHeight, GL_RGBA, GL_UNSIGNED_BYTE, 0);
// Swap buffers to bring back buffer forward:
SwapBuffers(m_hDC);
I'd like to thank the Acade-- uh, CUDA By Example, once again for helping me. Even though the example code from the book used GLUT (which was completely useless for this...), the book referenced normal gl functions.
I'm taking the first step into OpenCL coding. I have a framework that I know can at least take an array from the CPU, do an operation in OpenCL, then read back the array (with the right answer). I'm currently trying to improve this by adding a displaced mesh as found in this OpenCL example (slides 18-23; only significant improvement is I changed the VBO to a float3 instead of a float4).
I have set up a shared context as per earlier in those slides and this resource. I tested the VBO with CPU input data (so I know it draws correctly). Also, I create the context before the VBO (as motivated by this thread). Finally, I tried reworking the kernel into the following [edited]:
// Writes the constant position (1, 1, 1) into the mesh element addressed by
// this work-item's 2D global id; `width` is the row length used for indexing.
// `height` and `time` are accepted but not used.
// NOTE(review): a float3 element in OpenCL occupies the storage of a float4 -
// confirm the buffer layout on the host side matches.
__kernel void sine_wave(__global float3* pos, int width, int height, float time) {
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);
    const float3 ones = (float3)(1.0f,1.0f,1.0f);
    pos[row*width+col] = ones;
}
Yet, no matter what I do, I cannot get the OpenCL program to update anything. There are no errors, nothing, yet the VBO remains the same as the input data. If I do not specify input data, the points all render at (0,0,0). I can't figure out what could cause this.
Ideas? Thanks,
Ian
PS #1: current system is NVIDIA GTX 580M, on Windows 7 x64, though the code written is intended to be portable.
PS #2: I can provide code if no one has any clues . . .
Well, I figured it out. After further hours of searching, I downloaded NVIDIA's GPU computing toolkit, which appears to be where the linked demo derives from. I then reduced their code down immensely to the following ~220 line source (may it help ye future coders):
#pragma comment(lib,"Opengl32.lib")
#pragma comment(lib,"glu32.lib")
#pragma comment(lib,"OpenCL.lib")
#pragma comment(lib,"glew32.lib")
#pragma comment(lib,"glut32.lib")
// OpenGL Graphics Includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#ifdef UNIX
#include <GL/glx.h>
#endif
#endif
#include <CL/opencl.h>
// Rendering window vars: window size in pixels and mesh size in vertices
const unsigned int window_width = 512;
const unsigned int window_height = 512;
const unsigned int mesh_width = 256;
const unsigned int mesh_height = 256;
// OpenCL vars
cl_context cxGPUContext;
cl_device_id* cdDevices;
cl_command_queue cqCommandQueue;
cl_kernel ckKernel;
cl_mem vbo_cl; // OpenCL view of the GL vertex buffer `vbo`
cl_program cpProgram;
// One work-item per mesh vertex, in two dimensions
size_t szGlobalWorkSize[] = {mesh_width, mesh_height};
// vbo variables
GLuint vbo;
// Mouse state: last seen cursor position and bit mask of pressed buttons
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
// View parameters driven by the mouse: rotation angles and camera distance
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;
// GLUT mouse-button callback: tracks which buttons are held and remembers the
// cursor position at the time of the event.
//
// button: GLUT button index; its bit (1 << button) is set on press
// state:  GLUT_DOWN or GLUT_UP; any release clears all button bits
// x, y:   cursor position, stored as the new reference point for motion()
void mouse(int button, int state, int x, int y) {
    switch (state) {
    case GLUT_DOWN:
        mouse_buttons |= 1<<button;
        break;
    case GLUT_UP:
        mouse_buttons = 0;
        break;
    default:
        break;
    }

    mouse_old_x = x;
    mouse_old_y = y;
}
// GLUT mouse-motion callback: converts cursor movement since the last event
// into view changes.
//
// With button bit 1 held, the movement rotates the view (0.2 degrees per
// pixel); otherwise, with button bit 4 held, vertical movement changes
// translate_z (0.01 units per pixel). The cursor position is then stored as
// the new reference point.
void motion(int x, int y) {
    const float deltaX = (float)(x - mouse_old_x);
    const float deltaY = (float)(y - mouse_old_y);

    const bool rotating = (mouse_buttons & 1) != 0;
    const bool zooming = (mouse_buttons & 4) != 0;

    if (rotating) {
        rotate_x += deltaY * 0.2f;
        rotate_y += deltaX * 0.2f;
    } else if (zooming) {
        translate_z += deltaY * 0.01f;
    }

    mouse_old_x = x;
    mouse_old_y = y;
}
// GLUT display callback: regenerates the mesh on the OpenCL device, then draws
// it as red points with the current mouse-controlled view.
void DisplayGL(void) {
    // Animation time passed to the kernel; advances by 0.01 per frame.
    static float animTime = 0.0f;

    // --- OpenCL: run the kernel to generate vertex positions ---
    // GL must be finished with the buffer before OpenCL acquires it, and
    // OpenCL must be finished before GL draws from it.
    glFinish();
    clEnqueueAcquireGLObjects(cqCommandQueue, 1, &vbo_cl, 0,0,0);
    clSetKernelArg(ckKernel, 3, sizeof(float), &animTime);
    clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, NULL, 0,0,0 );
    clEnqueueReleaseGLObjects(cqCommandQueue, 1, &vbo_cl, 0,0,0);
    clFinish(cqCommandQueue);

    // --- OpenGL: set view matrix ---
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glLoadIdentity();
    glTranslatef(0.0, 0.0, translate_z);
    glRotatef(rotate_x, 1.0, 0.0, 0.0);
    glRotatef(rotate_y, 0.0, 1.0, 0.0);

    // --- OpenGL: draw one point per mesh vertex (4 floats each) ---
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glColor3f(1.0, 0.0, 0.0);
    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
    glDisableClientState(GL_VERTEX_ARRAY);

    // flip backbuffer to screen
    glutSwapBuffers();

    animTime += 0.01f;
}
// GLUT timer callback: requests a redraw and re-arms itself to fire again in
// 10 ms. `value` is unused.
void timerEvent(int value) {
glutPostRedisplay();
glutTimerFunc(10, timerEvent,0);
}
int main(int argc, char** argv) {
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - window_width/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - window_height/2);
glutInitWindowSize(window_width, window_height);
glutCreateWindow("OpenCL/GL Interop (VBO)");
glutDisplayFunc(DisplayGL);
glutMouseFunc(mouse);
glutMotionFunc(motion);
glutTimerFunc(10, timerEvent,0);
glewInit();
glClearColor(0.0, 0.0, 0.0, 1.0);
glDisable(GL_DEPTH_TEST);
glViewport(0, 0, window_width, window_height);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(60.0, (GLfloat)window_width / (GLfloat) window_height, 0.1, 10.0);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
//Get the NVIDIA platform
cl_platform_id cpPlatform;
clGetPlatformIDs(1,&cpPlatform,NULL);
// Get the number of GPU devices available to the platform
cl_uint uiDevCount;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount);
// Create the device list
cdDevices = new cl_device_id [uiDevCount];
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
// Define OS-specific context properties and create the OpenCL context
#if defined (__APPLE__)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum);
#else
#ifdef UNIX
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0
};
cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
#else // Win32
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0
};
cxGPUContext = clCreateContext(props, 1, &cdDevices[0], NULL, NULL, NULL);
#endif
#endif
// create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, NULL);
const char* cSourceCL = "__kernel void sine_wave(__global float4* pos, unsigned int width, unsigned int height, float time)\n"
"{\n"
" unsigned int x = get_global_id(0);\n"
" unsigned int y = get_global_id(1);\n"
"\n"
" // calculate uv coordinates\n"
" float u = x / (float) width;\n"
" float v = y / (float) height;\n"
" u = u*2.0f - 1.0f;\n"
" v = v*2.0f - 1.0f;\n"
"\n"
" // calculate simple sine wave pattern\n"
" float freq = 4.0f;\n"
" float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;\n"
"\n"
" // write output vertex\n"
" pos[y*width+x] = (float4)(u, w, v, 1.0f);\n"
"}\n";
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &cSourceCL, NULL, NULL);
clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
// create the kernel
ckKernel = clCreateKernel(cpProgram, "sine_wave", NULL);
// create VBO (if using standard GL or CL-GL interop), otherwise create Cl buffer
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glGenBuffers(1,&vbo);
glBindBuffer(GL_ARRAY_BUFFER,vbo);
// initialize buffer object
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
// create OpenCL buffer from GL VBO
vbo_cl = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, vbo, NULL);
// set the args values
clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &vbo_cl);
clSetKernelArg(ckKernel, 1, sizeof(unsigned int), &mesh_width);
clSetKernelArg(ckKernel, 2, sizeof(unsigned int), &mesh_height);
glutMainLoop();
}
After comparison with my original code, I (eventually) found the key difference.
Right:
clEnqueueNDRangeKernel(context->command_queue, kernel->kernel, 2, NULL, global,NULL, 0,0,0 );
Wrong:
clEnqueueNDRangeKernel(context->command_queue, kernel->kernel, 2, NULL, global,local, 0,0,0 );
It turns out that the grid size I was using, 10x10, was smaller than the examples I had seen elsewhere, which told me to use 16x16 for "local". Because "global" is the grid size, "global" was smaller than "local".
For some reason this didn't cause any errors, though at this point I honestly can't say I understand these variables' purposes completely.
Ian