OpenGL compute shader aborts prematurely after calling glDispatchCompute - C++

I have been trying to run a very simple counting compute shader to get a grasp of how many times my shader runs and how large a compute array I can process.
It seems that I'm either hitting some driver limit, or my shader takes too long for the card to execute and gets prematurely aborted. At least there does not seem to be any error returned from glDispatchCompute.
I have been reading up on compute shaders, and nowhere does it say that a time limit should be an issue.
The hardware is an Intel integrated graphics card, which is rather low end but does have compute shader support. I want to be able to run compute shaders even on lower-end cards, and I think this card should be able to do it, but I'm running into these weird premature aborts.
glxinfo | grep compute
GL_ARB_compressed_texture_pixel_storage, GL_ARB_compute_shader,
GL_ARB_compressed_texture_pixel_storage, GL_ARB_compute_shader,
More info:
const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string
const GLubyte* version = glGetString(GL_VERSION); // version as a string
GLint texture_units = 0;
glGetIntegerv(GL_MAX_TEXTURE_IMAGE_UNITS, &texture_units);
GLint maxAttach = 0;
glGetIntegerv(GL_MAX_COLOR_ATTACHMENTS, &maxAttach);
GLint maxDrawBuf = 0;
glGetIntegerv(GL_MAX_DRAW_BUFFERS, &maxDrawBuf);
GLint workGroupCount[3], workGroupSize[3];
GLint maxInvocations;
glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &maxInvocations);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &workGroupCount[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &workGroupCount[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &workGroupCount[2]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &workGroupSize[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &workGroupSize[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &workGroupSize[2]);
printf("Renderer: %s\n", renderer);
printf("OpenGL version supported: %s\n", version);
printf("Number of texture units: %d\n", texture_units);
printf("Maximum number of color attachments: %d\n", maxAttach);
printf("Maximum number of fragment shader outputs: %d\n", maxDrawBuf);
printf("Maximum work group invocations: %d\n", maxInvocations);
printf("Maximum work group count: %d %d %d\n", workGroupCount[0], workGroupCount[1], workGroupCount[2]);
printf("Maximum work group size: %d %d %d\n", workGroupSize[0], workGroupSize[1], workGroupSize[2]);
Output:
Vendor: Intel Open Source Technology Center (0x8086)
Device: Mesa DRI Intel(R) Haswell Mobile (0x416)
OpenGL vendor string: Intel Open Source Technology Center
OpenGL renderer string: Mesa DRI Intel(R) Haswell Mobile
Renderer: Mesa DRI Intel(R) Haswell Mobile
OpenGL version supported: OpenGL ES 3.1 Mesa 17.0.7
Number of texture units: 32
Maximum number of color attachments: 8
Maximum number of fragment shader outputs: 8
Maximum work group invocations: 2048
Maximum work group count: 65535 65535 65535
Maximum work group size: 2048 2048 2048
Shader:
#version 310 es
layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
layout (binding=0) uniform atomic_uint counter;
void main() {
    atomicCounterIncrement(counter);
}
Setup:
GLuint ac_buffer;
glGenBuffers(1, &ac_buffer);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
glBufferData(GL_ATOMIC_COUNTER_BUFFER, sizeof(GLuint), NULL, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);
GLuint compute_shader = glCreateShader (GL_COMPUTE_SHADER);
std::string ss;
readfile("compute.cs.c", ss);
const char *shader_source = ss.c_str();
glShaderSource (compute_shader, 1, &shader_source, NULL);
glCompileShader (compute_shader);
printShaderInfoLog(compute_shader);
GLuint shader_program = glCreateProgram ();
glAttachShader (shader_program, compute_shader);
glLinkProgram (shader_program);
printProgramInfoLog(shader_program);
glDeleteShader (compute_shader);
glUseProgram (shader_program);
glBindBufferBase(GL_ATOMIC_COUNTER_BUFFER, 0, ac_buffer);
glDispatchCompute(1024, 1024, 1);
if (glGetError() != GL_NO_ERROR) {
    printf("There was a problem dispatching compute\n");
}
glMemoryBarrier(GL_ALL_BARRIER_BITS);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
GLuint *counter = (GLuint*)glMapBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), GL_MAP_READ_BIT);
printf("Counter: %u\n", *counter);
glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);
When I call glDispatchCompute with small dispatch sizes (around 128 or less) I do seem to get reasonable results:
For example, glDispatchCompute(128, 128, 1) results in "Counter: 16777216", which is consistent with 128*128*32*32. But if I call it with 256, 256, 1 I get 66811258 instead, which is no longer consistent with the expected 67108864.
For smaller compute sets I always get the expected results, but for larger ones the counter rarely goes beyond 60-100 million. Could I be hitting some driver limit? I thought that since the maximum work group count is 65535 along each axis, I should be able to request large dispatches and expect all elements to be processed.
Could it be that my way of counting by means of an atomic is flawed? Why does it still give reasonable results for small dispatches but fall short for large ones? How can I better debug this issue?

It is possible you're just reading the result before the computation is complete. You need an explicit call to glFinish() to force completion, and you can remove the call to glMemoryBarrier(). In OpenGL ES, glMemoryBarrier() only deals with relative ordering between stages on the GPU; it doesn't enforce ordering relative to client access.
The desktop OpenGL 4.6 spec supports CLIENT_MAPPED_BUFFER_BARRIER_BIT for synchronizing client-side access, but this isn't available in OpenGL ES.
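For reference, a minimal sketch of that approach, reusing the ac_buffer and shader_program names from the question (the zero-initialisation step is an extra suggestion, since glBufferData with a NULL pointer leaves the counter's initial value undefined):
// Zero-initialise the counter first; glBufferData(..., NULL, ...) leaves its contents undefined.
GLuint zero = 0;
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
glBufferSubData(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), &zero);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);
// Dispatch, then force completion before mapping the buffer on the client side.
glUseProgram(shader_program);
glBindBufferBase(GL_ATOMIC_COUNTER_BUFFER, 0, ac_buffer);
glDispatchCompute(1024, 1024, 1);
glFinish(); // block until the GPU has actually finished the dispatch
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
GLuint *counter = (GLuint*)glMapBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), GL_MAP_READ_BIT);
if (counter) {
    printf("Counter: %u\n", *counter);
    glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
}
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);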

Related

Why is OpenGL simple loop faster than Vulkan one?

I have 2 graphics applications for OpenGL and Vulkan.
OpenGL loop looks something like this:
glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
static int test = 0;
// "if" statement here is to ensure that there is no any caching or optimizations
// made by OpenGL driver (if such things exist),
// and commands are re-recorded to the buffer every frame
if ((test = 1 - test) == 0) {
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer1);
glUseProgram(program1);
glDrawArrays(GL_TRIANGLES, 0, vertices_size);
glUseProgram(0);
}
else {
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer2);
glUseProgram(program2);
glDrawArrays(GL_LINES, 0, vertices_size);
glUseProgram(0);
}
glfwSwapBuffers(window);
And Vulkan:
static uint32_t image_index = 0;
vkAcquireNextImageKHR(device, swapchain, 0xFFFFFFFF, image_available_semaphores[image_index], VK_NULL_HANDLE, &image_indices[image_index]);
vkWaitForFences(device, 1, &submission_completed_fences[image_index], VK_TRUE, 0xFFFFFFFF);
// VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
vkBeginCommandBuffer(cmd_buffers[image_index], &command_buffer_bi);
vkCmdBeginRenderPass(cmd_buffers[image_index], &render_pass_bi[image_index], VK_SUBPASS_CONTENTS_INLINE);
vkCmdEndRenderPass(cmd_buffers[image_index]);
vkEndCommandBuffer(cmd_buffers[image_index]);
vkResetFences(device, 1, &submission_completed_fences[image_index]);
vkQueueSubmit(graphics_queue, 1, &submit_info[image_index], submission_completed_fences[image_index]);
present_info[image_index].pImageIndices = &image_indices[image_index];
vkQueuePresentKHR(present_queue, &present_info[image_index]);
const static int max_swapchain_image_index = swapchain_image_count - 1;
if (++image_index > max_swapchain_image_index) {
    image_index = 0;
}
In the Vulkan loop there are not even any rendering commands, just an empty render pass. Validation layers are disabled.
OpenGL runs at about 10500 FPS, and Vulkan at about 7500 FPS (with 8 swapchain images in use with VK_PRESENT_MODE_IMMEDIATE_KHR; fewer images make the FPS lower).
The code is running on a laptop with Ubuntu 18.04, a discrete Nvidia RTX 2060 GPU, Nvidia driver 450.66, Vulkan API version 1.2.133.
I know that the OpenGL driver is highly optimized, but I can't imagine what else there is to optimize in the Vulkan loop to make it faster than it is.
Are there some low-level Linux driver issues? Or is the Vulkan performance increase only achieved in much more complex applications (e.g. using multithreading)?

3D texture size affecting program output without error being thrown

First, I am using glDebugMessage() instead of glGetError() to determine errors.
Second, I am allocating 3D texture storage as follows:
glTexImage3D(GL_TEXTURE_3D, 0, GL_RGBA32F, 512, 512, 303, 0, GL_RGBA, GL_FLOAT, NULL);
When the depth component is 303 or less, my program works exactly as expected (I allocate a color in the texture and I see that color as output); when that parameter is 304 or higher, the program doesn't work (the screen is black).
I have tested the same program on different machines, and depending on the computer the threshold changes, sometimes higher, sometimes lower.
My hypothesis is then that some drivers cannot allocate enough memory to handle this texture. But no error is being thrown, or at least the debug message callback is not getting called.
Is there a way I can verify that I am indeed requesting more memory than can be allocated, and extend said memory somehow?
Yep, the 303-deep texture is ~1.2 GByte if I compute it correctly (512 × 512 × 303 texels × 16 bytes per RGBA32F texel ≈ 1.27 GB). Your gfx card however also needs memory for the framebuffer and other stuff too. Sadly there is no common GL API to query available memory, but there are extensions for this. This is how I query basic info in my OpenGL engine (VCL/C++):
AnsiString OpenGLscreen::glinfo()
{
    AnsiString txt="";
    txt+=glGetAnsiString(GL_VENDOR)+"\r\n";
    txt+=glGetAnsiString(GL_RENDERER)+"\r\n";
    txt+="OpenGL ver: "+glGetAnsiString(GL_VERSION)+"\r\n";
    if (ext_is_supported("GL_NVX_gpu_memory_info"))
    {
        GLint x,y,z;
        x=0; glGetIntegerv(GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX,&x);
        y=0; glGetIntegerv(GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX,&y);
        z=0; glGetIntegerv(GL_GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX,&z);
        txt+=AnsiString().sprintf("GPU memory: %i/%i/%i MByte\r\n",x>>10,y>>10,z>>10); // GPU free/GPU total/GPU+CPU shared total
        x=0; glGetIntegerv(GL_GPU_MEMORY_INFO_EVICTION_COUNT_NVX,&x);
        y=0; glGetIntegerv(GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX,&y);
        txt+=AnsiString().sprintf("GPU blocks: %i used: %i MByte\r\n",x,y>>10);
    }
    if (ext_is_supported("GL_ATI_meminfo"))
    {
        // each query returns a 4-tuple: free, largest available block,
        // free auxiliary, largest available auxiliary (in KByte)
        GLint vbo[4]={0,0,0,0},txr[4]={0,0,0,0},buf[4]={0,0,0,0};
        glGetIntegerv(GL_VBO_FREE_MEMORY_ATI,vbo);
        txt+=AnsiString().sprintf("VBO memory: %i MByte\r\n",vbo[0]>>10);
        glGetIntegerv(GL_TEXTURE_FREE_MEMORY_ATI,txr);
        txt+=AnsiString().sprintf("TXR memory: %i MByte\r\n",txr[0]>>10);
        glGetIntegerv(GL_RENDERBUFFER_FREE_MEMORY_ATI,buf);
        txt+=AnsiString().sprintf("BUF memory: %i MByte\r\n",buf[0]>>10);
    }
    return txt;
}
So extract the nVidia and ATI/AMD memory queries and port them to your environment.
Btw, I just realized that in order for the above code to work you also need this:
AnsiString glGetAnsiString(GLuint id)
{
    GLubyte a,*p=(GLubyte*)(void*)glGetString(id);
    AnsiString s;
    for (s="";p;p++)
    {
        a=p[0];
        if (!a) break;
        s+=char(a);
    }
    return s;
}
This just converts the result of glGetString from char* to VCL's AnsiString, which I use heavily. In an environment other than VCL, just stick to char* or convert to whatever string type you've got...
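For a non-VCL codebase, a rough plain C++ sketch of the same NVX query might look like this (the enum values are taken from the GL_NVX_gpu_memory_info extension spec in case your headers lack them; the GL headers/loader are assumed to be set up already, and the extension-presence check is left to whatever mechanism your loader provides):
#include <cstdio>
// Tokens from the GL_NVX_gpu_memory_info extension, in case the GL headers do not define them.
#ifndef GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX
#define GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX         0x9047
#define GL_GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX   0x9048
#define GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX 0x9049
#endif
// Call only when GL_NVX_gpu_memory_info is advertised in the extension string.
void print_gpu_memory_info()
{
    GLint free_kb = 0, dedicated_kb = 0, total_kb = 0; // values are reported in KByte
    glGetIntegerv(GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX, &free_kb);
    glGetIntegerv(GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX, &dedicated_kb);
    glGetIntegerv(GL_GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX, &total_kb);
    printf("GPU memory: %d/%d/%d MByte\n", free_kb >> 10, dedicated_kb >> 10, total_kb >> 10);
}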

Can't generate mipmaps with off-screen OpenGL context on Linux

This question is a continuation of the problem I described here. This is one of the weirdest bugs I have ever seen. I have my engine running in 2 modes: display mode and offscreen. The OS is Linux. I generate mipmaps for the textures, and in display mode it all works fine. In that mode I use GLFW3 for context creation. Now, the funny part: in the offscreen mode, whose context I create manually with the code below, the mipmap generation fails OCCASIONALLY! That is, on some runs the resulting output looks OK, and on others the missing levels are clearly visible, as the frame is full of texture junk data or entirely empty.
At first I thought I had my mipmap generation routine wrong, which goes like this:
glGenTextures(1, &textureName);
glBindTexture(GL_TEXTURE_2D, textureName);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, imageInfo.Width, imageInfo.Height, 0, imageInfo.Format, imageInfo.Type, imageInfo.Data);
glTexParameteri ( GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0 );
glGenerateMipmap(GL_TEXTURE_2D);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
I also tried to play with this param:
glTexParameteri ( GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, XXX);
including the max level detection formula:
int numMipmaps = 1 + floor(log2(glm::max(imageInfoOut.width, imageInfoOut.height)));
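(For example, for a 512 × 512 base image this gives 1 + floor(log2(512)) = 1 + 9 = 10 levels: the 512 base level plus the 256 down to 1 reductions.)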
But all this stuff didn't work consistently. Out of 10-15 runs, 3-4 come out with broken mipmaps. What I then found was that switching to GL_LINEAR solved it. Also, in mipmap mode, setting just 1 level worked as well. Finally I started thinking there could be a problem at the context level, because in screen mode it works! I switched context creation to GLFW3 and it works. So I wonder what's going on here? Am I missing something in the Pbuffer setup that breaks mipmap generation? I doubt it, because AFAIK it is done by the driver.
Here is my custom off-screen context creation setup:
int visual_attribs[] = {
    GLX_RENDER_TYPE, GLX_RGBA_BIT,
    GLX_RED_SIZE, 8,
    GLX_GREEN_SIZE, 8,
    GLX_BLUE_SIZE, 8,
    GLX_ALPHA_SIZE, 8,
    GLX_DEPTH_SIZE, 24,
    GLX_STENCIL_SIZE, 8,
    None
};
int context_attribs[] = {
    GLX_CONTEXT_MAJOR_VERSION_ARB, vmaj,
    GLX_CONTEXT_MINOR_VERSION_ARB, vmin,
    GLX_CONTEXT_FLAGS_ARB, GLX_CONTEXT_ROBUST_ACCESS_BIT_ARB
#ifdef DEBUG
                           | GLX_CONTEXT_DEBUG_BIT_ARB
#endif
    ,
    GLX_CONTEXT_PROFILE_MASK_ARB, GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB,
    None
};
_xdisplay = XOpenDisplay(NULL);
int fbcount = 0;
_fbconfig = NULL;
// _render_context
if (!_xdisplay) {
    throw();
}
/* get framebuffer configs, any is usable (might want to add proper attribs) */
if (!(_fbconfig = glXChooseFBConfig(_xdisplay, DefaultScreen(_xdisplay), visual_attribs, &fbcount))) {
    throw();
}
/* get the required extensions */
glXCreateContextAttribsARB = (glXCreateContextAttribsARBProc) glXGetProcAddressARB((const GLubyte *) "glXCreateContextAttribsARB");
glXMakeContextCurrentARB = (glXMakeContextCurrentARBProc) glXGetProcAddressARB((const GLubyte *) "glXMakeContextCurrent");
if (!(glXCreateContextAttribsARB && glXMakeContextCurrentARB)) {
    XFree(_fbconfig);
    throw();
}
/* create a context using glXCreateContextAttribsARB */
if (!(_render_context = glXCreateContextAttribsARB(_xdisplay, _fbconfig[0], 0, True, context_attribs))) {
    XFree(_fbconfig);
    throw();
}
// GLX_MIPMAP_TEXTURE_EXT
/* create temporary pbuffer */
int pbuffer_attribs[] = {
    GLX_PBUFFER_WIDTH, 128,
    GLX_PBUFFER_HEIGHT, 128,
    None
};
_pbuff = glXCreatePbuffer(_xdisplay, _fbconfig[0], pbuffer_attribs);
XFree(_fbconfig);
XSync(_xdisplay, False);
/* try to make it the current context */
if (!glXMakeContextCurrent(_xdisplay, _pbuff, _pbuff, _render_context)) {
    /* some drivers do not support a context without a default framebuffer,
     * so fall back on using the default window.
     */
    if (!glXMakeContextCurrent(_xdisplay, DefaultRootWindow(_xdisplay),
                               DefaultRootWindow(_xdisplay), _render_context)) {
        throw();
    }
}
Almost forgot: my system and hardware:
Kubuntu 13.04 64bit. GPU: NVidia Geforce GTX 680 . The engine uses OpenGL 4.2 API
Full OpenGL info:
OpenGL vendor string: NVIDIA Corporation
OpenGL renderer string: GeForce GTX 680/PCIe/SSE2
OpenGL version string: 4.4.0 NVIDIA 331.49
OpenGL shading language version string: 4.40 NVIDIA via Cg compiler
Btw, I also tried older drivers and it doesn't matter.
UPDATE:
It seems my assumption regarding GLFW was wrong. When I compile the engine and run it from the terminal the same thing happens. BUT - if I run the engine from the IDE (debug or release) there are no issues with the mipmaps. Is it possible the standalone app works against different .so libraries?
To make it clear, I don't use Pbuffers to render into. I render into custom framebuffers.
UPDATE 1:
I have read that non-power-of-2 textures can be tricky for automatic mipmap generation, and that if OpenGL fails to generate all the levels it turns off texture usage. Is it possible that's what I am experiencing here? Because once the mipmapped texture goes wrong, the rest of the textures (non-mipmapped) disappear too. But if this is the case, then why is this behavior inconsistent?
Uh, why are you using PBuffers in the first place? PBuffers have just too many caveats for there to be any valid reason to use them in a new project.
You want offscreen rendering? Then use Framebuffer Objects (FBOs).
You need a purely off-screen context? Then create a normal window which you simply don't show and create an FBO on it.
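For example, a minimal sketch of that second approach with GLFW3 (which the question already uses for display mode); the window and texture sizes here are arbitrary placeholders:
// Create a hidden window just to get a normal context.
glfwInit();
glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE);   // the window is never shown
GLFWwindow *win = glfwCreateWindow(64, 64, "offscreen", NULL, NULL);
glfwMakeContextCurrent(win);
// Render into an FBO instead of a Pbuffer.
GLuint fbo = 0, color = 0;
glGenTextures(1, &color);
glBindTexture(GL_TEXTURE_2D, color);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, 1024, 1024, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glGenFramebuffers(1, &fbo);
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, color, 0);
if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
    // handle the error; add a depth/stencil renderbuffer here if the renderer needs one
}
// ...render, glGenerateMipmap(), glReadPixels(), etc. as usual...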

glDrawBuffers usage for multiple render targets, under OS X

I'm having some very strange behaviour from my multiple-render-target code, and have started to wonder whether I'm catastrophically misunderstanding the way that this is supposed to work.
I'm running in a version 2.1 context. Here's the core bit of render setup code I'm executing:
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_fbo);
GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
glDrawBuffers(2,buffers);
My shader then writes out color data to gl_FragColor[0] and gl_FragColor[1].
This is essentially the same situation as was discussed in this question. However, when I run this on OS X, my shader only outputs to the first render target. OpenGL throws no errors, either during the construction of the FBO with two color attachments, or at any point during the rendering process.
When I examine what's going on via the OSX 'OpenGL Profiler' "trace" view, it shows the driver's side of this code execution as being:
2.86 µs glBindFramebufferEXT(GL_FRAMEBUFFER, 1);
3.48 µs glDrawBuffersARB(2, {GL_COLOR_ATTACHMENT0, GL_ZERO});
Which perhaps explains why nothing was being written to GL_COLOR_ATTACHMENT1; it seems to be being replaced by GL_ZERO in the call to glDrawBuffers!
If I switch the order of the buffers in the buffers[] array to be GL_COLOR_ATTACHMENT1_EXT first and then GL_COLOR_ATTACHMENT0_EXT, then my shader only writes into GL_COLOR_ATTACHMENT1_EXT, and GL_COLOR_ATTACHMENT0_EXT appears to be replaced with GL_ZERO.
Here's where it gets weird. If I use the code:
GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
glDrawBuffers(3, buffers);
Then the statistics view shows this:
0.46 µs glDrawBuffersARB(3, {GL_COLOR_ATTACHMENT0, GL_ZERO, GL_COLOR_ATTACHMENT1});
OpenGL still throws no errors, and my shader successfully writes out data to both color attachments, even though it's writing to gl_FragColor[0] and gl_FragColor[1].
So even though my program is now working, it seems to me like this shouldn't work. And I was curious to see how far I could push this, hoping that pushing OpenGL to an eventual failure would be educational. So I tried compiling with this code:
GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
glDrawBuffers(4, buffers);
When running that, the OpenGL Profiler "trace" view shows this as being executed:
4.26 µs glDrawBuffersARB(4, {GL_COLOR_ATTACHMENT0, GL_ZERO, GL_COLOR_ATTACHMENT1, GL_ZERO});
And now OpenGL is throwing "invalid framebuffer operations" all over the place, but my shader is still successfully writing color data to both color attachment points.
Does all this make sense to anyone? Have I catastrophically misunderstood the way that glDrawBuffers is supposed to be called?
According to the OpenGL Profiler's "Resources" view, my framebuffer (number 1) looks fine; it does have two color attachments attached, as expected.
Attached Objects:
{
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT0
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_TEXTURE
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT: 1
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT: 0
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT1
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_TEXTURE
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT: 2
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT: 0
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT2
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT3
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT4
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT5
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT6
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_COLOR_ATTACHMENT7
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_DEPTH_ATTACHMENT
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_TEXTURE
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT: 3
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT: 0
GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT: 0
}
{
GL_FRAMEBUFFER_ATTACHMENT: GL_STENCIL_ATTACHMENT
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT: GL_NONE
}
}
...and after banging my head against this for almost a week, I figured it out just five minutes after finally posting the question on StackOverflow. Posting my solution since it seems to be a header problem that's likely to affect other OSX folks.
For whatever reason, on my 64-bit OSX build, GLenum is defined as an 8-byte integer type, while the OpenGL drivers actually want 32-bit values in the array being passed to glDrawBuffers. If I rewrite the code as:
uint32_t buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
glDrawBuffers(2,(GLenum*)buffers);
Then everything works as expected. (The placement of the GL_ZERO entries was the hint that eventually led me to this answer)
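If you want to catch this class of problem at build time, a small sanity check along these lines might help (plain C++11 static_assert; it assumes the GL headers are already included so GLenum is visible):
#include <cstdint>
// Fail the build if GLenum is not the 32-bit type the driver expects
// for the array passed to glDrawBuffers.
static_assert(sizeof(GLenum) == sizeof(uint32_t),
              "GLenum is not 32 bits; pass the attachment list as uint32_t[] and cast");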

CUDA OPENGL Interoperability: cudaGLSetGLDevice

Following the CUDA 4.0 Programming Guide, I call cudaGLSetGLDevice
before any other runtime calls. But the next CUDA call, cudaMalloc, returns "all CUDA-capable devices are busy or unavailable."
Also, on the NVIDIA forum (http://forums.nvidia.com/index.php?showtopic=186399) a user said that:
"In multi-GPU systems though you're going to encounter even larger flaws in CUDA...
a) You can't do CUDA/GL interop when the CUDA context and the OpenGL context are on different devices (undocumented, and unsupported in my experience)
b) You can't do GL device affinity on non-windows machines.
c) You can't do GL device affinity on consumer devices (Quadro/Tesla only)"
Is this true? My final application must run on a Linux multi-GPU system. Do I have to change the graphics library I use? And in that case, what are your suggestions?
OS: Opensuse 11.4 64 bit
Graphic Card: GeForce 9600M GT
DRIVER: 275.21
See Cuda and OpenGL Interop
I had to replace a simple cudaMalloc() with a whole pile of gl* calls.
Nevertheless, it works pretty well.
// The lattice as a GL Buffer
GLuint gridVBO = 0;
struct cudaGraphicsResource *gridVBO_CUDA = NULL;
// Ask for GL memory buffers
glGenBuffers(1, &gridVBO);
glBindBuffer(GL_ARRAY_BUFFER, gridVBO);
const size_t size = L * L * sizeof(unsigned char);
glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&gridVBO_CUDA, gridVBO, cudaGraphicsMapFlagsWriteDiscard));
// Map the GL buffer to a device pointer
unsigned char *grid = NULL;
cutilSafeCall(cudaGraphicsMapResources(1, &gridVBO_CUDA, 0));
size_t num_bytes = 0;
cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **) &grid,
&num_bytes, gridVBO_CUDA));
// Execution configuration
dim3 dimBlock(TILE_X, TILE_Y);
dim3 dimGrid(L/TILE_X, L/TILE_Y);
// Kernel call
kernel<<<dimGrid, dimBlock>>>(grid);
cutilCheckMsg("Kernel launch failed");
// Unmap buffer object
cutilSafeCall(cudaGraphicsUnmapResources(1, &gridVBO_CUDA, 0));
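The kernel itself isn't shown in the snippet; given the L × L unsigned char lattice and the TILE_X × TILE_Y block size above, a hypothetical kernel matching that launch could look like the following sketch (it assumes L is a compile-time constant visible to device code, and the actual cell update is a placeholder):
// Hypothetical kernel matching the launch configuration above:
// one thread per cell of the L x L lattice mapped from the GL buffer.
__global__ void kernel(unsigned char *grid)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < L && y < L) {
        grid[y * L + x] = 0; // placeholder: write the new cell state here
    }
}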