CUDA OpenGL Interoperability: cudaGLSetGLDevice

Following the CUDA 4.0 Programming Guide, I call cudaGLSetGLDevice before any other runtime calls. But the next CUDA call, cudaMalloc, returns "all CUDA-capable devices are busy or unavailable."
Also, in the NVIDIA forum (http://forums.nvidia.com/index.php?showtopic=186399) a user said that:
"In multi-GPU systems though you're going to encounter even larger flaws in CUDA...
a) You can't do CUDA/GL interop when the CUDA context and the OpenGL context are on different devices (undocumented, and unsupported in my experience)
b) You can't do GL device affinity on non-windows machines.
c) You can't do GL device affinity on consumer devices (Quadro/Tesla only)"
Is this true? My final program must run on a Linux multi-GPU system. Do I have to change the graphics library I use? And in that case, what are your suggestions?
OS: openSUSE 11.4, 64-bit
Graphics card: GeForce 9600M GT
Driver: 275.21

See Cuda and OpenGL Interop
I had to replace a simple cudaMalloc() with a pile of gl* calls.
Nevertheless, it works pretty well.
// The lattice as a GL Buffer
GLuint gridVBO = 0;
struct cudaGraphicsResource *gridVBO_CUDA = NULL;
// Ask for GL memory buffers
glGenBuffers(1, &gridVBO);
glBindBuffer(GL_ARRAY_BUFFER, gridVBO);
const size_t size = L * L * sizeof(unsigned char);
glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&gridVBO_CUDA, gridVBO, cudaGraphicsMapFlagsWriteDiscard));
// Map the GL buffer to a device pointer
unsigned char *grid = NULL;
cutilSafeCall(cudaGraphicsMapResources(1, &gridVBO_CUDA, 0));
size_t num_bytes = 0;
cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **) &grid,
&num_bytes, gridVBO_CUDA));
// Execution configuration
dim3 dimBlock(TILE_X, TILE_Y);
dim3 dimGrid(L/TILE_X, L/TILE_Y);
// Kernel call
kernel<<<dimGrid, dimBlock>>>(grid);
cutilCheckMsg("Kernel launch failed");
// Unmap buffer object
cutilSafeCall(cudaGraphicsUnmapResources(1, &gridVBO_CUDA, 0));
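For completeness, here is a minimal sketch of the initialization order the CUDA 4.0 documentation describes (my own illustration, not part of the original answer): create the OpenGL context first, then bind the runtime to the device that owns it with cudaGLSetGLDevice, and only then make any other runtime call. It assumes a GLUT/GLEW setup and that the GL context lives on device 0.
// 1. Create the OpenGL context before any CUDA runtime call.
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutCreateWindow("cuda-gl interop");
glewInit(); // load GL extensions
// 2. Tell the CUDA runtime which device owns the GL context.
cutilSafeCall(cudaGLSetGLDevice(0)); // assumption: the GL context is on device 0
// 3. Only now create/register the GL buffers (as above) and call cudaMalloc etc.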

Related

Why is a simple OpenGL loop faster than a Vulkan one?

I have 2 graphics applications, one for OpenGL and one for Vulkan.
The OpenGL loop looks something like this:
glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
static int test = 0;
// "if" statement here is to ensure that there is no any caching or optimizations
// made by OpenGL driver (if such things exist),
// and commands are re-recorded to the buffer every frame
if ((test = 1 - test) == 0) {
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer1);
glUseProgram(program1);
glDrawArrays(GL_TRIANGLES, 0, vertices_size);
glUseProgram(0);
}
else {
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer2);
glUseProgram(program2);
glDrawArrays(GL_LINES, 0, vertices_size);
glUseProgram(0);
}
glfwSwapBuffers(window);
And Vulkan:
static uint32_t image_index = 0;
vkAcquireNextImageKHR(device, swapchain, 0xFFFFFFFF, image_available_semaphores[image_index], VK_NULL_HANDLE, &image_indices[image_index]);
vkWaitForFences(device, 1, &submission_completed_fences[image_index], VK_TRUE, 0xFFFFFFFF);
// VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
vkBeginCommandBuffer(cmd_buffers[image_index], &command_buffer_bi);
vkCmdBeginRenderPass(cmd_buffers[image_index], &render_pass_bi[image_index], VK_SUBPASS_CONTENTS_INLINE);
vkCmdEndRenderPass(cmd_buffers[image_index]);
vkEndCommandBuffer(cmd_buffers[image_index]);
vkResetFences(device, 1, &submission_completed_fences[image_index]);
vkQueueSubmit(graphics_queue, 1, &submit_info[image_index], submission_completed_fences[image_index]);
present_info[image_index].pImageIndices = &image_indices[image_index];
vkQueuePresentKHR(present_queue, &present_info[image_index]);
const static int max_swapchain_image_index = swapchain_image_count - 1;
if (++image_index > max_swapchain_image_index) {
image_index = 0;
}
In the Vulkan loop there are not even any rendering commands, just an empty render pass. Validation layers are disabled.
OpenGL FPS is about 10500, and Vulkan FPS is about 7500 (with 8 swapchain images in use with VK_PRESENT_MODE_IMMEDIATE_KHR; fewer images make the FPS lower).
The code is running on a laptop with Ubuntu 18.04, a discrete Nvidia RTX 2060 GPU, Nvidia driver 450.66, Vulkan API version 1.2.133.
I know that the OpenGL driver is highly optimized, but I can't imagine what else there is to optimize in the Vulkan loop to make it faster than it is.
Are there some low-level Linux driver issues? Or does Vulkan's performance advantage only show up in much more complex applications (e.g. ones using multithreading)?
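For what it's worth, one commonly suggested Vulkan optimization, sketched here using the question's own variable names as an assumption (this is not from the original post): since these command buffers never change, record them once per swapchain image at initialization instead of re-recording them every frame, and drop VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT so they can be submitted repeatedly.
// Hypothetical init-time recording; cmd_buffers, render_pass_bi and
// swapchain_image_count are assumed to be set up as in the question.
VkCommandBufferBeginInfo begin_info = {};
begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_info.flags = 0; // reusable, not ONE_TIME_SUBMIT
for (uint32_t i = 0; i < swapchain_image_count; ++i) {
    vkBeginCommandBuffer(cmd_buffers[i], &begin_info);
    vkCmdBeginRenderPass(cmd_buffers[i], &render_pass_bi[i], VK_SUBPASS_CONTENTS_INLINE);
    vkCmdEndRenderPass(cmd_buffers[i]);
    vkEndCommandBuffer(cmd_buffers[i]);
}
// The per-frame loop then only acquires, waits on the fence, submits and presents.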

glGetBufferSubData and glMapBufferRange for GL_SHADER_STORAGE_BUFFER very slow on NVIDIA GTX960M

I've been having some issues transferring a GPU buffer to the CPU to perform sorting operations. The buffer is a GL_SHADER_STORAGE_BUFFER composed of 300,000 float values. The transfer with glGetBufferSubData takes around 10 ms, and with glMapBufferRange it takes more than 100 ms.
The code I'm using is the following:
std::vector<GLfloat> viewRow;
unsigned int viewRowBuffer = -1;
int length = -1;
void bindRowBuffer(unsigned int buffer){
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, buffer);
}
void initRowBuffer(unsigned int &buffer, std::vector<GLfloat> &row, int lengthIn){
// Generate and initialize buffer
length = lengthIn;
row.resize(length);
memset(&row[0], 0, length*sizeof(float));
glGenBuffers(1, &buffer);
bindRowBuffer(buffer);
glBufferStorage(GL_SHADER_STORAGE_BUFFER, row.size() * sizeof(float), &row[0], GL_DYNAMIC_STORAGE_BIT | GL_MAP_READ_BIT | GL_MAP_WRITE_BIT);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
void cleanRowBuffer(unsigned int buffer) {
float zero = 0.0;
glClearNamedBufferData(buffer, GL_R32F, GL_RED, GL_FLOAT, &zero);
}
void readGPUbuffer(unsigned int buffer, std::vector<GLfloat> &row) {
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER,0,length *sizeof(float),&row[0]);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
void readGPUMapBuffer(unsigned int buffer, std::vector<GLfloat> &row) {
float* data = (float*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, length*sizeof(float), GL_MAP_READ_BIT);
memcpy(&row[0], data, length*sizeof(float));
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
The main is doing:
bindRowBuffer(viewRowBuffer);
cleanRowBuffer(viewRowBuffer);
countPixs.bind();
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, gPatch);
countPixs.setInt("gPatch", 0);
countPixs.run(SCR_WIDTH/8, SCR_HEIGHT/8, 1);
countPixs.unbind();
readGPUbuffer(viewRowBuffer, viewRow);
Where countPixs is a compute shader, but I'm positive the problem is not there, because if I comment out the run command, the read takes exactly the same amount of time.
The weird thing is that if I do a glGetBufferSubData of only 1 float:
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, 1 * sizeof(float), &row[0]);
it takes exactly the same time... so I'm guessing there is something wrong all the way... maybe related to the GL_SHADER_STORAGE_BUFFER?
This is likely to be a GPU-CPU synchronization/round-trip delay.
I.e. once you map your buffer, the previous GL command(s) that touched the buffer need to complete first, causing a pipeline stall.
Note that drivers are lazy: it is very probable the GL commands have not even started executing yet.
If you can, use glBufferStorage(..., GL_MAP_PERSISTENT_BIT) and map the buffer persistently. This completely avoids re-mapping and re-allocating any GPU memory, and you can keep the mapped pointer across draw calls, with some caveats:
You likely also need GPU fences to detect/wait until the data is actually available from the GPU. (Unless you like reading garbage.)
The mapped buffer can't be resized. (Since you already use glBufferStorage(), you are fine.)
It is probably a good idea to combine GL_MAP_PERSISTENT_BIT with GL_MAP_COHERENT_BIT.
After reading the GL 4.5 docs a bit more, I found out that glFenceSync is mandatory in order to guarantee the data has arrived from the GPU, even with GL_MAP_COHERENT_BIT:
If GL_MAP_COHERENT_BIT is set and the server does a write, the app must call glFenceSync with GL_SYNC_GPU_COMMANDS_COMPLETE (or glFinish). Then the CPU will see the writes after the sync is complete.
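To make that concrete, here is a minimal sketch of the persistent-mapping approach described above (my own illustration, not the asker's code; buffer and length are assumed to match the question's setup):
// Immutable storage that may stay mapped while the GPU writes to it.
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer);
glBufferStorage(GL_SHADER_STORAGE_BUFFER, length * sizeof(float), nullptr,
                GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
float* mapped = (float*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, length * sizeof(float),
                GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
// ... dispatch the compute shader that writes the buffer ...
// Fence after the GPU work, wait for it, and only then read the mapped pointer.
GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glClientWaitSync(fence, GL_SYNC_FLUSH_COMMANDS_BIT, GLuint64(1000000000)); // wait up to 1 s
glDeleteSync(fence);
memcpy(&row[0], mapped, length * sizeof(float)); // data is now valid
This keeps one mapping alive for the lifetime of the buffer, so the per-read map/unmap overhead disappears; the remaining cost is the actual synchronization with the GPU.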

OpenGL compute shader premature abort after calling glComputeDispatch

I have been trying to run a very simple counting compute shader to get a grasp of how many times my shader runs and how large a compute array I can process.
It seems that I'm either hitting some driver limit, or my shader takes too long for the card to execute and is prematurely aborted, or something. There does not seem to be any error returned from glDispatchCompute, at least.
I have been reading up on everything about compute shaders, and nowhere does it seem to say that a time limit would be an issue.
The hardware is an Intel integrated graphics card, which is rather low end but does have compute shader support. I want to be able to run compute shaders even on lower-end cards, and I think this card should be able to do it, but I'm running into these weird premature-abort problems.
glxinfo | grep compute
GL_ARB_compressed_texture_pixel_storage, GL_ARB_compute_shader,
GL_ARB_compressed_texture_pixel_storage, GL_ARB_compute_shader,
More info:
const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string
const GLubyte* version = glGetString(GL_VERSION); // version as a string
GLint texture_units = 0;
glGetIntegerv(GL_MAX_TEXTURE_IMAGE_UNITS, &texture_units);
GLint maxAttach = 0;
glGetIntegerv(GL_MAX_COLOR_ATTACHMENTS, &maxAttach);
GLint maxDrawBuf = 0;
glGetIntegerv(GL_MAX_DRAW_BUFFERS, &maxDrawBuf);
GLint workGroupCount[3], workGroupSize[3];
GLint maxInvocations;
glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &maxInvocations);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &workGroupCount[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &workGroupCount[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &workGroupCount[2]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &workGroupSize[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &workGroupSize[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &workGroupSize[2]);
printf("Renderer: %s\n", renderer);
printf("OpenGL version supported: %s\n", version);
printf("Number of texture units: %d\n", texture_units);
printf("Maximum number of color attachments: %d\n", maxAttach);
printf("Maximum number of fragment shader outputs: %d\n", maxDrawBuf);
printf("Maximum work group invocations: %d\n", maxInvocations);
printf("Maximum work group count: %d %d %d\n", workGroupCount[0], workGroupCount[1], workGroupCount[2]);
printf("Maximum work group size: %d %d %d\n", workGroupSize[0], workGroupSize[1], workGroupSize[2]);
Output:
Vendor: Intel Open Source Technology Center (0x8086)
Device: Mesa DRI Intel(R) Haswell Mobile (0x416)
OpenGL vendor string: Intel Open Source Technology Center
OpenGL renderer string: Mesa DRI Intel(R) Haswell Mobile
Renderer: Mesa DRI Intel(R) Haswell Mobile
OpenGL version supported: OpenGL ES 3.1 Mesa 17.0.7
Number of texture units: 32
Maximum number of color attachments: 8
Maximum number of fragment shader outputs: 8
Maximum work group invocations: 2048
Maximum work group count: 65535 65535 65535
Maximum work group size: 2048 2048 2048
Shader:
#version 310 es
layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
layout (binding=0) uniform atomic_uint counter;
void main(){
atomicCounterIncrement(counter);
}
Setup:
GLuint ac_buffer;
glGenBuffers(1, &ac_buffer);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
glBufferData(GL_ATOMIC_COUNTER_BUFFER, sizeof(GLuint), NULL, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);
GLuint compute_shader = glCreateShader (GL_COMPUTE_SHADER);
std::string ss;
readfile("compute.cs.c", ss);
const char *shader_source = ss.c_str();
glShaderSource (compute_shader, 1, &shader_source, NULL);
glCompileShader (compute_shader);
printShaderInfoLog(compute_shader);
GLuint shader_program = glCreateProgram ();
glAttachShader (shader_program, compute_shader);
glLinkProgram (shader_program);
printProgramInfoLog(shader_program);
glDeleteShader (compute_shader);
glUseProgram (shader_program);
glBindBufferBase(GL_ATOMIC_COUNTER_BUFFER, 0, ac_buffer);
glDispatchCompute(1024, 1024, 1);
if(glGetError() != GL_NO_ERROR) {
printf("There was a problem dispatching compute\n");
}
glMemoryBarrier(GL_ALL_BARRIER_BITS);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
GLuint *counter = (GLuint*)glMapBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), GL_MAP_READ_BIT);
printf("Counter: %u\n", *counter);
glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);
When I call glDispatchCompute with values smaller than 128, I do seem to get reasonable results:
For example, glDispatchCompute(128, 128, 1) results in "Counter: 16777216", which is consistent with 128*128*32*32. But if I call it with 256, 256, 1 I get 66811258 instead, which is no longer consistent with the expected 67108864.
For smaller compute sets I always get the expected results, but for larger ones the counter rarely goes beyond 60-100 million. Could I be hitting some driver limit? I thought that since the maximum work group count is 65535 along each axis, I should be able to request large dispatches and expect all elements to be processed.
Could it be that my way of counting by means of an atomic is flawed? Why does it still get reasonable results for small groups but fall short for large ones? How can I better debug this issue?
It is possible you're just reading the result before the computation is complete. You need an explicit call to glFinish() to force completion, and you can remove the call to glMemoryBarrier(). For OpenGL ES, glMemoryBarrier() only deals with relative ordering between stages on the GPU; it doesn't enforce ordering relative to client access.
The desktop OpenGL 4.6 spec supports CLIENT_MAPPED_BUFFER_BARRIER_BIT for synchronizing client-side access, but this isn't available in OpenGL ES.
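A minimal sketch of that suggestion applied to the read-back part of the setup code above (everything else unchanged):
glDispatchCompute(1024, 1024, 1);
// Force the dispatch to complete before the counter is read on the CPU.
// (glMemoryBarrier() is not required for this client-side read in OpenGL ES.)
glFinish();
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, ac_buffer);
GLuint *counter = (GLuint*)glMapBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), GL_MAP_READ_BIT);
printf("Counter: %u\n", *counter);
glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, 0);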

Why is my AMD card consuming vast amounts of CPU memory when I use sparse textures?

I've been investigating a performance difference in my application between AMD and nVidia cards related to sparse textures. I've recently discovered that when using an AMD GPU, creating sparse textures appears to have some kind of massive cost in terms of CPU memory.
On a Windows 10 machine with an R9 390 (8 GB of GPU memory) and 16 GB of CPU memory, I'm running the following code, attached to a timer set to fire 10 times a second:
static std::vector<GLuint> _textures;
static const size_t MAX_TEXTURES = 1000;
static const glm::uvec3 TEXTURE_SIZE(512, 512, 1);
if (_textures.size() < MAX_TEXTURES) {
GLuint texture = 0;
uint16_t mips = evalNumMips(TEXTURE_SIZE);
glCreateTextures(GL_TEXTURE_2D, 1, &texture);
_textures.push_back(texture);
glTextureParameteri(texture, GL_TEXTURE_SPARSE_ARB, GL_TRUE);
glTextureParameteri(texture, GL_VIRTUAL_PAGE_SIZE_INDEX_ARB, 0);
glTextureStorage2D(texture, mips, GL_RGBA8, TEXTURE_SIZE.x, TEXTURE_SIZE.y);
GLuint maxSparseLevel;
glGetTextureParameterIuiv(texture, GL_NUM_SPARSE_LEVELS_ARB, &maxSparseLevel);
for (uint16_t mip = 0; mip < maxSparseLevel; ++mip) {
auto dims = evalMipDimensions(TEXTURE_SIZE, mip);
glTexturePageCommitmentEXT(texture, mip, 0, 0, 0, dims.x, dims.y, dims.z, GL_TRUE);
}
}
Where...
static const double LOG_2 = log(2.0);
uint16_t evalNumMips(const glm::uvec3& size) {
double dim = glm::compMax(size);
double val = log(dim) / LOG_2;
return 1 + (uint16_t)val;
}
glm::uvec3 evalMipDimensions(const glm::uvec3& size, uint16_t mip) {
auto result = size;
result >>= mip;
return glm::max(result, glm::uvec3(1));
}
This should, as far as I know, create a texture, allocate virtual memory, and then commit that memory. It should consume GPU memory at a rate of about 10 MB per second. On an nVidia card, this code behaves as expected. On an AMD card, however, it starts consuming CPU physical memory at a rate of about 1 GB per second.
This can be seen in Process Explorer, where both the system commit size and physical memory usage immediately start to skyrocket as soon as I start the application.
Why is the AMD driver suddenly consuming massive amounts of CPU memory whenever I create a sparse texture?
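One diagnostic worth running (my suggestion, not part of the original post) is to print the virtual page sizes the driver actually exposes for GL_RGBA8, since GL_VIRTUAL_PAGE_SIZE_INDEX_ARB 0 is implementation-defined and may differ between the AMD and nVidia drivers:
// Query the sparse-texture page sizes offered for this target/format.
GLint numPageSizes = 0;
glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA8, GL_NUM_VIRTUAL_PAGE_SIZES_ARB, 1, &numPageSizes);
std::vector<GLint> pageX(numPageSizes), pageY(numPageSizes), pageZ(numPageSizes);
glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA8, GL_VIRTUAL_PAGE_SIZE_X_ARB, numPageSizes, pageX.data());
glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA8, GL_VIRTUAL_PAGE_SIZE_Y_ARB, numPageSizes, pageY.data());
glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA8, GL_VIRTUAL_PAGE_SIZE_Z_ARB, numPageSizes, pageZ.data());
for (GLint i = 0; i < numPageSizes; ++i)
    printf("page size index %d: %dx%dx%d\n", i, pageX[i], pageY[i], pageZ[i]);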

CUDA + OpenGL. Unknown code=4(cudaErrorLaunchFailure) error

I am doing a simple n-body simulation in CUDA, which I am then trying to visualize with OpenGL.
After I have initialized my particle data on the CPU, allocated the respective memory, and transferred that data to the GPU, the program has to enter the following cycle:
1) Compute the forces on each particle (CUDA part)
2) update particle positions (CUDA part)
3) display the particles for this time step (OpenGL part)
4) go back to 1)
The interface between CUDA and OpenGL I achieve with the following code:
GLuint dataBufferID;
particle_t* Particles_d;
particle_t* Particles_h;
cudaGraphicsResource *resources[1];
I allocate space in an OpenGL array buffer and register the latter as a cudaGraphicsResource using the following code:
void createVBO()
{
// create buffer object
glGenBuffers(1, &dataBufferID);
glBindBuffer(GL_ARRAY_BUFFER, dataBufferID);
glBufferData(GL_ARRAY_BUFFER, bufferStride*N*sizeof(float), 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
checkCudaErrors(cudaGraphicsGLRegisterBuffer(resources, dataBufferID, cudaGraphicsMapFlagsNone));
}
Lastly, the program cycle that I described (steps 1 to 4) is realized by the following function update(int)
void update(int value)
{
// map OpenGL buffer object for writing from CUDA
float* dataPtr;
checkCudaErrors(cudaGraphicsMapResources(1, resources, 0));
size_t num_bytes;
//get a pointer to that buffer object for manipulation with cuda!
checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dataPtr, &num_bytes,resources[0]));
//fill the Graphics Resource with particle position Data!
launch_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(Particles_d,dataPtr,1);
// unmap buffer object
checkCudaErrors(cudaGraphicsUnmapResources(1, resources, 0));
glutPostRedisplay();
glutTimerFunc(milisec,update,0);
}
The code compiles, but when I run it I get the following errors:
CUDA error at src/main.cu:390 code=4(cudaErrorLaunchFailure) "cudaGraphicsMapResources(1, resources, 0)"
CUDA error at src/main.cu:392 code=4(cudaErrorLaunchFailure) "cudaGraphicsResourceGetMappedPointer((void **)&dataPtr, &num_bytes,resources[0])"
CUDA error at src/main.cu:397 code=4(cudaErrorLaunchFailure) "cudaGraphicsUnmapResources(1, resources, 0)"
Does anyone know what the reason for these errors might be? Am I supposed to re-create the dataBuffer using createVBO() every time before update(int) executes...?
p.s. Just for more clarity, my kernel function is the following:
__global__ void launch_kernel(particle_t* Particles,float* data, int KernelMode){
int i = blockIdx.x*THREADS_PER_BLOCK + threadIdx.x;
if(KernelMode == 1){
//N_d is allocated on device memory
if(i > N_d)
return;
//and update dataBuffer!
updateX(Particles+i);
for(int d=0;d<DIM_d;d++){
data[i*bufferStride_d+d] = Particles[i].p[d]; // update the new coordinate positions in the data buffer!
}
// fill in also the RGB data and the radius. In general THIS IS NOT NECESSARY!! NEED TO PERFORM ONCE! REFACTOR!!!
data[i*bufferStride_d+DIM_d] =Particles[i].r;
data[i*bufferStride_d+DIM_d+1] =Particles[i].g;
data[i*bufferStride_d+DIM_d+2] =Particles[i].b;
data[i*bufferStride_d+DIM_d+3] =Particles[i].radius;
}else{
// if KernelMode = 2 then Update Y
float* Fold = new float[DIM_d];
for(int d=0;d<DIM_d;d++)
Fold[d]=Particles[i].force[d];
//of course in parallel :)
computeForces(Particles,i);
updateV(Particles+i,Fold);
delete [] Fold;
}
// in either case wait for all threads to finish!
__syncthreads();
}
As I mentioned in one of the comments above, it turned out that I had used the wrong compute capability compiler option. I ran cuda-memcheck and saw that the CUDA API launch was failing. After I found the right compiler options, everything worked like a charm.
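For anyone hitting the same code=4 error, a small sketch of standard CUDA error checking (not specific to this project) that surfaces the failure at the kernel launch instead of at the next interop call; note that the kernel's device-side new/delete requires compiling for compute capability 2.0 or higher (e.g. nvcc -arch=sm_20):
launch_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(Particles_d, dataPtr, 1);
cudaError_t err = cudaGetLastError();   // catches launch/configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();      // catches errors raised during execution
if (err != cudaSuccess)
    fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));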