I'm trying to create a game using Vulkan and C++. I've got to the part where I use multiple command buffers with threading, or so I think, assuming I'm doing it correctly.
Now I'm having a problem with a fence. The console (I added a validation layer) says "Fence 0x21 is already in use by another submission."
I never use the fence in any other function.
The code below is the draw function. I call this function in a loop.
update_ubo (); // this function just writes uniform data into the uniform buffer in device-local memory.
uint32_t image_index = 0;
VkResult result = vkAcquireNextImageKHR (device, swapchain, numeric_limits <uint64_t>::max (), semaphore_image_avail, fence, &image_index);
// I hope I'm using multithreading correctly.
// all command buffers recorded in record_commandbuffers function are secondary command buffers.
#pragma omp parallel for num_threads(thread::hardware_concurrency ())
for (int64_t i = 0 ; i < (int64_t) vkthreads.size () ; i ++)
record_commandbuffers (vkthreads [i], framebuffers [image_index]);
VkCommandBufferBeginInfo cmdbuf_info = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
nullptr,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
nullptr
};
VkRenderPassBeginInfo renderpass_begin = {
VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
nullptr,
renderpass,
framebuffers [image_index],
{
{ 0, 0 },
swapchain_extent
},
1,
&clear_value
};
vkBeginCommandBuffer (pcmdbuf, &cmdbuf_info);
vkCmdBeginRenderPass (pcmdbuf, &renderpass_begin, VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS);
for (size_t i = 0 ; i < vkthreads.size () ; i ++)
vkCmdExecuteCommands (pcmdbuf, (uint32_t) vkthreads [i].cmdbufs.size (), vkthreads [i].cmdbufs.data ());
vkCmdEndRenderPass (pcmdbuf);
vkEndCommandBuffer (pcmdbuf);
if (result == VK_ERROR_OUT_OF_DATE_KHR)
window_changed ();
else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
throw exception ("Could not acquire next images.");
VkPipelineStageFlags pipeline_flags [] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
VkSubmitInfo submit_info = {
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
1,
&semaphore_image_avail,
pipeline_flags,
1,
&pcmdbuf,
1,
&semaphore_render_finished
};
if (vkQueueSubmit (graphics_queue, 1, &submit_info, fence))
throw exception ("Could not submit information into the graphics queue.");
while (vkWaitForFences (device, 1, &fence, VK_TRUE, (uint64_t)100000000) == VK_TIMEOUT)
;
vkResetFences (device, 1, &fence);
VkSwapchainKHR swapchains [] = { swapchain };
VkPresentInfoKHR present_info = {
VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
nullptr,
1,
&semaphore_render_finished,
1,
swapchains,
&image_index,
nullptr
};
result = vkQueuePresentKHR (present_queue, &present_info);
if (result == VK_ERROR_OUT_OF_DATE_KHR)
window_changed ();
else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
throw exception ("Could not present the queue.");
P.S. FPS dropped significantly when I added multithreading (from 2000 fps to 210 fps, in a debug build), and CPU usage went up significantly, which is expected. Should I care about the FPS?
You pass the same fence to vkAcquireNextImageKHR and vkQueueSubmit without waiting on it in between. You only need to pass it to vkQueueSubmit, as the semaphore will take care of any required synchronization. Just pass VK_NULL_HANDLE to vkAcquireNextImageKHR.
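As a minimal sketch (reusing the handles from your code, not a complete draw function), the corrected flow could look like this:
VkResult result = vkAcquireNextImageKHR (device, swapchain,
                                         numeric_limits <uint64_t>::max (),
                                         semaphore_image_avail,
                                         VK_NULL_HANDLE,   // no fence on acquire
                                         &image_index);
// ... record the command buffers and fill submit_info exactly as before ...
vkQueueSubmit (graphics_queue, 1, &submit_info, fence);     // the only signaler of `fence`
vkWaitForFences (device, 1, &fence, VK_TRUE, UINT64_MAX);
vkResetFences (device, 1, &fence);
That way the fence is signalled by exactly one submission per frame, and the wait/reset pair guarantees it is unsignalled again before the next vkQueueSubmit.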
Paying a cost of roughly 5 ms per frame for threading overhead does seem a bit steep; I'd expect a millisecond or two from the scheduler, though it depends on how you are actually multithreading. But as long as the total per frame stays under 16 ms for 60 fps, it's no big deal.
I'm trying to synchronize frames in the Vulkan API, but I'm running into some weird problems. I implemented synchronization like this:
void RenderSystem::OnUpdate(const float deltaTime)
{
uint32_t frameIndex{};
auto result = SwapChain->AcquireNextImageIndex(PresentationCompleteSemaphore.get(),
nullptr,
&frameIndex);
InFlightFences[frameIndex]->Wait();
InFlightFences[frameIndex]->Reset();
if (result == VK_ERROR_OUT_OF_DATE_KHR)
{
Recreate();
return;
}
else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
{
throw std::runtime_error("Error when acquiring next image...");
}
UpdateModelMatrix(deltaTime, frameIndex); // TODO: Remove this! For testing purposes only
VkPipelineStageFlags waitStages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
GraphicsMainQueue.Submit({ TriangleCommandBuffers[frameIndex].get() },
{ PresentationCompleteSemaphore.get() },
{ RenderCompleteSemaphore.get() },
InFlightFences[frameIndex].get(),
waitStages);
result = PresentationQueue.Present({ RenderCompleteSemaphore.get() },
{ SwapChain.get() },
&frameIndex);
if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || MainWindow->HasBeenResized())
Recreate();
else if (result != VK_SUCCESS)
throw std::runtime_error("Failed to present result!");
}
And it works like a charm on Windows 10. Unfortunately, on Linux Mint it doesn't work in some cases. First of all, moving the window on Linux is very laggy and sometimes freezes the whole OS for a second, but that's not the biggest problem. Closing the window calls vkDeviceWaitIdle and... it freezes the application. It never starts responding again because it waits for the device forever. The validation layer doesn't report any problem with my code.
I partly solved this problem by moving the fence synchronization to the bottom of my function, but in my opinion it's a suboptimal solution, because I end up waiting for the frame to finish rendering instead of preparing the next frame.
// ...
if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || MainWindow->HasBeenResized())
Recreate();
else if (result != VK_SUCCESS)
throw std::runtime_error("Failed to present result!");
InFlightFences[frameIndex]->Wait();
InFlightFences[frameIndex]->Reset();
}
How can I properly synchronize frames not only on Windows but also on Linux? What am I doing wrong? What am I missing?
You have only one set of semaphores. That means access to those semaphores might be mis-synchronized.
Let's see the code without the distractors:
AcquireNextImageIndex( PresentationCompleteSemaphore, frameIndex );
InFlightFences[frameIndex].WaitAndReset();
QSubmit( PresentationCompleteSemaphore, RenderCompleteSemaphore, InFlightFences[frameIndex] );
Present( RenderCompleteSemaphore, frameIndex );
Now, how do we know we can reuse PresentationCompleteSemaphore in the Acquire? The Submit waits on (unsignals) it, and must finish first. We could infer this from the fence, but the fence wait happens after the Acquire, so the semaphore might still be in use while the Acquire tries to reuse it. This is a possible program flow:
AcquireNextImageIndex( PresentationCompleteSemaphore ) -> frameIndex = 0;
QSubmit( PresentationCompleteSemaphore, RenderCompleteSemaphore, InFlightFences[0] );
// hazard; QSubmit still might be waiting on PresentationCompleteSemaphore
AcquireNextImageIndex( PresentationCompleteSemaphore ) -> frameIndex = 1;
How do we know we can reuse RenderCompleteSemaphore? The QSubmit can only reuse it once Present is done with it. Currently, the only sane way to infer that is when Acquire gives back the same swapchain image. This is a possible program flow:
AcquireNextImageIndex( PresentationCompleteSemaphore ) -> frameIndex = 0;
QSubmit( PresentationCompleteSemaphore, RenderCompleteSemaphore, InFlightFences[0] );
Present( RenderCompleteSemaphore, 0 );
AcquireNextImageIndex( PresentationCompleteSemaphore ) -> frameIndex = 1;
// hazard; RenderCompleteSemaphore might still be waited on by Present
// which presented image 0, but we acquired image 1, so it might be async
QSubmit( PresentationCompleteSemaphore, RenderCompleteSemaphore, InFlightFences[1] );
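One way to avoid both hazards is a sketch with hypothetical names (kFramesInFlight, acquireSem, renderDoneSem and inFlight are assumptions, not your RenderSystem members): keep one acquire semaphore and fence per frame in flight, one render-finished semaphore per swapchain image, and wait on the frame's fence before reusing its semaphore.
uint32_t frame = 0;                 // rolling counter, 0 .. kFramesInFlight-1
void DrawFrame()
{
    // This wait proves the old submit that consumed acquireSem[frame] has
    // finished, so the semaphore may safely be handed to Acquire again.
    // (The fences are created in the signalled state for the first frames.)
    vkWaitForFences(device, 1, &inFlight[frame], VK_TRUE, UINT64_MAX);
    vkResetFences(device, 1, &inFlight[frame]);
    uint32_t imageIndex = 0;
    vkAcquireNextImageKHR(device, swapchain, UINT64_MAX,
                          acquireSem[frame], VK_NULL_HANDLE, &imageIndex);
    // Submit: wait on acquireSem[frame], signal renderDoneSem[imageIndex]
    // and inFlight[frame]; then present waiting on renderDoneSem[imageIndex].
    // ...
    frame = (frame + 1) % kFramesInFlight;
}
Indexing the render-finished semaphore by the acquired image (rather than by the frame counter) matches the reasoning above: it is only reused once Acquire hands the same image back.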
I have 4 sounds. I need to play sound 1; when it finishes, automatically play sound 2; when sound 2 finishes, automatically play sound 3; when sound 3 finishes, play sound 4. I'm using SDL Mixer 2.0, not SDL Sound. Is there a way?
#include <SDL.h>
#include <SDL_mixer.h>
int main() {
int frequencia = 22050;
Uint16 formato = AUDIO_S16SYS;
int canal = 2; // 1 mono; 2 = stereo;
int buffer = 4096;
Mix_OpenAudio(frequencia, formato, canal, buffer);
Mix_Chunk* sound_1;
Mix_Chunk* sound_2;
Mix_Chunk* sound_3;
Mix_Chunk* sound_4;
sound_1 = Mix_LoadWAV("D:\\sound1.wav");
sound_2 = Mix_LoadWAV("D:\\sound1.wav");
sound_3 = Mix_LoadWAV("D:\\sound1.wav");
sound_4 = Mix_LoadWAV("D:\\sound1.wav");
Mix_PlayChannel(-1, sound_1, 0);
Mix_PlayChannel(1, sound_2, 0);
Mix_PlayChannel(2, sound_3, 0);
Mix_PlayChannel(3, sound_4, 0);
return 0;
}
Check in a loop whether the channel is still playing using Mix_Playing(), and add a delay using SDL_Delay() to prevent the loop from consuming all available CPU time.
(In this example, I changed your first call to Mix_PlayChannel() from -1 to 1.)
Mix_PlayChannel(1, sound_1, 0);
while (Mix_Playing(1) != 0) {
SDL_Delay(200); // wait 200 milliseconds
}
Mix_PlayChannel(2, sound_2, 0);
while (Mix_Playing(2) != 0) {
SDL_Delay(200); // wait 200 milliseconds
}
// etc.
You should probably wrap that into a function instead so that you don't repeat what is basically the same code over and over again:
void PlayAndWait(int channel, Mix_Chunk* chunk, int loops)
{
channel = Mix_PlayChannel(channel, chunk, loops);
if (channel < 0) {
return; // error
}
while (Mix_Playing(channel) != 0) {
SDL_Delay(200);
}
}
// ...
PlayAndWait(-1, sound_1, 0);
PlayAndWait(1, sound_2, 0);
PlayAndWait(2, sound_3, 0);
PlayAndWait(3, sound_4, 0);
I'm working with the v4l2 API for capturing images from a raw sensor on an embedded platform. My capture routine is based on the example at [1]. The proposed method for streaming uses mmapped buffers as a ring buffer.
For initialization, buffers (by default 4) are requested using ioctl with the VIDIOC_REQBUFS identifier. Subsequently, they are queued using VIDIOC_QBUF. The entire streaming procedure is described here [2]. As soon as streaming starts, the driver fills the queued buffers with data. The timestamp in the v4l2_buffer struct indicates the time the first byte was captured, which in my case shows an interval of approximately 8.3 ms (= 120 fps) between buffers. So far so good.
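(For reference, that initialization sequence, REQBUFS -> QUERYBUF/mmap -> QBUF -> STREAMON, looks roughly like the following sketch; `fd` and `buffers[]` here are placeholders rather than the actual members of my Capture() routine below, and error checking is omitted.)
#include <linux/videodev2.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
struct MappedBuffer { void* ptr; size_t length; };
static MappedBuffer buffers[4];
void init_mmap_streaming(int fd)
{
    v4l2_requestbuffers req = {};
    req.count  = 4;                              // the default four ring slots
    req.type   = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    req.memory = V4L2_MEMORY_MMAP;
    ioctl(fd, VIDIOC_REQBUFS, &req);
    for (unsigned i = 0; i < req.count; ++i) {
        v4l2_buffer buf = {};
        buf.type   = V4L2_BUF_TYPE_VIDEO_CAPTURE;
        buf.memory = V4L2_MEMORY_MMAP;
        buf.index  = i;
        ioctl(fd, VIDIOC_QUERYBUF, &buf);        // fetch offset and length
        buffers[i].length = buf.length;
        buffers[i].ptr    = mmap(nullptr, buf.length, PROT_READ | PROT_WRITE,
                                 MAP_SHARED, fd, buf.m.offset);
        ioctl(fd, VIDIOC_QBUF, &buf);            // hand the buffer to the driver
    }
    v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    ioctl(fd, VIDIOC_STREAMON, &type);           // driver starts filling queued buffers
}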
Now what I would expect of a ring buffer is that new captures automatically overwrite older ones in a circular fashion. But this is not what happens. A new frame is only assigned to a buffer once that buffer has been dequeued (VIDIOC_DQBUF), processed (demosaic, tracking step, ...), and queued again (VIDIOC_QBUF). Even if I do meet the timing condition (processing < 8.3 ms), dequeuing does not give me the latest captured frame but the oldest one (FIFO order), i.e. the frame from 3 x 8.3 ms before the current one. If the timing condition is not met, the time span gets even larger, since the buffers are not overwritten.
So I have several questions:
1. Does it even make sense for this tracking application to have a ring buffer, as I don't really need a history of frames? I certainly doubt it, but with the proposed mmap method drivers mostly require a minimum number of buffers to be requested.
2. Should a separate thread continuously DQBUF and QBUF to accomplish the buffer overwrite? How could this be accomplished?
3. As a workaround, one could probably dequeue and requeue all buffers on every capture, but this doesn't sound right. Can someone with more experience in real-time capture and streaming point to the "proper" way to go?
4. Also, currently I'm doing the preprocessing step (demosaicing) between DQBUF and QBUF, and the tracking step afterwards. Should the tracking step also be executed before QBUF is called again?
So the main code basically performs Capture() and Track() subsequently in a while loop. The Capture routine looks as follows:
cv::Mat v4l2Camera::Capture( size_t timeout ) {
fd_set fds;
FD_ZERO(&fds);
FD_SET(mFD, &fds);
struct timeval tv;
tv.tv_sec = 0;
tv.tv_usec = 0;
const bool threaded = true; //false;
// proper register settings
this->format2registerSetting();
//
if( timeout > 0 )
{
tv.tv_sec = timeout / 1000;
tv.tv_usec = (timeout - (tv.tv_sec * 1000)) * 1000;
}
//
const int result = select(mFD + 1, &fds, NULL, NULL, &tv);
if( result == -1 )
{
//if (EINTR == errno)
printf("v4l2 -- select() failed (errno=%i) (%s)\n", errno, strerror(errno));
return cv::Mat();
}
else if( result == 0 )
{
if( timeout > 0 )
printf("v4l2 -- select() timed out...\n");
return cv::Mat(); // timeout, not necessarily an error (TRY_AGAIN)
}
// dequeue input buffer from V4L2
struct v4l2_buffer buf;
memset(&buf, 0, sizeof(v4l2_buffer));
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP; //V4L2_MEMORY_USERPTR;
if( xioctl(mFD, VIDIOC_DQBUF, &buf) < 0 )
{
printf("v4l2 -- ioctl(VIDIOC_DQBUF) failed (errno=%i) (%s)\n", errno, strerror(errno));
return cv::Mat();
}
if( buf.index >= mBufferCountMMap )
{
printf("v4l2 -- invalid mmap buffer index (%u)\n", buf.index);
return cv::Mat();
}
// emit ringbuffer entry
printf("v4l2 -- received %ux%u video frame (index=%u)\n", mWidth, mHeight, (uint32_t)buf.index);
void* image_ptr = mBuffersMMap[buf.index].ptr;
// frame processing (& tracking step)
cv::Mat demosaic_mat = demosaic(image_ptr,mSize,mDepth,1);
// re-queue buffer to V4L2
if( xioctl(mFD, VIDIOC_QBUF, &buf) < 0 )
printf("v4l2 -- ioctl(VIDIOC_QBUF) failed (errno=%i) (%s)\n", errno, strerror(errno));
return demosaic_mat;
}
As my knowledge of capturing and streaming video is limited, I appreciate any help.
I am developing an application that reads data from a named pipe on Windows 7 at around 800 Mbps. I have to use several threads, since the FIFO at the other side of the pipe overflows if I am not able to read at that speed. The performance, though, is really pitiful and I cannot understand why. I have already read several things and tried to split up the memory to avoid bad memory sharing.
At the beginning I was thinking it could be a problem with contiguous memory positions, but the memory sections are queued in a list and the main thread does not use them any more after queuing them. The amounts of memory are huge, so I don't think they lie on the same pages.
This is the threaded function:
void splitMessage(){
char* bufferMSEO;
char* bufferMDO;
std::list<struct msgBufferStr*> localBufferList;
while(1)
{
long bytesProcessed = 0;
{
std::unique_lock<std::mutex> lk(bufferMutex);
while(bufferList.empty())
{
// Wait until the map has data
listReady.wait(lk);
}
//Extract the data from the list and copy to the local list
localBufferList.splice(localBufferList.end(),bufferList);
//Unlock the mutex and notify
// Manual unlocking is done before notifying, to avoid waking up
// the waiting thread only to block again (see notify_one for details)
lk.unlock();
//listReady.notify_one();
}
for(auto nextBuffer = localBufferList.begin(); nextBuffer != localBufferList.end(); nextBuffer++)
{
//nextBuffer = it->second();
bufferMDO = (*nextBuffer)->MDO;
bufferMSEO = (*nextBuffer)->MSEO;
bytesProcessed += (*nextBuffer)->size;
//Process the data Stream
for(int k=0; k<(*nextBuffer)->size; k++)
{
}
//localBufferList.remove(*nextBuffer);
free(bufferMDO);
free(bufferMSEO);
free(*nextBuffer);
}
localBufferList.clear();
}
}
And here the thread that reads the data and queue them:
DWORD WINAPI InstanceThread(LPVOID lpvParam)
// This routine is a thread processing function to read from and reply to a client
// via the open pipe connection passed from the main loop. Note this allows
// the main loop to continue executing, potentially creating more threads of
// of this procedure to run concurrently, depending on the number of incoming
// client connections.
{
HANDLE hHeap = GetProcessHeap();
TCHAR* pchRequest = (TCHAR*)HeapAlloc(hHeap, 0, BUFSIZE*sizeof(TCHAR));
DWORD cbBytesRead = 0, cbReplyBytes = 0, cbWritten = 0;
BOOL fSuccess = FALSE;
HANDLE hPipe = NULL;
double totalRxData = 0;
char* bufferPnt;
char* bufferMDO;
char* bufferMSEO;
char* destPnt;
// Do some extra error checking since the app will keep running even if this
// thread fails.
if (lpvParam == NULL)
{
printf( "\nERROR - Pipe Server Failure:\n");
printf( " InstanceThread got an unexpected NULL value in lpvParam.\n");
printf( " InstanceThread exitting.\n");
if (pchRequest != NULL) HeapFree(hHeap, 0, pchRequest);
return (DWORD)-1;
}
if (pchRequest == NULL)
{
printf( "\nERROR - Pipe Server Failure:\n");
printf( " InstanceThread got an unexpected NULL heap allocation.\n");
printf( " InstanceThread exitting.\n");
return (DWORD)-1;
}
// Print verbose messages. In production code, this should be for debugging only.
printf("InstanceThread created, receiving and processing messages.\n");
// The thread's parameter is a handle to a pipe object instance.
hPipe = (HANDLE) lpvParam;
try
{
msgSplitter = std::thread(&splitMessage);
//msgSplitter.detach();
}
catch(...)
{
_tprintf(TEXT("CreateThread failed, GLE=%d.\n"), GetLastError());
return -1;
}
while (1)
{
struct msgBufferStr *newBuffer = (struct msgBufferStr* )malloc(sizeof(struct msgBufferStr));
// Read client requests from the pipe. This simplistic code only allows messages
// up to BUFSIZE characters in length.
fSuccess = ReadFile(
hPipe, // handle to pipe
pchRequest, // buffer to receive data
BUFSIZE*sizeof(TCHAR), // size of buffer
&cbBytesRead, // number of bytes read
NULL); // not overlapped I/O
if (!fSuccess || cbBytesRead == 0)
{
if (GetLastError() == ERROR_BROKEN_PIPE)
{
_tprintf(TEXT("InstanceThread: client disconnected.\n"), GetLastError());
break;
}
else if (GetLastError() == ERROR_MORE_DATA)
{
}
else
{
_tprintf(TEXT("InstanceThread ReadFile failed, GLE=%d.\n"), GetLastError());
}
}
//timeStart = omp_get_wtime();
bufferPnt = (char*)pchRequest;
totalRxData += ((double)cbBytesRead)/1000000;
bufferMDO = (char*) malloc(cbBytesRead);
bufferMSEO = (char*) malloc(cbBytesRead/3);
destPnt = bufferMDO;
//#pragma omp parallel for
for(int i = 0; i < cbBytesRead/12; i++)
{
msgCounter++;
if(*(bufferPnt + (i * 12)) == 0) continue;
if(*(bufferPnt + (i * 12)) == 8)
{
errorCounter++;
continue;
}
//Use 64 bits variables in order to make less operations
unsigned long long *sourceAddrLong = (unsigned long long*) (bufferPnt + (i * 12));
unsigned long long *destPntLong = (unsigned long long*) (destPnt + (i * 8));
//Copy the data bytes from source to destination
*destPntLong = *sourceAddrLong;
//Copy and prepare the MSEO lines for the data processing
bufferMSEO[i*4]=(bufferPnt[(i * 12) + 8] & 0x03);
bufferMSEO[i*4 + 1]=(bufferPnt[(i * 12) + 8] & 0x0C) >> 2;
bufferMSEO[i*4 + 2]=(bufferPnt[(i * 12) + 8] & 0x30) >> 4;
bufferMSEO[i*4 + 3]=(bufferPnt[(i * 12) + 8] & 0xC0) >> 6;
}
newBuffer->size = cbBytesRead/3;
newBuffer->MDO = bufferMDO;
newBuffer->MSEO = bufferMSEO;
{
//lock the mutex
std::lock_guard<std::mutex> lk(bufferMutex);
//add data to the list
bufferList.push_back(newBuffer);
} // bufferMutex is automatically released when lk goes out of scope
//Notify
listReady.notify_one();
}
// Flush the pipe to allow the client to read the pipe's contents
// before disconnecting. Then disconnect the pipe, and close the
// handle to this pipe instance.
FlushFileBuffers(hPipe);
DisconnectNamedPipe(hPipe);
CloseHandle(hPipe);
HeapFree(hHeap, 0, pchRequest);
//Show memory leak isues
_CrtDumpMemoryLeaks();
//TODO: Join thread
printf("InstanceThread exitting.\n");
return 1;
}
The thing that really blows my mind is that if I leave it like this, the splitMessage thread takes minutes to process the data even though the first thread finished reading it long ago. I mean, the read thread reads about 1.5 GB of information in seconds and then waits for more data from the pipe, while this data is processed by the split thread (the only one really "doing" something) over almost a minute or more. On top of that, CPU usage is below 20%. (It is an i7 laptop with 16 GB RAM and 8 cores!)
On the other hand, if I just comment out the for loop in the process thread:
for(int k=0; k<(*nextBuffer)->size; k++)
Then the data is read slowly and the FIFO on the other side of the pipe overflows. With 8 processors at more than 2 GHz it should be fast enough to go through the buffers without much trouble, shouldn't it? I think it has to be a memory access issue, or the scheduler is somehow sending the thread to sleep, but I cannot figure out why. Another possibility is that iterating through the linked list with the iterator is not optimal.
Any help would be great, because I have been trying to understand this for a couple of days; I have made several changes to the code and tried to simplify it as much as possible, and I am going crazy :).
best regards,
Manuel
I'm seeing a lot of threads on this forum dealing with the question of whether or not we need synchronization when accessing primitive data types from multiple threads: Question 1, question 2, question 3, question 4...
So I wrote a small test to verify this:
I ran it for over an hour on my Intel(R) Core(TM) i7 CPU 860 @ 2.80GHz, which has 4 physical cores:
#define MULTIPLIC_VAL 17
DWORD gdwSharedVal01 = MULTIPLIC_VAL;
DWORD WINAPI thread001(LPVOID lpParameter);
//Begin threads
for(int i = 0; i < 30; i++)
{
DWORD dwThreadId;
HANDLE hThread = ::CreateThread(NULL, 0, thread001, NULL, 0, &dwThreadId);
if(hThread)
{
::CloseHandle(hThread);
}
else
{
_tprintf(L"ERROR: CreateThread error %d\n", ::GetLastError());
}
}
//Wait
getchar();
BOOL checkSharedValue()
{
//RETURN:
// = TRUE if value is OK
if((gdwSharedVal01 % MULTIPLIC_VAL) == 0)
{
return TRUE;
}
return FALSE;
}
DWORD WINAPI thread001(LPVOID lpParameter)
{
srand((UINT)time(NULL));
DWORD dwThreadID = ::GetCurrentThreadId();
_tprintf(L"Thread %d began...\n", dwThreadID);
for(;;)
{
//Set value
DWORD v = rand();
v <<= 16;
v ^= rand();
v = v / MULTIPLIC_VAL;
gdwSharedVal01 = v * MULTIPLIC_VAL;
//Check value
if(!checkSharedValue())
{
//Failed
_tprintf(L"FAILED thread %d\n", dwThreadID);
}
}
return 0;
}
and I got no fails. So how would you explain it?
On Intel x86, reads and writes of aligned words are atomic operations (atomic in the sense that other processors will see either the original value or the new value).
Note that this does not mean you should not provide a synchronization mechanism. This test case is one in which threads just write and read new values into the same variable. If they performed an operation that involved a read-modify-write for the update, it could fail (say 10 threads each incrementing the variable 100 times: at the end it might not have been incremented by 1000 in total!). It also relies on there being no other variables in play (where compiler/CPU reordering could cause other issues).
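Here is a small sketch of that failure mode in standard C++ (not the Win32 test above): ten threads each increment a shared counter 100 times. The plain int may end up below 1000, because ++ is a non-atomic read-modify-write, while the std::atomic counter always reaches exactly 1000.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>
int plainCounter = 0;                    // unsynchronized read-modify-write target
std::atomic<int> atomicCounter{0};       // atomic read-modify-write target
int main() {
    std::vector<std::thread> threads;
    for (int t = 0; t < 10; ++t) {
        threads.emplace_back([] {
            for (int i = 0; i < 100; ++i) {
                ++plainCounter;                                         // load + add + store: increments can be lost
                atomicCounter.fetch_add(1, std::memory_order_relaxed);  // single atomic RMW
            }
        });
    }
    for (auto& th : threads) th.join();
    std::printf("plain=%d atomic=%d\n", plainCounter, atomicCounter.load());
    return 0;
}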