Buffer communication speed nightmare - c++

I'm trying to use buffers to communicate between several 'layers' (threads) in my program and now that I have visual output of what's going on inside, I realize there's a devastating amount of time being eaten up in the process of using these buffers.
Here's some notes about what's going on in my code.
when the rendering mode is triggered in this thread, it begins sending as many points as it can to the layer (thread) below it
the points from the lower thread are then processed and returned to this thread via the output buffer of the lower thread
points received back are mapped (for now) as white pixels in the D3D surface
if I bypass the buffer and put the points directly into the surface pixels, it only takes about 3 seconds to do the whole job
if I hand the point down and then have the lower layer pass it right back up, skipping any actual number-crunching, the whole job takes about 30 minutes (which makes the whole program useless)
changing the size of my buffers has no noticeable effect on the speed
I was originally using MUTEXes in my buffers but have eliminated them in attempt the fix the problem
Is there something I can do differently to fix this speed problem I'm having?
...something to do with the way I'm handling these messages???
Here's my code
I'm very sorry that it's such a mess. I'm having to move way too fast on this project and I've left a lot of pieces laying around in comments where I've been experimenting.
DWORD WINAPI CONTROLSUBSYSTEM::InternalExProcedure(__in LPVOID lpSelf)
{
XMSG xmsg;
LPCONTROLSUBSYSTEM lpThis = ((LPCONTROLSUBSYSTEM)lpSelf);
BOOL bStall;
BOOL bRendering = FALSE;
UINT64 iOutstandingPoints = 0; // points that are out being tested
UINT64 iPointsDone = 0;
UINT64 iPointsTotal = 0;
BOOL bAssigning;
DOUBLE dNextX;
DOUBLE dNextY;
while(1)
{
if( lpThis->hwTargetWindow!=NULL && lpThis->d3ddev!=NULL )
{
lpThis->d3ddev->Clear(0,NULL,D3DCLEAR_TARGET,D3DCOLOR_XRGB(0,0,0),1.0f,0);
if(lpThis->d3ddev->BeginScene())
{
lpThis->d3ddev->StretchRect(lpThis->sfRenderingCanvas,NULL,lpThis->sfBackBuffer,NULL,D3DTEXF_NONE);
lpThis->d3ddev->EndScene();
}
lpThis->d3ddev->Present(NULL,NULL,NULL,NULL);
}
//bStall = TRUE;
// read input buffer
if(lpThis->bfInBuffer.PeekMessage(&xmsg))
{
bStall = FALSE;
if( HIBYTE(xmsg.wType)==HIBYTE(CONT_MSG) )
{
// take message off
lpThis->bfInBuffer.GetMessage(&xmsg);
// double check consistency
if( HIBYTE(xmsg.wType)==HIBYTE(CONT_MSG) )
{
switch(LOBYTE(xmsg.wType))
{
case SETRESOLUTION_MSG:
lpThis->iAreaWidth = (UINT)xmsg.dptPoint.X;
lpThis->iAreaHeight = (UINT)xmsg.dptPoint.Y;
lpThis->sfRenderingCanvas->Release();
if(lpThis->d3ddev->CreateOffscreenPlainSurface(
(UINT)xmsg.dptPoint.X,(UINT)xmsg.dptPoint.Y,
D3DFMT_X8R8G8B8,
D3DPOOL_DEFAULT,
&(lpThis->sfRenderingCanvas),
NULL)!=D3D_OK)
{
MessageBox(NULL,"Error resizing surface.","ERROR",MB_ICONERROR);
}
else
{
D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
lpThis->iPitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
ZeroMemory(data,lpThis->iPitch*lpThis->iAreaHeight);
lpThis->sfRenderingCanvas->UnlockRect();
MessageBox(NULL,"Surface Resized","yay",0);
}
else
{
MessageBox(NULL,"Error resizing surface.","ERROR",MB_ICONERROR);
}
}
break;
case SETCOLORMETHOD_MSG:
break;
case SAVESNAPSHOT_MSG:
lpThis->SaveSnapshot();
break;
case FORCERENDER_MSG:
bRendering = TRUE;
iPointsTotal = lpThis->iAreaHeight*lpThis->iPitch;
iPointsDone = 0;
MessageBox(NULL,"yay, render something!",":o",0);
break;
default:
break;
}
}// else, lost this message
}
else
{
if( HIBYTE(xmsg.wType)==HIBYTE(MATH_MSG) )
{
XMSG xmsg2;
switch(LOBYTE(xmsg.wType))
{
case RESETFRAME_MSG:
case ZOOMIN_MSG:
case ZOOMOUT_MSG:
case PANUP_MSG:
case PANDOWN_MSG:
case PANLEFT_MSG:
case PANRIGHT_MSG:
// tell self to start a render
xmsg2.wType = CONT_MSG|FORCERENDER_MSG;
if(lpThis->bfInBuffer.PutMessage(&xmsg2))
{
// pass it down
while(!lpThis->lplrSubordinate->PutMessage(&xmsg));
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
break;
default:
// pass it down
if(lpThis->lplrSubordinate->PutMessage(&xmsg))
{
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
break;
}
}
else if( lpThis->lplrSubordinate!=NULL )
// pass message down
{
if(lpThis->lplrSubordinate->PutMessage(&xmsg))
{
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
}
}
}
// read output buffer from subordinate
if( lpThis->lplrSubordinate!=NULL && lpThis->lplrSubordinate->PeekMessage(&xmsg) )
{
bStall = FALSE;
if( xmsg.wType==(REPLY_MSG|TESTPOINT_MSG) )
{
// got point test back
D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
INT pitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
INT Y=dRound((xmsg.dptPoint.Y/(DOUBLE)100)*((DOUBLE)lpThis->iAreaHeight));
INT X=dRound((xmsg.dptPoint.X/(DOUBLE)100)*((DOUBLE)pitch));
// decide color
if( xmsg.iNum==0 )
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
else
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
// message handled so remove from buffer
lpThis->lplrSubordinate->GetMessage(&xmsg);
lpThis->sfRenderingCanvas->UnlockRect();
}
}
else if(lpThis->bfOutBuffer.PutMessage(&xmsg))
{
// message sent so pull the real one off the buffer
lpThis->lplrSubordinate->GetMessage(&xmsg);
}
}
if( bRendering && lpThis->lplrSubordinate!=NULL )
{
bAssigning = TRUE;
while(bAssigning)
{
dNextX = 100*((DOUBLE)(iPointsDone%lpThis->iPitch))/((DOUBLE)lpThis->iPitch);
dNextY = 100*(DOUBLE)((INT)(iPointsDone/lpThis->iPitch))/(DOUBLE)(lpThis->iAreaHeight);
xmsg.dptPoint.X = dNextX;
xmsg.dptPoint.Y = dNextY;
//
//xmsg.iNum = 0;
//xmsg.wType = REPLY_MSG|TESTPOINT_MSG;
//
xmsg.wType = MATH_MSG|TESTPOINT_MSG;
/*D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
INT pitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
INT Y=dRound((dNextY/(DOUBLE)100)*((DOUBLE)lpThis->iAreaHeight));
INT X=dRound((dNextX/(DOUBLE)100)*((DOUBLE)pitch));
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
lpThis->sfRenderingCanvas->UnlockRect();
}
iPointsDone++;
if( iPointsDone>=iPointsTotal )
{
MessageBox(NULL,"done rendering","",0);
bRendering = FALSE;
bAssigning = FALSE;
}
*/
if( lpThis->lplrSubordinate->PutMessage(&xmsg) )
{
bStall = FALSE;
iPointsDone++;
if( iPointsDone>=iPointsTotal )
{
MessageBox(NULL,"done rendering","",0);
bRendering = FALSE;
bAssigning = FALSE;
}
}
else
{
bAssigning = FALSE;
}
}
}
//if( bStall )
//Sleep(10);
}
return 0;
}
}
(still getting used to this forum's code block stuff)
Edit:
Here's an example that I perceive to be similar in concept, although this example consumes the messages it produces in the same thread.
#include <Windows.h>
#include "BUFFER.h"
int main()
{
BUFFER myBuffer;
INT jobsTotal = 1024*768;
INT currentJob = 0;
INT jobsOut = 0;
XMSG xmsg;
while(1)
{
if(myBuffer.PeekMessage(&xmsg))
{
// do something with message
// ...
// if successful, remove message
myBuffer.GetMessage(&xmsg);
jobsOut--;
}
while( currentJob<jobsTotal )
{
if( myBuffer.PutMessage(&xmsg) )
{
currentJob++;
jobsOut++;
}
else
{
// buffer is full at the moment
// stop for now and put more on later
break;
}
}
if( currentJob==jobsTotal && jobsOut==0 )
{
MessageBox(NULL,"done","",0);
break;
}
}
return 0;
}
This example also runs in about 3 seconds, as opposed to 30 minutes.
Btw, if anybody knows why visual studio keeps trying to make me say PeekMessageA and GetMessageA instead of the actual names I defined, that would be nice to know as well.

Locking and Unlocking an entire rect to change a single point is probably not very efficient, you might be better off generating a list of points you intend to modify and then locking the rect once, iterating over that list and modifying all the points, and then unlocking the rect.
When you lock the rect you are effectively stalling concurrent access to it, so its like a mutex for the GPU in that respect - then you only modify a single pixel. Doing this repeatedly for each pixel will constantly stall the GPU. You could use D3DLOCK_NOSYSLOCK to avoid this to some extent, but I'm not sure if it will play nicely in the larger context of your program.
I'm obviously not entirely sure what the goal of your algorithm is, but if you are trying to parallel process pixels on a d3d surface, then i think the best approach would be via a shader on the GPU.
Where you basically generate an array in system memory, populate it with "input" values on a per point/pixel basis, then generate a texture on a GPU from the array. Next you paint the texture to a full screen quad, and then render it with a pixel shader to some render target. The shader can be coded to process each point in whatever way you like, the GPU will take care of optimizing parallelization. Then you generate a new texture from that render target and then you copy that texture into a system memory array. And then you can extract all your outputs from that array. You can also apply multiple shaders to the render target result back into the render target to pipeline multiple transformations if needed.

A couple notes:
Don't write your own messape-passing code. It may be correct and slow, or fast and buggy. It takes a lot of experience to design code that's fast and then getting it bug-free is really hard, because debugging threaded code is hard. Win32 provides a couple of efficient threadsafe queues: SList and the window message queue.
Your design splits up work in the worst possible way. Passing information between threads is expensive even under the best circumstances, because it causes cache contention, both on the data and on the synchronization objects. It's MUCH better to split your work into distinct non-interacting (or minimize interaction) datasets and give each to a separate thread, that is then responsible for all stages of processing that dataset.

Don't poll.
That's likely to be the heart of the problem. You have a task continually calling peekmessage and probably finding nothing there. This will just eat all available CPU. Any task that wants to post messages is unlikely to receive any CPU time to acheive this.
I can't remember how you'd achieve this with the windows message queue (probably WaitMessage or some variant) but typically you might implement this with a counting semaphore. When the consumer wants data, it waits for the semaphore to be signalled. When the producer has data, it signals the semaphore.

I managed to resolve it by redesigning the whole thing
It now passes huge payloads instead of individual tasks
(I'm the poster)

Related

SetPerTcpConnectionEStats fails and can't get GetPerTcpConnectionEStats multiple times c++

I am following the example in https://learn.microsoft.com/en-gb/windows/win32/api/iphlpapi/nf-iphlpapi-getpertcp6connectionestats?redirectedfrom=MSDN to get the TCP statistics. Although, I got it working and get the statistics in the first place, still I want to record them every a time interval (which I haven't managed to do so), and I have the following questions.
The SetPerTcpConnectionEStats () fails with status != NO_ERROR and equal to 5. Although, it fails, I can get the statistics. Why?
I want to get the statistics every, let's say 1 second. I have tried two different ways; a) to use a while loop and use a std::this_thread::sleep_for(1s), where I could get the statistics every ~1sec, but the whole app was stalling (is it because of the this), I supposed that I am blocking the operation of the main, and b) (since a) failed) I tried to call TcpStatistics() from another function (in different class) that is triggered every 1 sec (I store clientConnectRow to a global var). However, in that case (b), GetPerTcpConnectionEStats() fails with winStatus = 1214 (ERROR_INVALID_NETNAME) and of course TcpStatistics() cannot get any of the statistics.
a)
ClassB::ClassB()
{
UINT winStatus = GetTcpRow(localPort, hostPort, MIB_TCP_STATE_ESTAB, (PMIB_TCPROW)clientConnectRow);
ToggleAllEstats(clientConnectRow, TRUE);
thread t1(&ClassB::TcpStatistics, this, clientConnectRow);
t1.join();
}
ClassB::TcpStatistics()
{
while (true)
{
GetAndOutputEstats(row, TcpConnectionEstatsBandwidth)
// some more code here
this_thread::sleep_for(milliseconds(1000));
}
}
b)
ClassB::ClassB()
{
MIB_TCPROW client4ConnectRow;
void* clientConnectRow = NULL;
clientConnectRow = &client4ConnectRow;
UINT winStatus = GetTcpRow(localPort, hostPort, MIB_TCP_STATE_ESTAB, (PMIB_TCPROW)clientConnectRow);
m_clientConnectRow = clientConnectRow;
TcpStatistics();
}
ClassB::TcpStatistics()
{
ToggleAllEstats(m_clientConnectRow , TRUE);
void* row = m_clientConnectRow;
GetAndOutputEstats(row, TcpConnectionEstatsBandwidth)
// some more code here
}
ClassB::GetAndOutputEstats(void* row, TCP_ESTATS_TYPE type)
{
//...
winStatus = GetPerTcpConnectionEStats((PMIB_TCPROW)row, type, NULL, 0, 0, ros, 0, rosSize, rod, 0, rodSize);
if (winStatus != NO_ERROR) {wprintf(L"\nGetPerTcpConnectionEStats %s failed. status = %d", estatsTypeNames[type], winStatus); //
}
else { ...}
}
ClassA::FunA()
{
classB_ptr->TcpStatistics();
}
I found a work around for the second part of my question. I am posting it here, in case someone else find it useful. There might be other solutions too, more advanced, but this is how I did it myself. We have to first Obtain MIB_TCPROW corresponding to the TCP connection and then to Enable Estats collection before dumping current stats. So, what I did was to add all of these in a function and call this instead, every time I want to get the stats.
void
ClassB::FunSetTcpStats()
{
MIB_TCPROW client4ConnectRow;
void* clientConnectRow = NULL;
clientConnectRow = &client4ConnectRow;
//this is for the statistics
UINT winStatus = GetTcpRow(lPort, hPort, MIB_TCP_STATE_ESTAB, (PMIB_TCPROW)clientConnectRow); //lPort & hPort in htons!
if (winStatus != ERROR_SUCCESS) {
wprintf(L"\nGetTcpRow failed on the client established connection with %d", winStatus);
return;
}
//
// Enable Estats collection and dump current stats.
//
ToggleAllEstats(clientConnectRow, TRUE);
TcpStatistics(clientConnectRow); // same as GetAllEstats() in msdn
}

Is it possible to wait for a transfer from the staging buffer to complete without calling vkQueueWaitIdle

The following piece of code show you how i transfer a vertex buffer data from the staging buffer to a local memory buffer :
bool Vulkan::UpdateVertexBuffer(std::vector<VERTEX>& data, VULKAN_BUFFER& vertex_buffer)
{
std::memcpy(this->staging_buffer.pointer, &data[0], vertex_buffer.size);
size_t flush_size = static_cast<size_t>(vertex_buffer.size);
unsigned int multiple = static_cast<unsigned int>(flush_size / this->physical_device.properties.limits.nonCoherentAtomSize);
flush_size = this->physical_device.properties.limits.nonCoherentAtomSize * ((uint64_t)multiple + 1);
VkMappedMemoryRange flush_range = {};
flush_range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
flush_range.pNext = nullptr;
flush_range.memory = this->staging_buffer.memory;
flush_range.offset = 0;
flush_range.size = flush_size;
vkFlushMappedMemoryRanges(this->device, 1, &flush_range);
VkResult result = vkWaitForFences(this->device, 1, &this->transfer.fence, VK_FALSE, 1000000000);
if(result != VK_SUCCESS) {
#if defined(_DEBUG)
std::cout << "UpdateVertexBuffer => vkWaitForFences : Timeout" << std::endl;
#endif
return false;
}
vkResetFences(this->device, 1, &this->transfer.fence);
VkCommandBufferBeginInfo command_buffer_begin_info = {};
command_buffer_begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
command_buffer_begin_info.pNext = nullptr;
command_buffer_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
command_buffer_begin_info.pInheritanceInfo = nullptr;
vkBeginCommandBuffer(this->transfer.command_buffer, &command_buffer_begin_info);
VkBufferCopy buffer_copy_info = {};
buffer_copy_info.srcOffset = 0;
buffer_copy_info.dstOffset = 0;
buffer_copy_info.size = vertex_buffer.size;
vkCmdCopyBuffer(this->transfer.command_buffer, this->staging_buffer.handle, vertex_buffer.handle, 1, &buffer_copy_info);
VkBufferMemoryBarrier buffer_memory_barrier = {};
buffer_memory_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
buffer_memory_barrier.pNext = nullptr;
buffer_memory_barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
buffer_memory_barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
buffer_memory_barrier.srcQueueFamilyIndex = this->queue_stack[this->transfer_stack_index].index;
buffer_memory_barrier.dstQueueFamilyIndex = this->queue_stack[this->graphics_stack_index].index;
buffer_memory_barrier.buffer = vertex_buffer.handle;
buffer_memory_barrier.offset = 0;
buffer_memory_barrier.size = VK_WHOLE_SIZE;
vkCmdPipelineBarrier(this->transfer.command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1, &buffer_memory_barrier, 0, nullptr);
vkEndCommandBuffer(this->transfer.command_buffer);
VkSubmitInfo submit_info = {};
submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submit_info.pNext = nullptr;
submit_info.waitSemaphoreCount = 0;
submit_info.pWaitSemaphores = nullptr;
submit_info.pWaitDstStageMask = nullptr;
submit_info.commandBufferCount = 1;
submit_info.pCommandBuffers = &this->transfer.command_buffer;
submit_info.signalSemaphoreCount = 0;
submit_info.pSignalSemaphores = nullptr;
VkResult result = vkQueueSubmit(this->queue_stack[this->transfer_stack_index].handle, 1, &submit_info, this->transfer.fence);
if(result != VK_SUCCESS) {
#if defined(_DEBUG)
std::cout << "UpdateVertexBuffer => vkQueueSubmit : Failed" << std::endl;
#endif
return false;
}
#if defined(_DEBUG)
std::cout << "UpdateVertexBuffer : Success" << std::endl;
#endif
return true;
}
It works perfectly without any validation layer warning. But when i call i twice, both buffers contains the same data, from the second call. For example :
UpdateVertexBuffer(cube_data, cube_buffer);
UpdateVertexBuffer(prism_data, prism_buffer);
This will result in having a prism inside both cube_buffer and prism_buffer. To fix this, i can simply wait for a few milliseconds between the two calls :
UpdateVertexBuffer(cube_data, cube_buffer);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
UpdateVertexBuffer(prism_data, prism_buffer);
or preferably, i can replace the fence by a call to
vkQueueWaitIdle(this->queue_stack[this->transfer_stack_index].handle);
In my opinion this will result in performance loss and the fence is supposed to be the optimal way to wait for transfer operation to complete properly, so why is my first buffer filled by second when i'm using a fence. And is there a way to do this properly without using vkQueueWaitIdle.
Thanks for your help.
You wait for the fence for the previous upload after you have already written the data to the staging buffer. That's too late; the fence is there to prevent you from writing data to memory that's being read.
But really, your problem is that your design is wrong. Your design is such that sequential updates all use the same memory. They shouldn't. Instead, sequential updates should use different regions of the same memory, so that they cannot overlap. That way, you can perform the transfers and not have to wait on fences at all (or at least, not until next frame).
Basically, you should treat your staging buffer like a ring buffer. Every operation that wants to do some staged transfer work should "allocate" X bytes of memory from the staging ring buffer. The staging buffer system allocates memory sequentially, wrapping around if there is insufficient space. But it also remembers where the last memory region is that it synchronized with. If you try to stage too much work, then it has to synchronize.
Also, one of the purposes behind mapping memory is that you can write directly to that memory, rather than writing to some other CPU memory and copying it in. So instead of passing in a VULKAN_BUFFER (whatever that is), the process that generated that data should have fetched a pointer to a region of the active staging buffer and written its data into that.
Oh, and one more thing: never, ever create a command buffer and immediately submit it. Just don't do it. There's a reason why vkQueueSubmit can take multiple command buffers, and multiple batches of command buffers. For any one queue, you should never be submitting more than once (or maybe twice) per frame.

Vulkan's transfer queue family capabilities and video card support: Are the condition checks accurate?

I am following this Vulkan Youtube video tutorial by Joshua Shucker. I'm currently on his 14th video where he is working on creating a secondary queue family for the vertex buffer. This focuses on the staging process for vertex buffers. My code matches that of his in his video except that of a cout statement in which I added for testing. Here is the function and structure for the Queue Families:
struct QueueFamilyIndices {
int graphicsFamily = -1;
int transferFamily = -1;
bool isComplete() {
return (graphicsFamily >= 0 && transferFamily >= 0);
}
};
QueueFamilyIndices FindQueueFamilies( const VkPhysicalDevice* device, const VkSurfaceKHR* surface ) {
QueueFamilyIndices indices;
uint32_t queueFamilyCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties( *device, &queueFamilyCount, nullptr );
std::vector<VkQueueFamilyProperties> queueFamilies( queueFamilyCount );
vkGetPhysicalDeviceQueueFamilyProperties( *device, &queueFamilyCount, queueFamilies.data() );
int i = 0;
for( const auto &queueFamily : queueFamilies ) {
VkBool32 presentSupport = false;
vkGetPhysicalDeviceSurfaceSupportKHR( *device, i, *surface, &presentSupport );
if( queueFamily.queueCount > 0 && (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) && presentSupport ) {
indices.graphicsFamily = i;
}
if( queueFamily.queueCount > 0 && (queueFamily.queueFlags & VK_QUEUE_TRANSFER_BIT) &&
!(queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) && presentSupport ) {
indices.transferFamily = i;
}
if( indices.isComplete() ) {
break;
}
i++;
}
if( indices.graphicsFamily >= 0 && indices.transferFamily == -1 ) {
std::cout << "Graphics family found, transfer family missing: using graphics family" << std::endl;
indices.transferFamily = indices.graphicsFamily;
}
return indices;
}
Within this function vkGetPhysicalDeviceSurfaceSupportKHR(...) is being called twice since there are 2 queue families that have been found after vkGetPhysicalDeviceQueueFamilyProperties(...) has been called to populate the vector of VkQueueFamilyProperties structures.
Here is the specs for my NVidia GeForce gtx 750 Ti card based on Vulkan's specifications for its queue families: Vulkan:Report and in case the link changes over time here is the information directly:
Queue family 0
queueCount 16
flags GRAPHICS_BIT
COMPUTE_BIT
TRANSFER_BIT
SPARSE_BINDING_BIT
timestampValidBits 64
minImageTransferGranularity.width 1
minImageTransferGranularity.height 1
minImageTransferGranularity.depth 1
supportsPresent 1
Queue family 1
queueCount 1
flags TRANSFER_BIT
timestampValidBits 64
minImageTransferGranularity.width 1
minImageTransferGranularity.height 1
minImageTransferGranularity.depth 1
supportsPresent 0
Now according to these specs which coincide with the values in my vector of structs while I'm stepping through the debugger my structures are populated with
the values of:
queueFamilies[0].queueFlags = 15;
queueFamilies[0].queueCount = 16;
queueFamilies[0].timestampValidBits = 64;
queueFamilies[0].minImageTransferGranularity = { width = 1, height = 1, depth = 1 };
queueFamilies[1].queueFlags = 4;
queueFamilies[1].queueCount = 1;
queueFamilies[1].timestampValidBits = 64;
queueFamilies[1].minImageTransferGranularity = { width = 1, height = 1, depth = 1 };
So this appears to me that my card does support a separate queueFamily specifically the transferFamily.
Based on my assumption of this support and stepping through this function he has two if statements to check for valid conditions within the for loop for each of the indexed queueFamily objects. The if statements are returning exactly as they should be. My code compiles and builds without any errors or warnings, and it still does render then triangle when I'm not running it through the debugger and it does exit with a code of (0). So the code appears to be fine. However I'm not getting the results that I would at least be expecting.
I'm not sure if there is a bug in his code that he happened to missed, if I'm misinterpreting my video card's support of this Vulkan functionality, or if this could be either a Vulkan API bug or NVidia Driver bug.
However as I was stepping through this function to find out why the indices.transferFamily variable was not being set to i; I noticed that on the second iteration of the loop it has nothing to do with the presence of the transferFamilyQueue, its parameter values, or flags. What is causing this if statement to return false is the presentSupport variable as it is being set to 0 on the second call which does match the data sheet above. So the output is as expected.
My question then becomes: Is there an actual implementation problem with the condition checking in the second if statement?
This is where I'm stuck as I am a bit confused because we are checking to see if there is a transferQueueFamily available and if so use that to create and use a stagingBuffer to copy the contents from the CPU to the GPU for the vertex buffer(s). From what I can see it appears my card does have this transferFamily but does not have supportPresent for this family. However, when thinking about it; if you are using a transferFamily - transferQueue you wouldn't want to present it directly as you'll just be copying the data from a temporary vertexBuffer on the CPU to the vertexBuffer that will be used on the GPU. So I'm wondering if the final check in this if statement is correct or not. If my assumptions about how Vulkan is working here is incorrect please don't hesitate to correct me as this is my first attempt at getting a Vulkan rendering application working.
There's no Vulkan API or NVidia Driver bug. It's right there in your report sheet:
supportsPresent 0
E.g. AMD does seem to support presents on VK_QUEUE_TRANSFER_BIT queue families, but it is purely optional (31.4. Querying for WSI Support):
Not all physical devices will include WSI support. Within a physical device, not all queue families will support presentation.
There's no good reason to be checking presentSupport when searching for a transfer specific queue. This is likely a copy and paste error somewhere. Typically you don't care if anything other than the graphics queue has support for presentation.
You do want to use a transfer queue that does not have the graphics bit set, as such a queue is likely to correspond to dedicated transfer hardware that will not impact the performance of work being done on the graphics queue.
After reading a few good answers here and doing some more testing on my end I think I have found an appropriate solution to the application code design. This function is called about 4 or 5 times throughout the application by other functions. It is called when Vulkan is being initialized, it is called again when rating the devices' suitability for choosing the best possible device that is available, it is also being called when creating the logical device, and so forth.
All of these initial calls typically only need the queueFamily's count and or index values to make sure that a suitable graphics device with a queueFamily is available for graphics processing and rendering.
However when this function is being called to create an arbitrary buffer that will be used as a staging buffer for an existing dedicated transfer queue this time we actually need the family queue and all of its properties. So to fix this problem; when checking for the graphicsQueue I left this last condition check to see if presentSupport is available, as for when the for loop iterates to the next index to check for the dedicated transferQueue, I omitted this condition check for the presentSupport all together.
QueueFamilyIndices FindQueueFamilies( const VkPhysicalDevice* device, const VkSurfaceKHR* surface ) {
QueueFamilyIndices indices;
uint32_t queueFamilyCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties( *device, &queueFamilyCount, nullptr );
std::vector<VkQueueFamilyProperties> queueFamilies( queueFamilyCount );
vkGetPhysicalDeviceQueueFamilyProperties( *device, &queueFamilyCount, queueFamilies.data() );
int i = 0;
for( const auto &queueFamily : queueFamilies ) {
VkBool32 presentSupport = false;
vkGetPhysicalDeviceSurfaceSupportKHR( *device, i, *surface, &presentSupport );
if( queueFamily.queueCount > 0 && (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) && presentSupport ) {
indices.graphicsFamily = i;
}
if( queueFamily.queueCount > 0 && (queueFamily.queueFlags & VK_QUEUE_TRANSFER_BIT) &&
!(queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) /*&& presentSupport*/ ) {
indices.transferFamily = i;
}
if( indices.isComplete() ) {
break;
}
i++;
}
if( indices.graphicsFamily >= 0 && indices.transferFamily == -1 ) {
std::cout << "Graphics family found, transfer family missing: using graphics family" << std::endl;
indices.transferFamily = indices.graphicsFamily;
}
return indices;
}
Now not only is the indices.transferFamily being set to i on the 2nd iteration; the check for indices.isComplete() is also returning true and the last if statement for the roll back is now returning false. Everything seems to being rendering properly without issue. It appears that the staging buffers are being copied over to the GPU now instead of using the CPU for the vertex buffer objects.

C++: both classes do not run concurrently

its my first time here. My code is suppose to make two ultrasonic sensors function at the same time using an mbed. However, i cant seem to make both classes void us_right() and void us_left() in the code run concurrently. Help please :(
#include "mbed.h"
DigitalOut triggerRight(p9);
DigitalIn echoRight(p10);
DigitalOut triggerLeft(p13);
DigitalIn echoLeft(p14);
//DigitalOut myled(LED1); //monitor trigger
//DigitalOut myled2(LED2); //monitor echo
PwmOut steering(p21);
PwmOut velocity(p22);
int distanceRight = 0, distanceLeft = 0;
int correctionRight = 0, correctionLeft = 0;
Timer sonarRight, sonarLeft;
float vo=0;
// Velocity expects -1 (reverse) to +1 (forward)
void Velocity(float v) {
v=v+1;
if (v>=0 && v<=2) {
if (vo>=1 && v<1) { //
velocity.pulsewidth(0.0014); // this is required to
wait(0.1); //
velocity.pulsewidth(0.0015); // move into reverse
wait(0.1); //
} //
velocity.pulsewidth(v/2000+0.001);
vo=v;
}
}
// Steering expects -1 (left) to +1 (right)
void Steering(float s) {
s=s+1;
if (s>=0 && s<=2) {
steering.pulsewidth(s/2000+0.001);
}
}
void us_right() {
sonarRight.reset();
sonarRight.start();
while (echoRight==2) {};
sonarRight.stop();
correctionRight = sonarLeft.read_us();
triggerRight = 1;
sonarRight.reset();
wait_us(10.0);
triggerRight = 0;
while (echoRight==0) {};
// myled2=echoRight;
sonarRight.start();
while (echoRight==1) {};
sonarRight.stop();
distanceRight = ((sonarRight.read_us()-correctionRight)/58.0);
printf("Distance from Right is: %d cm \n\r",distanceRight);
}
void us_left() {
sonarLeft.reset();
sonarLeft.start();
while (echoLeft==2) {};
sonarLeft.stop();
correctionLeft = sonarLeft.read_us();
triggerLeft = 1;
sonarLeft.reset();
wait_us(10.0);
triggerLeft = 0;
while (echoLeft==0) {};
// myled2=echoLeft;
sonarLeft.start();
while (echoLeft==1) {};
sonarLeft.stop();
distanceLeft = (sonarLeft.read_us()-correctionLeft)/58.0;
printf("Distance from Left is: %d cm \n\r",distanceLeft);
}
int main() {
while(true) {
us_right();
us_left();
}
if (distanceLeft < 10 || distanceRight < 10) {
if (distanceLeft < distanceRight) {
for(int i=0; i>-100; i--) { // Go left
Steering(i/100.0);
wait(0.1);
}
}
if (distanceLeft > distanceRight) {
for(int i=0; i>100; i++) { // Go Right
Steering(i/100.0);
wait(0.1);
}
}
}
wait(0.2);
}
You need to use some mechanism to create new threads or processes. Your implementation is sequential, there is nothing you do that tells the code to run concurrently.
You should take a look at some threads libraries (pthreads for example, or if you have access to c++11, there are thread functionality there) or how to create new processes as well as some kind of message passing interface between these processes.
Create two threads, one for each ultrasonic sensor:
void read_left_sensor() {
while (1) {
// do the reading
wait(0.5f);
}
}
int main() {
Thread left_thread;
left_thread.start(&read_left_sensor);
Thread right_thread;
right_thread.start(&read_right_sensor);
while (1) {
// put your control code for the vehicle here
wait(0.1f);
}
}
You can use global variables to write to when reading the sensor, and read them in your main loop. The memory is shared.
Your first problem is that you have placed code outside of your infinite while(true) loop. This later code will never run. But maybe you know this.
int main() {
while(true) {
us_right();
us_left();
} // <- Loops back to the start of while()
// You Never pass this point!!!
if (distanceLeft < 10 || distanceRight < 10) {
// Do stuff etc.
}
wait(0.2);
}
But, I think you are expecting us_right() and us_left() to happen at exactly the same time. You cannot do that in a sequential environment.
Jan Jongboom is correct in suggesting you could use Threads. This allows the 'OS' to designate time for each piece of code to run. But it is still not truly parallel. Each function (classes are a different thing) will get a chance to run. One will run, and when it is finished (or during a wait) another function will get its chance to run.
As you are using an mbed, I'd suggest that your project is an MBED OS 5 project
(you select this when you start a new project). Otherwise you'll need to use an RTOS library. There is a blinky example using threads that should sum it up well. Here is more info.
Threading can be dangerous for someone without experience. So stick to a simple implementation to start with. Make sure you understand what/why/how you are doing it.
Aside: From a hardware perspective, running ultrasonic sensors in parallel is actually not ideal. They both broadcast the same frequency, and can hear each other. Triggering them at the same time, they interfere with each other.
Imagine two people shouting words in a closed room. If they take turns, it will be obvious what they are saying. If they both shout at the same time, it will be very hard!
So actually, not being able to run in parallel is probably a good thing.

Memory validate in difficult task within thread

I'm currently creating a sound system for my project. Every call PlayAsync creating instance of sound in std::thread callback. The sound data proceed in cycle in this callback. When thread proceeds it store sound instance in static vector. When thread ends (sound complete) - it delete sound instance and decrement instance count. When application ends - it must stop all sounds immediate, sending interrupt to every cycle of sound.
The problem is in array keeping these sounds. I am not sure, but I think vector isn't right choice for this purpose.. Here is a code.
void gSound::PlayAsync()
{
std::thread t(gSound::Play,mp_Audio,std::ref(*this));
t.detach();
}
HRESULT gSound::Play(IXAudio2* s_XAudio,gSound& sound)
{
gSound* pSound = new gSound(sound);
pSound->m_Disposed = false;
HRESULT hr;
// Create the source voice
IXAudio2SourceVoice* pSourceVoice;
if( FAILED( hr = s_XAudio->CreateSourceVoice( &pSourceVoice, pSound->pwfx ) ) )
{
gDebug::ShowMessage(L"Error creating source voice");
return hr;
}
// Submit the wave sample data using an XAUDIO2_BUFFER structure
XAUDIO2_BUFFER buffer = {0};
buffer.pAudioData = pSound->pbWaveData;
buffer.Flags = XAUDIO2_END_OF_STREAM; // tell the source voice not to expect any data after this buffer
buffer.AudioBytes = pSound->cbWaveSize;
if( FAILED( hr = pSourceVoice->SubmitSourceBuffer( &buffer ) ) )
{
gDebug::ShowMessage(L"Error submitting source buffer");
pSourceVoice->DestroyVoice();
return hr;
}
hr = pSourceVoice->Start( 0 );
// Let the sound play
BOOL isRunning = TRUE;
m_soundInstanceCount++;
mp_SoundInstances.push_back(pSound); #MARK2
while( SUCCEEDED( hr ) && isRunning && pSourceVoice != nullptr && !pSound->m_Interrupted)
{
XAUDIO2_VOICE_STATE state;
pSourceVoice->GetState( &state );
isRunning = ( state.BuffersQueued > 0 ) != 0;
Sleep(10);
}
pSourceVoice->DestroyVoice();
delete pSound;pSound = nullptr; //its correct ??
m_soundInstanceCount--;
return 0;
}
void gSound::InterrupAllSoundInstances()
{
for(auto Iter = mp_SoundInstances.begin(); Iter != mp_SoundInstances.end(); Iter++)
{
if(*Iter != nullptr)//#MARK1
{
(*Iter)->m_Interrupted = true;
}
}
}
And this I call in application class before disposing sound objects, after main application loop immediate.
gSound::InterrupAllSoundInstances();
while (gSound::m_soundInstanceCount>0)//waiting for deleting all sound instances in threads
{
}
Questions:
So #MARK1 - How to check memory validation in vector? I don't have experience about it. And get errors when try check invalid memory (it's not equals null)
And #MARK2 - How to use vector correctly? Or maybe vector is bad choice? Every time I create sound instance it increases size. It's not good.
A typical issue:
delete pSound;
pSound = nullptr; // issue
This does not do what you think.
It will effectively set pSound to null, but there are other copies of the same pointer too (at least one in the vector) which do not get nullified. This is why you do not find nullptr in your vector.
Instead you could register the index into the vector and nullify that: mp_SoundInstances[index] = nullptr;.
However, I am afraid that you simply do not understand memory handling well and you lack structure. For memory handling, it's hard to tell without details and your system seems complicated enough that I am afraid it would tell too long to explain. For structure, you should read a bit about the Observer pattern.