Intel OneAPI Video decoding memory leak when using C++ CLI

I am trying to use Intel OneAPI/OneVPL to decode a stream I receive from an RTSP camera in C#, but when I run the code I get an enormous memory leak: roughly 100-200 MB per run, and a run happens about once every second.
Once I have collected a GoP from the camera, where I know the first data is a keyframe, I pass it as a byte array to my C++/CLI and C++ code.
There I expect it to decode all the frames and return decoded images. It receives 30 frames and returns 16 decoded images, but it leaks memory.
I've tried the Visual Studio memory profiler, and all I can tell from it is that the problem is unmanaged memory. I've tried overriding the "new" and "delete" operators inside videoHandler.cpp to track and compare all allocations and deallocations, and as far as I can tell everything is handled correctly in there. I cannot see any class that gets instantiated without being cleaned up. I think my issue is in the CLI class videoHandlerWrapper.cpp. Am I missing something obvious?
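(For reference, the kind of global new/delete override I mean for the tracking is nothing more elaborate than the following simplified sketch; it is illustrative only, not the exact code in videoHandler.cpp.)

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <new>

static std::atomic<long long> g_liveAllocations{ 0 };
static std::atomic<long long> g_bytesRequested{ 0 };

void* operator new(std::size_t size)
{
    ++g_liveAllocations;
    g_bytesRequested += size;
    if (void* p = std::malloc(size))
        return p;
    throw std::bad_alloc();
}

void operator delete(void* p) noexcept
{
    if (p)
        --g_liveAllocations;
    std::free(p);
}

// Array forms (operator new[] / operator delete[]) omitted for brevity.
void ReportAllocCounters()
{
    std::printf("live allocations: %lld, total bytes requested: %lld\n",
                g_liveAllocations.load(), g_bytesRequested.load());
}

With counters like these, the native allocations in videoHandler.cpp come out balanced, which is why I suspect the wrapper instead.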
videoHandlerWrapper.cpp
array<imgFrameWrapper^>^ videoHandlerWrapper::decode(array<System::Byte>^ byteArray)
{
    array<imgFrameWrapper^>^ returnFrames = gcnew array<imgFrameWrapper^>(30);
    {
        std::vector<imgFrame> frames(30); //Output from the decoding process. imgFrame implements a destructor that frees the data when it goes out of scope
        std::vector<unsigned char> bytes(byteArray->Length); //Input for the decoding process
        Marshal::Copy(byteArray, 0, IntPtr((unsigned char*)(&((bytes)[0]))), byteArray->Length); //Copy from managed (C#) to unmanaged (C++)
        int status = _pVideoHandler->decode(bytes, frames); //Decode
        for (size_t i = 0; i < frames.size(); i++)
        {
            if (frames[i].size > 0)
                returnFrames[i] = gcnew imgFrameWrapper(frames[i].size, frames[i].bytes);
        }
    }
    //PrintMemoryUsage();
    return returnFrames;
}
videoHandler.cpp
#define BITSTREAM_BUFFER_SIZE 2000000 //TODO Maybe higher or lower bitstream buffer. Thorough testing has been done at 2000000
int videoHandler::decode(std::vector<unsigned char> bytes, std::vector<imgFrame> &frameData)
{
    int result = -1;
    bool isStillGoing = true;
    mfxBitstream bitstream = { 0 };
    mfxSession session = NULL;
    mfxStatus sts = MFX_ERR_NONE;
    mfxSurfaceArray* outSurfaces = nullptr;
    mfxU32 framenum = 0;
    mfxU32 numVPPCh = 0;
    mfxVideoChannelParam* mfxVPPChParams = nullptr;
    void* accelHandle = NULL;
    mfxVideoParam mfxDecParams = {};
    mfxVersion version = { 0, 1 };

    //variables used only in 2.x version
    mfxConfig cfg = NULL;
    mfxLoader loader = NULL;
    mfxVariant inCodec = {};
    std::vector<mfxU8> input_buffer;

    // Initialize VPL session for any implementation of AVC/H.264 decode
    loader = MFXLoad();
    VERIFY(NULL != loader, "MFXLoad failed -- is implementation in path?");

    cfg = MFXCreateConfig(loader);
    VERIFY(NULL != cfg, "MFXCreateConfig failed");

    inCodec.Type = MFX_VARIANT_TYPE_U32;
    inCodec.Data.U32 = MFX_CODEC_AVC;
    sts = MFXSetConfigFilterProperty(
        cfg,
        (mfxU8*)"mfxImplDescription.mfxDecoderDescription.decoder.CodecID",
        inCodec);
    VERIFY(MFX_ERR_NONE == sts, "MFXSetConfigFilterProperty failed for decoder CodecID");

    sts = MFXCreateSession(loader, 0, &session);
    VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");

    // Print info about implementation loaded
    version = ShowImplInfo(session);
    //VERIFY(version.Major > 1, "Sample requires 2.x API implementation, exiting");
    if (version.Major == 1) {
        mfxVariant ImplValueSW;
        ImplValueSW.Type = MFX_VARIANT_TYPE_U32;
        ImplValueSW.Data.U32 = MFX_IMPL_TYPE_SOFTWARE;
        MFXSetConfigFilterProperty(cfg, (mfxU8*)"mfxImplDescription.Impl", ImplValueSW);
        sts = MFXCreateSession(loader, 0, &session);
        VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");
    }

    // Convenience function to initialize available accelerator(s)
    accelHandle = InitAcceleratorHandle(session);

    bitstream.MaxLength = BITSTREAM_BUFFER_SIZE;
    bitstream.Data = (mfxU8*)calloc(bytes.size(), sizeof(mfxU8));
    VERIFY(bitstream.Data, "Not able to allocate input buffer");
    bitstream.CodecId = MFX_CODEC_AVC;
    std::copy(bytes.begin(), bytes.end(), bitstream.Data);
    bitstream.DataLength = static_cast<mfxU32>(bytes.size());

    memset(&mfxDecParams, 0, sizeof(mfxDecParams));
    mfxDecParams.mfx.CodecId = MFX_CODEC_AVC;
    mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    sts = MFXVideoDECODE_DecodeHeader(session, &bitstream, &mfxDecParams);
    VERIFY(MFX_ERR_NONE == sts, "Error decoding header\n");

    numVPPCh = 1;
    mfxVPPChParams = new mfxVideoChannelParam[numVPPCh];
    for (mfxU32 i = 0; i < numVPPCh; i++) {
        mfxVPPChParams[i] = {};
    }

    //mfxVPPChParams[0].VPP.FourCC = mfxDecParams.mfx.FrameInfo.FourCC;
    mfxVPPChParams[0].VPP.FourCC = MFX_FOURCC_BGRA;
    mfxVPPChParams[0].VPP.ChromaFormat = MFX_CHROMAFORMAT_YUV420;
    mfxVPPChParams[0].VPP.PicStruct = MFX_PICSTRUCT_PROGRESSIVE;
    mfxVPPChParams[0].VPP.FrameRateExtN = 30;
    mfxVPPChParams[0].VPP.FrameRateExtD = 1;
    mfxVPPChParams[0].VPP.CropW = 1920;
    mfxVPPChParams[0].VPP.CropH = 1080;
    //Set value directly if input and output is the same.
    mfxVPPChParams[0].VPP.Width = 1920;
    mfxVPPChParams[0].VPP.Height = 1080;
    //// USED TO RESIZE. IF INPUT IS THE SAME AS OUTPUT THIS WILL MAKE IT SHIFT A BIT. 1920x1080 becomes 1920x1088.
    //mfxVPPChParams[0].VPP.Width = ALIGN16(mfxVPPChParams[0].VPP.CropW);
    //mfxVPPChParams[0].VPP.Height = ALIGN16(mfxVPPChParams[0].VPP.CropH);
    mfxVPPChParams[0].VPP.ChannelId = 1;
    mfxVPPChParams[0].Protected = 0;
    mfxVPPChParams[0].IOPattern = MFX_IOPATTERN_IN_SYSTEM_MEMORY | MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    mfxVPPChParams[0].ExtParam = NULL;
    mfxVPPChParams[0].NumExtParam = 0;

    sts = MFXVideoDECODE_VPP_Init(session, &mfxDecParams, &mfxVPPChParams, numVPPCh); //This causes a MINOR memory leak!
    outSurfaces = new mfxSurfaceArray;

    while (isStillGoing == true) {
        sts = MFXVideoDECODE_VPP_DecodeFrameAsync(session,
                                                  &bitstream,
                                                  NULL,
                                                  0,
                                                  &outSurfaces); //Big memory leak. 100MB pr run in the while loop.
        switch (sts) {
        case MFX_ERR_NONE:
            // decode output
            if (framenum >= 30)
            {
                isStillGoing = false;
                break;
            }
            sts = WriteRawFrameToByte(outSurfaces->Surfaces[1], &frameData[framenum]);
            VERIFY(MFX_ERR_NONE == sts, "Could not write 1st vpp output");
            framenum++;
            break;
        case MFX_ERR_MORE_DATA:
            // The function requires more bitstream at input before decoding can proceed
            isStillGoing = false;
            break;
        case MFX_ERR_MORE_SURFACE:
            // The function requires more frame surface at output before decoding can proceed.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        case MFX_ERR_DEVICE_LOST:
            // For non-CPU implementations,
            // Cleanup if device is lost
            break;
        case MFX_WRN_DEVICE_BUSY:
            // For non-CPU implementations,
            // Wait a few milliseconds then try again
            break;
        case MFX_WRN_VIDEO_PARAM_CHANGED:
            // The decoder detected a new sequence header in the bitstream.
            // Video parameters may have changed.
            // In external memory allocation case, might need to reallocate the output surface
            break;
        case MFX_ERR_INCOMPATIBLE_VIDEO_PARAM:
            // The function detected that video parameters provided by the application
            // are incompatible with initialization parameters.
            // The application should close the component and then reinitialize it
            break;
        case MFX_ERR_REALLOC_SURFACE:
            // Bigger surface_work required. May be returned only if
            // mfxInfoMFX::EnableReallocRequest was set to ON during initialization.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        default:
            printf("unknown status %d\n", sts);
            isStillGoing = false;
            break;
        }
    }
    sts = MFXVideoDECODE_VPP_Close(session); // Helps massively! Halves the memory leak speed. Closes internal structures and tables.
    VERIFY(MFX_ERR_NONE == sts, "Error closing VPP session\n");
    result = 0;

end:
    printf("Decode and VPP processed %d frames\n", framenum);

    // Clean up resources - It is recommended to close components first, before
    // releasing allocated surfaces, since some surfaces may still be locked by
    // internal resources.
    if (mfxVPPChParams)
        delete[] mfxVPPChParams;
    if (outSurfaces)
        delete outSurfaces;
    if (bitstream.Data)
        free(bitstream.Data);
    if (accelHandle)
        FreeAcceleratorHandle(accelHandle);
    if (loader)
        MFXUnload(loader);
    return result;
}
imgFrameWrapper.h
public ref class imgFrameWrapper
{
private:
    size_t size;
    array<System::Byte>^ bytes;
public:
    imgFrameWrapper(size_t u_size, unsigned char* u_bytes);
    ~imgFrameWrapper();
    !imgFrameWrapper();
    size_t get_size();
    array<System::Byte>^ get_bytes();
};
imgFrameWrapper.cpp
imgFrameWrapper::imgFrameWrapper(size_t u_size, unsigned char* u_bytes)
{
    size = u_size;
    bytes = gcnew array<System::Byte>(size);
    Marshal::Copy((IntPtr)u_bytes, bytes, 0, size);
}

imgFrameWrapper::~imgFrameWrapper()
{
}

imgFrameWrapper::!imgFrameWrapper()
{
}

size_t imgFrameWrapper::get_size()
{
    return size;
}

array<System::Byte>^ imgFrameWrapper::get_bytes()
{
    return bytes;
}
imgFrame.h
struct imgFrame
{
    int size;
    unsigned char* bytes;

    ~imgFrame()
    {
        if (bytes)
            delete[] bytes;
    }
};

The MFXVideoDECODE_VPP_DecodeFrameAsync() function creates internal memory surfaces for the processing.
You should release those surfaces once you are done with them.
Please check this link, which mentions it:
https://spec.oneapi.com/onevpl/latest/API_ref/VPL_structs_decode_vpp.html#_CPPv415mfxSurfaceArray
mfxStatus (*Release)(struct mfxSurfaceArray *surface_array)
    Decrements the internal reference counter of the surface. (*Release) should be
    called after using the (*AddRef) function to add a surface or when allocation
    logic requires it.
And please check this sample:
https://github.com/oneapi-src/oneVPL/blob/master/examples/hello-decvpp/src/hello-decvpp.cpp
In particular, the WriteRawFrame_InternalMem() function in https://github.com/oneapi-src/oneVPL/blob/17968d8d2299352f5a9e09388d24e81064c81c87/examples/util/util/util.h
shows how to release surfaces.
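As a minimal sketch of what that could look like in the question's MFX_ERR_NONE branch (the member names Surfaces, NumSurfaces, FrameInterface and Release come from the oneVPL 2.x headers; I am assuming WriteRawFrameToByte() does not already release the surface it consumes, which WriteRawFrame_InternalMem() in the sample does, so check your helper):

case MFX_ERR_NONE:
    // decode output
    if (framenum >= 30)
    {
        isStillGoing = false;
        break;
    }
    sts = WriteRawFrameToByte(outSurfaces->Surfaces[1], &frameData[framenum]);
    VERIFY(MFX_ERR_NONE == sts, "Could not write 1st vpp output");

    // Hand every surface from this iteration back to the runtime; otherwise each
    // call keeps its decode and VPP output surfaces alive and memory keeps growing.
    for (mfxU32 i = 0; i < outSurfaces->NumSurfaces; i++) {
        mfxFrameSurface1* surf = outSurfaces->Surfaces[i];
        surf->FrameInterface->Release(surf);
    }
    // The surface array itself is reference counted too (see the Release member quoted above).
    outSurfaces->Release(outSurfaces);
    outSurfaces = nullptr;

    framenum++;
    break;

Note also that in the hello-decvpp sample, outSurfaces starts out as a plain null mfxSurfaceArray pointer that MFXVideoDECODE_VPP_DecodeFrameAsync() fills in; the application neither allocates it with new nor frees it with delete.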

Related

Why I receive no WT_PACKETEXT window messages after initializing Wintab extensions?

I'm currently trying to process the absolute value of a drawing tablet's touch ring through the Wintab API. However, despite following the instructions as described in the documentation, it seems like the WTOpen call doesn't apply any extension settings. Using the touch ring after initializing Wintab still triggers the default events, while the default events for pen inputs are suppressed and all pen inputs are routed to my application instead.
Here are the relevant segments of code:
...
#include "wintab.h"
#define PACKETDATA (PK_X | PK_Y | PK_Z | PK_NORMAL_PRESSURE | PK_ORIENTATION | PK_TIME | PK_BUTTONS)
#define PACKETMODE 0
#define PACKETTOUCHSTRIP PKEXT_ABSOLUTE
#define PACKETTOUCHRING PKEXT_ABSOLUTE
#include "pktdef.h"
...
internal b32
InitWinTab(HWND Window, window_mapping *Map)
{
    if(!LoadWintabFunctions())
        return false;

    LOGCONTEXT Tablet;
    AXIS TabletX, TabletY, TabletZ, Pressure;
    if(!gpWTInfoA(WTI_DEFCONTEXT, 0, &Tablet))
        return false;

    gpWTInfoA(WTI_DEVICES, DVC_X, &TabletX);
    gpWTInfoA(WTI_DEVICES, DVC_Y, &TabletY);
    gpWTInfoA(WTI_DEVICES, DVC_Z, &TabletZ);
    gpWTInfoA(WTI_DEVICES, DVC_NPRESSURE, &Pressure);

    UINT TouchStripOffset = 0xFFFF;
    UINT TouchRingOffset = 0xFFFF;
    for(UINT i = 0, ScanTag = 0; gpWTInfoA(WTI_EXTENSIONS + i, EXT_TAG, &ScanTag); i++)
    {
        if (ScanTag == WTX_TOUCHSTRIP)
            TouchStripOffset = i;
        if (ScanTag == WTX_TOUCHRING)
            TouchRingOffset = i;
    }

    Tablet.lcOptions |= CXO_MESSAGES;
    Tablet.lcPktData = PACKETDATA;
    Tablet.lcPktMode = PACKETMODE;
    Tablet.lcMoveMask = PACKETDATA;
    Tablet.lcBtnUpMask = Tablet.lcBtnDnMask;
    Tablet.lcInOrgX = 0;
    Tablet.lcInOrgY = 0;
    Tablet.lcInExtX = TabletX.axMax;
    Tablet.lcInExtY = TabletY.axMax;

    if(TouchStripOffset != 0xFFFF)
    {
        WTPKT DataMask;
        gpWTInfoA(WTI_EXTENSIONS + TouchStripOffset, EXT_MASK, &DataMask);
        Tablet.lcPktData |= DataMask;
    }
    if(TouchRingOffset != 0xFFFF)
    {
        WTPKT DataMask;
        gpWTInfoA(WTI_EXTENSIONS + TouchRingOffset, EXT_MASK, &DataMask);
        Tablet.lcPktData |= DataMask;
    }

    Map->AxisMax.x = (r32)TabletX.axMax;
    Map->AxisMax.y = (r32)TabletY.axMax;
    Map->AxisMax.z = (r32)TabletZ.axMax;
    Map->PressureMax = (r32)Pressure.axMax;

    if(!gpWTOpenA(Window, &Tablet, TRUE))
        return false;

    return(TabletX.axMax && TabletY.axMax && TabletZ.axMax && Pressure.axMax);
}
...
case WT_PACKET:
{
    PACKET Packet;
    if(gpWTPacket((HCTX)LParam, (UINT)WParam, &Packet))
    {
        ...
    }
} break;
case WT_PACKETEXT:
{
    PACKETEXT Packet;
    if(gpWTPacket((HCTX)LParam, (UINT)WParam, &Packet))
    {
        ...
    }
} break;
...
The bitmasks for the packet data in the initialization have sensible bits set for both extensions and don't overlap with the existing bitmask. No stage of the initialization fails. WT_PACKET gets called only with valid packet data, while WT_PACKETEXT never gets called. Furthermore, calling WTPacketsGet with a PACKETEXT pointer on the HCTX returned by WTOpen fills the packet with garbage from the regular packet queue. This leaves me with the conclusion that somehow WTOpen wasn't notified that the extensions should be loaded, and I'm unable to find what else I should define in the LOGCONTEXT data structure to change that.
Is there a mistake in my initialization? Or is there a way to get a better readout to why the extensions didn't load?
It turned out that a driver setting prevented the extension packets from being sent, in favor of using the touch ring for a different function. Changing this setting resolved the issue; the code itself didn't contain any errors.

How to solve mesh corruption with staging buffer on Vulkan Api

I found a bug in my code that causes mesh data corruption in certain situations when using the staging buffer. I have:
temporary mesh data
a staging buffer of a fixed size that is used by the command buffer and by memcpy at the same time, but never on the same segment at once
a buffer allocator that hands out a region of a suitable vertex/index buffer, into which the mesh data is transferred from staging with vkCmdCopyBuffer; the buffer contains many segments, handed out to different meshes
The issue is that when I use the staging buffer simultaneously from the command buffer and from memcpy, the mesh data is written incorrectly (it becomes overwritten/corrupted), and in the worst case this even causes VK_ERROR_DEVICE_LOST.
https://imgur.com/8p53SUW "correct mesh"
https://imgur.com/plJ8V0v "broken mesh"
[[nodiscard]] static Result writeMeshBuffer(TransferData &data, GpuMesh &buffer)
{
    Result result; using namespace vkw;
    auto &mesh = buffer.source;
    size_t vSize = mesh.vertices_count * mesh.vertex_size;
    size_t iSize = mesh.indices_count * mesh.index_size;
    size_t mesh_size = vSize + iSize;
    auto &staging_offset = data.stagingData.buffer_offset_unused;

    // write data to staging buffer
    {
        // guaranteed that mesh_size will be less than or equal to the staging buffer size
        //FIXME false condition generates broken meshes somehow
        bool is_wait_before = mesh_size > TransferStagingData::BUFFER_SIZE - staging_offset;
        //will work correctly:
        //bool is_wait_before = true;
        if (is_wait_before) // if we need more memory on the staging buffer than is still unused
        {
            result = data.wait_transfer();
            if (result != VK_SUCCESS)
                return result;
            staging_offset = 0;
        }
        uint8_t *pMemory = static_cast<uint8_t*>(data.stagingData.pMemory) + staging_offset;
        memcpy(pMemory, mesh.vertices.pX, vSize);
        memcpy(pMemory + vSize, mesh.indices.pXX, iSize);
        if (not is_wait_before)
        {
            result = data.wait_transfer();
            if (result != VK_SUCCESS)
                return result;
        }
    }

    // write data from staging buffer to mesh buffer
    {
        auto cmd_cpy_buff = [](CommandBuffer cmd, BufferCopy copy, Offsets offsets, DeviceSizeT size)
        {
            cmd.cmd_copy_buffer(copy, offsets, size);
        };
        //                      SRC                      DST
        BufferCopy copy    = { data.stagingData.buffer, buffer.info.buffer };
        Offsets    offsets = { staging_offset, buffer.info.region.offset };
        result = data.transfer.prepare(cmd_cpy_buff, data.transfer.cmd_buffer, copy, offsets, mesh_size);
        if (result != VK_SUCCESS)
            return result;
        data.reset_fence();
        result = data.transfer.submit({&data.transfer.cmd_buffer,1}, {}, {}, {}, data.transferFence);
        if (result != VK_SUCCESS)
            return result;
    }

    // save unused offset to data.stagingData.buffer_offset_unused
    staging_offset = staging_offset == 0 ? mesh_size : 0;
    return result;
}
If I can't use the staging buffer like this, then why not?
If I have an error, I don't know where it is.
The issue was
staging_offset = staging_offset == 0 ? mesh_size : 0;
which needs to be changed to
staging_offset = staging_offset == 0 ? TransferStagingData::BUFFER_SIZE - mesh_size : 0;
After that change everything works correctly.
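Presumably the reason is this (inferred from the code, not confirmed by the poster): with the original update, the offset is back at 0 after every second submission while the previous copy may still be in flight, so a later memcpy can overwrite the region that vkCmdCopyBuffer is still reading from. The corrected update parks consecutive writes at opposite ends of the staging buffer instead. A tiny sketch of the two rules, with hypothetical parameter names:

// Contrast of the two offset-update rules from this answer.
size_t next_staging_offset(size_t staging_offset, size_t mesh_size, size_t buffer_size)
{
    // broken: the next slot starts right behind the current mesh, and after two
    // submissions the offset returns to 0 while the second copy may still be running
    // return staging_offset == 0 ? mesh_size : 0;

    // fixed: alternate between the front and the back of the staging buffer
    return staging_offset == 0 ? buffer_size - mesh_size : 0;
}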

E_INVALIDARG when calling CreateGraphicsPipelineState

I've been getting a weird error when calling CreateGraphicsPipelineState().
The function returns E_INVALIDARG even though the description is all set up.
The description worked before I tried to add index buffers to my pipeline; I didn't even touch any of the code for the PSO or shaders, and now the PSO creation is all messed up.
The issue is that I don't get any DX-error messages from the driver when enabling the debug-layer. I only get this
"Microsoft C++ exception: _com_error at memory location
when I step through the function.
It feels like it is some pointer error or similar, but I can't figure out what the error is. Perhaps anyone of you can see an obvious mistake that I made?
Here's my code:
CGraphicsPSO* pso = new CGraphicsPSO();
D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
// Input Layout
std::vector<D3D12_INPUT_ELEMENT_DESC> elements;
if (aPSODesc.inputLayout != nullptr)
{
auto& ilData = aPSODesc.inputLayout->desc;
for (auto& element : ilData)
{
// All Data here is correct when breaking
D3D12_INPUT_ELEMENT_DESC elementDesc;
elementDesc.SemanticName = element.mySemanticName;
elementDesc.SemanticIndex = element.mySemanticIndex;
elementDesc.InputSlot = element.myInputSlot;
elementDesc.AlignedByteOffset = element.myAlignedByteOffset;
elementDesc.InputSlotClass = _ConvertInputClassificationDX12(element.myInputSlotClass);
elementDesc.Format = _ConvertFormatDX12(element.myFormat);
elementDesc.InstanceDataStepRate = element.myInstanceDataStepRate;
elements.push_back(elementDesc);
}
D3D12_INPUT_LAYOUT_DESC inputLayout = {};
inputLayout.NumElements = (UINT)elements.size();
inputLayout.pInputElementDescs = elements.data();
psoDesc.InputLayout = inputLayout;
}
// TOPOLOGY
switch (aPSODesc.topology)
{
default:
case EPrimitiveTopology::TriangleList:
psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; // <--- Always this option
break;
case EPrimitiveTopology::PointList:
psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT;
break;
case EPrimitiveTopology::LineList:
psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
break;
//case EPrimitiveTopology::Patch:
// psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH;
// break;
}
// Shaders
if (aPSODesc.vs != nullptr)
{
D3D12_SHADER_BYTECODE vertexShaderBytecode = {};
vertexShaderBytecode.BytecodeLength = aPSODesc.vs->myByteCodeSize;
vertexShaderBytecode.pShaderBytecode = aPSODesc.vs->myByteCode;
psoDesc.VS = vertexShaderBytecode;
}
if (aPSODesc.ps != nullptr)
{
D3D12_SHADER_BYTECODE pixelShaderBytecode = {};
pixelShaderBytecode.BytecodeLength = aPSODesc.ps->myByteCodeSize;
pixelShaderBytecode.pShaderBytecode = aPSODesc.ps->myByteCode;
psoDesc.PS = pixelShaderBytecode;
}
psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; // format of the render target
DXGI_SAMPLE_DESC sampleDesc = {};
sampleDesc.Count = 1;
sampleDesc.Quality = 0;
psoDesc.DepthStencilState.DepthEnable = FALSE;
psoDesc.DepthStencilState.StencilEnable = FALSE;
psoDesc.SampleDesc = sampleDesc; // must be the same sample description as the swapchain and depth/stencil buffer
psoDesc.SampleMask = UINT_MAX; // sample mask has to do with multi-sampling. 0xffffffff means point sampling is done
psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); // a default rasterizer state.
psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); // a default blent state.
psoDesc.NumRenderTargets = 1; // we are only binding one render target
psoDesc.pRootSignature = myGraphicsRootSignature;
psoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
ID3D12PipelineState* pipelineState;
HRESULT hr = myDevice->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&pipelineState));
pso->myPipelineState = pipelineState;
if (FAILED(hr))
{
delete pso;
return nullptr;
}
return pso;
So I just found the error.
It seems the way I parsed the semantics for my input layout gave me an invalid pointer.
Thus the memory at that address was invalid, handing the DX12 device incorrect descriptions.
What I did was store the semantic names locally within my CreatePSO function until the PSO was created, and now it all works.
Looks to me like the pointers to storage you promised are going out of scope.
..
D3D12_INPUT_LAYOUT_DESC inputLayout = {};
..
psoDesc.InputLayout = inputLayout;
}
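A sketch of the fix described above, reusing names from the question (the std::string storage is an illustrative addition, not the asker's actual code): keep the semantic-name characters in a container that outlives CreateGraphicsPipelineState(), and point the element descriptions into it.

// Assumes <string> and <vector> are included.
// Collect the semantic names first so the character data stays alive until the PSO exists.
std::vector<std::string> semanticNames;
semanticNames.reserve(ilData.size());
for (auto& element : ilData)
    semanticNames.push_back(element.mySemanticName);

std::vector<D3D12_INPUT_ELEMENT_DESC> elements;
elements.reserve(ilData.size());
for (size_t i = 0; i < ilData.size(); ++i)
{
    D3D12_INPUT_ELEMENT_DESC elementDesc = {};
    elementDesc.SemanticName = semanticNames[i].c_str(); // valid for as long as semanticNames lives
    // ... fill in the remaining fields exactly as in the question ...
    elements.push_back(elementDesc);
}
// Both semanticNames and elements must stay in scope until CreateGraphicsPipelineState() has returned.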

nvEncRegisterResource() fails with -23

I've hit a complete brick wall in my attempt to use NVEnc to stream OpenGL frames as H264. I've been at this particular issue for close to 8 hours without any progress.
The problem is the call to nvEncRegisterResource(), which invariably fails with code -23 (enum value NV_ENC_ERR_RESOURCE_REGISTER_FAILED, documented as "failed to register the resource" - thanks NVidia).
I'm trying to follow a procedure outlined in this document from the University of Oslo (page 54, "OpenGL interop"), so I know for a fact that this is supposed to work, though unfortunately said document does not provide the code itself.
The idea is fairly straightforward:
map the texture produced by the OpenGL frame buffer object into CUDA;
copy the texture into a (previously allocated) CUDA buffer;
map that buffer as an NVEnc input resource
use that input resource as the source for the encoding
As I said, the problem is step (3). Here are the relevant code snippets (I'm omitting error handling for brevity.)
// Round up width and height
priv->encWidth = (_resolution.w + 31) & ~31, priv->encHeight = (_resolution.h + 31) & ~31;
// Allocate CUDA "pitched" memory to match the input texture (YUV, one byte per component)
cuErr = cudaMallocPitch(&priv->cudaMemPtr, &priv->cudaMemPitch, 3 * priv->encWidth, priv->encHeight);
This should allocate on-device CUDA memory (the "pitched" variety, though I've tried non-pitched too, without any change in the outcome.)
// Register the CUDA buffer as an input resource
NV_ENC_REGISTER_RESOURCE regResParams = { 0 };
regResParams.version = NV_ENC_REGISTER_RESOURCE_VER;
regResParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
regResParams.width = priv->encWidth;
regResParams.height = priv->encHeight;
regResParams.bufferFormat = NV_ENC_BUFFER_FORMAT_YUV444_PL;
regResParams.resourceToRegister = priv->cudaMemPtr;
regResParams.pitch = priv->cudaMemPitch;
encStat = nvEncApi.nvEncRegisterResource(priv->nvEncoder, &regResParams);
// ^^^ FAILS
priv->nvEncInpRes = regResParams.registeredResource;
This is the brick wall. No matter what I try, nvEncRegisterResource() fails.
I should note that I rather think (though I may be wrong) that I've done all the required initializations. Here is the code that creates and activates the CUDA context:
// Pop the current context
cuRes = cuCtxPopCurrent(&priv->cuOldCtx);
// Create a context for the device
priv->cuCtx = nullptr;
cuRes = cuCtxCreate(&priv->cuCtx, CU_CTX_SCHED_BLOCKING_SYNC, priv->cudaDevice);
// Push our context
cuRes = cuCtxPushCurrent(priv->cuCtx);
.. followed by the creation of the encoding session:
// Create an NV Encoder session
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS nvEncSessParams = { 0 };
nvEncSessParams.apiVersion = NVENCAPI_VERSION;
nvEncSessParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
nvEncSessParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
nvEncSessParams.device = priv->cuCtx; // nullptr
auto encStat = nvEncApi.nvEncOpenEncodeSessionEx(&nvEncSessParams, &priv->nvEncoder);
And finally, the code initializing the encoder:
// Configure the encoder via preset
NV_ENC_PRESET_CONFIG presetConfig = { 0 };
GUID codecGUID = NV_ENC_CODEC_H264_GUID;
GUID presetGUID = NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID;
presetConfig.version = NV_ENC_PRESET_CONFIG_VER;
presetConfig.presetCfg.version = NV_ENC_CONFIG_VER;
encStat = nvEncApi.nvEncGetEncodePresetConfig(priv->nvEncoder, codecGUID, presetGUID, &presetConfig);
NV_ENC_INITIALIZE_PARAMS initParams = { 0 };
initParams.version = NV_ENC_INITIALIZE_PARAMS_VER;
initParams.encodeGUID = codecGUID;
initParams.encodeWidth = priv->encWidth;
initParams.encodeHeight = priv->encHeight;
initParams.darWidth = 1;
initParams.darHeight = 1;
initParams.frameRateNum = 25; // TODO: make this configurable
initParams.frameRateDen = 1; // ditto
// .max_surface_count = (num_mbs >= 8160) ? 32 : 48;
// .buffer_delay ? necessary
initParams.enableEncodeAsync = 0;
initParams.enablePTD = 1;
initParams.presetGUID = presetGUID;
memcpy(&priv->nvEncConfig, &presetConfig.presetCfg, sizeof(priv->nvEncConfig));
initParams.encodeConfig = &priv->nvEncConfig;
encStat = nvEncApi.nvEncInitializeEncoder(priv->nvEncoder, &initParams);
All the above initializations report success.
I'd be extremely grateful to anyone who can get me past this hurdle.
EDIT: here is the complete code to reproduce the problem. The only observable difference to the original code is that cuCtxPopCurrent() returns an error (which can be ignored) here - probably my original program creates such a context as a side effect of using OpenGL. Otherwise, the code behaves exactly as the original does.
I've built the code with Visual Studio 2013. You must link the following library file (adapt path if not on C:): C:\Program Files (x86)\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\Win32\cuda.lib
You must also make sure that C:\Program Files (x86)\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include\ (or similar) is in the include path.
NEW EDIT: modified the code to only use the CUDA driver interface, instead of mixing with the runtime API. Still the same error code.
#ifdef _WIN32
#include <Windows.h>
#endif
#include <cassert>
#include <GL/gl.h>
#include <iostream>
#include <string>
#include <stdexcept>
#include <string>
#include <cuda.h>
//#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <nvEncodeAPI.h>
// NV Encoder API ---------------------------------------------------
#if defined(_WIN32)
#define LOAD_FUNC(l, s) GetProcAddress(l, s)
#define DL_CLOSE_FUNC(l) FreeLibrary(l)
#else
#define LOAD_FUNC(l, s) dlsym(l, s)
#define DL_CLOSE_FUNC(l) dlclose(l)
#endif
typedef NVENCSTATUS(NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
struct NVEncAPI : public NV_ENCODE_API_FUNCTION_LIST {
public:
// ~NVEncAPI() { cleanup(); }
void init() {
#if defined(_WIN32)
if (sizeof(void*) == 8) {
nvEncLib = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
}
else {
nvEncLib = LoadLibrary(TEXT("nvEncodeAPI.dll"));
}
if (nvEncLib == NULL) throw std::runtime_error("Failed to load NVidia Encoder library: " + std::to_string(GetLastError()));
#else
nvEncLib = dlopen("libnvidia-encode.so.1", RTLD_LAZY);
if (nvEncLib == nullptr)
throw std::runtime_error("Failed to load NVidia Encoder library: " + std::string(dlerror()));
#endif
auto nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE) LOAD_FUNC(nvEncLib, "NvEncodeAPICreateInstance");
version = NV_ENCODE_API_FUNCTION_LIST_VER;
NVENCSTATUS encStat = nvEncodeAPICreateInstance(static_cast<NV_ENCODE_API_FUNCTION_LIST *>(this));
}
void cleanup() {
#if defined(_WIN32)
if (nvEncLib != NULL) {
FreeLibrary(nvEncLib);
nvEncLib = NULL;
}
#else
if (nvEncLib != nullptr) {
dlclose(nvEncLib);
nvEncLib = nullptr;
}
#endif
}
private:
#if defined(_WIN32)
HMODULE nvEncLib;
#else
void* nvEncLib;
#endif
bool init_done;
};
static NVEncAPI nvEncApi;
// Encoder class ----------------------------------------------------
class Encoder {
public:
typedef unsigned int uint_t;
struct Size { uint_t w, h; };
Encoder() {
CUresult cuRes = cuInit(0);
nvEncApi.init();
}
void init(const Size & resolution, uint_t texture) {
NVENCSTATUS encStat;
CUresult cuRes;
texSize = resolution;
yuvTex = texture;
// Purely for information
int devCount = 0;
cuRes = cuDeviceGetCount(&devCount);
// Initialize NVEnc
initEncodeSession(); // start an encoding session
initEncoder();
// Register the YUV texture as a CUDA graphics resource
// CODE COMMENTED OUT AS THE INPUT TEXTURE IS NOT NEEDED YET (TO MY UNDERSTANDING) AT SETUP TIME
//cudaGraphicsGLRegisterImage(&priv->cudaInpTexRes, priv->yuvTex, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
// Allocate CUDA "pitched" memory to match the input texture (YUV, one byte per component)
encWidth = (texSize.w + 31) & ~31, encHeight = (texSize.h + 31) & ~31;
cuRes = cuMemAllocPitch(&cuDevPtr, &cuMemPitch, 4 * encWidth, encHeight, 16);
// Register the CUDA buffer as an input resource
NV_ENC_REGISTER_RESOURCE regResParams = { 0 };
regResParams.version = NV_ENC_REGISTER_RESOURCE_VER;
regResParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
regResParams.width = encWidth;
regResParams.height = encHeight;
regResParams.bufferFormat = NV_ENC_BUFFER_FORMAT_YUV444_PL;
regResParams.resourceToRegister = (void*) cuDevPtr;
regResParams.pitch = cuMemPitch;
encStat = nvEncApi.nvEncRegisterResource(nvEncoder, &regResParams);
assert(encStat == NV_ENC_SUCCESS); // THIS IS THE POINT OF FAILURE
nvEncInpRes = regResParams.registeredResource;
}
void cleanup() { /* OMITTED */ }
void encode() {
// THE FOLLOWING CODE WAS NEVER REACHED YET BECAUSE OF THE ISSUE.
// INCLUDED HERE FOR REFERENCE.
CUresult cuRes;
NVENCSTATUS encStat;
cuRes = cuGraphicsResourceSetMapFlags(cuInpTexRes, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
cuRes = cuGraphicsMapResources(1, &cuInpTexRes, 0);
CUarray mappedArray;
cuRes = cuGraphicsSubResourceGetMappedArray(&mappedArray, cuInpTexRes, 0, 0);
cuRes = cuMemcpyDtoA(mappedArray, 0, cuDevPtr, 4 * encWidth * encHeight);
NV_ENC_MAP_INPUT_RESOURCE mapInputResParams = { 0 };
mapInputResParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
mapInputResParams.registeredResource = nvEncInpRes;
encStat = nvEncApi.nvEncMapInputResource(nvEncoder, &mapInputResParams);
// TODO: encode...
cuRes = cuGraphicsUnmapResources(1, &cuInpTexRes, 0);
}
private:
struct PrivateData;
void initEncodeSession() {
CUresult cuRes;
NVENCSTATUS encStat;
// Pop the current context
cuRes = cuCtxPopCurrent(&cuOldCtx); // THIS IS ALLOWED TO FAIL (it does fail here; the error can be ignored)
// Create a context for the device
cuCtx = nullptr;
cuRes = cuCtxCreate(&cuCtx, CU_CTX_SCHED_BLOCKING_SYNC, 0);
// Push our context
cuRes = cuCtxPushCurrent(cuCtx);
// Create an NV Encoder session
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS nvEncSessParams = { 0 };
nvEncSessParams.apiVersion = NVENCAPI_VERSION;
nvEncSessParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
nvEncSessParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
nvEncSessParams.device = cuCtx;
encStat = nvEncApi.nvEncOpenEncodeSessionEx(&nvEncSessParams, &nvEncoder);
}
void Encoder::initEncoder()
{
NVENCSTATUS encStat;
// Configure the encoder via preset
NV_ENC_PRESET_CONFIG presetConfig = { 0 };
GUID codecGUID = NV_ENC_CODEC_H264_GUID;
GUID presetGUID = NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID;
presetConfig.version = NV_ENC_PRESET_CONFIG_VER;
presetConfig.presetCfg.version = NV_ENC_CONFIG_VER;
encStat = nvEncApi.nvEncGetEncodePresetConfig(nvEncoder, codecGUID, presetGUID, &presetConfig);
NV_ENC_INITIALIZE_PARAMS initParams = { 0 };
initParams.version = NV_ENC_INITIALIZE_PARAMS_VER;
initParams.encodeGUID = codecGUID;
initParams.encodeWidth = texSize.w;
initParams.encodeHeight = texSize.h;
initParams.darWidth = texSize.w;
initParams.darHeight = texSize.h;
initParams.frameRateNum = 25;
initParams.frameRateDen = 1;
initParams.enableEncodeAsync = 0;
initParams.enablePTD = 1;
initParams.presetGUID = presetGUID;
memcpy(&nvEncConfig, &presetConfig.presetCfg, sizeof(nvEncConfig));
initParams.encodeConfig = &nvEncConfig;
encStat = nvEncApi.nvEncInitializeEncoder(nvEncoder, &initParams);
}
//void cleanupEncodeSession();
//void cleanupEncoder;
Size texSize;
GLuint yuvTex;
uint_t encWidth, encHeight;
CUdeviceptr cuDevPtr;
size_t cuMemPitch;
NV_ENC_CONFIG nvEncConfig;
NV_ENC_INPUT_PTR nvEncInpBuf;
NV_ENC_REGISTERED_PTR nvEncInpRes;
CUdevice cuDevice;
CUcontext cuCtx, cuOldCtx;
void *nvEncoder;
CUgraphicsResource cuInpTexRes;
};
int main(int argc, char *argv[])
{
Encoder encoder;
encoder.init({1920, 1080}, 0); // OMITTED THE TEXTURE AS IT IS NOT NEEDED TO REPRODUCE THE ISSUE
return 0;
}
After comparing the NVidia sample NvEncoderCudaInterop with my minimal code, I finally found the item that makes the difference between success and failure: it's the pitch parameter of the NV_ENC_REGISTER_RESOURCE structure passed to nvEncRegisterResource().
I haven't seen it documented anywhere, but there's a hard limit on that value, which I've determined experimentally to be at 2560. Anything above that will result in NV_ENC_ERR_RESOURCE_REGISTER_FAILED.
It does not appear to matter that the pitch I was passing was calculated by another API call, cuMemAllocPitch().
(Another thing that was missing from my code was "locking" and unlocking the CUDA context to the current thread via cuCtxPushCurrent() and cuCtxPopCurrent(). Done in the sample via a RAII class.)
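The RAII helper is essentially this (a hypothetical guard in the spirit of the samples' class, not its exact code):

// Keeps a CUDA context current for the lifetime of a scope.
class CudaCtxLock {
public:
    explicit CudaCtxLock(CUcontext ctx) { cuCtxPushCurrent(ctx); }
    ~CudaCtxLock() { CUcontext popped = nullptr; cuCtxPopCurrent(&popped); }
    CudaCtxLock(const CudaCtxLock&) = delete;
    CudaCtxLock& operator=(const CudaCtxLock&) = delete;
};

// usage:
// {
//     CudaCtxLock lock(cuCtx);
//     // ... CUDA and NVEnc calls that need the context current ...
// } // the context is popped again here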
EDIT:
I have worked around the problem by doing something for which I had another reason: using NV12 as input format for the encoder instead of YUV444.
With NV12, the pitch parameter drops below the 2560 limit because the byte size per row is equal to the width, so in my case 1920 bytes.
This was necessary (at the time) because my graphics card was a GTX 760 with a "Kepler" GPU, which (as I was initially unaware) only supports NV12 as input format for NVEnc. I have since upgraded to a GTX 970, but as I just found out, the 2560 limit is still there.
This makes me wonder just how exactly one is expected to use NVEnc with YUV444. The only possibility that comes to my mind is to use non-pitched memory, which seems bizarre. I'd appreciate comments from people who've actually used NVEnc with YUV444.
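For reference, the NV12 variant of the allocation and registration comes out roughly like this (a sketch rather than verbatim code from my project; NV12 is one byte per pixel of luma followed by interleaved chroma at half height, so the bytes per row equal the width):

// NV12: encWidth bytes per row, encHeight rows of luma plus encHeight/2 rows of interleaved UV.
cuRes = cuMemAllocPitch(&cuDevPtr, &cuMemPitch,
                        encWidth,          // bytes per row == width
                        encHeight * 3 / 2, // luma plane + half-height chroma plane
                        16);

NV_ENC_REGISTER_RESOURCE regResParams = { 0 };
regResParams.version            = NV_ENC_REGISTER_RESOURCE_VER;
regResParams.resourceType       = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
regResParams.width              = encWidth;
regResParams.height             = encHeight;
regResParams.bufferFormat       = NV_ENC_BUFFER_FORMAT_NV12_PL; // plain NV_ENC_BUFFER_FORMAT_NV12 in newer headers
regResParams.resourceToRegister = (void*) cuDevPtr;
regResParams.pitch              = cuMemPitch;                   // rounded up by the driver, but well under 2560 for 1920-wide frames
encStat = nvEncApi.nvEncRegisterResource(nvEncoder, &regResParams);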
EDIT #2 - PENDING FURTHER UPDATE:
New information has surfaced in the form of another SO question: NVencs Output Bitstream is not readable
It is quite possible that my answer so far was wrong. It seems now that the pitch should not only be set when registering the CUDA resource, but also when actually sending it to the encoder via nvEncEncodePicture(). I cannot check this right now, but I will next time I work on that project.

How to write a Live555 FramedSource to allow me to stream H.264 live

I've been trying to write a class that derives from FramedSource in Live555 that will allow me to stream live data from my D3D9 application to an MP4 or similar.
What I do each frame is grab the backbuffer into system memory as a texture, then convert it from RGB -> YUV420P, then encode it using x264, then ideally pass the NAL packets on to Live555. I made a class called H264FramedSource that derived from FramedSource basically by copying the DeviceSource file. Instead of the input being an input file, I've made it a NAL packet which I update each frame.
I'm quite new to codecs and streaming, so I could be doing everything completely wrong. In each doGetNextFrame() should I be grabbing the NAL packet and doing something like
memcpy(fTo, nal->p_payload, nal->i_payload)
I assume that the payload is my frame data in bytes? If anybody has an example of a class derived from FramedSource that is at least close to what I'm trying to do, I would love to see it; this is all new to me and a little tricky to figure out. Live555's documentation is pretty much the code itself, which doesn't exactly make it easy to work out what's happening.
Ok, I finally got some time to spend on this and got it working! I'm sure there are others who will be begging to know how to do it so here it is.
You will need your own FramedSource to take each frame, encode it, and prepare it for streaming; I provide some of that source code below.
Essentially, throw your FramedSource into the H264VideoStreamDiscreteFramer, then throw this into the H264RTPSink. Something like this:
scheduler = BasicTaskScheduler::createNew();
env = BasicUsageEnvironment::createNew(*scheduler);
framedSource = H264FramedSource::createNew(*env, 0,0);
h264VideoStreamDiscreteFramer
= H264VideoStreamDiscreteFramer::createNew(*env, framedSource);
// initialise the RTP Sink stuff here, look at
// testH264VideoStreamer.cpp to find out how
videoSink->startPlaying(*h264VideoStreamDiscreteFramer, NULL, videoSink);
env->taskScheduler().doEventLoop();
Now, in your main render loop, hand the backbuffer you've saved to system memory over to your FramedSource so it can be encoded, etc. For more info on how to set up the encoding, check out this answer: How does one encode a series of images into H264 using the x264 C API?
My implementation is very much in a hacky state and is yet to be optimised at all; my D3D application runs at around 15 fps due to the encoding (ouch), so I will have to look into that. But for all intents and purposes this StackOverflow question is answered, because I was mostly after how to stream it. I hope this helps other people.
As for my FramedSource it looks a little something like this
concurrent_queue<x264_nal_t> m_queue;
SwsContext* convertCtx;
x264_param_t param;
x264_t* encoder;
x264_picture_t pic_in, pic_out;
EventTriggerId H264FramedSource::eventTriggerId = 0;
unsigned H264FramedSource::FrameSize = 0;
unsigned H264FramedSource::referenceCount = 0;
int W = 720;
int H = 960;
H264FramedSource* H264FramedSource::createNew(UsageEnvironment& env,
unsigned preferredFrameSize,
unsigned playTimePerFrame)
{
return new H264FramedSource(env, preferredFrameSize, playTimePerFrame);
}
H264FramedSource::H264FramedSource(UsageEnvironment& env,
unsigned preferredFrameSize,
unsigned playTimePerFrame)
: FramedSource(env),
fPreferredFrameSize(fMaxSize),
fPlayTimePerFrame(playTimePerFrame),
fLastPlayTime(0),
fCurIndex(0)
{
if (referenceCount == 0)
{
}
++referenceCount;
x264_param_default_preset(&param, "veryfast", "zerolatency");
param.i_threads = 1;
param.i_width = 720;
param.i_height = 960;
param.i_fps_num = 60;
param.i_fps_den = 1;
// Intra refres:
param.i_keyint_max = 60;
param.b_intra_refresh = 1;
//Rate control:
param.rc.i_rc_method = X264_RC_CRF;
param.rc.f_rf_constant = 25;
param.rc.f_rf_constant_max = 35;
param.i_sps_id = 7;
//For streaming:
param.b_repeat_headers = 1;
param.b_annexb = 1;
x264_param_apply_profile(&param, "baseline");
encoder = x264_encoder_open(&param);
pic_in.i_type = X264_TYPE_AUTO;
pic_in.i_qpplus1 = 0;
pic_in.img.i_csp = X264_CSP_I420;
pic_in.img.i_plane = 3;
x264_picture_alloc(&pic_in, X264_CSP_I420, 720, 920);
convertCtx = sws_getContext(720, 960, PIX_FMT_RGB24, 720, 760, PIX_FMT_YUV420P, SWS_FAST_BILINEAR, NULL, NULL, NULL);
if (eventTriggerId == 0)
{
eventTriggerId = envir().taskScheduler().createEventTrigger(deliverFrame0);
}
}
H264FramedSource::~H264FramedSource()
{
--referenceCount;
if (referenceCount == 0)
{
// Reclaim our 'event trigger'
envir().taskScheduler().deleteEventTrigger(eventTriggerId);
eventTriggerId = 0;
}
}
void H264FramedSource::AddToBuffer(uint8_t* buf, int surfaceSizeInBytes)
{
uint8_t* surfaceData = (new uint8_t[surfaceSizeInBytes]);
memcpy(surfaceData, buf, surfaceSizeInBytes);
int srcstride = W*3;
sws_scale(convertCtx, &surfaceData, &srcstride,0, H, pic_in.img.plane, pic_in.img.i_stride);
x264_nal_t* nals = NULL;
int i_nals = 0;
int frame_size = -1;
frame_size = x264_encoder_encode(encoder, &nals, &i_nals, &pic_in, &pic_out);
static bool finished = false;
if (frame_size >= 0)
{
static bool alreadydone = false;
if(!alreadydone)
{
x264_encoder_headers(encoder, &nals, &i_nals);
alreadydone = true;
}
for(int i = 0; i < i_nals; ++i)
{
m_queue.push(nals[i]);
}
}
delete [] surfaceData;
surfaceData = NULL;
envir().taskScheduler().triggerEvent(eventTriggerId, this);
}
void H264FramedSource::doGetNextFrame()
{
deliverFrame();
}
void H264FramedSource::deliverFrame0(void* clientData)
{
((H264FramedSource*)clientData)->deliverFrame();
}
void H264FramedSource::deliverFrame()
{
x264_nal_t nalToDeliver;
if (fPlayTimePerFrame > 0 && fPreferredFrameSize > 0) {
if (fPresentationTime.tv_sec == 0 && fPresentationTime.tv_usec == 0) {
// This is the first frame, so use the current time:
gettimeofday(&fPresentationTime, NULL);
} else {
// Increment by the play time of the previous data:
unsigned uSeconds = fPresentationTime.tv_usec + fLastPlayTime;
fPresentationTime.tv_sec += uSeconds/1000000;
fPresentationTime.tv_usec = uSeconds%1000000;
}
// Remember the play time of this data:
fLastPlayTime = (fPlayTimePerFrame*fFrameSize)/fPreferredFrameSize;
fDurationInMicroseconds = fLastPlayTime;
} else {
// We don't know a specific play time duration for this data,
// so just record the current time as being the 'presentation time':
gettimeofday(&fPresentationTime, NULL);
}
if(!m_queue.empty())
{
m_queue.wait_and_pop(nalToDeliver);
uint8_t* newFrameDataStart = (uint8_t*)0xD15EA5E;
newFrameDataStart = (uint8_t*)(nalToDeliver.p_payload);
unsigned newFrameSize = nalToDeliver.i_payload;
// Deliver the data here:
if (newFrameSize > fMaxSize) {
fFrameSize = fMaxSize;
fNumTruncatedBytes = newFrameSize - fMaxSize;
}
else {
fFrameSize = newFrameSize;
}
memcpy(fTo, nalToDeliver.p_payload, nalToDeliver.i_payload);
FramedSource::afterGetting(this);
}
}
Oh and for those who want to know what my concurrent queue is, here it is, and it works brilliantly http://www.justsoftwaresolutions.co.uk/threading/implementing-a-thread-safe-queue-using-condition-variables.html
Enjoy and good luck!
The deliverFrame method lacks the following check at its start:
if (!isCurrentlyAwaitingData()) return;
see DeviceSource.cpp in LIVE
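In other words, based on that answer and the question's code, the start of deliverFrame() becomes something like this:

void H264FramedSource::deliverFrame()
{
    // If the downstream object has not asked for data yet, drop this trigger,
    // exactly as DeviceSource.cpp does.
    if (!isCurrentlyAwaitingData()) return;

    x264_nal_t nalToDeliver;
    ...
}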