I tried to allocate 17338896 elements of floating point numbers as follows (which is roughly 70 mb):
state = cublasAlloc(theSim->Ndim*theSim->Ndim,
sizeof(*(theSim->K0)),
(void**)&K0cuda);
if(state != CUBLAS_STATUS_SUCCESS) {
printf("Error allocation video memory.\n");
return -1;
}
However, I'm receiving error message of CUBLAS_STATUS_ALLOC_FAILED for the variable state. Would this have anything to do with the amount of video card memory available on the machine (128 mb on mine) or would this be a limit of the amount of memory that I can allocate using cublasAlloc() function (i.e. not relevant to the amount of memory available on the machine)? I tried using cudaMalloc() function and I am running into the same problem. Thanks in advance for looking into this.
--------------Addition of Error Reproduction-------------------------------------
#include <cuda.h>
#include <stdio.h>
int main (int argc, char *argv[]) {
// CUDA setup
cublasStatus state;
if(cublasInit() == CUBLAS_STATUS_NOT_INITIALIZED) {
printf("CUBLAS init error.\n");
return -1;
}
// Instantiate video memory pointers
float *K0cuda;
// Allocate video memory needed
state = cublasAlloc(20000000,
sizeof(float),
(void**)&K0cuda);
if(state != CUBLAS_STATUS_SUCCESS) {
printf("Error allocation video memory.\n");
return -1;
}
// Copy K0 from CPU memory to GPU memory
// Note: before so, decide whether to integrate as a part of InsertionSim or
// CUDA content as a separate class
//state = cublasSetMatrix(theSim->Ndim, theSim->Ndim, sizeof(*theSim->K0),
// theSim->K0, theSim->Ndim, K0cuda, theSim->Ndim);
//if(state != CUBLAS_STATUS_SUCCESS) {
// printf("Error copy to video memory.\n");
// return -1;
//}
// Free memory
if(cublasFree(K0cuda) != CUBLAS_STATUS_SUCCESS) {
printf("Error freeing video memory.\n");
return -1;
}
// CUDA shutdown
if(cublasShutdown() != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS shutdown error.\n");
return -1;
}
if(theSim != NULL) delete theSim;
return 0;
}
Memory can fragment, which means that you can still allocate multiple smaller blocks but not a single large block. Your videocard will obviously need some memory for its normal 2D task. If that happens to break the 128 MB into 2 blocks of almost 64MB, then you'd see this kind of failure.
Related
I have written a program to read the frames from a video file. Everything works perfect except below described issue.
after reading the frame, when I call avcode_send_packet function, it leaks the memory.
I used av_packet_unref before reading the next frame. But still the memory leak is not resolved.
I am using FFMPEG latest 4.3 version on WIndows 10.
also av_frame_unref does not fix the memory leak. I think data buffer inside the packet does not get freed somehow I feel it is related to FFMPEG version issue as I see the similar coding done by other programmers on the internet.
Does any one have idea about how to fix this memory leak ?
----------------- code is as below-----------------------
... here code related to setting avformatcontext, and avcodeccontext.
while(1)
{
if (av_read_frame(pFormatCtx, packet) >= 0)
{
if (packet->stream_index == videoindex)
{
ret = avcodec_send_packet(pCodecCtx, packet);//on executing this line, memory shoots up in MBs , everytime.
if (ret < 0)
{
av_packet_unref(packet);
fprintf(stderr,"Failed to Decode packet. \n:%s", av_err2str(ret));
return -1;
}
ret = avcodec_receive_frame(pCodecCtx, pAvFrame);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
{
av_packet_unref(packet);
continue;
}
if (ret < 0)
{
av_packet_unref(packet);
printf("Failed to Decode packet. \n");
return -1;
}
av_packet_unref(packet);
{
//.. do something with the frame.
}
av_frame_unref(pAvFrame);
}
av_packet_unref(packet);
}
}
I've been trying to implement a one dimensional FFT using cuFFT. An InvalidValue error is thrown and no meaningful results are produced.
I've tried to ensure that each error is caught, and I believe that the cudaMemcpy from DeviceToHost causes the issue, though I am not sure why, nor how to fix it. The data size parameter in cudaMemcpy follows the same relation as supplied by the cuFFT documentation.
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
// cuda macros
#define NX 100 // number of points
#define BATCH 1 // number of ffts to perform
#define RANK 1 //
#define IDIST 1 // distance between 1st elements of batches
#define ISTRIDE 1 // do every ISTRIDEth index
#define ODIST 1 // distance between 1st elements of output
#define OSTRIDE 1 // distance between output elements
void fft1d() {
// create plan for performing fft
cufftHandle plan;
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
printf("Failed to create 1D plan\n");
return;
}
// assemble data
double temp_data[] = {2.598076211353316, 3.2402830701637395, 3.8494572900049224, 4.419388724529261, 4.944267282795252, 5.41874215947433, 5.837976382931011, 6.197696125093141, 6.494234270429254, 6.724567799874842, 6.886348608602047, 6.97792744346504, 6.998370716093996, 6.9474700202387565, 6.8257442563389, 6.6344343416615565, 6.37549055993378, 6.051552679431957, 5.665923042211819, 5.222532898817316, 4.725902331664744, 4.181094175657916, 3.5936624057845576, 2.9695955178498603, 2.315255479544737, 1.6373128742041732, 0.9426788984240022, 0.23843490677753865, -0.46823977812093664, -1.1701410542749289, -1.8601134815746807, -2.531123226988873, -3.176329770049035, -3.7891556376344524, -4.363353457155562, -4.893069644570959, -5.3729040779788875, -5.797965148448726, -6.163919626883915, -6.467036838555256, -6.704226694973039, -6.873071195387157, -6.971849076777267, -6.999553361041935, -6.955901620504255, -6.84133885708361, -6.657032965782207, -6.404862828733319, -6.0873991611848375, -5.707878304681281, -5.270169234606201, -4.778734118422206, -4.23858282669252, -3.6552218606153755, -3.0345982167228436, -2.383038761007964, -1.707185730522749, -1.0139290199674, -0.31033594356630245, 0.39642081173600463, 1.0991363072871054, 1.7906468025248339, 2.463902784786862, 3.1120408346390414, 3.728453594100783, 4.306857124485735, 4.841354967187034, 5.326498254347925, 5.757341256627454, 6.129491801786784, 6.439156050110601, 6.683177170206378, 6.859067520906216, 6.965034011197066, 6.999996379650895, 6.963598207007518, 6.85621054964381, 6.678928156888352, 6.433558310743566, 6.122602401787424, 5.749230429076629, 5.317248684008804, 4.831060947586139, 4.295623596650021, 3.7163950767501706, 3.0992802567403803, 2.4505702323708074, 1.7768781925409076, 1.0850720020162676, 0.3822041878858906, -0.3245599564963766, -1.0280154171511335, -1.7209909100394047, -2.3964219877733033, -3.0474230571943477, -3.667357573646071, -4.249905696354359, -4.78912871521179, -5.279529592175676, -5.716109000098287};
cufftReal *idata;
cudaMalloc((void**) &idata, sizeof(cufftComplex)*NX);
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to allocate memory space for input data.\n");
return;
}
cudaMemcpy(idata, temp_data, sizeof(temp_data)/sizeof(double), cudaMemcpyHostToDevice);
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to load time data to memory.\n");
return;
}
// prepare memory for return data
cufftComplex *odata;
cudaMalloc((void**) &odata, sizeof(cufftComplex)*(NX/2 + 1));
if (cudaGetLastError() != cudaSuccess) {
printf("Failed to allocate memory for output data.\n");
}
// perform fft
if (cufftExecR2C(plan, idata, odata) != CUFFT_SUCCESS) {
printf("Failed to perform fft.\n");
return;
}
I think the error is thrown here, at the cudaMemcpy.
// grab data from graphics and print (memcpy waits until complete) cuda memcopy doesn't complete
// can return errors from previous cuda calls if they haven't been caught
cufftComplex *out_temp_data;
size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
int error_value = cudaGetLastError();
printf("cudaMemcpy from device state: %i\n", error_value);
if(error_value != cudaSuccess) {
printf("Failed to pull data from device.\n");
return;
}
for (size_t i = 0; i < (NX/2 + 1); i++) {
printf("%lu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
}
// clean up
cufftDestroy(plan);
cudaFree(idata);
}
int main() {
fft1d();
return 0;
}
Memory must be allocated before cudaMemcpy can write the data. Thanks to generic-opto-guy for pointing this out.
In this case:
out_temp_data = new cufftComplex[NX/2 + 1];
I have been porting my RabbitCT CUDA implementation to OpenCL and I'm running into issues with pinned memory.
For CUDA a host buffer is created that buffers the input images to be processed in pinned memory. This allows the host to catch the next batch of input images while the GPU processes the current batch. A simplified mockup of my CUDA implementation is as follows:
// globals
float** hostProjBuffer = new float*[BUFFER_SIZE];
float* devProjection[STREAMS_MAX];
cudaStream_t stream[STREAMS_MAX];
void initialize()
{
// initiate streams
for( uint s = 0; s < STREAMS_MAX; s++ ){
cudaStreamCreateWithFlags (&stream[s], cudaStreamNonBlocking);
cudaMalloc( (void**)&devProjection[s], imgSize);
}
// initiate buffers
for( uint b = 0; b < BUFFER_SIZE; b++ ){
cudaMallocHost((void **)&hostProjBuffer[b], imgSize);
}
}
// main function called for all input images
void backproject(imgdata* r)
{
uint projNr = r->imgnr % BUFFER_SIZE;
uint streamNr = r->imgnr % STREAMS_MAX;
// When buffer is filled, wait until work in current stream has finished
if(projNr == 0) {
cudaStreamSynchronize(stream[streamNr]);
}
// copy received image data to buffer (maps double precision to float)
std::copy(r->I_n, r->I_n+(imgSizeX * imgSizeY), hostProjBuffer[projNr]);
// copy image and matrix to device
cudaMemcpyAsync( devProjection[streamNr], hostProjBuffer[projNr], imgSize, cudaMemcpyHostToDevice, stream[streamNr] );
// call kernel
backproject<<<numBlocks, threadsPerBlock, 0 , stream[streamNr]>>>(devProjection[streamNr]);
}
So, for CUDA, I create a pinned host pointer for each buffer item and copy the data to the device before executing kernel of each stream.
For OpenCL I initially did something similar when following the Nvidia OpenCL Best Practices Guide. Here they recommend creating two buffers, one for copying the kernel data to and one for the pinned memory. However, this leads to the implementation using double the device memory as both the kernel and pinned memory buffers are allocated on the device.
To get around this memory issue, I created an implementation where only a mapping is made to the device as it is needed. This can be seen in the following implementation:
// globals
float** hostProjBuffer = new float* [BUFFER_SIZE];
cl_mem devProjection[STREAMS_MAX], devMatrix[STREAMS_MAX];
cl_command_queue queue[STREAMS_MAX];
// initiate streams
void initialize()
{
for( uint s = 0; s < STREAMS_MAX; s++ ){
queue[s] = clCreateCommandQueueWithProperties(context, device, NULL, &status);
devProjection[s] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, imgSize, NULL, &status);
}
}
// main function called for all input images
void backproject(imgdata* r)
{
const uint projNr = r->imgnr % BUFFER_SIZE;
const uint streamNr = r->imgnr % STREAMS_MAX;
// when buffer is filled, wait until work in current stream has finished
if(projNr == 0) {
status = clFinish(queue[streamNr]);
}
// map host memory region to device buffer
hostProjBuffer[projNr] = (float*) clEnqueueMapBuffer(queue[streamNr], devProjection[streamNr], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION, 0, imgSize, 0, NULL, NULL, &status);
// copy received image data to hostbuffers
std::copy(imgPtr, imgPtr + (imgSizeX * imgSizeY), hostProjBuffer[projNr]);
// unmap the allocated pinned host memory
clEnqueueUnmapMemObject(queue[streamNr], devProjection[streamNr], hostProjBuffer[projNr], 0, NULL, NULL);
// set stream specific arguments
clSetKernelArg(kernel, 0, sizeof(devProjection[streamNr]), (void *) &devProjection[streamNr]);
// launch kernel
clEnqueueNDRangeKernel(queue[streamNr], kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
clFlush(queue[streamNr]);
clFinish(queue[streamNr]); //should be removed!
}
This implementation does use a similar amount of device memory as the CUDA implementation. However, I have been unable to get this last code example working without a clFinish after each loop, which significantly hampers the performance of the application. This indicates data is lost as the host moves ahead of the kernel. I tried increasing my buffer size to the number of input images, but this did not work either. So somehow during execution, the hostBuffer data gets lost.
So, with the goal to write OpenCL code similar to CUDA, I have three questions:
What is the recommended implementation for OpenCL pinned memory?
Is my OpenCL implementation similar to how CUDA handles pinned memory?
What causes the wrong data to be used in the OpenCL example?
Thanks in advance!
Kind regards,
Remy
PS: Question initially asked at the Nvidia developer forums
I have developed just a simple library modifing a library that I found on the internet.
What scares me, is that, when I play an avi, it plays and free the memory when the video ends, but when I play the video, it's like a memory leak! It grows to 138mb although the video has ended and the FreeAll method (A function that deletes the context, etc...) has been called.
Here is the code of the method that is causing the memory leak:
int VideoGL::NextVideoFrame(){
int frameDone = 0;
int result = 0;
double pts = 0;
if(!this->ended){
if (!_started) return 0;
AVPacket* packet;
// Get the number of milliseconds passed and see if we should display a new frame
int64_t msPassed = (1000 * (clock() - _baseTime)) / CLOCKS_PER_SEC;
if (msPassed >= _currentPts)
{
// If this is not the current frame, copy it to the buffer
if (_currentFramePts != _currentPts){
_currentFramePts = _currentPts;
memcpy(buffer_a,buffer, 3 * _codec_context_video->width * _codec_context_video->height);
result = 1;
}
// Try to load a new frame from the video packet queue
bool goodop=false;
AVFrame *_n_frame = avcodec_alloc_frame();
while (!frameDone && (packet = this->DEQUEUE(VIDEO)) != NULL)
{
if (packet == (AVPacket*)-1) return -1;
goodop=true;
_s_pts = packet->pts;
avcodec_decode_video2(_codec_context_video, _n_frame, &frameDone, packet);
av_free_packet(packet);
if (packet->dts == AV_NOPTS_VALUE)
{
if (_n_frame->opaque && *(uint64_t*)_n_frame->opaque != AV_NOPTS_VALUE) pts = (double) *(uint64_t*)_n_frame->opaque;
else pts = 0;
}
else pts = (double) packet->dts;
pts *= av_q2d(_codec_context_video->time_base);
}
if (frameDone)
{
// if a frame was loaded scale it to the current texture frame buffer, but also set the pts so that it won't be copied to the texture until it's time
sws_scale(sws_ctx,_n_frame->data, _n_frame->linesize, 0, _codec_context_video->height, _rgb_frame->data, _rgb_frame->linesize);
double nts = 1.0/av_q2d(_codec_context_video->time_base);
_currentPts = (uint64_t) (pts*nts);
}
avcodec_free_frame(&_n_frame);
av_free(_n_frame);
if(!goodop){
ended=true;
}
}
}
return result;
}
I'll be waiting for answers, thanks.
I had a memory leak problem either. For me, the deallocation worked when I included the following commands:
class members:
AVPacket avpkt;
AVFrame *frame;
AVCodecContext *avctx;
AVCodec *codec;
constructor:
av_init_packet(&avpkt);
avcodec_open2(avctx, codec, NULL);
frame = avcodec_alloc_frame();
destructor:
av_free_packet(&avpkt);
avcodec_free_frame(&frame);
av_free(frame);
avcodec_close(avctx);
i also had the same problem. According to the ffplay.c
you should call
av_frame_unref(pFrame);
avcodec_get_frame_defaults(pFrame);
after every sw_scale call. this will free up all malloc during decode.
I had similar routine using FFmpeg that would leak memory. I found a resolution by deallocating memory for the frame and packet objects for each call to avcodec_decode_video2.
In your code the packet object is freed, however the frame is not. Adding the following lines before avcodec_decode_video2 should resolve the memory leak. I found that it's safe to call avcodec_free_frame on a frame object that is already deallocated. You could remove the allocation of the frame before the while loop.
avcodec_free_frame(&_n_frame);
_n_frame = avcodec_alloc_frame();
avcodec_decode_video2(_codec_context_video, _n_frame, &frameDone, packet);
I'm trying to transcode a video with help of libavcodec.
On transcoding big video files(hour or more) i get huge memory leaks in avcodec_encode_video. I have tried to debug it, but with different video files different functions produce leaks, i have got a little bit confused about that :). Here FFMPEG with QT memory leak is the same issue that i have, but i have no idea how did that person solve it. QtFFmpegwrapper seems to do the same i do(or i missed something).
my method is lower. I took care about aFrame and aPacket outside with av_free and av_free_packet.
int
Videocut::encode(
AVStream *anOutputStream,
AVFrame *aFrame,
AVPacket *aPacket
)
{
AVCodecContext *outputCodec = anOutputStream->codec;
if (!anOutputStream ||
!aFrame ||
!aPacket)
{
return 1;
/* NOTREACHED */
}
uint8_t * buffer = (uint8_t *)malloc(
sizeof(uint8_t) * _DefaultEncodeBufferSize
);
if (NULL == buffer) {
return 2;
/* NOTREACHED */
}
int packetSize = avcodec_encode_video(
outputCodec,
buffer,
_DefaultEncodeBufferSize,
aFrame
);
if (packetSize < 0) {
free(buffer);
return 1;
/* NOTREACHED */
}
aPacket->data = buffer;
aPacket->size = packetSize;
return 0;
}
The first step would be to try to reproduce your problem under Valgrind on a Linux box, if you can.
ffmpeg encoders and decoders usually not dynamically allocate memory; they reuse buffers between calls. Leaks are usually going to be in the frames somewhere.
Note that av_free_packet will only free your dynamically allocated buffer if the packet has a destructor function!
Look at how the function is defined in libavcodec/avpacket.c:
void av_free_packet(AVPacket *pkt)
{
if (pkt) {
if (pkt->destruct) pkt->destruct(pkt);
pkt->data = NULL; pkt->size = 0;
pkt->side_data = NULL;
pkt->side_data_elems = 0;
}
}
If there is no pkt->destruct function, no clean up takes place!