I am having issues encoding screen captures, into a h.264 file for viewing. The program below is cobbled together from examples here and here. The first example, uses an older version of the ffmpeg api. So I tried to update that example for use in my program. The file is created and has something written to it, but when I view the file. The encoded images are all distorted. I am able to run the video encoding example from the ffmpeg api successfully. This is my first time posting, so if I missed anything please let me know.
I appreciate any assistance that is given.
My program:
#include <Windows.h>
#include <string>
#include <sstream>
#include <time.h>
#include <iostream>
#include <dwmapi.h>
extern "C"{
#include <stdint.h>
#include <libavcodec/avcodec.h>
#include <libavutil/imgutils.h>
#include <libswscale/swscale.h>
#include <libavutil/opt.h>
}
using namespace std;
void ScreenShot(const char* BmpName, uint8_t *frame)
{
HWND DesktopHwnd = GetDesktopWindow();
RECT DesktopParams;
HDC DevC = GetDC(DesktopHwnd);
GetWindowRect(DesktopHwnd,&DesktopParams);
DWORD Width = DesktopParams.right - DesktopParams.left;
DWORD Height = DesktopParams.bottom - DesktopParams.top;
DWORD FileSize = sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+(sizeof(RGBTRIPLE)+1*(Width*Height*4));
char *BmpFileData = (char*)GlobalAlloc(0x0040,FileSize);
PBITMAPFILEHEADER BFileHeader = (PBITMAPFILEHEADER)BmpFileData;
PBITMAPINFOHEADER BInfoHeader = (PBITMAPINFOHEADER)&BmpFileData[sizeof(BITMAPFILEHEADER)];
BFileHeader->bfType = 0x4D42; // BM
BFileHeader->bfSize = sizeof(BITMAPFILEHEADER);
BFileHeader->bfOffBits = sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER);
BInfoHeader->biSize = sizeof(BITMAPINFOHEADER);
BInfoHeader->biPlanes = 1;
BInfoHeader->biBitCount = 32;
BInfoHeader->biCompression = BI_RGB;
BInfoHeader->biHeight = Height;
BInfoHeader->biWidth = Width;
RGBTRIPLE *Image = (RGBTRIPLE*)&BmpFileData[sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)];
RGBTRIPLE color;
//pPixels = (RGBQUAD **)new RGBQUAD[sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER)];
int start = clock();
HDC CaptureDC = CreateCompatibleDC(DevC);
HBITMAP CaptureBitmap = CreateCompatibleBitmap(DevC,Width,Height);
SelectObject(CaptureDC,CaptureBitmap);
BitBlt(CaptureDC,0,0,Width,Height,DevC,0,0,SRCCOPY|CAPTUREBLT);
GetDIBits(CaptureDC,CaptureBitmap,0,Height,frame,(LPBITMAPINFO)BInfoHeader, DIB_RGB_COLORS);
int end = clock();
cout << "it took " << end - start << " to capture a frame" << endl;
DWORD Junk;
HANDLE FH = CreateFileA(BmpName,GENERIC_WRITE,FILE_SHARE_WRITE,0,CREATE_ALWAYS,0,0);
WriteFile(FH,BmpFileData,FileSize,&Junk,0);
CloseHandle(FH);
GlobalFree(BmpFileData);
}
void video_encode_example(const char *filename, int codec_id)
{
AVCodec *codec;
AVCodecContext *c= NULL;
int i, ret, x, y, got_output;
FILE *f;
AVFrame *frame;
AVPacket pkt;
uint8_t endcode[] = { 0, 0, 1, 0xb7 };
printf("Encode video file %s\n", filename);
/* find the mpeg1 video encoder */
codec = avcodec_find_encoder(AV_CODEC_ID_H264);
if (!codec) {
fprintf(stderr, "Codec not found\n");
cin.get();
exit(1);
}
c = avcodec_alloc_context3(codec);
if (!c) {
fprintf(stderr, "Could not allocate video codec context\n");
cin.get();
exit(1);
}
/* put sample parameters */
c->bit_rate = 400000;
/* resolution must be a multiple of two */
c->width = 352;
c->height = 288;
/* frames per second */
c->time_base.num=1;
c->time_base.den = 25;
c->gop_size = 10; /* emit one intra frame every ten frames */
c->max_b_frames=1;
c->pix_fmt = AV_PIX_FMT_YUV420P;
if(codec_id == AV_CODEC_ID_H264)
av_opt_set(c->priv_data, "preset", "slow", 0);
/* open it */
if (avcodec_open2(c, codec, NULL) < 0) {
fprintf(stderr, "Could not open codec\n");
exit(1);
}
f = fopen(filename, "wb");
if (!f) {
fprintf(stderr, "Could not open %s\n", filename);
exit(1);
}
frame = av_frame_alloc();
if (!frame) {
fprintf(stderr, "Could not allocate video frame\n");
exit(1);
}
frame->format = c->pix_fmt;
frame->width = c->width;
frame->height = c->height;
/* the image can be allocated by any means and av_image_alloc() is
just the most convenient way if av_malloc() is to be used */
ret = av_image_alloc(frame->data, frame->linesize, c->width, c->height, c->pix_fmt, 32);
if (ret < 0) {
fprintf(stderr, "Could not allocate raw picture buffer\n");
exit(1);
}
/* encode 1 second of video */
for(i=0;i<250;i++) {
av_init_packet(&pkt);
pkt.data = NULL; // packet data will be allocated by the encoder
pkt.size = 0;
fflush(stdout);
/* prepare a dummy image */
/* Y */
for(y=0;y<c->height;y++) {
for(x=0;x<c->width;x++) {
frame->data[0][y * frame->linesize[0] + x] = x + y + i * 3;
}
}
/* Cb and Cr */
for(y=0;y<c->height/2;y++) {
for(x=0;x<c->width/2;x++) {
frame->data[1][y * frame->linesize[1] + x] = 128 + y + i * 2;
frame->data[2][y * frame->linesize[2] + x] = 64 + x + i * 5;
}
}
frame->pts = i;
/* encode the image */
ret = avcodec_encode_video2(c, &pkt, frame, &got_output);
if (ret < 0) {
fprintf(stderr, "Error encoding frame\n");
exit(1);
}
if (got_output) {
printf("Write frame %3d (size=%5d)\n", i, pkt.size);
fwrite(pkt.data, 1, pkt.size, f);
av_free_packet(&pkt);
}
}
/* get the delayed frames */
for (got_output = 1; got_output; i++) {
fflush(stdout);
ret = avcodec_encode_video2(c, &pkt, NULL, &got_output);
if (ret < 0) {
fprintf(stderr, "Error encoding frame\n");
exit(1);
}
if (got_output) {
printf("Write frame %3d (size=%5d)\n", i, pkt.size);
fwrite(pkt.data, 1, pkt.size, f);
av_free_packet(&pkt);
}
}
/* add sequence end code to have a real mpeg file */
fwrite(endcode, 1, sizeof(endcode), f);
fclose(f);
avcodec_close(c);
av_free(c);
av_freep(&frame->data[0]);
av_frame_free(&frame);
printf("\n");
}
void write_video_frame()
{
}
int lineSizeOfFrame(int width)
{
return (width*24 + 31)/32 * 4;//((width*24 / 8) + 3) & ~3;//(width*24 + 31)/32 * 4;
}
int getScreenshotWithCursor(uint8_t* frame)
{
int successful = 0;
HDC screen, bitmapDC;
HBITMAP screen_bitmap;
screen = GetDC(NULL);
RECT DesktopParams;
HWND desktop = GetDesktopWindow();
GetWindowRect(desktop, &DesktopParams);
int width = DesktopParams.right;
int height = DesktopParams.bottom;
bitmapDC = CreateCompatibleDC(screen);
screen_bitmap = CreateCompatibleBitmap(screen, width, height);
SelectObject(bitmapDC, screen_bitmap);
if (BitBlt(bitmapDC, 0, 0, width, height, screen, 0, 0, SRCCOPY))
{
int pos_x, pos_y;
HICON hcur;
ICONINFO icon_info;
CURSORINFO cursor_info;
cursor_info.cbSize = sizeof(CURSORINFO);
if (GetCursorInfo(&cursor_info))
{
if (cursor_info.flags == CURSOR_SHOWING)
{
hcur = CopyIcon(cursor_info.hCursor);
if (GetIconInfo(hcur, &icon_info))
{
pos_x = cursor_info.ptScreenPos.x - icon_info.xHotspot;
pos_y = cursor_info.ptScreenPos.y - icon_info.yHotspot;
DrawIcon(bitmapDC, pos_x, pos_y, hcur);
if (icon_info.hbmColor) DeleteObject(icon_info.hbmColor);
if (icon_info.hbmMask) DeleteObject(icon_info.hbmMask);
}
}
}
int header_size = sizeof(BITMAPINFOHEADER) + 256*sizeof(RGBQUAD);
size_t line_size = lineSizeOfFrame(width);
PBITMAPINFO lpbi = (PBITMAPINFO) malloc(header_size);
lpbi->bmiHeader.biSize = header_size;
lpbi->bmiHeader.biWidth = width;
lpbi->bmiHeader.biHeight = height;
lpbi->bmiHeader.biPlanes = 1;
lpbi->bmiHeader.biBitCount = 24;
lpbi->bmiHeader.biCompression = BI_RGB;
lpbi->bmiHeader.biSizeImage = height*line_size;
lpbi->bmiHeader.biXPelsPerMeter = 0;
lpbi->bmiHeader.biYPelsPerMeter = 0;
lpbi->bmiHeader.biClrUsed = 0;
lpbi->bmiHeader.biClrImportant = 0;
if (GetDIBits(bitmapDC, screen_bitmap, 0, height, (LPVOID)frame, lpbi, DIB_RGB_COLORS))
{
int i;
uint8_t *buf_begin = frame;
uint8_t *buf_end = frame + line_size*(lpbi->bmiHeader.biHeight - 1);
void *temp = malloc(line_size);
for (i = 0; i < lpbi->bmiHeader.biHeight / 2; ++i)
{
memcpy(temp, buf_begin, line_size);
memcpy(buf_begin, buf_end, line_size);
memcpy(buf_end, temp, line_size);
buf_begin += line_size;
buf_end -= line_size;
}
cout << *buf_begin << endl;
free(temp);
successful = 1;
}
free(lpbi);
}
DeleteObject(screen_bitmap);
DeleteDC(bitmapDC);
ReleaseDC(NULL, screen);
return successful;
}
int main()
{
RECT DesktopParams;
HWND desktop = GetDesktopWindow();
GetWindowRect(desktop, &DesktopParams);
int width = DesktopParams.right;
int height = DesktopParams.bottom;
uint8_t *frame = (uint8_t *)malloc(width * height);
AVCodec *codec;
AVCodecContext *codecContext = NULL;
AVPacket packet;
FILE *f;
AVFrame *pictureYUV = NULL;
AVFrame *pictureRGB;
avcodec_register_all();
codec = avcodec_find_encoder(AV_CODEC_ID_H264);
if(!codec)
{
cout << "codec not found!" << endl;
cin.get();
return 1;
}
else
{
cout << "codec h265 found!" << endl;
}
codecContext = avcodec_alloc_context3(codec);
codecContext->bit_rate = width * height * 4;
codecContext->width = width;
codecContext->height = height;
codecContext->time_base.num = 1;
codecContext->time_base.den = 250;
codecContext->gop_size = 10;
codecContext->max_b_frames = 1;
codecContext->keyint_min = 1;
codecContext->i_quant_factor = (float)0.71; // qscale factor between P and I frames
codecContext->b_frame_strategy = 20; ///// find out exactly what this does
codecContext->qcompress = (float)0.6; ///// find out exactly what this does
codecContext->qmin = 20; // minimum quantizer
codecContext->qmax = 51; // maximum quantizer
codecContext->max_qdiff = 4; // maximum quantizer difference between frames
codecContext->refs = 4; // number of reference frames
codecContext->trellis = 1;
codecContext->pix_fmt = AV_PIX_FMT_YUV420P;
codecContext->codec_id = AV_CODEC_ID_H264;
codecContext->codec_type = AVMEDIA_TYPE_VIDEO;
if(avcodec_open2(codecContext, codec, NULL) < 0)
{
cout << "couldn't open codec" << endl;
cout << stderr << endl;
cin.get();
return 1;
}
else
{
cout << "opened h265 codec!" << endl;
cin.get();
}
f = fopen("test.h264", "wb");
if(!f)
{
cout << "Unable to open file" << endl;
return 1;
}
struct SwsContext *img_convert_ctx = sws_getContext(codecContext->width, codecContext->height, PIX_FMT_RGB32, codecContext->width,
codecContext->height, codecContext->pix_fmt, SWS_BILINEAR, NULL, NULL, NULL);
int got_output = 0, i = 0;
uint8_t encode[] = { 0, 0, 1, 0xb7 };
try
{
for(i = 0; i < codecContext->time_base.den; i++)
{
av_init_packet(&packet);
packet.data = NULL;
packet.size = 0;
pictureRGB = av_frame_alloc();
pictureYUV = av_frame_alloc();
getScreenshotWithCursor(frame);
//ScreenShot("example.bmp", frame);
int nbytes = avpicture_get_size(AV_PIX_FMT_YUV420P, codecContext->width, codecContext->height); // allocating outbuffer
uint8_t* outbuffer = (uint8_t*)av_malloc(nbytes*sizeof(uint8_t));
pictureRGB = av_frame_alloc();
pictureYUV = av_frame_alloc();
avpicture_fill((AVPicture*)pictureRGB, frame, PIX_FMT_RGB32, codecContext->width, codecContext->height); // fill image with input screenshot
avpicture_fill((AVPicture*)pictureYUV, outbuffer, PIX_FMT_YUV420P, codecContext->width, codecContext->height);
av_image_alloc(pictureYUV->data, pictureYUV->linesize, codecContext->width, codecContext->height, codecContext->pix_fmt, 32);
sws_scale(img_convert_ctx, pictureRGB->data, pictureRGB->linesize, 0, codecContext->height, pictureYUV->data, pictureYUV->linesize);
pictureYUV->pts = i;
avcodec_encode_video2(codecContext, &packet, pictureYUV, &got_output);
if(got_output)
{
printf("Write frame %3d (size=%5d)\n", i, packet.size);
fwrite(packet.data, 1, packet.size, f);
av_free_packet(&packet);
}
//av_frame_free(&pictureRGB);
//av_frame_free(&pictureYUV);
}
for(got_output = 1; got_output; i++)
{
fflush(stdout);
avcodec_encode_video2(codecContext, &packet, NULL, &got_output);
if (got_output) {
printf("Write frame %3d (size=%5d)\n", i, packet.size);
fwrite(packet.data, 1, packet.size, f);
av_free_packet(&packet);
}
}
}
catch(std::exception ex)
{
cout << ex.what() << endl;
}
avcodec_close(codecContext);
av_free(codecContext);
av_freep(&pictureYUV->data[0]);
//av_frame_free(&picture);
fwrite(encode, 1, sizeof(encode), f);
fclose(f);
cin.get();
return 0;
}
Related
I'm trying to encode and stream using ffmpeg (libavcodec/libavformat - MSVC x64 with Zeranoe builds)
Here is my code, largely adapted from the encoding example, error handling removed
#include "stdafx.h"
extern "C" {
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/opt.h>
#include <libavutil/channel_layout.h>
#include <libavutil/common.h>
#include <libavutil/imgutils.h>
#include <libavutil/mathematics.h>
#include <libavutil/samplefmt.h>
}
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "avcodec.lib")
int main() {
avcodec_register_all();
av_register_all();
avformat_network_init();
AVCodecID codec_id = AV_CODEC_ID_H264;
AVCodec *codec;
AVCodecContext *c = NULL;
int i, ret, x, y, got_output;
AVFrame *frame;
AVPacket pkt;
codec = avcodec_find_encoder(codec_id);
c = avcodec_alloc_context3(codec);
c->bit_rate = 400000;
c->width = 352;
c->height = 288;
c->time_base.num = 1;
c->time_base.den = 25;
c->gop_size = 25;
c->max_b_frames = 1;
c->pix_fmt = AV_PIX_FMT_YUV420P;
c->codec_type = AVMEDIA_TYPE_VIDEO;
c->flags = CODEC_FLAG_GLOBAL_HEADER;
if (codec_id == AV_CODEC_ID_H264) {
ret = av_opt_set(c->priv_data, "preset", "ultrafast", 0);
ret = av_opt_set(c->priv_data, "tune", "zerolatency", 0);
}
avcodec_open2(c, codec, NULL)
frame = av_frame_alloc();
frame->format = c->pix_fmt;
frame->width = c->width;
frame->height = c->height;
ret = av_image_alloc(frame->data, frame->linesize, c->width, c->height,
c->pix_fmt, 32);
AVFormatContext* avfctx;
AVOutputFormat* fmt = av_guess_format("rtp", NULL, NULL);
ret = avformat_alloc_output_context2(&avfctx, fmt, fmt->name,
"rtp://127.0.0.1:49990");
printf("Writing to %s\n", avfctx->filename);
avio_open(&avfctx->pb, avfctx->filename, AVIO_FLAG_WRITE)
struct AVStream* stream = avformat_new_stream(avfctx, codec);
stream->codecpar->bit_rate = 400000;
stream->codecpar->width = 352;
stream->codecpar->height = 288;
stream->codecpar->codec_id = AV_CODEC_ID_H264;
stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
stream->time_base.num = 1;
stream->time_base.den = 25;
avformat_write_header(avfctx, NULL);
char buf[200000];
AVFormatContext *ac[] = { avfctx };
av_sdp_create(ac, 1, buf, 20000);
printf("sdp:\n%s\n", buf);
FILE* fsdp;
fopen_s(&fsdp, "test.sdp", "w");
fprintf(fsdp, "%s", buf);
fclose(fsdp);
system("PAUSE");
system("start "" \"C:\\Program Files (x86)\\VideoLAN\\VLC\\vlc.exe\" test.sdp");
int j = 0;
for (i = 0; i < 10000; i++) {
av_init_packet(&pkt);
pkt.data = NULL; // packet data will be allocated by the encoder
pkt.size = 0;
fflush(stdout);
/* prepare a dummy image */
/* Y */
for (y = 0; y < c->height; y++) {
for (x = 0; x < c->width; x++) {
frame->data[0][y * frame->linesize[0] + x] = x + y + i * 3;
}
}
/* Cb and Cr */
for (y = 0; y < c->height / 2; y++) {
for (x = 0; x < c->width / 2; x++) {
frame->data[1][y * frame->linesize[1] + x] = 128 + y + i * 2;
frame->data[2][y * frame->linesize[2] + x] = 64 + x + i * 5;
}
}
frame->pts = i;
/* encode the image */
ret = avcodec_send_frame(c, frame);
ret = avcodec_receive_packet(c, &pkt);
if (ret == AVERROR_EOF) {
got_output = false;
printf("Stream EOF\n");
} else if(ret == AVERROR(EAGAIN)) {
got_output = false;
printf("Stream EAGAIN\n");
} else {
got_output = true;
}
if (got_output) {
printf("Write frame %3d (size=%5d)\n", j++, pkt.size);
av_interleaved_write_frame(avfctx, &pkt);
av_packet_unref(&pkt);
}
Sleep(40);
}
// end
ret = avcodec_send_frame(c, NULL);
/* get the delayed frames */
for (; ; i++) {
fflush(stdout);
ret = avcodec_receive_packet(c, &pkt);
if (ret == AVERROR_EOF) {
printf("Stream EOF\n");
break;
} else if (ret == AVERROR(EAGAIN)) {
printf("Stream EAGAIN\n");
got_output = false;
} else {
got_output = true;
}
if (got_output) {
printf("Write frame %3d (size=%5d)\n", j++, pkt.size);
av_interleaved_write_frame(avfctx, &pkt);
av_packet_unref(&pkt);
}
}
avcodec_close(c);
av_free(c);
av_freep(&frame->data[0]);
av_frame_free(&frame);
printf("\n");
system("pause");
return 0;
}
However VLC (opened with the generated SDP file) isn't able to play the stream. Messages has this
core error: ES_OUT_RESET_PCR called
followed by repeated
packetizer_h264 warning: waiting for SPS/PPS
core debug: Buffering <some percent>%
What am I doing wrong?
After a few hours of digging in the ffmpeg source, I found the solution:
Don't use the CODEC_FLAG_GLOBAL_HEADER flag
Use avformat_write_header before each av_interleaved_write_frame
Now VLC plays the stream correctly
I have made a test application to transcode to vorbis format (webm container).
So far, based on FFmpeg examples, things are somewhat working, and output file plays properly, but sound in right channel is missing. I tried looking at different possibilities, but so far could not find any answer.
For reference, this is the code I am using:
#include "stdafx.h"
#define MAX_AUDIO_PACKET_SIZE (128 * 1024)
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <deque>
#include <queue>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
extern "C"
{
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/dict.h"
#include "libavutil/error.h"
#include "libavutil/opt.h"
#include <libavutil/fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
AVCodecID outputAudioFormat = AV_CODEC_ID_VORBIS;
static int sws_flags = SWS_BICUBIC;
#define STREAM_DURATION 50.0
#define STREAM_FRAME_RATE 25 /* 25 images/s */
#define STREAM_NB_FRAMES ((int)(STREAM_DURATION * STREAM_FRAME_RATE))
#define STREAM_PIX_FMT AV_PIX_FMT_YUV420P /* default pix_fmt */
AVFormatContext* fmt_ctx= NULL;
int audio_stream_index = -1;
AVCodecContext * codec_ctx_audio = NULL;
AVCodec* codec_audio = NULL;
AVFrame* decoded_frame = NULL;
uint8_t** audio_dst_data = NULL;
int got_frame = 0;
int audiobufsize = 0;
AVPacket input_packet;
int audio_dst_linesize = 0;
int audio_dst_bufsize = 0;
SwrContext * swrContext = NULL;
AVOutputFormat * output_format = NULL ;
AVFormatContext * output_fmt_ctx= NULL;
AVStream * audio_st = NULL;
AVStream* video_st = NULL;
AVCodec * audio_codec = NULL;
AVCodec* video_codec = NULL;
double audio_pts = 0.0;
AVFrame * out_frame = avcodec_alloc_frame();
int audio_input_frame_size = 64;
uint8_t * audio_data_buf = NULL;
uint8_t * audio_out = NULL;
int audio_bit_rate;
int audio_sample_rate;
int audio_channels;
int sourceSampleRate=0;
int destSampleRate = 0;
int dst_nb_samples = 0;
int pivotIndex = 0;
int max_dst_nb_samples = 0;
int samples_count=0;
int decode_packet();
int open_audio_input(char* src_filename);
int decode_frame();
int open_encoder(char* output_filename);
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id);
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
void close_audio(AVFormatContext *oc, AVStream *st);
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);
static AVFrame *frame;
static AVPicture src_picture, dst_picture;
static int frame_count;
/* Add an output stream. */
static AVStream *add_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
{
AVCodecContext *c;
AVStream *st;
/* find the encoder */
*codec = avcodec_find_encoder(codec_id);
if (!(*codec)) {
fprintf(stderr, "Could not find encoder for '%s'\n",
avcodec_get_name(codec_id));
exit(1);
}
st = avformat_new_stream(oc, *codec);
if (!st) {
fprintf(stderr, "Could not allocate stream\n");
exit(1);
}
st->id = oc->nb_streams-1;
c = st->codec;
switch ((*codec)->type) {
case AVMEDIA_TYPE_AUDIO:
c->sample_fmt = (*codec)->sample_fmts ?
(*codec)->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
c->bit_rate = 64000;
c->sample_rate = 44100;
c->channels = 2;
break;
case AVMEDIA_TYPE_VIDEO:
c->codec_id = codec_id;
c->bit_rate = 400000;
/* Resolution must be a multiple of two. */
c->width = 352;
c->height = 288;
/* timebase: This is the fundamental unit of time (in seconds) in terms
* of which frame timestamps are represented. For fixed-fps content,
* timebase should be 1/framerate and timestamp increments should be
* identical to 1. */
c->time_base.den = STREAM_FRAME_RATE;
c->time_base.num = 1;
c->gop_size = 12; /* emit one intra frame every twelve frames at most */
c->pix_fmt = STREAM_PIX_FMT;
if (c->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
/* just for testing, we also add B frames */
c->max_b_frames = 2;
}
if (c->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
/* Needed to avoid using macroblocks in which some coeffs overflow.
* This does not happen with normal video, it just happens here as
* the motion of the chroma plane does not match the luma plane. */
c->mb_decision = 2;
}
break;
default:
break;
}
/* Some formats want stream headers to be separate. */
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
c->flags |= CODEC_FLAG_GLOBAL_HEADER;
return st;
}
static void open_video(AVFormatContext *oc, AVCodec *codec, AVStream *st)
{
int ret;
AVCodecContext *c = st->codec;
/* open the codec */
ret = avcodec_open2(c, codec, NULL);
if (ret < 0) {
//fprintf(stderr, "Could not open video codec: %s\n", av_err2str(ret));
exit(1);
}
/* allocate and init a re-usable frame */
frame = av_frame_alloc();
if (!frame) {
fprintf(stderr, "Could not allocate video frame\n");
exit(1);
}
frame->format = c->pix_fmt;
frame->width = c->width;
frame->height = c->height;
/* Allocate the encoded raw picture. */
ret = avpicture_alloc(&dst_picture, c->pix_fmt, c->width, c->height);
if (ret < 0) {
//fprintf(stderr, "Could not allocate picture: %s\n", av_err2str(ret));
exit(1);
}
/* If the output format is not YUV420P, then a temporary YUV420P
* picture is needed too. It is then converted to the required
* output format. */
if (c->pix_fmt != AV_PIX_FMT_YUV420P) {
ret = avpicture_alloc(&src_picture, AV_PIX_FMT_YUV420P, c->width, c->height);
if (ret < 0) {
//fprintf(stderr, "Could not allocate temporary picture: %s\n",
// av_err2str(ret));
exit(1);
}
}
/* copy data and linesize picture pointers to frame */
*((AVPicture *)frame) = dst_picture;
}
int open_audio_input(char* src_filename)
{
int i =0;
/* open input file, and allocate format context */
if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
{
fprintf(stderr, "Could not open source file %s\n", src_filename);
exit(1);
}
// Retrieve stream information
if(avformat_find_stream_info(fmt_ctx, NULL)<0)
return -1; // Couldn't find stream information
// Dump information about file onto standard error
av_dump_format(fmt_ctx, 0, src_filename, 0);
// Find the first video stream
for(i=0; i<fmt_ctx->nb_streams; i++)
{
if(fmt_ctx->streams[i]->codec->codec_type==AVMEDIA_TYPE_AUDIO)
{
audio_stream_index=i;
break;
}
}
if ( audio_stream_index != -1 )
{
// Get a pointer to the codec context for the audio stream
codec_ctx_audio=fmt_ctx->streams[audio_stream_index]->codec;
// Find the decoder for the video stream
codec_audio=avcodec_find_decoder(codec_ctx_audio->codec_id);
if(codec_audio==NULL) {
fprintf(stderr, "Unsupported audio codec!\n");
return -1; // Codec not found
}
// Open codec
AVDictionary *codecDictOptions = NULL;
if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)
return -1; // Could not open codec
// Set up SWR context once you've got codec information
swrContext = swr_alloc();
av_opt_set_int(swrContext, "in_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swrContext, "out_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swrContext, "in_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_int(swrContext, "out_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_sample_fmt(swrContext, "in_sample_fmt", codec_ctx_audio->sample_fmt, 0);
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
av_opt_set_sample_fmt(swrContext, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
}
else
{
av_opt_set_sample_fmt(swrContext, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
}
int rv = swr_init(swrContext);
sourceSampleRate = destSampleRate = codec_ctx_audio->sample_rate;
// Allocate audio frame
if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();
int nb_planes = 0;
AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ? codec_ctx_audio->channels : 1;
int tempSize = sizeof(uint8_t *) * nb_planes;
audio_dst_data = (uint8_t**)av_mallocz(tempSize);
if (!audio_dst_data)
{
fprintf(stderr, "Could not allocate audio data buffers\n");
}
else
{
for ( int i = 0 ; i < nb_planes ; i ++ )
{
audio_dst_data[i] = NULL;
}
}
}
}
int decode_frame()
{
int rv = 0;
got_frame = 0;
if ( fmt_ctx == NULL )
{
return rv;
}
int ret = 0;
audiobufsize = 0;
rv = av_read_frame(fmt_ctx, &input_packet);
if ( rv < 0 )
{
return rv;
}
rv = decode_packet();
// Free the input_packet that was allocated by av_read_frame
av_free_packet(&input_packet);
return rv;
}
int decode_packet()
{
int rv = 0;
int ret = 0;
//audio stream?
if(input_packet.stream_index == audio_stream_index)
{
avcodec_get_frame_defaults(decoded_frame);
while( input_packet.size > 0 )
{
int result = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);
if ( result < 0)
{
fprintf(stderr, "Error decoding audio frame\n");
//return ret;
}
else
{
if ( got_frame )
{
dst_nb_samples = (int)av_rescale_rnd(swr_get_delay(swrContext, sourceSampleRate) + decoded_frame->nb_samples, sourceSampleRate, destSampleRate, AV_ROUND_UP);
if ( dst_nb_samples > max_dst_nb_samples )
{
max_dst_nb_samples = dst_nb_samples;
if ( audio_dst_data[0] )
{
av_freep(&audio_dst_data[0]);
audio_dst_data[0] = NULL;
}
}
if ( audio_dst_data[0] == NULL )
{
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, codec_ctx_audio->channels,
decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 0);
}
else
{
ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, codec_ctx_audio->channels,
decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_S16, 0);
}
}
/* TODO: extend return code of the av_samples_* functions so that this call is not needed */
int resampled = swr_convert(swrContext, audio_dst_data, out_frame->nb_samples,
(const uint8_t **)(decoded_frame->extended_data), decoded_frame->nb_samples);
char str[900]="";
sprintf(str,"out_frame->nb_samples:\t%d; decoded_frame->nb_samples:\t%d",out_frame->nb_samples,decoded_frame->nb_samples );
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
audio_dst_bufsize = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 1);
}
else
{
audio_dst_bufsize = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_S16, 1);
}
input_packet.size -= result;
input_packet.data += result;
}
else
{
input_packet.size = 0;
input_packet.data = NULL;
}
}
}
}
return rv;
}
int open_encoder(char* output_filename )
{
int rv = 0;
/* allocate the output media context */
AVOutputFormat *opfmt = NULL;
avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);
if (!output_fmt_ctx) {
printf("Could not deduce output format from file extension: using MPEG.\n");
avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);
}
if (!output_fmt_ctx) {
rv = -1;
}
else
{
output_format = output_fmt_ctx->oformat;
}
/* Add the audio stream using the default format codecs
* and initialize the codecs. */
audio_st = NULL;
if ( output_fmt_ctx )
{
if (output_format->audio_codec != AV_CODEC_ID_NONE)
{
audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);
}
/* Now that all the parameters are set, we can open the audio and
* video codecs and allocate the necessary encode buffers. */
if (audio_st)
{
rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
if ( rv < 0 ) return rv;
}
av_dump_format(output_fmt_ctx, 0, output_filename, 1);
/* open the output file, if needed */
if (!(output_format->flags & AVFMT_NOFILE))
{
if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {
fprintf(stderr, "Could not open '%s'\n", output_filename);
rv = -1;
}
else
{
/* Write the stream header, if any. */
if (avformat_write_header(output_fmt_ctx, NULL) < 0)
{
fprintf(stderr, "Error occurred when opening output file\n");
rv = -1;
}
}
}
}
return rv;
}
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
{
AVCodecContext *c;
AVStream *st;
/* find the audio encoder */
*codec = avcodec_find_encoder(codec_id);
if (!(*codec)) {
fprintf(stderr, "Could not find codec\n");
exit(1);
}
st = avformat_new_stream(oc, *codec);
if (!st) {
fprintf(stderr, "Could not allocate stream\n");
exit(1);
}
st->id = 1;
c = st->codec;
/* put sample parameters */
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
c->sample_fmt = AV_SAMPLE_FMT_FLTP;
}
else
{
c->sample_fmt = AV_SAMPLE_FMT_S16;
}
c->bit_rate = audio_bit_rate;
c->sample_rate = audio_sample_rate;
c->channels = audio_channels;
// some formats want stream headers to be separate
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
c->flags |= CODEC_FLAG_GLOBAL_HEADER;
return st;
}
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
{
int ret=0;
AVCodecContext *c;
st->duration = fmt_ctx->duration;
c = st->codec;
/* open it */
ret = avcodec_open2(c, codec, NULL) ;
if ( ret < 0)
{
fprintf(stderr, "could not open codec\n");
return -1;
//exit(1);
}
if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
audio_input_frame_size = 10000;
else
audio_input_frame_size = c->frame_size;
out_frame->nb_samples = audio_input_frame_size;
int tempSize = audio_input_frame_size *
av_get_bytes_per_sample(c->sample_fmt) *
c->channels;
return ret;
}
void close_audio(AVFormatContext *oc, AVStream *st)
{
avcodec_close(st->codec);
}
void write_audio_frame(uint8_t ** audio_dst_data, int audio_dst_bufsize)
{
AVFormatContext *oc = output_fmt_ctx;
AVStream *st = audio_st;
if ( oc == NULL || st == NULL ) return;
AVCodecContext *c;
AVPacket pkt = { 0 }; // data and size must be 0;
int got_packet=0, ret=0;
av_init_packet(&pkt);
c = st->codec;
out_frame->nb_samples = audio_input_frame_size;
AVRational r;
r.num = 1;
r.den = c->sample_rate;
out_frame->pts = av_rescale_q(samples_count, (AVRational)r, c->time_base);
avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
audio_dst_data[0], audio_dst_bufsize, 0);
samples_count += out_frame->nb_samples;
ret = avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
if (ret < 0)
{
return;
}
if (!got_packet)
return;
/* rescale output packet timestamp values from codec to stream timebase */
pkt.pts = av_rescale_q_rnd(pkt.pts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
pkt.dts = av_rescale_q_rnd(pkt.dts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
pkt.duration = av_rescale_q(pkt.duration, c->time_base, st->time_base);
pkt.stream_index = st->index;
char str[999]="";
sprintf(str,"out_frame->nb_samples:\t%d",out_frame->nb_samples);
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(oc, &pkt);
if (ret != 0)
{
exit(1);
}
av_free_packet(&pkt);
}
void write_delayed_frames(AVFormatContext *oc, AVStream *st)
{
AVCodecContext *c = st->codec;
int got_output = 0;
int ret = 0;
AVPacket pkt;
pkt.data = NULL;
pkt.size = 0;
av_init_packet(&pkt);
int i = 0;
for (got_output = 1; got_output; i++)
{
ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
if (ret < 0)
{
fprintf(stderr, "error encoding frame\n");
exit(1);
}
static int64_t tempPts = 0;
static int64_t tempDts = 0;
/* If size is zero, it means the image was buffered. */
if (got_output)
{
pkt.pts = av_rescale_q_rnd(pkt.pts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
pkt.dts = av_rescale_q_rnd(pkt.dts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
pkt.duration = av_rescale_q(pkt.duration, c->time_base, st->time_base);
pkt.stream_index = st->index;
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(oc, &pkt);
}
else
{
ret = 0;
}
av_free_packet(&pkt);
}
}
int main(int argc, char **argv)
{
/* register all formats and codecs */
av_register_all();
avcodec_register_all();
avformat_network_init();
avdevice_register_all();
int i =0;
int ret=0;
char src_filename[90] = "test.mp2";
char dst_filename[90] = "output.webm";
outputAudioFormat = AV_CODEC_ID_VORBIS;
open_audio_input(src_filename);
if ( codec_ctx_audio->bit_rate == 0 ) codec_ctx_audio->bit_rate = 112000;
audio_bit_rate = codec_ctx_audio->bit_rate;
audio_sample_rate = codec_ctx_audio->sample_rate;
audio_channels = codec_ctx_audio->channels;
open_encoder( dst_filename );
int frames= 0;
while(1)
{
int rv = decode_frame();
if ( rv < 0 )
{
break;
}
if (audio_st)
{
audio_pts = audio_st->pts.val * av_q2d(audio_st->time_base);
}
else
{
audio_pts = 0.0;
}
if ( codec_ctx_audio )
{
if ( got_frame )
{
write_audio_frame( audio_dst_data, audio_dst_bufsize );
frames++;
}
}
printf("\naudio_pts: %f", audio_pts);
}
while(1)
{
dst_nb_samples = (int)av_rescale_rnd(swr_get_delay(swrContext, sourceSampleRate) + decoded_frame->nb_samples, sourceSampleRate, destSampleRate, AV_ROUND_UP);
if ( dst_nb_samples > max_dst_nb_samples )
{
max_dst_nb_samples = dst_nb_samples;
if ( audio_dst_data[0] )
{
av_freep(&audio_dst_data[0]);
audio_dst_data[0] = NULL;
}
}
if ( audio_dst_data[0] == NULL )
{
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
ret = av_samples_alloc(audio_dst_data, NULL, codec_ctx_audio->channels,
decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 0);
}
else
{
ret = av_samples_alloc(audio_dst_data, NULL, codec_ctx_audio->channels,
decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_S16, 0);
}
}
int resampled = swr_convert(swrContext, audio_dst_data, out_frame->nb_samples,NULL, 0);
if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
{
audio_dst_bufsize = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 1);
}
else
{
audio_dst_bufsize = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_S16, 1);
}
if ( audio_dst_bufsize <= 0 ) break;
audio_pts = audio_st->pts.val * av_q2d(audio_st->time_base);
printf("\naudio_pts: %f", audio_pts);
write_audio_frame( audio_dst_data, audio_dst_bufsize );
}
write_delayed_frames( output_fmt_ctx, audio_st );
av_write_trailer(output_fmt_ctx);
close_audio( output_fmt_ctx, audio_st);
swr_free(&swrContext);
avcodec_free_frame(&out_frame);
getch();
return 0;
}
Working under Windows 7, Zeranoe FFmpeg 32 bit build:
libavutil 52. 62.100 / 52. 62.100
libavcodec 55. 47.101 / 55. 47.101
libavformat 55. 22.103 / 55. 22.103
libavdevice 55. 5.102 / 55. 5.102
libavfilter 4. 1.100 / 4. 1.100
libswscale 2. 5.101 / 2. 5.101
libswresample 0. 17.104 / 0. 17.104
libpostproc 52. 3.100 / 52. 3.100
Could anyone point to the place where I might be misunderstanding things?
Thanks for any guidance in advance!
Most likely the resampler isn't initialized or used correctly. Could you change it the way I'm using it here: https://sourceforge.net/p/karlyriceditor/code/HEAD/tree/src/ffmpegvideoencoder.cpp ?
I think I finally found the solution. Resampling sample that comes with FFmpeg (with at least the one I have) could be misleading - probably needs to be corrected. Even according to documentation of swr_convert, audio_dst_data can be a big buffer to avoid buffering:
* If more input is provided than output space then the input will be buffered.
* You can avoid this buffering by providing more output space than input.
* Convertion will run directly without copying whenever possible.
This statement could be incorrect (theoretically and in working has no obvious errors, but sometimes results in awkward behavior as I have discovered).
My solution: do not let audio_dst_data buffer size exceed output codec's frame size - then it works perfectly.
Maybe someone would fix swresample library, or resampling example, or, at least document it more clearly.
I have written a small program to convert webm (vorbis) audio to aac format, using FFmpeg libraries - C++ (on Windows using 32 bit Zeranoe FFmpeg builds). After writing this program, I find it is sometimes converting files as per expectation, and at other times, results in larger duration files, and audio playback is broken/awkward as well.
This code appears to be working fine for mp3, which also uses FLTP format (same as vorbis), so technically both look similar.
Please see below sample code I am using:
////////////////////////////////////////////////
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <deque>
#include <queue>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
extern "C"
{
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/dict.h"
#include "libavutil/error.h"
#include "libavutil/opt.h"
#include <libavutil/fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
AVFormatContext* fmt_ctx= NULL;
int audio_stream_index = -1;
AVCodecContext * codec_ctx_audio = NULL;
AVCodec* codec_audio = NULL;
AVFrame* decoded_frame = NULL;
uint8_t** audio_dst_data = NULL;
int got_frame = 0;
int audiobufsize = 0;
AVPacket input_packet;
int audio_dst_linesize = 0;
int audio_dst_bufsize = 0;
SwrContext * swr = NULL;
AVOutputFormat * output_format = NULL ;
AVFormatContext * output_fmt_ctx= NULL;
AVStream * audio_st = NULL;
AVCodec * audio_codec = NULL;
double audio_pts = 0.0;
AVFrame * out_frame = avcodec_alloc_frame();
int audio_input_frame_size = 0;
uint8_t * audio_data_buf = NULL;
uint8_t * audio_out = NULL;
int audio_bit_rate;
int audio_sample_rate;
int audio_channels;
int decode_packet();
int open_audio_input(char* src_filename);
int decode_frame();
int open_encoder(char* output_filename);
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id);
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
void close_audio(AVFormatContext *oc, AVStream *st);
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);
int open_audio_input(char* src_filename)
{
int i =0;
/* open input file, and allocate format context */
if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
{
fprintf(stderr, "Could not open source file %s\n", src_filename);
exit(1);
}
// Retrieve stream information
if(avformat_find_stream_info(fmt_ctx, NULL)<0)
return -1; // Couldn't find stream information
// Dump information about file onto standard error
av_dump_format(fmt_ctx, 0, src_filename, 0);
// Find the first video stream
for(i=0; i<fmt_ctx->nb_streams; i++)
{
if(fmt_ctx->streams[i]->codec->codec_type==AVMEDIA_TYPE_AUDIO)
{
audio_stream_index=i;
break;
}
}
if ( audio_stream_index != -1 )
{
// Get a pointer to the codec context for the audio stream
codec_ctx_audio=fmt_ctx->streams[audio_stream_index]->codec;
// Find the decoder for the video stream
codec_audio=avcodec_find_decoder(codec_ctx_audio->codec_id);
if(codec_audio==NULL) {
fprintf(stderr, "Unsupported audio codec!\n");
return -1; // Codec not found
}
// Open codec
AVDictionary *codecDictOptions = NULL;
if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)
return -1; // Could not open codec
// Set up SWR context once you've got codec information
swr = swr_alloc();
av_opt_set_int(swr, "in_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "out_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "in_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_int(swr, "out_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec_ctx_audio->sample_fmt, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
swr_init(swr);
// Allocate audio frame
if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();
int nb_planes = 0;
AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ? codec_ctx_audio->channels : 1;
int tempSize = sizeof(uint8_t *) * nb_planes;
audio_dst_data = (uint8_t**)av_mallocz(tempSize);
if (!audio_dst_data)
{
fprintf(stderr, "Could not allocate audio data buffers\n");
}
else
{
for ( int i = 0 ; i < nb_planes ; i ++ )
{
audio_dst_data[i] = NULL;
}
}
}
}
int decode_frame()
{
int rv = 0;
got_frame = 0;
if ( fmt_ctx == NULL )
{
return rv;
}
int ret = 0;
audiobufsize = 0;
rv = av_read_frame(fmt_ctx, &input_packet);
if ( rv < 0 )
{
return rv;
}
rv = decode_packet();
// Free the input_packet that was allocated by av_read_frame
av_free_packet(&input_packet);
return rv;
}
int decode_packet()
{
int rv = 0;
int ret = 0;
//audio stream?
if(input_packet.stream_index == audio_stream_index)
{
/* decode audio frame */
rv = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);
if (rv < 0)
{
fprintf(stderr, "Error decoding audio frame\n");
//return ret;
}
else
{
if (got_frame)
{
if ( audio_dst_data[0] == NULL )
{
ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, decoded_frame->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
if (ret < 0)
{
fprintf(stderr, "Could not allocate audio buffer\n");
return AVERROR(ENOMEM);
}
/* TODO: extend return code of the av_samples_* functions so that this call is not needed */
audio_dst_bufsize = av_samples_get_buffer_size(NULL, audio_st->codec->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
//int16_t* outputBuffer = ...;
swr_convert(swr, audio_dst_data, out_frame->nb_samples,
(const uint8_t **)(decoded_frame->data), decoded_frame->nb_samples);
//swr_convert( swr, audio_dst_data, out_frame->nb_samples, (const uint8_t**) decoded_frame->extended_data, decoded_frame->nb_samples );
}
/* copy audio data to destination buffer:
* this is required since rawaudio expects non aligned data */
//av_samples_copy(audio_dst_data, decoded_frame->data, 0, 0,
// decoded_frame->nb_samples, decoded_frame->channels, (AVSampleFormat)decoded_frame->format);
}
}
}
return rv;
}
int open_encoder(char* output_filename )
{
int rv = 0;
/* allocate the output media context */
AVOutputFormat *opfmt = NULL;
avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);
if (!output_fmt_ctx) {
printf("Could not deduce output format from file extension: using MPEG.\n");
avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);
}
if (!output_fmt_ctx) {
rv = -1;
}
else
{
output_format = output_fmt_ctx->oformat;
}
/* Add the audio stream using the default format codecs
* and initialize the codecs. */
audio_st = NULL;
if ( output_fmt_ctx )
{
if (output_format->audio_codec != AV_CODEC_ID_NONE)
{
audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);
}
/* Now that all the parameters are set, we can open the audio and
* video codecs and allocate the necessary encode buffers. */
if (audio_st)
{
rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
if ( rv < 0 ) return rv;
}
av_dump_format(output_fmt_ctx, 0, output_filename, 1);
/* open the output file, if needed */
if (!(output_format->flags & AVFMT_NOFILE))
{
if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {
fprintf(stderr, "Could not open '%s'\n", output_filename);
rv = -1;
}
else
{
/* Write the stream header, if any. */
if (avformat_write_header(output_fmt_ctx, NULL) < 0)
{
fprintf(stderr, "Error occurred when opening output file\n");
rv = -1;
}
}
}
}
return rv;
}
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
{
AVCodecContext *c;
AVStream *st;
/* find the audio encoder */
*codec = avcodec_find_encoder(codec_id);
if (!(*codec)) {
fprintf(stderr, "Could not find codec\n");
exit(1);
}
st = avformat_new_stream(oc, *codec);
if (!st) {
fprintf(stderr, "Could not allocate stream\n");
exit(1);
}
st->id = 1;
c = st->codec;
/* put sample parameters */
c->sample_fmt = AV_SAMPLE_FMT_S16;
c->bit_rate = audio_bit_rate;
c->sample_rate = audio_sample_rate;
c->channels = audio_channels;
// some formats want stream headers to be separate
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
c->flags |= CODEC_FLAG_GLOBAL_HEADER;
return st;
}
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
{
int ret=0;
AVCodecContext *c;
st->duration = fmt_ctx->duration;
c = st->codec;
/* open it */
ret = avcodec_open2(c, codec, NULL) ;
if ( ret < 0)
{
fprintf(stderr, "could not open codec\n");
return -1;
//exit(1);
}
if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
audio_input_frame_size = 10000;
else
audio_input_frame_size = c->frame_size;
int tempSize = audio_input_frame_size *
av_get_bytes_per_sample(c->sample_fmt) *
c->channels;
return ret;
}
void close_audio(AVFormatContext *oc, AVStream *st)
{
avcodec_close(st->codec);
}
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize)
{
AVFormatContext *oc = output_fmt_ctx;
AVStream *st = audio_st;
if ( oc == NULL || st == NULL ) return;
AVCodecContext *c;
AVPacket pkt = { 0 }; // data and size must be 0;
int got_packet;
av_init_packet(&pkt);
c = st->codec;
out_frame->nb_samples = audio_input_frame_size;
int buf_size = audio_src_bufsize *
av_get_bytes_per_sample(c->sample_fmt) *
c->channels;
avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
(uint8_t *) *audio_src_data,
buf_size, 1);
avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
if (!got_packet)
{
}
else
{
if (pkt.pts != AV_NOPTS_VALUE)
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);
if (pkt.dts != AV_NOPTS_VALUE)
pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
pkt.stream_index = st->index;
pkt.flags |= AV_PKT_FLAG_KEY;
/* Write the compressed frame to the media file. */
if (av_interleaved_write_frame(oc, &pkt) != 0)
{
fprintf(stderr, "Error while writing audio frame\n");
exit(1);
}
}
av_free_packet(&pkt);
}
void write_delayed_frames(AVFormatContext *oc, AVStream *st)
{
AVCodecContext *c = st->codec;
int got_output = 0;
int ret = 0;
AVPacket pkt;
pkt.data = NULL;
pkt.size = 0;
av_init_packet(&pkt);
int i = 0;
for (got_output = 1; got_output; i++)
{
ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
if (ret < 0)
{
fprintf(stderr, "error encoding frame\n");
exit(1);
}
static int64_t tempPts = 0;
static int64_t tempDts = 0;
/* If size is zero, it means the image was buffered. */
if (got_output)
{
if (pkt.pts != AV_NOPTS_VALUE)
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);
if (pkt.dts != AV_NOPTS_VALUE)
pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
pkt.stream_index = st->index;
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(oc, &pkt);
}
else
{
ret = 0;
}
av_free_packet(&pkt);
}
}
int main(int argc, char **argv)
{
/* register all formats and codecs */
av_register_all();
avcodec_register_all();
avformat_network_init();
avdevice_register_all();
int i =0;
int ret=0;
char src_filename[90] = "test_a.webm";
char dst_filename[90] = "output.aac";
open_audio_input(src_filename);
if ( codec_ctx_audio->bit_rate == 0 ) codec_ctx_audio->bit_rate = 112000;
audio_bit_rate = codec_ctx_audio->bit_rate;
audio_sample_rate = codec_ctx_audio->sample_rate;
audio_channels = codec_ctx_audio->channels;
open_encoder( dst_filename );
while(1)
{
int rv = decode_frame();
if ( rv < 0 )
{
break;
}
if (audio_st)
{
audio_pts = (double)audio_st->pts.val * audio_st->time_base.num /
audio_st->time_base.den;
}
else
{
audio_pts = 0.0;
}
if ( codec_ctx_audio )
{
if ( got_frame)
{
write_audio_frame( audio_dst_data, audio_dst_bufsize );
}
}
if ( audio_dst_data[0] )
{
av_freep(&audio_dst_data[0]);
audio_dst_data[0] = NULL;
}
printf("\naudio_pts: %.3f", audio_pts);
}
while(1)
{
if ( audio_dst_data && audio_dst_data[0] )
{
av_freep(&audio_dst_data[0]);
audio_dst_data[0] = NULL;
}
ret = av_samples_alloc(audio_dst_data, NULL, codec_ctx_audio->channels,
decoded_frame->nb_samples, AV_SAMPLE_FMT_S16, 0);
ret = swr_convert(swr, audio_dst_data, out_frame->nb_samples,NULL, 0);
if ( ret <= 0 ) break;
write_audio_frame( audio_dst_data, audio_dst_bufsize );
}
write_delayed_frames( output_fmt_ctx, audio_st );
av_write_trailer(output_fmt_ctx);
close_audio( output_fmt_ctx, audio_st);
swr_free(&swr);
avcodec_free_frame(&out_frame);
getch();
return 0;
}
"test_a.webm" input file results in longer duration (40 second output), and if I change it to "jet.webm", it is converted fine.
Both input files are approximately 18 second duration.
For reference, these files can be downloaded from links below:
http://www.filedropper.com/testa ,
http://www.filedropper.com/jet
Alternatively, they are zipped and uploaded elsewhere as well:
http://www.files.com/shared/52c3eefe990ea/test_audio_files.zip
Could someone kindly guide on what I am doing wrong here?
Thanks in advance...
p.s. These files are taken/extracted from different online sources/demos.
Edit 2-1-14: After debugging, I can see audio_pts variable is being populated incorrectly. It relies on audio_st->pts.val, which is automatically calculated upon calling av_interleaved_write_frame() function. I cannot step inside av_interleaved_write_frame() function since I am on Windows, using libav dlls/libs.
So,
For jet.webm file (whose conversion is fine), audio_st->pts.val goes till maximum: 1665567, and audio_pts becomes:
1665567*1/90000 = 18.5063
For test_a.webm file (whose conversion is bad), audio_st->pts.val goes till maximum: 3606988, and audio_pts becomes:
3606988*1/90000 = 40.0776
reference line: audio_pts = (double)audio_st->pts.val * audio_st->time_base.num /
audio_st->time_base.den;
Since PTS is very off, it shouldn't be playing fine logically as well. But I cannot say av_interleaved_write_frame() function is doing it wrong; surely something cleaner can be managed on my end.
Edit 3-1-14: Discovered one more thing: while reading jet.webm file, decoded frame's nb_sample are always 1024 (except for 1st frame: 576), but in case of test_a.webm file, nb_samples are either 1024, or 128, with exceptions of 576 (less frequent). If I ignore writing of frame when nb_sample is 128, I get approximately same file length in the end, but you can hear bits and pieces are missing here and there.
How can I deal with this?
I got distorted image when try to convert YUV420p to RGB24 using
sws_scale.
Code:
ret = avcodec_decode_video2(video_dec_ctx, frame, got_frame, &pkt);
if (ret < 0) {
fprintf(stderr, "Error decoding video frame\n");
return ret;
}
if (*got_frame)
{
printf("video_frame%s n:%d coded_n:%d pts:%s\n",
cached ? "(cached)" : "",
video_frame_count++, frame->coded_picture_number,
"#"/*av_ts2timestr(frame->pts, &video_dec_ctx->time_base)*/);
/* copy decoded frame to destination buffer:
* this is required since rawvideo expects non aligned data */
av_image_copy(video_dst_data, video_dst_linesize,
(const uint8_t **)(frame->data), frame->linesize,
video_dec_ctx->pix_fmt, video_dec_ctx->width, video_dec_ctx->height);
/* write to rawvideo file */
fwrite(video_dst_data[0], 1, video_dst_bufsize, video_dst_file);
AVPicture pic;
avpicture_alloc( &pic, AV_PIX_FMT_RGB24, frame->width, frame->height);
SwsContext *ctxt = sws_getContext(frame->width, frame->height, static_cast<AVPixelFormat>(frame->format),
frame->width, frame->height, AV_PIX_FMT_RGB24, SWS_BILINEAR, NULL, NULL, NULL);
if ( NULL == ctxt )
{
//Log("failed to get sws context");
}
if ( 0 < sws_scale(ctxt, frame->data, frame->linesize, 0, frame->height, pic.data, pic.linesize))
{
char szPic[256] = { 0 };
sprintf( szPic, "decoded/%d.bmp", video_frame_count );
FILE *pf = fopen(szPic,"w");
if ( NULL != pf )
{
BITMAPFILEHEADER bmpFileHeader = {0};
bmpFileHeader.bfReserved1 = 0;
bmpFileHeader.bfReserved2 = 0;
bmpFileHeader.bfType = 0x4D42;
bmpFileHeader.bfSize = sizeof(bmpFileHeader) + sizeof(BITMAPINFOHEADER) + pic.linesize[0] * frame->height;
bmpFileHeader.bfOffBits = sizeof(bmpFileHeader) + sizeof(BITMAPINFOHEADER);
BITMAPINFOHEADER bmiHeader = { 0 };
bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
bmiHeader.biWidth = frame->width;
bmiHeader.biHeight = 0 - frame->height;
bmiHeader.biPlanes = 1;
bmiHeader.biBitCount = 24;
bmiHeader.biCompression = BI_RGB;
bmiHeader.biSizeImage = pic.linesize[0] * frame->height;
bmiHeader.biXPelsPerMeter = 0;
bmiHeader.biYPelsPerMeter = 0;
bmiHeader.biClrUsed = 0;
bmiHeader.biClrImportant = 0;
fwrite( &bmpFileHeader, 1, sizeof(bmpFileHeader), pf );
fwrite( &bmiHeader, 1, sizeof(bmiHeader), pf );
fwrite( pic.data[0], 1, pic.linesize[0] * frame->height, pf );
fclose( pf );
}
}
// pic.data[0] now contains the image data in RGB format (3 bytes)
// and pic.linesize[0] is the pitch of the data (ie. size of a row in memory, which can be larger than width*sizeof(pixel))
avpicture_free(&pic);
sws_freeContext(ctxt);
}
above only decode frame then convert this from to RGB24, then write a bitmap.
original video frame like this,
but converted image,
is there missing some code or some code is wrong?
thanks in advance.
fwrite( pic.data[0], 1, pic.linesize[0] * frame->height, pf );
For an image of e.g. 1280x720, linesize is typically larger, e.g. 1312, so you'll be writing more data than image size if you write linesize*height. You want to write (in a loop) width pixels offset by linesize bytes:
uint8_t *ptr = pic.data[0];
for (int y = 0; y < frame->height; y++) {
fwrite(ptr, 1, frame->width, pf);
ptr += pic.linesize[0];
}
And then it should work correctly.
maybe these codes can help you. these works good.
int got_frame = 0;
auto len = avcodec_decode_video2(m_avCodecContext
, m_avFrame
, &got_frame
, &avpkt);
if (len < 0)
{
return;
}
if (got_frame /*&& !silentMode*/)
{
//if (videoRenderer != nullptr)
{
if (frameSize == NULL)
{
return;
}
uint8_t *dst_data[4];
int dst_linesize[4];
int dst_w, dst_h;
int ret = 0;
if (1)// avcodec_alloc_frame()
{
auto stride = m_avFrame->linesize;
auto scan0 = m_avFrame->data;
SwsContext *scaleContext = sws_getContext(m_avCodecContext->width
, m_avCodecContext->height
, m_avCodecContext->pix_fmt
, m_avCodecContext->width
, m_avCodecContext->height
, PixelFormat::PIX_FMT_BGR24
, SWS_FAST_BILINEAR, NULL, NULL, NULL);
if (scaleContext == NULL)
{
//TODO: log error
return;
}
try
{
//*vb->signal = 1;
ret = avpicture_alloc(&m_dst_picture
, PixelFormat::PIX_FMT_BGR24
, m_avCodecContext->width
, m_avCodecContext->height);
// AVFrame *picture_RGB;
// uint8_t *bufferRGB;
// picture_RGB = avcodec_alloc_frame();
// bufferRGB = (uint8_t*)malloc(720*576*(24/8)/*avpicture_get_size(PIX_FMT_RGB24, 720, 576)*/);
// avpicture_fill((AVPicture *)picture_RGB, bufferRGB, PIX_FMT_RGB24, 720, 576);
if (ret < 0)
{
return;
}
int retScale = sws_scale(scaleContext
, scan0
, stride
, 0
, m_avCodecContext->height
, m_dst_picture.data //picture_RGB->data
, m_dst_picture.linesize //picture_RGB->linesize
);
if (1)
{
HWND hwnd = m_pParent->GetSafeHwnd();
SetFocus(hwnd);
CRect rc;
m_pParent->GetClientRect(rc);
CDC *cdc = m_pParent->GetDC();
char* bitmap = (char*)m_dst_picture.data[0];
// static unsigned int i = 0;
// bmp_save(bitmap, m_avCodecContext->width, m_avCodecContext->height, i++);
BITMAPINFO bmpinfo;
bmpinfo.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
bmpinfo.bmiHeader.biWidth = m_avCodecContext->width;
bmpinfo.bmiHeader.biHeight = -m_avCodecContext->height;
bmpinfo.bmiHeader.biPlanes = 1;
bmpinfo.bmiHeader.biBitCount = 24;
bmpinfo.bmiHeader.biCompression = BI_RGB;
bmpinfo.bmiHeader.biSizeImage =
m_avCodecContext->width * m_avCodecContext->height * (24 / 8);
bmpinfo.bmiHeader.biXPelsPerMeter = 100;
bmpinfo.bmiHeader.biYPelsPerMeter = 100;
bmpinfo.bmiHeader.biClrUsed = 0;
bmpinfo.bmiHeader.biClrImportant = 0;
HBITMAP hBitmap = CreateDIBitmap(cdc->GetSafeHdc(), &bmpinfo.bmiHeader, CBM_INIT, bitmap, &bmpinfo/*bi*/, DIB_RGB_COLORS);
DrawBitmap(cdc, hBitmap, m_pParent);
::DeleteObject(hBitmap);
::DeleteObject(cdc->GetSafeHdc());
}
avpicture_free(&m_dst_picture);
sws_freeContext(scaleContext);
}
catch (int e)
{
sws_freeContext(scaleContext);
}
}
}
}
void DrawBitmap(CDC *pDC, HBITMAP hbitmap,CWnd *wnd)
{
CBitmap *pBitmap = CBitmap::FromHandle(hbitmap);
BITMAP bm;
pBitmap -> GetBitmap(&bm);
CDC MemDC;
MemDC.CreateCompatibleDC(pDC);
HGDIOBJ gob= MemDC.SelectObject(pBitmap);
CRect rc;
wnd->GetClientRect(rc);
pDC->SetStretchBltMode( COLORONCOLOR);
pDC->StretchBlt(0, 0,rc.Width(),rc.Height() , &MemDC, 0, 0, bm.bmWidth, bm.bmHeight, SRCCOPY);
MemDC.SelectObject(gob);
DeleteObject(pBitmap);
DeleteObject(MemDC);
DeleteObject(&bm);
ReleaseDC(wnd->GetSafeHwnd(), MemDC);
}
void initDecoder()
{
m_avCodecContext = avcodec_alloc_context();
if (!m_avCodecContext)
{
//failed to allocate codec context
Cleanup();
return;
}
m_avCodecContext->flags = 0;
uint8_t startCode[] = { 0x00, 0x00, 0x01 };
//////////////////////////////////////////////////////////////////////////
//I thought for play live video you can comment these lines.
if (m_sProps != NULL)
{
// USES_CONVERSION;
// ::MessageBox(NULL, A2T(sprops), TEXT("sprops"), MB_OK);
unsigned spropCount;
SPropRecord* spropRecords = parseSPropParameterSets(m_sProps, spropCount);
try
{
for (unsigned i = 0; i < spropCount; ++i)
{
AddExtraData(startCode, sizeof(startCode));
AddExtraData(spropRecords[i].sPropBytes, spropRecords[i].sPropLength);
}
}
catch (void*)
{
//extradata exceeds size limit
delete[] spropRecords;
Cleanup();
return;
}
delete[] spropRecords;
m_avCodecContext->extradata = extraDataBuffer;
m_avCodecContext->extradata_size = extraDataSize;
}
AddExtraData(startCode, sizeof(startCode));
bInitEx = true;
av_register_all();
avcodec_register_all();
m_codecId = CODEC_ID_H264;
m_avCodec = avcodec_find_decoder(m_codecId);
if (m_avCodec == NULL)
{
return;
}
if (avcodec_open(m_avCodecContext, m_avCodec) < 0)
{
//failed to open codec
Cleanup();
return;
}
if (m_avCodecContext->codec_id == CODEC_ID_H264)
{
m_avCodecContext->flags2 |= CODEC_FLAG2_CHUNKS;
//avCodecContext->flags2 |= CODEC_FLAG2_SHOW_ALL;
}
m_avFrame = avcodec_alloc_frame();
if (!m_avFrame)
{
//failed to allocate frame
Cleanup();
return;
}
}
I have made a small application to extract audio from an mp4 file, or simply convert an existing audio file to AAC/mp4 format (both raw AAC, or inside mp4 container). I have run this application with existing mp4 files as input, and it properly extracts audio, and encodes to mp4 (audio only:AAC), or even directly in AAC format (i.e. test.aac also works). But when I tried running it on mp3 files, output clip plays faster than it should be (a clip of 1:12 seconds plays back till 1:05 seconds only).
Edit: I have made improvements in code - now, it no longer plays back faster, but is still only converted till 1:05 seconds, remaining clip is missing (this is about 89% conversion done, and remaining 11% remaining).
Here is the code I have written to achieve this:
////////////////////////////////////////////////
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <deque>
#include <queue>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
extern "C"
{
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/dict.h"
#include "libavutil/error.h"
#include "libavutil/opt.h"
#include <libavutil/fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
AVFormatContext* fmt_ctx= NULL;
int audio_stream_index = -1;
AVCodecContext * codec_ctx_audio = NULL;
AVCodec* codec_audio = NULL;
AVFrame* decoded_frame = NULL;
uint8_t** audio_dst_data = NULL;
int got_frame = 0;
int audiobufsize = 0;
AVPacket input_packet;
int audio_dst_linesize = 0;
int audio_dst_bufsize = 0;
SwrContext * swr = NULL;
AVOutputFormat * output_format = NULL ;
AVFormatContext * output_fmt_ctx= NULL;
AVStream * audio_st = NULL;
AVCodec * audio_codec = NULL;
double audio_pts = 0.0;
AVFrame * out_frame = avcodec_alloc_frame();
int audio_input_frame_size = 0;
uint8_t * audio_data_buf = NULL;
uint8_t * audio_out = NULL;
int audio_bit_rate;
int audio_sample_rate;
int audio_channels;
int decode_packet();
int open_audio_input(char* src_filename);
int decode_frame();
int open_encoder(char* output_filename);
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id);
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
void close_audio(AVFormatContext *oc, AVStream *st);
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);
int open_audio_input(char* src_filename)
{
int i =0;
/* open input file, and allocate format context */
if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
{
fprintf(stderr, "Could not open source file %s\n", src_filename);
exit(1);
}
// Retrieve stream information
if(avformat_find_stream_info(fmt_ctx, NULL)<0)
return -1; // Couldn't find stream information
// Dump information about file onto standard error
av_dump_format(fmt_ctx, 0, src_filename, 0);
// Find the first video stream
for(i=0; i<fmt_ctx->nb_streams; i++)
{
if(fmt_ctx->streams[i]->codec->codec_type==AVMEDIA_TYPE_AUDIO)
{
audio_stream_index=i;
break;
}
}
if ( audio_stream_index != -1 )
{
// Get a pointer to the codec context for the audio stream
codec_ctx_audio=fmt_ctx->streams[audio_stream_index]->codec;
// Find the decoder for the video stream
codec_audio=avcodec_find_decoder(codec_ctx_audio->codec_id);
if(codec_audio==NULL) {
fprintf(stderr, "Unsupported audio codec!\n");
return -1; // Codec not found
}
// Open codec
AVDictionary *codecDictOptions = NULL;
if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)
return -1; // Could not open codec
// Set up SWR context once you've got codec information
swr = swr_alloc();
av_opt_set_int(swr, "in_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "out_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "in_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_int(swr, "out_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec_ctx_audio->sample_fmt, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
swr_init(swr);
// Allocate audio frame
if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();
int nb_planes = 0;
AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ? codec_ctx_audio->channels : 1;
int tempSize = sizeof(uint8_t *) * nb_planes;
audio_dst_data = (uint8_t**)av_mallocz(tempSize);
if (!audio_dst_data)
{
fprintf(stderr, "Could not allocate audio data buffers\n");
}
else
{
for ( int i = 0 ; i < nb_planes ; i ++ )
{
audio_dst_data[i] = NULL;
}
}
}
}
int decode_frame()
{
int rv = 0;
got_frame = 0;
if ( fmt_ctx == NULL )
{
return rv;
}
int ret = 0;
audiobufsize = 0;
rv = av_read_frame(fmt_ctx, &input_packet);
if ( rv < 0 )
{
return rv;
}
rv = decode_packet();
// Free the input_packet that was allocated by av_read_frame
//av_free_packet(&input_packet);
return rv;
}
int decode_packet()
{
int rv = 0;
int ret = 0;
//audio stream?
if(input_packet.stream_index == audio_stream_index)
{
/* decode audio frame */
rv = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);
if (rv < 0)
{
fprintf(stderr, "Error decoding audio frame\n");
//return ret;
}
else
{
if (got_frame)
{
if ( audio_dst_data[0] == NULL )
{
ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, decoded_frame->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
if (ret < 0)
{
fprintf(stderr, "Could not allocate audio buffer\n");
return AVERROR(ENOMEM);
}
/* TODO: extend return code of the av_samples_* functions so that this call is not needed */
audio_dst_bufsize = av_samples_get_buffer_size(NULL, audio_st->codec->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
//int16_t* outputBuffer = ...;
swr_convert( swr, audio_dst_data, out_frame->nb_samples, (const uint8_t**) decoded_frame->extended_data, decoded_frame->nb_samples );
}
/* copy audio data to destination buffer:
* this is required since rawaudio expects non aligned data */
//av_samples_copy(audio_dst_data, decoded_frame->data, 0, 0,
// decoded_frame->nb_samples, decoded_frame->channels, (AVSampleFormat)decoded_frame->format);
}
}
}
return rv;
}
int open_encoder(char* output_filename )
{
int rv = 0;
/* allocate the output media context */
AVOutputFormat *opfmt = NULL;
avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);
if (!output_fmt_ctx) {
printf("Could not deduce output format from file extension: using MPEG.\n");
avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);
}
if (!output_fmt_ctx) {
rv = -1;
}
else
{
output_format = output_fmt_ctx->oformat;
}
/* Add the audio stream using the default format codecs
* and initialize the codecs. */
audio_st = NULL;
if ( output_fmt_ctx )
{
if (output_format->audio_codec != AV_CODEC_ID_NONE)
{
audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);
}
/* Now that all the parameters are set, we can open the audio and
* video codecs and allocate the necessary encode buffers. */
if (audio_st)
{
rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
if ( rv < 0 ) return rv;
}
av_dump_format(output_fmt_ctx, 0, output_filename, 1);
/* open the output file, if needed */
if (!(output_format->flags & AVFMT_NOFILE))
{
if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {
fprintf(stderr, "Could not open '%s'\n", output_filename);
rv = -1;
}
else
{
/* Write the stream header, if any. */
if (avformat_write_header(output_fmt_ctx, NULL) < 0)
{
fprintf(stderr, "Error occurred when opening output file\n");
rv = -1;
}
}
}
}
return rv;
}
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
{
AVCodecContext *c;
AVStream *st;
/* find the audio encoder */
*codec = avcodec_find_encoder(codec_id);
if (!(*codec)) {
fprintf(stderr, "Could not find codec\n");
exit(1);
}
st = avformat_new_stream(oc, *codec);
if (!st) {
fprintf(stderr, "Could not allocate stream\n");
exit(1);
}
st->id = 1;
c = st->codec;
/* put sample parameters */
c->sample_fmt = AV_SAMPLE_FMT_S16;
c->bit_rate = audio_bit_rate;
c->sample_rate = audio_sample_rate;
c->channels = audio_channels;
// some formats want stream headers to be separate
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
c->flags |= CODEC_FLAG_GLOBAL_HEADER;
return st;
}
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
{
int ret=0;
AVCodecContext *c;
st->duration = fmt_ctx->duration;
c = st->codec;
/* open it */
ret = avcodec_open2(c, codec, NULL) ;
if ( ret < 0)
{
fprintf(stderr, "could not open codec\n");
return -1;
//exit(1);
}
if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
audio_input_frame_size = 10000;
else
audio_input_frame_size = c->frame_size;
int tempSize = audio_input_frame_size *
av_get_bytes_per_sample(c->sample_fmt) *
c->channels;
return ret;
}
void close_audio(AVFormatContext *oc, AVStream *st)
{
avcodec_close(st->codec);
}
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize)
{
AVFormatContext *oc = output_fmt_ctx;
AVStream *st = audio_st;
if ( oc == NULL || st == NULL ) return;
AVCodecContext *c;
AVPacket pkt = { 0 }; // data and size must be 0;
int got_packet;
av_init_packet(&pkt);
c = st->codec;
out_frame->nb_samples = audio_input_frame_size;
int buf_size = audio_src_bufsize *
av_get_bytes_per_sample(c->sample_fmt) *
c->channels;
avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
(uint8_t *) *audio_src_data,
buf_size, 1);
avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
if (!got_packet)
{
}
else
{
if (pkt.pts != AV_NOPTS_VALUE)
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);
if (pkt.dts != AV_NOPTS_VALUE)
pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
pkt.stream_index = st->index;
pkt.flags |= AV_PKT_FLAG_KEY;
/* Write the compressed frame to the media file. */
if (av_interleaved_write_frame(oc, &pkt) != 0)
{
fprintf(stderr, "Error while writing audio frame\n");
exit(1);
}
}
av_free_packet(&pkt);
}
void write_delayed_frames(AVFormatContext *oc, AVStream *st)
{
AVCodecContext *c = st->codec;
int got_output = 0;
int ret = 0;
AVPacket pkt;
pkt.data = NULL;
pkt.size = 0;
av_init_packet(&pkt);
int i = 0;
for (got_output = 1; got_output; i++)
{
ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
if (ret < 0)
{
fprintf(stderr, "error encoding frame\n");
exit(1);
}
static int64_t tempPts = 0;
static int64_t tempDts = 0;
/* If size is zero, it means the image was buffered. */
if (got_output)
{
if (pkt.pts != AV_NOPTS_VALUE)
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);
if (pkt.dts != AV_NOPTS_VALUE)
pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
pkt.stream_index = st->index;
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(oc, &pkt);
}
else
{
ret = 0;
}
av_free_packet(&pkt);
}
}
int main(int argc, char **argv)
{
/* register all formats and codecs */
av_register_all();
avcodec_register_all();
avformat_network_init();
avdevice_register_all();
int i =0;
char src_filename[90] = "mp3.mp3";
char dst_filename[90] = "test.mp4";
open_audio_input(src_filename);
audio_bit_rate = codec_ctx_audio->bit_rate;
audio_sample_rate = codec_ctx_audio->sample_rate;
audio_channels = codec_ctx_audio->channels;
open_encoder( dst_filename );
while(1)
{
int rv = decode_frame();
if ( rv < 0 )
{
break;
}
if (audio_st)
{
audio_pts = (double)audio_st->pts.val * audio_st->time_base.num /
audio_st->time_base.den;
}
else
{
audio_pts = 0.0;
}
if ( codec_ctx_audio )
{
if ( got_frame)
{
write_audio_frame( audio_dst_data, audio_dst_bufsize );
}
}
if ( audio_dst_data[0] )
{
av_freep(&audio_dst_data[0]);
audio_dst_data[0] = NULL;
}
av_free_packet(&input_packet);
printf("\naudio_pts: %.3f", audio_pts);
}
write_delayed_frames( output_fmt_ctx, audio_st );
av_write_trailer(output_fmt_ctx);
close_audio( output_fmt_ctx, audio_st);
swr_free(&swr);
avcodec_free_frame(&out_frame);
return 0;
}
///////////////////////////////////////////////
I have been looking at this problem from many angles since about two days now, but cant seem to figure out what I'm doing wrong.
Note also: the printf() statement I've inserted shows audio_pts up to 64.551 (that's about 1:05 seconds that also proves encoder is not going to full duration of input file: 1:12 secs).
Can anyone please guide me what I may be doing wrong?
Thanks in advance for any guidance!
p.s. when run through command line like: ffmpeg -i test.mp3 test.mp4, it converts the file just fine.