Audio mixing algorithm changing volume

Audio mixing algorithm changing volume - c++

I'm trying to mix some audio samples with the following algorithm:
short* FilterGenerator::mixSources(std::vector<RawData>rawsources, int numframes)
{
short* output = new short[numframes * 2]; // multiply 2 for channels
for (int sample = 0; sample < numframes * 2; ++sample)
{
for (int sourceCount = 0; sourceCount < rawsources.size(); ++sourceCount)
{
if (sample <= rawsources.at(sourceCount).frames * 2)
{
short outputSample = rawsources.at(sourceCount).data[sample];
output[sample] += outputSample;
}
}
}
// post mixing volume compression
for (int sample = 0; sample < numframes; ++sample)
{
output[sample] /= (float)rawsources.size();
}
return output;
}
I get the output I want except for the fact that when one of the sources are done, the other sources start playing louder. I know why this is but I don't know how to solve it properly.
Also, this is a screenshot from Audacity from the audio I output:
As you can see there's definitely something wrong. You can see that the audio hasn't got zero at the center anymore and you can see the audio getting louder once one of the sources are done playing.
Most of all I'd like to fix the volume problem but any other tweaks I can do are very appreciated!
Some extra info: I know that this code doesn't allow mono sources but that's ok. I'm only going to use stereo interleaved audio samples.

Usually mixing don't divide by the number of sources. This mean that mix a normal track with a mute track can halve its amplitude. If you want you can eventually normalize the track so that it is in his range.
The code is not tested, there may be errors:
#include <algorithm> // for std::max
#include <cmath> // for std::fabs
short* FilterGenerator::mixSources(std::vector<RawData>rawsources, int numframes)
{
// We can not use shorts immediately because can overflow
// I use floats because in the renormalization not have distortions
float *outputFloating = new float [numframes * 2];
// The maximum of the absolute value of the signal
float maximumOutput = 0;
for (int sample = 0; sample < numframes * 2; ++sample)
{
// makes sure that at the beginning is zero
outputFloating[sample] = 0;
for (int sourceCount = 0; sourceCount < rawsources.size(); ++sourceCount)
{
// I think that should be a '<'
if (sample < rawsources.at(sourceCount).frames * 2)
outputFloating[sample] += rawsources.at(sourceCount).data[sample];
}
// Calculates the maximum
maximumOutput = std::max (maximumOutput, std::fabs(outputFloating[sample]));
}
// A short buffer
short* output = new short [numframes * 2]; // multiply 2 for channels
float multiplier = maximumOutput > 32767 ? 32767 / maximumOutput : 1;
// Renormalize the track
for (int sample = 0; sample < numframes * 2; ++sample)
output[sample] = (short) (outputFloating[sample] * multiplier);
delete[] outputFloating;
return output;
}

Since you're adding up everything into a short before you divide, you're probably getting overflow. You need to add to an intermediary that's bigger. Also the final scaling shouldn't be dependent on the number of samples, it should be a constant - determine it before you call your function.
short* FilterGenerator::mixSources(std::vector<RawData>rawsources, int numframes, double gain = 0.5)
{
short* output = new short[numframes * 2]; // multiply 2 for channels
for (int sample = 0; sample < numframes * 2; ++sample)
{
long newSample = 0;
for (int sourceCount = 0; sourceCount < rawsources.size(); ++sourceCount)
{
if (sample <= rawsources.at(sourceCount).frames * 2)
{
short outputSample = rawsources.at(sourceCount).data[sample];
newSample += outputSample;
}
}
output[sample] = (short)(newSample * gain);
}
return output;
}

You don't really have to do the "post mixing volume compression". Simply add up all the sources and don't allow the sum to overflow. This should work:
short* FilterGenerator::mixSources(std::vector<RawData>rawsources, int numframes)
{
short* output = new short[numframes * 2]; // multiply 2 for channels
for (int sample = 0; sample < numframes * 2; ++sample)
{
long sum = 0;
for (int sourceCount = 0; sourceCount < rawsources.size(); ++sourceCount)
{
if (sample < rawsources.at(sourceCount).frames * 2)
{
short outputSample = rawsources.at(sourceCount).data[sample];
sum += outputSample;
output[sample] += outputSample;
}
if (sum > 32767) sum = 32767;
if (sum < -32768) sum = -32768;
output[sample] = (short)sum;
}
}
return output;
}

Related

Optimization C++ code to match reference run time

I have assigment to optimize some c++ code, I'm bad at coding but I made some attempts so the original is:
#include "stdafx.h"
#include "HistogramStretching.h"
void CHistogramStretching::HistogramStretching(BYTE** pImage, int nW, int nH)
{
//find minimal value
int nMin = pImage[0][0];
for(int j = 0; j < nW; j++)
for(int i = 0; i < nH; i++)
if(pImage[i][j] < nMin)
nMin = pImage[i][j];
//find maximal value
int nMax = pImage[0][0];
for(int j = 0; j < nW; j++)
for(int i = 0; i < nH; i++)
if(pImage[i][j] > nMax)
nMax = pImage[i][j];
//stretches histogram
for(int j = 0; j < nW; j++)
for(int i = 0; i < nH; i++)
{
if(nMax != nMin)
{
float fScale = (nMax - nMin)/100.0;//calculates scale
float fVal = (pImage[i][j] - nMin)/fScale;//scales pixel value
int nVal = (int)(fVal + 0.5);//rounds floating point number to integer
//checks BYTE range (must be 0-255)
if(nVal < 0)
nVal = 0;
if(nVal > 255)
nVal = 255;
pImage[i][j] = nVal;
}
else
pImage[i][j] = 0;//if all pixel values are the same, the image is changed to black
}
}
And my verison is:
#include "stdafx.h"
#include "HistogramStretching.h"
void CHistogramStretching::HistogramStretching(BYTE** pImage, int nW, int nH)
{
//find minimal value
int nMin = pImage[0][0];
int nMax = pImage[0][0];
for (int j = 0; j < nW; j++) {
for (int i = 0; i < nH; i++) {
if (pImage[i][j] < nMin)
nMin = pImage[i][j];
if (pImage[i][j] > nMax)
nMax = pImage[i][j];
}
}
if (nMax != nMin) {
float fScale = (nMax - nMin) / 100.0;//calculates scale
fScale = 1 / fScale;
//stretches histogram
for (int j = 0; j < nW; j++)
for (int i = 0; i < nH; i++)
{
float fVal = (pImage[i][j] - nMin) * fScale;//scales pixel value
int nVal = (int)(fVal + 0.5);//rounds floating point number to integer
//checks BYTE range (must be 0-255)
if (nVal < 0)
nVal = 0;
if (nVal > 255)
nVal = 255;
pImage[i][j] = nVal;
}
//if all pixel values are the same, the image is changed to black
}
else {
pImage[0][0] = 0;
}
}
So I merged the first two loops to one but still the first if make ~15% CPU time, next step was to pull the if statement outside the loops and changing division for multiplication and here that division takes ~8% of CPU time and float to int casting takes ~5% but I think I can't do much with casting. With this "correcions" my code is still some like 6-7 times slower than refference code. I test both code on the same machines. Can you point me to something I can make better?

I think tadman gave you the correct answer.
Replace
for (int j = 0; j < nW; j++) {
for (int i = 0; i < nH; i++) {
if (pImage[i][j] < nMin)
...
}
}
with
for (int i = 0; i < nH; i++) {
for (int j = 0; j < nW; j++) {
if (pImage[i][j] < nMin)
...
}
}
This way your data access becomes cache/memory aligned, which should be way faster.

All modern compilers can vectorize this nicely, when compiled at full optimization (/O2 for MSVC, -O3 for gcc and clang).
The idea is to give the compiler some help so that it can see that the code can be in fact vectorized:
Let the inner loop operate on a single pointer, not on indices, and without accessing anything but the pointed-to value.
Perform the scaling as an integer operation - and don't forget rounding :)
Try to set up operations such that additional range checks are unnecessary, e.g. your checks for BYTE being less than 0. By having the offset and scale set up properly, the result will be guaranteed to fall into the desired range.
The inner loops will get unrolled, and will be vectorized to process 4 bytes at a time. I've tried the recent gcc, clang and MSVC releases and they produce pretty fast code for this.
You're doing something "weird" in that you purposefully scale the results to a 0-99 range. Thus you lose the resolution of the data - you've got a full byte to work with, so why not scale it to 255?
But if you want to scale to 100 values, it's fine. Note that 100(dec) = 0x64. We can make the outputSpan flexible - it will work for any value <= 255.
Thus:
/* Code Part 1 */
#include <cstdint>
constexpr uint32_t outputSpan = 100;
static constexpr uint32_t scale_16(uint8_t min, uint8_t max)
{
return (outputSpan * 0x10000) / (1+max-min);
}
// scale factor in 16.16 fixed point unsigned format
// empty histogram produces scale = outputSpan
static_assert(scale_16(10, 10) == outputSpan * 0x10000, "Scale calculation is wrong");
static constexpr uint8_t scale_pixel(uint8_t const pixel, uint8_t min, uint32_t const scale)
{
uint32_t px = (pixel - min) * scale;
// result in 16.16 fixed point format
return (px + 0x8080u) >> 16;
// round to an integer value
}
We work with fixed-point numbers (instead of floating-point). The scale is in 16.16 format, thus 16 digits in the integer part, and 16 digits in the fractional part, e.g. 0x1234.5678. The value 1.0(dec) would be 0x1.0000.
The pixel scaling simply multiplies the pixel by the scale, rounds it, and returns the truncated integer part.
The rounding is "interesting". You'd think that it'd suffice to add 0.5(dec) = 0x0.8 to the result to round it. That's not the case. The value needs to be a bit larger than that, and 0x0.808 does the job. It pre-biases the value, so that the error range around the exact value has a zero mean. In all cases, the error is at most ±0.5 - thus the result, rounded to an integer, does not lose accuracy.
We use scale_16 and scale_pixel functions to implement the stretcher:
/* Code Part 2 */
void stretchHistogram(uint8_t **pImage, int const nW, int const nH)
{
uint8_t nMin = 255, nMax = 0;
for (uint8_t **row = pImage, **rowEnd = pImage + nH; row != rowEnd; ++row)
for (const uint8_t *p = *row, *pEnd = p + nW; p != pEnd; ++p)
{
auto const px = *p;
if (px < nMin) nMin = px;
if (px > nMax) nMax = px;
}
auto const scale = scale_16(nMin, nMax);
for (uint8_t **row = pImage, **rowEnd = pImage + nH; row != rowEnd; ++row)
for (uint8_t *p = *row, *pEnd = p + nW; p != pEnd; ++p)
*p = scale_pixel(*p, nMin, scale);
}
This also produces decent code on architectures without FPU, such as FPU-less ARM and AVR.
We can also do some manual checks. Suppose that min = 0x10, max = 0xEF, and pixel = 0x32. Let's remember that the scale is in 16.16 format:
scale = 0x64.0000 / (1 + max - min)
= 0x64.0000 / (1 + 0xEF - 0x10)
= 0x64.0000 / (1 + 0xDF)
= 0x64.0000 / 0xE0
Long division:
0x .7249
0x64.0000 / 0xE0
---------
64.0
- 62.0
------
2.00
- 1.C0
-------
.400
- .380
--------
. 800
- . 7E0
---------
. 20
So, we have scale = 0x0.7249. It's less than one (0x1.0), and also a bit less than 1/2 (0x0.8), since we map 224 values onto 100 values - a bit less than half as many.
Now
px = (pixel - min) * scale
= (0x32 - 0x10) * 0x0.7249
= 0x22 * 0x0.7249
Long multiplication:
0x 0.7249
* 0x .0022
------------
.E492
+ E.492
------------
0x F.2DB2
Thus, px = 0xF.2DB2 ≈ 0xF. We have to round it to an integer:
return = (px + 0x0.8080u) >> 16
= (0xF.2DB2 + 0x0.8080) >> 16
= 0xF.AE32 >> 16
≈ 0xF
Let's check in decimal system:
100 / (max-min+1) * (pixel-min) =
= 100 / (239 - 16 + 1) * (50 - 16)
= 100 / 224 * 34
= 100 * 34 / 224
= 3400 / 224
≈ 15.17
≈ 15
≈ 0xF
Here's a test case that ensures that there's no rounding bias for all combinations of min, max, and input pixel value, and that the error is bounded to [-0.5, 0.5]. Just append it to the code above and it should compile and run and produce the following output:
-0.5 0.5 1
For scaling to outputSpan = 256 values (instead of 100), it'd output:
-0.498039 0.498039 0.996078
/* Code Part 3 */
#include <cassert>
#include <cmath>
#include <iostream>
int main()
{
double errMin = 0, errMax = 0;
for (uint16_t min = 0; min <= 255; ++min)
for (uint16_t max = min; max <= 255; ++max)
for (uint16_t val = min; val <= max; ++val)
{
uint8_t const nMin = min, nMax = max;
uint8_t const span = nMax - nMin;
uint8_t const val_src = val;
uint8_t p_val = val_src;
uint8_t *const p = &p_val;
assert(nMin <= nMax);
assert(val >= nMin && val <= nMax);
auto const scale = scale_16(nMin, nMax);
*p = scale_pixel(*p, nMin, scale);
auto pValTarget = (val_src - nMin) * 256.0/(1.0+span);
auto error = pValTarget - *p;
if (error < errMin) errMin = error;
if (error > errMax) errMax = error;
}
std::cout << '\n' << errMin << ' ' << errMax << ' ' << errMax-errMin << std::endl;
assert((errMax-errMin) <= 1.0); // constrain the error
assert(std::abs(errMax+errMin) == 0.0); // constrain the error average
}

FFTW Complex to Real Segmentation Fault

I am attempting to write a naive implementation of the Short-Time Fourier Transform using consecutive FFT frames in time, calculated using the FFTW library, but I am getting a Segmentation fault and cannot work out why.
My code is as below:
// load in audio
AudioFile<double> audioFile;
audioFile.load ("assets/example-audio/file_example_WAV_1MG.wav");
int N = audioFile.getNumSamplesPerChannel();
// make stereo audio mono
double fileDataMono[N];
if (audioFile.isStereo())
for (int i = 0; i < N; i++)
fileDataMono[i] = ( audioFile.samples[0][i] + audioFile.samples[1][i] ) / 2;
// setup stft
// (test transform, presently unoptimized)
int stepSize = 512;
int M = 2048; // fft size
int noOfFrames = (N-(M-stepSize))/stepSize;
// create Hamming window vector
double w[M];
for (int m = 0; m < M; m++) {
w[m] = 0.53836 - 0.46164 * cos( 2*M_PI*m / M );
}
double* input;
// (pads input array if necessary)
if ( (N-(M-stepSize))%stepSize != 0) {
noOfFrames += 1;
int amountOfZeroPadding = stepSize - (N-(M-stepSize))%stepSize;
double ipt[N + amountOfZeroPadding];
for (int i = 0; i < N; i++) // copy values from fileDataMono into input
ipt[i] = fileDataMono[i];
for (int i = 0; i < amountOfZeroPadding; i++)
ipt[N + i] = 0;
input = ipt;
} else {
input = fileDataMono;
}
// compute stft
fftw_complex* stft[noOfFrames];
double frames[noOfFrames][M];
fftw_plan fftPlan;
for (int i = 0; i < noOfFrames; i++) {
stft[i] = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * M);
for (int m = 0; m < M; m++)
frames[i][m] = input[i*stepSize + m] * w[m];
fftPlan = fftw_plan_dft_r2c_1d(M, frames[i], stft[i], FFTW_ESTIMATE);
fftw_execute(fftPlan);
}
// compute istft
double* outputFrames[noOfFrames];
double output[N];
for (int i = 0; i < noOfFrames; i++) {
outputFrames[i] = (double*)fftw_malloc(sizeof(double) * M);
fftPlan = fftw_plan_dft_c2r_1d(M, stft[i], outputFrames[i], FFTW_ESTIMATE);
fftw_execute(fftPlan);
for (int m = 0; i < M; m++) {
output[i*stepSize + m] += outputFrames[i][m];
}
}
fftw_destroy_plan(fftPlan);
for (int i = 0; i < noOfFrames; i++) {
fftw_free(stft[i]);
fftw_free(outputFrames[i]);
}
// output audio
AudioFile<double>::AudioBuffer outputBuffer;
outputBuffer.resize (1);
outputBuffer[0].resize(N);
outputBuffer[0].assign(output, output+N);
bool ok = audioFile.setAudioBuffer(outputBuffer);
audioFile.setAudioBufferSize (1, N);
audioFile.setBitDepth (16);
audioFile.setSampleRate (8000);
audioFile.save ("out/audioOutput.wav");
The segfault seems to be being raised by the first fftw_malloc when computing the forward STFT.
Thanks in advance!

The relevant bit of code is:
double* input;
if ( (N-(M-stepSize))%stepSize != 0) {
double ipt[N + amountOfZeroPadding];
//...
input = ipt;
}
//...
input[i*stepSize + m];
Your input pointer points at memory that exists only inside the if statement. The closing brace denotes the end of the lifetime of the ipt array. When dereferencing the pointer later, you are addressing memory that no longer exists.

How to reconstruct result of Signal->STFT->ISTFT->Signal using overlapping windows?

Apologizing if the answer is on the site somewhere (couldn't find). I'm a hobbyist who tries to load WAV file, get it's magnitude and phase data (for modification), generate spectrogram and then save it back as a new WAV file.
I use C++ (Qt) and FFTW library.
My problem is that resulting WAV differs from original even when no modifications are made. If FFT operations are performed on whole sample sequence, then it looks just like the original. But I have to use STFTs with overlapping windows. In this case I get distortions resulting in periodic cracking/throttling sounds, also waveform of audio is significantly changed.
This can be seen in following examples (viewed in Audacity):
original / processed in one chunk:
original
processed (windowSize=2048, hopSize=1024, no window function):
processed ws=2048, hs=1024, wf=none
I can't post more examples with my reputation, but performing Hamming window function after ISTFT (not before STFT) with the method I use to combine resulting windowed samples gives good sound. But waveform is still quite different, mainly significant loss in peaks is observed.
I think the way I combine result of ISTFT into new sample sequence is the problem. What is the proper way to do this? Example in C++ would be really appreciated.
EDIT *
As correctly pointed out by SleuthEye I made a mistake in code.
Code is adjusted. Waveform and sound seems to be perfect now even without applying a window function. Still, is the method correct for such an operation?
Here's the relevant source:
// getSampleNormalized(n) returns sample n of 1 channel in -1.0 to 1.0 range
// getSampleCount() returns sample count of 1 channel
// quint32 is just unsigned int
quint32 windowSize = 2048;
quint32 windowSizeHalf = windowSize / 2 + 1;
quint32 slideWindowBy = 1024; // hopSize
quint32 windowCount = getSampleCount() / slideWindowBy;
if ( (windowCount * slideWindowBy) < getSampleCount()){
windowCount += 1;
}
quint32 newSampleCount = windowCount * slideWindowBy + ( windowSize - slideWindowBy );
double *window = new double[windowSize];
fftw_complex *fftResult = new fftw_complex[windowSizeHalf];
fftw_complex *fftWindow = new fftw_complex[windowSizeHalf];
double *result = new double[windowSize];
double **magnitudes = new double*[windowCount];
double **phases = new double*[windowCount];
double **signalWindows = new double*[windowCount];
for (int i = 0; i < windowCount; ++i){
magnitudes[i] = new double[windowSizeHalf];
phases[i] = new double[windowSizeHalf];
signalWindows[i] = new double[windowSize];
}
double *sampleSignals = new double[newSampleCount];
fftw_plan fftPlan = fftw_plan_dft_r2c_1d( windowSize, window, fftResult, FFTW_ESTIMATE );
fftw_plan ifftPlan = fftw_plan_dft_c2r_1d( windowSize, fftWindow, result, FFTW_ESTIMATE );
// STFT
for ( int currentWindow = 0; currentWindow < windowCount; ++currentWindow ){
for (int i = 0; i < windowSize; ++i){
quint32 currentSample = currentWindow * slideWindowBy + i;
if ( ( currentSample ) < getSampleCount() ){
window[i] = getSampleNormalized( currentSample ); // * ( windowHamming( i, windowSize ) );
}
else{
window[i] = 0.0;
}
}
fftw_execute(fftPlan);
for (int i = 0; i < windowSizeHalf; ++i){
magnitudes[currentWindow][i] = sqrt( fftResult[i][0]*fftResult[i][0] + fftResult[i][1]*fftResult[i][1] );
phases[currentWindow][i] = atan2( fftResult[i][1], fftResult[i][0] );
}
}
// INVERSE STFT
for ( int currentWindow = 0; currentWindow < windowCount; ++currentWindow ){
for ( int i = 0; i < windowSizeHalf; ++i ){
fftWindow[i][0] = magnitudes[currentWindow][i] * cos( phases[currentWindow][i] ); // Real
fftWindow[i][1] = magnitudes[currentWindow][i] * sin( phases[currentWindow][i] ); // Imaginary
}
fftw_execute(ifftPlan);
for ( int i = 0; i < windowSize; ++i ){
signalWindows[currentWindow][i] = result[i] / windowSize; // getting normalized result
//signalWindows[currentWindow][i] *= (windowHamming( i, windowSize )); // applying Hamming window function
}
}
quint32 pos;
// HERE WE COMBINE RESULTED WINDOWS
// COMBINE AND AVERAGE
// 1st window should be full replace
for ( int i = 0; i < windowSize; ++i ){
sampleSignals[i] = signalWindows[0][i];
}
// 2nd window and onwards: combine with previous ones
for ( int currentWindow = 1; currentWindow < windowCount; ++currentWindow ){
// combine and average with data from previous window
for ( int i = 0; i < (windowSize - slideWindowBy); ++i ){
pos = currentWindow * slideWindowBy + i;
sampleSignals[pos] = (sampleSignals[pos] + signalWindows[currentWindow][i]) * 0.5;
}
// simply replace for the rest
for ( int i = (windowSize - slideWindowBy); i < windowSize; ++i ){
pos = currentWindow * slideWindowBy + i;
sampleSignals[pos] = signalWindows[currentWindow][i];
}
}
// then just save the wav file...

C++ Pattern Matching with FFT cross-correlation (Images)

everyone I am trying to implement patter matching with FFT but I am not sure what the result should be (I think I am missing something even though a read a lot of stuff about the problem and tried a lot of different implementations this one is the best so far). Here is my FFT correlation function.
void fft2d(fftw_complex**& a, int rows, int cols, bool forward = true)
{
fftw_plan p;
for (int i = 0; i < rows; ++i)
{
p = fftw_plan_dft_1d(cols, a[i], a[i], forward ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
}
fftw_complex* t = (fftw_complex*)fftw_malloc(rows * sizeof(fftw_complex));
for (int j = 0; j < cols; ++j)
{
for (int i = 0; i < rows; ++i)
{
t[i][0] = a[i][j][0];
t[i][1] = a[i][j][1];
}
p = fftw_plan_dft_1d(rows, t, t, forward ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int i = 0; i < rows; ++i)
{
a[i][j][0] = t[i][0];
a[i][j][1] = t[i][1];
}
}
fftw_free(t);
}
int findCorrelation(int argc, char* argv[])
{
BMP bigImage;
BMP keyImage;
BMP result;
RGBApixel blackPixel = { 0, 0, 0, 1 };
const bool swapQuadrants = (argc == 4);
if (argc < 3 || argc > 4) {
cout << "correlation img1.bmp img2.bmp" << endl;
return 1;
}
if (!keyImage.ReadFromFile(argv[1])) {
return 1;
}
if (!bigImage.ReadFromFile(argv[2])) {
return 1;
}
//Preparations
const int maxWidth = std::max(bigImage.TellWidth(), keyImage.TellWidth());
const int maxHeight = std::max(bigImage.TellHeight(), keyImage.TellHeight());
const int rowsCount = maxHeight;
const int colsCount = maxWidth;
BMP bigTemp = bigImage;
BMP keyTemp = keyImage;
keyImage.SetSize(maxWidth, maxHeight);
bigImage.SetSize(maxWidth, maxHeight);
for (int i = 0; i < rowsCount; ++i)
for (int j = 0; j < colsCount; ++j) {
RGBApixel p1;
if (i < bigTemp.TellHeight() && j < bigTemp.TellWidth()) {
p1 = bigTemp.GetPixel(j, i);
} else {
p1 = blackPixel;
}
bigImage.SetPixel(j, i, p1);
RGBApixel p2;
if (i < keyTemp.TellHeight() && j < keyTemp.TellWidth()) {
p2 = keyTemp.GetPixel(j, i);
} else {
p2 = blackPixel;
}
keyImage.SetPixel(j, i, p2);
}
//Here is where the transforms begin
fftw_complex **a = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
fftw_complex **b = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
fftw_complex **c = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
for (int i = 0; i < rowsCount; ++i) {
a[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
b[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
c[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
for (int j = 0; j < colsCount; ++j) {
RGBApixel p1;
p1 = bigImage.GetPixel(j, i);
a[i][j][0] = (0.299*p1.Red + 0.587*p1.Green + 0.114*p1.Blue);
a[i][j][1] = 0.0;
RGBApixel p2;
p2 = keyImage.GetPixel(j, i);
b[i][j][0] = (0.299*p2.Red + 0.587*p2.Green + 0.114*p2.Blue);
b[i][j][1] = 0.0;
}
}
fft2d(a, rowsCount, colsCount);
fft2d(b, rowsCount, colsCount);
result.SetSize(maxWidth, maxHeight);
for (int i = 0; i < rowsCount; ++i)
for (int j = 0; j < colsCount; ++j) {
fftw_complex& y = a[i][j];
fftw_complex& x = b[i][j];
double u = x[0], v = x[1];
double m = y[0], n = y[1];
c[i][j][0] = u*m + n*v;
c[i][j][1] = v*m - u*n;
int fx = j;
if (fx>(colsCount / 2)) fx -= colsCount;
int fy = i;
if (fy>(rowsCount / 2)) fy -= rowsCount;
float r2 = (fx*fx + fy*fy);
const double cuttoffCoef = (maxWidth * maxHeight) / 37992.;
if (r2<128 * 128 * cuttoffCoef)
c[i][j][0] = c[i][j][1] = 0;
}
fft2d(c, rowsCount, colsCount, false);
const int halfCols = colsCount / 2;
const int halfRows = rowsCount / 2;
if (swapQuadrants) {
for (int i = 0; i < halfRows; ++i)
for (int j = 0; j < halfCols; ++j) {
std::swap(c[i][j][0], c[i + halfRows][j + halfCols][0]);
std::swap(c[i][j][1], c[i + halfRows][j + halfCols][1]);
}
for (int i = halfRows; i < rowsCount; ++i)
for (int j = 0; j < halfCols; ++j) {
std::swap(c[i][j][0], c[i - halfRows][j + halfCols][0]);
std::swap(c[i][j][1], c[i - halfRows][j + halfCols][1]);
}
}
for (int i = 0; i < rowsCount; ++i)
for (int j = 0; j < colsCount; ++j) {
const double& g = c[i][j][0];
RGBApixel pixel;
pixel.Alpha = 0;
int gInt = 255 - static_cast<int>(std::floor(g + 0.5));
pixel.Red = gInt;
pixel.Green = gInt;
pixel.Blue = gInt;
result.SetPixel(j, i, pixel);
}
BMP res;
res.SetSize(maxWidth, maxHeight);
result.WriteToFile("result.bmp");
return 0;
}
Sample output

This question would probably be more appropriately posted on another site like cross validated (metaoptimize.com used to also be a good one, but it appears to be gone)
That said:
There's two similar operations you can perform with FFT: convolution and correlation. Convolution is used for determining how two signals interact with each-other, whereas correlation can be used to express how similar two signals are to each-other. Make sure you're doing the right operation as they're both commonly implemented throught a DFT.
For this type of application of DFTs you usually wouldn't extract any useful information in the fourier spectrum unless you were looking for frequencies common to both data sources or whatever (eg, if you were comparing two bridges to see if their supports are spaced similarly).
Your 3rd image looks a lot like the power domain; normally I see the correlation output entirely grey except where overlap occurred. Your code definitely appears to be computing the inverse DFT, so unless I'm missing something the only other explanation I've come up with for the fuzzy look could be some of the "fudge factor" code in there like:
if (r2<128 * 128 * cuttoffCoef)
c[i][j][0] = c[i][j][1] = 0;
As for what you should expect: wherever there are common elements between the two images you'll see a peak. The larger the peak, the more similar the two images are near that region.
Some comments and/or recommended changes:
1) Convolution & correlation are not scale invariant operations. In other words, the size of your pattern image can make a significant difference in your output.
2) Normalize your images before correlation.
When you get the image data ready for the forward DFT pass:
a[i][j][0] = (0.299*p1.Red + 0.587*p1.Green + 0.114*p1.Blue);
a[i][j][1] = 0.0;
/* ... */
How you grayscale the image is your business (though I would've picked something like sqrt( r*r + b*b + g*g )). However, I don't see you doing anything to normalize the image.
The word "normalize" can take on a few different meanings in this context. Two common types:
normalize the range of values between 0.0 and 1.0
normalize the "whiteness" of the images
3) Run your pattern image through an edge enhancement filter. I've personally made use of canny, sobel, and I think I messed with a few others. As I recall, canny was "quick'n dirty", sobel was more expensive, but I got comparable results when it came time to do correlation. See chapter 24 of the "dsp guide" book that's freely available online. The whole book is worth your time, but if you're low on time then at a minimum chapter 24 will help a lot.
4) Re-scale the output image between [0, 255]; if you want to implement thresholds, do it after this step because the thresholding step is lossy.
My memory on this one is hazy, but as I recall (edited for clarity):
You can scale the final image pixels (before rescaling) between [-1.0, 1.0] by dividing off the largest power spectrum value from the entire power spectrum
The largest power spectrum value is, conveniently enough, the center-most value in the power spectrum (corresponding to the lowest frequency)
If you divide it off the power spectrum, you'll end up doing twice the work; since FFTs are linear, you can delay the division until after the inverse DFT pass to when you're re-scaling the pixels between [0..255].
If after rescaling most of your values end up so black you can't see them, you can use a solution to the ODE y' = y(1 - y) (one example is the sigmoid f(x) = 1 / (1 + exp(-c*x) ), for some scaling factor c that gives better gradations). This has more to do with improving your ability to interpret the results visually than anything you might use to programmatically find peaks.
edit I said [0, 255] above. I suggest you rescale to [128, 255] or some other lower bound that is gray rather than black.

Conceal packet loss in PCM stream

I am looking to use 'Packet Loss Concealment' to conceal lost PCM frames in an audio stream. Unfortunately, I cannot find a library that is accessible without all the licensing restrictions and code bloat (...up for some suggestions though).
I have located some GPL code written by Steve Underwood for the Asterisk project which implements PLC. There are several limitations; although, as Steve suggests in his code, his algorithm can be applied to different streams with a bit of work. Currently, the code works with 8kHz 16-bit signed mono streams.
Variations of the code can be found through a simple search of Google Code Search.
My hope is that I can adapt the code to work with other streams. Initially, the goal is to adjust the algorithm for 8+ kHz, 16-bit signed, multichannel audio (all in a C++ environment). Eventually, I'm looking to make the code available under the GPL license in hopes that it could be of benefit to others...
Attached is the code below with my efforts. The code includes a main function that will "drop" a number of frames with a given probability. Unfortunately, the code does not quite work as expected. I'm receiving EXC_BAD_ACCESS when running in gdb, but I don't get a trace from gdb when using 'bt' command. Clearly, I'm trampimg on memory some where but not sure exactly where. When I comment out the amdf_pitch function, the code runs without crashing...
int main (int argc, char *argv[])
{
std::ifstream fin("C:\\cc32kHz.pcm");
if(!fin.is_open())
{
std::cout << "Failed to open input file" << std::endl;
return 1;
}
std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm");
if(!fout_repaired.is_open())
{
std::cout << "Failed to open output repaired file" << std::endl;
return 1;
}
std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm");
if(!fout_lossy.is_open())
{
std::cout << "Failed to open output repaired file" << std::endl;
return 1;
}
audio::PcmConcealer Concealer;
Concealer.Init(1, 16, 32000);
//Generate random numbers;
srand( time(NULL) );
int value = 0;
int probability = 5;
while(!fin.eof())
{
char arr[2];
fin.read(arr, 2);
//Generate's random number;
value = rand() % 100 + 1;
if(value <= probability)
{
char blank[2] = {0x00, 0x00};
fout_lossy.write(blank, 2);
//Fill in data;
Concealer.Fill((int16_t *)blank, 1);
fout_repaired.write(blank, 2);
}
else
{
//Write data to file;
fout_repaired.write(arr, 2);
fout_lossy.write(arr, 2);
Concealer.Receive((int16_t *)arr, 1);
}
}
fin.close();
fout_repaired.close();
fout_lossy.close();
return 0;
}
PcmConcealer.hpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average mean difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceeding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of sythetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a sythetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz) */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz) */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms) */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at leat 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment. */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
typedef struct
{
/*! Consecutive erased samples */
int missing_samples;
/*! Current offset into pitch period */
int pitch_offset;
/*! Pitch estimate */
int pitch;
/*! Buffer for a cycle of speech */
float *pitchbuf;//[PLC_PITCH_MIN];
/*! History buffer */
short *history;//[PLC_HISTORY_LEN];
/*! Current pointer into the history buffer */
int buf_ptr;
} plc_state_t;
class PcmConcealer
{
public:
PcmConcealer();
~PcmConcealer();
void Init(int channels, int bit_depth, int sample_rate);
//Process a block of received audio samples.
int Receive(short amp[], int frames);
//Fill-in a block of missing audio samples.
int Fill(short amp[], int frames);
void Destroy();
private:
int amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames);
void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
void normalise_history(plc_state_t *s);
/** Holds the states of each of the channels **/
std::vector< plc_state_t * > ChannelStates;
int plc_pitch_min;
int plc_pitch_max;
int plc_pitch_overlap_max;
int correlation_span;
int plc_history_len;
int channel_count;
int sample_rate;
bool Initialized;
};
}
#endif
PcmConcealer.cpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume in 50ms when we are filling in for missing data. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
#if !defined(INT16_MAX)
#define INT16_MAX (32767)
#define INT16_MIN (-32767-1)
#endif
#ifdef WIN32
inline double rint(double x)
{
return floor(x + 0.5);
}
#endif
inline short fsaturate(double damp)
{
if (damp > 32767.0)
return INT16_MAX;
if (damp < -32768.0)
return INT16_MIN;
return (short)rint(damp);
}
namespace audio
{
PcmConcealer::PcmConcealer() : Initialized(false)
{
}
PcmConcealer::~PcmConcealer()
{
Destroy();
}
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
if(Initialized)
return;
if(channels <= 0 || bit_depth != 16)
return;
Initialized = true;
channel_count = channels;
this->sample_rate = sample_rate;
//////////////
double min = PLC_PITCH_MIN(sample_rate);
int imin = (int)min;
double max = PLC_PITCH_MAX(sample_rate);
int imax = (int)max;
plc_pitch_min = imin;
plc_pitch_max = imax;
plc_pitch_overlap_max = (plc_pitch_min >> 2);
correlation_span = CORRELATION_SPAN(sample_rate);
plc_history_len = correlation_span + plc_pitch_min;
//////////////
for(int i = 0; i < channel_count; i ++)
{
plc_state_t *t = new plc_state_t;
memset(t, 0, sizeof(plc_state_t));
t->pitchbuf = new float[plc_pitch_min];
t->history = new short[plc_history_len];
ChannelStates.push_back(t);
}
}
void PcmConcealer::Destroy()
{
if(!Initialized)
return;
while(ChannelStates.size())
{
plc_state_t *s = ChannelStates.at(0);
if(s)
{
if(s->history) delete s->history;
if(s->pitchbuf) delete s->pitchbuf;
memset(s, 0, sizeof(plc_state_t));
delete s;
}
ChannelStates.erase(ChannelStates.begin());
}
ChannelStates.clear();
Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
//Fill-in a block of missing audio samples.
int PcmConcealer::Fill(short amp[], int frames)
{
if(!Initialized)
return 0;
int j =0;
for(int k = 0; k < ChannelStates.size(); k++)
{
short *tmp = new short[plc_pitch_overlap_max];
int i;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
short *orig_amp;
int orig_len;
orig_amp = amp;
orig_len = frames;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples == 0)
{
// As the gap in real speech starts we need to assess the last known pitch,
//and prepare the synthetic data we will use for fill-in
normalise_history(s);
s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + plc_history_len - correlation_span - plc_pitch_min, j, correlation_span);
// We overlap a 1/4 wavelength
pitch_overlap = s->pitch >> 2;
// Cook up a single cycle of pitch, using a single of the real signal with 1/4
//cycle OLA'ed to make the ends join up nicely
// The first 3/4 of the cycle is a simple copy
for (i = 0; i < s->pitch - pitch_overlap; i++)
s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];
// The last 1/4 of the cycle is overlapped with the end of the previous cycle
new_step = 1.0/pitch_overlap;
new_weight = new_step;
for ( ; i < s->pitch; i++)
{
s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
new_weight += new_step;
}
// We should now be ready to fill in the gap with repeated, decaying cycles
// of what is in pitchbuf
// We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
// it into the previous real data. To avoid the need to introduce a delay
// in the stream, reverse the last 1/4 wavelength, and OLA with that.
gain = 1.0;
new_step = 1.0/pitch_overlap;
old_step = new_step;
new_weight = new_step;
old_weight = 1.0 - new_step;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->pitch_offset = i;
}
else
{
gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
i = 0;
}
for ( ; gain > 0.0 && i < frames; i++)
{
int index = (i * channel_count) + j;
amp[index] = s->pitchbuf[s->pitch_offset]*gain;
gain -= ATTENUATION_INCREMENT;
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
}
for ( ; i < frames; i++)
{
int index = (i * channel_count) + j;
amp[i] = 0;
}
s->missing_samples += orig_len;
save_history(s, amp, j, frames);
delete [] tmp;
j++;
}
return frames;
}
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
if (frames >= plc_history_len)
{
/* Just keep the last part of the new data, starting at the beginning of the buffer */
//memcpy(s->history, buf + len - plc_history_len, sizeof(short)*plc_history_len);
int frames_to_copy = plc_history_len;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * (i + frames - plc_history_len)) + channel_index;
s->history[i] = buf[index];
}
s->buf_ptr = 0;
return;
}
if (s->buf_ptr + frames > plc_history_len)
{
/* Wraps around - must break into two sections */
//memcpy(s->history + s->buf_ptr, buf, sizeof(short)*(plc_history_len - s->buf_ptr));
short *hist_ptr = s->history + s->buf_ptr;
int frames_to_copy = plc_history_len - s->buf_ptr;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * i) + channel_index;
hist_ptr[i] = buf[index];
}
frames -= (plc_history_len - s->buf_ptr);
//memcpy(s->history, buf + (plc_history_len - s->buf_ptr), sizeof(short)*len);
frames_to_copy = frames;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * (i + (plc_history_len - s->buf_ptr))) + channel_index;
s->history[i] = buf[index];
}
s->buf_ptr = frames;
return;
}
/* Can use just one section */
//memcpy(s->history + s->buf_ptr, buf, sizeof(short)*len);
short *hist_ptr = s->history + s->buf_ptr;
int frames_to_copy = frames;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * i) + channel_index;
hist_ptr[i] = buf[index];
}
s->buf_ptr += frames;
}
void PcmConcealer::normalise_history(plc_state_t *s)
{
short *tmp = new short[plc_history_len];
if (s->buf_ptr == 0)
return;
memcpy(tmp, s->history, sizeof(short)*s->buf_ptr);
memcpy(s->history, s->history + s->buf_ptr, sizeof(short)*(plc_history_len - s->buf_ptr));
memcpy(s->history + plc_history_len - s->buf_ptr, tmp, sizeof(short)*s->buf_ptr);
s->buf_ptr = 0;
delete [] tmp;
}
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames)
{
int i;
int j;
int acc;
int min_acc;
int pitch;
pitch = min_pitch;
min_acc = INT_MAX;
for (i = max_pitch; i <= min_pitch; i++)
{
acc = 0;
for (j = 0; j < frames; j++)
{
int index1 = (channel_count * (i+j)) + channel_index;
int index2 = (channel_count * j) + channel_index;
//std::cout << "Index 1: " << index1 << ", Index 2: " << index2 << std::endl;
acc += abs(amp[index1] - amp[index2]);
}
if (acc < min_acc)
{
min_acc = acc;
pitch = i;
}
}
std::cout << "Pitch: " << pitch << std::endl;
return pitch;
}
}
P.S. - I must confess that digital audio is not my forte...

Fixed the problem. The problem lay within the amdf_pitch function. There were some minor bugs elsewhere too (which have been repaired). As a result, the code will now run the testbed inserting blank for a given probability.
Using Audacity I have studied the raw PCM streams that have been created via the testbed. When a blank set of frames is encountered, smoothing occurs from received to blank as expected; however, when we change from blank to valid/received data, we gets clicks because the smoothing doesn't appear to be working during this phase. Any suggestions?
I have attached the updated code:
int main (int argc, char *argv[])
{
std::ifstream fin("C:\\cc32kHz.pcm", std::ios::binary);
if(!fin.is_open())
{
std::cout << "Failed to open input file" << std::endl;
return 1;
}
std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm", std::ios::binary);
if(!fout_repaired.is_open())
{
std::cout << "Failed to open output repaired file" << std::endl;
return 1;
}
std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm", std::ios::binary);
if(!fout_lossy.is_open())
{
std::cout << "Failed to open output repaired file" << std::endl;
return 1;
}
audio::PcmConcealer Concealer;
Concealer.Init(1, 16, 32000); //1-channel, 16-bit, 32kHz
//Generate random numbers;
srand( time(NULL) );
int value = 0;
int probability = 3;
int old_bytes_read = 0;
while(!fin.eof())
{
char arr[1024];
fin.read(arr, 1024);
int total_bytes_read = fin.tellg();
int bytes_read = total_bytes_read - old_bytes_read;
old_bytes_read = total_bytes_read;
if(!bytes_read)
continue; //Probably reached EOF;
//Generate's random number;
value = rand() % 100 + 1;
if(value <= probability)
{
char blank[1024] = {0x00, 0x00};
fout_lossy.write(blank, 1024);
//Fill in data;
Concealer.Fill((int16_t *)blank, 512);
fout_repaired.write(blank, 1024);
}
else
{
//Write data to file;
fout_repaired.write(arr, 1024);
fout_lossy.write(arr, 1024);
Concealer.Receive((int16_t *)arr, 512);
}
}
fin.close();
fout_repaired.close();
fout_lossy.close();
return 0;
}
PcmConcealer.hpp
/*
* PcmConcealer.hpp
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average mean difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceeding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of sythetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a sythetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz) */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz) */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms) */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at leat 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment. */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
typedef struct
{
/*! Consecutive erased samples */
int missing_samples;
/*! Current offset into pitch period */
int pitch_offset;
/*! Pitch estimate */
int pitch;
/*! Buffer for a cycle of speech */
float *pitchbuf;//[PLC_PITCH_MIN];
/*! History buffer */
short *history;//[PLC_HISTORY_LEN];
/*! Current pointer into the history buffer */
int buf_ptr;
} plc_state_t;
class PcmConcealer
{
public:
PcmConcealer();
~PcmConcealer();
void Init(int channels, int bit_depth, int sample_rate);
//Process a block of received audio samples.
int Receive(short amp[], int frames);
//Fill-in a block of missing audio samples.
int Fill(short amp[], int frames);
void Destroy();
private:
inline int amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames);
void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
void normalise_history(plc_state_t *s);
/** Holds the states of each of the channels **/
std::vector< plc_state_t * > ChannelStates;
int plc_pitch_min;
int plc_pitch_max;
int plc_pitch_overlap_max;
int correlation_span;
int plc_history_len;
int channel_count;
int sample_rate;
bool Initialized;
};
}
#endif
PcmConcealer.cpp
/*
* PcmConcealer.cpp
*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume in 50ms when we are filling in for missing data. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifdef WIN32
inline double rint(double x)
{
return floor(x + 0.5);
}
#endif
inline short fsaturate(double damp)
{
if (damp > 32767.0)
return INT16_MAX;
if (damp < -32768.0)
return INT16_MIN;
return (short)rint(damp);
}
namespace audio
{
PcmConcealer::PcmConcealer() : Initialized(false)
{
}
PcmConcealer::~PcmConcealer()
{
Destroy();
}
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
if(Initialized)
return;
if(channels <= 0 || bit_depth != 16)
return;
Initialized = true;
channel_count = channels;
this->sample_rate = sample_rate;
//////////////
double min = PLC_PITCH_MIN(sample_rate);
int imin = (int)min;
double max = PLC_PITCH_MAX(sample_rate);
int imax = (int)max;
plc_pitch_min = imin;
plc_pitch_max = imax;
plc_pitch_overlap_max = (plc_pitch_min >> 2);
correlation_span = CORRELATION_SPAN(sample_rate);
plc_history_len = correlation_span + plc_pitch_min;
//////////////
for(int i = 0; i < channel_count; i ++)
{
plc_state_t *t = new plc_state_t;
memset(t, 0, sizeof(plc_state_t));
t->pitchbuf = new float[plc_pitch_min];
t->history = new short[plc_history_len];
ChannelStates.push_back(t);
}
}
void PcmConcealer::Destroy()
{
if(!Initialized)
return;
while(ChannelStates.size())
{
plc_state_t *s = ChannelStates.at(0);
if(s)
{
if(s->history) delete s->history;
if(s->pitchbuf) delete s->pitchbuf;
memset(s, 0, sizeof(plc_state_t));
delete s;
}
ChannelStates.erase(ChannelStates.begin());
}
ChannelStates.clear();
Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
//Fill-in a block of missing audio samples.
int PcmConcealer::Fill(short amp[], int frames)
{
if(!Initialized)
return 0;
int j =0;
for(int k = 0; k < ChannelStates.size(); k++)
{
short *tmp = new short[plc_pitch_overlap_max];
int i;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
short *orig_amp;
int orig_len;
orig_amp = amp;
orig_len = frames;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples == 0)
{
// As the gap in real speech starts we need to assess the last known pitch,
//and prepare the synthetic data we will use for fill-in
normalise_history(s);
s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + (plc_history_len - correlation_span - plc_pitch_min), correlation_span);
// We overlap a 1/4 wavelength
pitch_overlap = s->pitch >> 2;
// Cook up a single cycle of pitch, using a single of the real signal with 1/4
//cycle OLA'ed to make the ends join up nicely
// The first 3/4 of the cycle is a simple copy
for (i = 0; i < s->pitch - pitch_overlap; i++)
s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];
// The last 1/4 of the cycle is overlapped with the end of the previous cycle
new_step = 1.0/pitch_overlap;
new_weight = new_step;
for ( ; i < s->pitch; i++)
{
s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
new_weight += new_step;
}
// We should now be ready to fill in the gap with repeated, decaying cycles
// of what is in pitchbuf
// We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
// it into the previous real data. To avoid the need to introduce a delay
// in the stream, reverse the last 1/4 wavelength, and OLA with that.
gain = 1.0;
new_step = 1.0/pitch_overlap;
old_step = new_step;
new_weight = new_step;
old_weight = 1.0 - new_step;
for (i = 0; (i < pitch_overlap) && (i < frames); i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->pitch_offset = i;
}
else
{
gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
i = 0;
}
for ( ; gain > 0.0 && i < frames; i++)
{
int index = (i * channel_count) + j;
amp[index] = s->pitchbuf[s->pitch_offset]*gain;
gain -= ATTENUATION_INCREMENT;
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
}
for ( ; i < frames; i++)
{
int index = (i * channel_count) + j;
amp[i] = 0;
}
s->missing_samples += orig_len;
save_history(s, amp, j, frames);
delete [] tmp;
j++;
}
return frames;
}
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
if (frames >= plc_history_len)
{
/* Just keep the last part of the new data, starting at the beginning of the buffer */
//memcpy(s->history, buf + len - plc_history_len, sizeof(short)*plc_history_len);
int frames_to_copy = plc_history_len;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * (i + frames - plc_history_len)) + channel_index;
s->history[i] = buf[index];
}
s->buf_ptr = 0;
return;
}
if (s->buf_ptr + frames > plc_history_len)
{
/* Wraps around - must break into two sections */
//memcpy(s->history + s->buf_ptr, buf, sizeof(short)*(plc_history_len - s->buf_ptr));
short *hist_ptr = s->history + s->buf_ptr;
int frames_to_copy = plc_history_len - s->buf_ptr;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * i) + channel_index;
hist_ptr[i] = buf[index];
}
frames -= (plc_history_len - s->buf_ptr);
//memcpy(s->history, buf + (plc_history_len - s->buf_ptr), sizeof(short)*len);
frames_to_copy = frames;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * (i + (plc_history_len - s->buf_ptr))) + channel_index;
s->history[i] = buf[index];
}
s->buf_ptr = frames;
return;
}
/* Can use just one section */
//memcpy(s->history + s->buf_ptr, buf, sizeof(short)*len);
short *hist_ptr = s->history + s->buf_ptr;
int frames_to_copy = frames;
for(int i = 0; i < frames_to_copy; i ++)
{
int index = (channel_count * i) + channel_index;
hist_ptr[i] = buf[index];
}
s->buf_ptr += frames;
}
void PcmConcealer::normalise_history(plc_state_t *s)
{
short *tmp = new short[plc_history_len];
if (s->buf_ptr == 0)
return;
memcpy(tmp, s->history, sizeof(short)*s->buf_ptr);
memcpy(s->history, s->history + s->buf_ptr, sizeof(short)*(plc_history_len - s->buf_ptr));
memcpy(s->history + plc_history_len - s->buf_ptr, tmp, sizeof(short)*s->buf_ptr);
s->buf_ptr = 0;
delete [] tmp;
}
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames)
{
int i;
int j;
int acc;
int min_acc;
int pitch;
pitch = min_pitch;
min_acc = INT_MAX;
for (i = max_pitch; i <= min_pitch; i++)
{
acc = 0;
/*for (j = 0; j < frames; j++)
{
int index1 = (channel_count * (i+j)) + channel_index;
int index2 = (channel_count * j) + channel_index;
//std::cout << "Index 1: " << index1 << ", Index 2: " << index2 << std::endl;
acc += abs(amp[index1] - amp[index2]);
}*/
for (j = 0; j < frames; j++)
acc += abs(amp[i + j] - amp[j]);
if (acc < min_acc)
{
min_acc = acc;
pitch = i;
}
}
//std::cout << "Pitch: " << pitch << std::endl;
return pitch;
}
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Audio mixing algorithm changing volume - c++

Related

Optimization C++ code to match reference run time

FFTW Complex to Real Segmentation Fault

How to reconstruct result of Signal->STFT->ISTFT->Signal using overlapping windows?

C++ Pattern Matching with FFT cross-correlation (Images)

Conceal packet loss in PCM stream

Categories

Resources