Clicking sounds in brown/Brownian/random walk noise - c++

I am trying to make brown noise in C++, and to play the sound of it. You can hear the brown noise, but I constantly hear clicking in the background and I don't know why.
Here is my code:
#include <xaudio2.h>
#include <iostream>
#include <random>
using namespace std;
// Full turn in radians (2*pi); only referenced by the commented-out sine test in loop().
#define PI2 6.28318530717958647692f
// Number of samples produced per call to loop(); with the 44100 Hz rate set in init() that is 0.05 s.
#define l 2205 //0.05 seconds
bool init();
bool loop();
// Random source for the walk: each step is drawn uniformly from [-0.01, 0.01) and scaled by ampl in loop().
random_device rd;
mt19937 gen(rd());
uniform_real_distribution<> dis(-.01, .01);
// XAudio2 objects created in init().
IXAudio2MasteringVoice* pMasterVoice;
IXAudio2* pXAudio2;
IXAudio2SourceVoice* pSourceVoice;
// The single buffer descriptor and format that loop()/init() fill in.
XAUDIO2_BUFFER buffer;
WAVEFORMATEX wfx;
XAUDIO2_VOICE_STATE state;
// Raw PCM bytes handed to XAudio2: two bytes per sample.
BYTE pDataBuffer[2*l];
// Scratch bytes and divisors used by the integer-to-bytes conversion in loop().
BYTE bytw[2];
int pow16[2];
// Current frame of the random walk, one float per sample.
float w[l];
// frame counts calls to loop(); p is the period (in samples) used by the commented-out test waveforms.
int frame, p;
// tt is the running sample index as a float; ampl is the amplitude bound of the walk (set in init()).
float tt, ampl;
// Generates one frame (l samples) of random-walk noise in w[], converts it to
// 16-bit PCM bytes in pDataBuffer and submits that buffer to the source voice.
// Always returns true, so the caller's while-loop never terminates.
bool loop() {
// Continue the walk from the last sample of the previous frame.
w[0] = w[l - 1] + dis(gen)*ampl;
// NOTE(review): the loop starts at t = 1, and the byte conversion below lives
// inside it, so sample 0 is never written to pDataBuffer[0..1].
for (int t = 1; t < l; t++) {
tt = (float)(t + frame*l); //total time
w[t] = w[t - 1] + dis(gen)*ampl;
// Reflect the walk back inside [-ampl, ampl] when it crosses either edge.
if (w[t] > ampl) {
cout << "upper edge ";
w[t] = ampl - fmod(w[t], ampl);
}
if (w[t] < -ampl) {
cout << "lower edge ";
w[t] = -fmod(w[t], ampl) - ampl;
}
// Alternative test signals (sine / shaped square wave) kept for debugging.
//w[t] = sin(PI2*tt/p)*ampl;
//w[t] = (fmod(tt/p, 1) < .5 ? ampl : -ampl)*(.5f - 2.f*fmod(tt/p, .5f));
int intw = (int)w[t];
// Map negative values into the unsigned 16-bit range.
// NOTE(review): two's complement would add 65536, not 65535 — confirm intent.
if (intw < 0) {
intw += 65535;
}
bytw[0] = 0; bytw[1] = 0;
// Peel off the four hex digits from most to least significant:
// bytw[1] receives the high byte, bytw[0] the low byte.
for (int k = 1; k >= 0; k--) {
//turn integer into a little endian byte array
bytw[k] += (BYTE)(16*(intw/pow16[k]));
intw -= bytw[k]*(pow16[k]/16);
bytw[k] += (BYTE)(intw/(pow16[k]/16));
intw -= (intw/(pow16[k]/16))*pow16[k]/16;
}
pDataBuffer[2*t] = bytw[0];
pDataBuffer[2*t + 1] = bytw[1];
}
cout << endl << endl;
if (frame > 1) {
//wait until the current one is done playing
// Busy-waits until at most one buffer is still queued.
while (pSourceVoice->GetState(&state), state.BuffersQueued > 1) {}
}
// NOTE(review): the same pDataBuffer is resubmitted on every call, so a buffer
// that is still queued can be overwritten by the next frame.
buffer.AudioBytes = 2*l; //number of bytes per buffer
buffer.pAudioData = pDataBuffer;
buffer.Flags = XAUDIO2_END_OF_STREAM;
pSourceVoice->SubmitSourceBuffer(&buffer);
// Playback is started once, right after the second submission.
if (frame == 1) {
pSourceVoice->Start(0, 0);
}
frame++;
return true;
}
bool init() {
CoInitializeEx(nullptr, COINIT_MULTITHREADED);
pXAudio2 = nullptr;
XAudio2Create(&pXAudio2, 0, XAUDIO2_DEFAULT_PROCESSOR);
pMasterVoice = nullptr;
pXAudio2->CreateMasteringVoice(&pMasterVoice);
wfx = {0};
wfx.wFormatTag = WAVE_FORMAT_PCM;
wfx.nChannels = (WORD)1; //mono
wfx.nSamplesPerSec = (DWORD)44100; //samplerate
wfx.wBitsPerSample = (WORD)16; //16 bit (signed)
wfx.nBlockAlign = (WORD)2; //2 bytes per sample
wfx.nAvgBytesPerSec = (DWORD)88200; //samplerate*blockalign
wfx.cbSize = (WORD)0;
pSourceVoice = nullptr;
pXAudio2->CreateSourceVoice(&pSourceVoice, &wfx);
tt = 0, p = 1000, ampl = 10000;
pow16[0] = 16;
pow16[1] = 4096;
frame = 0;
return true;
}
// Entry point: initialise audio, announce start, then generate frames for as
// long as loop() keeps returning true.
int main() {
    if (init()) {
        cout << "start";
        for (;;) {
            if (!loop()) {
                break;
            }
        }
        return 0;
    }
    return 1;
}
The line before the for-loop in loop() is to make sure that the first element nicely attaches itself to the last element of the previous iteration.
To make sure that w doesn't go over ampl or under -ampl, I have added a couple lines that make them bounce back, and I make it output "upper edge" or "lower edge" respectively so that you know when this is happening. As you notice, the clicking also happens when the w is not near the edges.
As a test to make sure it isn't because of XAudio2 being implemented wrongly, you can comment the first line in loop() that defines the first element of w; make the for-loop (in the next line) start from 0; comment the lines that create the brown noise; and uncomment one of the two lines after that: the first line to hear a sine wave sound, the second line to hear a square wave sound (both with a frequency of about 44100/1000 = 44.1 Hz, which you can change around by changing how p is initialized in init()). You will (hopefully) hear a clean sine/square wave sound.
So what is going wrong?

You have two issues in your code:
You only have a single buffer, so it is nearly impossible to submit a new buffer quickly enough after the previous one stops playing to avoid a gap between buffers. You are also modifying the buffer data while it is being played, which will corrupt the output. You should use multiple buffers. With enough buffers you can also add short sleeps to the while loop that checks BuffersQueued, which reduces CPU usage.
You never set pDataBuffer[0] or pDataBuffer[1] so they will always be 0.
This code works:
#include <xaudio2.h>
#include <array>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <thread>
using namespace std;
// Full turn in radians (2*pi); only referenced by the commented-out sine test in loop().
#define PI2 6.28318530717958647692f
// Number of samples produced per call to loop(); with the 44100 Hz rate set in init() that is 0.05 s.
#define l 2205 //0.05 seconds
bool init();
bool loop();
// Random source for the walk: each step is drawn uniformly from [-0.01, 0.01) and scaled by ampl in loop().
random_device rd;
mt19937 gen(rd());
uniform_real_distribution<> dis(-.01, .01);
// XAudio2 objects created in init().
IXAudio2MasteringVoice* pMasterVoice;
IXAudio2* pXAudio2;
IXAudio2SourceVoice* pSourceVoice;
// Number of buffer slots. loop() selects a slot with `frame & (bufferCount - 1)`,
// which only cycles through every slot when this is a power of two.
const size_t bufferCount = 64;
// One XAudio2 descriptor per slot.
std::array<XAUDIO2_BUFFER, bufferCount> buffers;
WAVEFORMATEX wfx;
XAUDIO2_VOICE_STATE state;
// One PCM byte buffer per slot: two bytes per sample.
std::array<std::array<BYTE,2 * l>, bufferCount> pDataBuffers;
// Scratch bytes and divisors used by the integer-to-bytes conversion in loop().
BYTE bytw[2];
int pow16[2];
// Current frame of the random walk, one float per sample.
float w[l];
// frame counts calls to loop(); p is the period (in samples) used by the commented-out test waveforms.
int frame, p;
// tt is the running sample index as a float; ampl is the amplitude bound of the walk (set in init()).
float tt, ampl;
bool loop() {
float prevW = w[l - 1];
auto& pDataBuffer = pDataBuffers[frame & (bufferCount-1)];
auto& buffer = buffers[frame & (bufferCount - 1)];
for (int t = 0; t < l; t++) {
tt = (float)(t + frame * l); //total time
w[t] = prevW + dis(gen) * ampl;
if (w[t] > ampl) {
//cout << "upper edge ";
w[t] = ampl - fmod(w[t], ampl);
}
if (w[t] < -ampl) {
//cout << "lower edge ";
w[t] = -fmod(w[t], ampl) - ampl;
}
//w[t] = sin(PI2*tt/p)*ampl;
//w[t] = (fmod(tt/p, 1) < .5 ? ampl : -ampl)*(.5f - 2.f*fmod(tt/p, .5f));
prevW = w[t];
int intw = (int)w[t];
if (intw < 0) {
intw += 65535;
}
bytw[0] = 0; bytw[1] = 0;
for (int k = 1; k >= 0; k--) {
//turn integer into a little endian byte array
bytw[k] += (BYTE)(16 * (intw / pow16[k]));
intw -= bytw[k] * (pow16[k] / 16);
bytw[k] += (BYTE)(intw / (pow16[k] / 16));
intw -= (intw / (pow16[k] / 16)) * pow16[k] / 16;
}
pDataBuffer[2 * t] = bytw[0];
pDataBuffer[2 * t + 1] = bytw[1];
}
//cout << endl << endl;
if (frame > 1) {
//wait until the current one is done playing
while (pSourceVoice->GetState(&state), state.BuffersQueued > 1) { std::this_thread::sleep_for(std::chrono::milliseconds(1); }
}
buffer.AudioBytes = 2 * l; //number of bytes per buffer
buffer.pAudioData = pDataBuffer.data();
buffer.Flags = 0;
pSourceVoice->SubmitSourceBuffer(&buffer);
if (frame == 1) {
pSourceVoice->Start(0, 0);
}
frame++;
return true;
}
bool init() {
CoInitializeEx(nullptr, COINIT_MULTITHREADED);
pXAudio2 = nullptr;
XAudio2Create(&pXAudio2, 0, XAUDIO2_DEFAULT_PROCESSOR);
pMasterVoice = nullptr;
pXAudio2->CreateMasteringVoice(&pMasterVoice);
wfx = { 0 };
wfx.wFormatTag = WAVE_FORMAT_PCM;
wfx.nChannels = (WORD)1; //mono
wfx.nSamplesPerSec = (DWORD)44100; //samplerate
wfx.wBitsPerSample = (WORD)16; //16 bit (signed)
wfx.nBlockAlign = (WORD)2; //2 bytes per sample
wfx.nAvgBytesPerSec = (DWORD)88200; //samplerate*blockalign
wfx.cbSize = (WORD)0;
pSourceVoice = nullptr;
pXAudio2->CreateSourceVoice(&pSourceVoice, &wfx);
tt = 0, p = 1000, ampl = 10000;
pow16[0] = 16;
pow16[1] = 4096;
frame = 0;
return true;
}
// Entry point: initialise audio, then keep generating frames for as long as
// loop() returns true.
int main() {
    const bool ready = init();
    if (!ready) {
        return 1;
    }
    for (;;) {
        if (!loop()) {
            break;
        }
    }
    return 0;
}
I haven't tried to follow all of your logic but it seems over complicated and could definitely be simplified.
The massive use of global variables is also not a great way to write a program. You should move variables inside the functions where possible, otherwise either pass them to the function as arguments or use a class to hold the state.

Related

How to manipulate audio data buffers correctly?

I have implemented recording and playing back audio from a microphone in C++. The next step is to process the audio data for speech recognition. For this I want to write them to large buffers so that there are no word breaks. To do this, I implemented copying to large buffers using the memcpy function. Unfortunately, it doesn't work because only part of words can be recognized. What is my mistake and can this buffer manipulation be done in a more convenient way?
My code:
#include <stdio.h>
#include <Windows.h>
#include <mmsystem.h>
#include <iostream>
#include <fstream>
using namespace std;
// Link against the Windows multimedia library that provides waveIn*/waveOut*.
#pragma comment(lib, "winmm.lib")
// Sample rate in Hz (assigned to nSamplesPerSec in main()).
#define Samples 16000
// Not referenced in the visible code.
#define NUM_FRAMES Samples*2
// Channel count (assigned to nChannels in main()).
#define Channels 1
// Number of WAVEHDR capture buffers cycled through in main().
const int NUM_BUF = 4;
int main()
{
HWAVEIN inStream;
HWAVEOUT outStream;
WAVEFORMATEX waveFormat;
WAVEHDR buffer[NUM_BUF];
waveFormat.cbSize = 0;
waveFormat.wFormatTag = WAVE_FORMAT_PCM;
waveFormat.nChannels = Channels;
waveFormat.nSamplesPerSec = Samples;
waveFormat.wBitsPerSample = 16;
waveFormat.nBlockAlign = waveFormat.nChannels * waveFormat.wBitsPerSample / 8;
waveFormat.nAvgBytesPerSec = waveFormat.nBlockAlign * waveFormat.nSamplesPerSec;
HANDLE event = CreateEventA(NULL, TRUE, FALSE, "waveout event");
MMRESULT res = MMSYSERR_NOERROR;
res = waveInOpen(&inStream, WAVE_MAPPER, &waveFormat, (unsigned long)event, 0, CALLBACK_EVENT);
if (res != MMSYSERR_NOERROR) {
printf("error in waveInOpen\n");
return -1;
}
res = waveOutOpen(&outStream, WAVE_MAPPER, &waveFormat, (unsigned long)event, 0, CALLBACK_EVENT);
if (res != MMSYSERR_NOERROR) {
printf("error in waveOutOpen\n");
return -2;
}
short int *_pBuf;
size_t bpbuff = 16000*2;
_pBuf = new short int [bpbuff * NUM_BUF];
for ( int i = 0; i < NUM_BUF; i++ )
{
buffer[i].lpData = (LPSTR)&_pBuf [i * bpbuff];
buffer[i].dwBufferLength = bpbuff*sizeof(*_pBuf);
buffer[i].dwFlags = 0L;
buffer[i].dwLoops = 0L;
waveInPrepareHeader(inStream, & buffer[i], sizeof(WAVEHDR));
}
ResetEvent(event);
for (int index = 0; index < NUM_BUF; index++) // queue all buffers for input
waveInAddBuffer(inStream, &buffer[index], sizeof(WAVEHDR));
waveInStart(inStream);
int len_buff = buffer[0].dwBufferLength*6 + 1;
int limit_buff = buffer[0].dwBufferLength*5 + 1;
int size = buffer[0].dwBufferLength;
int rl = 0;
int flagg = 0;
char * buff1 = new char[len_buff];
char * buff2 = new char[len_buff];
int flag_buf = 0;
int flag1 = 0, flag2 = 0;
int i = 0;
int inIndex = 0, outIndex = 0; // the next input and output to watch
while (true) {
if (buffer[inIndex].dwFlags & WHDR_DONE & flagg!=1)
{
flagg = 1;
waveInAddBuffer(inStream, &buffer[inIndex], sizeof(WAVEHDR));
inIndex = (inIndex + 1) % NUM_BUF;
}
if (buffer[outIndex].dwFlags & WHDR_DONE & flagg!=0) {
flagg = 0;
if (flag_buf == 0)
{
if (rl<limit_buff)
{
cout << rl << endl;
if (flag1 == 0)
{
//strcpy(buff1, buffer[outIndex].lpData);
memcpy(buff1, buffer[outIndex].lpData, size);
flag1 = 1;
rl = size + 1;
}
else
{
//strcat(buff1, buffer[outIndex].lpData);
memcpy(buff1 + rl, buffer[outIndex].lpData, size);
rl = rl + size;
}
}
else
{
//recognize buff1
flag_buf = 1;
flag1 = 0;
rl = 0;
}
}
else
{
if (rl<limit_buff)
{
if (flag2 == 0)
{
memcpy(buff2, buffer[outIndex].lpData, size);
flag2 = 1;
rl = size + 1;
}
else
{
memcpy(buff2 + rl, buffer[outIndex].lpData, size);
rl = rl + size;
}
}
else
{
//recognize buff2
flag_buf = 0;
flag2 = 0;
rl = 0;
}
}
waveOutWrite(outStream, &buffer[outIndex], sizeof(WAVEHDR));
outIndex = (outIndex + 1) % NUM_BUF;
printf("N_buff_%i %i\n",outIndex , i);
i++;
}
}
for (int index = 0; index < 4; index++)
waveInUnprepareHeader(inStream, &buffer[inIndex], sizeof(WAVEHDR));
free(buffer);
}

Half-precision PyTorch float tensors have same performance as single precision float tensors?

Background: I've implemented the antiobject/"field AI" pattern (https://home.cs.colorado.edu/~ralex/papers/PDF/OOPSLA06antiobjects.pdf) for single diffusion using LibTorch/PyTorch.
This works fine, but in the process of running it on the GPU and optimizing it, I've run into a problem. I have a Titan V, which I believe excels at half-precision float math. However, when I make the tensors torch::kHalf, the performance is the same. (I've also tried torch::kFloat16). Any ideas?
The code that I timed is in update():
#define SDL_MAIN_HANDLED
#include <simple2d.h>
#include <torch/torch.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAEvent.h>
#include <math.h>
#include <chrono>
// When defined, render() skips every S2D_DrawQuad call (see the #ifndef guards there).
#define DEBUG_NO_DRAW
torch::Device gpu(torch::kCUDA);
torch::Device cpu(torch::kCPU);
// Device on which all simulation tensors below are created.
torch::Device device = gpu;
// Window edge length passed to S2D_CreateWindow and used as the quad extent.
const int windowLength = 1000;
// Grid edge length: every tensor below is length x length.
const int64_t length = 500;
// Factor applied to the summed neighbour differences in update().
const float diffusionRate = 0.25;
// Number of random cells main() tries to mark as obstacles / entities.
const int obstacleCount = 4000;
const int entityCount = 1000;
// Edge length of one drawn cell. Note: both operands are integers, so the
// division is performed in integer arithmetic before conversion to float.
float cellLength = windowLength / length;
// Scent field, created as zeros and converted to half precision.
torch::Tensor scent = torch::zeros({ length, length }, device).to(torch::kHalf);
// Shifted copies of the scent field, recomputed in update().
torch::Tensor up, down, left, right;
// Masks of ones; main() zeroes one border row/column in each so values rolled
// around the edge by update() are multiplied by zero.
torch::Tensor topWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor bottomWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor leftWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor rightWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
// 1 for free cells, 0 where main() placed an obstacle.
torch::Tensor obstaclesMask = torch::ones({ length, length }, device).to(torch::kHalf);
// 1 where main() placed an entity; update() takes max(scent, entities).
torch::Tensor entities = torch::zeros({ length, length }, device).to(torch::kHalf);
// Stream that update() synchronises on before and after the timed section.
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
// Timestamps for the once-per-interval statistics printed in render();
// lastFpsUpdate is allocated lazily on the first render() call.
std::time_t *lastFpsUpdate = NULL;
std::time_t *currentTime = new std::time_t();
// Frames rendered since the last statistics print.
int frameAccumulator = 0;
// Duration of each update() call in microseconds since the last print.
std::vector<long> updateDurations;
void update() {
torch::NoGradGuard no_grad;
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto startTime = std::chrono::high_resolution_clock::now();
down = scent.roll(1, 0) * obstaclesMask * topWallMask;
up = scent.roll(-1, 0) * obstaclesMask * bottomWallMask;
right = scent.roll(1, 1) * obstaclesMask * leftWallMask;
left = scent.roll(-1, 1) * obstaclesMask * rightWallMask;
scent = scent + ((down - scent) + (up - scent) + (right - scent) + (left - scent)) * diffusionRate;
scent = torch::max(scent, entities);
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
updateDurations.push_back(duration.count());
}
// Draws the grid (unless DEBUG_NO_DRAW is defined) and prints frame-rate and
// average update() duration statistics roughly once per second.
//
// Fixes relative to the previous version:
//  - std::time has one-second resolution, so "difftime > 1.0" only became true
//    after two seconds and the printed "FPS" covered a two-second window; the
//    test is now ">= 1.0";
//  - the average is only computed when at least one update was recorded,
//    avoiding a division by zero.
void render() {
    if (lastFpsUpdate == NULL) {
        lastFpsUpdate = new std::time_t();
        std::time(lastFpsUpdate);
    }

    // Copy the data needed for drawing to the CPU as float so it can be read
    // through accessors.
    torch::Tensor sqrtedScent = scent.sqrt().to(torch::kFloat).to(cpu); // just to make darker scents a little brighter for display
    auto obstaclesMaskCPU = obstaclesMask.to(torch::kFloat).to(cpu);
    auto sqrtedScentAccessor = sqrtedScent.accessor<float, 2>();
    auto obstaclesMaskAccessor = obstaclesMaskCPU.accessor<float, 2>();
    float r = 0, g = 0, b = 0, a = 0;

#ifndef DEBUG_NO_DRAW
    // Black background covering the whole window.
    S2D_DrawQuad(
        0, 0, 0, 0, 0, 1,
        windowLength, 0, 0, 0, 0, 1,
        windowLength, windowLength, 0, 0, 0, 1,
        0, windowLength, 0, 0, 0, 1);
#endif

    for (int i = 0; i < length; i++) {
        for(int j = 0; j < length; j++) {
            if (obstaclesMaskAccessor[i][j] == 0) {
                // Obstacle cell: opaque white.
                r = 1; g = 1; b = 1; a = 1;
            }
            else {
                // Free cell: red, with alpha taken from the scent value.
                r = 1; g = 0; b = 0; a = sqrtedScentAccessor[i][j];
            }
#ifndef DEBUG_NO_DRAW
            S2D_DrawQuad(cellLength * j, cellLength * i, r, g, b, a,
                cellLength * (j + 1), cellLength * i, r, g, b, a,
                cellLength * (j + 1), cellLength * (i + 1), r, g, b, a,
                cellLength * j, cellLength * (i + 1), r, g, b, a);
#endif
        }
    }

    frameAccumulator++;
    std::time(currentTime);
    if (std::difftime(*currentTime, *lastFpsUpdate) >= 1.0) {
        std::cout << "FPS: " << frameAccumulator << std::endl;
        frameAccumulator = 0;
        *lastFpsUpdate = *currentTime;

        if (!updateDurations.empty()) {
            long totalUpdateTime = 0;
            for (long duration : updateDurations) {
                totalUpdateTime += duration;
            }
            long averageUpdateTime = totalUpdateTime / static_cast<long>(updateDurations.size());
            std::cout << "AverageUpdateTime: " << averageUpdateTime << "us" << std::endl;
            updateDurations.clear();
        }
    }
}
// Sets up the wall masks, scatters random obstacles and entities over the
// grid, then opens the Simple2D window, which drives update() and render().
int main() {
    const bool cudaReady = torch::cuda::is_available();
    if (cudaReady) {
        std::cout << "CUDA is available!" << std::endl;
    }
    const char* deviceName = (device == cpu ? "CPU" : "GPU");
    std::cout << "Using " << deviceName << std::endl;

    // Zero one border line in each wall mask.
    const int lastIndex = length - 1;
    for (int k = 0; k < length; ++k) {
        topWallMask[0][k] = 0;
        bottomWallMask[lastIndex][k] = 0;
        leftWallMask[k][0] = 0;
        rightWallMask[k][lastIndex] = 0;
    }

    // Obstacles: random cells set to 0 in the obstacle mask.
    int placed = 0;
    while (placed < obstacleCount) {
        int row = rand() % length;
        int col = rand() % length;
        obstaclesMask[row][col] = 0;
        ++placed;
    }
    //std::cout << obstaclesMask << std::endl;

    // Entities: random cells set to 1, skipping cells that hold an obstacle.
    for (int attempt = 0; attempt < entityCount; ++attempt) {
        int row = rand() % length;
        int col = rand() % length;
        if (!(obstaclesMask[row][col].item() == 0)) {
            entities[row][col] = 1;
        }
    }

    S2D_Window* window = S2D_CreateWindow(
        "Collab Diffuse", windowLength, windowLength, update, render, 0
    );
    S2D_Show(window);
    return 0;
}
In both single precision and half precision versions of the code, update() takes about 2700 microseconds.
I'm using PyTorch/LibTorch 1.7.1.
Any other performance tips would be appreciated. (I'm aware drawing pixel by pixel is very slow, so I plan to switch from Simple2D to something else that can draw bitmaps from memory).

Arduino RTC subtracting 1 second every 8 hours

I am using the integrated RTC of an Arduino MKR 1300 as an alarm. The alarm sets a flag (an integer used as a boolean) that tells loop() to run a measurement method every minute and then send some data every 5 minutes. The program runs as an active loop, but the method that sends data ONLY WORKS if it is called from inside loop() (I have no idea why). The problem is that the RTC appears to lose about 1 second every 8 hours, so after a few days the timing drifts: instead of sending data at xx:10:xx and xx:15:xx, it might send at xx:09:xx and xx:14:xx.
Here's the code:
#include <EmonLib.h>
#include <RTCZero.h>
#include <MKRWAN.h>
// One current sensor per channel; configured in setup().
EnergyMonitor emon1;
EnergyMonitor emon2;
EnergyMonitor emon3;
RTCZero rtc;
LoRaModem modem;
// OTAA credentials passed to modem.joinOTAA() in setup().
String appEui = "1234567891011121";
String appKey = "ffffffffffffffffffffffffffffffff";
/* INITIAL_TIME */
const byte seconds = 0;
const byte minutes = 0;
const byte hours = 0;
const byte day = 17;
const byte month = 12;
const byte year = 18;
// Time of the next alarm; advanced by setAlarm().
byte second_alarm = 0;
byte minute_alarm = 0;
byte hour_alarm = 0;
// Seconds between two alarms.
byte INTERVAL = 60;
// Number of measurements between two transmissions.
int SEND_LOOP = 5;
// Accumulated reading reported in the next message; reset after a successful send.
int totalKW;
// Measurements taken since the last transmission attempt.
int counter= 0;
// Set to 1 by triggerAlarm(), which is registered as the RTC interrupt
// callback, and polled in loop(). It must be volatile so the compiler re-reads
// it on every pass instead of caching the value.
volatile int alarm_Triggered = 0;
// One-time initialisation: serial port, LoRa modem join, current sensors and
// the RTC alarm. Halts forever if the modem cannot be started or joined.
void setup()
{
Serial.begin(115200);
if (!modem.begin(EU868)) {
Serial.println("Failed to start module");
// Nothing useful can be done without the modem: stop here.
while (1) {}
};
Serial.print("Your module version is: ");
Serial.println(modem.version());
Serial.print("Your device EUI is: ");
Serial.println(modem.deviceEUI());
Serial.println("Connecting");
int connected = modem.joinOTAA(appEui, appKey);
if (!connected) {
Serial.println("Something went wrong; are you indoor? Move near a window and retry");
while (1) {}
}
Serial.println("Connected");
// Set poll interval to 60 secs.
modem.minPollInterval(60);
analogReadResolution(9);
// One sensor per analog input 1..3, all with the same calibration value.
emon1.current(1, 53);
emon2.current(2, 53);
emon3.current(3, 53);
counter= 0;
rtc.begin(); // initialize RTC
// Arm the alarm at the initial alarm time and register the callback.
// Note: the alarm is enabled before the clock itself is set below.
rtc.setAlarmTime(hour_alarm, minute_alarm, second_alarm);
rtc.enableAlarm(rtc.MATCH_HHMMSS);
rtc.attachInterrupt(triggerAlarm);
// Set the time
rtc.setHours(hours);
rtc.setMinutes(minutes);
rtc.setSeconds(seconds);
// Set the date
rtc.setDay(day);
rtc.setMonth(month);
rtc.setYear(year);
}
// Main loop: runs one measurement cycle each time the RTC alarm has fired,
// then clears the flag.
void loop() {
    if (alarm_Triggered != 1) {
        return;
    }
    dataMonitor();
    alarm_Triggered = 0;
}
// Takes one measurement on all three sensors, adds the result to totalKW,
// lets sendDataChecker() decide whether to transmit, and schedules the next
// alarm.
//
// Changes relative to the previous version:
//  - the two unused locals were removed;
//  - the integer division by 1000 discarded everything below 1000 W on every
//    sample; the remainder is now carried over to the next sample so it is
//    not lost.
void dataMonitor() {
    // Watts not yet accounted for in totalKW (always below 1000).
    static int wattRemainder = 0;

    // Readings below 0.3 are treated as zero.
    double Irms1 = emon1.calcIrms(600);
    if (Irms1 < 0.3) Irms1 = 0;
    double Watt1 = Irms1 * 230;

    double Irms2 = emon2.calcIrms(600);
    if (Irms2 < 0.3) Irms2 = 0;
    double Watt2 = Irms2 * 230;

    double Irms3 = emon3.calcIrms(600);
    if (Irms3 < 0.3) Irms3 = 0;
    double Watt3 = Irms3 * 230;

    int totalWatt = Watt1 + Watt2 + Watt3;
    wattRemainder += totalWatt;
    totalKW = totalKW + wattRemainder / 1000;
    wattRemainder %= 1000;

    Serial.println(counter);
    sendDataChecker(Irms1, Irms2, Irms3);
    setAlarm();
    counter= counter+ 1;
}
// Once SEND_LOOP measurements have been taken, sends the accumulated totalKW
// and the summed current as a JSON message. The counter is reset after every
// attempt; totalKW is only reset when the modem reports success.
void sendDataChecker(double Irms1, double Irms2, double Irms3) {
    if (counter != SEND_LOOP) {
        return;
    }

    const double IrmsTotal = Irms1 + Irms2 + Irms3;
    String msg = "{\"id\":\"avac_aud2\",\"kW\":" + String(totalKW) + ", \"current\":" + String(IrmsTotal) + "}";

    Serial.println("Ready to Send");
    modem.beginPacket();
    modem.print(msg);
    const int err = modem.endPacket(true);
    Serial.println("Sent1");

    const bool delivered = err > 0;
    if (delivered) {
        // Message sent correctly: start a fresh accumulation.
        Serial.println("Sent");
        totalKW = 0;
    } else {
        Serial.println("ERR");
    }
    counter = 0;
}
// Advances the alarm time by INTERVAL seconds and programs it into the RTC.
//
// The arithmetic is done in unsigned int so the byte-sized fields cannot
// overflow, and the carry keeps the remainder. The previous version reset the
// seconds to 0 on every carry, which was only correct when INTERVAL divided 60
// evenly. Hours wrap around at 24.
void setAlarm() {
    const unsigned int totalSeconds = (unsigned int)second_alarm + INTERVAL;
    const unsigned int totalMinutes = (unsigned int)minute_alarm + totalSeconds / 60;
    const unsigned int totalHours = (unsigned int)hour_alarm + totalMinutes / 60;

    second_alarm = totalSeconds % 60;
    minute_alarm = totalMinutes % 60;
    hour_alarm = totalHours % 24;

    rtc.setAlarmTime(hour_alarm, minute_alarm, second_alarm);
}
// RTC alarm callback (registered with rtc.attachInterrupt in setup()).
// Only raises the flag; the actual work is done from loop().
void triggerAlarm() {
alarm_Triggered = 1;
}

Saving FFT Spectrum in Fmod Studio C++

I'm trying to read the spectrum from my FMOD_DSP_PARAMETER_FFT, but the spectrum I receive is always full of zeros. I would be grateful if you could spot my mistake. I suspect I'm not connecting the DSP to the channel correctly, or something similar, because I can't find the error in the code.
My code now is like this:
// Fragment (no enclosing function is shown): creates an FMOD system, starts a
// looping sound, attaches an FFT DSP to the master channel group and then
// polls the spectrum in an endless loop. No result code is checked.
FMOD::System *system;
FMOD::Sound *sound1;
FMOD::Channel *channel = 0;
FMOD::ChannelGroup *mastergroup;
FMOD::ChannelControl *control;
FMOD::DSP *mydsp, *dsphead, *dspchannelmixer;
FMOD::DSPConnection *conection;
FMOD_RESULT result;
unsigned int version;
result = FMOD::System_Create(&system);
result = system->getVersion(&version);
result = system->init(32, FMOD_INIT_NORMAL, NULL);
result = system->createSound("MySong.mp3",FMOD_DEFAULT, 0, &sound1);
result = sound1->setMode(FMOD_LOOP_NORMAL);
// The sound is started paused (third argument); it is unpaused inside the loop.
result = system->playSound(sound1, 0, true, &channel);
/*
Create the DSP effect.
*/
result = system->getMasterChannelGroup(&mastergroup);
result = system->createDSPByType(FMOD_DSP_TYPE_FFT, &mydsp);
// (The master channel group is fetched a second time here.)
result = system->getMasterChannelGroup(&mastergroup);
result = mastergroup->addDSP(0, mydsp);
// NOTE: the DSP is put into bypass here.
result = mydsp->setBypass(true);
result = mydsp->setActive(true);
char s[256];
unsigned int len;
float freq[32];
float fft = 0;
std::vector<float> fftheights;
float m_spectrum_data[FFT_NUM_BINS];
while (1) { //program loop
unsigned int ms = 0;
unsigned int lenms = 0;
bool playing = 0;
bool paused = 0;
int channelsplaying = 0;
if (channel)
{
// Unpause/unmute the channel and query its playback state on every pass.
FMOD::Sound *currentsound = 0;
result = channel->setPaused(false);
result = channel->setMute(false);
result = channel->isPlaying(&playing);
result = channel->getPaused(&paused);
result = channel->setVolume(0.5);
result = channel->getPosition(&ms, FMOD_TIMEUNIT_MS);
channel->getCurrentSound(&currentsound);
if (currentsound)
{
result = currentsound->getLength(&lenms, FMOD_TIMEUNIT_MS);
}
}
system->getChannelsPlaying(&channelsplaying);
FMOD_DSP_PARAMETER_FFT *fftparameter;
float val;
// These three declarations shadow the variables of the same name declared
// before the loop.
char s[256];
unsigned int len;
float *data = 0;
float freq[32];
int rate, chan, nyquist;
int windowsize = 1024;
result = system->getSoftwareFormat(&rate, 0, 0);
// The FFT window type and size are (re)applied on every pass.
result = mydsp->setParameterInt(FMOD_DSP_FFT_WINDOWTYPE, FMOD_DSP_FFT_WINDOW_TRIANGLE);
result = mydsp->setParameterInt(FMOD_DSP_FFT_WINDOWSIZE, windowsize);
result = mydsp->getParameterFloat(FMOD_DSP_FFT_DOMINANT_FREQ, &val, 0, 0);
result = mydsp->getParameterData(FMOD_DSP_FFT_SPECTRUMDATA, (void **)&fftparameter, &len, s, 256);
nyquist = windowsize / 2;
// Per channel: magnitude-weighted average frequency of the bins above the cutoff.
for (chan = 0; chan < 2; chan++)
{
float average = 0.0f;
float power = 0.0f;
for (int i = 0; i < nyquist - 1; ++i)
{
float hz = i * (rate * 0.5f) / (nyquist - 1);
int index = i + (16384 * chan);
if (fftparameter->spectrum[chan][i] > 0.0001f) // arbitrary cutoff to filter out noise
{
// NOTE(review): `data` is initialised to 0 above and never assigned, so
// these reads go through a null pointer; the cutoff test reads
// fftparameter->spectrum instead.
average += data[index] * hz;
power += data[index];
}
}
if (power > 0.001f)
{
freq[chan] = average / power;
}
else
{
freq[chan] = 0;
}
}
printf("\ndom freq = %d : %.02f %.02f\n", (int)val, freq[0], freq[1]);
}
My fftparameter->spectrum is always an array of zero values...
Is it possible to connect it without modifying the sound that is playing?
Thank you.
There are a few standout issues in your code example.
The FFT DSP has been bypassed with result = mydsp->setBypass(true); causing it to not process.
There are no calls to System::update in the main loop.
The main loop has no sleep so it will spin as fast as possible.
I think your main issue is probably the setBypass call, use setBypass(false).

Conceal packet loss in PCM stream

I am looking to use 'Packet Loss Concealment' to conceal lost PCM frames in an audio stream. Unfortunately, I cannot find a library that is accessible without all the licensing restrictions and code bloat (...up for some suggestions though).
I have located some GPL code written by Steve Underwood for the Asterisk project which implements PLC. There are several limitations; although, as Steve suggests in his code, his algorithm can be applied to different streams with a bit of work. Currently, the code works with 8kHz 16-bit signed mono streams.
Variations of the code can be found through a simple search of Google Code Search.
My hope is that I can adapt the code to work with other streams. Initially, the goal is to adjust the algorithm for 8+ kHz, 16-bit signed, multichannel audio (all in a C++ environment). Eventually, I'm looking to make the code available under the GPL license in hopes that it could be of benefit to others...
Attached below is the code with my efforts. The code includes a main function that will "drop" a number of frames with a given probability. Unfortunately, the code does not quite work as expected. I'm receiving EXC_BAD_ACCESS when running in gdb, but I don't get a trace from gdb when using the 'bt' command. Clearly, I'm trampling on memory somewhere, but I'm not sure exactly where. When I comment out the amdf_pitch function, the code runs without crashing...
// Test driver: reads a 16-bit PCM file one sample at a time, "loses" each
// sample with the given probability, and writes two outputs:
//   - the lossy stream (lost samples replaced by silence), and
//   - the repaired stream (lost samples synthesised by the concealer).
//
// Fixes relative to the previous version:
//  - all files are opened in binary mode (text mode corrupts PCM data on
//    Windows);
//  - the loop is driven by the success of read(), so a failed or short final
//    read is no longer processed as if it were a sample;
//  - samples are held in a properly aligned short instead of casting a
//    char[2] to int16_t*;
//  - a received sample is passed through Receive() before it is written to the
//    repaired file, because Receive() may modify it to blend with preceding
//    synthetic audio;
//  - the error message for the lossy output file now names the right file.
//
// Returns 1 if any file cannot be opened, 0 otherwise.
int main (int argc, char *argv[])
{
    std::ifstream fin("C:\\cc32kHz.pcm", std::ios::binary);
    if(!fin.is_open())
    {
        std::cout << "Failed to open input file" << std::endl;
        return 1;
    }
    std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm", std::ios::binary);
    if(!fout_repaired.is_open())
    {
        std::cout << "Failed to open output repaired file" << std::endl;
        return 1;
    }
    std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm", std::ios::binary);
    if(!fout_lossy.is_open())
    {
        std::cout << "Failed to open output lossy file" << std::endl;
        return 1;
    }

    audio::PcmConcealer Concealer;
    Concealer.Init(1, 16, 32000);

    // Seed the generator used to decide which samples are dropped.
    srand( time(NULL) );
    const int probability = 5; // percent of samples to drop

    short sample = 0;
    while(fin.read(reinterpret_cast<char *>(&sample), sizeof(sample)))
    {
        // Random number in 1..100.
        const int value = rand() % 100 + 1;
        if(value <= probability)
        {
            // Lost sample: silence in the lossy stream, synthetic fill-in in
            // the repaired stream.
            short blank = 0;
            fout_lossy.write(reinterpret_cast<const char *>(&blank), sizeof(blank));
            Concealer.Fill(&blank, 1);
            fout_repaired.write(reinterpret_cast<const char *>(&blank), sizeof(blank));
        }
        else
        {
            // Good sample: the lossy stream gets it untouched; the repaired
            // stream gets it after the concealer has seen (and possibly
            // smoothed) it.
            fout_lossy.write(reinterpret_cast<const char *>(&sample), sizeof(sample));
            Concealer.Receive(&sample, 1);
            fout_repaired.write(reinterpret_cast<const char *>(&sample), sizeof(sample));
        }
    }

    fin.close();
    fout_repaired.close();
    fout_lossy.close();
    return 0;
}
PcmConcealer.hpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average mean difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of synthetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a synthetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz), expressed as a period in samples at the
given sample rate. Evaluates to a double; callers truncate it to int. */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz), expressed as a period in samples at the
given sample rate. */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window (disabled; PcmConcealer::Init computes
plc_pitch_overlap_max at run time instead) */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms),
in samples at the given sample rate */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at least 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment. (Disabled; PcmConcealer::Init computes
plc_history_len at run time instead.) */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
/*! Per-channel concealment state. Instances are allocated and zeroed in
PcmConcealer::Init and released in PcmConcealer::Destroy. */
typedef struct
{
/*! Consecutive erased samples */
int missing_samples;
/*! Current offset into pitch period */
int pitch_offset;
/*! Pitch estimate */
int pitch;
/*! Buffer for a cycle of speech (heap array of plc_pitch_min floats) */
float *pitchbuf;//[PLC_PITCH_MIN];
/*! History buffer (heap array of plc_history_len shorts) */
short *history;//[PLC_HISTORY_LEN];
/*! Current pointer into the history buffer */
int buf_ptr;
} plc_state_t;
/*! Packet loss concealer for 16-bit PCM with one state object per channel.
Call Init() once, then Receive() for every good block and Fill() for every
missing block. The object owns raw heap memory through ChannelStates and
declares no copy operations, so it should not be copied. */
class PcmConcealer
{
public:
PcmConcealer();
/*! Releases all channel state via Destroy(). */
~PcmConcealer();
/*! Prepares the concealer; a no-op if already initialised, if channels <= 0
or if bit_depth is not 16. */
void Init(int channels, int bit_depth, int sample_rate);
//Process a block of received audio samples.
int Receive(short amp[], int frames);
//Fill-in a block of missing audio samples.
int Fill(short amp[], int frames);
/*! Frees all channel state and returns the object to the uninitialised state. */
void Destroy();
private:
int amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames);
void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
void normalise_history(plc_state_t *s);
/** Holds the states of each of the channels **/
std::vector< plc_state_t * > ChannelStates;
/* Sizes derived from the sample rate in Init(). */
int plc_pitch_min;
int plc_pitch_max;
int plc_pitch_overlap_max;
int correlation_span;
int plc_history_len;
int channel_count;
int sample_rate;
/* True between a successful Init() and the next Destroy(). */
bool Initialized;
};
}
#endif
PcmConcealer.cpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume in 50ms when we are filling in for missing data. */
/* NOTE(review): 0.0025 per sample reaches zero after 400 samples, which is
   50 ms only at 8 kHz; at other sample rates the fade time differs — confirm
   whether this should scale with sample_rate. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
/* Fallback 16-bit limits for toolchains whose headers do not define them. */
#if !defined(INT16_MAX)
#define INT16_MAX (32767)
#define INT16_MIN (-32767-1)
#endif
#ifdef WIN32
/* Stand-in for rint() used when WIN32 is defined: rounds to the nearest
   whole number by taking the floor of the value shifted up by one half. */
inline double rint(double x)
{
    return floor(0.5 + x);
}
#endif
/* Round a floating point sample to the nearest integer and clamp it to the
   range of a 16-bit PCM sample. */
inline short fsaturate(double damp)
{
    const bool above = damp > 32767.0;
    const bool below = damp < -32768.0;
    if (!above && !below)
        return (short)rint(damp);
    return above ? INT16_MAX : INT16_MIN;
}
namespace audio
{
/* A freshly constructed concealer does nothing until Init() has succeeded. */
PcmConcealer::PcmConcealer()
{
    Initialized = false;
}
/* Frees the per-channel state by delegating to Destroy(). */
PcmConcealer::~PcmConcealer()
{
    this->Destroy();
}
/*
 * Set up the concealer for `channels` interleaved channels of 16-bit PCM at
 * `sample_rate` Hz and allocate one state per channel.
 *
 * The call is ignored when the object is already initialised, when
 * `channels` or `sample_rate` is not positive, or when `bit_depth` is not 16.
 */
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
    if (Initialized)
        return;
    if (channels <= 0 || bit_depth != 16 || sample_rate <= 0)
        return;

    // Set first so that Destroy() releases whatever has been allocated so far
    // if a later allocation throws.
    Initialized = true;
    channel_count = channels;
    this->sample_rate = sample_rate;

    // All lengths below are in samples and scale with the sample rate.
    plc_pitch_min = (int)PLC_PITCH_MIN(sample_rate);
    plc_pitch_max = (int)PLC_PITCH_MAX(sample_rate);
    plc_pitch_overlap_max = (plc_pitch_min >> 2);
    correlation_span = CORRELATION_SPAN(sample_rate);
    plc_history_len = correlation_span + plc_pitch_min;

    for (int i = 0; i < channel_count; i++)
    {
        plc_state_t *t = new plc_state_t;
        memset(t, 0, sizeof(plc_state_t));
        // Value-initialise both buffers: Fill() runs the pitch search over the
        // whole history even when less than a full history has been received,
        // so the contents must never be indeterminate.
        t->pitchbuf = new float[plc_pitch_min]();
        t->history = new short[plc_history_len]();
        ChannelStates.push_back(t);
    }
}
/*
 * Release every per-channel state allocated by Init() and return the object
 * to the uninitialised condition. Does nothing if Init() has not succeeded.
 */
void PcmConcealer::Destroy()
{
    if (!Initialized)
        return;

    for (size_t k = 0; k < ChannelStates.size(); k++)
    {
        plc_state_t *s = ChannelStates[k];
        if (!s)
            continue;
        // Both buffers come from new[], so they must be released with delete[];
        // deleting a null pointer is a no-op, so no extra checks are needed.
        delete [] s->history;
        delete [] s->pitchbuf;
        delete s;
    }
    ChannelStates.clear();
    Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
//Fill-in a block of missing audio samples.
//`amp` receives `frames` interleaved frames of synthetic audio per channel.
//On the first call of a gap the pitch of the recent history is estimated and
//one pitch cycle is stored; that cycle is then repeated with a linearly
//decaying gain, followed by silence once the gain reaches zero.
//Returns `frames`, or 0 when the concealer is not initialised.
int PcmConcealer::Fill(short amp[], int frames)
{
    if (!Initialized)
        return 0;

    for (size_t k = 0; k < ChannelStates.size(); k++)
    {
        const int channel = (int)k;
        plc_state_t *s = ChannelStates.at(k);
        int i;
        float gain;

        if (s->missing_samples == 0)
        {
            // As the gap in real speech starts we need to assess the last known pitch,
            // and prepare the synthetic data we will use for fill-in
            normalise_history(s);
            s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + plc_history_len - correlation_span - plc_pitch_min, channel, correlation_span);
            // We overlap a 1/4 wavelength
            const int pitch_overlap = s->pitch >> 2;

            // Cook up a single cycle of pitch, using a single cycle of the real signal
            // with 1/4 cycle OLA'ed to make the ends join up nicely
            // The first 3/4 of the cycle is a simple copy
            for (i = 0; i < s->pitch - pitch_overlap; i++)
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];

            // The last 1/4 of the cycle is overlapped with the end of the previous cycle
            float new_step = 1.0/pitch_overlap;
            float new_weight = new_step;
            for ( ; i < s->pitch; i++)
            {
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
                new_weight += new_step;
            }

            // We should now be ready to fill in the gap with repeated, decaying cycles
            // of what is in pitchbuf
            // We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
            // it into the previous real data. To avoid the need to introduce a delay
            // in the stream, reverse the last 1/4 wavelength, and OLA with that.
            gain = 1.0;
            new_step = 1.0/pitch_overlap;
            const float old_step = new_step;
            new_weight = new_step;
            float old_weight = 1.0 - new_step;
            // Also stop at `frames`: the caller's buffer may be shorter than a
            // quarter pitch period.
            for (i = 0; (i < pitch_overlap) && (i < frames); i++)
            {
                const int index = (i * channel_count) + channel;
                amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
                new_weight += new_step;
                old_weight -= old_step;
                if (old_weight < 0.0)
                    old_weight = 0.0;
            }
            s->pitch_offset = i;
        }
        else
        {
            // Continue the fade from where the previous fill-in block left off.
            gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
            i = 0;
        }

        // Repeat the stored cycle while it is still audible.
        for ( ; gain > 0.0 && i < frames; i++)
        {
            const int index = (i * channel_count) + channel;
            amp[index] = s->pitchbuf[s->pitch_offset]*gain;
            gain -= ATTENUATION_INCREMENT;
            if (++s->pitch_offset >= s->pitch)
                s->pitch_offset = 0;
        }

        // Whatever remains of this channel is silence. The interleaved index
        // must be used here too, otherwise other channels' samples are cleared.
        for ( ; i < frames; i++)
        {
            const int index = (i * channel_count) + channel;
            amp[index] = 0;
        }

        s->missing_samples += frames;
        save_history(s, amp, channel, frames);
    }
    return frames;
}
/*
 * Append the samples of one channel from the interleaved block `buf` to that
 * channel's circular history buffer.
 *
 * s             - state of the channel being updated
 * buf           - interleaved samples, `frames` frames long
 * channel_index - position of the channel inside each frame
 * frames        - number of frames in `buf`
 */
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
    const int capacity = plc_history_len;

    if (frames >= capacity)
    {
        /* The block alone fills the history: keep only its newest part,
           stored from the beginning of the buffer. */
        const int first = frames - capacity;
        for (int n = 0; n < capacity; n++)
            s->history[n] = buf[(channel_count * (first + n)) + channel_index];
        s->buf_ptr = 0;
        return;
    }

    if (s->buf_ptr + frames <= capacity)
    {
        /* Everything fits behind the write position. */
        for (int n = 0; n < frames; n++)
            s->history[s->buf_ptr + n] = buf[(channel_count * n) + channel_index];
        s->buf_ptr += frames;
        return;
    }

    /* The block wraps: fill the space up to the end of the buffer, then
       continue from the beginning. */
    const int room = capacity - s->buf_ptr;
    for (int n = 0; n < room; n++)
        s->history[s->buf_ptr + n] = buf[(channel_count * n) + channel_index];

    const int remainder = frames - room;
    for (int n = 0; n < remainder; n++)
        s->history[n] = buf[(channel_count * (room + n)) + channel_index];
    s->buf_ptr = remainder;
}
/*
 * Rotate the circular history buffer so that the oldest sample sits at index
 * 0 and the newest at the end, then reset the write position to 0.
 */
void PcmConcealer::normalise_history(plc_state_t *s)
{
    if (s->buf_ptr == 0)
        return;

    const int head = s->buf_ptr;               // newest samples, stored at the front
    const int tail = plc_history_len - head;   // oldest samples, stored behind them

    // The scratch copy is created only after the early return above and is
    // released automatically, so no path can leak it.
    std::vector<short> tmp(s->history, s->history + head);

    // Source and destination overlap whenever tail > head, so memmove is required.
    memmove(s->history, s->history + head, sizeof(short)*tail);
    memcpy(s->history + tail, &tmp[0], sizeof(short)*head);
    s->buf_ptr = 0;
}
/*
 * Estimate the pitch period of `amp` by finding the lag with the smallest
 * summed absolute difference between the signal and a delayed copy of itself.
 *
 * min_pitch     - largest lag tried (also the result if no lag is tried)
 * max_pitch     - smallest lag tried
 * amp           - samples of ONE channel, stored contiguously; at least
 *                 min_pitch + frames samples are read
 * channel_index - unused: Fill() passes the per-channel history, which
 *                 save_history() has already de-interleaved
 * frames        - number of samples compared for each lag
 *
 * Returns the best lag, in samples.
 */
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames)
{
    (void)channel_index;

    int pitch = min_pitch;
    int min_acc = INT_MAX;
    for (int lag = max_pitch; lag <= min_pitch; lag++)
    {
        int acc = 0;
        // Plain indices: using interleaved indexing here would compare the
        // wrong samples and read past the end of the history buffer.
        for (int j = 0; j < frames; j++)
            acc += abs(amp[lag + j] - amp[j]);
        if (acc < min_acc)
        {
            min_acc = acc;
            pitch = lag;
        }
    }
    return pitch;
}
}
P.S. - I must confess that digital audio is not my forte...
Fixed the problem. The problem lay within the amdf_pitch function. There were some minor bugs elsewhere too (which have been repaired). As a result, the code now runs the testbed, inserting blank blocks with a given probability.
Using Audacity I have studied the raw PCM streams that have been created via the testbed. When a blank set of frames is encountered, smoothing occurs from received to blank as expected; however, when we change from blank to valid/received data, we get clicks because the smoothing doesn't appear to be working during this phase. Any suggestions?
I have attached the updated code:
/*
 * Test bed: reads a mono 16-bit PCM file in blocks of 512 frames, drops about
 * 3% of the blocks at random and writes two files - the lossy stream with the
 * dropped blocks zeroed, and the stream repaired by PcmConcealer.
 */
int main (int argc, char *argv[])
{
    std::ifstream fin("C:\\cc32kHz.pcm", std::ios::binary);
    if(!fin.is_open())
    {
        std::cout << "Failed to open input file" << std::endl;
        return 1;
    }
    std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm", std::ios::binary);
    if(!fout_repaired.is_open())
    {
        std::cout << "Failed to open output repaired file" << std::endl;
        return 1;
    }
    std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm", std::ios::binary);
    if(!fout_lossy.is_open())
    {
        std::cout << "Failed to open output lossy file" << std::endl;
        return 1;
    }

    audio::PcmConcealer Concealer;
    Concealer.Init(1, 16, 32000); //1-channel, 16-bit, 32kHz

    srand( time(NULL) );
    const int block_frames = 512;
    const int probability = 3; //percentage of blocks that are dropped

    // Sample-typed buffers avoid casting byte arrays to a wider type.
    short received[block_frames];
    for (;;)
    {
        // gcount() reports what this read delivered, including a short final
        // block; tellg() cannot be used because it fails once EOF was hit.
        fin.read(reinterpret_cast<char *>(received), sizeof(received));
        const int frames = (int)(fin.gcount() / sizeof(short));
        if (frames == 0)
            break;
        const std::streamsize bytes = frames * (int)sizeof(short);

        const int value = rand() % 100 + 1;
        if (value <= probability)
        {
            // Block lost: the lossy stream gets silence, the repaired stream
            // gets the synthetic fill-in.
            short concealed[block_frames] = {0};
            fout_lossy.write(reinterpret_cast<const char *>(concealed), bytes);
            Concealer.Fill(concealed, frames);
            fout_repaired.write(reinterpret_cast<const char *>(concealed), bytes);
        }
        else
        {
            fout_lossy.write(reinterpret_cast<const char *>(received), bytes);
            // Receive() smooths the start of the block in place after a gap,
            // so the repaired stream must be written AFTER the call; writing
            // it first drops the smoothing and leaves a click in the output.
            Concealer.Receive(received, frames);
            fout_repaired.write(reinterpret_cast<const char *>(received), bytes);
        }
    }

    fin.close();
    fout_repaired.close();
    fout_lossy.close();
    return 0;
}
PcmConcealer.hpp
/*
* PcmConcealer.hpp
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average magnitude difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of synthetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real and synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a synthetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz) */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz) */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms) */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at least 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment. */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
/*! Concealment state kept for one audio channel.
    The type is left as a plain aggregate because PcmConcealer clears
    instances with memset(). */
struct plc_state_t
{
    /*! Number of consecutive samples that have been synthesised so far */
    int missing_samples;
    /*! Read position inside the stored pitch cycle */
    int pitch_offset;
    /*! Estimated pitch period, in samples */
    int pitch;
    /*! One cycle of speech used for fill-in (plc_pitch_min floats) */
    float *pitchbuf;
    /*! Circular buffer of the most recent samples (plc_history_len shorts) */
    short *history;
    /*! Write position inside the history buffer */
    int buf_ptr;
};
/*!
 * Packet loss concealer for interleaved 16-bit PCM audio.
 *
 * Call Init() once, then Receive() for every block that arrived and Fill()
 * for every block that is missing. Both calls work on the caller's buffer in
 * place and keep a separate state per channel.
 */
class PcmConcealer
{
public:
    PcmConcealer();
    ~PcmConcealer();
    /*! Allocate the per-channel state. Does nothing if already initialised,
        if channels <= 0 or if bit_depth is not 16. */
    void Init(int channels, int bit_depth, int sample_rate);
    //Process a block of received audio samples.
    int Receive(short amp[], int frames);
    //Fill-in a block of missing audio samples.
    int Fill(short amp[], int frames);
    /*! Release everything Init() allocated; Init() may be called again afterwards. */
    void Destroy();
private:
    // Not copyable: the channel states are owned raw pointers that Destroy()
    // releases, so a copied object would delete them a second time.
    // Declared but intentionally left undefined.
    PcmConcealer(const PcmConcealer &);
    PcmConcealer &operator=(const PcmConcealer &);

    // Not declared inline: the definition lives in PcmConcealer.cpp, and an
    // inline function must be defined in every translation unit that uses it.
    int amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames);
    void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
    void normalise_history(plc_state_t *s);
    /** Holds the states of each of the channels **/
    std::vector< plc_state_t * > ChannelStates;
    /*! Longest pitch period searched, in samples */
    int plc_pitch_min;
    /*! Shortest pitch period searched, in samples */
    int plc_pitch_max;
    /*! plc_pitch_min / 4 */
    int plc_pitch_overlap_max;
    /*! Number of samples compared by the pitch search */
    int correlation_span;
    /*! Length of each channel's history buffer, in samples */
    int plc_history_len;
    int channel_count;
    int sample_rate;
    /*! True between a successful Init() and the next Destroy() */
    bool Initialized;
};
}
#endif
PcmConcealer.cpp
/*
* PcmConcealer.cpp
*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume over 400 samples (50ms at 8kHz) when we are filling in for missing data. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifdef WIN32
/* Stand-in for rint() used when WIN32 is defined: rounds to the nearest
   whole number by taking the floor of the value shifted up by one half. */
inline double rint(double x)
{
    return floor(0.5 + x);
}
#endif
/* Round a floating point sample to the nearest integer and clamp it to the
   range of a 16-bit PCM sample. */
inline short fsaturate(double damp)
{
    const bool above = damp > 32767.0;
    const bool below = damp < -32768.0;
    if (!above && !below)
        return (short)rint(damp);
    return above ? INT16_MAX : INT16_MIN;
}
namespace audio
{
/* A freshly constructed concealer does nothing until Init() has succeeded. */
PcmConcealer::PcmConcealer()
{
    Initialized = false;
}
/* Frees the per-channel state by delegating to Destroy(). */
PcmConcealer::~PcmConcealer()
{
    this->Destroy();
}
/*
 * Set up the concealer for `channels` interleaved channels of 16-bit PCM at
 * `sample_rate` Hz and allocate one state per channel.
 *
 * The call is ignored when the object is already initialised, when
 * `channels` or `sample_rate` is not positive, or when `bit_depth` is not 16.
 */
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
    if (Initialized)
        return;
    if (channels <= 0 || bit_depth != 16 || sample_rate <= 0)
        return;

    // Set first so that Destroy() releases whatever has been allocated so far
    // if a later allocation throws.
    Initialized = true;
    channel_count = channels;
    this->sample_rate = sample_rate;

    // All lengths below are in samples and scale with the sample rate.
    plc_pitch_min = (int)PLC_PITCH_MIN(sample_rate);
    plc_pitch_max = (int)PLC_PITCH_MAX(sample_rate);
    plc_pitch_overlap_max = (plc_pitch_min >> 2);
    correlation_span = CORRELATION_SPAN(sample_rate);
    plc_history_len = correlation_span + plc_pitch_min;

    for (int i = 0; i < channel_count; i++)
    {
        plc_state_t *t = new plc_state_t;
        memset(t, 0, sizeof(plc_state_t));
        // Value-initialise both buffers: Fill() runs the pitch search over the
        // whole history even when less than a full history has been received,
        // so the contents must never be indeterminate.
        t->pitchbuf = new float[plc_pitch_min]();
        t->history = new short[plc_history_len]();
        ChannelStates.push_back(t);
    }
}
/*
 * Release every per-channel state allocated by Init() and return the object
 * to the uninitialised condition. Does nothing if Init() has not succeeded.
 */
void PcmConcealer::Destroy()
{
    if (!Initialized)
        return;

    for (size_t k = 0; k < ChannelStates.size(); k++)
    {
        plc_state_t *s = ChannelStates[k];
        if (!s)
            continue;
        // Both buffers come from new[], so they must be released with delete[];
        // deleting a null pointer is a no-op, so no extra checks are needed.
        delete [] s->history;
        delete [] s->pitchbuf;
        delete s;
    }
    ChannelStates.clear();
    Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
//Fill-in a block of missing audio samples.
//`amp` receives `frames` interleaved frames of synthetic audio per channel.
//On the first call of a gap the pitch of the recent history is estimated and
//one pitch cycle is stored; that cycle is then repeated with a linearly
//decaying gain, followed by silence once the gain reaches zero.
//Returns `frames`, or 0 when the concealer is not initialised.
int PcmConcealer::Fill(short amp[], int frames)
{
    if (!Initialized)
        return 0;

    for (size_t k = 0; k < ChannelStates.size(); k++)
    {
        const int channel = (int)k;
        plc_state_t *s = ChannelStates.at(k);
        int i;
        float gain;

        if (s->missing_samples == 0)
        {
            // As the gap in real speech starts we need to assess the last known pitch,
            // and prepare the synthetic data we will use for fill-in
            normalise_history(s);
            s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + (plc_history_len - correlation_span - plc_pitch_min), correlation_span);
            // We overlap a 1/4 wavelength
            const int pitch_overlap = s->pitch >> 2;

            // Cook up a single cycle of pitch, using a single cycle of the real signal
            // with 1/4 cycle OLA'ed to make the ends join up nicely
            // The first 3/4 of the cycle is a simple copy
            for (i = 0; i < s->pitch - pitch_overlap; i++)
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];

            // The last 1/4 of the cycle is overlapped with the end of the previous cycle
            float new_step = 1.0/pitch_overlap;
            float new_weight = new_step;
            for ( ; i < s->pitch; i++)
            {
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
                new_weight += new_step;
            }

            // We should now be ready to fill in the gap with repeated, decaying cycles
            // of what is in pitchbuf
            // We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
            // it into the previous real data. To avoid the need to introduce a delay
            // in the stream, reverse the last 1/4 wavelength, and OLA with that.
            gain = 1.0;
            new_step = 1.0/pitch_overlap;
            const float old_step = new_step;
            new_weight = new_step;
            float old_weight = 1.0 - new_step;
            for (i = 0; (i < pitch_overlap) && (i < frames); i++)
            {
                const int index = (i * channel_count) + channel;
                amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
                new_weight += new_step;
                old_weight -= old_step;
                if (old_weight < 0.0)
                    old_weight = 0.0;
            }
            s->pitch_offset = i;
        }
        else
        {
            // Continue the fade from where the previous fill-in block left off.
            gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
            i = 0;
        }

        // Repeat the stored cycle while it is still audible.
        for ( ; gain > 0.0 && i < frames; i++)
        {
            const int index = (i * channel_count) + channel;
            amp[index] = s->pitchbuf[s->pitch_offset]*gain;
            gain -= ATTENUATION_INCREMENT;
            if (++s->pitch_offset >= s->pitch)
                s->pitch_offset = 0;
        }

        // Whatever remains of this channel is silence. The interleaved index
        // must be used here too, otherwise other channels' samples are cleared.
        for ( ; i < frames; i++)
        {
            const int index = (i * channel_count) + channel;
            amp[index] = 0;
        }

        s->missing_samples += frames;
        save_history(s, amp, channel, frames);
    }
    return frames;
}
/*
 * Append the samples of one channel from the interleaved block `buf` to that
 * channel's circular history buffer.
 *
 * s             - state of the channel being updated
 * buf           - interleaved samples, `frames` frames long
 * channel_index - position of the channel inside each frame
 * frames        - number of frames in `buf`
 */
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
    const int capacity = plc_history_len;

    if (frames >= capacity)
    {
        /* The block alone fills the history: keep only its newest part,
           stored from the beginning of the buffer. */
        const int first = frames - capacity;
        for (int n = 0; n < capacity; n++)
            s->history[n] = buf[(channel_count * (first + n)) + channel_index];
        s->buf_ptr = 0;
        return;
    }

    if (s->buf_ptr + frames <= capacity)
    {
        /* Everything fits behind the write position. */
        for (int n = 0; n < frames; n++)
            s->history[s->buf_ptr + n] = buf[(channel_count * n) + channel_index];
        s->buf_ptr += frames;
        return;
    }

    /* The block wraps: fill the space up to the end of the buffer, then
       continue from the beginning. */
    const int room = capacity - s->buf_ptr;
    for (int n = 0; n < room; n++)
        s->history[s->buf_ptr + n] = buf[(channel_count * n) + channel_index];

    const int remainder = frames - room;
    for (int n = 0; n < remainder; n++)
        s->history[n] = buf[(channel_count * (room + n)) + channel_index];
    s->buf_ptr = remainder;
}
/*
 * Rotate the circular history buffer so that the oldest sample sits at index
 * 0 and the newest at the end, then reset the write position to 0.
 */
void PcmConcealer::normalise_history(plc_state_t *s)
{
    if (s->buf_ptr == 0)
        return;

    const int head = s->buf_ptr;               // newest samples, stored at the front
    const int tail = plc_history_len - head;   // oldest samples, stored behind them

    // The scratch copy is created only after the early return above and is
    // released automatically, so no path can leak it.
    std::vector<short> tmp(s->history, s->history + head);

    // Source and destination overlap whenever tail > head, so memmove is required.
    memmove(s->history, s->history + head, sizeof(short)*tail);
    memcpy(s->history + tail, &tmp[0], sizeof(short)*head);
    s->buf_ptr = 0;
}
/*
 * Estimate the pitch period of `amp` by finding the lag with the smallest
 * summed absolute difference between the signal and a delayed copy of itself.
 *
 * min_pitch - largest lag tried (also the result if no lag is tried)
 * max_pitch - smallest lag tried
 * amp       - contiguous samples; min_pitch + frames of them are read
 * frames    - number of samples compared for each lag
 *
 * Returns the best lag, in samples; the first of several equal lags wins.
 */
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames)
{
    int best_lag = min_pitch;
    int best_score = INT_MAX;

    for (int lag = max_pitch; lag <= min_pitch; ++lag)
    {
        const short *delayed = amp + lag;
        int score = 0;
        for (int n = 0; n < frames; ++n)
            score += abs(delayed[n] - amp[n]);

        if (score < best_score)
        {
            best_score = score;
            best_lag = lag;
        }
    }
    return best_lag;
}
}