MPI Gather Corrupting Arrays - c++

I have written an MPI code in C++ for my Raspberry Pi cluster, which generates an image of the Mandelbrot Set. What happens is on each node (excluding the master, processor 0) part of the Mandelbrot Set is calculated, resulting in each node having a 2D array of ints that indicates whether each xy point is in the set.
It appears to work well on each node individually, but when all the arrays are gathered to the master using this command:
MPI_Gather(&inside, 1, MPI_INT, insideFull, 1, MPI_INT, 0, MPI_COMM_WORLD);
it corrupts the data, and the result is an array full of garbage.
(inside is the nodes' 2D arrays of part of the set. insideFull is also a 2D array but it holds the whole set)
Why would it be doing this?
(This led me to wonder whether it is corrupting the data because the master isn't sending its array to itself (or at least I don't want it to). So part of my question is also: is there an MPI_Gather variant that doesn't send anything from the root process and just collects from everything else?)
Thanks
EDIT: here's the whole code. If anyone can suggest better ways of how I'm transferring the arrays, please say.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
// ONLY USE MULTIPLES OF THE NUMBER OF SLAVE PROCESSORS
#define ImageHeight 128
#define ImageWidth 128
double MinRe = -1.9;
double MaxRe = 0.5;
double MinIm = -1.2;
double MaxIm = MinIm + (MaxRe - MinRe)*ImageHeight / ImageWidth;
double Re_factor = (MaxRe - MinRe) / (ImageWidth - 1);
double Im_factor = (MaxIm - MinIm) / (ImageHeight - 1);
unsigned n;
unsigned MaxIterations = 50;
int red;
int green;
int blue;
// MPI variables ****
int processorNumber;
int processorRank;
//*******************//
int main(int argc, char** argv) {
// Initialise MPI
MPI_Init(NULL, NULL);
// Get the number of procesors
MPI_Comm_size(MPI_COMM_WORLD, &processorNumber);
// Get the rank of this processor
MPI_Comm_rank(MPI_COMM_WORLD, &processorRank);
// Get the name of this processor
char processorName[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processorName, &name_len);
// A barrier just to sync all the processors, make timing more accurate
MPI_Barrier(MPI_COMM_WORLD);
// Make an array that stores whether each point is in the Mandelbrot Set
int inside[ImageWidth / processorNumber][ImageHeight / processorNumber];
if(processorRank == 0) {
printf("Generating Mandelbrot Set\n");
}
// We don't want the master to process the Mandelbrot Set, only the slaves
if(processorRank != 0) {
// Determine which coordinates to test on each processor
int xMin = (ImageWidth / (processorNumber - 1)) * (processorRank - 1);
int xMax = ((ImageWidth / (processorNumber - 1)) * (processorRank - 1)) - 1;
int yMin = (ImageHeight / (processorNumber - 1)) * (processorRank - 1);
int yMax = ((ImageHeight / (processorNumber - 1)) * (processorRank - 1)) - 1;
// Check each value to see if it's in the Mandelbrot Set
for (int y = yMin; y <= yMax; y++) {
double c_im = MaxIm - y *Im_factor;
for (int x = xMin; x <= xMax; x++) {
double c_re = MinRe + x*Re_factor;
double Z_re = c_re, Z_im = c_im;
int isInside = 1;
for (n = 0; n <= MaxIterations; ++n) {
double Z_re2 = Z_re * Z_re, Z_im2 = Z_im * Z_im;
if (Z_re2 + Z_im2 > 10) {
isInside = 0;
break;
}
Z_im = 2 * Z_re * Z_im + c_im;
Z_re = Z_re2 - Z_im2 + c_re;
}
if (isInside == 1) {
inside[x][y] = 1;
}
else{
inside[x][y] = 0;
}
}
}
}
// Wait for all processors to finish computing
MPI_Barrier(MPI_COMM_WORLD);
int insideFull[ImageWidth][ImageHeight];
if(processorRank == 0) {
printf("Sending parts of set to master\n");
}
// Send all the arrays to the master
MPI_Gather(&inside[0][0], 1, MPI_INT, &insideFull[0][0], 1, MPI_INT, 0, MPI_COMM_WORLD);
// Output the data to an image
if(processorRank == 0) {
printf("Generating image\n");
FILE * image = fopen("mandelbrot_set.ppm", "wb");
fprintf(image, "P6 %d %d 255\n", ImageHeight, ImageWidth);
for(int y = 0; y < ImageHeight; y++) {
for(int x = 0; x < ImageWidth; x++) {
if(insideFull[x][y]) {
putc(0, image);
putc(0, image);
putc(255, image);
}
else {
putc(0, image);
putc(0, image);
putc(0, image);
}
// Just to see what values return, no actual purpose
printf("%d, %d, %d\n", x, y, insideFull[x][y]);
}
}
fclose(image);
printf("Complete\n");
}
MPI_Barrier(MPI_COMM_WORLD);
// Finalise MPI
MPI_Finalize();
}

You call MPI_Gather with the following parameters:
const void* sendbuf : &inside[0][0] Starting address of send buffer
int sendcount : 1 Number of elements in send buffer
const MPI::Datatype& sendtype : MPI_INT Datatype of send buffer elements
void* recvbuf : &insideFull[0][0]
int recvcount : 1 Number of elements for any single receive
const MPI::Datatype& recvtype : MPI_INT Datatype of recvbuffer elements
int root : 0 Rank of receiving process
MPI_Comm comm : MPI_COMM_WORLD Communicator (handle).
Sending/receiving only one element is not sufficient. Instead of 1 use
(ImageWidth / processorNumber)*(ImageHeight / processorNumber)
Then think about the different memory layout of your source and target 2D arrays:
int inside[ImageWidth / processorNumber][ImageHeight / processorNumber];
vs.
int insideFull[ImageWidth][ImageHeight];
As the copy is a memory block copy, and not an intelligent 2D array copy, all your source integers will be transferred contiguously to the target address, regardless of the different row lengths of the two arrays.
I'd recommend sending the data first into an array of the same size as the source and then, in the receiving process, copying the elements to the right rows and columns of the full array, for example with a small function like:
// assemble2D():
// Copies a source matrix int sarr[sli][sco] into a destination matrix
// int darr[dli][dco], so that sarr[0][0] lands on darr[doffli][doffco].
// Both matrices are addressed as flat row-major buffers, i.e. element
// [r][c] lives at index r * <number of columns> + c.
//   darr, dli, dco : destination buffer, its number of rows and columns
//   sarr, sli, sco : source buffer, its number of rows and columns
//   doffli, doffco : row / column offset inside the destination
// Elements that would fall outside the destination are ignored, so negative
// offsets are allowed.
void assemble2D(int*darr, int dli, int dco, int*sarr, int sli, int sco, int doffli=0, int doffco=0)
{
    for (int i = 0; i < sli; i++) {
        const int row = i + doffli;
        if (row < 0 || row >= dli)
            continue;
        for (int j = 0; j < sco; j++) {
            const int col = j + doffco;
            if (col < 0 || col >= dco)
                continue;
            // The row stride is the number of COLUMNS of each matrix.
            darr[row * dco + col] = sarr[i * sco + j];
        }
    }
}

Related

Visualizing/saving an extremely large number of pixels with

I made a program in C++ which calculates the mandelbrot-set. Now I want to visualize it (save it in a picture). But when I try to save a 64k picture some problems come up. So what is the best way to save a picture of the pixels or at least to visual it?
Edit:
When I want to create a for Example 64K (61440 * 34560) image there will be the error "Access violation while writing at the position 0x0..." (originally on German and translated) and the program stops. This error appears with very high resolution. On lower resolutions the program works as it is supposed to.
#include <SFML\Graphics.hpp>
#include <stdlib.h>
#include <complex>
#include <cmath>
#include <thread>
//4K : 3840 * 2160
//8K : 7680 * 4320
//16K: 15360 * 8640
//32K: 30720 * 17280
//64K: 61440 * 34560
//128K:122880 * 69120
const unsigned long width = 61440; //should be dividable by ratioX & numberOfThreads!
const unsigned long height = 34560; //should be dividable by ratioY & numberOfThreads!
const unsigned int maxIterations = 500;
const unsigned int numberOfThreads = 6;
const int maxWidth = width / 3;
const int maxHeight = height / 2;
const int minWidth = -maxWidth * 2;
const int minHeight = -maxHeight;
const double ratioX = 3.0 / width;
const double ratioY = 2.0 / height;
sf::Image img = sf::Image();
// Escape-time test for the iteration z -> z^2 + c, where c is the initial
// value of z.
//   z            : starting point (also used as the constant c)
//   noIterations : iteration bound; iterations 1 .. noIterations-1 are run
// Returns the iteration index (starting at 1) at which |z| first exceeds 2.
// Returns 0 when that never happens within the bound, or when the iteration
// reaches an exact fixed point.
int getsGreaterThan2(std::complex<double> z, int noIterations) {
    const std::complex<double> c = z;
    // `<` rather than `!=`: a bound below 1 must mean "no iterations", not a
    // loop that only stops on escape or integer overflow.
    for (int i = 1; i < noIterations; i++) {
        const std::complex<double> next = std::pow(z, 2) + c;
        if (next == z) {
            // z no longer changes, so it can never escape
            return 0;
        }
        z = next;
        const double magnitude = std::sqrt(std::pow(z.real(), 2) + std::pow(z.imag(), 2));
        if (magnitude > 2) {
            return i;
        }
    }
    return 0;
}
// Worker body: colours one vertical strip of the global image `img`.
//   noThreads : total number of strips the image width is divided into
//   threadNr  : index of this strip, starting from 0
// Every pixel of the strip is passed to getsGreaterThan2(); a result of 0 is
// drawn black, any other result is scaled by maxIterations into a colour.
void fillPixelArrayThreadFunc(int noThreads, int threadNr) { //threadNr ... starts from 0
    const long firstColumn = ((double)width) / noThreads * threadNr + minWidth;
    const long lastColumn = firstColumn + width / noThreads;

    for (long column = firstColumn; column < lastColumn; column++) {
        const double scaledX = column * ratioX;
        const long pixelX = column - minWidth;   // shift into 0-based pixel coordinates

        for (long row = minHeight; row < maxHeight; row++) {
            const double scaledY = row * ratioY;
            const long pixelY = row - minHeight;

            const double escape = getsGreaterThan2(std::complex<double>(scaledX, scaledY), maxIterations);
            if (escape != 0) {
                const double shade = escape / maxIterations;
                img.setPixel(pixelX, pixelY, sf::Color(shade * 128, shade * 128, shade * 255, 255));
                continue;
            }
            img.setPixel(pixelX, pixelY, sf::Color(0, 0, 0, 255));
        }
    }
}
int main() {
img.create(width, height, sf::Color::Black);
std::thread *threads = new std::thread[numberOfThreads];
for (int i = 0; i < numberOfThreads; i++) {
threads[i] = std::thread(std::bind(fillPixelArrayThreadFunc, numberOfThreads, i));
}
for (int i = 0; i < numberOfThreads; i++) {
threads[i].join();
}
img.saveToFile("filename.png");
return 1;
}
Your program fails during the call img.create(width, height, sf::Color::Black);.
When you step into the sf::Image::create function you end up here where the newPixels vector is created, this simply fails when width * height is too big as in your case:
////////////////////////////////////////////////////////////
void Image::create(unsigned int width, unsigned int height, const Color& color)
{
if (width && height)
{
// Create a new pixel buffer first for exception safety's sake
std::vector<Uint8> newPixels(width * height * 4);
^61440* ^34560 = 8'493'465'600 bytes !!
Conclusion: SFML cannot handle huge images.

How to MPI_Gather a 2d array of structs with C++?

I am trying to render a fractal calculated using MPI. I used the answer to the following question as reference: sending blocks of 2D array in C using MPI
My problem is, that merge of data via MPI_Gatherv calculated by all the processes does not seem to work properly, because my main process always renders a black screen.
I have the following struct defined:
typedef struct Point {
float r,g,b,x,y;
} Point;
In my main I try to create an MPI_Datatype for the struct:
MPI_Datatype struct_type;
MPI_Datatype struct_members[1] = {MPI_FLOAT};
MPI_Aint offsets[1] = {0};
int struct_blengths[1] = {5};
int struct_items = 1;
MPI_Type_create_struct(struct_items, struct_blengths, offsets, struct_members, &struct_type);
MPI_Type_commit(&struct_type);
I have a global variable for the calculation result:
Point **mandelbrot;
The variable is allocated thusly before each frame is being recalculated:
if (proc_id == root) {
//Just a check if this is the first frame that is being rendered
if (s > 0) {
free(&(mandelbrot[0][0]));
free(mandelbrot);
}
s = W;
Point *p = (Point *) malloc(W * H * sizeof(Point));
mandelbrot = (Point **) malloc(W*sizeof(Point *));
for (int i = 0; i < W; i++) {
mandelbrot[i] = &(p[i*H]);
}
}
Here I try to create an array subtype using the Point struct (following the referenced answer as best I can):
//Width of the fractal to render
W = width;
//Height of the fractal
H = height;
//Chunk of width each process is responsible for [width / number of processes]
int segmentSize = (int) W / ntasks;
MPI_Datatype type, resizedtype;
int sizes[2] = {W,H}; /* size of global array */
int subsizes[2] = {segmentSize, H}; /* size of sub-region */
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, struct_type, &type);
MPI_Type_create_resized(type, 0, H*sizeof(Point), &resizedtype);
MPI_Type_commit(&resizedtype);
Calculate the displacements and counts of blocks to send and allocate memory for the process' subarray:
int sendcounts[segmentSize*H];
int displs[segmentSize*H];
if (proc_id == root) {
for (int i=0; i<segmentSize*H; i++) sendcounts[i] = 1;
int disp = 0;
for (int i=0; i<segmentSize; i++) {
for (int j=0; j<H; j++) {
displs[i*H+j] = disp;
disp += 1;
}
disp += ((W/segmentSize)-1)*H;
}
}
Point *p = (Point *) malloc(segmentSize * H * sizeof(Point));
Point **segment;
segment = (Point **) malloc(segmentSize * sizeof(Point*));
for (int i = 0; i < segmentSize; i++) {
segment[i] = &(p[i*H]);
}
Following that I calculate the color of the Mandelbrot set for each point in the chunk:
int i;
float c[3], dX, dY;
for ( x = 0; x < segmentSize; x++) {
for ( y = 0; y < H; y++) {
//Iterate over the point
i = iterateMandelbrot(rM + x * dR, iM - y * dI);
// Get decimal coordinates for rendering <0,1>
dX = (x + segmentSize * proc_id) / W;
dY = y / H;
//Calculate color using Bernoulli Polynomials
makeColor(i, maxIterations, c);
segment[x][y].x = (float) dX;
segment[x][y].y = (float) dY;
segment[x][y].r = (float) c[0];
segment[x][y].g = (float) c[1];
segment[x][y].b = (float) c[2];
}
}
Lastly I try to gather the chunks into the mandelbort variable for the root process to render:
// Number of Point elements held by one process (segmentSize x H).
int buffsize = (int) segmentSize * H;
// NOTE(review): the send count W*H/buffsize reduces to W/segmentSize, not to
// the buffsize elements stored in `segment` - confirm this is intended.
// NOTE(review): MPI_Gatherv reads one receive count and one displacement per
// rank of the communicator; sendcounts/displs are filled per element where
// they are set up - verify against the MPI_Gatherv specification.
MPI_Gatherv(&(segment[0][0]), W*H/(buffsize), struct_type,
&(mandelbrot[0][0]), sendcounts, displs, resizedtype,
root, MPI_COMM_WORLD);
// Release the derived datatype once the gather has been issued.
MPI_Type_free(&resizedtype);
Ok so the problem is now that no data seems to be written into the mandelbrot variable as my main process renders a black screen. Without using MPI the code works so the problem lies somewhere in the MPI_Gatherv call or maybe the way I am allocating the arrays. I realize there might be some memory leak associated with the mandelbrot set or the local segment arrays but that is not my main concern at the moment. Can you see what I am doing wrong here? Any help is appreciated!

Performant Way to create checkerboard pattern

So I have an image that I want to overlay with a checkerboard pattern.
This is what I have come up with so far:
for ( uint_8 nRow = 0; nRow < image.width(); ++nRow)
for (uint_8 nCol = 0; nCol < image.height(); ++nCol)
if(((nRow/20 + nCol/20) % 2) == 0)
memset(&image.data[nCol + nRow], 0, 1);
Produces a white image unfortunately. I dont think this is very performant because memset is called for every single pixel in the image instead of multiple.
Why does this code not produce a checkerboard pattern? How would you improve it?
For better performance, don't treat the image as a 2-dimensional entity. Instead, look at it as a 1D array of continuous data, where all lines of the image are arranged one after the other.
With this approach, you can write the pattern in one go with a single loop, where in every iteration you memset() multiple adjacent pixels and increase the index by twice the amount of pixels you set:
// Total number of pixels, treating the image as one contiguous 1-D buffer.
int data_size = image.width() * image.height();
// Each pass zeroes 20 bytes; the loop header then advances `it` by 20, and the
// branches below may advance it further.
// NOTE(review): the final memset can reach past the buffer end when data_size
// is not a multiple of 20 - confirm the image dimensions guarantee this.
for (auto it = image.data; it < image.data + data_size; it += 20) {
memset(it, 0, 20);
// NOTE(review): `data` is not declared in this snippet - presumably it means
// image.data; confirm before use.
// NOTE(review): 20 * 400 looks like square size times a hard-coded quantity
// tied to the image layout - verify against the real image width.
if (((it - data) + 40) % (20 * 400) == 0) {
it += 40;
} else if (((it - data) + 20) % (20 * 400) != 0) {
it += 20;
}
}
(Replace auto with the type of image.data if you're not using C++11; I suspect it's unsigned char*.)
This is quite friendly for the CPU cache prefetch. It's also friendly for the compiler, which can potentially vectorize and/or perform loop unrolling.
If you have an image's dimensions which are multiple of the checker square size :
(I coded in C but it is fairly easy to transpose to C++)
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define uint unsigned int
#define WIDTH 40
#define HEIGHT 40
#define BLOCK_SIZE 5
/*
 * Zeroes every other block of pixels in one image row.
 *   row        : pointer to the first pixel of the row
 *   size_block : width of one checker square, in pixels
 *   nb_col     : number of pixels in the row
 *   offset     : index (in blocks) of the first block to clear; 0 clears
 *                blocks 0, 2, 4, ..., 1 clears blocks 1, 3, 5, ...
 * The last block is clipped to the row, so nb_col does not have to be a
 * multiple of size_block. A size_block of 0 clears nothing.
 */
void create_checker_row(unsigned int* row, unsigned int size_block, unsigned int nb_col, unsigned int offset )
{
    unsigned int ic;
    if (size_block == 0)
        return; /* the step would be 0 and the loop would never advance */
    for (ic = size_block*offset ; ic < nb_col; ic+= 2*size_block )
    {
        /* never write past the end of the row */
        unsigned int len = nb_col - ic;
        if (len > size_block)
            len = size_block;
        memset( (row + ic) , 0, len*sizeof(unsigned int) );
    }
}
/*
 * Demo driver: fills a WIDTH x HEIGHT image with arbitrary values, prints it,
 * applies the checker pattern row by row with create_checker_row(), and
 * prints the result again for visual validation.
 * Pixels are stored row-major: pixel (row ir, column ic) is at ir*WIDTH + ic.
 * Returns 0 on success, 1 if the image buffer cannot be allocated.
 */
int main()
{
    uint ir,ic;
    // image creation
    uint* pixels = (uint*) malloc(WIDTH*HEIGHT*sizeof(uint));
    if (pixels == NULL)
    {
        fprintf(stderr, "Could not allocate the image buffer\n");
        return 1;
    }
    /* rows run over HEIGHT, columns over WIDTH, matching the ir*WIDTH stride */
    for (ir = 0; ir < HEIGHT; ir++)
    {
        for ( ic = 0; ic < WIDTH; ic++)
        {
            // arbitrary numbers
            pixels[ir*WIDTH + ic] = (ir*WIDTH + ic) % 57 ;
            printf("%u,", pixels[ir*WIDTH + ic] );
        }
        printf("\n");
    }
    for (ir = 0; ir < HEIGHT; ir++)
    {
        create_checker_row( pixels + ir*WIDTH , // pointer at the beginning of n-th row
        BLOCK_SIZE , // horizontal length for square
        WIDTH , // image width
        (ir/BLOCK_SIZE) % 2 // offset to create the checker pattern
        );
    }
    // validation
    printf("\n");
    printf("Validation \n");
    printf("\n");
    for (ir = 0; ir < HEIGHT; ir++)
    {
        for ( ic = 0; ic < WIDTH; ic++)
        {
            printf("%u,", pixels[ir*WIDTH + ic] );
        }
        printf("\n");
    }
    free(pixels);
    return 0;
}
Seems pretty checkered for me : http://ideone.com/gp9so6
I use this and stb_image_write.h
#include <stdlib.h>
#include <stb_image_write.h>
/*
 * Writes "checker.png": a w x h RGBA image split into box_sz-wide squares
 * that alternate between the background (red) and foreground (yellow)
 * colours.
 */
int main(int argc, char *argv[])
{
    const int w = 256, h = 256, ch = 4, segments = 8, box_sz = w / segments;
    unsigned char rgba_fg[4] = {255, 255, 0, 255}; //yellow
    unsigned char rgba_bg[4] = {255, 0, 0, 255}; //red
    unsigned char* data = calloc(w * h * ch, sizeof(unsigned char));
    int px;
    int comp;

    for(px = 0; px < w * h; px++)
    {
        /* parity of the number of box_sz-pixel runs that start at or before
           this pixel (pixels are counted linearly across rows) */
        const int run_parity = (px / box_sz) % 2;
        /* parity of the number of box_sz-row bands that start at or before
           this pixel */
        const int band_parity = (px / (w * box_sz)) % 2;
        /* foreground exactly when the two parities disagree */
        const unsigned char* colour = (run_parity != band_parity) ? rgba_fg : rgba_bg;

        for(comp = 0; comp < ch; comp++)
        {
            data[px*ch + comp] = colour[comp];
        }
    }
    stbi_write_png("checker.png", w, h, ch, data, 0);
    free(data);
    return 0;
}
Its a bit slow with large images but gets the job done if you cache them

C/CUDA Program Output

The following is a CUDA programming example, which is basically C but with NVIDIA CUDA functions within. I've been trying to interpret this code example and figure out what it is trying to do. My question is this: the program compiles just fine, but what arguments does it take? For example, this CUDA program is being run in a Linux emulator; however, upon running ./program it returns:
Usage: ./program number
Segmentation fault
What are the programs input arguments. Thank you.
#include <assert.h>
#include <stdio.h>
//#define N 100000
// Host reference SAXPY: y[i] = alpha * x[i] + y[i] for the first `length`
// elements. y is updated in place; a length of 0 or less changes nothing.
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
    for (int remaining = length; remaining > 0; --remaining, ++x, ++y)
        *y = alpha * (*x) + *y;
}
// Device SAXPY kernel: each thread updates at most one element,
// y[i] = alpha * x[i] + y[i], where i is the thread's global index.
// Threads whose index is not below `length` do nothing.
// No barrier is needed: threads never read data written by other threads.
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < length) y[i] = alpha*x[i]+y[i];
}
// Runs SAXPY on N random elements on the GPU and on the host, and asserts
// that both give the same result.
// Usage: program number   (number = vector length N, must be positive)
// Returns 0 on success and -1 on a usage, allocation or CUDA error.
int main(int argc, char* argv[]) {
    if (argc != 2) {
        printf("Usage: %s number\n", argv[0]);
        return -1;
    }
    const int N = atoi(argv[1]);
    if (N <= 0) {
        // atoi yields 0 for non-numeric input; a non-positive length is unusable
        printf("Usage: %s number\n", argv[0]);
        return -1;
    }
    const float alpha = 0.5;
    // determining size
    const size_t size = sizeof(float) * (size_t) N;

    // host data lives on the heap: N comes from the command line and may be
    // far too large for stack arrays
    float * x = (float *) malloc(size);
    float * y = (float *) malloc(size);
    float * xback = (float *) malloc(size);
    float * yback = (float *) malloc(size);
    if (x == NULL || y == NULL || xback == NULL || yback == NULL) {
        printf("Out of host memory\n");
        free(x); free(y); free(xback); free(yback);
        return -1;
    }
    // fill host data
    for (int i = 0; i < N; i++) {
        x[i] = (float) (rand () % 128);
        y[i] = (float) (rand () % 256);
    }

    // device data
    float * dxp = NULL;
    float * dyp = NULL;
    // Allocating and Moving data to device; stop at the first failing call
    cudaError_t status = cudaMalloc((void**) &dxp, size);
    if (status == cudaSuccess) status = cudaMalloc((void**) &dyp, size);
    if (status == cudaSuccess) status = cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        // 32 threads per block, rounded up so that every element is covered
        const int blocks = (N + 31)/32;
        saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
        status = cudaGetLastError();
    }
    // bring back data
    if (status == cudaSuccess) status = cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
    if (status == cudaSuccess) status = cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);

    int result = 0;
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        result = -1;
    }
    else {
        // Calculating host SAXPY
        saxpy_host (N, alpha, x, y);
        // checking computation on host matches computation on GPU
        for (int i = 0; i < N; i++) {
            assert (yback[i] == y[i]) ;
            //printf ("%i %f %f \n", i, yback[i], y[i]);
        }
    }

    // free device and host data
    cudaFree(dxp); cudaFree(dyp);
    free(x); free(y); free(xback); free(yback);
    return result;
}
int N = atoi(argv[1]);
The program takes a single integer as a command line argument. (Try calling it as ./program 5, for example.)
It then calculates a SAXPY (An old term originating from early BLAS implementations, but it stuck. It means "single (precision, aka float) real alpha x plus y".) with vectors of dimension N.

Conceal packet loss in PCM stream

I am looking to use 'Packet Loss Concealment' to conceal lost PCM frames in an audio stream. Unfortunately, I cannot find a library that is accessible without all the licensing restrictions and code bloat (...up for some suggestions though).
I have located some GPL code written by Steve Underwood for the Asterisk project which implements PLC. There are several limitations; although, as Steve suggests in his code, his algorithm can be applied to different streams with a bit of work. Currently, the code works with 8kHz 16-bit signed mono streams.
Variations of the code can be found through a simple search of Google Code Search.
My hope is that I can adapt the code to work with other streams. Initially, the goal is to adjust the algorithm for 8+ kHz, 16-bit signed, multichannel audio (all in a C++ environment). Eventually, I'm looking to make the code available under the GPL license in hopes that it could be of benefit to others...
Attached below is the code with my efforts. The code includes a main function that will "drop" a number of frames with a given probability. Unfortunately, the code does not quite work as expected. I'm receiving EXC_BAD_ACCESS when running in gdb, but I don't get a trace from gdb when using the 'bt' command. Clearly, I'm trampling on memory somewhere, but I'm not sure exactly where. When I comment out the amdf_pitch function, the code runs without crashing...
// Test driver: reads 16-bit mono PCM samples from the input file, "drops"
// each sample with a fixed probability, and writes two files:
//   - the lossy stream, where dropped samples are replaced by zeros;
//   - the repaired stream, where dropped samples are filled in by the
//     concealer and received samples are written after the concealer has had
//     the chance to blend them.
// Returns 0 on success and 1 when a file cannot be opened.
int main (int argc, char *argv[])
{
    // Raw PCM must be opened in binary mode, otherwise the stream may
    // translate bytes on platforms that distinguish text files.
    std::ifstream fin("C:\\cc32kHz.pcm", std::ios::in | std::ios::binary);
    if(!fin.is_open())
    {
        std::cout << "Failed to open input file" << std::endl;
        return 1;
    }
    std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm", std::ios::out | std::ios::binary);
    if(!fout_repaired.is_open())
    {
        std::cout << "Failed to open output repaired file" << std::endl;
        return 1;
    }
    std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm", std::ios::out | std::ios::binary);
    if(!fout_lossy.is_open())
    {
        std::cout << "Failed to open output lossy file" << std::endl;
        return 1;
    }
    audio::PcmConcealer Concealer;
    Concealer.Init(1, 16, 32000);
    //Generate random numbers;
    srand( time(NULL) );
    const int probability = 5;
    // A properly typed sample avoids casting a char buffer to int16_t*.
    int16_t sample = 0;
    // Loop on the read itself: testing eof() before reading would process one
    // stale sample after the last successful read.
    while(fin.read(reinterpret_cast<char *>(&sample), sizeof(sample)))
    {
        //Generates a random number in 1..100;
        const int value = rand() % 100 + 1;
        if(value <= probability)
        {
            int16_t blank = 0;
            fout_lossy.write(reinterpret_cast<const char *>(&blank), sizeof(blank));
            //Fill in data;
            Concealer.Fill(&blank, 1);
            fout_repaired.write(reinterpret_cast<const char *>(&blank), sizeof(blank));
        }
        else
        {
            // The lossy stream gets the untouched sample ...
            fout_lossy.write(reinterpret_cast<const char *>(&sample), sizeof(sample));
            // ... while the repaired stream gets it after Receive(), which
            // may modify the sample following a period of loss.
            Concealer.Receive(&sample, 1);
            fout_repaired.write(reinterpret_cast<const char *>(&sample), sizeof(sample));
        }
    }
    fin.close();
    fout_repaired.close();
    fout_lossy.close();
    return 0;
}
PcmConcealer.hpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average mean difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of synthetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real and synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a synthetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz) */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz) */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms) */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at least 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment. */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
    /*! Per-channel concealment state. */
    typedef struct
    {
        /*! Consecutive erased samples */
        int missing_samples;
        /*! Current offset into pitch period */
        int pitch_offset;
        /*! Pitch estimate */
        int pitch;
        /*! Buffer for a cycle of speech */
        float *pitchbuf;//[PLC_PITCH_MIN];
        /*! History buffer */
        short *history;//[PLC_HISTORY_LEN];
        /*! Current pointer into the history buffer */
        int buf_ptr;
    } plc_state_t;

    /**
     * Packet loss concealer for interleaved 16-bit PCM.
     *
     * Call Init() once, then Receive() for every block that arrived and
     * Fill() for every block that is missing. The object owns one
     * heap-allocated state per channel and releases them in Destroy() /
     * the destructor, so it must not be copied.
     */
    class PcmConcealer
    {
    public:
        PcmConcealer();
        ~PcmConcealer();
        /** Sets up the concealer for the given channel count, bit depth and sample rate. */
        void Init(int channels, int bit_depth, int sample_rate);
        //Process a block of received audio samples.
        int Receive(short amp[], int frames);
        //Fill-in a block of missing audio samples.
        int Fill(short amp[], int frames);
        /** Releases the per-channel states. */
        void Destroy();
    private:
        // Non-copyable: a copy would share the raw plc_state_t pointers and
        // both objects would delete them. Declared, never defined.
        PcmConcealer(const PcmConcealer &);
        PcmConcealer &operator=(const PcmConcealer &);

        int amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames);
        void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
        void normalise_history(plc_state_t *s);
        /** Holds the states of each of the channels **/
        std::vector< plc_state_t * > ChannelStates;
        int plc_pitch_min;
        int plc_pitch_max;
        int plc_pitch_overlap_max;
        int correlation_span;
        int plc_history_len;
        int channel_count;
        int sample_rate;
        bool Initialized;
    };
}
#endif
PcmConcealer.cpp
/*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume in 50ms when we are filling in for missing data. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
#if !defined(INT16_MAX)
#define INT16_MAX (32767)
#define INT16_MIN (-32767-1)
#endif
#ifdef WIN32
// Replacement for rint(), compiled only when WIN32 is defined - presumably
// for toolchains whose math library does not provide it (confirm).
// Rounds by adding 0.5 and taking floor(), so exact halves always go towards
// positive infinity (e.g. -2.5 becomes -2).
inline double rint(double x)
{
return floor(x + 0.5);
}
#endif
// Converts a floating-point sample to a 16-bit sample.
// Values above 32767.0 give INT16_MAX, values below -32768.0 give INT16_MIN,
// anything in between is rounded with rint().
inline short fsaturate(double damp)
{
    const bool tooHigh = damp > 32767.0;
    const bool tooLow = damp < -32768.0;
    return tooHigh ? INT16_MAX
                   : (tooLow ? INT16_MIN : (short)rint(damp));
}
namespace audio
{
// Creates an uninitialised concealer; Init() must be called before use.
// All scalar members start at 0 so that no member holds an indeterminate
// value before Init() runs.
PcmConcealer::PcmConcealer()
    : plc_pitch_min(0),
      plc_pitch_max(0),
      plc_pitch_overlap_max(0),
      correlation_span(0),
      plc_history_len(0),
      channel_count(0),
      sample_rate(0),
      Initialized(false)
{
}
// Releases the per-channel states through Destroy().
PcmConcealer::~PcmConcealer()
{
    Destroy();
}
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
if(Initialized)
return;
if(channels <= 0 || bit_depth != 16)
return;
Initialized = true;
channel_count = channels;
this->sample_rate = sample_rate;
//////////////
double min = PLC_PITCH_MIN(sample_rate);
int imin = (int)min;
double max = PLC_PITCH_MAX(sample_rate);
int imax = (int)max;
plc_pitch_min = imin;
plc_pitch_max = imax;
plc_pitch_overlap_max = (plc_pitch_min >> 2);
correlation_span = CORRELATION_SPAN(sample_rate);
plc_history_len = correlation_span + plc_pitch_min;
//////////////
for(int i = 0; i < channel_count; i ++)
{
plc_state_t *t = new plc_state_t;
memset(t, 0, sizeof(plc_state_t));
t->pitchbuf = new float[plc_pitch_min];
t->history = new short[plc_history_len];
ChannelStates.push_back(t);
}
}
void PcmConcealer::Destroy()
{
if(!Initialized)
return;
while(ChannelStates.size())
{
plc_state_t *s = ChannelStates.at(0);
if(s)
{
if(s->history) delete s->history;
if(s->pitchbuf) delete s->pitchbuf;
memset(s, 0, sizeof(plc_state_t));
delete s;
}
ChannelStates.erase(ChannelStates.begin());
}
ChannelStates.clear();
Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
//Fill-in a block of missing audio samples.
// Produces synthetic audio for a block that was lost.
//   amp    : interleaved output buffer with room for `frames` samples per
//            channel; it is completely overwritten
//   frames : number of samples per channel to generate
// On the first lost block of a gap the pitch of the recent history is
// estimated and one cycle of speech is prepared in pitchbuf; that cycle is
// then repeated with linearly decaying gain, and silence is written once the
// gain reaches zero. The generated samples are recorded with save_history().
// Never writes more than `frames` samples per channel.
// Returns frames, or 0 when the object is not initialised.
int PcmConcealer::Fill(short amp[], int frames)
{
    if(!Initialized)
        return 0;
    for(int k = 0; k < (int)ChannelStates.size(); k++)
    {
        int i;
        float gain;
        plc_state_t *s = ChannelStates.at(k);
        if (s->missing_samples == 0)
        {
            // As the gap in real speech starts we need to assess the last known pitch,
            // and prepare the synthetic data we will use for fill-in
            normalise_history(s);
            s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + plc_history_len - correlation_span - plc_pitch_min, k, correlation_span);
            // We overlap a 1/4 wavelength
            const int pitch_overlap = s->pitch >> 2;
            // Cook up a single cycle of pitch, using a single cycle of the real signal
            // with 1/4 cycle OLA'ed to make the ends join up nicely
            // The first 3/4 of the cycle is a simple copy
            for (i = 0; i < s->pitch - pitch_overlap; i++)
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];
            // The last 1/4 of the cycle is overlapped with the end of the previous cycle
            float new_step = 1.0/pitch_overlap;
            float new_weight = new_step;
            for ( ; i < s->pitch; i++)
            {
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
                new_weight += new_step;
            }
            // We should now be ready to fill in the gap with repeated, decaying cycles
            // of what is in pitchbuf
            // We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
            // it into the previous real data. To avoid the need to introduce a delay
            // in the stream, reverse the last 1/4 wavelength, and OLA with that.
            gain = 1.0;
            new_step = 1.0/pitch_overlap;
            const float old_step = new_step;
            new_weight = new_step;
            float old_weight = 1.0 - new_step;
            // Bounded by frames as well: the caller's buffer only holds
            // `frames` samples per channel, which may be fewer than the
            // overlap length.
            for (i = 0; i < pitch_overlap && i < frames; i++)
            {
                const int index = (i * channel_count) + k;
                amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
                new_weight += new_step;
                old_weight -= old_step;
                if (old_weight < 0.0)
                    old_weight = 0.0;
            }
            s->pitch_offset = i;
        }
        else
        {
            gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
            i = 0;
        }
        // Repeat the stored cycle while it is still audible
        for ( ; gain > 0.0 && i < frames; i++)
        {
            const int index = (i * channel_count) + k;
            amp[index] = s->pitchbuf[s->pitch_offset]*gain;
            gain -= ATTENUATION_INCREMENT;
            if (++s->pitch_offset >= s->pitch)
                s->pitch_offset = 0;
        }
        // Fully faded out: the rest of this channel's block is silence
        for ( ; i < frames; i++)
        {
            const int index = (i * channel_count) + k;
            amp[index] = 0;
        }
        s->missing_samples += frames;
        save_history(s, amp, k, frames);
    }
    return frames;
}
// Append one channel of an interleaved block to that channel's circular
// history buffer.
//
// s             - state of the channel being recorded
// buf           - interleaved samples, channel_count samples per frame
// channel_index - which slot of every frame belongs to this channel
// frames        - number of frames available in buf
//
// s->buf_ptr is left pointing at the next write position.
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
    // This channel's samples sit `stride` shorts apart, starting at `src`.
    const short *src = buf + channel_index;
    const int stride = channel_count;

    if (frames >= plc_history_len)
    {
        // The block alone fills the history: keep only its newest
        // plc_history_len frames and restart at the beginning of the buffer.
        const int first = frames - plc_history_len;
        for (int n = 0; n < plc_history_len; ++n)
            s->history[n] = src[stride * (n + first)];
        s->buf_ptr = 0;
        return;
    }

    const int room = plc_history_len - s->buf_ptr;
    if (frames > room)
    {
        // Wraps around: fill up to the end of the buffer first...
        short *tail = s->history + s->buf_ptr;
        for (int n = 0; n < room; ++n)
            tail[n] = src[stride * n];
        // ...then continue from the start with what is left of the block.
        const int leftover = frames - room;
        for (int n = 0; n < leftover; ++n)
            s->history[n] = src[stride * (n + room)];
        s->buf_ptr = leftover;
        return;
    }

    // Fits without wrapping.
    short *dst = s->history + s->buf_ptr;
    for (int n = 0; n < frames; ++n)
        dst[n] = src[stride * n];
    s->buf_ptr += frames;
}
// Rotate the circular history buffer so the oldest sample ends up at index 0
// and the newest at index plc_history_len - 1, then reset buf_ptr to 0.
// Does nothing when buf_ptr is already 0.
void PcmConcealer::normalise_history(plc_state_t *s)
{
    // Check before allocating: the scratch buffer used to leak on this path.
    if (s->buf_ptr == 0)
        return;

    const int head_len = s->buf_ptr;                    // newest samples, stored at the front
    const int tail_len = plc_history_len - head_len;    // oldest samples, stored after buf_ptr

    short *tmp = new short[head_len];
    memcpy(tmp, s->history, sizeof(short)*head_len);
    // Source and destination overlap whenever tail_len > head_len, so this
    // must be memmove rather than memcpy.
    memmove(s->history, s->history + head_len, sizeof(short)*tail_len);
    memcpy(s->history + tail_len, tmp, sizeof(short)*head_len);
    s->buf_ptr = 0;
    delete [] tmp;
}
// Estimate the pitch period, in samples, with the average magnitude
// difference function (AMDF).
//
// min_pitch     - longest lag to try (period of the lowest pitch)
// max_pitch     - shortest lag to try (period of the highest pitch)
// amp           - start of the analysis window inside one channel's history;
//                 at least min_pitch + frames samples must be readable
// channel_index - unused, kept so existing callers still compile
// frames        - number of samples compared for each lag
//
// Returns the lag in [max_pitch, min_pitch] with the smallest accumulated
// difference, or min_pitch when the range is empty.
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int channel_index, int frames)
{
    // The history buffer holds a single channel (save_history de-interleaves
    // on the way in), so it is indexed directly. Applying the interleaved
    // stride here read past the end of the buffer for more than one channel.
    (void)channel_index;

    int pitch = min_pitch;
    int min_acc = INT_MAX;
    for (int i = max_pitch; i <= min_pitch; i++)
    {
        int acc = 0;
        for (int j = 0; j < frames; j++)
            acc += abs(amp[i + j] - amp[j]);
        if (acc < min_acc)
        {
            min_acc = acc;
            pitch = i;
        }
    }
    // The per-call "Pitch:" trace to stdout was debugging output and is no
    // longer emitted from the audio path.
    return pitch;
}
}
P.S. - I must confess that digital audio is not my forte...
Fixed the problem. The problem lay within the amdf_pitch function. There were some minor bugs elsewhere too (which have been repaired). As a result, the code will now run the testbed, inserting blank frames with a given probability.
Using Audacity I have studied the raw PCM streams that have been created via the testbed. When a blank set of frames is encountered, smoothing occurs from received to blank as expected; however, when we change from blank to valid/received data, we get clicks because the smoothing doesn't appear to be working during this phase. Any suggestions?
I have attached the updated code:
// Test bed for the concealer. Streams a mono 16-bit PCM file in blocks of
// 512 frames, treats roughly `loss_percent` percent of the blocks as lost,
// and writes two files: the lossy stream (lost blocks replaced by silence)
// and the repaired stream (lost blocks replaced by concealment audio).
// Returns 1 when any of the three files cannot be opened, otherwise 0.
int main (int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    std::ifstream fin("C:\\cc32kHz.pcm", std::ios::binary);
    if(!fin.is_open())
    {
        std::cout << "Failed to open input file" << std::endl;
        return 1;
    }
    std::ofstream fout_repaired("C:\\cc32kHz_repaired.pcm", std::ios::binary);
    if(!fout_repaired.is_open())
    {
        std::cout << "Failed to open output repaired file" << std::endl;
        return 1;
    }
    std::ofstream fout_lossy("C:\\cc32kHz_lossy.pcm", std::ios::binary);
    if(!fout_lossy.is_open())
    {
        std::cout << "Failed to open output lossy file" << std::endl;
        return 1;
    }

    audio::PcmConcealer Concealer;
    Concealer.Init(1, 16, 32000); //1-channel, 16-bit, 32kHz

    const int frames_per_block = 512;
    const int loss_percent = 3;

    srand( time(NULL) );
    for (;;)
    {
        int16_t block[frames_per_block];
        fin.read(reinterpret_cast<char *>(block), sizeof(block));
        // gcount() reports what this read delivered; tellg() returns -1 once
        // the final short read has set failbit, which corrupted the old
        // byte-count arithmetic.
        const int frames = static_cast<int>(fin.gcount() / sizeof(int16_t));
        if (frames == 0)
            break;      // end of input (a failed stream also yields 0 here)
        // Only write what was actually read, so a short final block does not
        // append stale buffer contents to the outputs.
        const std::streamsize bytes = static_cast<std::streamsize>(frames * sizeof(int16_t));

        const int value = rand() % 100 + 1;
        if (value <= loss_percent)
        {
            // Simulated loss: silence in the lossy stream, synthetic audio in
            // the repaired one.
            int16_t concealed[frames_per_block] = {0};
            fout_lossy.write(reinterpret_cast<const char *>(concealed), bytes);
            Concealer.Fill(concealed, frames);
            fout_repaired.write(reinterpret_cast<const char *>(concealed), bytes);
        }
        else
        {
            fout_lossy.write(reinterpret_cast<const char *>(block), bytes);
            // Receive() blends the start of the first good block after a gap
            // with the synthetic signal, modifying `block` in place. The
            // repaired stream must be written after that call, otherwise the
            // smoothing never reaches the file and the transition clicks.
            Concealer.Receive(block, frames);
            fout_repaired.write(reinterpret_cast<const char *>(block), bytes);
        }
    }

    fin.close();
    fout_repaired.close();
    fout_lossy.close();
    return 0;
}
PcmConcealer.hpp
/*
* PcmConcealer.hpp
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#ifndef __PCMCONCEALER_HPP__
#define __PCMCONCEALER_HPP__
/**
1. What does it do?
The packet loss concealment module provides a suitable synthetic fill-in signal,
to minimise the audible effect of lost packets in VoIP applications. It is not
tied to any particular codec, and could be used with almost any codec which does not
specify its own procedure for packet loss concealment.
Where a codec specific concealment procedure exists, the algorithm is usually built
around knowledge of the characteristics of the particular codec. It will, therefore,
generally give better results for that particular codec than this generic concealer will.
2. How does it work?
While good packets are being received, the plc_rx() routine keeps a record of the trailing
section of the known speech signal. If a packet is missed, plc_fillin() is called to produce
a synthetic replacement for the real speech signal. The average mean difference function
(AMDF) is applied to the last known good signal, to determine its effective pitch.
Based on this, the last pitch period of signal is saved. Essentially, this cycle of speech
will be repeated over and over until the real speech resumes. However, several refinements
are needed to obtain smooth pleasant sounding results.
- The two ends of the stored cycle of speech will not always fit together smoothly. This can
cause roughness, or even clicks, at the joins between cycles. To soften this, the
1/4 pitch period of real speech preceding the cycle to be repeated is blended with the last
1/4 pitch period of the cycle to be repeated, using an overlap-add (OLA) technique (i.e.
in total, the last 5/4 pitch periods of real speech are used).
- The start of the synthetic speech will not always fit together smoothly with the tail of
real speech passed on before the erasure was identified. Ideally, we would like to modify
the last 1/4 pitch period of the real speech, to blend it into the synthetic speech. However,
it is too late for that. We could have delayed the real speech a little, but that would
require more buffer manipulation, and hurt the efficiency of the no-lost-packets case
(which we hope is the dominant case). Instead we use a degenerate form of OLA to modify
the start of the synthetic data. The last 1/4 pitch period of real speech is time reversed,
and OLA is used to blend it with the first 1/4 pitch period of synthetic speech. The result
seems quite acceptable.
- As we progress into the erasure, the chances of the synthetic signal being anything like
correct steadily fall. Therefore, the volume of the synthesized signal is made to decay
linearly, such that after 50ms of missing audio it is reduced to silence.
- When real speech resumes, an extra 1/4 pitch period of synthetic speech is blended with the
start of the real speech. If the erasure is small, this smoothes the transition. If the erasure
is long, and the synthetic signal has faded to zero, the blending softens the start up of the
real signal, avoiding a kind of "click" or "pop" effect that might occur with a sudden onset.
3. How do I use it?
Before audio is processed, call plc_init() to create an instance of the packet loss
concealer. For each received audio packet that is acceptable (i.e. not including those being
dropped for being too late) call plc_rx() to record the content of the packet. Note this may
modify the packet a little after a period of packet loss, to blend real and synthetic data smoothly.
When a real packet is not available in time, call plc_fillin() to create a synthetic substitute.
That's it!
*/
/*! Minimum allowed pitch (66 Hz).
    Evaluates to the period of that pitch in samples, as a double
    (SAMPLE_RATE / 66.6), so it is the LONGEST lag the pitch search tries. */
#define PLC_PITCH_MIN(SAMPLE_RATE) ((double)(SAMPLE_RATE) / 66.6)
/*! Maximum allowed pitch (200 Hz).
    Evaluates to the period of that pitch in samples (SAMPLE_RATE / 200), so
    it is the SHORTEST lag the pitch search tries. Unlike PLC_PITCH_MIN there
    is no cast to double here, so an integer argument gives integer division. */
#define PLC_PITCH_MAX(SAMPLE_RATE) ((SAMPLE_RATE) / 200)
/*! Maximum pitch OLA window.
    Disabled: PLC_PITCH_MIN yields a double, which cannot be shifted. */
//#define PLC_PITCH_OVERLAP_MAX(SAMPLE_RATE) ((PLC_PITCH_MIN(SAMPLE_RATE)) >> 2)
/*! The length over which the AMDF function looks for similarity (20 ms),
    expressed in samples. */
#define CORRELATION_SPAN(SAMPLE_RATE) ((20 * (SAMPLE_RATE)) / 1000)
/*! History buffer length. The buffer must also be at least 1.25 times
PLC_PITCH_MIN, but that is much smaller than the buffer needs to be for
the pitch assessment.
    Disabled as a macro. */
//#define PLC_HISTORY_LEN(SAMPLE_RATE) ((CORRELATION_SPAN(SAMPLE_RATE)) + (PLC_PITCH_MIN(SAMPLE_RATE)))
namespace audio
{
    /** Concealment state kept separately for each audio channel. */
    typedef struct
    {
        /*! Consecutive erased samples */
        int missing_samples;
        /*! Current offset into pitch period */
        int pitch_offset;
        /*! Pitch estimate (period in samples) */
        int pitch;
        /*! Buffer for a cycle of speech */
        float *pitchbuf;//[PLC_PITCH_MIN];
        /*! History buffer (circular until normalised) */
        short *history;//[PLC_HISTORY_LEN];
        /*! Current pointer into the history buffer */
        int buf_ptr;
    } plc_state_t;

    /**
     * Generic packet loss concealer for interleaved 16-bit PCM.
     *
     * Call Init() once, Receive() for every block that arrived and Fill()
     * for every block that did not. The object owns the per-channel state
     * buffers, so it cannot be copied.
     */
    class PcmConcealer
    {
    public:
        PcmConcealer();
        ~PcmConcealer();

        /** Set up the concealer for the given channel count, bit depth and sample rate. */
        void Init(int channels, int bit_depth, int sample_rate);
        //Process a block of received audio samples.
        int Receive(short amp[], int frames);
        //Fill-in a block of missing audio samples.
        int Fill(short amp[], int frames);
        /** Release all per-channel state. */
        void Destroy();

    private:
        // Declared but never defined: copying would leave two objects
        // deleting the same plc_state_t buffers in their destructors.
        PcmConcealer(const PcmConcealer &);
        PcmConcealer &operator=(const PcmConcealer &);

        // `inline` dropped: the definition lives in the .cpp file, not here.
        int amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames);
        void save_history(plc_state_t *s, short *buf, int channel_index, int frames);
        void normalise_history(plc_state_t *s);

        /** Holds the states of each of the channels **/
        std::vector< plc_state_t * > ChannelStates;
        int plc_pitch_min;
        int plc_pitch_max;
        int plc_pitch_overlap_max;
        int correlation_span;
        int plc_history_len;
        int channel_count;
        int sample_rate;
        bool Initialized;
    };
}
#endif
PcmConcealer.cpp
/*
* PcmConcealer.cpp
*
* Code adapted from Steve Underwood of the Asterisk Project. This code inherits
* the same licensing restrictions as the Asterisk Project.
*/
#include "audio/PcmConcealer.hpp"
/* We do a straight line fade to zero volume in 50ms when we are filling in for missing data.
   NOTE(review): the step is applied per sample, so the gain reaches zero
   after 1 / 0.0025 = 400 samples. That is 50 ms only at 8 kHz; at 32 kHz,
   for example, the fade lasts 12.5 ms. The constant is not scaled by the
   sample rate. */
#define ATTENUATION_INCREMENT 0.0025 /* Attenuation per sample */
/* Fallback limits for toolchains whose headers do not define them. */
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
/* Fallback for Microsoft compilers older than Visual Studio 2013
   (_MSC_VER 1800), whose C runtime has no rint(). Newer Microsoft runtimes
   and other Windows toolchains already declare rint() in <math.h>, so
   defining it again there would clash with the library declaration.
   Note that this fallback rounds halves upwards, not to even. */
#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER < 1800)
inline double rint(double x)
{
    return floor(x + 0.5);
}
#endif
/* Round a floating point sample to the nearest integer and clamp it to the
   range of a 16-bit signed sample. */
inline short fsaturate(double damp)
{
    const bool too_high = damp > 32767.0;
    const bool too_low = damp < -32768.0;
    if (!too_high && !too_low)
        return (short)rint(damp);
    return too_high ? INT16_MAX : INT16_MIN;
}
namespace audio
{
// Creates an uninitialised concealer; Init() must be called before
// Receive() or Fill() do any work. Every scalar member gets a defined value
// so that the object never holds indeterminate data.
PcmConcealer::PcmConcealer()
    : plc_pitch_min(0),
      plc_pitch_max(0),
      plc_pitch_overlap_max(0),
      correlation_span(0),
      plc_history_len(0),
      channel_count(0),
      sample_rate(0),
      Initialized(false)
{
}
// Frees all per-channel state by delegating to Destroy(), which is a no-op
// when the object was never initialised.
PcmConcealer::~PcmConcealer()
{
    this->Destroy();
}
// Prepare the concealer for a stream.
//
// channels    - number of interleaved channels, must be > 0
// bit_depth   - only 16 is supported
// sample_rate - samples per second per channel, must be > 0
//
// Invalid arguments, or a call on an already initialised object, leave the
// object unchanged. On success one plc_state_t is allocated per channel.
void PcmConcealer::Init(int channels, int bit_depth, int sample_rate)
{
    if (Initialized)
        return;
    // A non-positive sample rate would produce zero or negative buffer sizes.
    if (channels <= 0 || bit_depth != 16 || sample_rate <= 0)
        return;

    Initialized = true;
    channel_count = channels;
    this->sample_rate = sample_rate;

    // All lengths below are in samples and depend on the sample rate.
    plc_pitch_min = (int)PLC_PITCH_MIN(sample_rate);
    plc_pitch_max = (int)PLC_PITCH_MAX(sample_rate);
    plc_pitch_overlap_max = (plc_pitch_min >> 2);
    correlation_span = CORRELATION_SPAN(sample_rate);
    plc_history_len = correlation_span + plc_pitch_min;

    for (int i = 0; i < channel_count; i++)
    {
        plc_state_t *t = new plc_state_t;
        memset(t, 0, sizeof(plc_state_t));
        // Value-initialise both buffers: Fill() may run before any audio has
        // been received and would otherwise read indeterminate samples.
        t->pitchbuf = new float[plc_pitch_min]();
        t->history = new short[plc_history_len]();
        ChannelStates.push_back(t);
    }
}
void PcmConcealer::Destroy()
{
if(!Initialized)
return;
while(ChannelStates.size())
{
plc_state_t *s = ChannelStates.at(0);
if(s)
{
if(s->history) delete s->history;
if(s->pitchbuf) delete s->pitchbuf;
memset(s, 0, sizeof(plc_state_t));
delete s;
}
ChannelStates.erase(ChannelStates.begin());
}
ChannelStates.clear();
Initialized = false;
}
//Process a block of received audio samples.
int PcmConcealer::Receive(short amp[], int frames)
{
if(!Initialized)
return 0;
int j = 0;
for(int k = 0; k < ChannelStates.size(); k++)
{
int i;
int overlap_len;
int pitch_overlap;
float old_step;
float new_step;
float old_weight;
float new_weight;
float gain;
plc_state_t *s = ChannelStates.at(k);
if (s->missing_samples)
{
/* Although we have a real signal, we need to smooth it to fit well
with the synthetic signal we used for the previous block */
/* The start of the real data is overlapped with the next 1/4 cycle
of the synthetic data. */
pitch_overlap = s->pitch >> 2;
if (pitch_overlap > frames)
pitch_overlap = frames;
gain = 1.0 - s->missing_samples * ATTENUATION_INCREMENT;
if (gain < 0.0)
gain = 0.0;
new_step = 1.0/pitch_overlap;
old_step = new_step*gain;
new_weight = new_step;
old_weight = (1.0 - new_step)*gain;
for (i = 0; i < pitch_overlap; i++)
{
int index = (i * channel_count) + j;
amp[index] = fsaturate(old_weight * s->pitchbuf[s->pitch_offset] + new_weight * amp[index]);
if (++s->pitch_offset >= s->pitch)
s->pitch_offset = 0;
new_weight += new_step;
old_weight -= old_step;
if (old_weight < 0.0)
old_weight = 0.0;
}
s->missing_samples = 0;
}
save_history(s, amp, j, frames);
j++;
}
return frames;
}
// Fill-in a block of missing audio samples.
//
// For every channel, writes `frames` synthetic samples into that channel's
// slots of the interleaved buffer `amp` (sample i of channel c is stored at
// amp[i * channel_count + c]) and then records them in the channel history.
// The first call after real audio estimates the pitch and builds one cycle
// of synthetic signal; later calls keep replaying that cycle with a gain that
// drops by ATTENUATION_INCREMENT per sample until it reaches zero.
//
// Returns `frames`, or 0 when the concealer has not been initialised.
int PcmConcealer::Fill(short amp[], int frames)
{
    if (!Initialized)
        return 0;

    const int channels = (int)ChannelStates.size();
    for (int j = 0; j < channels; j++)
    {
        plc_state_t *s = ChannelStates.at(j);
        float gain;
        int i;

        if (s->missing_samples == 0)
        {
            // As the gap in real speech starts we need to assess the last known pitch,
            // and prepare the synthetic data we will use for fill-in
            normalise_history(s);
            s->pitch = amdf_pitch(plc_pitch_min, plc_pitch_max, s->history + (plc_history_len - correlation_span - plc_pitch_min), correlation_span);
            // We overlap a 1/4 wavelength
            const int pitch_overlap = s->pitch >> 2;

            // Cook up a single cycle of pitch, using a single cycle of the real signal
            // with 1/4 cycle OLA'ed to make the ends join up nicely.
            // The first 3/4 of the cycle is a simple copy
            for (i = 0; i < s->pitch - pitch_overlap; i++)
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i];
            // The last 1/4 of the cycle is overlapped with the end of the previous cycle
            float new_step = 1.0/pitch_overlap;
            float new_weight = new_step;
            for ( ; i < s->pitch; i++)
            {
                s->pitchbuf[i] = s->history[plc_history_len - s->pitch + i]*(1.0 - new_weight) + s->history[plc_history_len - 2*s->pitch + i]*new_weight;
                new_weight += new_step;
            }

            // We should now be ready to fill in the gap with repeated, decaying cycles
            // of what is in pitchbuf.
            // We need to OLA the first 1/4 wavelength of the synthetic data, to smooth
            // it into the previous real data. To avoid the need to introduce a delay
            // in the stream, reverse the last 1/4 wavelength, and OLA with that.
            gain = 1.0;
            new_step = 1.0/pitch_overlap;
            const float old_step = new_step;
            new_weight = new_step;
            float old_weight = 1.0 - new_step;
            for (i = 0; (i < pitch_overlap) && (i < frames); i++)
            {
                const int index = (i * channel_count) + j;
                amp[index] = fsaturate(old_weight * s->history[plc_history_len - 1 - i] + new_weight * s->pitchbuf[i]);
                new_weight += new_step;
                old_weight -= old_step;
                if (old_weight < 0.0)
                    old_weight = 0.0;
            }
            s->pitch_offset = i;
        }
        else
        {
            // Continuing an erasure: resume the decay where the last block left off.
            gain = 1.0 - s->missing_samples*ATTENUATION_INCREMENT;
            i = 0;
        }

        // Replay the stored cycle with linearly decaying gain.
        for ( ; gain > 0.0 && i < frames; i++)
        {
            const int index = (i * channel_count) + j;
            amp[index] = s->pitchbuf[s->pitch_offset]*gain;
            gain -= ATTENUATION_INCREMENT;
            if (++s->pitch_offset >= s->pitch)
                s->pitch_offset = 0;
        }
        // Whatever is left after the gain reached zero is silence. The
        // interleaved index is required here; amp[i] is only right for mono.
        for ( ; i < frames; i++)
        {
            const int index = (i * channel_count) + j;
            amp[index] = 0;
        }

        s->missing_samples += frames;
        save_history(s, amp, j, frames);
    }
    return frames;
}
// Append one channel of an interleaved block to that channel's circular
// history buffer.
//
// s             - state of the channel being recorded
// buf           - interleaved samples, channel_count samples per frame
// channel_index - which slot of every frame belongs to this channel
// frames        - number of frames available in buf
//
// s->buf_ptr is left pointing at the next write position.
void PcmConcealer::save_history(plc_state_t *s, short *buf, int channel_index, int frames)
{
    // This channel's samples sit `stride` shorts apart, starting at `src`.
    const short *src = buf + channel_index;
    const int stride = channel_count;

    if (frames >= plc_history_len)
    {
        // The block alone fills the history: keep only its newest
        // plc_history_len frames and restart at the beginning of the buffer.
        const int first = frames - plc_history_len;
        for (int n = 0; n < plc_history_len; ++n)
            s->history[n] = src[stride * (n + first)];
        s->buf_ptr = 0;
        return;
    }

    const int room = plc_history_len - s->buf_ptr;
    if (frames > room)
    {
        // Wraps around: fill up to the end of the buffer first...
        short *tail = s->history + s->buf_ptr;
        for (int n = 0; n < room; ++n)
            tail[n] = src[stride * n];
        // ...then continue from the start with what is left of the block.
        const int leftover = frames - room;
        for (int n = 0; n < leftover; ++n)
            s->history[n] = src[stride * (n + room)];
        s->buf_ptr = leftover;
        return;
    }

    // Fits without wrapping.
    short *dst = s->history + s->buf_ptr;
    for (int n = 0; n < frames; ++n)
        dst[n] = src[stride * n];
    s->buf_ptr += frames;
}
// Rotate the circular history buffer so the oldest sample ends up at index 0
// and the newest at index plc_history_len - 1, then reset buf_ptr to 0.
// Does nothing when buf_ptr is already 0.
void PcmConcealer::normalise_history(plc_state_t *s)
{
    // Check before allocating: the scratch buffer used to leak on this path.
    if (s->buf_ptr == 0)
        return;

    const int head_len = s->buf_ptr;                    // newest samples, stored at the front
    const int tail_len = plc_history_len - head_len;    // oldest samples, stored after buf_ptr

    short *tmp = new short[head_len];
    memcpy(tmp, s->history, sizeof(short)*head_len);
    // Source and destination overlap whenever tail_len > head_len, so this
    // must be memmove rather than memcpy.
    memmove(s->history, s->history + head_len, sizeof(short)*tail_len);
    memcpy(s->history + tail_len, tmp, sizeof(short)*head_len);
    s->buf_ptr = 0;
    delete [] tmp;
}
// Estimate the pitch period, in samples, with the average magnitude
// difference function (AMDF).
//
// min_pitch - longest lag to try
// max_pitch - shortest lag to try
// amp       - start of the analysis window; min_pitch + frames samples are read
// frames    - number of samples compared for each lag
//
// Returns the lag in [max_pitch, min_pitch] whose summed absolute difference
// against the unshifted window is smallest (the first such lag on ties), or
// min_pitch when the range is empty.
int PcmConcealer::amdf_pitch(int min_pitch, int max_pitch, short amp[], int frames)
{
    int best_lag = min_pitch;
    int best_score = INT_MAX;

    for (int lag = max_pitch; lag <= min_pitch; ++lag)
    {
        const short *shifted = amp + lag;
        int score = 0;
        for (int n = 0; n < frames; ++n)
            score += abs(shifted[n] - amp[n]);

        if (score < best_score)
        {
            best_score = score;
            best_lag = lag;
        }
    }
    return best_lag;
}
}