Problems benchmarking simple code with googlebenchmark - c++

I want to benchmark this simple C code:
float f(float x[], float y[]) {
float p = 0;
for (int i = 0; i <64; i++)
p += x[i] * y[i];
return p;
}
My motivation is to try different compiler flags and also gcc and clang to see what difference it makes.
I found this test framework and have been trying to get it to work. Although I am completely new to C++, here is my best effort:
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
while (state.KeepRunning()) {
benchmark::DoNotOptimize(f((float*) state.range(0), (float*) state.range(1)));
}
}
void args(benchmark::internal::Benchmark* b) {
std::uniform_real_distribution<float> rand(0, 100);
for (int i = 0; i < 10; i++) {
float* x = new float[64];
float* y = new float[64];
for (int i = 0; i < 64; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
printf("%f %f\n", x[i], y[i]);
}
b->Args({(int) x, (int) y});
}
}
BENCHMARK(f_benchmark)->Apply(args);
BENCHMARK_MAIN();
To compile it I do:
g++ -Ofast -Wall -std=c++11 test.cpp -Ibenchmark/include/benchmark/
-Lbenchmark/src/ -o test -lbenchmark -lpthread
This gives me :
test.cpp: In function ‘void f_benchmark(benchmark::State&)’:
test.cpp:20:54: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
benchmark::DoNotOptimize(f((float*) state.range(0), (float*) state.range(1)));
[...]
test.cpp: In function ‘void args(benchmark::internal::Benchmark*)’:
test.cpp:38:20: error: cast from ‘float*’ to ‘int’ loses precision [-fpermissive]
b->Args({(int) x, (int) y});
^
[...]
How can I get rid of those warnings and in general, am I doing this
right?

Your code casts a float* to int and back to a float* - this can cause problems, because sizeof(int) and sizeof(float*) are not guaranteed to be identical (i.e. on x86-64 int is 32bit, while float* is 64bit!).
The reason why you run into this issue, is probably because Args() supports only int arguments (they're supposed to be used as index for a family of benchmarks, not as actual function arguments in your function). To use parameters of a different type you could:
A. use global variables to store the pre-calculated random array
i.e.
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float x[64*10], y[64*10];
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
while (state.KeepRunning()) {
benchmark::DoNotOptimize(f(&x[state.range(0)*64], &y[state.range(0)*64]));
}
}
void args(benchmark::internal::Benchmark* b) {
std::uniform_real_distribution<float> rand(0, 100);
for (int i = 0; i < 64*10; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
}
for (int i = 0; i < 10; ++i)
b->Arg({ i });
}
BENCHMARK(f_benchmark)->Apply(args);
BENCHMARK_MAIN();
B. calculate the random numbers as part of the benchmark function (choose this approach if, you really require different random values for each iteration - the timing needs to be paused / resumed accordingly to not include the time for the random generation/memory allocation in the benchmark)
i.e.
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
state.PauseTiming();
std::uniform_real_distribution<float> rand(0, 100);
float* x = new float[64];
float* y = new float[64];
while (state.KeepRunning()) {
for (int i = 0; i < 64; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
}
state.ResumeTiming();
benchmark::DoNotOptimize(f(x, y));
state.PauseTiming();
}
delete[] x;
delete[] y;
}
BENCHMARK(f_benchmark)->Apply([](benchmark::internal::Benchmark* b){
for (int i = 0; i < 10; ++i)
b->Arg({ i });
});
BENCHMARK_MAIN();
Side note: Also take care about the the leaking memory in your for loop - you should call the delete[] operator once for every new[] operator.

Related

Garbage in array after calling function

The problem occurs in foo() (in the commented lines), and is that foo2() should return the result of a matrix multiplication repeated process in it's first parameter. It is working in the first case and failing right after.
B and B_tmp arrays should have the same values at the end of foo() and that's not happening
T is 1x6 matrix, A is 6x3 matrix, B is 200x3 matrix
foo3() multiplies TxA and store the result (1x3 matrix) at the end of B
What foo2() does at the beginning with B_t1_t2 is not relevant, it just prepares the 1x6 matrix, changing the order in some way
I must try to solve this without changing any function declaration
I'm new to c++ and have been searching for too long now, I'm desperated
#include <stdio.h>
#include <iostream>
#include <random>
#include <thread>
using namespace std;
double fRand(const double & min, const double & max) {
thread_local std::mt19937 generator(std::random_device{}());
std::uniform_real_distribution<double> distribution(min, max);
return distribution(generator);
}
int iRand(const int & min, const int & max) {
thread_local std::mt19937 generator(std::random_device{}());
std::uniform_int_distribution<int> distribution(min, max);
return distribution(generator);
}
void foo3(double T[6], double A[18], double *B)
{
for(int i = 0; i < 3; i++) {
double r = 0;
for(int j = 0; j < 6; j++) {
r += T[j] * A[i*3+j];
}
*B = r; B++;
}
}
void foo2(double *B, double *A, int from, int to)
{
for (int i=from; i < to; i++) { //This is not relevant but I leave it just in case
double B_t1_t2[6];
for (int x = 0; x < 3; x++)
B_t1_t2[x] = B[(i-1)*3 + x];
for (int x = 0; x < 3; x++)
B_t1_t2[x+3] = B[(i-2)*3 + x];
foo3(B_t1_t2, A, &B[i*3]);
}
}
void foo(double *A, double *B)
{
for (int i = 0; i < 18; i++)
A[i] = fRand(1, 2);
foo2(B, A, 2, 200);
cout << "\nB" << endl;
for (int i = 0; i < 600; i++)
cout << B[i] << " "; // HERE IS WORKING, B DOES NOT CONTAIN GARBAGE
cout << endl;
double B_tmp[600];
foo2(B_tmp, A, 2, 200);
cout << "\nB_tmp" << endl;
for (int i = 0; i < 600; i++)
cout << B_tmp[i] << " "; // WHY NOT WORKING HERE?
cout << endl;
}
int main()
{
double A[18], B[600];
for(int i = 0; i<6; i++)
B[i] = 1;
foo(A, B);
}
Why the second cout in foo() is showing garbage?
Also, if declarations must change, what would be the best way?
Im trying to use stack memory as much as I can.
Before calling foo(A, B); first 6 elements of B array were filled (all are set to 1). In foo function you call foo2 function twice. In first call you pass B array into foo2 function, and it works because B is filled. In second call of foo2 in foo you pass B_tmp array but all items of this array have garbage value, you didn't initialize them. So do
double B_tmp[600];
for (int i = 0; i < 6; ++i)
B_tmp[i] = 1;
foo2(B_tmp, A, 2, 200);

Avoid blas when involving temporary memory allocation?

I have a program that computes the matrix product x'Ay repeatedly. Is it better practice to compute this by making calls to MKL's blas, i.e. cblas_dgemv and cblas_ddot, which requires allocating memory to a temporary vector, or is better to simply take the sum of x_i * a_ij * y_j? In other words, does MKL's blas theoretically add any value?
I benchmarked this for my laptop. There was virtually no difference in each of the tests, other than g++_no_blas performed twice as poorly as the other tests (why?). There was also no difference between O2, O3 and Ofast.
g++_blas_static 57ms
g++_blas_dynamic 58ms
g++_no_blas 100ms
icpc_blas_static 57ms
icpc_blas_dynamic 58ms
icpc_no_blas 58ms
util.h
#ifndef UTIL_H
#define UTIL_H
#include <random>
#include <memory>
#include <iostream>
struct rng
{
rng() : unif(0.0, 1.0)
{
}
std::default_random_engine re;
std::uniform_real_distribution<double> unif;
double rand_double()
{
return unif(re);
}
std::unique_ptr<double[]> generate_square_matrix(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N * N]);
for (unsigned i = 0; i < N; ++i)
{
for (unsigned j = 0; j < N; ++j)
{
p.get()[i*N + j] = rand_double();
}
}
return p;
}
std::unique_ptr<double[]> generate_vector(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N]);
for (unsigned i = 0; i < N; ++i)
{
p.get()[i] = rand_double();
}
return p;
}
};
#endif // UTIL_H
main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>
#include "util.h"
#include "mkl.h"
double vtmv_blas(double* x, double* A, double* y, const unsigned n)
{
double temp[n];
cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, A, n, y, 1, 0.0, temp, 1);
return cblas_ddot(n, temp, 1, x, 1);
}
double vtmv_non_blas(double* x, double* A, double* y, const unsigned n)
{
double r = 0;
for (unsigned i = 0; i < n; ++i)
{
for (unsigned j = 0; j < n; ++j)
{
r += x[i] * A[i*n + j] * y[j];
}
}
return r;
}
int main()
{
std::cout << std::fixed;
std::cout << std::setprecision(2);
constexpr unsigned N = 10000;
rng r;
std::unique_ptr<double[]> A = r.generate_square_matrix(N);
std::unique_ptr<double[]> x = r.generate_vector(N);
std::unique_ptr<double[]> y = r.generate_vector(N);
auto start = std::chrono::system_clock::now();
const double prod = vtmv_blas(x.get(), A.get(), y.get(), N);
auto end = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start);
std::cout << "Result: " << prod << std::endl;
std::cout << "Time (ms): " << duration.count() << std::endl;
GCC no blas is poor because it does not use vectorized SMID instructions, while others all do. icpc will auto-vectorize you loop.
You don't show your matrix size, but generally gemv is memory bound. As the matrix is much larger than a temp vector, eliminating it may not be able to increase the performance a lot.

Visual C++ SSE function slow when starting without debugger

i have a pretty weird problem regarding SSE usage.
I wrote the following function where i use SSE to calculate the maximum of the difference of two float arrays, each containing 64 floats.
The dists-array is a 2d-array allocated via _aligned_malloc.
#include <iostream>
#include <xmmintrin.h>
#include <time.h>
#include <stdio.h>
#include <algorithm>
#include <fstream>
#include "hr_time.h"
using namespace std;
float** dists;
float** dists2;
__m128* a;
__m128* b;
__m128* c;
__m128* d;
__m128 diff;
__m128 diff2;
__m128 mymax;
float* myfmax;
float test(int s, int t)
{
a = (__m128*) dists[s];
b = (__m128*) dists[t];
c = (__m128*) dists2[s];
d = (__m128*) dists2[t];
diff;
mymax = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
for (int i = 0; i <= 16; i++)
{
diff = _mm_sub_ps(*a, *b);
mymax = _mm_max_ps(diff, mymax);
diff2 = _mm_sub_ps(*d, *c);
mymax = _mm_max_ps(diff2, mymax);
a++;
b++;
c++;
d++;
}
_mm_store_ps(myfmax, mymax);
float res = max(max(max(myfmax[0], myfmax[1]), myfmax[2]), myfmax[3]);
return res;
}
int Deserialize(std::istream* stream)
{
int numOfElements, arraySize;
stream->read((char*)&numOfElements, sizeof(int)); // numOfElements = 64
stream->read((char*)&arraySize, sizeof(int)); // arraySize = 8000000
dists = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
dists2 = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
for (int j = 0; j < arraySize; j++)
{
dists[j] = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
dists2[j] = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
}
for (int i = 0; i < arraySize; i++)
{
stream->read((char*)dists[i], (numOfElements*sizeof(float)));
}
for (int i = 0; i < arraySize; i++)
{
stream->read((char*)dists2[i], (numOfElements*sizeof(float)));
}
return 0;
}
int main(int argc, char** argv)
{
int entries = 8000000;
myfmax = (float*)_aligned_malloc(4 * sizeof(float), 16);
ifstream fs("binary_file", std::ios::binary);
Deserialize(&fs);
CStopWatch* watch = new CStopWatch();
watch->StartTimer();
int i;
for (i = 0; i < entries; i++)
{
int s = rand() % entries;
int t = rand() % entries;
test(s, t);
}
watch->StopTimer();
cout << i << " iterations took " << watch->GetElapsedTimeMs() << "ms" << endl;
cin.get();
}
My problem is, that this code runs very fast if i run it in Visual Studio with an attached debugger. But as soon as i execute it without the debugger it gets very slow.
So i did a little reasearch and found out that one difference between those two starting methods is the "Debug Heap". So i disabled that by defining "_NO_DEBUG_HEAP=1". With that option i get very poor performance with an attached debugger too.
But i don't understand how i can get better performance by using the Debug Heap? And i don't know how to solve this problem, so i hope one of you guys can help me.
Thanks in advance.
Regards,
Karsten
Your code has a bug. _mm_store_ps stores an array of four floats but you only declare one. The compiler should not even allow you do to that.
Change
float fmax;
_mm_store_ps(fmax, max);
pi = std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);
to
float __declspec(align(16)) fmax[4];
_mm_store_ps(fmax, max);
return std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);

Unhandled exception with C++ class function

I am writing a program which will preform texture synthesis. I have been away from C++ for a while and am having trouble figuring out what I am doing wrong in my class. When I run the program, I get an unhandled exception in the copyToSample function when it tries to access the arrays. It is being called from the bestSampleSearch function when the unhandled exception occurs. The function has been called before and works just fine, but later on in the program it is called a second time and fails. Any ideas? Let me know if anyone needs to see more code. Thanks!
Edit1: Added the bestSampleSearch function and the compareMetaPic function
Edit2: Added a copy constructor
Edit3: Added main()
Edit4: I have gotten the program to work. However there is now a memory leak of some kind or I am running out of memory when I run the program. It seems in the double for loop in main which starts "// while output picture is unfilled" is the problem. If I comment this portion out the program finishes in a timely manner but only one small square is output. Something must be wrong with my bestSampleSearch function.
MetaPic.h
#pragma once
#include <pic.h>
#include <stdlib.h>
#include <cmath>
class MetaPic
{
public:
Pic* source;
Pixel1*** meta;
int x;
int y;
int z;
MetaPic();
MetaPic(Pic*);
MetaPic(const MetaPic&);
MetaPic& operator=(const MetaPic&);
~MetaPic();
void allocateMetaPic();
void copyPixelData();
void copyToOutput(Pic*&);
void copyToMetaOutput(MetaPic&, int, int);
void copyToSample(MetaPic&, int, int);
void freeMetaPic();
};
MetaPic.cpp
#include "MetaPic.h"
MetaPic::MetaPic()
{
source = NULL;
meta = NULL;
x = 0;
y = 0;
z = 0;
}
MetaPic::MetaPic(Pic* pic)
{
source = pic;
x = pic->nx;
y = pic->ny;
z = pic->bpp;
allocateMetaPic();
copyPixelData();
}
MetaPic::MetaPic(const MetaPic& mp)
{
source = mp.source;
x = mp.x;
y = mp.y;
z = mp.z;
allocateMetaPic();
copyPixelData();
}
MetaPic::~MetaPic()
{
freeMetaPic();
}
// create a 3 dimensional array from the original one dimensional array
void MetaPic::allocateMetaPic()
{
meta = (Pixel1***)calloc(x, sizeof(Pixel1**));
for(int i = 0; i < x; i++)
{
meta[i] = (Pixel1**)calloc(y, sizeof(Pixel1*));
for(int j = 0; j < y; j++)
{
meta[i][j] = (Pixel1*)calloc(z, sizeof(Pixel1));
}
}
}
void MetaPic::copyPixelData()
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < x; i++)
{
for(int k = 0; k < z; k++)
meta[i][j][k] = source->pix[(j*z*x)+(i*z)+k];
}
}
}
void MetaPic::copyToOutput(Pic* &output)
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < x; i++)
{
for(int k = 0; k < z; k++)
output->pix[(j*z*x)+(i*z)+k] = meta[i][j][k];
}
}
}
// copy the meta data to the final pic output starting at the top left of the picture and mapped to 'a' and 'b' coordinates in the output
void MetaPic::copyToMetaOutput(MetaPic &output, int a, int b)
{
for(int j = 0; (j < y) && ((j+b) < output.y); j++)
{
for(int i = 0; (i < x) && ((i+a) < output.x); i++)
{
for(int k = 0; k < z; k++)
output.meta[i+a][j+b][k] = meta[i][j][k];
}
}
}
// copies from a source image to a smaller sample image
// *** Must make sure that the x and y coordinates have enough buffer space ***
void MetaPic::copyToSample(MetaPic &sample, int a, int b)
{
for(int j = 0; (j < sample.y) && ((b+j) < y); j++)
{
for(int i = 0; i < (sample.x) && ((a+i) < x); i++)
{
for(int k = 0; k < sample.z; k++)
{
**sample.meta[i][j][k] = meta[i+a][j+b][k];**
}
}
}
}
// free the meta pic data (MetaPic.meta)
// *** Not to be used outside of class declaration ***
void MetaPic::freeMetaPic()
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < z; i++)
free(meta[i][j]);
}
for(int i = 0; i < x; i++)
free(meta[i]);
free(meta);
}
MetaPic MetaPic::operator=(MetaPic mp)
{
MetaPic newMP(mp.source);
return newMP;
}
main.cpp
#ifdef WIN32
// For VC++ you need to include this file as glut.h and gl.h refer to it
#include <windows.h>
// disable the warning for the use of strdup and friends
#pragma warning(disable:4996)
#endif
#include <stdio.h> // Standard Header For Most Programs
#include <stdlib.h> // Additional standard Functions (exit() for example)
#include <iostream>
// Interface to libpicio, provides functions to load/save jpeg files
#include <pic.h>
#include <string.h>
#include <time.h>
#include <cmath>
#include "MetaPic.h"
using namespace std;
MetaPic bestSampleSearch(MetaPic, MetaPic);
double compareMetaPics(MetaPic, MetaPic);
#define SAMPLE_SIZE 23
#define OVERLAP 9
// Texture source image (pic.h uses the Pic* data structure)
Pic *sourceImage;
Pic *outputImage;
int main(int argc, char* argv[])
{
char* pictureName = "reg1.jpg";
int outputWidth = 0;
int outputHeight = 0;
// attempt to read in the file name
sourceImage = pic_read(pictureName, NULL);
if(sourceImage == NULL)
{
cout << "Couldn't read the file" << endl;
system("pause");
exit(EXIT_FAILURE);
}
// *** For now set the output image to 3 times the original height and width ***
outputWidth = sourceImage->nx*3;
outputHeight = sourceImage->ny*3;
// allocate the output image
outputImage = pic_alloc(outputWidth, outputHeight, sourceImage->bpp, NULL);
Pic* currentImage = pic_alloc(SAMPLE_SIZE, SAMPLE_SIZE, sourceImage->bpp, NULL);
MetaPic metaSource(sourceImage);
MetaPic metaOutput(outputImage);
MetaPic metaCurrent(currentImage);
// seed the output image
int x = 0;
int y = 0;
int xupperbound = metaSource.x - SAMPLE_SIZE;
int yupperbound = metaSource.y - SAMPLE_SIZE;
int xlowerbound = 0;
int ylowerbound = 0;
// find random coordinates
srand(time(NULL));
while((x >= xupperbound) || (x <= xlowerbound))
x = rand() % metaSource.x;
while((y >= yupperbound) || (y <= ylowerbound))
y = rand() % metaSource.y;
// copy a random sample from the source to the metasample
metaSource.copyToSample(metaCurrent, x, y);
// copy the seed to the metaoutput
metaCurrent.copyToMetaOutput(metaOutput, 0, 0);
int currentOutputX = 0;
int currentOutputY = 0;
// while the output picture is unfilled...
for(int j = 0; j < yupperbound; j+=(SAMPLE_SIZE-OVERLAP))
{
for(int i = 0; i < xupperbound; i+=(SAMPLE_SIZE-OVERLAP))
{
// move the sample to correct overlap
metaSource.copyToSample(metaCurrent, i, j);
// find the best match for the sample
metaCurrent = bestSampleSearch(metaSource, metaCurrent);
// write the best match to the metaoutput
metaCurrent.copyToMetaOutput(metaOutput, i, j);
// update the values
}
}
// copy the metaOutput to the output
metaOutput.copyToOutput(outputImage);
// output the image
pic_write("reg1_output.jpg", outputImage, PIC_JPEG_FILE);
// clean up
pic_free(sourceImage);
pic_free(outputImage);
pic_free(currentImage);
// return success
cout << "Done!" << endl;
system("pause");
// return success
return 0;
}
// finds the best sample to insert into the image
// *** best must be the sample which consists of the overlap ***
MetaPic bestSampleSearch(MetaPic source, MetaPic best)
{
MetaPic metaSample(best);
double bestScore = 999999.0;
double currentScore = 0.0;
for(int j = 0; j < source.y; j++)
{
for(int i = 0; i < source.x; i++)
{
// copy the image starting at the top left of the source image
source.copyToSample(metaSample, i, j);
// compare the sample with the overlap
currentScore = compareMetaPics(best, metaSample);
// if best score is greater than current score then copy the better sample to best and continue searching
if( bestScore > currentScore)
{
metaSample.copyToSample(best, 0, 0);
bestScore = currentScore;
}
// otherwise, the score is less than current score then do nothing (a better sample has not been found)
}
}
return best;
}
// find the comparison score for the two MetaPics based on their rgb values
// *** Both of the meta pics should be the same size ***
double compareMetaPics(MetaPic pic1, MetaPic pic2)
{
float r1 = 0.0;
float g1 = 0.0;
float b1 = 0.0;
float r2 = 0.0;
float g2 = 0.0;
float b2 = 0.0;
float r = 0.0;
float g = 0.0;
float b = 0.0;
float sum = 0.0;
// take the sum of the (sqrt((r1-r2)^2 + ((g1-g2)^2 + ((b1-b2)^2))
for(int j = 0; (j < pic1.y) && (j < pic2.y); j++)
{
for(int i = 0; (i < pic1.x) && (i < pic2.x); i++)
{
r1 = PIC_PIXEL(pic1.source, i, j, 0);
r2 = PIC_PIXEL(pic2.source, i, j, 0);
g1 = PIC_PIXEL(pic1.source, i, j, 1);
g2 = PIC_PIXEL(pic2.source, i, j, 1);
b1 = PIC_PIXEL(pic1.source, i, j, 2);
b2 = PIC_PIXEL(pic2.source, i, j, 2);
r = r1 - r2;
g = g1 - g2;
b = b1 - b2;
sum += sqrt((r*r) + (g*g) + (b*b));
}
}
return sum;
}
I'm not sure if this is the root cause of the problem, but your assignment operator does not actually assign anything:
MetaPic MetaPic::operator=(MetaPic mp)
{
MetaPic newMP(mp.source);
return newMP;
}
This should probably look something like the following (based off of the code in your copy constructor):
edit: with credit to Alf P. Steinbach
MetaPic& MetaPic::operator=(MetaPic mp)
{
mp.swap(*this);
return *this;
}
It turns out that the deallocate function is incorrect. It should be freeing in the same manner that it was allocating.
void MetaPic::freeMetaPic()
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < z; i++)
free(meta[i][j]);
}
for(int i = 0; i < x; i++)
free(meta[i]);
free(meta);
}

Segmentation fault In the Cascaded Struct Pointers Test Code

The following dummy test code gives segmentation fault at the end of execution (to be more specific in main at return 0). I wondered the reason of this behavior. Would it be because it couldn't free the dummy variable? I'm using g++ 4.4 with no optimization flags for the tests.
#include <vector>
#include <boost/multi_array.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
using std::vector;
typedef boost::multi_array<float, 1> DVec;
class Point{
public:
int x, y;
double *dist;
DVec dir;
};
struct another_struct {
vector <Point *>c;
};
struct in_foo{
vector <another_struct *>aVec;
char *aname;
float b;
};
struct foo {
DVec b;
vector<in_foo *> mVec;
};
int main(){
DVec c(boost::extents[4]);
foo **dummy = (foo **) calloc(4, sizeof(*dummy));
vector <in_foo *>test_var(5);
for(int i =0; i < 6; i++){
test_var[i] = (in_foo *) malloc(sizeof(in_foo));
memset(test_var[i], 0, sizeof(*test_var[i]));
test_var[i]->aname = "42!\n";
test_var[i]->b = (float) i;
}
for (int i = 0 ; i < 4; i++) {
dummy[i] = (foo *) malloc(sizeof(*dummy[i]));
(dummy[i]->b).resize(boost::extents[2]);
(dummy[i]->mVec) = test_var;
}
for (int i = 0 ; i < 4; i++) {
for(int j = 0; j < 5; j++){
(dummy[i]->mVec[j]->aVec).resize(5);
for (int n = 0; n < 6; n++) {
dummy[i]->mVec[j]->aVec[n] = new another_struct();
(dummy[i]->mVec[j]->aVec[n])->c.resize(3);
for (int m = 0; m < 4; m++) {
(dummy[i]->mVec[j]->aVec[n]->c[m]) = new Point();
(dummy[i]->mVec[j]->aVec[n]->c[m])->x = 100 * n;
(dummy[i]->mVec[j]->aVec[n]->c[m])->y = 11000 * m;
(dummy[i]->mVec[j]->aVec[n]->c[m])->dist = new double[2];
(dummy[i]->mVec[j]->aVec[n]->c[m])->dist[0] = 11200.123;
(dummy[i]->mVec[j]->aVec[n]->c[m])->dist[1] = 66503.131;
printf("x: %d, y: %d, dist 0: %f, dist 1: %f \n", (dummy[i]->mVec[j]->aVec[n]->c[m])->x, (dummy[i]->mVec[j]->aVec[n]->c[m])->y, (dummy[i]->mVec[j]->aVec[n]->c[m])->dist[0], (dummy[i]->mVec[j]->aVec[n]->c[m])->dist[1]);
}
}
printf("b: %f aname: %s \n", dummy[i]->mVec[j]->b, dummy[i]->mVec[j]->aname);
}
}
if (NULL != dummy) {
for(int i = 0; i < 4; i++)
{
free(dummy[i]);
}
free(dummy);
}
return 0;
}
You can't use malloc or calloc to allocate memory for a class or struct that is non-POD, for example vector, foo, in_foo. Once you do that all bets are off and any behavior your program displays is within reason.
Use new with smart pointers or better yet use composition if possible.pointers with new.