I am trying to write a 3D matrix (a rank-3 tensor) to a text file that can be read by MATLAB code. The example below writes the tensor to a text file, but when I open the file in MATLAB it is read as a 256x16 matrix instead of a 16x16x16 array.
Example:
static const int nx = 16;
static const int ny = 16;
static const int nz = 16;double Lx = 128;
double Ly = 128;
double LZ = 128;
double dx = Lx / nx;
double dy = Ly / ny;
double dz = Lz / nz;
double a = (2 * EIGEN_PI)/Lx;
double b = (2 * EIGEN_PI)/ Ly;
Eigen::Tensor<double, 3> X(nx,ny,nz);
eXX.setZero();
Eigen::Tensor<double, 3> Y(nx,ny,nz);
eYY.setZero();
Eigen::Tensor<double, 3> Z(nx,ny,nz);
eZZ.setZero();
for(int i = 0; i< nx; i++){
for(int j = 0; j< ny; j++){
for(int k = 0; k< nz; k++){
X(k,i,j) = i*dx;
Y(j,i,k) = j*dy;
Z(j,i,k) = k*dz;
}
}
}
Eigen::Tensor<double, 3> Out(nx,ny,nz);
Out.setZero();
for(int i = 0; i< nx; i++){
for(int j = 0; j< ny; j++){
for(int k = 0; k< nz; k++){
Out(k,i,j) = sin(3. * a * Z(k,i,j)) * sin(a * X(k,i,j)) * cos(b * Y(k,i,j));
}
}
}
std::ofstream file("test.txt");
if (file.is_open())
{
file << Out << '\n';
}
I know I will probably have to write my own output routine instead of using operator<<, since it treats a rank-3 tensor as a 2-D matrix for the purpose of output.
I am attempting to write a naive implementation of the Short-Time Fourier Transform using consecutive FFT frames in time, calculated using the FFTW library, but I am getting a Segmentation fault and cannot work out why.
My code is as below:
// load in audio
AudioFile<double> audioFile;
audioFile.load ("assets/example-audio/file_example_WAV_1MG.wav");
int N = audioFile.getNumSamplesPerChannel();
// make stereo audio mono
double fileDataMono[N];
if (audioFile.isStereo())
for (int i = 0; i < N; i++)
fileDataMono[i] = ( audioFile.samples[0][i] + audioFile.samples[1][i] ) / 2;
// setup stft
// (test transform, presently unoptimized)
int stepSize = 512;
int M = 2048; // fft size
int noOfFrames = (N-(M-stepSize))/stepSize;
// create Hamming window vector
double w[M];
for (int m = 0; m < M; m++) {
w[m] = 0.53836 - 0.46164 * cos( 2*M_PI*m / M );
}
double* input;
// (pads input array if necessary)
if ( (N-(M-stepSize))%stepSize != 0) {
noOfFrames += 1;
int amountOfZeroPadding = stepSize - (N-(M-stepSize))%stepSize;
double ipt[N + amountOfZeroPadding];
for (int i = 0; i < N; i++) // copy values from fileDataMono into input
ipt[i] = fileDataMono[i];
for (int i = 0; i < amountOfZeroPadding; i++)
ipt[N + i] = 0;
input = ipt;
} else {
input = fileDataMono;
}
// compute stft
fftw_complex* stft[noOfFrames];
double frames[noOfFrames][M];
fftw_plan fftPlan;
for (int i = 0; i < noOfFrames; i++) {
stft[i] = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * M);
for (int m = 0; m < M; m++)
frames[i][m] = input[i*stepSize + m] * w[m];
fftPlan = fftw_plan_dft_r2c_1d(M, frames[i], stft[i], FFTW_ESTIMATE);
fftw_execute(fftPlan);
}
// compute istft
double* outputFrames[noOfFrames];
double output[N];
for (int i = 0; i < noOfFrames; i++) {
outputFrames[i] = (double*)fftw_malloc(sizeof(double) * M);
fftPlan = fftw_plan_dft_c2r_1d(M, stft[i], outputFrames[i], FFTW_ESTIMATE);
fftw_execute(fftPlan);
for (int m = 0; i < M; m++) {
output[i*stepSize + m] += outputFrames[i][m];
}
}
fftw_destroy_plan(fftPlan);
for (int i = 0; i < noOfFrames; i++) {
fftw_free(stft[i]);
fftw_free(outputFrames[i]);
}
// output audio
AudioFile<double>::AudioBuffer outputBuffer;
outputBuffer.resize (1);
outputBuffer[0].resize(N);
outputBuffer[0].assign(output, output+N);
bool ok = audioFile.setAudioBuffer(outputBuffer);
audioFile.setAudioBufferSize (1, N);
audioFile.setBitDepth (16);
audioFile.setSampleRate (8000);
audioFile.save ("out/audioOutput.wav");
The segfault appears to be raised by the first fftw_malloc call when computing the forward STFT.
Thanks in advance!
The relevant bit of code is:
// Excerpt showing only the lines involved in the lifetime problem.
double* input;
if ( (N-(M-stepSize))%stepSize != 0) {
// ipt is a local array: it exists only until the closing brace of this if.
double ipt[N + amountOfZeroPadding];
//...
// input now points at storage whose lifetime ends with this block.
input = ipt;
}
//...
// By the time this read happens, ipt no longer exists.
input[i*stepSize + m];
Your input pointer points at memory that exists only inside the if statement. The closing brace denotes the end of the lifetime of the ipt array. When dereferencing the pointer later, you are addressing memory that no longer exists.
I am new to C++ and have no real idea why my program crashes, only guesses.
The following program suddenly started to crash on line 49, at the void saveSig(cv::Mat *frame) line itself,
without even stepping into the function.
It ran fine before.
The program is supposed to track a person in a video under certain circumstances, which I will not go over since they haven't been implemented yet.
I can only guess that I have run out of stack space, but I'm not sure why; it might be a leak that I missed, or maybe it's something else entirely and very stupid.
PS: sorry if the code is not "pretty". I'm really new to C++ and OpenCV, and I will appreciate any comments about "bad coding practice".
#include "myCVFunctions.h"
#include <vector>
#define LOADING_VIDEO_ERROR -1
#define LOADING_BACKGROUND_IMAGE_ERROR -2
#define FRAME_BUFFER_SIZE 10
#define SIG_BUFFER_SIZE 6
// Paths of the input video and of the background reference image.
const cv::string g_c_videoFilePath = "res/tmp.mp4";
const cv::string g_c_bgFilePath = "res/bg.jpg";
// Background image; main() checks g_c_bg.data to detect a failed load.
const cv::Mat g_c_bg = cv::imread(g_c_bgFilePath);
// Regions of interest, in pixel coordinates (x, y, width, height).
// NOTE(review): g_c_entranceROIRect is only used for an unused local in proccesFrame.
const cv::Rect g_c_entranceROIRect(869, 999, 345, 80);
// Region scanned by proccesFrame to decide whether a person is inside.
const cv::Rect g_c_largeEntranceROIRect(869, 340, 345, 740);
// Region cropped by saveSig before computing a signature value.
const cv::Rect g_c_sigROI(869,539,345,541);
// Copy of the most recently read frame; onMouse restores it before drawing.
cv::Mat g_currFrameBackup;
// Position of the last left-button press; x < 0 means no drag is in progress.
cv::Point g_clickCoords(-1,-1);
// Rectangle currently being dragged with the mouse.
cv::Rect g_markedROI;
// Set by onMouse on button release; enables contour drawing in proccesFrame.
bool g_trace = false;
// Result of the entrance scan performed by proccesFrame on each frame.
bool g_personInside = false;
// Selects the "new" signature buffers (true) or the reference ones (false).
bool g_useSig = false;
// Number of signature values stored so far; also the next write index.
char g_sigCount = 0;
// Reference and new signature values written by saveSig.
double g_sig[SIG_BUFFER_SIZE];
double g_newSig[SIG_BUFFER_SIZE];
// First white pixel found in the thresholded frame for each stored signature.
cv::Point g_inSigHeadCoords[SIG_BUFFER_SIZE];
cv::Point g_inNewSigHeadCoords[SIG_BUFFER_SIZE];
// Scaled averages of g_sig and g_newSig, computed in saveSig.
long double av1 = 0;
long double av2 = 0;
// Reset by proccesFrame; otherwise only referenced from commented-out code.
double minDiff = 9999999999.999999;
void onMouse(int event, int x, int y, int flags, void* userdata){
if(event == CV_EVENT_LBUTTONDOWN){
g_clickCoords.x = x;
g_clickCoords.y = y;
}
if(event == CV_EVENT_MOUSEMOVE && g_clickCoords.x>=0){
g_markedROI = cv::Rect(g_clickCoords, cv::Point(x,y));
g_currFrameBackup.copyTo(*((cv::Mat*)userdata));
cv::rectangle(*((cv::Mat*)userdata), g_markedROI, cv::Scalar(0,255,0));
}
if(event == CV_EVENT_LBUTTONUP){
g_trace = true;
g_useSig = true;
g_clickCoords = cv::Point(-1,-1);
}
}
// Computes one signature value for the given frame and stores it in the
// current slot of g_newSig (when g_useSig is set) or g_sig.
//
// The g_c_sigROI region is converted to HSV, cropped/zero-padded to 512x512,
// and channel 2 of every pixel is fed into a 2-D FFT. The spectrum is
// multiplied by itself, transformed back, and the average of the real part
// is the stored value. Once a full set of new values has been collected,
// av1 and av2 are updated with the scaled averages of both buffers.
//
// frame: BGR frame that must contain g_c_sigROI.
void saveSig(cv::Mat *frame){
    // Callers are expected to check this, but an out-of-range index would
    // write outside the signature buffers.
    if(g_sigCount < 0 || g_sigCount >= SIG_BUFFER_SIZE){
        return;
    }

    // 512*512 doubles are about 2 MB: too large for the stack, so the buffer
    // lives on the heap.
    std::vector<double> fftData(512*512);
    cv::Mat sigROI, sigHSV, resized;
    sigROI = (*frame)(g_c_sigROI);
    cv::cvtColor(sigROI, sigHSV, CV_BGR2HSV);
    resized = my_cv::resize_zeros(sigHSV, cv::Size(512,512));
    cv::MatIterator_<cv::Vec3b> m_it = resized.begin<cv::Vec3b>();
    const cv::MatIterator_<cv::Vec3b> m_end = resized.end<cv::Vec3b>();
    for(std::size_t i=0; m_it!=m_end && i<fftData.size(); ++m_it, ++i){
        fftData[i] = (*m_it)[2];
    }

    my_cv::FFTR fft = my_cv::createFFTR<double>(&fftData[0], 512, 512, FFT_TYPE_2D);
    my_cv::FFTR product = my_cv::multFFT(fft, fft);
    my_cv::m_reverseFFTR(product, FFT_TYPE_2D);
    const double average = (double)my_cv::getFFTAverege(product);

    // Release the FFT buffers on every call, not only when the signature
    // buffer is full; otherwise each frame leaks two 512x512 matrices.
    my_cv::deleteFFTR(fft);
    my_cv::deleteFFTR(product);

    if(g_useSig){
        g_newSig[g_sigCount] = average;
    }else{
        g_sig[g_sigCount] = average;
    }
    g_sigCount++;

    if(g_sigCount>=SIG_BUFFER_SIZE&&g_useSig){
        double sigSum = 0;
        double newSigSum = 0;
        for(int i=0; i<SIG_BUFFER_SIZE; i++){
            sigSum += g_sig[i];
            newSigSum += g_newSig[i];
        }
        av1 = (sigSum/SIG_BUFFER_SIZE)/1000000.0;
        av2 = (newSigSum/SIG_BUFFER_SIZE)/1000000.0;
    }
}
// Processes one video frame in place.
//
// The frame is subtracted from the background image, thresholded and cleaned
// up with erode/dilate. If any white pixel lies inside
// g_c_largeEntranceROIRect a person is considered to be inside: up to
// SIG_BUFFER_SIZE signature values are collected through saveSig, a label is
// drawn on the frame and, when g_trace is set, the contours of the
// thresholded image are drawn as well. Otherwise the tracking state is reset.
//
// frame: BGR frame with the same size as g_c_bg; it is drawn on.
void proccesFrame(cv::Mat *frame){
    cv::Mat grayFrame, negativeFrame, bwFrame, entranceROI;
    negativeFrame = g_c_bg - *frame;
    cv::cvtColor(negativeFrame, grayFrame, CV_BGR2GRAY);
    cv::threshold(grayFrame, bwFrame, 30, 255, cv::THRESH_BINARY);
    const cv::Mat erodeKernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(7,7));
    const cv::Mat dilateKernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(10,10));
    cv::erode(bwFrame, bwFrame, erodeKernel);
    cv::dilate(bwFrame, bwFrame, dilateKernel);

    // A person is inside as soon as one white pixel is found in the entrance region.
    entranceROI = bwFrame(g_c_largeEntranceROIRect);
    g_personInside = false;
    const cv::MatIterator_<uchar> roiEnd = entranceROI.end<uchar>();
    for(cv::MatIterator_<uchar> m_it = entranceROI.begin<uchar>(); m_it!=roiEnd; ++m_it){
        if(*m_it==255){
            g_personInside = true;
            break;
        }
    }

    if(!g_personInside){
        g_trace = false;
        g_sigCount = 0;
        av1 = 0;
        av2 = 0;
        minDiff = 9999999999.999999;
        return;
    }

    if(g_sigCount<SIG_BUFFER_SIZE){
        // Record the first white pixel of the whole thresholded frame in the
        // buffer that matches the current mode.
        cv::Point *headCoords = (g_useSig ? g_inNewSigHeadCoords : g_inSigHeadCoords);
        cv::MatIterator_<uchar> bw_it = bwFrame.begin<uchar>();
        const cv::MatIterator_<uchar> bw_end = bwFrame.end<uchar>();
        for(int i=0; bw_it!=bw_end; ++bw_it, ++i){
            if(*bw_it==255){
                headCoords[g_sigCount] = cv::Point(i%bwFrame.cols, i/bwFrame.cols);
                break;
            }
        }
        saveSig(frame);
    }
    // g_sigCount is deliberately not incremented once the buffers are full:
    // counting on without bound eventually wraps the char counter to a
    // negative value, which would pass the check above and be used as an index.

    cv::putText(*frame, "Person inside", cv::Point(20,120), CV_FONT_HERSHEY_PLAIN, 3.0, cv::Scalar(0,255,0), 2);

    if(g_trace){
        std::vector<std::vector<cv::Point> > contours;
        std::vector<cv::Vec4i> hierarchy;
        findContours(bwFrame, contours, hierarchy, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
        for(std::size_t c=0; c<contours.size(); ++c){
            const std::vector<cv::Point> &contour = contours[c];
            // p+1 < size() also copes with an empty contour.
            for(std::size_t p=0; p+1<contour.size(); ++p){
                cv::line(*frame, contour[p], contour[p+1], cv::Scalar(0,255,0) , 3);
            }
        }
    }
}
int main(int argc, char* argv[]){
//init//////////////////////////////////////////////////////////////////////
cv::VideoCapture videoBuffer(g_c_videoFilePath);
if(!videoBuffer.isOpened()){
std::cerr << "Can't load video please check the paths\n";
return LOADING_VIDEO_ERROR;
}
if(!g_c_bg.data){
std::cerr << "Can't load background image please check the paths\n";
return LOADING_BACKGROUND_IMAGE_ERROR;
}
std::vector<cv::Mat> frameBuffer;
frameBuffer.resize(FRAME_BUFFER_SIZE);
const std::vector<cv::Mat>::iterator currFrame = frameBuffer.begin();
const cv::string mainWindow = "Object Tracker";
cv::namedWindow(mainWindow, CV_WINDOW_AUTOSIZE);
cv::setMouseCallback(mainWindow, onMouse, (void*)&(*currFrame));
//init end/////////////////////////////////////////////////////////////////////////////
//video loop///////////////////////////////////////////////////////////////////////////
for(char paused = 0;;){
paused = (cv::waitKey(20)==' ' ? 1 : 0);
while(paused){
cv::resize(*currFrame, *currFrame, cv::Size(900, 540));
cv::imshow(mainWindow, *currFrame);
paused = (cv::waitKey(20)==' ' ? 0 : 1);
}
cv::Mat frame;
videoBuffer.read(frame);
frame.copyTo(g_currFrameBackup);
frameBuffer.pop_back();
frameBuffer.insert(frameBuffer.begin(), frame);
std::stringstream ss;
ss << "Frame: " << videoBuffer.get(CV_CAP_PROP_POS_FRAMES);
cv::putText(*currFrame, ss.str().c_str(), cv::Point(20,70), CV_FONT_HERSHEY_PLAIN, 3.0, cv::Scalar(0,255,0), 2);
proccesFrame(&(*currFrame));
/*if(g_personInside){
cv::resize(*currFrame, *currFrame, cv::Size(900, 540));
while(cv::waitKey(40)!=' ')
cv::imshow(mainWindow, *currFrame);
}*/
cv::resize(*currFrame, *currFrame, cv::Size(900, 540));
cv::imshow(mainWindow, *currFrame);
}
//video loop end///////////////////////////////////////////////////////////////////////
return 0;
}
and the "myCVFunctions.h" file:
#pragma once
#include "opencv\cv.h"
#include "opencv\highgui.h"
#include "fftw3.h"
#define FFT_TYPE_1D 1
#define FFT_TYPE_2D 2
namespace my_cv{
// One complex value stored as separate real and imaginary parts.
struct myComplex{
double real;
double imag;
};
// A 2-D array of complex values produced by the FFT helpers in this header.
// The helpers allocate data as cols arrays of rows elements and index it as
// data[column][row]. The struct does not release the memory itself;
// deleteFFTR does.
struct FFTR{
myComplex** data;
int cols;
int rows;
};
// A 2-D array of doubles with its dimensions.
// NOTE(review): not used by any function in the visible part of this header;
// the indexing order of data is therefore not established here.
struct ENTROPR{
double** data;
int cols;
int rows;
};
void printFFTR(FFTR fft);
// Computes the forward FFT of an image read as 8-bit values.
//
// mGrey: input image, iterated element by element as uchar.
// type:  FFT_TYPE_1D treats the image as one sequence of rows*cols samples,
//        FFT_TYPE_2D performs a rows x cols 2-D transform.
// Returns a newly allocated FFTR (indexed data[column][row]) that must be
// released with deleteFFTR. If type is not one of the two constants, or FFTW
// cannot allocate its buffers or plan, the result is all zeros.
//
// Declared inline because the definition lives in a header.
inline FFTR createFFTR(cv::Mat mGrey, int type){
    FFTR result;
    result.rows = mGrey.rows;
    result.cols = mGrey.cols;
    const int count = result.rows * result.cols;
    result.data = new myComplex*[result.cols];
    for(int i = 0; i<result.cols; i++)
        result.data[i] = new myComplex[result.rows](); // zero-initialised
    if(count <= 0)
        return result;

    fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * count);
    fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * count);
    fftw_plan p = NULL;
    if(in && out){
        switch(type){
        case FFT_TYPE_1D:
            p = fftw_plan_dft_1d(count, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
            break;
        case FFT_TYPE_2D:
            p = fftw_plan_dft_2d(result.rows, result.cols, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
            break;
        default:
            break; // unknown type: no plan, result stays zero
        }
    }

    if(p){
        cv::MatIterator_<uchar> mGrey_it = mGrey.begin<uchar>();
        const cv::MatIterator_<uchar> mGrey_end = mGrey.end<uchar>();
        for(int i=0; mGrey_it != mGrey_end && i<count; ++mGrey_it, ++i){
            in[i][0] = *mGrey_it;
            in[i][1] = 0;
        }
        fftw_execute(p);
        // out is row-major (rows x cols); result is indexed [column][row].
        for(int i=0; i<count; i++){
            const int x = i%result.cols, y = i/result.cols;
            result.data[x][y].real = out[i][0];
            result.data[x][y].imag = out[i][1];
        }
        fftw_destroy_plan(p);
    }
    if(in) fftw_free(in);
    if(out) fftw_free(out);
    return result;
}
// Computes the forward FFT of a row-major array of cols*rows samples.
//
// mat:  input samples, read as mat[row*cols + column]; each is converted to double.
// cols, rows: dimensions of the input (note the order: columns first).
// type: FFT_TYPE_1D treats the input as one sequence of rows*cols samples,
//       FFT_TYPE_2D performs a rows x cols 2-D transform.
// Returns a newly allocated FFTR (indexed data[column][row]) that must be
// released with deleteFFTR. If type is not one of the two constants, or FFTW
// cannot allocate its buffers or plan, the result is all zeros.
template<class T> FFTR createFFTR(const T* const mat, int cols, int rows, int type){
    FFTR result;
    result.rows = rows;
    result.cols = cols;
    const int count = rows * cols;
    result.data = new myComplex*[result.cols];
    for(int i = 0; i<result.cols; i++)
        result.data[i] = new myComplex[result.rows](); // zero-initialised
    if(count <= 0)
        return result;

    fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * count);
    fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * count);
    fftw_plan p = NULL;
    if(in && out){
        switch(type){
        case FFT_TYPE_1D:
            p = fftw_plan_dft_1d(count, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
            break;
        case FFT_TYPE_2D:
            p = fftw_plan_dft_2d(result.rows, result.cols, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
            break;
        default:
            break; // unknown type: no plan, result stays zero
        }
    }

    if(p){
        for(int i=0; i<count; i++){
            in[i][0] = mat[i];
            in[i][1] = 0;
        }
        fftw_execute(p);
        // out is row-major (rows x cols); result is indexed [column][row].
        for(int i=0; i<count; i++){
            const int x = i%result.cols, y = i/result.cols;
            result.data[x][y].real = out[i][0];
            result.data[x][y].imag = out[i][1];
        }
        fftw_destroy_plan(p);
    }
    if(in) fftw_free(in);
    if(out) fftw_free(out);
    return result;
}
// Replaces the contents of fft with its inverse FFT, in place.
//
// fft:  spectrum to transform; the struct is passed by value but its data
//       pointer is shared, so the caller's values are overwritten.
// type: FFT_TYPE_1D or FFT_TYPE_2D, matching the forward transform.
// The result is divided by rows*cols, which undoes the scaling of FFTW's
// unnormalized forward/backward pair. If type is unknown or FFTW cannot
// allocate its buffers or plan, fft is left unchanged.
//
// Declared inline because the definition lives in a header.
inline void m_reverseFFTR(FFTR fft, int type){
    const int scaleFactor = fft.cols*fft.rows;
    if(!fft.data || scaleFactor <= 0)
        return;

    fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * scaleFactor);
    fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * scaleFactor);
    fftw_plan p = NULL;
    if(in && out){
        switch(type){
        case FFT_TYPE_1D:
            p = fftw_plan_dft_1d(scaleFactor, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
            break;
        case FFT_TYPE_2D:
            p = fftw_plan_dft_2d(fft.rows, fft.cols, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
            break;
        default:
            break; // unknown type: nothing to do
        }
    }

    if(p){
        // Pack data[column][row] into FFTW's row-major layout.
        for(int j=0; j<fft.rows; j++){
            for(int i=0; i<fft.cols; i++){
                const int index = j*fft.cols+i;
                in[index][0] = fft.data[i][j].real;
                in[index][1] = fft.data[i][j].imag;
            }
        }
        fftw_execute(p);
        for(int i=0; i<scaleFactor; i++){
            const int x = i%fft.cols, y = i/fft.cols;
            fft.data[x][y].real = out[i][0]/scaleFactor;
            fft.data[x][y].imag = out[i][1]/scaleFactor;
        }
        fftw_destroy_plan(p);
    }
    if(in) fftw_free(in);
    if(out) fftw_free(out);
}
// Element-wise complex product of two spectra.
//
// fft1, fft2: inputs; fft2 must be at least as large as fft1 in both
//             dimensions, because the result takes its size from fft1.
// Returns a newly allocated FFTR that must be released with deleteFFTR.
// If either input has no data, the result is all zeros.
//
// Declared inline because the definition lives in a header.
inline FFTR multFFT(const FFTR fft1, const FFTR fft2){
    FFTR result;
    result.cols = fft1.cols;
    result.rows = fft1.rows;
    result.data = new myComplex*[result.cols];
    for(int i=0; i<result.cols; i++)
        result.data[i] = new myComplex[result.rows](); // zero-initialised
    if(!fft1.data || !fft2.data)
        return result;

    for(int i=0; i<result.cols; i++){
        const myComplex *lhs = fft1.data[i];
        const myComplex *rhs = fft2.data[i];
        myComplex *dst = result.data[i];
        for(int j=0; j<result.rows; j++){
            // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
            dst[j].real = (lhs[j].real*rhs[j].real)-(lhs[j].imag*rhs[j].imag);
            dst[j].imag = (lhs[j].real*rhs[j].imag)+(lhs[j].imag*rhs[j].real);
        }
    }
    return result;
}
// Returns the average of the real parts of all elements of fft.
//
// Each column is averaged over its rows, and the column averages are then
// averaged over the number of columns. (The previous version divided by the
// number of rows twice, which is only correct for square inputs.)
// Returns 0 for an empty input.
//
// Declared inline because the definition lives in a header.
inline long double getFFTAverege(FFTR fft){
    if(!fft.data || fft.cols <= 0 || fft.rows <= 0)
        return 0;
    long double result = 0;
    for(int i=0; i<fft.cols; i++){
        long double sum=0;
        for(int j=0; j<fft.rows; j++){
            sum += fft.data[i][j].real;
        }
        result += sum/fft.rows;
    }
    return result/fft.cols;
}
// Releases the memory owned by an FFTR created by the helpers in this header.
// Does nothing when fftr.data is null. The caller's copy of the struct is not
// modified, so it must not be used or deleted again afterwards.
//
// Declared inline because the definition lives in a header.
inline void deleteFFTR(FFTR fftr){
    // Check the outer pointer before touching any column.
    if(!fftr.data)
        return;
    for(int i=0; i<fftr.cols; i++)
        delete [] fftr.data[i]; // delete[] on a null pointer is a no-op
    delete [] fftr.data;
}
// Prints every element of fft to stdout, one per line, row by row, in the
// form <real><sign>i<|imag|>.
//
// Declared inline because the definition lives in a header.
inline void printFFTR(FFTR fft){
    for(int j=0; j<fft.rows; j++){
        for(int i=0; i<fft.cols; i++){
            const double imag = fft.data[i][j].imag;
            // Magnitude computed by hand: an unqualified abs() may resolve to
            // the int overload and truncate the value.
            const double imagMagnitude = (imag<0 ? -imag : imag);
            printf("%f%si%f\n", fft.data[i][j].real, (imag<0 ? "-" : "+"), imagMagnitude);
        }
    }
}
// Returns a matrix of size newSize and the same type as src, holding the
// top-left part of src that fits and zeros everywhere else. No scaling is
// performed: a larger src is cropped, a smaller one is zero-padded.
//
// Declared inline because the definition lives in a header.
inline cv::Mat resize_zeros(const cv::Mat &src, cv::Size newSize){
    cv::Mat result;
    result.create(newSize, src.type());
    result = 0; // fill with zeros

    // Size of the region present in both src and result.
    const int copyCols = (src.cols>result.cols ? result.cols : src.cols);
    const int copyRows = (src.rows>result.rows ? result.rows : src.rows);
    const cv::Rect overlap(0, 0, copyCols, copyRows);

    cv::Mat srcROI = src(overlap);
    cv::Mat resultROI = result(overlap);
    srcROI.copyTo(resultROI);
    return result;
}
// Otsu's threshold.
//
// mGrey: image whose elements are read as 8-bit values, row by row.
// T:     floating-point type used for the computation and the result.
// Returns the threshold as idx / (num_bins - 1), where idx is the average
// position of the bins that maximise the between-class variance, or 0 when
// the image is empty or the variance is undefined.
//
// NOTE(review): num_bins is 257 although 8-bit data only fills 256 bins; the
// last bin is always empty and the result is scaled by 1/256 rather than
// 1/255. Kept as is so existing results do not change - confirm intent.
template<class T> T getThreshold(cv::Mat mGrey){
    const int columns = mGrey.cols;
    const int rows = mGrey.rows;
    const T SIGMA = 0.000001;
    const int num_bins = 257;
    int counts[num_bins] = {0};
    T p[num_bins] = {0};
    T mu[num_bins] = {0};
    T omega[num_bins] = {0};
    T sigma_b_squared[num_bins] = {0};

    // calculate histogram
    // Rows are addressed one by one so that matrices that are not stored
    // contiguously (for example regions of interest) are read correctly.
    for(int r = 0; r < rows; r++){
        const uchar* line = mGrey.ptr<uchar>(r);
        for(int c = 0; c < columns; c++)
            counts[line[c]]++;
    }
    int sumC = 0;
    for(int i = 0; i < num_bins; i++)
        sumC += counts[i];
    if(sumC == 0)
        return 0; // empty image: avoid dividing by zero below

    for(int i = 0; i < num_bins; i++)
        p[i] = ((T)counts[i])/sumC;
    mu[0] = omega[0] = p[0];
    for(int i = 1; i < num_bins; i++){
        omega[i] = omega[i-1] + p[i];
        mu[i] = mu[i-1] + p[i]*(i+1);
    }
    const T mu_t = mu[num_bins-1];
    T maxval = -1.0;
    for(int i = 0; i < num_bins; i++){
        const T v = mu_t * omega[i] - mu[i];
        // Magnitude computed by hand: an unqualified abs() may resolve to the
        // int overload and turn every value below 1 into 0.
        const T remainder = 1.0 - omega[i];
        const T remainderMagnitude = (remainder < 0 ? -remainder : remainder);
        if (omega[i] > SIGMA && remainderMagnitude > SIGMA){
            sigma_b_squared[i] = v*v/(omega[i]* (1.0 - omega[i]));
            maxval = std::max(maxval,sigma_b_squared[i]);
        }
    }
    // Find the location of the maximum value of sigma_b_squared.
    // The maximum may extend over several bins, so average together the
    // locations.
    // If maxval == -1, sigma_b_squared is not defined, then return 0.
    T level = 0;
    if (maxval > 0){
        T idx = 0;
        int maxNumbers = 0;
        for(int i = 0; i < num_bins; i++){
            if (sigma_b_squared[i] == maxval){
                idx += i;
                maxNumbers++;
            }
        }
        if (maxNumbers > 0){
            idx /= maxNumbers;
            // Normalize the threshold to the range [0, 1].
            level = idx / (num_bins - 1);
        }
    }
    return level;
}
}
double fftData[512*512];
That's (probably) 2MB of data, which is (probably) too big to fit on the stack. The simplest fix is to use a dynamic array instead:
std::vector<double> fftData(512*512);
Alternatively, if dynamic allocation is too expensive, you could use a static or global array. This is usually a bad idea, since it makes the function non-reentrant and awkward to use in a multi-threaded program; however, you already have so many globals that one more probably won't hurt.
I implemented the Jacobi algorithm using TBB and it works just fine. Then I parallelized the convergence calculation using a reduction, but for some reason, if I use more than 1 logical core, I get a segmentation fault and I can't figure out why.
I can use more than 1 thread on a system that has only 1 logical core.
The same implementation using OpenMP works without a hassle
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/tick_count.h>
// ----------------------------------------------------------------
// Dimension of the linear system (matrix is SIZE x SIZE).
#define SIZE 1024
// Convergence threshold compared against the residual norm. Parenthesised so
// that the macro expands as a single value inside larger expressions.
#define RESIDUO (0.0009f*SIZE)
// Number of threads requested from the TBB task scheduler.
#define THREADS 2
using namespace tbb;
// ----------------------------------------------------------------
struct Sum {
float ret;
float (*a)[SIZE];
float (*x);
float (*b);
Sum(float A[SIZE][SIZE], float X[SIZE], float B[SIZE]) : ret(0), a(A), x(X), b(B) {}
Sum( Sum&, split ) {ret = 0;}
void operator()( const blocked_range<int>& r ) {
float temp = ret;
for( int i = r.begin(); i != r.end(); i++ ) {
float sum = 0.0f;
for(int j = 0; j < SIZE; j++){
sum += a[i][j] * x[j];
}
temp += pow(b[i] - sum, 2);
}
ret = temp;
}
void join( Sum& rhs ) {ret += rhs.ret;}
};
/*
// Convergence test on || b - Ax ||.
// Reduces the squared residual in parallel with a Sum body, prints it
// together with its square root, and returns 1 when the norm is at most
// RESIDUO, 0 otherwise.
*/
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){
    Sum residual(a, x, b);
    const blocked_range<int> allRows(0, SIZE);
    parallel_reduce( allRows, residual );

    const float norm = sqrt(residual.ret);
    printf("Ret: %f | Residuo: %f\n", residual.ret, norm);

    if(norm <= RESIDUO){
        return 1;
    }
    return 0;
}
// ----------------------------------------------------------------
// Returns the next value of rand() scaled by RAND_MAX, i.e. a float in [0, 1].
float randomFloat()
{
    const float numerator = (float)rand();
    const float denominator = (float)RAND_MAX;
    return numerator / denominator;
}
// ----------------------------------------------------------------
// Checks that a is diagonally dominant: in every row the magnitude of the
// diagonal entry must be at least the sum of the magnitudes of the other
// entries. Magnitudes are used so that negative entries are handled
// correctly.
// Returns 1 when every row passes. On the first failing row the row is
// printed (diagonal entry in parentheses) and 0 is returned.
int check_ddm(float (*a)[SIZE]){
    for(int i = 0; i < SIZE; i++){
        float sum = 0.0f;
        for(int j = 0; j < SIZE; j++){
            if(i != j){
                sum += fabsf(a[i][j]);
            }
        }
        if(fabsf(a[i][i]) < sum){
            printf("line: %d, sum: %f, a[i][i]: %f \n", i, sum, a[i][i]);
            for(int j = 0; j < SIZE; j++){
                if(i != j) printf("%f ", a[i][j]);
                else printf("(%f) ", a[i][j]);
            }
            printf("\n");
            return 0;
        }
    }
    return 1;
}
// ----------------------------------------------------------------
// Fills a with a random diagonally dominant matrix and b with a matching
// right-hand side.
// Off-diagonal entries come from randomFloat(), every diagonal entry is
// SIZE, and b[i] is the sum of row i (off-diagonal sum plus SIZE).
// Returns the result of check_ddm(a).
int generate_ddm(float (*a)[SIZE], float *b)
{
    for(int i = 0; i < SIZE; i++){
        float line = 0.0f;
        for(int j = 0; j < SIZE; j++){
            // The diagonal entry is skipped here: it has not been written
            // yet, so adding it would read uninitialised memory.
            if(i != j){
                a[i][j] = randomFloat();
                line += a[i][j];
            }
        }
        a[i][i] = SIZE;
        b[i] = line + SIZE;
    }
    return check_ddm(a);
}
// ----------------------------------------------------------------
// Solves a randomly generated diagonally dominant system with Jacobi
// iteration, parallelised with TBB, and prints the elapsed time together
// with the first and last components of the solution.
// Returns 0 on success, 1 when memory cannot be allocated or the generated
// matrix fails the dominance check.
int main( )
{
    // x holds two solution vectors: one is read while the other is written.
    float (*x)[SIZE] = (float(*)[SIZE])malloc(sizeof *x * 2);
    float (*a)[SIZE] = (float(*)[SIZE])malloc(sizeof *a * SIZE);
    float (*b) = (float*)malloc(sizeof(float) * SIZE);
    if(!x || !a || !b){
        fprintf(stderr, "Out of memory\n");
        free(a);
        free(b);
        free(x);
        return 1;
    }

    int read = 0;
    int write = 1;

    srand(time(NULL));
    tbb::task_scheduler_init init(THREADS);

    // set up initial solution
    for(int i = 0; i < SIZE; i++){
        x[0][i] = i;
        x[1][i] = i;
    }

    // generate a diagonal dominant matrix
    if(!generate_ddm(a, b)){
        printf("Array generated is not ddm!\n");
        free(a);
        free(b);
        free(x);
        return 1;
    }

    tick_count startTime = tick_count::now();
    while(!converge(a, x[write], b)){
        // Swap the roles of the two solution vectors.
        read = !read;
        write = !write;

        parallel_for(blocked_range<int>(0,SIZE),
            [&] (const blocked_range<int>& r) {
                for (int i = r.begin(); i < r.end(); i++) {
                    float delta = 0.0f;
                    for(int j = 0; j < SIZE; j++){
                        if(j != i){
                            delta += a[i][j] * x[read][j];
                        }
                    }
                    x[write][i] = (b[i] - delta) / a[i][i];
                }
            });
    }
    tick_count lastTime = tick_count::now();
    float walltime = (lastTime - startTime).seconds();

    printf("tbb %f\n", walltime);
    converge(a, x[write], b);
    printf("x0: %f | x%d: %f\n", x[write][0], SIZE-1, x[write][SIZE-1]);

    free(a);
    free(b);
    free(x);
    return 0;
}
The segfault occurs on the following line inside the Sum class:
sum += a[i][j] * x[j];
And if I change that line to
float tmpa = a[i][j];
float tmpx = x[j];
sum += tmpa * tmpx;
The error continues to be on
sum += tmpa * tmpx;
In the original version, the "splitting constructor" left a, x, and b undefined. They need to be copied from the incoming Sum& argument. E.g., change the splitting constructor to:
Sum( Sum& s, split ) {a=s.a; b=s.b; x=s.x; ret = 0;}
Changing the class to a lambda expression solved the problem. It may be a bug in TBB's parallel_reduce.
// Convergence test on || b - Ax ||, written with the lambda form of
// parallel_reduce. Prints the squared residual and its square root, and
// returns non-zero when the norm is at most RESIDUO.
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){
    // Adds the squared residual of the rows in `rows` to the running value.
    auto accumulate = [&]( const blocked_range<int>& rows, float partial )->float {
        for(int row = rows.begin(); row != rows.end(); row++ ) {
            float dot = 0.0f;
            for(int col = 0; col < SIZE; col++){
                dot += a[row][col] * x[col];
            }
            partial += pow(b[row] - dot, 2);
        }
        return partial;
    };
    // Combines two partial results.
    auto combine = []( float lhs, float rhs )->float {
        return lhs + rhs;
    };

    const float val = parallel_reduce(
        blocked_range<int>(0, SIZE),
        0.0f,
        accumulate,
        combine
    );
    const float norm = sqrt(val);
    printf("Ret: %f | Residuo: %f\n", val, norm);
    return (norm <= RESIDUO);
}
I'm using this fftw library.
Currently I'm trying to plot a 2D Gaussian in the form e^(-(x^2+y^2)/a^2).
Here is the code:
using namespace std;
// Computes and prints the 2-D FFT of the Gaussian e^(-(x^2+y^2)/a^2)
// sampled on a w x h grid.
int main(int argc, char** argv ){
    fftw_complex *in, *out;
    fftw_plan p;
    int i,j;
    int w=16;
    int h=16;
    double a = 2;
    in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*w*h);
    out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*w*h);
    for(i=0;i<w;i++){
        // The DFT treats the input as periodic with its origin at index 0,
        // so indices in the upper half stand for negative coordinates.
        // Wrapping them samples the whole Gaussian instead of one quarter.
        double x = (2*i < w) ? i : (i - w);
        for(j=0;j<h;j++){
            double y = (2*j < h) ? j : (j - h);
            in[i*h+j][0] = exp(- (x*x+y*y)/(a*a));
            in[i*h+j][1] = 0;
        }
    }
    p = fftw_plan_dft_2d(w, h, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    fftw_execute(p);
    //This is something that print what's in the matrix
    print_2d(out,w,h);
    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}
It turns out that negative numbers show up in the output. I thought the Fourier transform of a Gaussian is another Gaussian, which shouldn't include any negative numbers.
Also, the current origin is at in[0]
EDIT: the previous answer is wrong, shifting the center of the Gaussian won't help as it introduces another phase shift. The right solution is to wrap high indices to negative ones:
double x = (i < w*0.5) ? i : (i - w);
double y = (j < h*0.5) ? j : (j - h);
in[i*h+j][0] = exp(-(x*x+y*y)/(a*a));
This allows the input to cover the entire Gaussian instead of a quarter of it. The entire code is attached below.
#include <stdio.h>
#include <math.h>
#include <fftw3.h>
int main(int argc, char** argv)
{
fftw_complex *in, *out;
fftw_plan p;
int i, j, w = 16, h = 16;
double a = 2, x, y;
in = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * w * h);
out = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * w * h);
for (i = 0; i < w; i++) {
x = (i < w*0.5) ? i : (i - w);
for (j = 0; j < h; j++) {
y = (j < h*0.5) ? j : (j - h);
in[i*h+j][0] = exp(-1.*(x*x+y*y)/(a*a));
in[i*h+j][1] = 0;
}
}
p = fftw_plan_dft_2d(w, h, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (i = 0; i < w; i++) {
for (j = 0; j < h; j++) {
printf("%4d %4d %+9.4f %+9.4f\n", i, j, out[i*h+j][0], out[i*h+j][1]);
}
}
fftw_destroy_plan(p);
fftw_cleanup();
fftw_free(in);
fftw_free(out);
return 0;
}