Combining integers and floating point numbers: performance considerations - c++

I have a complex set of template functions which do calculations in a loop, combining floating point numbers and the uint32_t loop indices. I was surprised to observe that for this kind of functions, my test code runs faster with double precision floating point numbers than with single precision ones.
As a test, I changed the format of my indices to uint16_t. After this, both the double and float version of the program were faster (as expected), but now the float version was significantly faster than the double version. I also tested the program with uint64_t indices. In this case the double and the float version are equally slow.
I imagine that this is because an uint32_t fits into the mantissa of a double but not into a float. Once the indices type was reduced to uint16_t, they also fit into the mantissa of a float and a conversion should be trivial. In case of uint64_t, the conversion to double also needs rounding, which would explain why both versions perform equally.
Can anybody confirm this explanation?
EDIT: Using int or long as index type, the program runs as fast as for unit16_t. I guess this speaks against what I suspected first.
EDIT: I compiled the program for windows on an x86 architecture.
EDIT: Here is a piece of code that reproduces the effect of double being faster as float for uint32_t and both cases being equally fast for int. Please do not comment on the usefulness of this code. It is a modified fragment of code reproducing the effect which does nothing sensible.
The main file:
#include "stdafx.h"
typedef short spectraType;
typedef int intermediateValue;
typedef double returnType;
#include "Preprocess_t.h"
#include "Windows.h"
#include <iostream>
int main()
{
const size_t numberOfBins = 10000;
const size_t numberOfSpectra = 500;
const size_t peakWidth = 25;
bool startPeak = false;
short peakHeight;
Preprocess<short, returnType> myPreprocessor;
std::vector<returnType> processedSpectrum;
std::vector<std::vector<short>> spectra(numberOfSpectra, std::vector<short>(numberOfBins));
std::vector<float> peakShape(peakWidth);
LARGE_INTEGER freq, start, stop;
double time_ms;
QueryPerformanceFrequency(&freq);
for (size_t i = 0; i < peakWidth; ++i)
{
peakShape[i] = static_cast<float>(exp(-(i - peakWidth / 2.0) *(i - peakWidth / 2.0) / 10.0));
}
for (size_t i = 0; i < numberOfSpectra; ++i)
{
size_t j = 0;
for (; j < 200; ++j)
{
spectra[i][j] = rand() % 100;
}
for (size_t k = 0; k < 25; ++k)
{
spectra[i][j] = static_cast<short>(16383 * peakShape[k]);
j++;
}
for (; j < numberOfBins; ++j)
{
startPeak = !static_cast<bool>(abs(rand()) % (numberOfBins / 4));
if (startPeak)
{
peakHeight = rand() % 16384;
for (size_t k = 0; k < 25 && j< numberOfBins; ++k)
{
spectra[i][j] = peakHeight * peakShape[k] + rand() % 100;
j++;
}
}
else
{
spectra[i][j] = rand() % 100;
}
}
for (j = 0; j < numberOfBins; ++j)
{
double temp = 1000.0*exp(-(static_cast<float>(j) / (numberOfBins / 3.0)))*sin(static_cast<float>(j) / (numberOfBins / 10.0));
spectra[i][j] -= static_cast<short>(1000.0*exp(-(static_cast<float>(j) / (numberOfBins / 3.0)))*sin(static_cast<float>(j) / (numberOfBins / 10.0)));
}
}
// This is where the critical code is called
QueryPerformanceCounter(&start);
for (int i = 0; i < numberOfSpectra; ++i)
{
myPreprocessor.SetSpectrum(&spectra[i], 1000, &processedSpectrum);
myPreprocessor.CorrectBaseline(30, 2.0);
}
QueryPerformanceCounter(&stop);
time_ms = static_cast<double>(stop.QuadPart - start.QuadPart) / static_cast<double>(freq.QuadPart);
std::cout << "time spend preprocessing: " << time_ms << std::endl;
std::cin.ignore();
return 0;
}
And the included header Preprocess_t.h:
#pragma once
#include <vector>
//typedef unsigned int indexType;
typedef unsigned short indexType;
template<typename T, typename Out_Type>
class Preprocess
{
public:
Preprocess() :threshold(1), sdev(1), laserPeakThreshold(500), a(0), b(0), firstPointUsedAfterLaserPeak(0) {};
~Preprocess() {};
void SetSpectrum(std::vector<T>* input, T laserPeakThreshold, std::vector<Out_Type>* processedSpectrum); ///#note We need the laserPeakThresholdParameter for the baseline correction, not onla for the shift.
void CorrectBaseline(indexType numberOfPoints, Out_Type thresholdFactor);
private:
void LinFitValues(indexType beginPoint);
Out_Type SumOfSquareDiffs(Out_Type x, indexType n);
Out_Type LinResidualSumOfSquareDist(indexType beginPoint);
std::vector<T>* input;
std::vector<Out_Type>* processedSpectrum;
std::vector<indexType> fitWave_X;
std::vector<Out_Type> fitWave;
Out_Type threshold;
Out_Type sdev;
T laserPeakThreshold;
Out_Type a, b;
indexType firstPointUsedAfterLaserPeak;
indexType numberOfPoints;
};
template<typename T, typename Out_Type>
void Preprocess<T, Out_Type>::CorrectBaseline(indexType numberOfPoints, Out_Type thresholdFactor)
{
this->numberOfPoints = numberOfPoints;
indexType numberOfBins = input->size();
indexType firstPointUsedAfterLaserPeak = 0;
indexType positionInFitWave = 0;
positionInFitWave = firstPointUsedAfterLaserPeak;
for (indexType i = firstPointUsedAfterLaserPeak; i < numberOfBins - numberOfPoints; i++) {
LinFitValues(positionInFitWave);
processedSpectrum->at(i + numberOfPoints) = input->at(i + numberOfPoints) - static_cast<Out_Type>(a + b*(i + numberOfPoints));
positionInFitWave++;
fitWave[positionInFitWave + numberOfPoints - 1] = input->at(i + numberOfPoints - 1);
fitWave_X[positionInFitWave + numberOfPoints - 1] = i + numberOfPoints - 1;
}
}
template<typename T, typename Out_Type>
void Preprocess<T, Out_Type>::LinFitValues(indexType beginPoint)
{
Out_Type y_mean, x_mean, SSxy, SSxx, normFactor;
y_mean = x_mean = SSxy = SSxx = normFactor = static_cast<Out_Type>(0);
indexType endPoint = beginPoint + numberOfPoints;
Out_Type temp;
if ((fitWave_X[endPoint - 1] - fitWave_X[beginPoint]) == numberOfPoints)
{
x_mean = (fitWave_X[endPoint - 1] - fitWave_X[beginPoint]) / static_cast<Out_Type>(2);
for (indexType i = beginPoint; i < endPoint; i++) {
y_mean += fitWave[i];
}
y_mean /= numberOfPoints;
SSxx = SumOfSquareDiffs(x_mean, fitWave_X[endPoint - 1]) - SumOfSquareDiffs(x_mean, fitWave_X[beginPoint]);
for (indexType i = beginPoint; i < endPoint; i++)
{
SSxy += (fitWave_X[i] - x_mean)*(fitWave[i] - y_mean);
}
}
else
{
for (indexType i = beginPoint; i < endPoint; i++) {
y_mean += fitWave[i];
x_mean += fitWave_X[i];
}
y_mean /= numberOfPoints;
x_mean /= numberOfPoints;
for (indexType i = beginPoint; i < endPoint; i++)
{
temp = (fitWave_X[i] - x_mean);
SSxy += temp*(fitWave[i] - y_mean);
SSxx += temp*temp;
}
}
b = SSxy / SSxx;
a = y_mean - b*x_mean;
}
template<typename T, typename Out_Type>
inline Out_Type Preprocess<T, Out_Type>::SumOfSquareDiffs(Out_Type x, indexType n)
{
return n*x*x + n*(n - 1)*x + ((n - 1)*n*(2 * n - 1)) / static_cast<Out_Type>(6);
}
template<typename T, typename Out_Type>
Out_Type Preprocess<T, Out_Type>::LinResidualSumOfSquareDist(indexType beginPoint)
{
Out_Type sumOfSquares = 0;
Out_Type temp;
for (indexType i = 0; i < numberOfPoints; ++i) {
temp = fitWave[i + beginPoint] - (a + b*fitWave_X[i + beginPoint]);
sumOfSquares += temp*temp;
}
return sumOfSquares;
}
template<typename T, typename Out_Type>
inline void Preprocess<T, Out_Type>::SetSpectrum(std::vector<T>* input, T laserPeakThreshold, std::vector<Out_Type>* processedSpectrum)
{
this->input = input;
fitWave_X.resize(input->size());
fitWave.resize(input->size());
this->laserPeakThreshold = laserPeakThreshold;
this->processedSpectrum = processedSpectrum;
processedSpectrum->resize(input->size());
}

You are using MSVC? I had a similar effect when I implemented code that essentially was a matrix-multiplication plus a vector addition. Here, I thought that floats would be faster because they can be better SIMD-parallelized as more can be packed in the SSE registers. However, using doubles was much faster.
After some investigation, I figured out from the assembler code that the float's need conversion from the internal FPU precision and this rounding was consuming most of the runtime. You can change the FP model to something that is faster with the cost of reduced precision. There is also some discussion in older threads here at SO.

Related

BigInt multiplication and to_string implementation outputs too many zeros

I created the following for multiplying two big integers stored with base 1,000,000,000 as a vector<int32_t>:
#include <iostream>
#include <vector>
#include <cmath>
#include <limits>
#include <algorithm>
template<typename T>
constexpr T power_of_10(T n)
{
return n < 0 ? 0 : n == 0 ? 1 : (n == 1 ? 10 : 10 * power_of_10(n - 1));
}
template<typename T>
constexpr T base_value = power_of_10<T>(std::numeric_limits<T>::digits10);
template<typename T>
constexpr T max_value = base_value<T> - 1;
class BigInt {
private:
static constexpr const std::uint32_t base = base_value<std::uint32_t>;
static constexpr const std::uint32_t max_digits = std::numeric_limits<std::uint32_t>::digits10;
std::vector<std::uint64_t> digits;
public:
BigInt(const char* value) : BigInt(std::string(value))
{
}
BigInt(const std::string& value)
{
constexpr const int stride = std::numeric_limits<std::uint32_t>::digits10;
const std::size_t size = value.size() / stride;
for (std::size_t i = 0; i < size; ++i)
{
auto it = value.begin();
auto jt = value.begin();
std::advance(it, i * stride);
std::advance(jt, (i * stride) + stride);
digits.push_back(std::stoull(std::string(it, jt)));
}
if (value.size() % stride)
{
auto remainder = std::string(value.begin() + size * stride, value.end());
digits.push_back(std::stoull(remainder));
}
std::reverse(digits.begin(), digits.end());
}
BigInt& multiply(const BigInt& other)
{
std::vector<std::uint64_t> product = std::vector<std::uint64_t>(digits.size() + other.digits.size(), 0);
for (std::size_t i = 0; i < other.digits.size(); ++i)
{
std::uint64_t carry = 0, total = 0;
for (std::size_t j = 0; j < digits.size(); ++j)
{
total = product.at(i + j) + (other.digits[i] * digits[j]) + carry;
carry = total / base;
total %= base;
product.at(i + j) = total;
}
if (carry)
{
product[i + digits.size()] = carry;
}
}
digits = product;
return *this;
}
std::string to_string() {
std::string result = std::to_string(digits[digits.size() - 1]);
//
// for (std::int64_t i = digits.size() - 2; i >= 0; --i)
// {
// std::string group = std::to_string(digits[i]);
// while (group.size() < max_digits) {
// group = '0' + group;
// }
// result += group;
// }
for (std::int64_t i = digits.size() - 2; i >= 0; --i)
{
std::uint64_t value = digits[i];
std::uint32_t divisor = base;
while(divisor)
{
if (divisor != base)
{
result += (value / divisor) + '0';
}
value %= divisor;
divisor /= 10;
}
}
return result;
}
};
int main(int argc, const char * argv[])
{
BigInt a = "5000000000";
BigInt b = "5000000000";
std::cout<<a.multiply(b).to_string()<<"\n";
std::cout<<"25000000000000000000"<<"\n";
return 0;
}
When I print the result of the multiplication, I am getting 5,000,000,000 * 5,000,000,000 = 250,000,000,000,000,000,000,000,000,000,000,000 which has way too many zeroes!
It should have 18 zeroes, but mine has 34.
I believe my multiplication algorithm is correct and my to_string is incorrect because 500 * 500 prints correctly as 25,000.
Any ideas what is wrong?
The problem comes from this line:
product[digits.size() + 1] = static_cast<T>(carry);
The index digits.size() + 1 is incorrect. It should be digits.size() + j.

Is it possible to use CUDA parallelizing this nested for loop?

I want to speed up this nested for loop, just start learn CUDA, how could I use CUDA to parallel this c++ code ?
#define PI 3.14159265
using namespace std;
int main()
{
int nbint = 2;
int hits = 20;
int nbinp = 2;
float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
float x[20],y[20],z[20],a[20],t[20];
for (int i = 0; i < hits; ++i)
{
x[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
y[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
z[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
a[i] = rand() / (float)(RAND_MAX / 100);
}
float maxforall = 1e-6;
float theta0;
float phi0;
for (int i = 0; i < nbint; i++)
{
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
_phi = (0.5 + j)*delta / _theta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
{
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
}
qsort(t, 0, hits - 1);
float max = t[0];
for (int k = 0; k < hits; k++)
{
if (max < t[k])
max = t[k];
}
if (max > maxforall)
{
maxforall = max;
}
}
}
return 0;
}
I want to put innermost for loop and the sort part(maybe the whole nested loop) into parallel. After sort those array I found the maximum of all arrays. I use maximum to simplify the code. The reason I need sort is that maximum represent
here is a continuous time information(all arrays contain time information). The sort part make those time from lowest to highest. Then I compare the a specific time interval(not a single value). The compare process almost like I choose maximum but with a continuous interval not a single value.
Your 3 nested loops calculate nbint*nbinp*hits values. Since each of those values is independent from each other, all values can be calculated in parallel.
You stated in your comments that you have a commutative and associative "filter condition" which reduces the output to a single scalar value. This can be exploited to avoid sorting and storing the temporary values. Instead, we can calculate the values on-the-fly and then apply a parallel reduction to determine the end result.
This can be done in "raw" CUDA, below I implemented this idea using thrust. The main idea is to run grid_op nbint*nbinp*hits times in parallel. In order to find out the three original "loop indices" from the single scalar index which is passed to grid_op the algorithm from this SO question is used.
thrust::transform_reduce performs the on-the-fly transformation and the subsequent parallel reduction (here thrust::maximum is used as a substitute).
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>
thrust::host_vector<float> random_vector(const size_t N)
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for(size_t i = 0; i < N; i++) {
temp[i] = u01(rng);
}
return temp;
}
// ### END utility for demo ####
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
return thrust::make_zip_iterator(thrust::make_tuple(its...));
}
template <typename ZipIterator>
class grid_op
{
public:
grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}
__host__ __device__
float operator()(std::size_t index) const
{
const auto coords = unflatten_3d_index(index, dim1, dim2);
const auto values = zipIt[thrust::get<2>(coords)];
const float delta = 5;
const float _theta = (0.5f + thrust::get<0>(coords))*delta;
const float _phi = (0.5f + thrust::get<1>(coords))*delta / _theta;
const float _l = sin(_theta* M_PI / 180.0)*cos(_phi* M_PI / 180.0);
const float _m = sin(_theta* M_PI / 180.0)*sin(_phi* M_PI / 180.0);
const float _n = cos(_theta* M_PI / 180.0);
const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values));
return (thrust::get<3>(values) - _k);
}
private:
__host__ __device__
thrust::tuple<std::size_t, std::size_t, std::size_t>
unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
{
// taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index
std::size_t x = index % dim1;
std::size_t y = ( ( index - x ) / dim1 ) % dim2;
std::size_t z = ( ( index - y * dim1 - x ) / (dim1 * dim2) );
return thrust::make_tuple(x,y,z);
}
ZipIterator zipIt;
std::size_t dim1;
std::size_t dim2;
};
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
{
return grid_op<ZipIterator>(zipIt, dim1, dim2);
}
int main()
{
const int nbint = 3;
const int nbinp = 4;
const int hits = 20;
const std::size_t N = nbint * nbinp * hits;
thrust::device_vector<float> d_x = random_vector(hits);
thrust::device_vector<float> d_y = random_vector(hits);
thrust::device_vector<float> d_z = random_vector(hits);
thrust::device_vector<float> d_a = random_vector(hits);
auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
auto countingIt = thrust::counting_iterator<std::size_t>(0);
auto unary_op = make_grid_op(zipIt, nbint, nbinp);
auto binary_op = thrust::maximum<float>();
const float init = 0;
float max = thrust::transform_reduce(
countingIt, countingIt+N,
unary_op,
init,
binary_op
);
std::cout << "max = " << max << std::endl;
}

Optimization of C++ code - std::vector operations

I have this funcition (RotateSlownessTop) and it's called about 800 times computing the corresponding values. But the calculation is slow and is there a way I can make the computations faster.
The number of element in X/Y is 7202. (Fairly large set)
I did the performance analysis and the screenshot has been attached.
void RotateSlownessTop(vector <double> &XR1, vector <double> &YR1, float theta = 0.0)
{
Matrix2d a;
a(0,0) = cos(theta);
a(0,1) = -sin(theta);
a(1, 0) = sin(theta);
a(1, 1) = cos(theta);
vector <double> XR2(7202), YR2(7202);
for (size_t i = 0; i < X.size(); ++i)
{
XR2[i] = (a(0, 0)*X[i] + a(0, 1)*Y[i]);
YR2[i] = (a(1, 0)*X[i] + a(1, 1)*Y[i]);
}
size_t i = 0;
size_t j = 0;
while (i < YR2.size())
{
if (i > 0)
if ((XR2[i]>0) && (XR2[i-1]<0))
j = i;
if (YR2[i] > (-1e-10) && YR2[i]<0.0)
YR2[i] = 0.0;
if (YR2[i] < (1e-10) && YR2[i]>0.0)
YR2[i] = -YR2[i];
if ( YR2[i]<0.0)
{
YR2.erase(YR2.begin() + i);
XR2.erase(XR2.begin() + i);
--i;
}
++i;
}
size_t k = 0;
while (j < YR2.size())
{
YR1[k] = (YR2[j]);
XR1[k] = (XR2[j]);
YR2.erase(YR2.begin() + j);
XR2.erase(XR2.begin() + j);
++k;
}
size_t l = 0;
for (; k < XR1.size(); ++k)
{
XR1[k] = XR2[l];
YR1[k] = YR2[l];
l++;
}
}
Edit1: I have updated the code by replacing all push_back() with operator[], since I read somewhere that this is much faster.
However the whole program is still slow. Any suggestions are appreciated.
If the size is large, you can improve the push_back by pre-allocating the space needed. Add this before the loop:
XR2.reserve(X.size());
YR2.reserve(X.size());

How to implement midpoint displacement

I'm trying to implement procedural generation in my game. I want to really grasp and understand all of the algorithms nessecary rather than simply copying/pasting existing code. In order to do this I've attempted to implement 1D midpoint displacement on my own. I've used the information here to write and guide my code. Below is my completed code, it doesn't throw an error but that results don't appear correct.
srand(time(NULL));
const int lineLength = 65;
float range = 1.0;
float displacedLine[lineLength];
for (int i = 0; i < lineLength; i++)
{
displacedLine[i] = 0.0;
}
for (int p = 0; p < 100; p++)
{
int segments = 1;
for (int i = 0; i < (lineLength / pow(2, 2)); i++)
{
int segs = segments;
for (int j = 0; j < segs; j++)
{
int x = floor(lineLength / segs);
int start = (j * x) + 1;
int end = start + x;
if (i == 0)
{
end--;
}
float lo = -range;
float hi = +range;
float change = lo + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (hi - lo)));
int center = ((end - start) / 2) + start;
displacedLine[center - 1] += change;
segments++;
}
range /= 2;
}
}
Where exactly have I made mistakes and how might I correct them?
I'm getting results like this:
But I was expecting results like this:
The answer is very simple and by the way I'm impressed you managed to debug all the potential off-by-one errors in your code. The following line is wrong:
displacedLine[center - 1] += change;
You correctly compute the center index and change amount but you missed that the change should be applied to the midpoint in terms of height. That is:
displacedLine[center - 1] = (displacedLine[start] + displacedLine[end]) / 2;
displacedLine[center - 1] += change;
I'm sure you get the idea.
The problem seems to be that you are changing only the midpoint of each line segment, rather than changing the rest of the line segment in proportion to its distance from each end to the midpoint. The following code appears to give you something more like what you're looking for:
#include <iostream>
#include <cstdlib>
#include <math.h>
#include <algorithm>
using namespace std;
void displaceMidPt (float dline[], int len, float disp) {
int midPt = len/2;
float fmidPt = float(midPt);
for (int i = 1; i <= midPt; i++) {
float ptDisp = disp * float(i)/fmidPt;
dline[i] += ptDisp;
dline[len-i] += ptDisp;
}
}
void displace (float displacedLine[], int lineLength, float range) {
for (int p = 0; p < 100; p++) {
int segs = pow(p, 2);
for (int j = 0; j < segs; j++) {
float lo = -range;
float hi = +range;
float change = lo + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (hi - lo)));
int start = int(float(j)/float(segs)*float(lineLength));
int end = int(float(j+1)/float(segs)*float(lineLength));
displaceMidPt (displacedLine+start,end-start,change);
}
range /= 2;
}
}
void plot1D (float x[], int len, int ht = 10) {
float minX = *min_element(x,x+len);
float maxX = *max_element(x,x+len);
int xi[len];
for (int i = 0; i < len; i++) {
xi[i] = int(ht*(x[i] - minX)/(maxX - minX) + 0.5);
}
char s[len+1];
s[len] = '\0';
for (int j = ht; j >= 0; j--) {
for (int i = 0; i < len; i++) {
if (xi[i] == j) {
s[i] = '*';
} else {
s[i] = ' ';
}
}
cout << s << endl;
}
}
int main () {
srand(time(NULL));
const int lineLength = 65;
float range = 1.0;
float displacedLine[lineLength];
for (int i = 0; i < lineLength; i++) {
displacedLine[i] = 0.0;
}
displace (displacedLine,lineLength,range);
plot1D (displacedLine,lineLength);
return 0;
}
When run this way, it produces the following result:
$ c++ -lm displace.cpp
$ ./a
*
* *
* ***
* * * *
* ** **** * **
* *** **** * * * ** *
* * ** ** *** * * * *
** ** *
* * * ***
** ***
*

'std::vector<double>::iterator' has no member named 'begin'

So I am trying to perform recursion ( A very simple code for split radix recursive butterflies) on a large C++ STL vector and I am using iterators to call the recursion but it isn't working as I keep getting errors.
#include <iostream>
#include <cmath>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
template <typename T>
class fft_data{
public:
vector<T> re;
vector<T> im;
};
void inline split_radix_rec(vector<double>::iterator r,vector<double>::iterator i, int sgn,int N) {
if (N == 1) {
return;
} else if (N == 2) {
for (int k = 0; k < N/2; k++) {
int index = 2*k;
int index1 = index+1;
double taur = *(r+index1);
double taui = *(i+index1);
*(r+index1) = *(r+index) - taur;
*(i+index1) = *(i+index) - taui;
*(r+index) = *(r+index) + taur;
*(i+index) = *(i+index) + taui;
}
N=N/2;
} else {
int m = N/2;
int p = N/4;
double PI2 = 6.28318530717958647692528676655900577;
double theta = -1.0 * sgn * PI2/N;
double S = sin(theta);
double C = cos(theta);
double PI6 = 3.0*6.28318530717958647692528676655900577;
double theta3 = -1.0 * sgn * PI6/N;
double S3 = sin(theta3);
double C3 = cos(theta3);
double wlr = 1.0;
double wli = 0.0;
//T wl2r = (T) 1.0;
//T wl2i = (T) 0.0;
double wl3r = 1.0;
double wl3i = 0.0;
double tau1r,tau1i,tau2r,tau2i;
double ur,ui,vr,vi;
for (int j = 0; j < p; j++) {
int index1 = j+m;
int index2 = index1+p;
int index3 = j+p;
tau1r = *(r+index1);
tau1i = *(i+index1);
tau2r = *(r+index2);
tau2i = *(i+index2);
ur = tau1r + tau2r;
ui = tau1i + tau2i;
vr = sgn* (tau2r - tau1r);
vi = sgn* (tau2i - tau1i);
*(r+index2) = *(r+index3) - vi;
*(i+index2) = *(i+index3) + vr;
*(r+index1) = *(r+j) - ur;
*(i+index1) = *(i+j) - ui;
*(r+index3) = *(r+index3) + vi;
*(i+index3) = *(i+index3) - vr;
*(r+j) = *(r+j) + ur;
*(i+j) = *(i+j) + ui;
}
split_radix_rec(r.begin(),i.begin(),sgn,m);
split_radix_rec(r.begin()+m,i.begin()+m,sgn,p);
split_radix_rec(r.begin()+m+p,i.begin()+m+p,sgn,p);
}
}
int main() {
vector<double> u,v;
for (int i = 0; i < 256; i++) {
u.push_back(i);
v.push_back(i);
}
int sgn = 1;
int N = 256;
split_radix_rec(u.begin(),v.begin(),sgn,N);
return 0;
}
Here are the errors I am getting
main.cpp:93:21: error: 'std::vector<double>::iterator' has no member named 'begin'
6 Identical errors on lines 93,94,95 (the three split_radix_rec() functions called from within the split_radix_rec function). This is part of a much larger code so I want it to work for STL vectors. What am I doing wrong?
As the error states, you are calling begin() on a std::vector<double>::iterator.
You should call that on a std::vector<double>, so that it could return you a std::vector<double>::iterator.
r,i are itself iterators(begins) in your code.
Try:
split_radix_rec(r,i,sgn,m);
split_radix_rec(r+m,i+m,sgn,p);
split_radix_rec(r+m+p,i+m+p,sgn,p);
There is way too much code to give you a concise answer, but the error clearly states that you are calling begin() on a vector iterator instead of a vector. And that happens at the split_radix_rec recursive call. You may have intended this instead:
split_radix_rec(r,i,sgn,m);
split_radix_rec(r+m,i+m,sgn,p);
split_radix_rec(r+m+p,i+m+p,sgn,p);