I am working on a music program that calls notes from the chromatic scale based on intervals. These interval variables (h - half step, w - whole step, and wh - whole and a half step) will be used for determining scale increments (Major = WWHWWWH) and will later be used to measure interval lengths across a vector of strings, to potentially output measurements like "3 Whole Steps and a Half Step".
I'm wondering what would be the more efficient way to store these simple variables, as I would eventually like to make a cellphone app out of this and want it to be as easy on the battery/memory as possible. And I am still learning. Here are my thoughts:
int H = 1;
int W = 2;
int WH = 3;
int Fiv = 5;
int Sev = 7;
or
int H = 1;
int W = H+H;
int WH = W + H;
int Fiv = WH+W;
int Sev = Fiv + W;
int H = 1; int W = H*2; int WH = W+H; etc.
I'm primarily interested in how these different styles of initialization will affect memory and performance, if at all.
I know I shouldn't have everything in main, but this is a work in progress and I am obviously new to programming, so please look past the layout. Here is the code it's presently being used in:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include <map>
const std::vector<std::string> st_sharps{"C","C#","D","D#","E","F","F#","G","G#","A","A#","B" };
const std::vector<std::string> st_flats{"C","Db","D","Eb","E","F","Gb","G","Ab","A","Bb","B" };
struct steps{ int maj = 0; int min = 0;} step;
constexpr int H = 1;
constexpr int W = 2;
constexpr int Tre = 3;
constexpr int Fif = 5;
constexpr int Sev = 7;
const int size = st_flats.size();
const std::vector<int> Major = { W, W, H, W, W, W, H };
struct circle{
    std::stringstream sharp;
    std::stringstream flat;
    std::stringstream minor;
    std::stringstream dimin; };
struct scales{
    circle fifths;
    std::stringstream maj;
    std::stringstream min; } scale;

int main(){
    //Circle of Fifths
    for (int j = 0; j < size; j++){
        int five = j * Sev;
        scale.fifths.sharp << st_sharps[five % size] << " ";
        scale.fifths.flat << st_flats[five % size] << " ";
        scale.fifths.minor << st_sharps[((size - Tre) + five) % size] << " ";
        scale.fifths.dimin << st_sharps[((size - H) + five) % size] << " ";
    }
    std::cout << "Circle of Fifths:\n";
    std::cout << "Major >> Relative Minor >> Diminished " << std::endl;
    std::cout << "Maj: " << scale.fifths.sharp.str() << std::endl;
    std::cout << "Min: " << scale.fifths.minor.str() << std::endl;
    std::cout << "Dim: " << scale.fifths.dimin.str() << std::endl;
    std::cout << "\nflats: " << scale.fifths.flat.str() << "\n" << std::endl;

    //Major and Minor Scales
    for (int i = 0; i < Major.size(); i++) {
        scale.maj << st_sharps[step.maj] << " ";
        scale.min << st_flats[((size - Tre) + step.min) % size] << " ";
        step.maj += Major[i];
        step.min += Major[(i + Fif) % Major.size()];
    }
    std::cout << "C Major:\n" << scale.maj.str() << "\n" << std::endl;
    std::cout << "A Minor:\n" << scale.min.str() << "\n" << std::endl;
    return 0;
}
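On the side goal mentioned at the top (turning a semitone count into text like "3 Whole Steps and a Half Step"), integer division is enough. A rough sketch (hypothetical helper, my own naming, untested against the program above):

#include <iostream>
#include <string>

// Hypothetical helper: describe an interval, given in semitones, as
// whole steps plus an optional trailing half step.
std::string describe_interval(int semitones) {
    const int whole = semitones / 2;  // number of whole steps
    const int half  = semitones % 2;  // leftover half step (0 or 1)
    std::string out = std::to_string(whole) + " Whole Step" + (whole == 1 ? "" : "s");
    if (half) out += " and a Half Step";
    return out;
}

int main() {
    std::cout << describe_interval(7) << '\n';  // prints "3 Whole Steps and a Half Step"
}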
I'd choose a version that expresses "'W' is the double of 'H'" the best way. My preferred way would therefore be:
constexpr int H = 1;
constexpr int W = 2*H;
constexpr int WH = W+H;
Note that a version like int W = H++; is not what you probably intend, since H++ is not equal to H + 1; it yields the old value of H and then increments H, i.e. it behaves like int W = H; H = H + 1;.
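Either way, the runtime cost is essentially nil: constexpr values are compile-time constants, so the compiler folds them straight into the generated code, and no per-variable storage or startup initialization is needed. A minimal standalone sketch (my own illustration, not part of the posted program) that makes the compile-time evaluation visible:

#include <cstdio>

// Compile-time interval constants; the relationships are verified by
// the compiler itself and cost nothing at run time.
constexpr int H  = 1;        // half step
constexpr int W  = 2 * H;    // whole step
constexpr int WH = W + H;    // whole step plus a half step

static_assert(W  == 2, "a whole step is two half steps");
static_assert(WH == 3, "a whole-and-a-half step is three half steps");

int main() {
    std::printf("H=%d W=%d WH=%d\n", H, W, WH); // constants folded in
}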
I'm trying to parallelise a large linear system solve using Armadillo and OpenMP via arma::solve. Instead of directly calling the solver on the full problem, I would like to split it into smaller chunks of RHS vectors and solve them in parallel in an OpenMP loop, as shown in the listing below.
This should give the same answer, since the multiple RHS are independent problems, but I often get a few columns mangled when I run the code this way. I've even tried to enclose the write-back in an omp critical section, but it still fails.
Is Armadillo safe to run in this manner, or am I missing something here?
// to compile the code run as
// g++ parals.cpp -I$ARMADILLO_INCLUDE_DIR -L$ARMADILLO_INCLUDE_DIR/../lib64
// -larmadillo -fopenmp -lopenblas -o parals.out
#define ARMA_DONT_USE_WRAPPER
#define ARMA_USE_BLAS
#define ARMA_USE_LAPACK
#include <armadillo>
#include <omp.h>
#include <iostream>
using namespace arma;
/*
* Solves LS for a single problem,
*
* ||AX - B ||_F^2
*/
int main(int argc, char *argv[]) {
    int m = atoi(argv[1]);     // A is of size m \times n
    int n = atoi(argv[2]);     // A is of size m \times n
    int k = atoi(argv[3]);     // B is of size m \times k
    int seed = atoi(argv[4]);  // seed for random inits
    int chunk = atoi(argv[5]); // chunk size to group RHS
    arma::arma_rng::set_seed(seed);
    std::cout << "m::" << m << "::n::" << n << "::k::" << k
              << "::seed::" << seed << "::chunk::" << chunk
              << std::endl;
    mat A(m,n,arma::fill::randu);
    //std::cout << "A::" << std::endl << A << std::endl;
    mat B(m,k,arma::fill::randu);
    //std::cout << "B::" << std::endl << B << std::endl;
    mat AtA = A.t() * A;
    mat AtB = A.t() * B;

    // solve sequentially
    mat Xseq = arma::solve(AtA, AtB, arma::solve_opts::likely_sympd);

    int num_chunks = AtB.n_cols / chunk;
    if (num_chunks * chunk < AtB.n_cols) num_chunks++;

    mat Xchunk(n,k,arma::fill::zeros);
    for (int nt = 0; nt < num_chunks; nt++) {
        int spanStart = nt * chunk;
        int spanEnd = (nt + 1) * chunk - 1;
        if (spanEnd > AtB.n_cols - 1) {
            spanEnd = AtB.n_cols - 1;
        }
        mat rhs = AtB.cols(spanStart, spanEnd);
        mat Y = arma::solve(AtA, rhs, arma::solve_opts::likely_sympd);
        Xchunk.cols(spanStart, spanEnd) = Y;
    }
    bool chkchunk = arma::approx_equal(Xseq, Xchunk, "absdiff", 0.0001);
    std::cout << "(Xseq == Xchunk) ? = " << chkchunk << std::endl;
    if (!chkchunk) {
        std::cout << "Xseq::" << std::endl << Xseq << std::endl;
        std::cout << "Xchunk::" << std::endl << Xchunk << std::endl;
    }

    mat Xpar(n,k,arma::fill::zeros);
    #pragma omp parallel for schedule(static,1)
    for (int nt = 0; nt < num_chunks; nt++) {
        int spanStart = nt * chunk;
        int spanEnd = (nt + 1) * chunk - 1;
        if (spanEnd > AtB.n_cols - 1) {
            spanEnd = AtB.n_cols - 1;
        }
        mat rhs = AtB.cols(spanStart, spanEnd);
        mat Y = arma::solve(AtA, rhs, arma::solve_opts::likely_sympd);
        Xpar.cols(spanStart, spanEnd) = Y;
    }
    bool chkpar = arma::approx_equal(Xseq, Xpar, "absdiff", 0.0001);
    std::cout << "(Xseq == Xpar) ? = " << chkpar << std::endl;
    if (!chkpar) {
        std::cout << "Xseq::" << std::endl << Xseq << std::endl;
        std::cout << "Xpar::" << std::endl << Xpar << std::endl;
    }
    return 0;
}
A small driver script to hopefully reproduce my errors. My Armadillo instance is linked with OpenBLAS and I don't specify the OMP_NUM_THREADS variable.
#!/bin/bash
for i in {1..20}
do
echo $i
unset OMP_NUM_THREADS;
./parals.out 10 5 20 17 3
done
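One thing worth ruling out first (my assumption, not something the listing above controls): if Armadillo is linked against a multithreaded OpenBLAS, the LAPACK routines behind arma::solve spawn their own worker threads, and OpenBLAS is not generally safe to call from several OpenMP threads at once unless it was built with locking or is forced to run single-threaded. A variant of the driver that pins OpenBLAS to one thread per call, so the OpenMP loop is the only source of parallelism:

#!/bin/bash
# Diagnostic run: force OpenBLAS itself to a single thread so the
# outer OpenMP loop is the only parallelism in play.
export OPENBLAS_NUM_THREADS=1
for i in {1..20}
do
    echo $i
    ./parals.out 10 5 20 17 3
done

If the mangled columns disappear under this setting, the problem is nested BLAS threading rather than the chunking logic.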
I'm using the vector_of_vectors example in nanoflann to find the nearest neighbors of a 128-dimensional float vector.
When using 1 million samples, everything seems fast enough: building the tree and building the index.
But when using 10 million samples, which is 10 times larger, the tree takes a LOT more time to build and also to index.
I did this example in Python/NumPy with cKDTree and it really wasn't this slow to build the tree and index.
Is my approach wrong?
#include <nanoflann.hpp>
using namespace nanoflann;
#include "KDTreeVectorOfVectorsAdaptor.h"
#include <ctime>
#include <cstdlib>
#include <iostream>
const int SAMPLES_DIM = 128;
typedef std::vector<std::vector<float>> my_vector_of_vectors_t;
void generateRandomPointCloud(my_vector_of_vectors_t& samples,
                              const size_t N = 1e7,
                              const size_t dim = 128,
                              const float max_range = 1.0)
{
    std::cout << "Generating " << N << " random points...";
    samples.resize(N);
    for (size_t i = 0; i < N; i++)
    {
        samples[i].resize(dim);
        for (size_t d = 0; d < dim; d++)
            samples[i][d] = max_range * (rand() % 1000) / (1000.0);
    }
    std::cout << "done\n";
}

void kdtree_demo(const size_t nSamples = 1e7, const size_t dim = 128)
{
    my_vector_of_vectors_t samples;
    const float max_range = 1.0;

    // Generate points:
    generateRandomPointCloud(samples, nSamples, dim, max_range);

    // Query point:
    std::vector<float> query_pt(dim);
    for (size_t d = 0; d < dim; d++)
        query_pt[d] = max_range * (rand() % 1000) / (1000.0);

    // construct a kd-tree index:
    // Dimensionality set at run-time (default: L2)
    // ------------------------------------------------------------
    std::cout << "Constructing Kd Tree" << std::endl;
    typedef KDTreeVectorOfVectorsAdaptor<my_vector_of_vectors_t, float> my_kd_tree_t;
    my_kd_tree_t mat_index(dim /*dim*/, samples, 20 /* max leaf */);
    std::cout << "Building Index" << std::endl;
    mat_index.index->buildIndex();
    std::cout << "Initializing Indexes" << std::endl;

    // do a knn search
    const size_t num_results = 3;
    std::vector<size_t> ret_indexes(num_results);
    std::vector<float> out_dists_sqr(num_results);
    std::cout << "Initializing Resultset" << std::endl;
    nanoflann::KNNResultSet<float> resultSet(num_results);
    resultSet.init(&ret_indexes[0], &out_dists_sqr[0]);
    std::cout << "Starting " << std::endl;
    mat_index.index->findNeighbors(resultSet, &query_pt[0], nanoflann::SearchParams(10));

    std::cout << "knnSearch(number of results=" << num_results << "): \n";
    for (size_t i = 0; i < num_results; i++)
        std::cout << "ret_index[" << i << "]=" << ret_indexes[i] << " out_dist_sqr=" << out_dists_sqr[i] << std::endl;
}

int main()
{
    // Randomize Seed
    srand(time(NULL));
    kdtree_demo(1e7 /* samples */, SAMPLES_DIM /* dim */);
}
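A back-of-envelope check: 10 million samples of 128 floats is about 5 GB of payload, and std::vector<std::vector<float>> adds a separate heap allocation plus bookkeeping per point, so the 10x jump can easily push the build from cache-friendly into allocation- and memory-bound territory. To see where the time actually goes, a small phase timer using only the standard library (a sketch; the placement comments assume the kdtree_demo structure above):

#include <chrono>
#include <iostream>

// Minimal stopwatch for bracketing each phase (generation, index
// build, query) to see which one scales badly from 1M to 10M.
struct Stopwatch {
    std::chrono::steady_clock::time_point t0 = std::chrono::steady_clock::now();
    double seconds() const {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
    }
};

// Assumed usage inside kdtree_demo:
//   Stopwatch sw;
//   generateRandomPointCloud(samples, nSamples, dim, max_range);
//   std::cout << "generate: " << sw.seconds() << " s\n";
//   sw = Stopwatch{};
//   mat_index.index->buildIndex();
//   std::cout << "build:    " << sw.seconds() << " s\n";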
Why is this plain array implementation slower than the std::vector implementation?
Due to some weird results I was seeing in something I'm working on, I decided to write a simplified test to compare std::vector vs plain array efficiency.
I have a struct which I implement in both ways:
1. using plain arrays (of different sizes):
typedef struct {
uint16_t index;
uint16_t nvals;
uint16_t vals[50];
double mean;
} a_segment_t;
2. using the STL:
typedef struct {
uint16_t index;
uint16_t nvals;
vector<uint16_t> vals;
uint32_t mean;
} b_segment_t;
The creation of this object in memory is not what I'm interested in (so I don't mind the push_back()); once the object is in memory, it is used for an operation, and that efficiency is what I'm analyzing. The vals are filled with some random data.
The operation goes through the vals stored in each segment, in this case a simple mean calculation. The test is as follows:
using namespace std;
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000
// plain array approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
} a_segment_t;

uint16_t operation(uint16_t, a_segment_t*);
uint16_t print(uint16_t nsegments, a_segment_t* p_segments);

// stl vector approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
} b_segment_t;

uint16_t operation(uint16_t, vector<b_segment_t>*);
uint16_t print(uint16_t nsegments, vector<b_segment_t>*);

void delta_time(struct timespec*, struct timespec*, struct timespec*);

uint16_t operation(uint16_t nsegments, a_segment_t* p_segments) {
    // the operation (plain array approach)
    uint64_t sum;
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        sum = 0;
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            sum = sum + p_segments[nsegment].vals[nval];
        }
        p_segments[nsegment].mean = sum/p_segments[nsegment].nvals;
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, a_segment_t* p_segments) {
    // print data (plain array approach)
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        cout << "index : " << setfill('0') << setw(3) << p_segments[nsegment].index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segments[nsegment].nvals;
        cout << "\tvals : [";
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            cout << p_segments[nsegment].vals[nval] << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    // the operation (stl vector approach)
    uint32_t sum;
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
        sum = 0;
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
            sum = sum + (*p_val);
        }
        p_segment->mean = sum/(p_segment->nvals);
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    // print data (stl vector approach)
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
        cout << "index : " << setfill('0') << setw(3) << p_segment->index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segment->nvals;
        cout << "\tvals : [";
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
            cout << *p_val << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    if ((t2->tv_nsec - t1->tv_nsec) < 0) {
        dt->tv_sec = t2->tv_sec - t1->tv_sec - 1;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec + 1000000000;
    } else {
        dt->tv_sec = t2->tv_sec - t1->tv_sec;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec;
    }
    return;
}
int main(int argc, char const *argv[]) {
    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create and populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create and populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(nsegments, a_segments);
    // cout << "===================================" << endl;
    // print(nsegments, &b_segments);
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, a_segments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, &b_segments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t plain array\t\t stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
             << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
             << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
             << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
             << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================
}
An online version. Note: All of the tests were compiled with -O3
Can someone please point out why the plain array implementation is slower than the std::vector implementation?
Shouldn't the plain array implementation be faster?
What can I do to improve the speed of the plain array implementation?
The compiler will do a much better job of optimising code if you express algorithms in terms of iterators. One of the reasons is that it can make assumptions about the size and overflow characteristics of array indexes (which translate to indexed addressing with offset in machine code).
Refactoring to express both operation() and print() in terms of iterators (which can be pointers):
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#include <numeric>
using namespace std;
#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000
// plain array approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
} a_segment_t;

// stl vector approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
} b_segment_t;

void delta_time(struct timespec*, struct timespec*, struct timespec*);

template<class Iter>
uint16_t operation(Iter first, Iter last) {
    auto result = std::uint16_t(std::distance(first, last));
    // the operation (works for both approaches)
    for( ; first != last ; ++first ) {
        auto sum = std::accumulate(std::begin(first->vals), std::begin(first->vals) + first->nvals, uint64_t(0), std::plus<>());
        first->mean = sum / first->nvals;
    }
    return result;
}

template<class Iter>
uint16_t print(Iter first, Iter last) {
    auto result = std::uint16_t(std::distance(first, last));
    // print data (works for both approaches)
    for( ; first != last ; ++first ) {
        cout << "index : " << setfill('0') << setw(3) << first->index;
        cout << "\tnval : " << setfill('0') << setw(3) << first->nvals;
        cout << "\tvals : [";
        for_each(std::begin(first->vals), std::begin(first->vals) + first->nvals, [](const auto& val)
        {
            cout << val << ",";
        });
        cout << "\b]" << endl;
    }
    return result;
}

void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    if ((t2->tv_nsec - t1->tv_nsec) < 0) {
        dt->tv_sec = t2->tv_sec - t1->tv_sec - 1;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec + 1000000000;
    } else {
        dt->tv_sec = t2->tv_sec - t1->tv_sec;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec;
    }
    return;
}

int main(int argc, char const *argv[]) {
    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create and populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create and populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(a_segments, a_segments + nsegments);
    // cout << "===================================" << endl;
    // print(b_segments.begin(), b_segments.end());
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(a_segments, a_segments + nsegments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(b_segments.begin(), b_segments.begin() + nsegments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t plain array\t\t stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
             << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
             << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
             << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
             << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================
}
Yields expected results:
plain array stl vector
frame # elapsedtime deltatime elapsedtime deltatime
0, 0.000000000, 0.000002000, 0.000000000, 0.000002000
1000, 0.001533000, 0.000001000, 0.001551000, 0.000002000
2000, 0.003061000, 0.000002000, 0.003096000, 0.000002000
3000, 0.004589000, 0.000001000, 0.004771000, 0.000002000
4000, 0.006255000, 0.000001000, 0.006433000, 0.000002000
5000, 0.007785000, 0.000002000, 0.007975000, 0.000001000
6000, 0.009326000, 0.000002000, 0.009494000, 0.000001000
7000, 0.010893000, 0.000002000, 0.011012000, 0.000001000
8000, 0.012435000, 0.000002000, 0.012650000, 0.000002000
9000, 0.014024000, 0.000002000, 0.014273000, 0.000001000
The two versions aren't actually equivalent.
Firstly, your "array version" has mean as a double, and the "STL version" has mean as uint32_t. For the two functions to be remotely equivalent, the calculation of mean needs to be the same.
Second, your "array version" uses array subscripting, whereas the STL version increments and dereferences iterators. Since the compiler/optimiser will need to allow for more concerns (such as pointer aliasing) in the array version, it is probably unable to optimise performance as much.
Try turning your array version into something like:
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments)
{
    uint64_t sum;
    for(a_segment_t *pseg = p_segments, *eseg = p_segments + nsegments; pseg < eseg; ++pseg)
    {
        sum = 0;
        for(uint16_t *val = pseg->vals, *eval = pseg->vals + pseg->nvals; val < eval; ++val)
        {
            sum = sum + (*val);
        }
        pseg->mean = sum/(pseg->nvals);
    }
    return nsegments;
}
This will (barring mistakes I've made in translating to this form - I haven't tested it) give the same result, but it at least gives the compiler a fighting chance of applying the same kind of performance optimisations to your "array version" as to the "STL version".
This sort of thing is one reason (of several) that the C++ standard algorithms work with iterators, rather than array indexing on containers like vector. The compiler has a better chance of optimising performance. Note that a pointer is a type of iterator.
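For instance (a minimal illustration of that last point, not code from the answer above), the same standard algorithm accepts a raw pointer pair just as readily as vector iterators:

#include <cstdint>
#include <numeric>

// Raw pointers satisfy the iterator requirements, so one algorithm
// covers both the plain array and the std::vector case.
uint64_t sum_vals(const uint16_t* vals, uint16_t nvals) {
    return std::accumulate(vals, vals + nvals, uint64_t(0));
}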
I'm attempting to test a mathematical class I've created using random numbers from the full range of representable positive floats, but I seem to be having a problem with my use of std::random. This program
#include <random>
#include <iostream>
#include <functional>
template <typename T>
class Rand {
public:
    Rand(T lo=std::numeric_limits<T>::min(),
         T hi=std::numeric_limits<T>::max()) :
        r(bind(std::uniform_real_distribution<>(lo, hi),std::mt19937_64{})) {}
    T operator()() const { return r(); }
private:
    std::function<T()> r;
};

int main()
{
    Rand<float> f{};
    const int samples = 1000000;
    float min = std::numeric_limits<float>::max();
    float max = std::numeric_limits<float>::min();
    std::cout << "range min = " << max
              << ", max = " << min << '\n';
    for (int i=0; i < samples; ++i) {
        float r = f();
        if (r < min) min = r;
        if (r > max) max = r;
    }
    std::cout << "for n = " << samples
              << "\nsample min = " << min
              << ", max = " << max << std::endl;
}
produces this output
range min = 1.17549e-38, max = 3.40282e+38
for n = 1000000
sample min = 8.14884e+31, max = 3.40281e+38
Clearly the range is extremely skewed toward larger numbers. How do I generate the desired range of floats with a uniform distribution?
In addition to the statistics you have printed out, I've computed both the theoretical and actual mean, variance, skew and kurtosis of this distribution. Here is my code, and the results:
#include <random>
#include <iostream>
#include <functional>
#include <vector>
#include <numeric>
#include <cmath>
template <typename T>
class Rand {
public:
    Rand(T lo=std::numeric_limits<T>::min(),
         T hi=std::numeric_limits<T>::max()) :
        r(bind(std::uniform_real_distribution<>(lo, hi),std::mt19937_64{})) {}
    T operator()() const { return r(); }
private:
    std::function<T()> r;
};

template <class T>
inline
T
sqr(T x)
{
    return x * x;
}

int main()
{
    Rand<float> f{};
    const int samples = 1000000;
    float min = std::numeric_limits<float>::max();
    float max = std::numeric_limits<float>::min();
    std::vector<float> u;
    std::cout << "range min = " << max
              << ", max = " << min << '\n';
    for (int i=0; i < samples; ++i) {
        float r = f();
        if (r < min) min = r;
        if (r > max) max = r;
        u.push_back(r);
    }
    std::cout << "for n = " << samples
              << "\nsample min = " << min
              << ", max = " << max << std::endl;
    double mean = std::accumulate(u.begin(), u.end(),
                                  double(0)) / u.size();
    double var = 0;
    double skew = 0;
    double kurtosis = 0;
    for (int i = 0; i < u.size(); ++i)
    {
        double d = (u[i] - mean);
        double d2 = sqr(d);
        var += d2;
        skew += d * d2;
        kurtosis += d2 * d2;
    }
    var /= u.size();
    double dev = std::sqrt(var);
    skew /= u.size() * dev * var;
    kurtosis /= u.size() * var * var;
    kurtosis -= 3;
    double x_mean = ((double)min + max) / 2;
    double x_var = sqr((double)max - min) / 12;
    double x_skew = 0;
    double x_kurtosis = -6./5;
    std::cout << std::scientific << '\n';
    std::cout << " expected actual\n";
    std::cout << "mean " << x_mean << " " << mean << "\n";
    std::cout << "variance " << x_var << " " << var << "\n";
    std::cout << "skew " << x_skew << " " << skew << "\n";
    std::cout << "kurtosis " << x_kurtosis << " " << kurtosis << "\n";
}
And here are the results:
range min = 1.17549e-38, max = 3.40282e+38
for n = 1000000
sample min = 8.14884e+31, max = 3.40281e+38
expected actual
mean 1.701407e+38 1.700724e+38
variance 9.649275e+75 9.645774e+75
skew 0.000000e+00 7.401975e-04
kurtosis -1.200000e+00 -1.199432e+00
Everything is looking pretty good to me.
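For reference, the "expected" column is just the standard moments of a continuous uniform distribution on [a, b]:

    mean             = (a + b) / 2
    variance         = (b - a)^2 / 12
    skew             = 0
    excess kurtosis  = -6/5

With a = 1.17549e-38 and b = 3.40282e+38, these give the 1.701407e+38 and 9.649275e+75 figures in the table.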
The biggest point you are missing is that you are not generating numbers between (-max_value, max_value), but between (min_value, max_value), which is effectively (0, max_value).
On a linear scale, the interval from 0 to 8.14884e+31 has length of order pow(10,32), while the interval from 8.14884e+31 up to 3.40281e+38 has length of order pow(10,38), roughly a million times longer. A uniform sample is therefore overwhelmingly likely to land in the upper part, so a sample minimum around 8e+31 out of a million draws is exactly what you should expect.
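If what was actually wanted is even coverage of orders of magnitude (which is what "random over the representable positive floats" suggests, since floats are spaced roughly logarithmically), one option is to sample uniformly in log space. A sketch (my own variation, not taken from either answer):

#include <cmath>
#include <iostream>
#include <limits>
#include <random>

// Log-uniform sampler over the positive float range: each decade
// between min and max is hit roughly equally often.
int main() {
    const double lo = std::numeric_limits<float>::min();  // ~1.17549e-38
    const double hi = std::numeric_limits<float>::max();  // ~3.40282e+38
    std::mt19937_64 gen{};
    std::uniform_real_distribution<double> d(std::log(lo), std::log(hi));
    for (int i = 0; i < 5; ++i)
        std::cout << static_cast<float>(std::exp(d(gen))) << '\n';
}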
I'm trying to get some old C++ code up and running. I've gotten it to compile without error, but it immediately segfaults when run, without ever entering main. When I use gdb to find out where things are going wrong, I find the following:
(gdb) run
Starting program: /Users/dreens/Documents/OH/extrabuncher2/ParaOHSB
Reading symbols for shared libraries +++. done
Program received signal EXC_BAD_ACCESS, Could not access memory.
Reason: KERN_INVALID_ADDRESS at address: 0x00007fff5636581c
0x000000010000151e in main (argc=1, argv=0x100000ad0) at ParaMainOHSlowerBuncher.cc:13
13 int main(int argc, char *argv[]){
(gdb) backtrace
#0 0x000000010000151e in main (argc=1, argv=0x100000ad0) at ParaMainOHSlowerBuncher.cc:13
(gdb)
Does anyone know what could cause a memory access issue right at the start of the main method?
The code is rather large, but here is the file containing the main method. Could the included .hh and .cc files be a part of the problem? Should I attach them?
Thanks!
David
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "MoleculeEnsemble.hh"
#include "SlowerForceLoadOH32.cc"
#include "SlowerForceLoadOH12.cc"
//#include "SlowerForceLoad3mmBuncher.cc"
#include "SlowerForceLoad4mmBuncher.cc"
using namespace std;
int main(int argc, char *argv[]){
//int main(){
cout << "Ahhhh!" << endl;
/******Parallel Crap********/
/*
int totalnodes = 0;
int mynode = 0;
MPI_Status status;
MPI_Init(&argv,&argc);
MPI_Comm_size(MPI_COMM_WORLD,&totalnodes);
MPI_Comm_rank(MPI_COMM_WORLD,&mynode);
srand(time(NULL)*mynode);
*/
/******Distribution Parameters *******/
long MoleculeNumber = long(5e4);
double Xcenter = 0;
double Ycenter = 0;
double Zcenter = 0;
double DeltaX = 0.0015;
double DeltaY = 0.0015;
double DeltaZ = 0.01;
int FlatX = 1;
int FlatY = 1;
int FlatZ = 1;
double vXcenter = 0;
double vYcenter = 0;
double vZcenter = 406;
double Vcalc = 406;
double vZfinal = 0;
double DeltavX = 2;
double DeltavY = DeltavX;
double DeltavZ = 40;
int FlatvX = 0;
int FlatvY = 0;
int FlatvZ = 0;
int TimeArrayOnly = 0; //Outputs only Time Array
double TimeOffset = 0; //Adds valve-skimmer flight time to ToF array
/*******Overtone Parameters********/
int S = 1; //parameter S=Vz/Vswitch as defined by VDM et al.
int JILAOT = 0; //JILAOT is either 0 or 1, denoting whether or not to use special switching
/*******Hexapole Parameters********/
double VSD = 0.06;
double Voltage = 2000;
double HexRadius = .00268;
double HexStart = .0238;
double HexEnd = .083170;//0.089103;
double HexOn = 1e-6;
double HexOff = 203e-6;//224e-6; 212 for current data; Good = 243e-6 for 408m/s
double DeltaT = 1e-6;
double DeltaTSeqGen = 1e-9; //Need to use smaller time steps for finding the time sequence
double DetectionTime = HexOff; //Use to fake out hex code
double TriggerLatency = 0;//170e-9;
/*******Detection Parameters*******/
double DetectionPosition = double(0.9319);//0.257480; <- for viewing at 31.5 ||||| 0.9428; <-Mag trap(4stages), .9319 <-MagTrap(3Stages)
double IrisWidth = 0.008;//31.5 0.0023 //PostSlower.015;
double LaserRadius = .001;
/*****Bunching Parameters******/
int BunchNumber = 0;
int NumberUsed = 0;
/*****Timing Variables*********/
time_t start, finish;
time( &start);
/*****Molecule Parameters******/
double mass =double(17*1.672e-27);
/******ToF Detection Arrays and Slowing Parameters *********/
double Phi = double(34.2);
double PhiEB = double(0);
int NumberOfStages = int(142/S); //Use 142 for Big machine
int EBStages = 3; //Larger Add-on stages at end of slower
double BuncherScale = 1;
double Time[int(1e7)];
int ToFSignal32[int(1e7)];
int ToFSignal12[int(1e7)];
double TimeArray[800];
double VExit[800];
double Average32[7];
double Average12[7];
int LOST[200];
/*************Finished ToF Detection Arrays and Slowing Parameters ********/
/******Force Arrays********/
int Xnumber = 111;
int Ynumber = 21;
int Znumber = 21;
int FLength = Xnumber*Ynumber*Znumber;
double AXxDT[FLength];
double AYxDT[FLength];
double AZxDT[FLength];
double AZxDTSeqGen[FLength];
SlowerForceLoadOH32(AZxDT, AYxDT, AXxDT); //Note how Z and X are placed in this function. My matlab code calls the longitudinal dimension X, here it is Z
double DTovermass = DeltaT/mass;
for(int j = 0; j <FLength; j++){
AXxDT[j] = DTovermass*AXxDT[j];
AYxDT[j] = DTovermass*AYxDT[j];
AZxDT[j] = DTovermass*AZxDT[j];
AZxDTSeqGen[j] = DeltaTSeqGen*AZxDT[j]/DeltaT;
}
double AXxDT12[FLength];
double AYxDT12[FLength];
double AZxDT12[FLength];
SlowerForceLoadOH12(AZxDT12, AYxDT12, AXxDT12); //Note how Z and X are placed in this function. My matlab code calls the longitudinal dimension X, here it is Z
for(int j = 0; j <FLength; j++){
AXxDT12[j] = DTovermass*AXxDT12[j];
AYxDT12[j] = DTovermass*AYxDT12[j];
AZxDT12[j] = DTovermass*AZxDT12[j];
}
/********Load Extra Buncher Forces*********/
int XnumberEB = 251;
int YnumberEB = 41;
int ZnumberEB = 41;
int FLengthEB = XnumberEB*YnumberEB*ZnumberEB;
double AXxDTEB[FLengthEB], AYxDTEB[FLengthEB], AZxDTEB[FLengthEB], AZxDTSeqGenEB[FLengthEB];
SlowerForceLoad4mmBuncher(AZxDTEB, AYxDTEB, AXxDTEB);
for(int j = 0; j <FLengthEB; j++)
{
AXxDTEB[j] = DTovermass*AXxDTEB[j]/BuncherScale;
AYxDTEB[j] = DTovermass*AYxDTEB[j]/BuncherScale;
AZxDTEB[j] = DTovermass*AZxDTEB[j]/BuncherScale;
AZxDTSeqGenEB[j] = DeltaTSeqGen*AZxDTEB[j]/(DeltaT*BuncherScale);
}
/********* End All initialization ***************************/
/************Beginning Calculation *************************/
//Create Molecule Ensemble
MoleculeEnsemble Alice(MoleculeNumber,Xcenter,Ycenter,Zcenter,DeltaX,DeltaY,DeltaZ,FlatX,FlatY,FlatZ,vXcenter,vYcenter,vZcenter,DeltavX,DeltavY,DeltavZ,FlatvX,FlatvY,FlatvZ);
//MoleculeEnsemble Bob(MoleculeNumber,Xcenter,Ycenter,Zcenter,DeltaX,DeltaY,DeltaZ,FlatX,FlatY,FlatZ,vXcenter,vYcenter,vZcenter,DeltavX,DeltavY,DeltavZ,FlatvX,FlatvY,FlatvZ);
//Generate the Timing Sequence
Alice.TimeArrayGeneratorWithBuncher(Vcalc,Phi,PhiEB,TimeArray,VExit,AZxDTSeqGen,AZxDTSeqGenEB,HexOff,DeltaTSeqGen,BunchNumber,vZfinal,NumberUsed,NumberOfStages,S,EBStages);
/*if(mynode == 0){
cout << "Slowing utilized " << NumberUsed << " stages, yielding a final velocity of " << VExit[NumberUsed] << " m/s." << endl;
cout << endl;
for(int kk = 0; kk < NumberOfStages; kk++){cout << kk << " , " << TimeArray[kk] << " , " << VExit[kk] << endl;}
}*/
/*Alice.MoleculeEnsemble_Averager(Average32);
Bob.MoleculeEnsemble_Averager(Average12);
cout << "Processor: " << mynode << "\t" << sqrt(pow(Average32[3],2)+pow(Average32[4],2)) << ", " << sqrt(pow(Average12[3],2)+pow(Average12[4],2));
cout << " Mean = " << Average32[6] << ", " << Average12[6] << endl << endl << endl;
*/
if(TimeArrayOnly!=1)
{
//Fly the Ensemble through the hexapole
Alice.HexapoleFlightOH(Voltage, HexRadius, HexStart, HexEnd, HexOn, HexOff, DeltaT, double(3/2), DetectionTime);
//Bob.HexapoleFlightOH(Voltage, HexRadius, HexStart, HexEnd, HexOn, HexOff, DeltaT, double(1/2), DetectionTime);
/*
Alice.MoleculeEnsemble_Averager(Average32);
Bob.MoleculeEnsemble_Averager(Average12);
cout << "Processor: " << mynode << "\t" << sqrt(pow(Average32[3],2)+pow(Average32[4],2)) << ", " << sqrt(pow(Average12[3],2)+pow(Average12[4],2));
cout << " Mean = " << Average32[6] << ", " << Average12[6] << endl << endl << endl;
*/
//Fly the Ensemble through the slower
Alice.SlowerFlight(LOST, Time, ToFSignal32, Phi, TimeArray, DeltaT, AXxDT, AYxDT, AZxDT, AXxDTEB, AYxDTEB, AZxDTEB, Xnumber, Ynumber, Znumber, DetectionPosition, IrisWidth, LaserRadius, NumberOfStages, EBStages,S, TriggerLatency);
//Bob.SlowerFlight(LOST, Time, ToFSignal12, Phi, TimeArray, DeltaT, AXxDT12, AYxDT12, AZxDT12, Xnumber, Ynumber, Znumber, DetectionPosition, IrisWidth, LaserRadius, NumberOfStages, EBStages, S, TriggerLatency);
}
/**********Ending Calculation **********************/
//Alice.MoleculeEnsemble_Drawer();
/*
Alice.MoleculeEnsemble_Averager(Average32);
Bob.MoleculeEnsemble_Averager(Average12);
cout << "Processor: " << mynode << "\t" << sqrt(pow(Average32[3],2)+pow(Average32[4],2)) << ", " << sqrt(pow(Average12[3],2)+pow(Average12[4],2));
cout << " Mean = " << Average32[6] << ", " << Average12[6] << endl << endl;
*/
//Output ToF signal
if(TimeArrayOnly!=1)
{
for(int ii = 0; ii < int(1e7); ii++)
{
if(ToFSignal32[ii] > 0 && Time[ii] > 3e-3)
{
cout << Time[ii]+TimeOffset << "," << ToFSignal32[ii] << endl;
//+double(VSD/vZcenter)+38e-6 << "," << ToFSignal32[ii] << endl;
}
if(ToFSignal12[ii] > 0 && Time[ii] > 3e-3)
{
cout << Time[ii]+TimeOffset << "," << ToFSignal12[ii] << endl;
//+double(VSD/vZcenter)+38e-6 << "," << ToFSignal12[ii] << endl;
}
}
}
if(TimeArrayOnly==1)
{
for(int ii = 0; ii < NumberOfStages+EBStages+1; ii++)
{
cout << ii << "\t" << TimeArray[ii] << "\t" << VExit[ii] << endl;
//+double(VSD/vZcenter)+double(265e-6) << "\t" << VExit[ii] << endl;
}
}
/*for(int ii = 0; ii < NumberOfStages; ii++)
{
cout << ii << "\t" << LOST[ii] << endl;
}
*/
/*
MPI_Finalize();
*/
}
You're out of stack space.
You declare very large arrays in your code (Time, ToFSignal32 and ToFSignal12 alone hold 10 million elements each), and as local variables they are all allocated on the stack. Instead of declaring the arrays as automatic variables, use dynamic memory allocation. So, instead of
double Time[int(1e7)];
write
double* Time;
Time = new double[int(1e7)];
and hope to have enough RAM in your computer :)
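Alternatively (a sketch of the same fix, with automatic cleanup), std::vector stores its elements on the heap, and existing indexing like Time[ii] keeps working unchanged:

#include <vector>

// Heap-backed replacements for the largest stack arrays in main();
// the vectors own their storage and free it at end of scope.
std::vector<double> Time(int(1e7));
std::vector<int>    ToFSignal32(int(1e7));
std::vector<int>    ToFSignal12(int(1e7));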