My system:
system specification:
Intel core2duo E4500 3700g memory L2 cache 2M x64 fedora 17
How I measure flops/mflops
well,I use papi library (to read hardware performance counter) to measure flops and mflops of my code.it return real time procesing time, flops and finally flops/process time which is equal to MFLOPS.library use hardware counter to count floating point inststruction or floating point operations and Total cycle to get the final result that contain flops and MFLOPS.
MY computational kernel
I used three loop matrix matrix multiplication (square matrix) and three nested loop which do some operation on 1d array in its inner-loop.
First Kernel MM
float a[size][size];
float b[size][size];
float c[size][size];
start_calculate_MFlops();
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
for (int k = 0; k < size; **k+=1**) {
*c[i][j]=c[i][j]+a[i][k] * b[k][j];*
}
}
}
stop_calculate_MFlops();
Second kernel with 1d array
float d[size];
float e[size];
float f[size];
float g[size];
float r = 3.6541;
start_calculate_MFlops();
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
for (int k = 0; k < size; ++k) {
d[k]=d[k]+e[k]+f[k]+g[k]+r;
}
}
}
stop_calculate_MFlops();
what I know about flops
Matrix matrix Multiplication (MM) do 2 operation in its inner loop (here floating point operation) and as there is 3 loop which iterate for size X therefore in theory we have total flops of 2*n^3 for MM.
In second kernel we have 3 loop which in inner-most loop we have 1d array which do some computation.there is 4 floating point operation in this loop.hence we have total flops of 4*n^3 flops in theory
I know that the flops that we calculate above is not exactly the same as what will happen in real machine. In real machine there are other operation like load and store wich will add up to out theoretical flops.
Questions ?:
when I use 1d array as in second kernel theoretical flops is the
same or around the flops I get by executing the code and measuring
it.actually when I use 1d array flops is equal to # of operation in
inner-most loop multiply by n^3 but when I use my first kernel MM
which use 2d array theoretical flop is 2n^3 but when I run the code
,measured value is too much higher than theoretical value,it is
about 4+(2 operation in inner-most loop of matrix multiplication)*n^3+=6n^3.
I changed the matrix multiplication line in innermost loop with just the code below:
A[i][j]++;
the theoretical flops for this code in 3 nested loop is 1 operation * n^3=n^3 again when I ran the code the result was too higher than what expected which was 2+(1 operation of inner-most loop)*n^3=3*n^3
Sample Results for matrix of size 512X512 :
Real_time: 1.718368 Proc_time: 1.227672 Total flpops:
807,107,072 MFLOPS: 657.429016
Real_time: 3.608078 Proc_time: 3.042272 Total flpops:
807,024,448 MFLOPS: 265.270355
theoretical flop: 2*512*512*512=268,435,456
Measured flops= 6*512^3 =807,107,072
Sample Result for 1d array operation in 3 nested loop
Real_time: 1.282257 Proc_time: 1.155990 Total flpops:
536,872,000 MFLOPS: 464.426117
theoretical flop: 4n^3 = 536,870,912
Measured flop: 4n^3=4*512^3+overheads(other operation?)=536,872,000
I could not find any reason for the aforementioned behaviour?
Is my assumption true ?
Hope to make it much simpler than before description.
By practical I meant real flop measured by executing the code.
Code:
void countFlops() {
int size = 512;
int itr = 20;
float a[size][size];
float b[size][size];
float c[size][size];
/* float d[size];
float e[size];
float f[size];
float g[size];*/
float r = 3.6541;
float real_time, proc_time, mflops;
long long flpops;
float ireal_time, iproc_time, imflops;
long long iflpops;
int retval;
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
a[j][j] = b[j][j] = c[j][j] = 1.0125;
}
}
/* for (int i = 0; i < size; ++i) {
d[i]=e[i]=f[i]=g[i]=10.235;
}*/
if ((retval = PAPI_flops(&ireal_time, &iproc_time, &iflpops, &imflops))
< PAPI_OK) {
printf("Could not initialise PAPI_flops \n");
printf("Your platform may not support floating point operation event.\n");
printf("retval: %d\n", retval);
exit(1);
}
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
for (int k = 0; k < size; k+=16) {
c[i][j]=c[i][j]+a[i][k] * b[k][j];
}
}
}
/* for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
for (int k = 0; k < size; ++k) {
d[k]=d[k]+e[k]+f[k]+g[k]+r;
}
}
}*/
if ((retval = PAPI_flops(&real_time, &proc_time, &flpops, &mflops))
< PAPI_OK) {
printf("retval: %d\n", retval);
exit(1);
}
string flpops_tmp;
flpops_tmp = output_formatted_string(flpops);
printf(
"calculation: Real_time: %f Proc_time: %f Total flpops: %s MFLOPS: %f\n",
real_time, proc_time, flpops_tmp.c_str(), mflops);
}
thank you
If you need to count number of your operations - you can make simple class which acts like floating point value and gathers statistics. It will be interchangeable with builtin types.
LIVE DEMO:
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/operators.hpp>
#include <iostream>
#include <ostream>
#include <utility>
#include <cstddef>
#include <vector>
using namespace boost;
using namespace std;
class Statistic
{
size_t ops = 0;
public:
Statistic &increment()
{
++ops;
return *this;
}
size_t count() const
{
return ops;
}
};
template<typename Domain>
class Profiled: field_operators<Profiled<Domain>>
{
Domain value;
static vector<Statistic> stat;
void stat_increment()
{
stat.back().increment();
}
public:
struct StatisticScope
{
StatisticScope()
{
stat.emplace_back();
}
Statistic ¤t()
{
return stat.back();
}
~StatisticScope()
{
stat.pop_back();
}
};
template<typename ...Args>
Profiled(Args&& ...args)
: value{forward<Args>(args)...}
{}
Profiled& operator+=(const Profiled& x)
{
stat_increment();
value+=x.value;
return *this;
}
Profiled& operator-=(const Profiled& x)
{
stat_increment();
value-=x.value;
return *this;
}
Profiled& operator*=(const Profiled& x)
{
stat_increment();
value*=x.value;
return *this;
}
Profiled& operator/=(const Profiled& x)
{
stat_increment();
value/=x.value;
return *this;
}
};
template<typename Domain>
vector<Statistic> Profiled<Domain>::stat{1};
int main()
{
typedef Profiled<double> Float;
{
Float::StatisticScope s;
Float x = 1.0, y = 2.0, res = 0.0;
res = x+y*x+y;
cout << s.current().count() << endl;
}
{
using namespace numeric::ublas;
Float::StatisticScope s;
matrix<Float> x{10, 20},y{20,5},res{10,5};
res = prod(x,y);
cout << s.current().count() << endl;
}
}
Output is:
3
2000
P.S. Your matrix loop is not cache-friendly, and as the result very inefficient.
P.P.S
int size = 512;
float a[size][size];
This is not legal C++ code. C++ does not support VLA.
Related
I have a matrix (relatively big) that I need to transpose. For example assume that my matrix is
a b c d e f
g h i j k l
m n o p q r
I want the result be as follows:
a g m
b h n
c I o
d j p
e k q
f l r
What is the fastest way to do this?
This is a good question. There are many reason you would want to actually transpose the matrix in memory rather than just swap coordinates, e.g. in matrix multiplication and Gaussian smearing.
First let me list one of the functions I use for the transpose (EDIT: please see the end of my answer where I found a much faster solution)
void transpose(float *src, float *dst, const int N, const int M) {
#pragma omp parallel for
for(int n = 0; n<N*M; n++) {
int i = n/N;
int j = n%N;
dst[n] = src[M*j + i];
}
}
Now let's see why the transpose is useful. Consider matrix multiplication C = A*B. We could do it this way.
for(int i=0; i<N; i++) {
for(int j=0; j<K; j++) {
float tmp = 0;
for(int l=0; l<M; l++) {
tmp += A[M*i+l]*B[K*l+j];
}
C[K*i + j] = tmp;
}
}
That way, however, is going to have a lot of cache misses. A much faster solution is to take the transpose of B first
transpose(B);
for(int i=0; i<N; i++) {
for(int j=0; j<K; j++) {
float tmp = 0;
for(int l=0; l<M; l++) {
tmp += A[M*i+l]*B[K*j+l];
}
C[K*i + j] = tmp;
}
}
transpose(B);
Matrix multiplication is O(n^3) and the transpose is O(n^2), so taking the transpose should have a negligible effect on the computation time (for large n). In matrix multiplication loop tiling is even more effective than taking the transpose but that's much more complicated.
I wish I knew a faster way to do the transpose (Edit: I found a faster solution, see the end of my answer). When Haswell/AVX2 comes out in a few weeks it will have a gather function. I don't know if that will be helpful in this case but I could image gathering a column and writing out a row. Maybe it will make the transpose unnecessary.
For Gaussian smearing what you do is smear horizontally and then smear vertically. But smearing vertically has the cache problem so what you do is
Smear image horizontally
transpose output
Smear output horizontally
transpose output
Here is a paper by Intel explaining that
http://software.intel.com/en-us/articles/iir-gaussian-blur-filter-implementation-using-intel-advanced-vector-extensions
Lastly, what I actually do in matrix multiplication (and in Gaussian smearing) is not take exactly the transpose but take the transpose in widths of a certain vector size (e.g. 4 or 8 for SSE/AVX). Here is the function I use
void reorder_matrix(const float* A, float* B, const int N, const int M, const int vec_size) {
#pragma omp parallel for
for(int n=0; n<M*N; n++) {
int k = vec_size*(n/N/vec_size);
int i = (n/vec_size)%N;
int j = n%vec_size;
B[n] = A[M*i + k + j];
}
}
EDIT:
I tried several function to find the fastest transpose for large matrices. In the end the fastest result is to use loop blocking with block_size=16 (Edit: I found a faster solution using SSE and loop blocking - see below). This code works for any NxM matrix (i.e. the matrix does not have to be square).
inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size) {
#pragma omp parallel for
for(int i=0; i<block_size; i++) {
for(int j=0; j<block_size; j++) {
B[j*ldb + i] = A[i*lda +j];
}
}
}
inline void transpose_block(float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size) {
#pragma omp parallel for
for(int i=0; i<n; i+=block_size) {
for(int j=0; j<m; j+=block_size) {
transpose_scalar_block(&A[i*lda +j], &B[j*ldb + i], lda, ldb, block_size);
}
}
}
The values lda and ldb are the width of the matrix. These need to be multiples of the block size. To find the values and allocate the memory for e.g. a 3000x1001 matrix I do something like this
#define ROUND_UP(x, s) (((x)+((s)-1)) & -(s))
const int n = 3000;
const int m = 1001;
int lda = ROUND_UP(m, 16);
int ldb = ROUND_UP(n, 16);
float *A = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
float *B = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
For 3000x1001 this returns ldb = 3008 and lda = 1008
Edit:
I found an even faster solution using SSE intrinsics:
inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
__m128 row1 = _mm_load_ps(&A[0*lda]);
__m128 row2 = _mm_load_ps(&A[1*lda]);
__m128 row3 = _mm_load_ps(&A[2*lda]);
__m128 row4 = _mm_load_ps(&A[3*lda]);
_MM_TRANSPOSE4_PS(row1, row2, row3, row4);
_mm_store_ps(&B[0*ldb], row1);
_mm_store_ps(&B[1*ldb], row2);
_mm_store_ps(&B[2*ldb], row3);
_mm_store_ps(&B[3*ldb], row4);
}
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
#pragma omp parallel for
for(int i=0; i<n; i+=block_size) {
for(int j=0; j<m; j+=block_size) {
int max_i2 = i+block_size < n ? i + block_size : n;
int max_j2 = j+block_size < m ? j + block_size : m;
for(int i2=i; i2<max_i2; i2+=4) {
for(int j2=j; j2<max_j2; j2+=4) {
transpose4x4_SSE(&A[i2*lda +j2], &B[j2*ldb + i2], lda, ldb);
}
}
}
}
}
This is going to depend on your application but in general the fastest way to transpose a matrix would be to invert your coordinates when you do a look up, then you do not have to actually move any data.
Some details about transposing 4x4 square float (I will discuss 32-bit integer later) matrices with x86 hardware. It's helpful to start here in order to transpose larger square matrices such as 8x8 or 16x16.
_MM_TRANSPOSE4_PS(r0, r1, r2, r3) is implemented differently by different compilers. GCC and ICC (I have not checked Clang) use unpcklps, unpckhps, unpcklpd, unpckhpd whereas MSVC uses only shufps. We can actually combine these two approaches together like this.
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);
r0 = _mm_shuffle_ps(t0,t2, 0x44);
r1 = _mm_shuffle_ps(t0,t2, 0xEE);
r2 = _mm_shuffle_ps(t1,t3, 0x44);
r3 = _mm_shuffle_ps(t1,t3, 0xEE);
One interesting observation is that two shuffles can be converted to one shuffle and two blends (SSE4.1) like this.
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);
v = _mm_shuffle_ps(t0,t2, 0x4E);
r0 = _mm_blend_ps(t0,v, 0xC);
r1 = _mm_blend_ps(t2,v, 0x3);
v = _mm_shuffle_ps(t1,t3, 0x4E);
r2 = _mm_blend_ps(t1,v, 0xC);
r3 = _mm_blend_ps(t3,v, 0x3);
This effectively converted 4 shuffles into 2 shuffles and 4 blends. This uses 2 more instructions than the implementation of GCC, ICC, and MSVC. The advantage is that it reduces port pressure which may have a benefit in some circumstances.
Currently all the shuffles and unpacks can go only to one particular port whereas the blends can go to either of two different ports.
I tried using 8 shuffles like MSVC and converting that into 4 shuffles + 8 blends but it did not work. I still had to use 4 unpacks.
I used this same technique for a 8x8 float transpose (see towards the end of that answer).
https://stackoverflow.com/a/25627536/2542702. In that answer I still had to use 8 unpacks but I manged to convert the 8 shuffles into 4 shuffles and 8 blends.
For 32-bit integers there is nothing like shufps (except for 128-bit shuffles with AVX512) so it can only be implemented with unpacks which I don't think can be convert to blends (efficiently). With AVX512 vshufi32x4 acts effectively like shufps except for 128-bit lanes of 4 integers instead of 32-bit floats so this same technique might be possibly with vshufi32x4 in some cases. With Knights Landing shuffles are four times slower (throughput) than blends.
If the size of the arrays are known prior then we could use the union to our help. Like this-
#include <bits/stdc++.h>
using namespace std;
union ua{
int arr[2][3];
int brr[3][2];
};
int main() {
union ua uav;
int karr[2][3] = {{1,2,3},{4,5,6}};
memcpy(uav.arr,karr,sizeof(karr));
for (int i=0;i<3;i++)
{
for (int j=0;j<2;j++)
cout<<uav.brr[i][j]<<" ";
cout<<'\n';
}
return 0;
}
Consider each row as a column, and each column as a row .. use j,i instead of i,j
demo: http://ideone.com/lvsxKZ
#include <iostream>
using namespace std;
int main ()
{
char A [3][3] =
{
{ 'a', 'b', 'c' },
{ 'd', 'e', 'f' },
{ 'g', 'h', 'i' }
};
cout << "A = " << endl << endl;
// print matrix A
for (int i=0; i<3; i++)
{
for (int j=0; j<3; j++) cout << A[i][j];
cout << endl;
}
cout << endl << "A transpose = " << endl << endl;
// print A transpose
for (int i=0; i<3; i++)
{
for (int j=0; j<3; j++) cout << A[j][i];
cout << endl;
}
return 0;
}
transposing without any overhead (class not complete):
class Matrix{
double *data; //suppose this will point to data
double _get1(int i, int j){return data[i*M+j];} //used to access normally
double _get2(int i, int j){return data[j*N+i];} //used when transposed
public:
int M, N; //dimensions
double (*get_p)(int, int); //functor to access elements
Matrix(int _M,int _N):M(_M), N(_N){
//allocate data
get_p=&Matrix::_get1; // initialised with normal access
}
double get(int i, int j){
//there should be a way to directly use get_p to call. but i think even this
//doesnt incur overhead because it is inline and the compiler should be intelligent
//enough to remove the extra call
return (this->*get_p)(i,j);
}
void transpose(){ //twice transpose gives the original
if(get_p==&Matrix::get1) get_p=&Matrix::_get2;
else get_p==&Matrix::_get1;
swap(M,N);
}
}
can be used like this:
Matrix M(100,200);
double x=M.get(17,45);
M.transpose();
x=M.get(17,45); // = original M(45,17)
of course I didn't bother with the memory management here, which is crucial but different topic.
template <class T>
void transpose( const std::vector< std::vector<T> > & a,
std::vector< std::vector<T> > & b,
int width, int height)
{
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
b[j][i] = a[i][j];
}
}
}
Modern linear algebra libraries include optimized versions of the most common operations. Many of them include dynamic CPU dispatch, which chooses the best implementation for the hardware at program execution time (without compromising on portability).
This is commonly a better alternative to performing manual optimization of your functinos via vector extensions intrinsic functions. The latter will tie your implementation to a particular hardware vendor and model: if you decide to swap to a different vendor (e.g. Power, ARM) or to a newer vector extensions (e.g. AVX512), you will need to re-implement it again to get the most of them.
MKL transposition, for example, includes the BLAS extensions function imatcopy. You can find it in other implementations such as OpenBLAS as well:
#include <mkl.h>
void transpose( float* a, int n, int m ) {
const char row_major = 'R';
const char transpose = 'T';
const float alpha = 1.0f;
mkl_simatcopy (row_major, transpose, n, m, alpha, a, n, n);
}
For a C++ project, you can make use of the Armadillo C++:
#include <armadillo>
void transpose( arma::mat &matrix ) {
arma::inplace_trans(matrix);
}
intel mkl suggests in-place and out-of-place transposition/copying matrices. here is the link to the documentation. I would recommend trying out of place implementation as faster ten in-place and into the documentation of the latest version of mkl contains some mistakes.
I think that most fast way should not taking higher than O(n^2) also in this way you can use just O(1) space :
the way to do that is to swap in pairs because when you transpose a matrix then what you do is: M[i][j]=M[j][i] , so store M[i][j] in temp, then M[i][j]=M[j][i],and the last step : M[j][i]=temp. this could be done by one pass so it should take O(n^2)
my answer is transposed of 3x3 matrix
#include<iostream.h>
#include<math.h>
main()
{
int a[3][3];
int b[3];
cout<<"You must give us an array 3x3 and then we will give you Transposed it "<<endl;
for(int i=0;i<3;i++)
{
for(int j=0;j<3;j++)
{
cout<<"Enter a["<<i<<"]["<<j<<"]: ";
cin>>a[i][j];
}
}
cout<<"Matrix you entered is :"<<endl;
for (int e = 0 ; e < 3 ; e++ )
{
for ( int f = 0 ; f < 3 ; f++ )
cout << a[e][f] << "\t";
cout << endl;
}
cout<<"\nTransposed of matrix you entered is :"<<endl;
for (int c = 0 ; c < 3 ; c++ )
{
for ( int d = 0 ; d < 3 ; d++ )
cout << a[d][c] << "\t";
cout << endl;
}
return 0;
}
Inside a performance-critical, parallel code I have a vector whose elements are:
Very expensive to compute, and the result is deterministic (the value of the element at a given position will depend on the position only)
Random access (typically the number of accesses are larger or much larger than the size of the vector)
Clustered accesses (many accesses request the same value)
The vector is shared by different threads (race condition?)
To avoid heap defragmention, the object should never be recreated, but whenever possible resetted and recycled
The value to be placed in the vector will be provided by a polymorphic object
Currently, I precompute all possible values of the vectors, so race condition should not be an issue.
In order to improve performances, I am considering to create a lazy vector, such that the code performs computations only when the element of the vector is requested.
In a parallel region, it might happen that more than one thread are requesting, and perhaps calculating, the same element at the same time.
How do I take care of this possible race condition?
Below is an example of what I want to achieve. It compiles and runs properly under Windows 10, Visual Studio 17. I use C++17.
// Lazy.cpp : Defines the entry point for the console application.
#include "stdafx.h"
#include <vector>
#include <iostream>
#include <stdlib.h>
#include <chrono>
#include <math.h>
const double START_SUM = 1;
const double END_SUM = 1000;
//base object responsible for providing the values
class Evaluator
{
public:
Evaluator() {};
~Evaluator() {};
//Function with deterministic output, depending on the position
virtual double expensiveFunction(int pos) const = 0;
};
//
class EvaluatorA: public Evaluator
{
public:
//expensive evaluation
virtual double expensiveFunction(int pos) const override {
double t = 0;
for (int j = START_SUM; j++ < END_SUM; j++)
t += log(exp(log(exp(log(j + pos)))));
return t;
}
EvaluatorA() {};
~EvaluatorA() {};
};
class EvaluatorB : public Evaluator
{
public:
//even more expensive evaluation
virtual double expensiveFunction(int pos) const override {
double t = 0;
for (int j = START_SUM; j++ < 10*END_SUM; j++)
t += log(exp(log(exp(log(j + pos)))));
return t;
}
EvaluatorB() {};
~EvaluatorB() {};
};
class LazyVectorTest //vector that contains N possible results
{
public:
LazyVectorTest(int N,const Evaluator & eval) : N(N), innerContainer(N, 0), isThatComputed(N, false), eval_ptr(&eval)
{};
~LazyVectorTest() {};
//reset, to generate a new table of values
//the size of the vector stays constant
void reset(const Evaluator & eval) {
this->eval_ptr = &eval;
for (int i = 0; i<N; i++)
isThatComputed[i] = false;
}
int size() { return N; }
//accessing the same position should yield the same result
//unless the object is resetted
const inline double& operator[](int pos) {
if (!isThatComputed[pos]) {
innerContainer[pos] = eval_ptr->expensiveFunction(pos);
isThatComputed[pos] = true;
}
return innerContainer[pos];
}
private:
const int N;
const Evaluator* eval_ptr;
std::vector<double> innerContainer;
std::vector<bool> isThatComputed;
};
//the parallel access will take place here
template <typename T>
double accessingFunction(T& A, const std::vector<int>& elementsToAccess) {
double tsum = 0;
int size = elementsToAccess.size();
//#pragma omp parallel for
for (int i = 0; i < size; i++)
tsum += A[elementsToAccess[i]];
return tsum;
}
std::vector<int> randomPos(int sizePos, int N) {
std::vector<int> elementsToAccess;
for (int i = 0; i < sizePos; i++)
elementsToAccess.push_back(rand() % N);
return elementsToAccess;
}
int main()
{
srand(time(0));
int minAccessNumber = 1;
int maxAccessNumber = 100;
int sizeVector = 50;
auto start = std::chrono::steady_clock::now();
double res = 0;
float numberTest = 100;
typedef LazyVectorTest container;
EvaluatorA eval;
for (int i = 0; i < static_cast<int>(numberTest); i++) {
res = eval.expensiveFunction(i);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli>diff(end - start);
double benchmark = diff.count() / numberTest;
std::cout <<"Average time to compute expensive function:" <<benchmark<<" ms"<<std::endl;
std::cout << "Value of the function:" << res<< std::endl;
std::vector<std::vector<int>> indexs(numberTest);
container A(sizeVector, eval);
for (int accessNumber = minAccessNumber; accessNumber < maxAccessNumber; accessNumber++) {
indexs.clear();
for (int i = 0; i < static_cast<int>(numberTest); i++) {
indexs.emplace_back(randomPos(accessNumber, sizeVector));
}
auto start_lazy = std::chrono::steady_clock::now();
for (int i = 0; i < static_cast<int>(numberTest); i++) {
A.reset(eval);
double res_lazy = accessingFunction(A, indexs[i]);
}
auto end_lazy = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli>diff_lazy(end_lazy - start_lazy);
std::cout << accessNumber << "," << diff_lazy.count() / numberTest << ", " << diff_lazy.count() / (numberTest* benchmark) << std::endl;
}
return 0;
}
Rather than roll you own locking, I'd first see if you get acceptable performance with std::call_once.
class LazyVectorTest //vector that contains N possible results
{
//Function with deterministic output, depending on the position
void expensiveFunction(int pos) {
double t = 0;
for (int j = START_SUM; j++ < END_SUM; j++)
t += log(exp(log(exp(log(j+pos)))));
values[pos] = t;
}
public:
LazyVectorTest(int N) : values(N), flags(N)
{};
int size() { return values.size(); }
//accessing the same position should yield the same result
double operator[](int pos) {
std::call_once(flags[pos], &LazyVectorTest::expensiveFunction, this, pos);
return values[pos];
}
private:
std::vector<double> values;
std::vector<std::once_flag> flags;
};
call_once is pretty transparent. It allows exactly one thread to run a function to completion. The only potential drawback is that it will block a second thread waiting for a possible exception, rather than immediately do nothing. In this case that is desirable, as you want the modification values[pos] = t; to be sequenced before the read return values[pos];
Your current code is problematic, mainly because of std::vector<bool> being horrible, but also atomicity and memory consistency is missing. Here is the sketch of a solution based entirely on OpenMP. I would suggest to actually special marker for missing entries instead of a separate vector<bool> - it makes everything much easier:
class LazyVectorTest //vector that contains N possible results
{
public:
LazyVectorTest(int N,const Evaluator & eval) : N(N), innerContainer(N, invalid), eval_ptr(&eval)
{};
~LazyVectorTest() {};
//reset, to generate a new table of values
//the size of the vector stays constant
void reset(const Evaluator & eval) {
this->eval_ptr = &eval;
for (int i = 0; i<N; i++) {
// Use atomic if that could possible be done in parallel
// omit that for performance if you doun't ever run it in parallel
#pragma omp atomic write
innerContainer[i] = invalid;
}
// Flush to make sure invalidation is visible to all threads
#pragma omp flush
}
int size() { return N; }
// Don't return a reference here
double operator[] (int pos) {
double value;
#pragma omp atomic read
value = innerContainer[pos];
if (value == invalid) {
value = eval_ptr->expensiveFunction(pos);
#pragma omp atomic write
innerContainer[pos] = value;
}
return value;
}
private:
// Use nan, inf or some random number - doesn't really matter
static constexpr double invalid = std::nan("");
const int N;
const Evaluator* eval_ptr;
std::vector<double> innerContainer;
};
In case of a collision, the other threads will just redundantly compute the value. - exploiting the deterministic nature. My using omp atomic on both read and write of the elements, you ensure that no inconsistent "half-written" values are ever read.
This solution may create some additional latency for the rare bad cases. In turn, the good cases are optimal, with just a single atomic read. You don't even need any memory flushes / seq_cst - worst case is a redundant computation. You would need these (sequential consistency) if you write the flag and value separately, to ensure the order in which the changes becomes visible is correct.
I'm having trouble creating a Matrix using Vectors. The rows and columns will all be random doubles and I am trying to fill it in.
I have tried to initialize the size of the vector before hand, I believe it initializes correctly, however, when I try to push onto the rows and columns I get very odd output. Here is my Header File:
#ifndef MATRIX_NORM_HPP
#define MATRIX_NORM_HPP
#include <vector>
class MatrixNorm
{
public:
void initProgram();
void printResults();
double randNumber();
private:
std::vector<std::vector<double>> M;
double mNorm1 = 0.0;
double mNormInf = 0.0;
};
#endif
Here is my CPP File:
#include "matrixNorm.hpp"
#include <iostream>
#include <random>
void initProgram()
{
double ranNum = 0.0;
int size = 0;
std::cout << "Please enter a size of an n by n Matrix: ";
std::cin >> size;
std::vector<std::vector<double>> temp(size, std::vector<double>(size));
for(int i = 0; i < size; ++i)
{
for(int j = 0; j < size; ++j)
{
ranNum = randNumber();
temp[i].push_back(ranNum);
temp[j].push_back(ranNum);
}
}
M = temp;
printResults();
}
void MatrixNorm::printResults()
{
for(auto &&e: M)
{
for(auto && f: e)
{
std::cout << f << " ";
}
std::cout << std::endl;
}
}
double MatrixNorm::randNumber()
{
double ranDouble = 0.0;
std::random_device rd;
std::default_random_engine generator(rd());
std::uniform_real_distribution<double> unif(-1000.0,1000.0);
ranDouble = unif(generator);
return ranDouble;
}
The output I receive when I run the program from main.cpp is:
Please enter a size of an n by n Matrix: 3
0 0 0 792.208 792.208 -361.248 -776.871 742.521 116.732
0 0 0 -361.248 742.521 411.965 411.965 909.313 -50.0048
0 0 0 -776.871 909.313 116.732 -50.0048 79.6189 79.6189
As you can see, it seems to get the column size correctly, but it does not get the row size correctly, and if you look very closely. Some of the numbers are duplicates, I wish I knew how to format it more clearly but if you start at the top left you see 792.208 792.208 then go down a row and you see 411.965 411.965 and last it finishes off at 79.6189 79.6189 in the lower right.
What am I doing wrong? How do I do this correctly? Any help would be appreciated.
Seems to me that the correct way to initialize your matrix is:
(...)
std::vector<std::vector<double>> temp;
for(int i = 0; i < size; ++i)
{
std::vector<double> k;
for(int j = 0; j < size; ++j)
{
ranNum = randNumber();
k.emplace_back(ranNum);
}
temp.emplace_back(k);
}
(...)
Explanation:
with this constructor:
std::vector<std::vector<double>> temp(size, std::vector<double>(size));
you are creating size copies of default vector constructed of size elements (std::vector<double>(size)). In other words, you have a size x size matrix.
So, instead of pushing new values in your code, you should be changing it. In the code I proposed, it is just simpler to populated this matrix when you are creating it.
I want to output my histogram using the fewest amount of for loops possible
int* histogram(int size, int* arr)
{
int bin[10] = {};
for (int i = 0; i < size; i++)
{
if (arr[i] >= 0 && arr[i] < 10)
{
bin[0]++;
}
else if (arr[i] >= 10 && arr[i] < 20)
{
bin[1]++;
}
return bin;
}
Currently I am outputting the histogram like this:
cout << "0|";
for (int j = 0; j < bin[0]; j++)
cout << "*";
cout << endl;
But this is long and annoying. Is there a way to achieve the same output in fewer
for loops?
I am going to ignore the bugs in your histogram code, as it isn't really relevant to the question of optimising histogram output.
For information on the bug (returning a local variable), check out this Stack Overflow question.
Also, you are missing a curly brace. Always check that your code compiles and runs in its most minimalist form before posting it.
You state that the problem is that the method you use is "long and annoying", but it isn't clear if you are referring to the design of your code or the speed at which it performs.
Performance
The fastest you can possibly read the histogram is with O(n), where n is the number of bins in the histogram. In this sense your code is about as fast as it can get without micro-optimising it.
If you include the printing out of your histogram, then you have O(n*m), where m is the average number of entries per bin.
Writing a histogram is also O(n*k), where k is the number of entries in your array, because you have to figure out which bin each value belongs in.
Design
If the problem you have is that the code is bloated and unwieldy, then use less magic numbers and add more arguments to the function, like this:
#include <iostream>
void histogram(int const size, int const * const arr, unsigned int const number_of_bins, float const bin_min, float const bin_max, int * output)
{
float const binsize = (bin_max - bin_min)/number_of_bins;
for (int i = 0; i < size; i++)
{
for(int j = 0; j < number_of_bins; ++j)
{
if (arr[i] >= bin_min + binsize*j && arr[i] < bin_min + binsize*(j+1))
{
output[j]++;
}
}
}
}
int main(){
int const number_of_bins = 10;
float const bin_min = 0;
float const bin_max = 100;
int const size = 20;
int const array[size] = {5,6,20,40,44,50,110,6,-1,51,55,56,20,50,60,80,81,0,32,3};
int bin[number_of_bins] = {};
histogram(size, array, number_of_bins, bin_min, bin_max, bin);
for(int i = 0; i < number_of_bins; ++i)
{
std::cout << i << "|";
for (int j = 0; j < bin[i]; j++)
{
std::cout << "*";
}
std::cout << std::endl;
}
}
Compiled with:
g++ main.cc -o Output
Output:
0|*****
1|
2|**
3|*
4|**
5|*****
6|*
7|
8|**
9|
(Bonus, your bugs are fixed)
First of all your program is incorrect since, as pointed out, you return a pointer to a local variable form a function. To correct this you should use either std::array<Type, Size> or std::vector<Type>.
Regarding your question if you want short and compact code try this:
#include <string>
#include <algorithm>
#include <iostream>
#include <array>
std::array<int, 10> bin;
// Fill your array here
int i = 0;
std::for_each(bin.begin(), bin.end(), [&i](auto x)
{
std::cout << i++ << "|" << std::string(x, '*') << std::endl;
});
This code takes advantage of fill constructor of std::string which avoids your for cycle. But since you want to iterate through the array you need to do it in one way or the other. Either by an explicit for or by calling another function.
Note: this code is less efficient than a standard for loop but your question is how to avoid these.
Given two strings string X of length x1 and string Y of length y1, find the longest sequence of characters that appear left to right (but not necessarily in contiguous block) in both strings.
e.g if X = ABCBDAB and Y = BDCABA, the LCS(X,Y) = {"BCBA","BDAB","BCAB"} and LCSlength is 4.
I used the standard solution for this problem:
if(X[i]=Y[j]) :1+LCS(i+1,j+1)
if(X[i]!=Y[j]) :LCS(i,j+1) or LCS(i+1,j), whichever is greater
and then I used memorization, making it a standard DP problem.
#include<iostream>
#include<string>
using namespace std;
int LCS[1024][1024];
int LCSlen(string &x, int x1, string &y, int y1){
for(int i = 0; i <= x1; i++)
LCS[i][y1] = 0;
for(int j = 0; j <= y1; j++)
LCS[x1][j] = 0;
for(int i = x1 - 1; i >= 0; i--){
for(int j = y1 - 1; j >= 0; j--){
LCS[i][j] = LCS[i+1][j+1];
if(x[i] == y[j])
LCS[i][j]++;
if(LCS[i][j+1] > LCS[i][j])
LCS[i][j] = LCS[i][j+1];
if(LCS[i+1][j] > LCS[i][j])
LCS[i][j] = LCS[i+1][j];
}
}
return LCS[0][0];
}
int main()
{
string x;
string y;
cin >> x >> y;
int x1 = x.length() , y1 = y.length();
int ans = LCSlen( x, x1, y, y1);
cout << ans << endl;
return 0;
}
Running here, this solution I used in SPOJ and I got a time limit exceeded and/or runtime error.
Only 14 user solutions are yet accepted. Is there a smarter trick to decrease the time complexity of this question?
LCS is a classical, well studied computer science problem, and for the case with two sequences it is known that its lower bound is O(n·m).
Furthermore, your algorithm implementation has no obvious efficiency bugs, so it should run close to as fast as possible (although it may be beneficial to use a dynamically sized 2D matrix rather than an oversized one, which takes up 4 MiB of memory, and will require frequent cache invalidation (which is a costly operation, since it causes a transfer from main memory to the processor cache, which is several orders of magnitude slower than cached memory access).
In terms of algorithm, in order to lower the theoretical bound you need to exploit specifics of your input structure: for instance, if you are searching one of the strings repeatedly, it may pay to build a search index which takes some processing time, but will make the actual search much faster. Two classical variants of that are the suffix array and the suffix tree.
If it is known that at least one of your strings is very short (< 64 characters) you can use Myers’ bit vector algorithm, which performs much faster. Unfortunately the algorithm is far from trivial to implement. There exists an implementation in the SeqAn library, but using the library itself has a steep learning curve.
(As a matter of interest, this algorithm finds frequent application in bioinformatics, and has been used during the sequence assembly in the Human Genome Project.)
Although I still didn't get an AC because of time limit exceeded ,I was however able to implement the linear space algorithm.In case anyone wants to see, here is the c++ implementation of the Hirschbirg algorithm.
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <cstring>
#include <string>
#include <cstdio>
using namespace std;
int* compute_help_table(const string & A,const string & B);
string lcs(const string & A, const string & B);
string simple_solution(const string & A, const string & B);
int main(void) {
string A,B;
cin>>A>>B;
cout << lcs(A, B).size() << endl;
return 0;
}
string lcs(const string &A, const string &B) {
int m = A.size();
int n = B.size();
if (m == 0 || n == 0) {
return "";
}
else if(m == 1) {
return simple_solution(A, B);
}
else if(n == 1) {
return simple_solution(B, A);
}
else {
int i = m / 2;
string Asubstr = A.substr(i, m - i);
//reverse(Asubstr.begin(), Asubstr.end());
string Brev = B;
reverse(Brev.begin(), Brev.end());
int* L1 = compute_help_table(A.substr(0, i), B);
int* L2 = compute_help_table(Asubstr, Brev);
int k;
int M = -1;
for(int j = 0; j <= n; j++) {
if(M < L1[j] + L2[n-j]) {
M = L1[j] + L2[n-j];
k = j;
}
}
delete [] L1;
delete [] L2;
return lcs(A.substr(0, i), B.substr(0, k)) + lcs(A.substr(i, m - i), B.substr(k, n - k));
}
}
int* compute_help_table(const string &A, const string &B) {
int m = A.size();
int n = B.size();
int* first = new int[n+1];
int* second = new int[n+1];
for(int i = 0; i <= n; i++) {
second[i] = 0;
}
for(int i = 0; i < m; i++) {
for(int k = 0; k <= n; k++) {
first[k] = second[k];
}
for(int j = 0; j < n; j++) {
if(j == 0) {
if (A[i] == B[j])
second[1] = 1;
}
else {
if(A[i] == B[j]) {
second[j+1] = first[j] + 1;
}
else {
second[j+1] = max(second[j], first[j+1]);
}
}
}
}
delete [] first;
return second;
}
string simple_solution(const string & A, const string & B) {
int i = 0;
for(; i < B.size(); i++) {
if(B.at(i) == A.at(0))
return A;
}
return "";
}
Running here.
If the two strings share a common prefix (e.g. "ABCD" and "ABXY" share "AB") then that will be part of the LCS. Same for common suffixes. So for some pairs of strings you can gain some speed by skipping over the longest common prefix and longest common suffix before starting the DP algorithm; this doesn't change the worst-case bounds, but it changes the best case complexity to linear time and constant space.