Lazy vector access in parallel loops - c++

Inside a performance-critical, parallel code I have a vector whose elements are:
Very expensive to compute, and the result is deterministic (the value of the element at a given position will depend on the position only)
Random access (typically the number of accesses are larger or much larger than the size of the vector)
Clustered accesses (many accesses request the same value)
The vector is shared by different threads (race condition?)
To avoid heap defragmention, the object should never be recreated, but whenever possible resetted and recycled
The value to be placed in the vector will be provided by a polymorphic object
Currently, I precompute all possible values of the vectors, so race condition should not be an issue.
In order to improve performances, I am considering to create a lazy vector, such that the code performs computations only when the element of the vector is requested.
In a parallel region, it might happen that more than one thread are requesting, and perhaps calculating, the same element at the same time.
How do I take care of this possible race condition?
Below is an example of what I want to achieve. It compiles and runs properly under Windows 10, Visual Studio 17. I use C++17.
// Lazy.cpp : Defines the entry point for the console application.
#include "stdafx.h"
#include <vector>
#include <iostream>
#include <stdlib.h>
#include <chrono>
#include <math.h>
const double START_SUM = 1;
const double END_SUM = 1000;
//base object responsible for providing the values
class Evaluator
{
public:
Evaluator() {};
~Evaluator() {};
//Function with deterministic output, depending on the position
virtual double expensiveFunction(int pos) const = 0;
};
//
class EvaluatorA: public Evaluator
{
public:
//expensive evaluation
virtual double expensiveFunction(int pos) const override {
double t = 0;
for (int j = START_SUM; j++ < END_SUM; j++)
t += log(exp(log(exp(log(j + pos)))));
return t;
}
EvaluatorA() {};
~EvaluatorA() {};
};
class EvaluatorB : public Evaluator
{
public:
//even more expensive evaluation
virtual double expensiveFunction(int pos) const override {
double t = 0;
for (int j = START_SUM; j++ < 10*END_SUM; j++)
t += log(exp(log(exp(log(j + pos)))));
return t;
}
EvaluatorB() {};
~EvaluatorB() {};
};
class LazyVectorTest //vector that contains N possible results
{
public:
LazyVectorTest(int N,const Evaluator & eval) : N(N), innerContainer(N, 0), isThatComputed(N, false), eval_ptr(&eval)
{};
~LazyVectorTest() {};
//reset, to generate a new table of values
//the size of the vector stays constant
void reset(const Evaluator & eval) {
this->eval_ptr = &eval;
for (int i = 0; i<N; i++)
isThatComputed[i] = false;
}
int size() { return N; }
//accessing the same position should yield the same result
//unless the object is resetted
const inline double& operator[](int pos) {
if (!isThatComputed[pos]) {
innerContainer[pos] = eval_ptr->expensiveFunction(pos);
isThatComputed[pos] = true;
}
return innerContainer[pos];
}
private:
const int N;
const Evaluator* eval_ptr;
std::vector<double> innerContainer;
std::vector<bool> isThatComputed;
};
//the parallel access will take place here
template <typename T>
double accessingFunction(T& A, const std::vector<int>& elementsToAccess) {
double tsum = 0;
int size = elementsToAccess.size();
//#pragma omp parallel for
for (int i = 0; i < size; i++)
tsum += A[elementsToAccess[i]];
return tsum;
}
std::vector<int> randomPos(int sizePos, int N) {
std::vector<int> elementsToAccess;
for (int i = 0; i < sizePos; i++)
elementsToAccess.push_back(rand() % N);
return elementsToAccess;
}
int main()
{
srand(time(0));
int minAccessNumber = 1;
int maxAccessNumber = 100;
int sizeVector = 50;
auto start = std::chrono::steady_clock::now();
double res = 0;
float numberTest = 100;
typedef LazyVectorTest container;
EvaluatorA eval;
for (int i = 0; i < static_cast<int>(numberTest); i++) {
res = eval.expensiveFunction(i);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli>diff(end - start);
double benchmark = diff.count() / numberTest;
std::cout <<"Average time to compute expensive function:" <<benchmark<<" ms"<<std::endl;
std::cout << "Value of the function:" << res<< std::endl;
std::vector<std::vector<int>> indexs(numberTest);
container A(sizeVector, eval);
for (int accessNumber = minAccessNumber; accessNumber < maxAccessNumber; accessNumber++) {
indexs.clear();
for (int i = 0; i < static_cast<int>(numberTest); i++) {
indexs.emplace_back(randomPos(accessNumber, sizeVector));
}
auto start_lazy = std::chrono::steady_clock::now();
for (int i = 0; i < static_cast<int>(numberTest); i++) {
A.reset(eval);
double res_lazy = accessingFunction(A, indexs[i]);
}
auto end_lazy = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli>diff_lazy(end_lazy - start_lazy);
std::cout << accessNumber << "," << diff_lazy.count() / numberTest << ", " << diff_lazy.count() / (numberTest* benchmark) << std::endl;
}
return 0;
}

Rather than roll you own locking, I'd first see if you get acceptable performance with std::call_once.
class LazyVectorTest //vector that contains N possible results
{
//Function with deterministic output, depending on the position
void expensiveFunction(int pos) {
double t = 0;
for (int j = START_SUM; j++ < END_SUM; j++)
t += log(exp(log(exp(log(j+pos)))));
values[pos] = t;
}
public:
LazyVectorTest(int N) : values(N), flags(N)
{};
int size() { return values.size(); }
//accessing the same position should yield the same result
double operator[](int pos) {
std::call_once(flags[pos], &LazyVectorTest::expensiveFunction, this, pos);
return values[pos];
}
private:
std::vector<double> values;
std::vector<std::once_flag> flags;
};
call_once is pretty transparent. It allows exactly one thread to run a function to completion. The only potential drawback is that it will block a second thread waiting for a possible exception, rather than immediately do nothing. In this case that is desirable, as you want the modification values[pos] = t; to be sequenced before the read return values[pos];

Your current code is problematic, mainly because of std::vector<bool> being horrible, but also atomicity and memory consistency is missing. Here is the sketch of a solution based entirely on OpenMP. I would suggest to actually special marker for missing entries instead of a separate vector<bool> - it makes everything much easier:
class LazyVectorTest //vector that contains N possible results
{
public:
LazyVectorTest(int N,const Evaluator & eval) : N(N), innerContainer(N, invalid), eval_ptr(&eval)
{};
~LazyVectorTest() {};
//reset, to generate a new table of values
//the size of the vector stays constant
void reset(const Evaluator & eval) {
this->eval_ptr = &eval;
for (int i = 0; i<N; i++) {
// Use atomic if that could possible be done in parallel
// omit that for performance if you doun't ever run it in parallel
#pragma omp atomic write
innerContainer[i] = invalid;
}
// Flush to make sure invalidation is visible to all threads
#pragma omp flush
}
int size() { return N; }
// Don't return a reference here
double operator[] (int pos) {
double value;
#pragma omp atomic read
value = innerContainer[pos];
if (value == invalid) {
value = eval_ptr->expensiveFunction(pos);
#pragma omp atomic write
innerContainer[pos] = value;
}
return value;
}
private:
// Use nan, inf or some random number - doesn't really matter
static constexpr double invalid = std::nan("");
const int N;
const Evaluator* eval_ptr;
std::vector<double> innerContainer;
};
In case of a collision, the other threads will just redundantly compute the value. - exploiting the deterministic nature. My using omp atomic on both read and write of the elements, you ensure that no inconsistent "half-written" values are ever read.
This solution may create some additional latency for the rare bad cases. In turn, the good cases are optimal, with just a single atomic read. You don't even need any memory flushes / seq_cst - worst case is a redundant computation. You would need these (sequential consistency) if you write the flag and value separately, to ensure the order in which the changes becomes visible is correct.

Related

Avoiding struct downcasting when passing to a function (C++)

I have a set of Arguments defined as struct for a set of operations (mean, minmax etc.)
struct Arguments {
double *data;
int num_data;
Arguments(double *data, int num_data) : data(data), num_data(num_data) {}
};
struct MeanOperationArguments: Arguments {
MeanOperationArguments(double *data, int num_data) : Arguments(data, num_data) {}
};
struct MinmaxOperationArguments: Arguments {
bool is_min_op;
MinmaxOperationArguments(double *data, int num_data, bool is_min_op) : is_min_op(is_min_op), Arguments(data, num_data) {}
};
I need to define an Operation class as follows:
class Operation {
public:
virtual void execute() = 0;
}
class MeanOperation: public Operation {}
// an operation that can be told to display either the minimum or the maximum.
class MinmaxOperation: public Operation {}
Also, I have an operation factory with returns the specifc operation object instance based on the type of operation:
class OperationFactory {
public:
Operation *get(OP_TYPE t, Arguments *args) {
switch(t) {
case MEAN:
return new MeanOperation(args);
case MINMAX:
return args->is_min_op ? // ERROR: Because struct downcasts to `Arguments`
new MinOperation(args):
new MaxOperation(args);
}
}
};
I need to be able to run my operation based on the type of argument struct like this:
int main() {
double data[] = { 1, 2, 3, 4 };
int num_data = 4;
OperationFactory operations;
Arguments *mean_args = new MeanOperationArguments(data, num_data);
Operation *mean_op = operations.get(MEAN, mean_args);
mean_op->execute();
Arguments *min_args = new MinmaxOperationArguments(data, num_data, true);
Operation *min_op = operations.get(MINMAX, min_args);
min_op->execute();
return 0;
}
How can I initialize my operation with require arguments based on the use case?
If you put a single virtual method in the base class, preferably the destructor, you could use dynamic_cast to convert the pointer to an instance of the derived class. If the conversion fails you have your answer, if it succeeds you can call any of the derived class methods on it.
There are multiple things I have to address. First, avoid structure parent / child relationships. It adds unnecessary dependencies. Look at structures like custom data structures. Data is data at the end of the day. It only has meaning when you interpret it. Going off that logic, your argument structure could be simplified as an array with an unsigned integer that tells how long is that array (similar to a vector, so maybe you could look into using a vector instead of a struct). Going off this logic, the best approach you can take is having multiple functions with different names that take in the same arguments but return different result based on whatever it is that you want it to do. Here is what I am talking about:
#include <iostream>
struct DataSet {
public:
double* data;
int size;
DataSet(double* data, unsigned int size) {
this->data = new double[size];
this->size = size;
for (unsigned int i = 0; i < size; i++)
this->data[i] = data[i];
}
};
double mean(const DataSet& dataSet) {
double mean = 0;
for (unsigned int i = 0; i < dataSet.size; i++)
mean += dataSet.data[i];
mean = mean / dataSet.size;
return mean;
}
double min(const DataSet& dataSet) {
double min = dataSet.data[0];
for (unsigned int i = 1; i < dataSet.size; i++)
if (dataSet.data[i] < min)
min = dataSet.data[i];
return min;
}
double max(const DataSet& dataSet) {
double min = dataSet.data[0];
for (unsigned int i = 1; i < dataSet.size; i++)
if (dataSet.data[i] > min)
min = dataSet.data[i];
return min;
}
int main() {
double data[5] = { 1, 2, 3, 4, 5 };
unsigned int size = 5;
DataSet dataSet = DataSet(data, size);
double result = 0;
result = mean(dataSet);
std::cout << "Mean: " << result << std::endl;
result = min(dataSet);
std::cout << "Min: " << result << std::endl;
result = max(dataSet);
std::cout << "Max: " << result << std::endl;
}
I included everything in one .cpp file for convenience. If you are trying to implement a system, I would suggest making an enum class, store an enum value that represents what operation the user wants to perform, make a switch statement that points to these functions.
Note, be careful with passing pointers around because you might end up with memory leaks. If you notice in the code implementation, I am doing a deep copy, therefore passing memory ownership to the structure to DataSet.
Edit for better system design fit
#include <iostream>
class DataSet {
public:
double* data;
int size;
DataSet() {
data = nullptr;
size = 0;
}
DataSet(double* data, unsigned int size) {
this->data = new double[size];
this->size = size;
for (unsigned int i = 0; i < size; i++)
this->data[i] = data[i];
}
~DataSet() {
if (data != nullptr)
delete(data);
}
};
class Operation {
protected:
DataSet dataSet;
public:
Operation(double* data, unsigned int size) : dataSet(data, size) {
}
virtual double execute() = 0;
};
class Mean : public Operation {
public:
Mean(double* data, unsigned int size) : Operation(data, size) {
}
~Mean() {
}
double execute() {
double mean = 0;
for (unsigned int i = 0; i < dataSet.size; i++)
mean += dataSet.data[i];
mean = mean / dataSet.size;
return mean;
}
};
class MinMax : public Operation {
public:
bool useMin;
MinMax(double* data, unsigned int size) : useMin(true), Operation(data, size) {
}
~MinMax() {
}
double execute() {
if (useMin) {
double min = dataSet.data[0];
for (unsigned int i = 1; i < dataSet.size; i++)
if (dataSet.data[i] < min)
min = dataSet.data[i];
return min;
}
else {
double min = dataSet.data[0];
for (unsigned int i = 1; i < dataSet.size; i++)
if (dataSet.data[i] > min)
min = dataSet.data[i];
return min;
}
}
};
int main() {
double data[5] = { 1, 2, 3, 4, 5 };
unsigned int size = 5;
DataSet dataSet = DataSet(data, size);
double result = 0;
Mean mean = Mean(data, size);
std::cout << "Mean: " << mean.execute() << std::endl;
MinMax minMax = MinMax(data, size);
std::cout << "MinMax: " << minMax.execute() << std::endl;
minMax.useMin = false;
std::cout << "MinMax: " << minMax.execute() << std::endl;
}
For better fit your system, I worked out a better solution. I still got rid of your struct hierarchy but kept the hierarchy in your classes. MinMax will return min or max depending on the useMin boolean value. You said you are printing it in the comments, so you would just have to change it to void and instead of returning the value, just print it. I hope this points you into a better direction.
Something like:
case MINMAX:
return dynamic_cast<MinmaxOperationArguments*>(args)->is_min_op ?
new MinOperation(args):
new MaxOperation(args);
}
Note, that result of cast should be checked before use or it may crash in case of incorrect argument.
RTTI has to be enabled.

How to get rid of this global variable when using the recursion?

First of all, I don't want to use sort. This is just an illustration example. The main purpose of this question is that I want to:
find all possible combinations of m numbers out of n numbers and
process them, then return the unique processed result (since the
processed results of all possible combinations will be compared).
Question start at here
The following code get all possible combinations M numbers out of N numbers. Sum the M numbers and find the largest sum. In doing this I used a recursion function.
However, it seems that I must define a global variable to store the temporary largest sum. Is there any way to get rid of this global variable? For example, define the recursion function to return the largest sum... I don't want the global variable just become an argument &max_sum in the find_sum, since find_sum already have too many arguments.
#include <iostream>
#include <vector>
void find_sum(const std::vector<int>& ar, std::vector<int>& combine,
int index, int start);
int max_sum =0;
int main() {
int N = 10;
int M = 3;
std::vector<int> ar(N);
ar = {0,9,2,3,7,6,1,4,5,8};
int index = 0, start =0;
std::vector<int> combine(M);
find_sum(ar, combine, index, start);
std::cout << max_sum <<std::endl;
return 0;
}
void find_sum(const std::vector<int>& ar, std::vector<int>& combine,
int index, int start) {
if(index == combine.size()) {
int sum =0;
for(int i=0; i<index; ++i) {
sum += combine[i];
}
if(max_sum < sum) {
max_sum = sum;
}
return ;
}
for(int i = start;
i < ar.size() && ar.size()-i > combine.size()-index;
++i) {
combine[index] = ar[i];
find_sum(ar, combine, index+1, start+1);
}
}
An approach that scales well is to turn find_sum into a function object. The trick is to define a struct with an overloaded () operator that takes a certain set of parameters:
struct FindSum
{
void operator()(const std::vector<int>& ar, std::vector<int>& combine,
int index, int start){
/*ToDo - write the function here, a very explicit way of
/*engineering the recursion is to use this->operator()(...)*/
}
int max_sum; // I am now a member variable
};
Then instantiate FindSum find_sum;, set find_sum.max_sum if needed (perhaps even do that in a constructor), then call the overloaded () operator using find_sum(...).
This technique allows you to pass state into what essentially is a function.
From find_sum, return the so-far maximum sum (instead of void). That means that the recursion-terminating code would be:
if(index == combine.size()) {
int sum =0;
for(int i=0; i<index; ++i) {
sum += combine[i];
}
return sum;
}
and the recursive part would be
int max_sum = 0;
for(int i = start;
i < ar.size() && ar.size()-i > combine.size()-index;
++i) {
combine[index] = ar[i];
int thismaxsum = find_sum(ar, combine, index+1, start+1);
if(thismaxssum > max_sum)
max_sum = thismaxsum;
}
return max_sum;
So, the overall solution is:
#include <iostream>
#include <vector>
int find_sum(const std::vector<int>& ar, std::vector<int>& combine,
int index, int start);
int main() {
int N = 10;
int M = 3;
std::vector<int> ar(N);
ar = { 0,9,2,3,7,6,1,4,5,8 };
int index = 0, start = 0;
std::vector<int> combine(M);
int max_sum = find_sum(ar, combine, index, start);
std::cout << max_sum << std::endl;
return 0;
}
int find_sum(const std::vector<int>& ar, std::vector<int>& combine,
int index, int start)
{
if (index == combine.size())
{
int sum = 0;
for (int i = 0; i<index; ++i)
{
sum += combine[i];
}
return sum;
}
int max_sum = 0;
for (int i = start;
i < ar.size() && ar.size() - i > combine.size() - index;
++i)
{
combine[index] = ar[i];
int thismaxsum = find_sum(ar, combine, index + 1, start + 1);
if (thismaxsum > max_sum)
max_sum = thismaxsum;
}
return max_sum;
}
Global variables are much better then adding operands and variables to recursion functions because each operand and variable causes heap/stack trashing negatively impact performance and space usage risking stack overflow for higher recursions.
To avoid global variables (for code cosmetics and multi threading/instancing purposes) I usually use context or temp struct. For example like this:
// context type
struct f1_context
{
// here goes any former global variables and stuff you need
int n;
};
// recursive sub function
int f1_recursive(f1_context &ctx)
{
if (ctx.n==0) return 0;
if (ctx.n==1) return 1;
ctx.n--;
return (ctx.n+1)*f1_recursive(ctx.n);
}
// main API function call
int f1(int n)
{
// init context
f1_context ctx;
ctx.n=n;
// start recursion
return f1_recursion(ctx);
}
the f1(n) is factorial example. This way the operands are limited to single pointer to structure. Of coarse you can add any recursion tail operands after the context... the context is just for global and persistent stuff (even if I did use it for the recursion tail instead but that is not always possible).

Parallel order-preserving selection from an array using tbb

I have a range-image and want to convert it into a libpointmatcher point cloud. The cloud is an Eigen::Matrix with 4 rows (x,y,z,1) and several columns for every point.
The range-image is an unsigned short*array including the range values (z) and an unsigned char*array including information about the pixel visibility.
In serial, my code looks like this:
//container to hold the data
std::vector<Eigen::Vector4d> vec;
vec.reserve(this->Height*this->Width);
//contains information about pixel visibility
unsigned char* mask_data = (unsigned char*)range_image.mask.ToPointer();
//contains the actual pixel data
unsigned short* pixel_data = (unsigned short*)range_image.pixel.ToPointer();
for (int y =0;y < range_image.Height; y++)
{
for (int x = 0; x < range_image.Width; x++)
{
int index =x+y*range_image.Width;
if(*(mask_data+index) != 0)
{
vec.push_back(Eigen::Vector4d(x,y,(double)*(data+index),1));
}
}
}
// libpointmatcher point cloud with size of visible pixel
PM::Matrix features(4,vec.size());
PM::DataPoints::Labels featureLabels;
featureLabels.resize(4);
featureLabels[0] = PM::DataPoints::Label::Label("x");
featureLabels[1] = PM::DataPoints::Label::Label("y");
featureLabels[2] = PM::DataPoints::Label::Label("z");
featureLabels[3] = PM::DataPoints::Label::Label("pad");
//fill with data
for(int i = 0; i<vec.size(); i++)
{
features.col(i) = vec[i];
}
Because of the large images this loop takes 500ms for 840000 points and thats too slow. Now my idea was to integrate the code above in one parallized function. The problem is that the Eigen::Matrix does not provide a push_back functionality, i dont know the number of visible points in advance and i need the points in the right order to process the point cloud.
So i need a parallel algorithm to extract visible 3D-Points from my range-image and insert them into the Eigen::Matrix in the right order. I'm working with Microsoft Visual Studio 2012 and i can use either OpenMP 2.0 or TBB. I appreciate any help :)
UPDATE
As Arch D. Robison suggeested i tried the tbb::parallel_scan. I passed the mask array and a double array to hold the 3D-coodinates. The output array has four times the size of the input array to store homogeneous 3D data (x,y,z,1). Then i map the otput array in a Eigen::Matrix.The number of rows is fixed and the cols coming from the result from the parallel_scan.
size_t vec_size = width*height;
double* out = new double[vec_size * 4];
size_t m1 = Compress(mask, pixel, out, height, width,
[](unsigned char x) {return x != 0; });
Map<MatrixXd> features(out, 4, m1);
. Here is the code from the operator():
void operator()(const tbb::blocked_range2d<size_t, size_t>& r, Tag) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const unsigned char* m = in;
const unsigned short* p = in2;
T* values = out;
size_t yend = r.rows().end();
for (size_t y = r.rows().begin(); y != yend; ++y)
{
size_t xend = r.cols().end();
for (size_t x = r.cols().begin(); x != xend; ++x)
{
size_t index = x + y*width;
if (pred(m[index]))
{
if (Tag::is_final_scan())
{
size_t idx = j*4;
values[idx] = (double)x;
values[idx + 1] = (double)y;
values[idx + 2] = p[index];
values[idx + 3] = 1.0;
}
++j;
}
}
}
sum = j;
}
I'm now 4x faster then the serial version. What do you think about this approach? Did i miss anythink and are there improvements? Thanks
Here is an example of how to do something like std::copy_if using tbb::parallel_scan. The key method is operator(), which is usually called twice per subrange, once for a prescan and once for a final scan. (But be aware that TBB omits the prescan when it's not necessary.) Here the prescan just does tallying and the final scan does the final work (which includes replaying the tallying). See https://software.intel.com/sites/default/files/bc/2b/parallel_scan.pdf for more details on the methods. Another good references is https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf , which shows lots of things you can do with parallel scan (a.k.a. prefix-sum).
```
#include "tbb/parallel_scan.h"
#include "tbb/blocked_range.h"
#include <cstddef>
template<typename T, typename Pred>
class Body {
const T* const in;
T* const out;
Pred pred;
size_t sum;
public:
Body( T* in_, T* out_, Pred pred_) :
in(in_), out(out_), pred(pred_), sum(0)
{}
size_t getSum() const {return sum;}
template<typename Tag>
void operator()( const tbb::blocked_range<size_t>& r, Tag ) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const T* x = in;
T* y = out;
for( size_t i=r.begin(); i<r.end(); ++i ) {
if( pred(x[i]) ) {
if( Tag::is_final_scan() )
y[j] = x[i];
++j;
}
}
sum = j;
}
// Splitting constructor used for parallel fork.
// Note that it's sum(0), not sum(b.sum), because this
// constructor will be used to compute a partial sum.
// Method reverse_join will put together the two sub-sums.
Body( Body& b, tbb::split ) :
in(b.in), out(b.out), pred(b.pred), sum(0)
{}
// Join partial solutions computed by two Body objects.
// Arguments "this" and "a" correspond to the splitting
// constructor arguments "b" and "this". That's why
// it's called a reverse join.
void reverse_join( Body& a ) {
sum += a.sum;
}
void assign( Body& b ) {sum=b.sum;}
};
// Copy to out each element of in that satisfies pred.
// Return number of elements copied.
template<typename T, typename Pred>
size_t Compress( T* in, T* out, size_t n, Pred pred ) {
Body<T,Pred> b(in,out,pred);
tbb::parallel_scan(tbb::blocked_range<size_t>(0,n), b);
return b.getSum();
}
#include <cmath>
#include <algorithm>
#include <cassert>
int main() {
const size_t n = 10000000;
float* a = new float[n];
float* b = new float[n];
float* c = new float[n];
for( size_t i=0; i<n; ++i )
a[i] = std::cos(float(i));
size_t m1 = Compress(a, b, n, [](float x) {return x<0;});
size_t m2 = std::copy_if(a, a+n, c, [](float x) {return x<0;})-c;
assert(m1==m2);
for( size_t i=0; i<n; ++i )
assert(b[i]==c[i]);
}
```
Why do not you check out the condition *(m_maskData+index)==0 before m_features(0,index) = x;?

Using openMP to get the index of minimum element parallelly

I tried to write this code
float* theArray; // the array to find the minimum value
int index, i;
float thisValue, min;
index = 0;
min = theArray[0];
#pragma omp parallel for reduction(min:min_dist)
for (i=1; i<size; i++) {
thisValue = theArray[i];
if (thisValue < min)
{ /* find the min and its array index */
min = thisValue;
index = i;
}
}
return(index);
However this one is not outputting correct answers. Seems the min is OK but the correct index has been destroyed by threads.
I also tried some ways provided on the Internet and here (using parallel for for outer loop and use critical for final comparison) but this cause a speed drop rather than speedup.
What should I do to make both the min value and its index correct? Thanks!
I don't know of an elegant want to do a minimum reduction and save an index. I do this by finding the local minimum and index for each thread and then the global minimum and index in a critical section.
index = 0;
min = theArray[0];
#pragma omp parallel
{
int index_local = index;
float min_local = min;
#pragma omp for nowait
for (i = 1; i < size; i++) {
if (theArray[i] < min_local) {
min_local = theArray[i];
index_local = i;
}
}
#pragma omp critical
{
if (min_local < min) {
min = min_local;
index = index_local;
}
}
}
With OpenMP 4.0 it's possible to use user-defined reductions. A user-defined minimum reduction can be defined like this
struct Compare { float val; sizt_t index; };
#pragma omp declare reduction(minimum : struct Compare : omp_out = omp_in.val < omp_out.val ? omp_in : omp_out)
Then the reduction can be done like this
struct Compare min;
min.val = theArray[0];
min.index = 0;
#pragma omp parallel for reduction(minimum:min)
for(int i = 1; i<size; i++) {
if(theArray[i]<min.val) {
min.val = a[i];
min.index = i;
}
}
That works for C and C++. User defined reductions have other advantages besides simplified code. There are multiple algorithms for doing reductions. For example the merging can be done in O(number of threads) or O(Log(number of threads). The first solution I gave does this in O(number of threads) however using user-defined reductions let's OpenMP choose the algorithm.
Basic Idea
This can be accomplished without any parellelization-breaking critical or atomic sections by creating a custom reduction. Basically, define an object that stores both the index and value, and then create a function that sorts two of these objects by only the value, not the index.
Details
An object to store an index and value together:
typedef std::pair<unsigned int, float> IndexValuePair;
You can access the index by accessing the first property and the value by accessing the second property, i.e.,
IndexValuePair obj(0, 2.345);
unsigned int ix = obj.first; // 0
float val = obj.second; // 2.345
Define a function to sort two IndexValuePair objects:
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
return a.second < b.second ? a : b;
}
Then, construct a custom reduction following the guidelines in the OpenMP documentation:
#pragma omp declare reduction \
(minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
initializer(omp_priv = IndexValuePair(0, 1000))
In this case, I've chosen to initialize the index to 0 and the value to 1000. The value should be initialized to some number larger than the largest value you expect to sort.
Functional Example
Finally, combine all these pieces with the parallel for loop!
// Compile with g++ -std=c++11 -fopenmp demo.cpp
#include <iostream>
#include <utility>
#include <vector>
typedef std::pair<unsigned int, float> IndexValuePair;
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
return a.second < b.second ? a : b;
}
int main(){
std::vector<float> vals {10, 4, 6, 2, 8, 0, -1, 2, 3, 4, 4, 8};
unsigned int i;
IndexValuePair minValueIndex(0, 1000);
#pragma omp declare reduction \
(minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
initializer(omp_priv = IndexValuePair(0, 1000))
#pragma omp parallel for reduction(minPair:minValueIndex)
for(i = 0; i < vals.size(); i++){
if(vals[i] < minValueIndex.second){
minValueIndex.first = i;
minValueIndex.second = vals[i];
}
}
std::cout << "minimum value = " << minValueIndex.second << std::endl; // Should be -1
std::cout << "index = " << minValueIndex.first << std::endl; // Should be 6
return EXIT_SUCCESS;
}
Because you're not only trying to find the minimal value (reduction(min:___)) but also retain the index, you need to make the check critical. This can significantly slow down the loop (as reported). In general, make sure that there is enough work so you don't encounter overhead as in this question. An alternative would be to have each thread find the minimum and it's index and save them to a unique variable and have the master thread do a final check on those as in the following program.
#include <iostream>
#include <vector>
#include <ctime>
#include <random>
#include <omp.h>
using std::cout;
using std::vector;
void initializeVector(vector<double>& v)
{
std::mt19937 generator(time(NULL));
std::uniform_real_distribution<double> dis(0.0, 1.0);
v.resize(100000000);
for(int i = 0; i < v.size(); i++)
{
v[i] = dis(generator);
}
}
int main()
{
vector<double> vec;
initializeVector(vec);
float minVal = vec[0];
int minInd = 0;
int startTime = clock();
for(int i = 1; i < vec.size(); i++)
{
if(vec[i] < minVal)
{
minVal = vec[i];
minInd = i;
}
}
int elapsedTime1 = clock() - startTime;
// Change the number of threads accordingly
vector<float> threadRes(4, std::numeric_limits<float>::max());
vector<int> threadInd(4);
startTime = clock();
#pragma omp parallel for
for(int i = 0; i < vec.size(); i++)
{
{
if(vec[i] < threadRes[omp_get_thread_num()])
{
threadRes[omp_get_thread_num()] = vec[i];
threadInd[omp_get_thread_num()] = i;
}
}
}
float minVal2 = threadRes[0];
int minInd2 = threadInd[0];
for(int i = 1; i < threadRes.size(); i++)
{
if(threadRes[i] < minVal2)
{
minVal2 = threadRes[i];
minInd2 = threadInd[i];
}
}
int elapsedTime2 = clock() - startTime;
cout << "Min " << minVal << " at " << minInd << " took " << elapsedTime1 << std::endl;
cout << "Min " << minVal2 << " at " << minInd2 << " took " << elapsedTime2 << std::endl;
}
Please note that with optimizations on and nothing else to be done in the loop, the serial version seems to remain king. With optimizations turned off, OMP gains the upper hand.
P.S. you wrote reduction(min:min_dist) and the proceeded to use min instead of min_dist.
Actually, we can use omp critical directive to make only one thread run the code inside the critical region at a time.So only one thread can run it and the indexvalue wont be destroyed by other threads.
About omp critical directive:
The omp critical directive identifies a section of code that must be executed by a single thread at a time.
This code solves your issue:
#include <stdio.h>
#include <omp.h>
int main() {
int i;
int arr[10] = {11,42,53,64,55,46,47, 68, 59, 510};
float* theArray; // the array to find the minimum value
int index;
float thisValue, min;
index = 0;
min = arr[0];
int size=10;
#pragma omp parallel for
for (i=1; i<size; i++) {
thisValue = arr[i];
#pragma omp critical
if (thisValue < min)
{ /* find the min and its array index */
min = thisValue;
index = i;
}
}
printf("min:%d index:%d",min,index);
return 0;
}

Threads failing to affect performance

Below is a small program meant to parallelize the approximation of the 1/(n^2) series. Note the global parameter NUM_THREADS.
My issue is that increasing the number of threads from 1 to 4 (the number of processors my computer has is 4) does not significantly affect the outcomes of timing experiments. Do you see a logical flaw in the ThreadFunction? Is there false sharing or misplaced blocking that ends up serializing the execution?
#include <iostream>
#include <thread>
#include <vector>
#include <mutex>
#include <string>
#include <future>
#include <chrono>
std::mutex sum_mutex; // This mutex is for the sum vector
std::vector<double> sum_vec; // This is the sum vector
int NUM_THREADS = 1;
int UPPER_BD = 1000000;
/* Thread function */
void ThreadFunction(std::vector<double> &l, int beg, int end, int thread_num)
{
double sum = 0;
for(int i = beg; i < end; i++) sum += (1 / ( l[i] * l[i]) );
std::unique_lock<std::mutex> lock1 (sum_mutex, std::defer_lock);
lock1.lock();
sum_vec.push_back(sum);
lock1.unlock();
}
void ListFill(std::vector<double> &l, int z)
{
for(int i = 0; i < z; ++i) l.push_back(i);
}
int main()
{
std::vector<double> l;
std::vector<std::thread> thread_vec;
ListFill(l, UPPER_BD);
int len = l.size();
int lower_bd = 1;
int increment = (UPPER_BD - lower_bd) / NUM_THREADS;
for (int j = 0; j < NUM_THREADS; ++j)
{
thread_vec.push_back(std::thread(ThreadFunction, std::ref(l), lower_bd, lower_bd + increment, j));
lower_bd += increment;
}
for (auto &t : thread_vec) t.join();
double big_sum;
for (double z : sum_vec) big_sum += z;
std::cout << big_sum << std::endl;
return 0;
}
From looking at your code, I suspect that ListFill is taking longer than ThreadFunction. Why pass a list of values to the thread instead of the bounds each thread should loop over? Something like:
void ThreadFunction( int beg, int end ) {
double sum = 0.0;
for(double i = beg; i < end; i++)
sum += (1.0 / ( i * i) );
std::unique_lock<std::mutex> lock1 (sum_mutex);
sum_vec.push_back(sum);
}
To maximize parallelism, you need to push as much work as possible onto the threads. See Amdahl's Law
In addition to dohashi's nice improvement, you can remove the need for the mutex by populating the sum_vec in advance in the main thread:
sum_vec.resize(4);
then writing directly to it in ThreadFunction:
sum_vec[thread_num] = sum;
since each thread writes to a distinct element and doesn't modify the vector itself there is no need to lock anything.