Threading with shared variables - c++

I tried to use multiple threads to insert into a boost::bimap. I have some shared variable between the threads, which I need to pass by reference and some of them are modified by each thread execution. However, I get the error:
Segmentation fault (core dumped)
I have the following code with me. I have tried to avoid concurrent access to the variables by using std::lock_guard<std::mutex> lock(mtx), but not able to make it work.
parallel_index.cpp
#include <iostream>
#include <string>
#include <algorithm>
#include <thread>
#include <mutex>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include "parallel_index.h"
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string> > bimap_reference;
typedef bimap_reference::value_type position;
bimap_reference reference_index_vector;
size_t total_threads = std::thread::hardware_concurrency();
std::string sequence_content = "ABCDDBACDDDCBBAAACBDAADCBDAAADCBDADADACBDDCBBBCDCBCDAADCBBCDAAADCBDA";
uint64_t sequence_length = sequence_content.length();
int split = 5;
uint64_t erase_length = 0;
unsigned int seq_itr = 0;
std::mutex mtx; // to protect against concurent access
int main(){
thread_test::create_index index;
std::thread threads[total_threads];
std::cout << total_threads << " threads lanched" << std::endl;
for(unsigned int i = 0; i < total_threads; i++){
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(sequence_length), std::ref(split), std::ref(sequence_content), std::ref(erase_length));
}
for(unsigned int i = 0; i < total_threads; i++){
threads[i].join();
}
}
/*
* Creating index
*/
void thread_test::create_index::reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, uint64_t &erase_length ){
for (; seq_itr < sequence_length; ++seq_itr ){
std::lock_guard<std::mutex> lock(mtx);
std::string splitstr = sequence_content.substr(erase_length, split);
reference_index_vector.insert(position(seq_itr, splitstr));
seq_itr += split-1;
erase_length += split;
if(erase_length > 10000){
sequence_content.erase(0,erase_length);
erase_length = 0;
}
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
parallel_index.h
#ifndef PARALLEL_INDEX_H_
#define PARALLEL_INDEX_H_
#include<iostream>
#include <algorithm>
#include <utility>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
//typedef boost::unordered_map<int, std::pair<int, unsigned long int>& > reference_map;
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string > > bimap_reference;
typedef bimap_reference::value_type position;
extern bimap_reference reference_index_vector;
namespace thread_test{
class create_index{
public:
void reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, uint64_t &erase_length);
};
}
#endif /* PARALLEL_INDEX_H_ */
-------------------------------EDIT---------------------------------
I tried to divide the string content into partitions equal to the number of threads to have each thread its part locally available. But nothing seems to work. Some times it finishes the first thread and stops there after with a Segmentation fault (core dumped).
parallel_index.cpp
#include <iostream>
#include <string>
#include <algorithm>
#include <thread>
#include <mutex>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include "parallel_index.h"
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string> > bimap_reference;
typedef bimap_reference::value_type position;
bimap_reference reference_index_vector;
//create threads
size_t total_threads = std::thread::hardware_concurrency();
std::string sequence_content = "ABCDDBACDDDCBBAAACBDAADCBDAAADCBDADADACBDDCBBBCDCBCDAADCBBCDAAADCBDADDCCCAAABBBAAACDCA";
uint64_t sequence_length = sequence_content.length();
int split = 5;
// split the sequence_content equal to the number of threads, and assign each partition to each thread.
uint64_t each_partition_len = sequence_content.length()/total_threads- (sequence_content.length()/total_threads)%split ;
uint64_t last_partition_len = sequence_content.length()/total_threads +
(((sequence_content.length()/total_threads)%split)*(total_threads-1)) + sequence_content.length()%total_threads;
std::mutex mtx; // to protect against concurent access
int main(){
thread_test::create_index index;
std::thread threads[total_threads];
std::cout << total_threads << " threads lanched" << std::endl;
for(unsigned int i = 0; i < total_threads; i++){
if(i < total_threads-1)
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(each_partition_len), std::ref(split), std::ref(sequence_content), i);
else
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(last_partition_len), std::ref(split), std::ref(sequence_content), i);
//std::lock_guard<std::mutex> lck(mtx);
std::cout << "launched thread " << i << "with id " << threads[i].get_id() << std::endl;
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
for( unsigned int i = 0; i < total_threads; ++i){
if(threads[i].joinable()){
std::cout << "trying to join thread " << i << std:: endl;
threads[i].join();
std::cout << "thread joined " << i << std:: endl;
}
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
/*
* Creating index
*/
void thread_test::create_index::reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, int i ){
uint64_t seq_strt = 0;
// set seq_strt
if(i == 0)
seq_strt = sequence_length * i;
else
seq_strt = sequence_length * i + 1;
for (uint64_t seq_itr = seq_strt; seq_itr <= sequence_length; ++seq_itr ){
std::string splitstr = sequence_content.substr(seq_itr, split);
mtx.lock();
//std::lock_guard<std::mutex> lock(mtx);
reference_index_vector.insert(position(seq_itr, splitstr));
mtx.unlock();
seq_itr += split-1;
}
}
parallel_index.h
#ifndef PARALLEL_INDEX_H_
#define PARALLEL_INDEX_H_
#include<iostream>
#include <algorithm>
#include <utility>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string > > bimap_reference;
typedef bimap_reference::value_type position;
extern bimap_reference reference_index_vector;
namespace thread_test{
class create_index{
public:
void reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, int i);
};
}
#endif /* PARALLEL_INDEX_H_ */

I feel the culprit for segmentation fault is nothing but static linking of the libraries. Its not by incrementing seq_itr to a value bigger than the actual sequence length, because your for loop will never allow to enter if seq_itr is greater than actual sequence length. You try by removing the -static flag and it should work by not giving the segmentation fault, however it does not guarantee the correctness of the other code. More details about segmentation fault with thread can be found here

All the threads will try to get the lock in the critical section, to keep the bitmap intact, you need a conditional variable so that threads will orderly get executed. This is justified as you are using seq_itr as local variable inside reference_index_hash() and it needs to be incremented in proper sequence.

One problem in your original code is that unsigned int seq_itr is accessed without synchronization from multiple threads. Besides yielding invalid results it might lead to seq_itr being incremented to a value bigger than the actual sequence length, and the following accesses might lead to a crash.
The new code addresses this by just passing indexes, which should be OK as long as those indexes are non-overlapping and correctly calculated. I can't follow the logic completely, but in case your seq_strt calculation is off the program might also crash due to an invalid index. Should be easy to verify in a debugger or with some index assertions.
However there is an issue in the second code example with printing the map directly after threads are started with
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
This will not yield correct results, since the map is concurrently accessed by all worker threads. Access after the join()s is safe.

Related

boost interprocess vector slower than std::vector

I'm doing a benchmark on boost::interprocess:vector and std::vector, since I'm gonna use shared memory in my program but I'm concerned with any potential performance issues.
My benchmark is simply random accessing a vector, and it turned out that std::vector is almost 2x faster than boost::interprocess::vector.
Note: in the benchmark I only have a single process, and I don't do any synchronization manually.
I don't know where is the bottleneck....I have three guess:
shared memory make it slower
boost vector has a slower implementation
boost shared memory as well as its containers have some overhead somehow, i.e., if I use mmap and do things in a plain way, it will be better
And what further experiment should I do to figure out this? Or tune something to make it faster? Any idea?
Here is the benchmark code:
for boost::interprocess::vector
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using namespace boost::interprocess;
typedef allocator<double, managed_shared_memory::segment_manager> ShmemAllocator;
typedef vector<double, ShmemAllocator> MyVector;
const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 2;
const int query_size = 100000;
int main(int argc, char *argv[]) {
std::uniform_real_distribution<double> unif(0, 10000);
std::default_random_engine re;
std::vector<double> data;
data.reserve(total_size);
for (int i = 0; i < total_size; ++i) {
data.push_back(unif(re));
}
std::vector<int> query;
query.reserve(query_size);
for (int i = 0; i < query_size; ++i) {
query.push_back(rand() % total_size);
}
struct shm_remove {
shm_remove() { shared_memory_object::remove("MySharedMemory"); }
~shm_remove() { shared_memory_object::remove("MySharedMemory"); }
} remover;
managed_shared_memory segment(create_only, "MySharedMemory", mem_size);
const ShmemAllocator alloc_inst(segment.get_segment_manager());
MyVector *myvector = segment.construct<MyVector>("MyVector")(alloc_inst);
myvector->reserve(total_size);
for (auto d : data) myvector->push_back(d);
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
double res = (*myvector)[q];
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
return 0;
}
for std::vector
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <chrono>
#include <cstdlib> //std::system
#include <iostream>
#include <random>
#include <string>
#include <vector>
const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 8;
const int query_size = 100000;
int main(int argc, char *argv[]) {
std::uniform_real_distribution<double> unif(0, 10000);
std::default_random_engine re;
std::vector<double> data;
data.reserve(total_size);
for (int i = 0; i < total_size; ++i) {
data.push_back(unif(re));
}
std::vector<int> query;
query.reserve(query_size);
for (int i = 0; i < query_size; ++i) {
query.push_back(rand() % total_size);
}
std::vector<double> myvector;
myvector.reserve(total_size);
for (auto d : data) myvector.push_back(d);
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
double res = myvector[q];
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
return 0;
}

Efficiently process each unique permutation of a vector when number of unique elements in vector is much smaller than vector size

In a program I need to apply a function in parallel to each unique permutation of a vector. The size of the vector is around N=15
I already have a function void parallel_for_each_permutation which I can use in combination with a std::set to only process each unique permutation exactly once.
This all works well for the general case. However, in my use case the number of unique elements k per vector is very limited, usually around k=4. This means that I'm currently wasting time constructing the same unique permutation over and over again, just to throw it away because it has already been processed.
Is it possible to process all unique permutations in this special case, without constructing all N! permutations?
Example use-case:
#include <algorithm>
#include <thread>
#include <vector>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
template<class Container1, class Container2>
struct Comp{
//compare element-wise less than
bool operator()(const Container1& l, const Container2& r) const{
auto pair = std::mismatch(l.begin(), l.end(), r.begin());
if(pair.first == l.end() && pair.second == r.end())
return false;
return *(pair.first) < *(pair.second);
}
};
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int num_threads, Func func){
auto ithPermutation = [](int n, size_t i) -> std::vector<size_t>{
// https://stackoverflow.com/questions/7918806/finding-n-th-permutation-without-computing-others
std::vector<size_t> fact(n);
std::vector<size_t> perm(n);
fact[0] = 1;
for(int k = 1; k < n; k++)
fact[k] = fact[k-1] * k;
for(int k = 0; k < n; k++){
perm[k] = i / fact[n-1-k];
i = i % fact[n-1-k];
}
for(int k = n-1; k > 0; k--){
for(int j = k-1; j >= 0; j--){
if(perm[j] <= perm[k])
perm[k]++;
}
}
return perm;
};
size_t totalNumPermutations = 1;
for(size_t i = 1; i <= container.size(); i++)
totalNumPermutations *= i;
std::vector<std::thread> threads;
for(int threadId = 0; threadId < num_threads; threadId++){
threads.emplace_back([&, threadId](){
const size_t firstPerm = size_t(float(threadId) * totalNumPermutations / num_threads);
const size_t last_excl = std::min(totalNumPermutations, size_t(float(threadId+1) * totalNumPermutations / num_threads));
Container permutation(container);
auto permIndices = ithPermutation(container.size(), firstPerm);
size_t count = firstPerm;
do{
for(int i = 0; i < int(permIndices.size()); i++){
permutation[i] = container[permIndices[i]];
}
func(threadId, permutation);
std::next_permutation(permIndices.begin(), permIndices.end());
++count;
}while(count < last_excl);
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func){
using Comparator = Comp<Container, Container>;
constexpr int numThreads = 4;
std::set<Container, Comparator> uniqueProcessedPermutations(Comparator{});
std::mutex m;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
{
std::lock_guard<std::mutex> lg(m);
if(uniqueProcessedPermutations.count(permutation) > 0){
return;
}else{
uniqueProcessedPermutations.insert(permutation);
}
}
func(permutation);
}
);
}
int main(){
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1};
auto func = [](const auto& vec){return;};
parallel_for_each_unique_permutation(vector1, func);
}
The permutations you have to work with are known in the field of combinatorics as multiset permutations.
They are described for example on The Combinatorial Object Server
with more detailed explanations in this paper by professor Tadao Takaoka.
You have some related Python code and some C++ code in the FXT open source library.
You might consider adding the "multiset" and "combinatorics" tags to your question.
One possibility is to borrow the (header-only) algorithmic code from the FXT library, which provides a simple generator class for those multiset permutations.
Performance level:
Using the FXT algorithm on a test vector of 15 objects, {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}, one can generate all associated 12,612,600 "permutations" in less than 2 seconds on a plain vanilla Intel x86-64 machine; this is without diagnostics text I/O and without any attempt at optimization.
The algorithm generates exactly those "permutations" that are required, nothing more. So there is no longer a need to generate all 15! "raw" permutations nor to use mutual exclusion to update a shared data structure for filtering purposes.
An adaptor class for generating the permutations:
I will try below to provide code for an adaptor class, which allows your application to use the FXT algorithm while containing the dependency into a single implementation file. That way, the code will hopefully fit better into your application. Think FXT's ulong type and use of raw pointers, versus std::vector<std::size_t> in your code. Besides, FXT is a very extensive library.
Header file for the "adaptor" class:
// File: MSetPermGen.h
#ifndef MSET_PERM_GEN_H
#define MSET_PERM_GEN_H
#include <iostream>
#include <vector>
class MSetPermGenImpl; // from algorithmic backend
using IntVec = std::vector<int>;
using SizeVec = std::vector<std::size_t>;
// Generator class for multiset permutations:
class MSetPermGen {
public:
MSetPermGen(const IntVec& vec);
std::size_t getCycleLength() const;
bool forward(size_t incr);
bool next();
const SizeVec& getPermIndices() const;
const IntVec& getItems() const;
const IntVec& getItemValues() const;
private:
std::size_t cycleLength_;
MSetPermGenImpl* genImpl_; // implementation generator
IntVec itemValues_; // only once each
IntVec items_; // copy of ctor argument
SizeVec freqs_; // repetition counts
SizeVec state_; // array of indices in 0..n-1
};
#endif
The class constructor takes exactly the argument type provided in your main program. Of course, the key method is next(). You can also move the automaton by several steps at once using the forward(incr)method.
Example client program:
// File: test_main.cpp
#include <cassert>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
// utility functions:
std::vector<int> getMSPermutation(const MSetPermGen& mspg)
{
std::vector<int> res;
auto indices = mspg.getPermIndices(); // always between 0 and n-1
auto values = mspg.getItemValues(); // whatever the user put in
std::size_t n = indices.size();
assert( n == items.size() );
res.reserve(n);
for (std::size_t i=0; i < n; i++) {
auto xi = indices[i];
res.push_back(values[xi]);
}
return res;
}
void printPermutation(const std::vector<int>& p, std::ostream& fh)
{
std::size_t n = p.size();
for (size_t i=0; i < n; i++)
fh << p[i] << " ";
fh << '\n';
}
int main(int argc, const char* argv[])
{
std::vector<int> vec0{1,1, 2,2,2}; // N=5
std::vector<int> vec1{1,1, 1,1, 2, 3, 2,2, 3,3, 1}; // N=11
std::vector<int> vec2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
MSetPermGen pg0{vec0};
MSetPermGen pg1{vec1};
MSetPermGen pg2{vec2};
auto pg = &pg0; // choice of 0, 1, 2 for sizing
auto cl = pg->getCycleLength();
auto permA = getMSPermutation(*pg);
printPermutation(permA, cout);
for (std::size_t pi=0; pi < (cl-1); pi++) {
pg->next();
auto permB = getMSPermutation(*pg);
printPermutation(permB, cout);
}
return EXIT_SUCCESS;
}
Text output from the above small program:
1 1 2 2 2
1 2 1 2 2
1 2 2 1 2
1 2 2 2 1
2 1 1 2 2
2 1 2 1 2
2 1 2 2 1
2 2 1 1 2
2 2 1 2 1
2 2 2 1 1
You get only 10 items from vector {1,1, 2,2,2}, because 5! / (2! * 3!) = 120/(2*6) = 10.
The implementation file for the adaptor class, MSetPermGen.cpp, consists of two parts. The first part is FXT code with minimal adaptations. The second part is the MSetPermGen class proper.
First part of implementation file:
// File: MSetPermGen.cpp - part 1 of 2 - FXT code
// -------------- Beginning of header-only FXT combinatorics code -----------
// This file is part of the FXT library.
// Copyright (C) 2010, 2012, 2014 Joerg Arndt
// License: GNU General Public License version 3 or later,
// see the file COPYING.txt in the main directory.
//-- https://www.jjj.de/fxt/
//-- https://fossies.org/dox/fxt-2018.07.03/mset-perm-lex_8h_source.html
#include <cstddef>
using ulong = std::size_t;
inline void swap2(ulong& xa, ulong& xb)
{
ulong save_xb = xb;
xb = xa;
xa = save_xb;
}
class mset_perm_lex
// Multiset permutations in lexicographic order, iterative algorithm.
{
public:
ulong k_; // number of different sorts of objects
ulong *r_; // number of elements '0' in r[0], '1' in r[1], ..., 'k-1' in r[k-1]
ulong n_; // number of objects
ulong *ms_; // multiset data in ms[0], ..., ms[n-1], sentinels at [-1] and [-2]
private: // have pointer data
mset_perm_lex(const mset_perm_lex&); // forbidden
mset_perm_lex & operator = (const mset_perm_lex&); // forbidden
public:
explicit mset_perm_lex(const ulong *r, ulong k)
{
k_ = k;
r_ = new ulong[k];
for (ulong j=0; j<k_; ++j) r_[j] = r[j]; // get buckets
n_ = 0;
for (ulong j=0; j<k_; ++j) n_ += r_[j];
ms_ = new ulong[n_+2];
ms_[0] = 0; ms_[1] = 1; // sentinels: ms[0] < ms[1]
ms_ += 2; // nota bene
first();
}
void first()
{
for (ulong j=0, i=0; j<k_; ++j)
for (ulong h=r_[j]; h!=0; --h, ++i)
ms_[i] = j;
}
~mset_perm_lex()
{
ms_ -= 2;
delete [] ms_;
delete [] r_;
}
const ulong * data() const { return ms_; }
ulong next()
// Return position of leftmost change,
// return n with last permutation.
{
// find rightmost pair with ms[i] < ms[i+1]:
const ulong n1 = n_ - 1;
ulong i = n1;
do { --i; } while ( ms_[i] >= ms_[i+1] ); // can read sentinel
if ( (long)i < 0 ) return n_; // last sequence is falling seq.
// find rightmost element ms[j] less than ms[i]:
ulong j = n1;
while ( ms_[i] >= ms_[j] ) { --j; }
swap2(ms_[i], ms_[j]);
// Here the elements ms[i+1], ..., ms[n-1] are a falling sequence.
// Reverse order to the right:
ulong r = n1;
ulong s = i + 1;
while ( r > s ) { swap2(ms_[r], ms_[s]); --r; ++s; }
return i;
}
};
// -------------- End of header-only FXT combinatorics code -----------
Second part of the class implementation file:
// Second part of file MSetPermGen.cpp: non-FXT code
#include <cassert>
#include <tuple>
#include <map>
#include <iostream>
#include <cstdio>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
class MSetPermGenImpl { // wrapper class
public:
MSetPermGenImpl(const SizeVec& freqs) : fg(freqs.data(), freqs.size())
{}
private:
mset_perm_lex fg;
friend class MSetPermGen;
};
static std::size_t fact(size_t n)
{
std::size_t f = 1;
for (std::size_t i = 1; i <= n; i++)
f = f*i;
return f;
}
MSetPermGen::MSetPermGen(const IntVec& vec) : items_(vec)
{
std::map<int,int> ma;
for (int i: vec) {
ma[i]++;
}
int item, freq;
for (const auto& p : ma) {
std::tie(item, freq) = p;
itemValues_.push_back(item);
freqs_.push_back(freq);
}
cycleLength_ = fact(items_.size());
for (auto i: freqs_)
cycleLength_ /= fact(i);
// create FXT-level generator:
genImpl_ = new MSetPermGenImpl(freqs_);
for (std::size_t i=0; i < items_.size(); i++)
state_.push_back(genImpl_->fg.ms_[i]);
}
std::size_t MSetPermGen::getCycleLength() const
{
return cycleLength_;
}
bool MSetPermGen::forward(size_t incr)
{
std::size_t n = items_.size();
std::size_t rc = 0;
// move forward state by brute force, could be improved:
for (std::size_t i=0; i < incr; i++)
rc = genImpl_->fg.next();
for (std::size_t j=0; j < n; j++)
state_[j] = genImpl_->fg.ms_[j];
return (rc != n);
}
bool MSetPermGen::next()
{
return forward(1);
}
const SizeVec& MSetPermGen::getPermIndices() const
{
return (this->state_);
}
const IntVec& MSetPermGen::getItems() const
{
return (this->items_);
}
const IntVec& MSetPermGen::getItemValues() const
{
return (this->itemValues_);
}
Adapting the parallel application:
Regarding your multithreaded application, given that generating the "permutations" is cheap, you can afford to create one generator object per thread.
Before launching the actual computation, you forward each generator to its appropriate initial position, that is at step thread_id * (cycleLength / num_threads).
I have tried to adapt your code to this MSetPermGen class along these lines. See code below.
With 3 threads, an input vector {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4} of size 15 (giving 12,612,600 permutations) and all diagnostics enabled, your modified parallel program runs in less than 10 seconds; less than 2 seconds with all diagnostics switched off.
Modified parallel program:
#include <algorithm>
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include "MSetPermGen.h"
using std::cout;
using std::endl;
// debug and instrumentation:
static std::atomic<size_t> permCounter;
static bool doManagePermCounter = true;
static bool doThreadLogfiles = true;
static bool doLogfileHeaders = true;
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int numThreads, Func mfunc) {
MSetPermGen gen0(container);
std::size_t totalNumPermutations = gen0.getCycleLength();
std::size_t permShare = totalNumPermutations / numThreads;
if ((totalNumPermutations % numThreads) != 0)
permShare++;
std::cout << "totalNumPermutations: " << totalNumPermutations << std::endl;
std::vector<std::thread> threads;
for (int threadId = 0; threadId < numThreads; threadId++) {
threads.emplace_back([&, threadId]() {
// generate some per-thread logfile name
std::ostringstream fnss;
fnss << "thrlog_" << threadId << ".txt";
std::string fileName = fnss.str();
std::ofstream fh(fileName);
MSetPermGen thrGen(container);
const std::size_t firstPerm = permShare * threadId;
thrGen.forward(firstPerm);
const std::size_t last_excl = std::min(totalNumPermutations,
(threadId+1) * permShare);
if (doLogfileHeaders) {
fh << "MSG threadId: " << threadId << '\n';
fh << "MSG firstPerm: " << firstPerm << '\n';
fh << "MSG lastExcl : " << last_excl << '\n';
}
Container permutation(container);
auto values = thrGen.getItemValues();
auto permIndices = thrGen.getPermIndices();
auto nsz = permIndices.size();
std::size_t count = firstPerm;
do {
for (std::size_t i = 0; i < nsz; i++) {
permutation[i] = values[permIndices[i]];
}
mfunc(threadId, permutation);
if (doThreadLogfiles) {
for (std::size_t i = 0; i < nsz; i++)
fh << permutation[i] << ' ';
fh << '\n';
}
thrGen.next();
permIndices = thrGen.getPermIndices();
++count;
if (doManagePermCounter) {
permCounter++;
}
} while (count < last_excl);
fh.close();
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func) {
constexpr int numThreads = 3;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
// no longer need any mutual exclusion
func(permutation);
}
);
}
int main()
{
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1}; // N=11
std::vector<int> vector0{1,1, 2,2,2}; // N=5
std::vector<int> vector2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
auto func = [](const auto& vec) { return; };
permCounter.store(0);
parallel_for_each_unique_permutation(vector2, func);
auto finalPermCounter = permCounter.load();
cout << "FinalPermCounter = " << finalPermCounter << endl;
}

Replace a loop to retrieve the index of the first element in an array

Is there a better way to search for the index of the first element using a predicate?
// ... this code looks a bit long to me. Anyway to do better?
auto it = std::find_if(collection + startAt,
collection + COLLECTION_SIZE,
[](const char* line) { return strlen(line) <= 10; });
int idx = std::dist(collection, it); //= it - collection;
This is my attempt to refactor the C-style code below:
for (posEmptyItem = startAt; strlen(collection[posEmptyItem]) > 10; posEmptyItem++) {}
std::cout << posEmptyItem << std::endl;
Here a complete example of
#include "stdafx.h"
#include <cstring>
#include <iostream>
#include <algorithm>
#define COLLECTION_SIZE 123
int main()
{
char* collection[COLLECTION_SIZE]{ "time11,time2,time3",
"time12,time2,time3",
"time13,time2,time3",
"time14,time2,time3",
"time15,time2,time3",
"x\n",
"" };
auto startAt = 2;
int posEmptyItem;
// legacy code
for (posEmptyItem = startAt; strlen(collection[posEmptyItem]) > 10; posEmptyItem++) {}
std::cout << posEmptyItem << std::endl;
// replace the loop to search an index by calling to standard library
auto it = std::find_if(collection + startAt,
collection + COLLECTION_SIZE,
[](const char* line) { return strlen(line) <= 10; });
posEmptyItem = it - collection;
std::cout << posEmptyItem << std::endl;
return 0;
}

Proper usage of std::atomic, pre increment value as function param

I have code where I need unique id (packet id for some protocol). So I used std::atomic<int>. After reading documentation I was confused because it stated that increment is done in this way. fetch_add(1)+1
I understand that value inside fetch_add is incremented atomically but I get pre-increment value +1 outside atomic operation. What I would guess is not atomic.
Can I use some_func(++atomic_value)?
I wrote simple code to check if it works. And it works but I don't understand why.
#include<iostream>
#include <atomic>
#include <thread>
#include <vector>
#include <random>
#include <mutex>
#include <algorithm>
std::atomic<int> Index = 0;
//int Index = 0; // non atomic Index. It will generate duplicities
std::vector<int> Numbers;
std::mutex Mutex;
std::default_random_engine Generator;
std::uniform_int_distribution<int> Distribution(5, 10);
void func(int Value)
{
std::lock_guard<std::mutex> Guard(Mutex);
Numbers.push_back(Value);
}
void ThreadProc()
{
Sleep(Distribution(Generator));
func(++Index); // is this proper usage of std::atomic?
}
int main()
{
const int ThreadCount = 1000;
std::vector<std::thread> Threads;
for ( int i = 0; i < ThreadCount; i++ )
{
Threads.push_back(std::thread(ThreadProc));
}
for_each(Threads.begin(), Threads.end(), [](std::thread& t) { t.join(); });
std::sort(Numbers.begin(), Numbers.end());
auto End = std::unique(Numbers.begin(), Numbers.end());
if ( Numbers.end() == End )
{
std::cout << "No duplicites found." << std::endl;
}
else
{
std::cout << "Duplicites found ! - " << Numbers.end() - End << std::endl;
for_each(End, Numbers.end(), [](int n) { std::cout << n << ", "; });
}
return 0;
}
Off-topic question: When I defined Index as non atomic I get duplicities but only from end of range. Numbers are always 900+. Why it is so?

Increasing allocation performance for strings

I ported a Java GC test program to C++ (see the code below) as well as Python. The Java and Python performance is much greater than C++ and I was thinking this was due to all the calls to new that have to be done to create the strings each time. I've tried using Boost's fast_pool_allocator but that actually worsened performance from 700ms to 1200ms. Am I using the allocator wrong, or is there something else I should be doing?
EDIT: Compiled with g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system. g++ is version 4.8.1
One iteration takes in Python is about 300ms and with Java about 50ms. std::allocator gives about 700ms and boost::fast_pool_allocator gives about 1200ms.
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
//#include <gc/gc_allocator.h>
using namespace std;
#include <sstream>
typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;
namespace patch {
template <typename T> pool_string to_string(const T& in) {
std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
stm << in;
return stm.str();
}
}
#include "mytime.hpp"
class Garbage {
public:
vector<pool_string> outer;
vector<pool_string> old;
const int nThreads = 1;
//static auto time = chrono::high_resolution_clock();
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
auto tt = mytime::msecs();
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
tt = mytime::msecs();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
auto t = mytime::msecs();
cout << (t - tt) << endl;
tt = t;
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
The problem is you're using a new string stream each time to convert an integer.
Fix it:
namespace patch {
template <typename T> pool_string to_string(const T& in) {
return boost::lexical_cast<pool_string>(in);
}
}
Now the timings are:
DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185
real 0m0.801s
user 0m0.784s
sys 0m0.016s
See it Live On Coliru
Full code for reference:
#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>
using string = std::string;
namespace patch {
template <typename T> string to_string(const T& in) {
return boost::lexical_cast<string>(in);
}
}
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
std::vector<string> outer;
std::vector<string> old;
const int nThreads = 1;
void go() {
outer.resize(1000000);
//old.reserve(1000000);
Timer timer;
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
std::cout << "DOING AN OLD" << std::endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
std::cout << timer.elapsed() << std::endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
Since I don't use boost on my machine, I simplified the code to use standard C++11 to_string (thus accidentally "fixing" the problem sehe found), and got this:
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
//#include <gc/gc_allocator.h>
#include <sstream>
using namespace std;
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
vector<string> outer;
vector<string> old;
const int nThreads = 1;
Timer timer;
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(to_string(j));
outer.clear();
cout << timer.elapsed() << endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(to_string(i));
}
};
int main() {
Garbage().go();
}
Compiling with:
$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835
clang 3.5 build with source from Friday 18th of April 2014 gives similar results with the same compiler options.
My processor is a AMD Phenom(tm) II X4 965, running at 3.6GHz (if I remember right).