I'm benchmarking boost::interprocess::vector against std::vector, since I'm going to use shared memory in my program and I'm concerned about potential performance issues.
My benchmark simply performs random accesses into a vector, and it turned out that std::vector is almost 2x faster than boost::interprocess::vector.
Note: in the benchmark I only have a single process, and I don't do any synchronization manually.
I don't know where the bottleneck is. I have three guesses:
shared memory makes access slower
the Boost vector has a slower implementation
Boost shared memory and its containers add some overhead somehow, i.e., if I used mmap and did things in a plain way, it would be better
What further experiments should I do to figure this out? Or can I tune something to make it faster? Any ideas?
Here is the benchmark code:
For boost::interprocess::vector:
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using namespace boost::interprocess;
typedef allocator<double, managed_shared_memory::segment_manager> ShmemAllocator;
typedef vector<double, ShmemAllocator> MyVector;
const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 2;
const int query_size = 100000;
int main(int argc, char *argv[]) {
std::uniform_real_distribution<double> unif(0, 10000);
std::default_random_engine re;
std::vector<double> data;
data.reserve(total_size);
for (int i = 0; i < total_size; ++i) {
data.push_back(unif(re));
}
std::vector<int> query;
query.reserve(query_size);
for (int i = 0; i < query_size; ++i) {
query.push_back(rand() % total_size);
}
struct shm_remove {
shm_remove() { shared_memory_object::remove("MySharedMemory"); }
~shm_remove() { shared_memory_object::remove("MySharedMemory"); }
} remover;
managed_shared_memory segment(create_only, "MySharedMemory", mem_size);
const ShmemAllocator alloc_inst(segment.get_segment_manager());
MyVector *myvector = segment.construct<MyVector>("MyVector")(alloc_inst);
myvector->reserve(total_size);
for (auto d : data) myvector->push_back(d);
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
double res = (*myvector)[q];
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
return 0;
}
For std::vector:
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>
#include <vector>
const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 8;
const int query_size = 100000;
int main(int argc, char *argv[]) {
std::uniform_real_distribution<double> unif(0, 10000);
std::default_random_engine re;
std::vector<double> data;
data.reserve(total_size);
for (int i = 0; i < total_size; ++i) {
data.push_back(unif(re));
}
std::vector<int> query;
query.reserve(query_size);
for (int i = 0; i < query_size; ++i) {
query.push_back(rand() % total_size);
}
std::vector<double> myvector;
myvector.reserve(total_size);
for (auto d : data) myvector.push_back(d);
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
double res = myvector[q];
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
return 0;
}
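One caveat about both timed loops: res is written but never read, so an optimizing compiler may delete the loop entirely, and both programs would then just measure clock overhead. A minimal guard (the sum accumulator is my addition, not part of the original code) that works for either version:

double sum = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
    sum += (*myvector)[q]; // myvector[q] in the std::vector version
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << sum << '\n'; // using the result keeps the loads alive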
Related
I have this writer, which I run like so: ./writer 12 14
It creates two shared memory segments with an spsc_queue in each one.
The writer just sends text with a counter to the spsc_queue in each segment.
#include <boost/container/scoped_allocator.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/string.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/lockfree/spsc_queue.hpp>
#include <boost/process.hpp>
#include <chrono>
#include <iostream>
#include <thread>
using namespace std::chrono_literals;
namespace bip = boost::interprocess;
namespace chro = std::chrono;
namespace blf = boost::lockfree;
using char_alloc = bip::allocator<char, bip::managed_shared_memory::segment_manager>;
using shared_string = bip::basic_string<char, std::char_traits<char>, char_alloc>;
using ring_buffer = blf::spsc_queue<shared_string, blf::capacity<200>>;
int main(int argc, char* argv[]) {
if (argc > 2) { // both segment ids are required
std::string n1 = argv[1];
std::string n2 = argv[2];
const std::string shm_name1 = "segmentrb" + n1;
const std::string shm_name2 = "segmentrb" + n2;
const std::string qname = "queue";
boost::interprocess::shared_memory_object::remove(shm_name1.c_str());
boost::interprocess::shared_memory_object::remove(shm_name2.c_str());
bip::managed_shared_memory seg1(bip::open_or_create, shm_name1.c_str(), 10'000);
char_alloc char_alloc1(seg1.get_segment_manager());
ring_buffer* qu1 = seg1.find_or_construct<ring_buffer>(qname.c_str())();
bip::managed_shared_memory seg2(bip::open_or_create, shm_name2.c_str(), 10'000);
char_alloc char_alloc2(seg2.get_segment_manager());
ring_buffer* qu2 = seg2.find_or_construct<ring_buffer>(qname.c_str())();
int counter = 0;
while (true) {
std::string text1 = "Text from 1, count ";
text1.append(std::to_string(counter));
qu1->push(shared_string(text1.c_str(), char_alloc1));
std::string text2 = "Text from 2, count ";
text2.append(std::to_string(counter));
qu2->push(shared_string(text2.c_str(), char_alloc2));
std::this_thread::sleep_for(std::chrono::milliseconds(1));
counter++;
}
}
}
Then I have this reader, which reads and pops the spsc_queues of the two segments.
I run this with: ./reader 12 14
#include <boost/container/scoped_allocator.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/string.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/lockfree/spsc_queue.hpp>
#include <boost/process.hpp>
#include <boost/unordered_map.hpp>
#include <chrono>
#include <iostream>
#include <thread>
using namespace std::chrono_literals;
namespace bip = boost::interprocess;
namespace chro = std::chrono;
namespace blf = boost::lockfree;
using char_alloc = bip::allocator<char, bip::managed_shared_memory::segment_manager>;
using shared_string = bip::basic_string<char, std::char_traits<char>, char_alloc>;
using ring_buffer = blf::spsc_queue<shared_string, blf::capacity<200>>;
int main(int argc, char* argv[]) {
if (argc > 2) { // both segment ids are required
std::string n1 = argv[1];
std::string n2 = argv[2];
const std::string shm_name1 = "segmentrb" + n1;
const std::string shm_name2 = "segmentrb" + n2;
const std::string qname = "queue";
bip::managed_shared_memory seg1(bip::open_only, shm_name1.c_str());
char_alloc char_alloc1(seg1.get_segment_manager());
ring_buffer* qu1 = seg1.find<ring_buffer>(qname.c_str()).first;
bip::managed_shared_memory seg2(bip::open_only, shm_name2.c_str());
char_alloc char_alloc2(seg2.get_segment_manager());
ring_buffer* qu2 = seg2.find<ring_buffer>(qname.c_str()).first;
while (true) {
shared_string v1(char_alloc1);
shared_string v2(char_alloc2);
qu1->pop(v1);
qu2->pop(v2);
long lv1 = v1.length();
long lv2 = v2.length();
long lvs = lv1 + lv2;
if (lvs > 0) {
if (lv1 > 0) {
std::cout << "Rec1: " << v1 << "\n";
}
if (lv2 > 0) {
std::cout << "Rec2: " << v2 << "\n";
}
}
else {
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
}
}
}
When I kill -9 the reader, I get this on the writer:
terminate called after throwing an instance of 'boost::interprocess::bad_alloc'
what(): boost::interprocess::bad_alloc
Aborted (core dumped)
How can I avoid the writer being killed?
I reviewed your producer code, removing the duplication:
Live On Coliru
#include <boost/container/scoped_allocator.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/string.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/lockfree/spsc_queue.hpp>
#include <boost/process.hpp>
#include <chrono>
#include <deque>
#include <iostream>
#include <thread>
using namespace std::chrono_literals;
namespace bip = boost::interprocess;
namespace blf = boost::lockfree;
using namespace std::literals;
namespace Shared {
using Segment = bip::managed_shared_memory;
using Mgr = Segment::segment_manager;
template <typename T> using Alloc = bip::allocator<T, Mgr>;
using shared_string = bip::basic_string<char, std::char_traits<char>, Alloc<char>>;
using ring_buffer = blf::spsc_queue<shared_string, blf::capacity<200>>;
struct Queue {
Queue(std::string const& name, std::string const& qname = "queue")
: _name(name)
, _buffer(_segment.find_or_construct<ring_buffer>(qname.c_str())()) {}
std::string const& name() const { return _name; }
bool push(std::string const& item) {
return _buffer->push(
shared_string(item.begin(), item.end(), _segment.get_segment_manager()));
}
private:
std::string const _name;
struct pre_remover_t {
pre_remover_t(std::string const& name) {
bip::shared_memory_object::remove(name.c_str());
}
} _pre_remover{_name};
bip::managed_shared_memory _segment{bip::open_or_create, _name.c_str(), 10'000};
ring_buffer* _buffer = nullptr;
};
} // namespace Shared
int main(int argc, char* argv[]) {
std::deque<Shared::Queue> queues;
for (auto& arg : std::vector<std::string>(argv + 1, argv + argc))
queues.emplace_back("segmentrb" + arg);
for (int counter = 0; true; ++counter) {
for (auto& q : queues)
q.push("Text from " + q.name() + ", count " + std::to_string(counter));
std::this_thread::sleep_for(1ms);
}
}
Reading that makes the problem readily evident: you are pushing elements every 1ms.
Killing the reader gives you 200*1ms = 0.2s before the queue is full. From then on nothing is popped (and destroyed) anymore, so the tiny 10,000-byte segment runs out of memory, and constructing the next temporary shared_string throws boost::interprocess::bad_alloc (spsc_queue::push itself merely returns false on a full queue).
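If the writer must survive a vanished reader regardless, here is a sketch of a more forgiving push for the Queue wrapper above (dropping messages on failure is my assumption about what the application can tolerate):

bool push(std::string const& item) {
    try {
        // The temporary shared_string allocates from the managed segment;
        // once nothing consumes (and frees) entries, that allocation throws.
        return _buffer->push(
            shared_string(item.begin(), item.end(), _segment.get_segment_manager()));
    } catch (bip::bad_alloc const&) {
        return false; // drop the message instead of terminating the process
    }
}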
UPDATE
From the comments, a version that combines producer and consumer:
Live On Coliru
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/string.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/lockfree/spsc_queue.hpp>
#include <deque>
#include <iomanip>
#include <iostream>
#include <ranges>
#include <thread>
namespace bip = boost::interprocess;
namespace blf = boost::lockfree;
using namespace std::literals;
using std::this_thread::sleep_for;
using std::ranges::views::transform;
namespace Shared {
static constexpr std::string_view queue_name = "queue";
using Segment = bip::managed_shared_memory;
struct pre_remover_t {
pre_remover_t(std::string const& name) { bip::shared_memory_object::remove(name.c_str()); }
};
using Mgr = Segment::segment_manager;
template <typename T> using Alloc = bip::allocator<T, Mgr>;
using String = bip::basic_string<char, std::char_traits<char>, Alloc<char>>;
using ring_buffer = blf::spsc_queue<String, blf::capacity<200>>;
struct Producer {
Producer(std::string const& name)
: _name(name)
, _buffer(_segment.find_or_construct<ring_buffer>(queue_name.data())()) {}
std::string const& name() const { return _name; }
bool push(std::string const& item) {
std::cerr << "push: " << quoted(item) << std::endl;
return _buffer->push(
String(item.begin(), item.end(), _segment.get_segment_manager()));
}
private:
std::string const _name;
pre_remover_t _pre_remover{_name};
Segment _segment{bip::open_or_create, _name.c_str(), 10'000};
ring_buffer* _buffer = nullptr;
};
struct Consumer {
Consumer(std::string const& name)
: _name(name)
, _buffer(_segment.find_or_construct<ring_buffer>(queue_name.data())()) {}
String pop() {
String r(_segment.get_segment_manager());
_buffer->pop(r);
return r;
}
private:
std::string const _name;
Segment _segment{bip::open_only, _name.c_str()};
ring_buffer* _buffer = nullptr;
};
} // namespace Shared
int main(int argc, char* argv[]) {
std::deque<std::string> args(argv + 1, argv + argc);
bool const is_producer = args.front() == "producer";
args.pop_front();
if (is_producer) {
std::deque<Shared::Producer> queues;
for (auto& arg : args)
queues.emplace_back("segmentrb" + arg);
for (int counter = 0; true; ++counter) {
for (auto& q : queues)
q.push("Text from " + q.name() + ", count " + std::to_string(counter));
sleep_for(1s);
}
} else { // consumer
std::deque<Shared::Consumer> queues;
for (auto& arg : args)
queues.emplace_back("segmentrb" + arg);
for (;;) {
auto no_data = true;
for (int index = 0; auto&& v : queues | transform(&Shared::Consumer::pop)) {
if (!v.empty()) {
no_data = false;
std::cout << "Rec" << ++index << ": " << v << "\n";
}
}
if (no_data) {
std::cerr << "Consumer no-data cycle" << std::endl;
sleep_for(2s);
}
}
}
}
For a local demonstration (the invocation is inferred from the argument handling above, so treat it as an assumption): run ./a.out producer 12 14 in one terminal and ./a.out consumer 12 14 in another; the first argument selects the role and the remaining arguments name the segments.
I created test code that computes complex matrices in parallel on the CPU.
I observed that it takes around 3 seconds to finish all the blocks.
Can someone explain why it takes so long?
Code
Utils.hpp
#pragma once
#include <chrono>
#include <armadillo>
namespace utils
{
class watch : std::chrono::steady_clock {
time_point start_ = now();
public: auto elapsed_sec() const {return std::chrono::duration<double>(now() - start_).count();}
};
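// Computes C := A * A^H (a Hermitian rank-k update, uplo = 'U') via the BLAS
// herk routine, then mirrors the upper triangle into the lower triangle.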
void op_herk(arma::cx_mat && A, arma::cx_mat & C)
{
using blas_int = int;
using T = double;
const char uplo = 'U';
const char trans_A = 'N';
const auto n = blas_int(C.n_cols);
const auto k = blas_int(A.n_cols);
const T local_alpha = T(1);
const T local_beta = T(0);
const blas_int lda = n;
arma::blas::herk<T>( &uplo, &trans_A, &n, &k, &local_alpha, A.mem, &lda, &local_beta, C.memptr(), &n);
arma::herk_helper::inplace_conj_copy_upper_tri_to_lower_tri(C);
}
}
ThreadPool.h
#pragma once
#include <boost/thread.hpp>
#include <boost/asio.hpp>
#include <boost/asio/thread_pool.hpp>
class ThreadPool {
public:
explicit ThreadPool(size_t size = boost::thread::hardware_concurrency()) : threadPool(size)
{ }
template<typename F>
void addTask(F &&f)
{
boost::asio::post(threadPool, std::forward<F>(f));
}
void wait()
{
threadPool.wait();
}
~ThreadPool()
{
threadPool.join();
}
private:
boost::asio::thread_pool threadPool;
};
main.cpp
#include <armadillo>
#include "Utils.h"
#include "ThreadPool.h"
int main() {
ThreadPool threadPool;
arma::cx_mat test (256, 30000 , arma::fill::randu);
arma::vec averageTime(30, arma::fill::zeros);
std::vector<arma::cx_mat > results(30);
for(auto &it : results)
it.set_size(256, 256);
{
for(int i = 0; i < 30; ++i)
{
threadPool.addTask([i = i, &results, &averageTime, test = test.submat(arma::span::all, arma::span(0, 20000)), _ = utils::watch() ]() {
utils::op_herk(test, results[i]);
arma::vec r = arma::sort(arma::eig_sym(results[i]), "descend");
std::cout << _.elapsed_sec() << '\n';
averageTime[i] = _.elapsed_sec();
});
}
threadPool.wait();
std::cout << "average " << arma::sum(averageTime)/averageTime.size() <<std::endl;
}
return 0;
}
Parameters:
gcc 9.4
computer: Intel, 6 cores / 12 threads
armadillo 10.7.3
openblas 0.3.17
CMake flags: set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -msse2 -O3 -mtune=native -flto")
My results:
1.16084
1.16434
1.16571
1.16601
1.17055
1.17118
1.17382
1.17511
1.1767
1.17981
1.18254
1.18537
2.40071
2.40225
2.4025
2.40511
2.40545
2.40565
2.40583
2.40941
2.40972
2.40974
2.41172
2.41291
3.23446
3.23592
3.23734
3.23972
3.24305
3.24484
3.24728
average 2.14871
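One thing worth noting about how the times are measured (this alone may explain the spread): the watch is created by the init-capture _ = utils::watch() when addTask is called, not when the task starts running, so each reported time includes the time the task waited in the pool's queue. With 30 tasks on 12 hardware threads that naturally produces the three plateaus (12 tasks around 1.17s, 12 around 2.40s, 6 around 3.23s). A sketch that times only the computation (same lambda, watch moved inside the body):

threadPool.addTask([i, &results, &averageTime,
                    test = test.submat(arma::span::all, arma::span(0, 20000))]() {
    utils::watch w; // starts when the task actually begins executing
    utils::op_herk(test, results[i]);
    arma::vec r = arma::sort(arma::eig_sym(results[i]), "descend");
    averageTime[i] = w.elapsed_sec();
    std::cout << w.elapsed_sec() << '\n';
});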
I tried to use multiple threads to insert into a boost::bimap. I have some shared variables between the threads, which I need to pass by reference, and some of them are modified by each thread. However, I get the error:
Segmentation fault (core dumped)
I have the following code. I have tried to avoid concurrent access to the variables by using std::lock_guard<std::mutex> lock(mtx), but I am not able to make it work.
parallel_index.cpp
#include <iostream>
#include <string>
#include <algorithm>
#include <thread>
#include <mutex>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include "parallel_index.h"
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string> > bimap_reference;
typedef bimap_reference::value_type position;
bimap_reference reference_index_vector;
size_t total_threads = std::thread::hardware_concurrency();
std::string sequence_content = "ABCDDBACDDDCBBAAACBDAADCBDAAADCBDADADACBDDCBBBCDCBCDAADCBBCDAAADCBDA";
uint64_t sequence_length = sequence_content.length();
int split = 5;
uint64_t erase_length = 0;
unsigned int seq_itr = 0;
std::mutex mtx; // to protect against concurent access
int main(){
thread_test::create_index index;
std::thread threads[total_threads];
std::cout << total_threads << " threads launched" << std::endl;
for(unsigned int i = 0; i < total_threads; i++){
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(sequence_length), std::ref(split), std::ref(sequence_content), std::ref(erase_length));
}
for(unsigned int i = 0; i < total_threads; i++){
threads[i].join();
}
}
/*
* Creating index
*/
void thread_test::create_index::reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, uint64_t &erase_length ){
for (; seq_itr < sequence_length; ++seq_itr ){
std::lock_guard<std::mutex> lock(mtx);
std::string splitstr = sequence_content.substr(erase_length, split);
reference_index_vector.insert(position(seq_itr, splitstr));
seq_itr += split-1;
erase_length += split;
if(erase_length > 10000){
sequence_content.erase(0,erase_length);
erase_length = 0;
}
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
parallel_index.h
#ifndef PARALLEL_INDEX_H_
#define PARALLEL_INDEX_H_
#include<iostream>
#include <algorithm>
#include <utility>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
//typedef boost::unordered_map<int, std::pair<int, unsigned long int>& > reference_map;
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string > > bimap_reference;
typedef bimap_reference::value_type position;
extern bimap_reference reference_index_vector;
namespace thread_test{
class create_index{
public:
void reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, uint64_t &erase_length);
};
}
#endif /* PARALLEL_INDEX_H_ */
-------------------------------EDIT---------------------------------
I tried to divide the string content into partitions, one per thread, so that each thread has its part available locally. But nothing seems to work. Sometimes it finishes the first thread and then stops with a Segmentation fault (core dumped).
parallel_index.cpp
#include <iostream>
#include <string>
#include <algorithm>
#include <thread>
#include <mutex>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include "parallel_index.h"
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string> > bimap_reference;
typedef bimap_reference::value_type position;
bimap_reference reference_index_vector;
//create threads
size_t total_threads = std::thread::hardware_concurrency();
std::string sequence_content = "ABCDDBACDDDCBBAAACBDAADCBDAAADCBDADADACBDDCBBBCDCBCDAADCBBCDAAADCBDADDCCCAAABBBAAACDCA";
uint64_t sequence_length = sequence_content.length();
int split = 5;
// split the sequence_content equal to the number of threads, and assign each partition to each thread.
uint64_t each_partition_len = sequence_content.length()/total_threads- (sequence_content.length()/total_threads)%split ;
uint64_t last_partition_len = sequence_content.length()/total_threads +
(((sequence_content.length()/total_threads)%split)*(total_threads-1)) + sequence_content.length()%total_threads;
std::mutex mtx; // to protect against concurent access
int main(){
thread_test::create_index index;
std::thread threads[total_threads];
std::cout << total_threads << " threads launched" << std::endl;
for(unsigned int i = 0; i < total_threads; i++){
if(i < total_threads-1)
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(each_partition_len), std::ref(split), std::ref(sequence_content), i);
else
threads[i] = std::thread(&thread_test::create_index::reference_index_hash, index,
std::ref(last_partition_len), std::ref(split), std::ref(sequence_content), i);
//std::lock_guard<std::mutex> lck(mtx);
std::cout << "launched thread " << i << "with id " << threads[i].get_id() << std::endl;
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
for( unsigned int i = 0; i < total_threads; ++i){
if(threads[i].joinable()){
std::cout << "trying to join thread " << i << std:: endl;
threads[i].join();
std::cout << "thread joined " << i << std:: endl;
}
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
/*
* Creating index
*/
void thread_test::create_index::reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, int i ){
uint64_t seq_strt = 0;
// set seq_strt
if(i == 0)
seq_strt = sequence_length * i;
else
seq_strt = sequence_length * i + 1;
for (uint64_t seq_itr = seq_strt; seq_itr <= sequence_length; ++seq_itr ){
std::string splitstr = sequence_content.substr(seq_itr, split);
mtx.lock();
//std::lock_guard<std::mutex> lock(mtx);
reference_index_vector.insert(position(seq_itr, splitstr));
mtx.unlock();
seq_itr += split-1;
}
}
parallel_index.h
#ifndef PARALLEL_INDEX_H_
#define PARALLEL_INDEX_H_
#include<iostream>
#include <algorithm>
#include <utility>
#include <boost/bimap.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
namespace bimaps = boost::bimaps;
typedef boost::bimap<bimaps::unordered_set_of<uint64_t>,
bimaps::unordered_multiset_of<std::string > > bimap_reference;
typedef bimap_reference::value_type position;
extern bimap_reference reference_index_vector;
namespace thread_test{
class create_index{
public:
void reference_index_hash(uint64_t &sequence_length, int &split,
std::string &sequence_content, int i);
};
}
#endif /* PARALLEL_INDEX_H_ */
I feel the culprit for the segmentation fault is nothing but static linking of the libraries. It is not caused by incrementing seq_itr to a value bigger than the actual sequence length, because your for loop will never be entered if seq_itr is greater than the actual sequence length. Try removing the -static flag: that should make the segmentation fault go away, although it does not guarantee the correctness of the rest of the code. More details about segmentation faults with threads can be found here.
All the threads will try to take the lock in the critical section; to keep the bimap intact, you need a condition variable so that the threads execute in order. This matters because you are using seq_itr as a local variable inside reference_index_hash() and it needs to be incremented in the proper sequence.
One problem in your original code is that unsigned int seq_itr is accessed without synchronization from multiple threads. Besides yielding invalid results, this might increment seq_itr past the actual sequence length, and the subsequent accesses might crash.
The new code addresses this by just passing indexes, which should be OK as long as those indexes are non-overlapping and correctly calculated. I can't follow the logic completely, but in case your seq_strt calculation is off the program might also crash due to an invalid index. Should be easy to verify in a debugger or with some index assertions.
However there is an issue in the second code example with printing the map directly after threads are started with
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
This will not yield correct results, since the map is concurrently accessed by all worker threads. Access after the join()s is safe.
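A minimal sketch of the pattern both answers point toward (my assumptions: each thread gets a precomputed, non-overlapping [begin, end) range, and the sequence_content.erase(...) compaction is dropped so that concurrent reads of the string are safe):

void index_range(uint64_t begin, uint64_t end, int split,
                 std::string const& sequence) {
    for (uint64_t pos = begin; pos + split <= end; pos += split) {
        std::string chunk = sequence.substr(pos, split); // read-only: no lock needed
        std::lock_guard<std::mutex> lock(mtx); // the bimap insert is the only shared write
        reference_index_vector.insert(position(pos, chunk));
    }
}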
Using packaged_task, I collected all the futures in a vector. After that, I push_back the values obtained from the futures with get(). However, I get the wrong answer. Can anyone help? Thank you very much.
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>
using namespace std;
vector<int> subFun(int n) {
vector<int> a{ 2 * n, 3 * n };
return a;
}
int main() {
vector<boost::future<vector<int>>> g;
vector<vector<int>> x(10, vector<int>(2));
int i;
for (i = 0; i < 10; i++) {
boost::packaged_task<vector<int>> task{ boost::bind(&subFun, i) };
g.push_back(task.get_future());
boost::thread t{ std::move(task) };
}
for (auto& m : g) {
x.push_back(m.get());
}
cout << x[3][0] << endl;//should be 6, now is 0
return 0;
}
The real issue is that you push_back into x, but you already initialized it here:
vector<vector<int>> x(10, vector<int>(2));
So, you just add 10 more elements instead of putting the results at indices 0..9. I'd suggest not pre-filling, like @patrick's answer does, or instead filling the designated slot:
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>
using namespace std;
void subFun(int n, vector<int>& into) {
into = { 2 * n, 3 * n };
}
int main() {
vector<boost::future<void>> futures;
vector<vector<int>> x(10, vector<int>(2));
for (size_t i = 0; i < x.size(); i++) {
boost::packaged_task<void> task{ boost::bind(&subFun, i, std::ref(x[i])) };
futures.push_back(task.get_future());
boost::thread(std::move(task)).detach();
}
for (auto& f : futures)
f.wait();
cout << x[3][0] << endl;
}
Of course, you can get more elaborate:
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>
struct TaskResult {
int index;
std::vector<int> data;
};
TaskResult subFun(int n) {
return { n, { 2 * n, 3 * n } };
}
int main() {
std::vector<boost::future<TaskResult>> futures;
std::vector<std::vector<int>> x(10, std::vector<int>(2));
for (size_t i = 0; i < x.size(); i++) {
boost::packaged_task<TaskResult> task{ boost::bind(&subFun, i) };
futures.push_back(task.get_future());
boost::thread(std::move(task)).detach();
}
for (auto& f : futures) {
auto r = f.get();
x[r.index] = r.data;
}
std::cout << x[3][0] << std::endl;
}
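The design choice in this variant: each task returns its index together with its data, so the workers share no mutable state at all; only the collecting loop, which runs in a single thread, writes into x.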
After much tinkering, I found this program works without abort traps (which I'm surprised you weren't getting):
#include <future>
#include <thread>
#include <functional>
#include <vector>
#include <iostream>
std::vector<int> subFun(int n) {
std::vector<int> a { 2 * n, 3 * n };
return a;
}
int main() {
std::vector<std::future<std::vector<int>>> g;
std::vector<std::vector<int>> x;
int i;
for (i = 0; i < 10; i++) {
std::packaged_task<std::vector<int>(int)> task{ subFun };
g.push_back(task.get_future());
std::thread { std::move(task), i }.detach();
}
for (auto& m : g) {
m.wait();
x.push_back(m.get());
}
std::cout << x[3][0] << std::endl; // is now 6
return 0;
}
Convert to boost as necessary. This answer was extremely helpful in finding a couple of key issues.
I ported a Java GC test program to C++ (see the code below) as well as Python. The Java and Python versions greatly outperform the C++ one, and I was thinking this was due to all the calls to new needed to create the strings each time. I've tried using Boost's fast_pool_allocator, but that actually worsened performance from 700ms to 1200ms. Am I using the allocator wrong, or is there something else I should be doing?
EDIT: Compiled with g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system. g++ is version 4.8.1
One iteration takes about 300ms in Python and about 50ms in Java. std::allocator gives about 700ms and boost::fast_pool_allocator about 1200ms.
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
//#include <gc/gc_allocator.h>
using namespace std;
#include <sstream>
typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;
namespace patch {
template <typename T> pool_string to_string(const T& in) {
std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
stm << in;
return stm.str();
}
}
#include "mytime.hpp"
class Garbage {
public:
vector<pool_string> outer;
vector<pool_string> old;
const int nThreads = 1;
//static auto time = chrono::high_resolution_clock();
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
auto tt = mytime::msecs();
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
tt = mytime::msecs();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
auto t = mytime::msecs();
cout << (t - tt) << endl;
tt = t;
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
The problem is that you're constructing a new string stream each time you convert an integer, and the construction and teardown of the stream dominates the runtime.
Fix it:
namespace patch {
template <typename T> pool_string to_string(const T& in) {
return boost::lexical_cast<pool_string>(in);
}
}
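(The gain comes from not building and tearing down a string stream, with its pool-allocated buffer, on every conversion; boost::lexical_cast takes a lighter path for arithmetic-to-string conversions, and std::to_string, used in the next answer, avoids the stream in the same way.)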
Now the timings are:
DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185
real 0m0.801s
user 0m0.784s
sys 0m0.016s
See it Live On Coliru
Full code for reference:
#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>
using string = std::string;
namespace patch {
template <typename T> string to_string(const T& in) {
return boost::lexical_cast<string>(in);
}
}
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
std::vector<string> outer;
std::vector<string> old;
const int nThreads = 1;
void go() {
outer.resize(1000000);
//old.reserve(1000000);
Timer timer;
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
std::cout << "DOING AN OLD" << std::endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
std::cout << timer.elapsed() << std::endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
Since I don't use boost on my machine, I simplified the code to use standard C++11 to_string (thus accidentally "fixing" the problem sehe found), and got this:
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
//#include <gc/gc_allocator.h>
#include <sstream>
using namespace std;
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
vector<string> outer;
vector<string> old;
const int nThreads = 1;
Timer timer;
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(to_string(j));
outer.clear();
cout << timer.elapsed() << endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(to_string(i));
}
};
int main() {
Garbage().go();
}
Compiling with:
$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835
A clang 3.5 build (from source dated Friday, April 18, 2014) gives similar results with the same compiler options.
My processor is an AMD Phenom(tm) II X4 965, running at 3.6GHz (if I remember right).