OpenMP parallelize but not accelerate code - c++

I've been trying to accelerate the following code
/// Times one OpenMP-parallel sweep of CalcStiff over every computational
/// element of @p cmesh and prints the elapsed wall-clock time.
///
/// @param cmesh   mesh whose elements are visited; null element slots are skipped
/// @param nthread number of OpenMP threads requested for the sweep
void calcstiffTestOMP(TPZCompMesh *cmesh,int nthread){
    const int64_t nelem = cmesh->NElements();
    omp_set_num_threads(nthread);
    auto beginCalcStiff = std::chrono::high_resolution_clock::now();
    // Elements do not all cost the same to integrate. Handing out one
    // iteration at a time keeps a thread that drew the expensive elements
    // from becoming the bottleneck, which is what happens with the default
    // static split. No reduction clause is needed: nothing is accumulated.
    #pragma omp parallel for schedule(dynamic,1)
    for (int64_t iel = 0; iel < nelem; iel++)
    {
        TPZCompEl *el = cmesh->Element(iel);
        if (!el) continue;
        // Element matrices are private to the iteration, so threads share no output.
        TPZElementMatrix ek(cmesh, TPZElementMatrix::EK), ef(cmesh, TPZElementMatrix::EF);
        el->CalcStiff(ek, ef);
    }
    auto endCalcStiff = std::chrono::high_resolution_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(endCalcStiff - beginCalcStiff);
    unsigned long int duration = static_cast<unsigned long int>(elapsed.count());
    std::cout << " CalcStiff parallel duration omp = " << duration*1E-9 << " seconds, with nthreads= " << nthread << std::endl;
}
but the simulation time hasn't changed, independently of the number of threads.
The same code has been accelerated when using TBB directives, as follows
/// Times one TBB-parallel sweep of CalcStiff over every computational element
/// of @p cmesh and prints the elapsed wall-clock time. The measured interval
/// includes querying the element count and initialising the TBB scheduler.
///
/// @param cmesh   mesh whose elements are visited; null element slots are skipped
/// @param nthread thread count handed to the TBB scheduler
void calcstiffTestTBB(TPZCompMesh *cmesh,int nthread){
    using WallClock = std::chrono::high_resolution_clock;
    const WallClock::time_point sweepStart = WallClock::now();

    const int64_t elementCount = cmesh->NElements();
    tbb::task_scheduler_init init(nthread);

    // Assembles every element whose index falls inside one sub-range.
    const auto assembleRange = [&](const tbb::blocked_range<int64_t> &range){
        int64_t index = range.begin();
        while (index < range.end())
        {
            TPZCompEl *element = cmesh->Element(index);
            ++index;
            if (element == nullptr) continue;
            TPZElementMatrix ek(cmesh, TPZElementMatrix::EK), ef(cmesh, TPZElementMatrix::EF);
            element->CalcStiff(ek, ef);
        }
    };
    tbb::parallel_for(tbb::blocked_range<int64_t>(0, elementCount), assembleRange);

    const WallClock::time_point sweepEnd = WallClock::now();
    const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(sweepEnd - sweepStart);
    const unsigned long int ticks = static_cast<unsigned long int>(nanos.count());
    std::cout << " CalcStiff duration tbb= " << ticks*1E-9 << " seconds, with nthreads= "<<nthread << std::endl;
}
The calcstiff function is given by:
// Computes the element stiffness matrix (ek) and load vector (ef) of this
// multiphysics element by numerically integrating the material contribution
// over the reference geometric element.
//   ek - element matrix filled through material->Contribute
//   ef - element right-hand side filled through material->Contribute
void TPZMultiphysicsCompEl<TGeometry>::CalcStiff(TPZElementMatrix &ek, TPZElementMatrix &ef)
{
TPZMaterial * material = Material();
// Without a material there is nothing to integrate: report and hand back reset matrices.
if(!material){
PZError << "Error at " << __PRETTY_FUNCTION__ << " this->Material() == NULL\n";
ek.Reset();
ef.Reset();
return;
}
// A null material contributes nothing; only the matrix type tags are set.
TPZNullMaterial *nullmat = dynamic_cast<TPZNullMaterial *>(material);
if(nullmat)
{
ek.Reset();
ef.Reset();
ek.fType = TPZElementMatrix::EK;
ef.fType = TPZElementMatrix::EF;
return;
}
InitializeElementMatrix(ek,ef);
if (this->NConnects() == 0) return;//boundary discontinuous elements have this characteristic
// One material-data record per entry of fElementVec.
TPZManVector<TPZMaterialData,6> datavec;
const int64_t nref = fElementVec.size();
datavec.resize(nref);
InitMaterialData(datavec);
TPZManVector<TPZTransform<> > trvec;
AffineTransform(trvec);
int dim = Dimension();
TPZAutoPointer<TPZIntPoints> intrule;
TPZManVector<REAL,4> intpointtemp(TGeometry::Dimension,0.);
REAL weight = 0.;
TPZManVector<int,4> ordervec;
//ordervec.resize(nref);
// Collect one polynomial order per entry that is an interpolation space;
// entries of any other kind are skipped and add nothing to ordervec.
for (int64_t iref=0; iref<nref; iref++)
{
TPZInterpolationSpace *msp = dynamic_cast <TPZInterpolationSpace *>(fElementVec[iref].Element());
int svec;
if(msp)
{
ordervec.Resize(ordervec.size()+1);
svec = ordervec.size();
}
else
{
continue;
}
// The order is hard-coded to 1 here; the element's own maximum order is not queried.
datavec[iref].p =1; //msp->MaxOrder();
ordervec[svec-1] = datavec[iref].p;
}
// The material decides the integration order from the collected orders.
int order = material->IntegrationRuleOrder(ordervec);
TPZGeoEl *ref = this->Reference();
// Rule is built on the highest-numbered side of the geometric element
// (presumably the element interior - TODO confirm against TPZGeoEl).
intrule = ref->CreateSideIntegrationRule(ref->NSides()-1, order);
TPZManVector<int,4> intorder(dim,order);
intrule->SetOrder(intorder);
int intrulepoints = intrule->NPoints();
// Guard against an unexpectedly large rule.
if(intrulepoints > 1000) {
DebugStop();
}
TPZFMatrix<REAL> jac, axe, jacInv;
REAL detJac;
for(int int_ind = 0; int_ind < intrulepoints; ++int_ind)
{
intrule->Point(int_ind,intpointtemp,weight);
ref->Jacobian(intpointtemp, jac, axe, detJac , jacInv);
// Scale the quadrature weight by the absolute Jacobian determinant.
weight *= fabs(detJac);
// Record the current integration-point index on every interpolation-space entry.
for (int i = 0; i < fElementVec.size(); i++) {
TPZInterpolationSpace *msp = dynamic_cast <TPZInterpolationSpace *>(fElementVec[i].Element());
if (!msp) {
continue;
}
datavec[i].intLocPtIndex = int_ind;
}
this->ComputeRequiredData(intpointtemp,trvec,datavec);
material->Contribute(datavec,weight,ek.fMat,ef.fMat);
}//loop over integration points
CleanupMaterialData(datavec);
}
The code has been parallelized but not accelerated. Are we using omp directives correctly?
Does OMP create mutex sections on its own? Is there a better way to parallelize this code using OMP?
PZ is a library which serves as a template for finite element code, and can be found at https://github.com/labmec/neopz
SOLUTION: Adding the 'schedule(dynamic,1)' flag to '#pragma' directive solved this problem. The code had different thread workloads
and OMP was using a single thread to handle the slow section of the loop. The 'schedule(dynamic,1)' directive solved this.

Related

Is it normal that my code is faster without multithreading, or am I doing something wrong?

I'm trying to optimize my code using multithreading. Not only is the program not twice as fast, as it is supposed to be on this dual-core computer; it is MUCH slower. I just want to know whether I'm doing something wrong, or whether it is normal that multithreading does not help in this case. I made this reproduction of how I use multithreading; on my computer the parallel version takes 4 times as long as the normal version:
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <vector>
using namespace std;
// Single default-seeded engine consulted by every call to get(); no
// synchronization is performed around it here.
default_random_engine ran;

// Advances the shared engine once and reports whether the drawn value is
// not a multiple of three.
inline bool get(){
    const auto draw = ran();
    return (draw % 3) != 0;
}
// Calls get() `repetitions` times and adds each outcome (0 or 1) to `result`,
// writing to `result` after every single call.
void normal_serie(unsigned repetitions, unsigned &result){
    for (unsigned left = repetitions; left != 0; --left) {
        result += get();
    }
}
// Splits `repetitions` calls of normal_serie across the available hardware
// threads (two when the count is unknown), running one share on the calling
// thread, and returns the sum of all per-thread counters.
//
// The last share also takes the remainder of the division, so exactly
// `repetitions` iterations are performed even when the count is not a
// multiple of the thread count.
//
// NOTE(review): each worker updates its own slot of `results` through a
// reference; the slots are adjacent in memory, so whether threads contend on
// a cache line depends on how normal_serie accumulates.
unsigned parallel_series(unsigned repetitions){
    const unsigned hardware_threads = std::thread::hardware_concurrency();
    cout << "Threads in this computer: " << hardware_threads << endl;
    const unsigned threads_number = (hardware_threads != 0) ? hardware_threads : 2;
    const unsigned its_per_thread = repetitions / threads_number;
    const unsigned leftover = repetitions % threads_number;

    // Containers own the storage, so nothing leaks if thread creation throws.
    std::vector<unsigned> results(threads_number, 0u);
    std::vector<std::thread> threads;
    threads.reserve(threads_number - 1);

    for (unsigned i = 0; i + 1 < threads_number; ++i)
        threads.emplace_back(normal_serie, its_per_thread, std::ref(results[i]));

    // The calling thread handles the final share plus any leftover iterations.
    normal_serie(its_per_thread + leftover, results[threads_number - 1]);

    for (std::thread &worker : threads)
        worker.join();

    // Unsigned seed keeps the sum in the counters' own type.
    return std::accumulate(results.begin(), results.end(), 0u);
}
int main()
{
constexpr unsigned repetitions = 100000000;
auto to = std::chrono::high_resolution_clock::now();
cout << parallel_series(repetitions) << endl;
auto tf = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Parallel duration: " << duration << "ms" << endl;
to = std::chrono::high_resolution_clock::now();
unsigned r = 0;
normal_serie(repetitions, r);
cout << r << endl;
tf = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Normal duration: " << duration << "ms" << endl;
return 0;
}
Things that I already know, but left out to keep this code shorter:
I should set a max_iterations_per_thread, because you don't want to run only 10 iterations per thread; but in this case we are doing one hundred million iterations, so that is not going to happen.
The number of iterations must be divisible by the number of threads, otherwise the code will not do all of the requested work.
This is the output that i get in my computer:
Threads in this computer: 2
66665160
Parallel duration: 4545ms
66664432
Normal duration: 1019ms
(Partially solved by making these changes:)
// Advances the caller-supplied engine once and reports whether the drawn
// value is not a multiple of three.
inline bool get(default_random_engine &ran){
    const auto draw = ran();
    return (draw % 3) != 0;
}
// Runs `repetitions` draws on a private, default-seeded engine, counts the
// successful ones locally, and adds the count to `result` once at the end.
void normal_serie(unsigned repetitions, unsigned &result){
    default_random_engine eng;
    unsigned hits = 0;
    for (unsigned left = repetitions; left != 0; --left) {
        if (get(eng)) {
            ++hits;
        }
    }
    result += hits;
}
All your threads are tripping over each other fighting for access to ran which can only perform one operation at a time because it only has one state and each operation advances its state. There is no point in running operations in parallel if the vast majority of each operation involves a choke point that cannot support any concurrency.
All elements of results are likely to share a cache line, which means there is lots of inter-core communication going on.
Try modifying normal_serie to accumulate into a local variable and only write it to results in the end.

Why does threading floating point computations on the CPU make them take significantly longer?

I am currently working on a scientific simulation (Gravitational nbody). I first wrote it with a naive single-threaded algorithm, and this performed acceptably for a small number of particles. I then multi-threaded this algorithm (it is embarrassingly parallel), and the program took about 3x as long. What follows is a minimum, complete, verifiable example of a trivial algorithm with similar properties and output to a file in /tmp (it is designed to run on Linux, but the C++ is also standard). Be warned that if you decide to run this code, it will produce a 152.62MB file. The data is outputted to prevent the compiler from optimizing the computation out of the program.
#include <iostream>
#include <functional>
#include <thread>
#include <vector>
#include <atomic>
#include <random>
#include <fstream>
#include <chrono>
// Number of outer passes; each pass writes the whole vector to the output file.
constexpr unsigned ITERATION_COUNT = 2000;
// Number of values in the vector; every pass does NUMBER_COUNT x NUMBER_COUNT work.
constexpr unsigned NUMBER_COUNT = 10000;
/// Invokes @p callback exactly once for every index in [0, count), spreading
/// the indices over the hardware threads in batches of @p batchSize.
///
/// Each worker repeatedly claims the next batch from a shared atomic cursor
/// and processes only that batch, so no index is visited twice. The call
/// returns after every worker has been joined.
///
/// @param count     number of indices to process
/// @param batchSize indices claimed per cursor update; 0 is treated as 1
/// @param callback  work item; called concurrently from several threads
///
/// NOTE(review): assumes count plus one extra batch per worker fits in
/// `unsigned`, otherwise the cursor would wrap.
void runThreaded(unsigned count, unsigned batchSize, std::function<void(unsigned)> callback){
    // hardware_concurrency() is allowed to report 0 when it cannot tell.
    unsigned threadCount = std::thread::hardware_concurrency();
    if(threadCount == 0){
        threadCount = 1;
    }
    // A zero-sized batch would never advance the cursor.
    const unsigned batch = (batchSize == 0) ? 1u : batchSize;

    std::vector<std::thread> threads;
    threads.reserve(threadCount);
    std::atomic<unsigned> currentIndex(0);
    for(unsigned t=0;t<threadCount;++t){
        threads.emplace_back([&currentIndex, batch, count, callback]{
            for(;;){
                const unsigned startAt = currentIndex.fetch_add(batch);
                if(startAt >= count){
                    return;
                }
                // Clamp the final batch to the end of the range without overflowing.
                const unsigned stopAt = (count - startAt < batch) ? count : startAt + batch;
                for(unsigned index=startAt;index<stopAt;++index){
                    callback(index);
                }
            }
        });
    }
    for(std::thread &thread : threads){
        thread.join();
    }
}
// Threaded variant of the benchmark: fills a vector from a fixed-seed
// generator, then for ITERATION_COUNT passes recomputes every entry through
// runThreaded, appends the new vector to /tmp/test-data.bin and swaps buffers.
void threadedTest(){
    std::mt19937_64 rnd(0);
    std::vector<double> numbers(NUMBER_COUNT);
    for(double &value : numbers){
        value = rnd();
    }
    std::vector<double> newNumbers = numbers;
    std::ofstream fout("/tmp/test-data.bin");

    // Computes one output entry from the whole current input vector. The
    // vectors are captured by reference, so the swap below is picked up.
    const auto computeEntry = [&numbers, &newNumbers](unsigned x){
        double total = 0;
        for(unsigned y=0;y<NUMBER_COUNT;++y){
            const unsigned delta = y-x;
            total += numbers[y]*delta*delta;
        }
        newNumbers[x] = total;
    };

    unsigned pass = 0;
    while(pass < ITERATION_COUNT){
        std::cout << "Iteration: " << pass << "/" << ITERATION_COUNT << std::endl;
        runThreaded(NUMBER_COUNT, 100, computeEntry);
        fout.write(reinterpret_cast<char*>(newNumbers.data()), newNumbers.size()*sizeof(double));
        numbers.swap(newNumbers);
        ++pass;
    }
}
// Single-threaded variant of the benchmark: fills a vector from a fixed-seed
// generator, then for ITERATION_COUNT passes recomputes every entry in a
// plain loop, appends the new vector to /tmp/test-data.bin and swaps buffers.
void unThreadedTest(){
    std::mt19937_64 rnd(0);
    std::vector<double> numbers(NUMBER_COUNT);
    for(double &value : numbers){
        value = rnd();
    }
    std::vector<double> newNumbers = numbers;
    std::ofstream fout("/tmp/test-data.bin");

    unsigned pass = 0;
    while(pass < ITERATION_COUNT){
        std::cout << "Iteration: " << pass << "/" << ITERATION_COUNT << std::endl;
        for(unsigned x=0;x<NUMBER_COUNT;++x){
            double total = 0;
            for(unsigned y=0;y<NUMBER_COUNT;++y){
                const unsigned delta = y-x;
                total += numbers[y]*delta*delta;
            }
            newNumbers[x] = total;
        }
        fout.write(reinterpret_cast<char*>(newNumbers.data()), newNumbers.size()*sizeof(double));
        numbers.swap(newNumbers);
        ++pass;
    }
}
int main(int argc, char *argv[]) {
if(argv[1][0] == 't'){
threadedTest();
}else{
unThreadedTest();
}
return 0;
}
When I run this (compiled with clang 7.0.1 on Linux), I get the following times from the Linux time command. The difference between these is similar to what I see in my real program. The entry labelled "real" is what is relevant to this question, as this is the clock time that the program takes to run.
Single-threaded:
real 6m27.261s
user 6m27.081s
sys 0m0.051s
Multi-threaded:
real 14m32.856s
user 216m58.063s
sys 0m4.492s
As such, I ask what is causing this massive slowdown when I expect it to speed up significantly (roughly by a factor of 8, as I have an 8 core 16 thread CPU). I am not implementing this on the GPU as the next step is to make some changes to the algorithm to take it from O(n²) to O(n log n), and those changes are not amenable to a GPU. The changed algorithm will differ less from my currently implemented O(n²) algorithm than the included example does. Lastly, I want to observe that the subjective time to run each iteration (judged by the time between the iteration lines appearing) changes significantly in both the threaded and unthreaded runs.
It's kind of hard to follow this code, but I think you're duplicating work on a massive scale because each thread does nearly all the work, just skipping a small portion of it at the start.
I'm presuming the inner loop of runThreaded should be:
// Keep claiming batches from the shared cursor until it passes the end of the range.
unsigned startAt = currentIndex.fetch_add(batchSize);
while (startAt < count) {
    // The loop condition already guarantees startAt < count here, so only
    // the per-index bound needs checking inside the batch.
    for(unsigned i=0;i<batchSize;++i){
        unsigned index = startAt+i;
        if(index >= count){
            return;
        }
        callback(index);
    }
    startAt = currentIndex.fetch_add(batchSize);
}
Where i < batchSize is the key here. You should only do as much work as the batch dictates, not count times, which is the whole list minus the initial offset.
With this update the code runs significantly faster. I'm not sure if it does all the required work because it's hard to tell if that's actually happening, the output is very minimal.
For easy parallelization over multiple CPUs I recommend using tbb::parallel_for. It uses the correct number of CPUs and splits the range for you, completely eliminating the risk of implementing it wrong. Alternatively, there is a parallel for_each in C++17. In other words, this problem has a number of good solutions.
Vectorizing code is a difficult problem and neither clang++-6 nor g++-8 auto-vectorizes the baseline code. Hence, for the SIMD version below I used the excellent Vc library ("portable, zero-overhead C++ types for explicitly data-parallel programming").
Below is a working benchmark that compares:
The baseline version.
SIMD version.
SIMD + multi-threading version.
#include <Vc/Vc>
#include <tbb/parallel_for.h>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
// Number of outer passes each benchmark variant performs.
constexpr int ITERATION_COUNT = 20;
// Number of scalar values processed per pass.
constexpr int NUMBER_COUNT = 20000;
// Scalar reference implementation: for ITERATION_COUNT passes, every output
// entry x becomes the sum over y of input[y] * (y - x)^2; the sum of all
// outputs of each pass is added to the returned value and the buffers swap.
double baseline() {
    std::vector<double> next(NUMBER_COUNT);
    std::vector<double> current(NUMBER_COUNT);
    std::mt19937 rnd(0);
    for(double& value : current)
        value = rnd();

    double result = 0;
    for(int pass = 0; pass != ITERATION_COUNT; ++pass) {
        for(int x = 0; x != NUMBER_COUNT; ++x) {
            double total = 0;
            for(int y = 0; y != NUMBER_COUNT; ++y) {
                const int d = y - x;
                total += current[y] * (d * d);
            }
            next[x] = total;
        }
        // Left-to-right sum of this pass's outputs, starting from 0.
        double pass_sum = 0.;
        for(const double value : next)
            pass_sum = pass_sum + value;
        result += pass_sum;
        current.swap(next);
    }
    return result;
}
// Single-threaded Vc (explicit SIMD) variant of the benchmark. Each
// Vc::double_v holds Vc::double_v::Size consecutive values, and each lane of
// an output vector accumulates the sum for one x.
double simd() {
double result = 0;
// Integer division: assumes NUMBER_COUNT is a multiple of the vector width,
// otherwise the remainder values are dropped.
constexpr int SIMD_NUMBER_COUNT = NUMBER_COUNT / Vc::double_v::Size;
using vector_double_v = std::vector<Vc::double_v, Vc::Allocator<Vc::double_v>>;
vector_double_v newNumbers(SIMD_NUMBER_COUNT);
vector_double_v numbers(SIMD_NUMBER_COUNT);
std::mt19937 rnd(0);
// Fill the input one vector at a time through an aligned scratch array.
for(auto& n : numbers) {
alignas(Vc::VectorAlignment) double t[Vc::double_v::Size];
for(double& v : t)
v = rnd();
n.load(t, Vc::Aligned);
}
// Step added to x after each output vector (presumably the width broadcast
// to every lane - TODO confirm against the Vc constructor used).
Vc::double_v const incv(Vc::double_v::Size);
for(int i = 0; i < ITERATION_COUNT; ++i) {
// x starts as the lane indexes 0, 1, 2, ... and advances by incv per output vector.
Vc::double_v x(Vc::IndexesFromZero);
for(auto& new_n : newNumbers) {
// NOTE(review): relies on default construction yielding zeros - confirm in the Vc docs.
Vc::double_v totals;
int y = 0;
// Walk every scalar input value n[j] at running index y and add
// n[j] * (y - x)^2 to all lanes at once.
for(auto const& n : numbers) {
for(unsigned j = 0; j < Vc::double_v::Size; ++j) {
auto d = y - x;
totals += n[j] * (d * d);
++y;
}
}
new_n = totals;
x += incv;
}
// Sum all lanes of all output vectors of this pass into the scalar result.
result += std::accumulate(newNumbers.begin(), newNumbers.end(), Vc::double_v{}).sum();
swap(numbers, newNumbers);
}
return result;
}
// Multi-threaded Vc variant of the benchmark: same computation as the
// single-threaded SIMD version, but output vectors are produced in groups of
// SIMD_STEP by tbb::parallel_for.
double simd_mt() {
double result = 0;
// Integer division: assumes NUMBER_COUNT is a multiple of the vector width.
constexpr int SIMD_NUMBER_COUNT = NUMBER_COUNT / Vc::double_v::Size;
using vector_double_v = std::vector<Vc::double_v, Vc::Allocator<Vc::double_v>>;
vector_double_v newNumbers(SIMD_NUMBER_COUNT);
vector_double_v numbers(SIMD_NUMBER_COUNT);
std::mt19937 rnd(0);
// Fill the input one vector at a time through an aligned scratch array.
for(auto& n : numbers) {
alignas(Vc::VectorAlignment) double t[Vc::double_v::Size];
for(double& v : t)
v = rnd();
n.load(t, Vc::Aligned);
}
// Lane indexes 0, 1, 2, ... used as the base for every x vector.
Vc::double_v const v0123(Vc::IndexesFromZero);
for(int i = 0; i < ITERATION_COUNT; ++i) {
// Number of output vectors each parallel task computes together.
constexpr int SIMD_STEP = 4;
// ix advances in strides of SIMD_STEP; each call owns output vectors [ix, ix + SIMD_STEP).
tbb::parallel_for(0, SIMD_NUMBER_COUNT, SIMD_STEP, [&](int ix) {
// x values covered by each of the SIMD_STEP output vectors of this task.
Vc::double_v xs[SIMD_STEP];
for(int is = 0; is < SIMD_STEP; ++is)
xs[is] = v0123 + (ix + is) * Vc::double_v::Size;
// NOTE(review): relies on default construction yielding zeros - confirm in the Vc docs.
Vc::double_v totals[SIMD_STEP];
int y = 0;
// One pass over the scalar inputs updates all SIMD_STEP accumulators.
for(auto const& n : numbers) {
for(unsigned j = 0; j < Vc::double_v::Size; ++j) {
for(int is = 0; is < SIMD_STEP; ++is) {
auto d = y - xs[is];
totals[is] += n[j] * (d * d);
}
++y;
}
}
// Writes SIMD_STEP vectors starting at ix; assumes SIMD_NUMBER_COUNT is a
// multiple of SIMD_STEP so the last task stays in bounds - TODO confirm.
std::copy_n(totals, SIMD_STEP, &newNumbers[ix]);
});
// Sum all lanes of all output vectors of this pass into the scalar result.
result += std::accumulate(newNumbers.begin(), newNumbers.end(), Vc::double_v{}).sum();
swap(numbers, newNumbers);
}
return result;
}
// Minimal timer: records the clock reading at construction and reports the
// time elapsed since then as floating-point seconds.
struct Stopwatch {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // Captured when the Stopwatch is created.
    Clock::time_point start_ = Clock::now();

    // Time between construction and this call.
    Seconds elapsed() const {
        const Clock::time_point now = Clock::now();
        return std::chrono::duration_cast<Seconds>(now - start_);
    }
};
// Prints a duration as fixed-point seconds with nine decimals followed by
// 's', then clears the float-field flags and restores the stream's previous
// precision.
std::ostream& operator<<(std::ostream& s, Stopwatch::Seconds const& a) {
    const std::streamsize saved_precision = s.precision();
    s.precision(9);
    s << std::fixed << a.count();
    s << std::resetiosflags(std::ios_base::floatfield);
    s << 's';
    s.precision(saved_precision);
    return s;
}
void benchmark() {
Stopwatch::Seconds baseline_time;
{
Stopwatch s;
double result = baseline();
baseline_time = s.elapsed();
std::cout << "baseline: " << result << ", " << baseline_time << '\n';
}
{
Stopwatch s;
double result = simd();
auto time = s.elapsed();
std::cout << " simd: " << result << ", " << time << ", " << (baseline_time / time) << "x speedup\n";
}
{
Stopwatch s;
double result = simd_mt();
auto time = s.elapsed();
std::cout << " simd_mt: " << result << ", " << time << ", " << (baseline_time / time) << "x speedup\n";
}
}
// Runs the whole benchmark three times in a row.
int main() {
    for(int run = 0; run != 3; ++run) {
        benchmark();
    }
}
Timings:
baseline: 2.76582e+257, 6.399848397s
simd: 2.76582e+257, 1.600373449s, 3.99897x speedup
simd_mt: 2.76582e+257, 0.168638435s, 37.9501x speedup
Notes:
My machine supports AVX but not AVX-512, so it is roughly 4x speedup when using SIMD.
The simd_mt version uses 8 threads on my machine and larger SIMD steps. The theoretical speedup is 128x; in practice it is 38x.
clang++-6 cannot auto-vectorize the baseline code, neither can g++-8.
g++-8 generates considerably faster code for SIMD versions than clang++-6 .
Your heart is certainly in the right place minus a bug or two.
par_for is a complex issue depending on the payload of your loop. There is
no one-size-fits-all solution to this. The payload can be anything from
a couple of adds to almost infinite mutex blocks - for example by doing memory
allocation.
The atomic variable as a work item pattern has always worked well for me but
remember that atomic variables have a high cost on X86 (~400 cycles) and even
incur a high cost if they are in an unexecuted branch as I found to my peril.
Some permutation of the following is usually good. Choosing the right chunks_per_thread (as in your batchSize) is critical. If you don't trust your
users, you can test execute a few iterations of the loop to guess the
best chunking level.
#include <atomic>
#include <future>
#include <thread>
#include <vector>
#include <stdio.h>
/// Calls @p func(i) for i = start, start+step, ... while i < end, spreading
/// the work over asynchronous tasks that claim chunks from a shared atomic
/// cursor. Returns once every task has finished.
///
/// @param start             first index
/// @param end               one-past-the-last index; nothing runs when start >= end
/// @param step              index increment; nothing runs when step <= 0
/// @param chunks_per_thread number of steps claimed per cursor update;
///                          values below 1 are treated as 1
/// @param func              callable taking an int; invoked concurrently
///
/// NOTE(review): the cursor overshoots `end` by up to one chunk per task, so
/// `end` plus that overshoot is assumed to fit in `int`.
template<typename Func>
void par_for(int start, int end, int step, int chunks_per_thread, Func func) {
    // A non-positive stride would never reach `end`.
    if (step <= 0 || start >= end) {
        return;
    }
    const int chunk = step * (chunks_per_thread > 0 ? chunks_per_thread : 1);

    // hardware_concurrency() may report 0; at least one task is needed to do any work.
    unsigned workers = std::thread::hardware_concurrency();
    if (workers == 0) {
        workers = 1;
    }

    std::atomic<int> work_item{start};
    std::vector<std::future<void>> futures(workers);
    for (auto &fut : futures) {
        fut = std::async(std::launch::async, [&work_item, end, step, chunk, &func]() {
            for (;;) {
                int wi = work_item.fetch_add(chunk);
                if (wi >= end) break;
                // Compute the chunk end in a wider type so it cannot overflow.
                const long long chunk_end = static_cast<long long>(wi) + chunk;
                const int wi_max = (chunk_end < end) ? static_cast<int>(chunk_end) : end;
                while (wi < wi_max) {
                    func(wi);
                    wi += step;
                }
            }
        });
    }
    for (auto &fut : futures) {
        fut.wait();
    }
}
// Runs the same progress-printing loop twice, first serially (k == 0) and
// then through par_for (k == 1), printing the elapsed milliseconds of each.
int main() {
    using namespace std;
    using namespace chrono;

    // Prints the index whenever it is a multiple of ten million.
    const auto report_progress = [](int i) {
        if (i % 10000000 == 0) printf("%d\n", i);
    };

    int k = 0;
    while (k != 2) {
        const auto began = high_resolution_clock::now();
        constexpr int loops = 100000000;
        if (k != 0) {
            par_for(0, loops, 1, 100000, report_progress);
        } else {
            for (int i = 0; i != loops; ++i) {
                report_progress(i);
            }
        }
        const auto ended = high_resolution_clock::now();
        const duration<double, milli> elapsed_ms = ended - began;
        printf("k=%d %fms total\n", k, elapsed_ms.count());
        ++k;
    }
}
results
...
k=0 174.925903ms total
...
k=1 27.924738ms total
About a 6x speedup.
I avoid the term "embarrassingly parallel" as it is almost never the case. You pay exponentially higher costs the more resources you use on your journey from level 1 cache (ns latency) to globe spanning cluster (ms latency). But I hope this code snippet is useful as an answer.

omp parallel for no optimization achieved for quadratic sieve

I am trying to implement parallel quadratic sieve using open mp. In sieving phase, I am using log approximations to check the divisibility. This is my code.
#pragma omp parallel for schedule (dynamic) num_threads(4)
for (int i = 0; i < factorBase.size(); ++i) {
const uint32_t p = factorBase[i];
const float logp = std::log(factorBase[i]) / std::log(2);
// Sieve first sequence.
while (startIndex.first[i] < intervalEnd) {
logApprox[startIndex.first[i] - intervalStart] -= logp;
startIndex.first[i] += p;
}
if (p == 2)
continue; // a^2 = N (mod 2) only has one root.
// Sieve second sequence.
while (startIndex.second[i] < intervalEnd) {
logApprox[startIndex.second[i] - intervalStart] -= logp;
startIndex.second[i] += p;
}
}
Here factorbase and logApprox are std::vectors initialized as follows
// One running log estimate per position of the sieve interval, all starting at 0.
std::vector<float> logApprox(INTERVAL_LENGTH, 0);
// Factor-base primes the sieve loop iterates over (filled in elsewhere).
std::vector<uint32_t> factorBase;
Whenever I run this code and compare the running time, there is not much difference between the sequential and the parallel run. What are some optimizations that can be done? I am a beginner in OpenMP and any help is appreciated. Thanks
Very interesting task you have! Thanks!
Decided to make my own implementation with very many optimizations.
I achieved 20.4x times boost compared to your original code (your code gives 17.86 seconds, my gives 0.87 seconds). Also I used 2x times less memory for sieving compared to your algorithm, while achieving same goal.
To make comparison I simplified your code in such a way that it still does almost same thing and runs exactly same time, but looks much more simple:
// Simplified single-sequence sieve used for the timing comparison: every
// prime adds log2(p) at each position it hits.
// NOTE(review): the += on logApprox is not synchronized, so two threads
// working on different primes may update the same entry concurrently.
#pragma omp parallel for
for (size_t i = 0; i < factorBase.size(); ++i) {
auto const p = factorBase[i];
float const logp = std::log(p) / std::log(2);
// Walk this prime's positions in steps of p until the end of the array.
while (startIndex[i] < logApprox.size()) {
logApprox[startIndex[i]] += logp;
startIndex[i] += p;
}
}
You can see that I left only a single sieve loop; the second one does the same thing and is not necessary for the demonstration, so I removed it. I also removed intervalStart, as it is irrelevant to the speed demonstration. And for simplicity I did += of the logarithm instead of your -=.
One important notice regarding your algorithm is that it doesn't do any synchronization, it means that different cores of CPU may write to same entry of logApprox array hence give wrong result.
And as I have measured this wrong result happens once or twice per hundred million entries of logApprox array. My optimized code overcame this limitation and did correct synchronization besides doing all speed optimizations.
I did following improvements to gain 20x times speedup:
I split whole array into blocks, approximately 2^13 elements in size. Each group of blocks is processed by separate thread/CPU-core hence no synchronization of threads is needed. Besides avoiding synchronization what is very important is that 2^13 block fits fully into L1 or L2 cache of CPU, hence speeds up things a lot.
Each block of 2^13 is processed for all possible primes. To keep track of which offsets of what primes are needed I created a special ring buffer of 2^7 size, this ring buffer is indexed with block number modulo 2^7 and keeps track which primes with which offsets are needed for each block (modulo 2^7).
I have as many threads as there are CPU cores. For each thread I precompute starting offsets of all primes for this thread, these starting offsets are computed through modular arithmetics based on startIndex array that you provided in your original code.
To speedup even more instead of float logarithm I use integer logarithm, which is based on uint16_t. This integer logarithm is computed as uint16_t integer_log = uint16_t(std::log2(p) * (1 << 8) + 0.5);. Besides increasing speed of computing += for integer logarithms, they also decrease occupied memory 2x times. If for some reason uint16_t logarithm is not enough for you then please replace using ILog2T = u16; with using ILog2T = u32; in my code, but this will double amount of used memory.
My code output following to console:
time_simple 17.859 sec, time_optimized 0.874 sec, boost 20.434, correct_ratio 0.999999993
Time simple is time of your original code for sieving array of size 2^28, time optimized is my code for same array, boost is how much my code is faster (you can see it is 20x times faster). Correct ratio says if there are any errors in your code, due to absence of multi-core synchronization (as you can see sometimes it is less than 1.0 hence there are some errors).
Full optimized code below:
Try it online!
#include <cstdint>
#include <random>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <thread>
#include <type_traits>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <mutex>
#include <omp.h>
// Throws std::runtime_error naming the failed condition, the line and msg when cond is false.
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: '" + std::string(msg) + "'."); }
// Same check with an empty message.
#define ASSERT(cond) ASSERT_MSG(cond, "")
// Evaluates a stream expression (written starting with <<) into a std::string.
#define OSTR(code) ([&]{ std::ostringstream ss; ss code; return ss.str(); }())
// Writes a stream expression to std::cout while holding cout_mux, then flushes.
#define COUT(code) { std::unique_lock<std::mutex> lock(cout_mux); std::cout code; std::cout << std::flush; }
// Debug helper: prints the current line number.
#define LN { COUT(<< "LN " << __LINE__ << std::endl); }
// Debug helper: prints a variable's name and value.
#define DUMP(var) { COUT(<< #var << " = (" << (var) << ")" << std::endl); }
// Short names for the fixed-width unsigned integer types.
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;
// Storage type of the fixed-point logarithms accumulated by the optimized sieve.
using ILog2T = u16;
// Integer type used for primes and their start offsets.
using PrimeT = u32;
// Serializes console output produced through COUT.
std::mutex cout_mux;
// Returns every prime strictly below `end`, in increasing order.
//
// Primes are found by trial division against the primes already known and
// are cached per thread, so later calls only extend the list when needed.
// The cache itself is never shrunk: the answer is copied out of it, which
// keeps repeated calls (including calls with a smaller `end`) correct.
//
// NOTE(review): `end` is assumed to be representable in T; a larger value
// would make the candidate counter wrap.
template <typename T>
std::vector<T> GenPrimes(size_t end) {
    thread_local std::vector<T> primes = {2, 3};
    // Grow the cache until its largest prime is at or beyond `end`.
    while (primes.back() < end) {
        for (T p = primes.back() + 2;; p += 2) {
            bool is_prime = true;
            for (auto d: primes) {
                // Divisors above sqrt(p) cannot divide p; widen to avoid overflow.
                if (std::uint64_t(d) * d > p)
                    break;
                if (p % d == 0) {
                    is_prime = false;
                    break;
                }
            }
            if (is_prime) {
                primes.push_back(p);
                break;
            }
        }
    }
    // Copy out only the primes below the bound; the cache stays intact so the
    // odd-candidate stepping above always starts from an odd prime.
    std::vector<T> below_end;
    for (auto const p: primes) {
        if (p >= end)
            break;
        below_end.push_back(p);
    }
    return below_end;
}
// Reference sieve: for every prime in factorBase, adds log2(p) to logApprox
// at the prime's start offset and at every p-th position after it.
// startIndex is taken by value, so the caller's offsets are left untouched.
// The additions into logApprox are not synchronized between iterations.
void SieveA(std::vector<float> & logApprox, std::vector<PrimeT> const & factorBase, std::vector<PrimeT> startIndex) {
    #pragma omp parallel for
    for (size_t i = 0; i < factorBase.size(); ++i) {
        auto const p = factorBase[i];
        float const logp = std::log(p) / std::log(2);
        // Stride through the array from this prime's first hit.
        for (auto pos = startIndex[i]; pos < logApprox.size(); pos += p) {
            logApprox[pos] += logp;
        }
    }
}
// Number of worker threads to use: the hardware concurrency, or 1 when the
// runtime cannot determine it (hardware_concurrency() reports 0 in that
// case, which would leave the sieve with no workers at all).
size_t NThreads() {
    size_t const detected = std::thread::hardware_concurrency();
    return detected != 0 ? detected : 1;
}
// Converts a logarithm to the fixed-point storage type: scales by 2^(bits - 8)
// and rounds by adding one half before truncating.
ILog2T LogToI(double x) {
    auto const scale = 1ULL << (sizeof(ILog2T) * 8 - 8);
    return ILog2T(x * scale + 0.5);
}
// Converts a fixed-point logarithm back to floating point by dividing by the
// same 2^(bits - 8) scale used when encoding.
double IToLog(ILog2T x) {
    double const scale = double(1ULL << (sizeof(ILog2T) * 8 - 8));
    return x / scale;
}
// Seconds elapsed since the first call to this function, as a double.
double Time() {
    using Clock = std::chrono::high_resolution_clock;
    // Reference point, captured once on the first call.
    static auto const origin = Clock::now();
    std::chrono::duration<double> const since_origin =
        std::chrono::duration_cast<std::chrono::duration<double>>(Clock::now() - origin);
    return since_origin.count();
}
// Formats x in fixed-point notation with `round` digits after the decimal point.
std::string FloatToStr(double x, size_t round = 6) {
    std::string const text = OSTR(<< std::fixed << std::setprecision(round) << x);
    return text;
}
// Blocked sieve: adds the fixed-point log2 of every prime to `logs` at the
// prime's start offset and every p-th position after it. The array is split
// into one contiguous range per thread and each range is processed block by
// block. Returns the seconds spent in the sieving loop only (the setup loop
// is not timed).
//   logs    - accumulator array, updated in place
//   primes  - primes to sieve with
//   starts0 - first hit position of each prime, counted from index 0
double SieveB(std::vector<ILog2T> & logs, std::vector<PrimeT> const & primes, std::vector<PrimeT> const & starts0) {
auto const nthr = NThreads();
// Per-thread tables: first-hit offset of each prime relative to the thread's
// range, fixed-point log of each prime, and the [begin, end) range itself.
std::vector<std::vector<PrimeT>> starts(nthr, std::vector<PrimeT>(primes.size()));
std::vector<std::vector<ILog2T>> plogs(nthr, std::vector<ILog2T>(primes.size()));
std::vector<std::pair<u64, u64>> ranges(nthr);
size_t constexpr block_log2 = 13, block = 1 << block_log2, ring_log2 = 6, ring_size = 1ULL << ring_log2, ring_mask = ring_size - 1;
// Per-thread ring of ring_size slots; a slot holds (prime index, absolute
// offset) pairs and is selected by block number modulo ring_size.
std::vector<std::vector<std::vector<std::pair<u32, u32>>>> ring(nthr, std::vector<std::vector<std::pair<u32, u32>>>(ring_size));
// Setup: compute each thread's range and seed its ring with every prime's first hit.
#pragma omp parallel for
for (size_t ithr = 0; ithr < nthr; ++ithr) {
// Range length per thread, rounded up to a whole number of blocks.
size_t const nblock = ((logs.size() + nthr - 1) / nthr + block - 1) / block * block,
begin = ithr * nblock, end = std::min<size_t>(logs.size(), (ithr + 1) * nblock);
ranges[ithr] = {begin, end};
for (size_t i = 0; i < primes.size(); ++i) {
PrimeT const p = primes[i];
// Distance from `begin` to the first position congruent to starts0[i] modulo p.
size_t const mod0 = begin % p, mod = starts0[i] < mod0 ? p + starts0[i] - mod0 : starts0[i] - mod0;
starts[ithr][i] = mod;
plogs[ithr][i] = LogToI(std::log2(p));
// Queue the prime in the slot of the block that contains its first hit.
ring[ithr][((begin + starts[ithr][i]) >> block_log2) & ring_mask].push_back({i, begin + starts[ithr][i]});
}
}
auto tim = Time();
// Sieving: every thread walks its own range one block at a time.
#pragma omp parallel for
for (size_t ithr = 0; ithr < nthr; ++ithr) {
auto const [begin, end] = ranges[ithr];
auto const [bbegin, bend] = std::make_tuple(begin / block, (end - 1) / block + 1);
auto const & cstarts = starts.at(ithr);
auto const & cplogs = plogs.at(ithr);
auto & cring = ring[ithr];
// Scratch copy of the current slot so the slot can be refilled while iterating.
std::decay_t<decltype(cring[0])> tmp;
size_t hit_cnt = 0, miss_cnt = 0;
for (size_t iblock = bbegin; iblock < bend; ++iblock) {
size_t const cbegin = iblock << block_log2, cend = std::min<size_t>(end, (iblock + 1) << block_log2);
auto & ring_cur = cring[iblock & ring_mask];
tmp = ring_cur;
ring_cur.clear();
for (auto [ip, off]: tmp)
// Entry belongs to a later block that shares this slot: keep it queued here.
if (off >= cend) {
//++miss_cnt;
ring_cur.push_back({ip, off});
} else {
//++hit_cnt;
auto const p = primes[ip];
auto const plog = cplogs[ip];
// Apply every hit of this prime that falls inside the current block.
for (; off < cend; off += p) {
//if (8192 - 10 <= off && off <= 8192 + 10) COUT(<< "logs.size() " << logs.size() << " begin " << begin << " end " << end << " bbegin " << bbegin << " bend " << bend << " cbegin " << cbegin << " cend " << cend << " iblock " << iblock << " off " << off << " p " << p << " plog " << plog << std::endl);
logs[off] += plog;
}
// Re-queue the prime for the block that contains its next hit, if still in range.
if (off < end)
cring[(off >> block_log2) & ring_mask].push_back({ip, off});
}
}
//COUT(<< "hit_ratio " << std::fixed << std::setprecision(6) << double(hit_cnt) / (hit_cnt + miss_cnt) << std::endl);
}
return Time() - tim;
}
// Sieves an array of 2^28 entries with both implementations, using the
// primes below 2^12 and pseudo-random start offsets, then prints both run
// times, their ratio, and the fraction of entries on which the two results
// agree to within 0.1.
void Test() {
    size_t constexpr len = 1ULL << 28;
    std::mt19937_64 rng{123};
    auto const primes = GenPrimes<PrimeT>(1 << 12);

    // One start offset per prime, drawn in prime order.
    std::vector<PrimeT> starts;
    for (size_t k = 0; k < primes.size(); ++k) {
        starts.push_back(rng() % primes[k]);
    }
    ASSERT(primes.size() == starts.size());

    std::vector<float> logsA(len);
    std::vector<ILog2T> logsB(len);

    // The reference sieve is timed around the call; the blocked sieve times itself.
    double const startA = Time();
    SieveA(logsA, primes, starts);
    double const tA = Time() - startA;
    double const tB = SieveB(logsB, primes, starts);

    size_t correct = 0;
    for (size_t i = 0; i < len; ++i) {
        bool const agrees = std::abs(logsA[i] - IToLog(logsB[i])) < 0.1;
        correct += agrees ? 1 : 0;
    }

    std::cout << std::fixed << std::setprecision(3) << "time_simple " << tA << " sec, time_optimized " << tB << " sec, boost " << (tA / tB) << ", correct_ratio " << std::setprecision(9) << double(correct) / len << std::endl;
}
// Configures the OpenMP thread count, runs the comparison, and reports any
// std::exception on standard output with exit code -1.
int main() {
    int exit_code = 0;
    try {
        omp_set_num_threads(NThreads());
        Test();
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        exit_code = -1;
    }
    return exit_code;
}
Output:
time_simple 17.859 sec, time_optimized 0.874 sec, boost 20.434, correct_ratio 0.999999993
In my opinion, you should turn the schedule to static and give it chunk-size (https://software.intel.com/en-us/articles/openmp-loop-scheduling).
A small optimization should be :
Outside of the big FOR loop, declare a const and initialize it to 1/std::log(2); then, inside the FOR loop, multiply by that const instead of dividing by std::log(2), since division is very expensive in CPU cycles.

The fastest way to use a binary expression on array of booleans

I need to do something like this in the fastest way possible (O(1) would be perfect):
// Clears required[j] for every index that has not been visited.
for (int j = 0; j < V; ++j)
{
if(!visited[j]) required[j]=0;
}
I came up with this solution:
// Branch-free form: required[j] stays set only where visited[j] is also set.
for (int j = 0; j < V; ++j)
{
required[j]=visited[j]&required[j];
}
Which made the program run 3 times faster but I believe there is an even better way to do this. Am I right?
Btw. required and visited are dynamically allocated arrays
// Two heap arrays of V bools each. new bool[V] does not zero the storage,
// so the elements are indeterminate until they are written.
bool *required;
bool *visited;
required = new bool[V];
visited = new bool[V];
In the case where you're using a list of simple objects, you are most likely best served by the functionality provided by the C++ Standard Library. Structures like valarray and vector are recognized and optimized very effectively by all modern compilers.
Much debate exists as to how much you can rely on your compiler, but one guarantee is, your compiler was built alongside the standard library and relying on it for basic functionality (such as your problem) is generally a safe bet.
Never be afraid to run your own time tests and race your compiler! It's a fun exercise and one that is ever increasingly difficult to achieve.
Construct a valarray (highly optimized in c++11 and later):
// Each valarray copies V elements out of the raw array it is built from.
std::valarray<bool> valRequired(required, V);
std::valarray<bool> valVisited(visited, V);
// Element-wise AND; the result lives in valRequired, the original
// required array is not modified by this statement.
valRequired &= valVisited;
Alternatively, you could do it with one line using transform:
// std::transform takes iterators (here raw pointers), not element values:
// the input range is [required, required + V), paired element by element
// with visited, and the AND of each pair is written back into required.
std::transform(required, required + V, visited, required, [](bool r, bool v){ return r & v; });
Edit: while fewer lines is not faster, your compiler will likely vectorize this operation.
I also tested their timing:
// Micro-benchmark comparing four equivalent ways of AND-ing `visited` into
// `required`. Each variant runs once on fresh 5-element arrays and prints the
// elapsed duration as a raw tick count of std::chrono::high_resolution_clock.
//
// NOTE(review): the arrays are tiny and every variant runs only once, so the
// printed numbers are probably dominated by clock overhead and noise rather
// than by the operation being measured.
int main(int argc, const char * argv[]) {
    constexpr int kCount = 5;  // number of elements in every test array
    auto clock = std::chrono::high_resolution_clock{};
    {
        // Variant 1: compound assignment in an index loop.
        bool visited[kCount] = {1,0,1,0,0};
        bool required[kCount] = {1,1,1,0,1};
        auto start = clock.now();
        for (int i = 0; i < kCount; ++i) {
            required[i] &= visited[i];
        }
        auto end = clock.now();
        std::cout << "1: " << (end - start).count() << std::endl;
    }
    {
        // Variant 2: explicit AND expression in an index loop.
        bool visited[kCount] = {1,0,1,0,0};
        bool required[kCount] = {1,1,1,0,1};
        auto start = clock.now();
        for (int i = 0; i < kCount; ++i) {
            required[i] = visited[i] & required[i];
        }
        auto end = clock.now();
        std::cout << "2: " << (end - start).count() << std::endl;
    }
    {
        // Variant 3: std::transform over the whole array. The end iterator is
        // one past the last element (required + kCount); stopping at
        // required + 4 would silently skip the final element.
        bool visited[kCount] = {1,0,1,0,0};
        bool required[kCount] = {1,1,1,0,1};
        auto start = clock.now();
        std::transform(required, required + kCount, visited, required,
                       [](bool r, bool v){ return r & v; });
        auto end = clock.now();
        std::cout << "3: " << (end - start).count() << std::endl;
    }
    {
        // Variant 4: std::valarray element-wise &=. The valarrays are built
        // (copied from the raw arrays) before the timer starts, so only the
        // AND itself is timed.
        bool visited[kCount] = {1,0,1,0,0};
        bool required[kCount] = {1,1,1,0,1};
        std::valarray<bool> valVisited(visited, kCount);
        std::valarray<bool> valrequired(required, kCount);
        auto start = clock.now();
        valrequired &= valVisited;
        auto end = clock.now();
        std::cout << "4: " << (end - start).count() << std::endl;
    }
}
Output:
1: 102
2: 55
3: 47
4: 45
Program ended with exit code: 0
Along the lines of @AlanStokes's comment: pack the booleans into bits and combine them with the AVX-512 intrinsic _mm512_and_epi64, 512 bits at a time. Be prepared for this to mess up your hair.

OpenMP generate segfault in Rcpp code for the SEIR model

I wrote a (probably-inefficient, but anyway..) Rcpp code using inline to simulate a stochastic SEIR model.
The serial version compiles and works perfectly, but since I need to simulate from it a large number of times and since it seems to me like an embarrassingly parallel problem (just need to simulate again for other parameter values and return a matrix with the results) I tried to add #pragma omp parallel for and to compile with -fopenmp -lgomp but ... boom!
I get a segfault even for very small examples!
I tried to add setenv("OMP_STACKSIZE","24M",1); and values well over 24M but still the segfault happens.
I'll explain briefly the code since it's a bit long (I tried to shorten it but the result change and I can't reproduce it..):
I have two nested loops, the inner one execute the model for a given parameter set and the outer one changes the parameters.
The only way a race condition could arise is if the code tried to execute the instructions inside the inner loop in parallel (which cannot be done because of the model structure: iteration t depends on iteration t-1) rather than parallelizing the outer loop; but, if I'm not mistaken, parallelizing the outer loop is what the parallel for construct does by default when placed just outside it...
This is basically the form of the code I'm trying to run:
// Sketch of the parallel structure: one row of result per parameter set.
// Only the outer for loop is work-shared by the pragma; the inner while loop
// runs sequentially inside whichever thread owns iteration i.
mat result(n_param_set,T_MAX);
#pragma omp parallel for
for(int i=0;i<n_param_set;i++){
// t is declared inside the loop body so that every thread has its own copy;
// a t declared outside the parallel region would be shared between threads.
int t=0;
rowvec jnk(T_MAX);
while(t < T_MAX){
...
jnk(t) = something(jnk(t-1));
...
t++;
}
// Each iteration writes a distinct row, so the rows do not conflict.
result.row(i)=jnk;
}
return wrap(result);
And my question is: how do I tell the compiler that I just want to run the outer loop in parallel (even distributing the iterations statically, n_loops/n_threads per thread) and not the inner one (which is actually non-parallelizable)?
The real code is a bit more involved and I'll present it here for the sake of reproducibility if you're really willing, but I'm only asking about the behavior of OpenMP. Please notice that the only OpenMP instruction appears at line 122.
library(Rcpp);library(RcppArmadillo);library(inline)
misc='
#include <math.h>
#define _USE_MATH_DEFINES
#include <omp.h>
using namespace arma;
// Sign of val as an int: 1 when val is above zero, -1 when it is below zero,
// and 0 otherwise.
template <typename T> int sgn(T val) {
    const T zero = T(0);
    const int above = (zero < val) ? 1 : 0;
    const int below = (val < zero) ? 1 : 0;
    return above - below;
}
// Draws one multinomial sample of n items over the categories weighted by
// prob, using a chain of conditional binomial draws (rbinom). prob does not
// need to be normalised: each weight is divided by the weight mass that has
// not been consumed yet. Returns one count per category; the last category
// receives whatever is left of n.
uvec rmultinomial(int n,vec prob)
{
    const int n_cat = prob.n_elem;
    uvec counts = zeros<uvec>(n_cat);
    double mass_left = sum(prob);
    for (int cat = 0; cat < n_cat - 1; cat++) {
        if (prob(cat) > 0) {
            const double cond_p = prob[cat] / mass_left;
            // A conditional probability of 1 or more takes every remaining
            // item without consuming a random draw.
            const double taken = (cond_p < 1.) ? (rbinom(1, (double) n, cond_p))(0) : n;
            counts(cat) = taken;
            n -= counts[cat];
        } else {
            counts[cat] = 0;
        }
        // Stop as soon as every item has been assigned.
        if (n <= 0) {
            return counts;
        }
        // Weight mass still available to the categories after this one.
        mass_left -= prob[cat];
    }
    counts[n_cat - 1] = n;
    return counts;
}
'
model_and_summary='
// Runs one stochastic SEIR simulation per parameter set (one set per element
// of alpha) and returns an n_part x 6 matrix holding one row of summary
// statistics per set. All parameters, initial conditions and observation
// times are hard-coded below.
mat SEIR_sim_plus_summaries()
{
// Per-set model parameters; element k of each vector belongs to set k.
vec alpha;
alpha << 0.002 << 0.0045;
vec beta;
beta << 0.01 << 0.01;
vec gamma;
gamma << 1.0/14.0 << 1.0/14.0;
vec sigma;
sigma << 1.0/(3.5) << 1.0/(3.5);
vec phi;
phi << 0.8 << 0.8;
// Initial compartment sizes and the fixed total population.
int S_0 = 800;
int E_0 = 100;
int I_0 = 100;
int R_0 = 0;
int pop = 1000;
// tau is the simulation time step, t_0 the start time.
double tau = 0.01;
double t_0 = 0;
// Times at which the accumulated incidence is recorded.
vec obs_time;
obs_time << 1 << 2 << 3 << 4 << 5 << 6 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 15 << 16 << 17 << 18 << 19 << 20 << 21 << 22 << 23 << 24;
const int n_obs = obs_time.n_elem;
const int n_part = alpha.n_elem;
mat stat(n_part,6);
// NOTE(review): the loop body calls rbinom directly and through rmultinomial.
// Confirm that the R random number generator may be used from several threads
// before enabling this pragma.
//#pragma omp parallel for
for(int k=0;k<n_part;k++) {
// Everything declared inside this loop body is local to one parameter set.
ivec INC_i(n_obs);
ivec INC_o(n_obs);
// Event variables
double alpha_t;
int nX; //current number of people moving
vec rates(8);
uvec trans(4); // current transitions, e.g. from S to E,I,R,Universe
vec r(4); // rates e.g. from S to E, I, R, Univ.
/*********************** Initialize **********************/
int S_curr = S_0;
int S_prev = S_0;
int E_curr = E_0;
int E_prev = E_0;
int I_curr = I_0;
int I_prev = I_0;
int R_curr = R_0;
int R_prev = R_0;
int IncI_curr = 0;
int IncI_prev = 0;
int IncO_curr = 0;
int IncO_prev = 0;
double t_curr = t_0;
int t_idx =0;
// Advance in steps of tau until every observation time has been passed.
while( t_idx < n_obs ) {
// next time preparation
t_curr += tau;
// Snapshot the state at the start of the step: the binomial draws below use
// the *_prev sizes while the *_curr values are updated in place.
S_prev = S_curr;
E_prev = E_curr;
I_prev = I_curr;
R_prev = R_curr;
IncI_prev = IncI_curr;
IncO_prev = IncO_curr;
/*********************** description (rates) of the events **********************/
alpha_t = alpha(k)*(1+phi(k)*sin(2*M_PI*(t_curr+0)/52)); //real contact rate, time expressed in weeks
rates(0) = (alpha_t * ((double)I_curr / (double)pop ) * ((double)S_curr)); //e+1, s-1, r,i one s get infected (goes in E, not yey infectous)
rates(1) = (sigma(k) * E_curr); //e-1, i+1, r,s one exposed become infectous (goes in I) INCIDENCE!!
rates(2) = (gamma(k) * I_curr); //i-1, s,e, r+1 one i recover
rates(3) = (beta(k) * I_curr); //i-1, s, r,e one i dies
rates(4) = (beta(k) * R_curr); //i,e, s, r-1 one r dies
rates(5) = (beta(k) * E_curr); //e-1, s, r,i one e dies
rates(6) = (beta(k) * S_curr); //s-1 e, i ,r one s dies
rates(7) = (beta(k) * pop); //s+1 one susc is born
// Let the events occour
// Pattern shared by the four compartments: draw the number nX leaving the
// compartment with rbinom, then split nX over the four destinations with
// rmultinomial using the relative rates stored in r.
/*********************** S compartement **********************/
if((rates(0)+rates(6))>0){
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
r(0) = rates(0)/(rates(0)+rates(6)); r(1) = 0.0; r(2) = 0; r(3) = rates(6)/(rates(0)+rates(6));
trans = rmultinomial(nX, r);
S_curr -= nX;
E_curr += trans(0);
I_curr += trans(1);
R_curr += trans(2);
//trans(3) contains dead individual, who disappear...we could avoid this using sequential conditional binomial
}
/*********************** E compartement **********************/
if((rates(1)+rates(5))>0){
nX = rbinom(1,E_prev,1-exp(-(rates(1)+rates(5))*tau))(0);
r(0) = 0.0; r(1) = rates(1)/(rates(1)+rates(5)); r(2) = 0.0; r(3) = rates(5)/(rates(1)+rates(5));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr -= nX;
I_curr += trans(1);
R_curr += trans(2);
// Moves from E to I are accumulated as incidence (IncI).
IncI_curr += trans(1);
}
/*********************** I compartement **********************/
if((rates(2)+rates(3))>0){
nX = rbinom(1,I_prev,1-exp(-(rates(2)+rates(3))*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = rates(2)/(rates(2)+rates(3)); r(3) = rates(3)/(rates(2)+rates(3));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr -= nX;
R_curr += trans(2);
// Moves from I to R are accumulated separately (IncO).
IncO_curr += trans(2);
}
/*********************** R compartement **********************/
if(rates(4)>0){
nX = rbinom(1,R_prev,1-exp(-rates(4)*tau))(0);
// r(3) evaluates to 1 here, so every individual leaving R goes to slot 3.
r(0) = 0.0; r(1) = 0.0; r(2) = 0.0; r(3) = rates(4)/rates(4);
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr += trans(2);
R_curr -= nX;
}
/*********************** Universe **********************/
// Top S up (or down) so that the four compartments always sum to pop.
S_curr += pop - (S_curr+E_curr+I_curr+R_curr); //it should be poisson, but since the pop is fixed...
/*********************** Save & Continue **********************/
// Check if the time is interesting for us
if(t_curr > obs_time[t_idx]){
// Store the incidence accumulated since the previous observation, then
// reset the accumulators and move on to the next observation time.
INC_i(t_idx) = IncI_curr;
INC_o(t_idx) = IncO_curr;
IncI_curr = IncI_prev = 0;
IncO_curr = IncO_prev = 0;
t_idx++;
}
//else just go on...
}
/*********************** Finished - Starting w/ stats **********************/
// INC_i is the useful variable, how can I change its reference withour copying it?
// INC_o is recorded above but not used by the statistics below.
ivec incidence = INC_i; //just so if I want to use INC_o i have to change just this...
//Scan the epidemics to recover the summary stats (naively divide the data each 52 weeks)
// NOTE(review): with the obs_time above (last value 24) n_years evaluates to 1,
// so ratio_attack is created with length 0.
double n_years = ceil((double)obs_time(n_obs-1)/52.0);
vec mu_attack(n_years);
vec ratio_attack(n_years-1);
vec peak(n_years);
vec atk(52);
peak(0)=0.0;
vec tmpExplo(52); //explosiveness
vec explo(n_years);
int year=0;
int week;
// Walk through the observations, closing a year every 52 entries: atk holds
// the incidence of the current year and peak(year) its running maximum.
for(week=0 ; week<n_obs ; week++){
if(week - 52*year > 51){
// A full year is complete: attack rate, ratio to the previous year, and
// explo = number of weeks whose incidence exceeds half of the yearly peak.
mu_attack(year) = sum( atk )/(double)pop;
if(year>0)
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<52;i++){
if(atk(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
explo(year) = sum(tmpExplo);
year++;
peak(year)=0.0;
}
atk(week-52*year) = incidence(week);
if( peak(year) < incidence(week) )
peak(year)=incidence(week);
}
// Handle the last year, which may be incomplete.
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
} else {
ivec idx(52);
for(int i=0;i<52;i++)
{ idx(i) = i; } //take just the updated ones...
// tmp keeps only the weeks of the partial year that were actually filled.
vec tmp = atk.elem(find(idx<(week - 52*year)));
// The attack rate is rescaled by the fraction of the year that was observed.
mu_attack(year) = sum( tmp )/((double)pop * (tmp.n_elem/52.0));
// NOTE(review): unlike the loop above, this line is not guarded by year>0.
// With the obs_time above, year is still 0 here, so year-1 is -1 for both
// ratio_attack and mu_attack; confirm that this indexing is intended.
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<tmp.n_elem;i++){
if(tmp(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
for(int i=tmp.n_elem;i<52;i++)
tmpExplo(i) = 0.0; //to reset the others
explo(year) = sum(tmpExplo);
}
double correlation2;
double correlation4;
// NOTE(review): acf is not defined in the visible code; presumably it returns
// the autocorrelation of its argument by lag - confirm where it comes from.
vec autocorr = acf(peak);
/***** ACF *****/
// Use autocorr(1) and autocorr(3) only when enough years are available,
// otherwise report 0.
if(n_years<3){
correlation2=0.0;
correlation4=0.0;
} else {
if(n_years<5){
correlation2 = autocorr(1);
correlation4 = 0.0;
} else {
correlation2 = autocorr(1);
correlation4 = autocorr(3);
}
}
// Row layout: mean attack rate, mean of squared log ratios minus the squared
// mean of log ratios, correlation2, correlation4, largest yearly peak, and
// mean explo per year.
// NOTE(review): the second entry divides by n_years-1, which is 0 when
// n_years is 1.
rowvec jnk(6);
jnk << sum(mu_attack)/(year+1.0)
<< (sum( log(ratio_attack)%log(ratio_attack) )/(n_years-1)) - (pow(sum( log(ratio_attack) )/(n_years-1),2))
<< correlation2 << correlation4 << max(peak) << sum(explo)/n_years;
stat.row(k) = jnk;
}
return stat;
}
'
# C++ body of the generated function: prints the OpenMP thread limit, declares
# an RNGScope, runs the simulation and returns the summary matrix wrapped for R.
main='
std::cout << "max_num_threads " << omp_get_max_threads() << std::endl;
RNGScope scope;
mat summaries = SEIR_sim_plus_summaries();
return wrap(summaries);
'
# Start from the stock RcppArmadillo plugin settings.
plug = getPlugin("RcppArmadillo")
## modify the plugin for Rcpp to support OpenMP
plug$env$PKG_CXXFLAGS <- paste('-fopenmp', plug$env$PKG_CXXFLAGS)
plug$env$PKG_LIBS <- paste('-fopenmp -lgomp', plug$env$PKG_LIBS)
# Compile with inline: misc and model_and_summary are pasted together as the
# include section, main is the function body, and the function takes no arguments.
SEIR_sim_summary = cxxfunction(sig=signature(),main,settings=plug,inc = paste(misc,model_and_summary),verbose=TRUE)
# Run the compiled simulation once.
SEIR_sim_summary()
Thanks for the help!
NB: before you ask, I slightly modified the Rcpp multinomial sampling function just because I liked that way more than the one using pointer...not any other particular reason! :)
The core pseudo-random number generators (PRNGs) in R are not designed to be used in multithreaded environments. That is, their state is stored in a static array (dummy from src/main/PRNG.c) and therefore is shared among all threads. Moreover several other static structures are used to store states for the higher-level interfaces to the core PRNGs.
A possible solution could be that you put each call to rnorm() or other sampling functions inside named critical sections with all having the same name, e.g.:
...
#pragma omp critical(random)
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
...
if((rates(0)+rates(6))>0){
#pragma omp critical(random)
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
...
Note that the critical construct operates on the structured block following it and therefore locks the entire statement. If a random number is being drawn inline inside a call to a time-consuming function, e.g.
#pragma omp critical(random)
x = slow_computation(rbinom(...));
this is better transformed to:
#pragma omp critical(random)
rb = rbinom(...);
x = slow_computation(rb);
That way only the rb = rbinom(...); statement will be protected.