I have a list of items called L and a sophisticated Python function called func; the normal way to apply it is a Python loop like:
out = [func(item) for item in L]
But that is single-threaded, so I want to implement a function in C++ and bind it with pybind11:
For cpp:
m.def("test_func_iter", [](const py::object &func, const py::sequence &iter) {
auto n = len(iter);
py::list l(n);
unsigned int k = std::thread::hardware_concurrency();
std::thread threads[k];
auto stride = n / k;
// [0, n//k), [n//k, ...), [...,n)
for (unsigned int w = 0; w < k; ++w) {
if (w < k - 1) {
threads[w] = std::thread([&l, &func, &iter](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
std::cout << "h: "<< i << std::endl;
l[i] = func(iter[i]);
}
}, w * stride, (w + 1) * stride);
} else {
threads[w] = std::thread([&l, &func, &iter](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
std::cout << "h: "<< i << std::endl;
l[i] = func(iter[i]);
}
}, w * stride, n);
}
}
std::cout << "Done spawning threads! Now wait for them to finish.\n";
for (auto& t: threads) {
t.join();
}
std::cout << "end" << std::endl;
return py::type::of(iter)(l);
And when I invoke the corresponding bind function in python, like:
def func(i):
    # Simplified stand-in for the actual logic: a sophisticated function that
    # is hard to rewrite entirely in C++.
    print(i, i == 0)
    return int(gmpy2.mpz(i) + 100)


b = test_func_iter(func, list(range(100)))
print(b)
And I get the output and error like:
h: h: Done spawning threads! Now wait for them to finish.
050
0 True
进程已结束,退出代码为 139 (interrupted by signal 11: SIGSEGV)
I have done some tries:
Not use thread : everything is OK in python
Use thread & k=1: just use one single thread, everything is OK in python
Use thread & k>=2: crash.
BTW, I use Mac M1 laptop, and version of clang is 12.05 ;
I am new to C++ and guess the cause is the use of threads, but I could not find any suggestions on Google. Can anybody give some hints? (Or some suggestions about the original problem: an elegant way to get multi-threading support with pybind11.) Thanks!
Related
I've been trying to accelerate the following code
/// Calls CalcStiff on every non-null element of `cmesh` inside an OpenMP
/// parallel loop and prints the elapsed wall-clock time.
///
/// @param cmesh   mesh whose elements are processed
/// @param nthread number of OpenMP threads to request
void calcstiffTestOMP(TPZCompMesh *cmesh,int nthread){
    const int64_t nelem = cmesh->NElements();
    omp_set_num_threads(nthread);
    auto beginCalcStiff = std::chrono::high_resolution_clock::now();
    // schedule(dynamic,1): iterations are handed out one at a time, so a few
    // expensive elements cannot leave one thread with all the slow work while
    // the others sit idle (which is what a static split can do when the
    // per-element cost varies).
    #pragma omp parallel for schedule(dynamic,1)
    for (int64_t iel = 0; iel < nelem; iel++)
    {
        TPZCompEl *el = cmesh->Element(iel);
        if (!el) continue;
        // Each iteration owns its element matrices, so nothing is shared here.
        TPZElementMatrix ek(cmesh, TPZElementMatrix::EK), ef(cmesh, TPZElementMatrix::EF);
        el->CalcStiff(ek, ef);
    }
    auto endCalcStiff = std::chrono::high_resolution_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(endCalcStiff - beginCalcStiff);
    unsigned long int duration = static_cast<unsigned long int>(elapsed.count());
    std::cout << " CalcStiff parallel duration omp = " << duration*1E-9 << " seconds, with nthreads= " << nthread << std::endl;
}
but the simulation time hasn't changed, independently of the number of threads.
The same code has been accelerated when using TBB directives, as follows
/// Calls CalcStiff on every non-null element of `cmesh` through
/// tbb::parallel_for and prints the elapsed wall-clock time. The timer starts
/// before the element count is read and the TBB scheduler is initialised.
///
/// @param cmesh   mesh whose elements are processed
/// @param nthread thread count handed to tbb::task_scheduler_init
void calcstiffTestTBB(TPZCompMesh *cmesh,int nthread){
    using Clock = std::chrono::high_resolution_clock;
    const auto tStart = Clock::now();

    const int64_t elementCount = cmesh->NElements();
    tbb::task_scheduler_init init(nthread);

    // Work done for one chunk of element indices.
    const auto processChunk = [&](tbb::blocked_range<int64_t> chunk){
        for (int64_t idx = chunk.begin(); idx < chunk.end(); idx++)
        {
            TPZCompEl *element = cmesh->Element(idx);
            if (element)
            {
                TPZElementMatrix ek(cmesh, TPZElementMatrix::EK), ef(cmesh, TPZElementMatrix::EF);
                element->CalcStiff(ek, ef);
            }
        }
    };
    tbb::parallel_for(tbb::blocked_range<int64_t>(0, elementCount), processChunk);

    const auto tStop = Clock::now();
    const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(tStop - tStart);
    const unsigned long int ticks = static_cast<unsigned long int>(nanos.count());
    std::cout << " CalcStiff duration tbb= " << ticks*1E-9 << " seconds, with nthreads= "<<nthread << std::endl;
}
The calcstiff function is given by:
/// Fills the element matrices `ek` and `ef` for this multiphysics element by
/// looping over the integration points of its reference geometric element and
/// letting the material add its contribution at each point.
///
/// @param ek element matrix; only reset when there is no material or the
///           material is a TPZNullMaterial
/// @param ef companion element matrix, handled the same way as `ek`
void TPZMultiphysicsCompEl<TGeometry>::CalcStiff(TPZElementMatrix &ek, TPZElementMatrix &ef)
{
TPZMaterial * material = Material();
// No material: report the error, reset both outputs and leave.
if(!material){
PZError << "Error at " << __PRETTY_FUNCTION__ << " this->Material() == NULL\n";
ek.Reset();
ef.Reset();
return;
}
// For a TPZNullMaterial the outputs are only reset and tagged with their types.
TPZNullMaterial *nullmat = dynamic_cast<TPZNullMaterial *>(material);
if(nullmat)
{
ek.Reset();
ef.Reset();
ek.fType = TPZElementMatrix::EK;
ef.fType = TPZElementMatrix::EF;
return;
}
InitializeElementMatrix(ek,ef);
if (this->NConnects() == 0) return;//boundary discontinuous elements have this characteristic
// One TPZMaterialData entry per entry of fElementVec.
TPZManVector<TPZMaterialData,6> datavec;
const int64_t nref = fElementVec.size();
datavec.resize(nref);
InitMaterialData(datavec);
// trvec is filled by AffineTransform and later passed to ComputeRequiredData.
TPZManVector<TPZTransform<> > trvec;
AffineTransform(trvec);
int dim = Dimension();
TPZAutoPointer<TPZIntPoints> intrule;
TPZManVector<REAL,4> intpointtemp(TGeometry::Dimension,0.);
REAL weight = 0.;
TPZManVector<int,4> ordervec;
//ordervec.resize(nref);
// Append one order entry for every fElementVec entry whose element is a
// TPZInterpolationSpace; other entries are skipped. The order is currently
// hard-coded to 1 (the MaxOrder() call is commented out).
for (int64_t iref=0; iref<nref; iref++)
{
TPZInterpolationSpace *msp = dynamic_cast <TPZInterpolationSpace *>(fElementVec[iref].Element());
int svec;
if(msp)
{
ordervec.Resize(ordervec.size()+1);
svec = ordervec.size();
}
else
{
continue;
}
datavec[iref].p =1; //msp->MaxOrder();
ordervec[svec-1] = datavec[iref].p;
}
// The material chooses the integration order from the collected orders; the
// rule is created for the side with index NSides()-1 of the reference element.
int order = material->IntegrationRuleOrder(ordervec);
TPZGeoEl *ref = this->Reference();
intrule = ref->CreateSideIntegrationRule(ref->NSides()-1, order);
// presumably a vector of `dim` entries all equal to `order` — TODO confirm
TPZManVector<int,4> intorder(dim,order);
intrule->SetOrder(intorder);
int intrulepoints = intrule->NPoints();
// Guard: more than 1000 integration points triggers DebugStop().
if(intrulepoints > 1000) {
DebugStop();
}
TPZFMatrix<REAL> jac, axe, jacInv;
REAL detJac;
for(int int_ind = 0; int_ind < intrulepoints; ++int_ind)
{
// Fetch the point and its weight, then scale the weight by |det(Jacobian)|.
intrule->Point(int_ind,intpointtemp,weight);
ref->Jacobian(intpointtemp, jac, axe, detJac , jacInv);
weight *= fabs(detJac);
// Record the current integration point index in every datavec entry that
// belongs to an interpolation-space element.
// NOTE(review): this dynamic_cast is repeated for every integration point
// although fElementVec is not modified inside this loop — confirm whether
// the result could be cached.
for (int i = 0; i < fElementVec.size(); i++) {
TPZInterpolationSpace *msp = dynamic_cast <TPZInterpolationSpace *>(fElementVec[i].Element());
if (!msp) {
continue;
}
datavec[i].intLocPtIndex = int_ind;
}
this->ComputeRequiredData(intpointtemp,trvec,datavec);
// The material accumulates this point's contribution into ek.fMat / ef.fMat.
material->Contribute(datavec,weight,ek.fMat,ef.fMat);
}//loop over integration points
CleanupMaterialData(datavec);
}
The code has been parallelized but not accelerated. Are we using omp directives correctly?
Does OMP create mutex sections on its own? Is there a better way to parallelize this code using OMP?
PZ is a library which serves as a template for finite element code, and can be found at https://github.com/labmec/neopz
SOLUTION: Adding the 'schedule(dynamic,1)' clause to the '#pragma omp parallel for' directive solved this problem. The loop iterations had very different workloads,
and OMP was leaving a single thread to handle the slow section of the loop. The 'schedule(dynamic,1)' clause fixed this.
I want to understand how to evaluate the complexity (big O) of the algorithm below, and how to approach this kind of big-O estimation for similar algorithms in the future.
#include <iostream>
// Amount of unit work performed by one doJob call made from recursiveCall.
std::size_t const jobSize = 3;
// Number of times doJob has been entered.
std::size_t jobCallCounter = 0;
// Total number of unit work steps performed by all doJob calls.
std::size_t jobsDoneCounter = 0;

// Performs `jobSize` unit steps, counting the call and every step.
void doJob( std::size_t jobSize )
{
    ++jobCallCounter;
    std::size_t remaining = jobSize;
    while( remaining != 0 )
    {
        ++jobsDoneCounter;
        --remaining;
    }
}

// Number of times recursiveCall has been entered.
std::size_t recursiveCallCounter = 0;
// Branching factor: recursive calls made by every non-leaf call.
std::size_t const cycleSize = 3;

// Counts the call; at depth 0 it does one job, otherwise it recurses
// `cycleSize` times with the depth reduced by one.
void recursiveCall( std::size_t recursionNumber )
{
    ++recursiveCallCounter;
    if( recursionNumber == 0 )
    {
        doJob( jobSize );
        return;
    }
    for( std::size_t branch = cycleSize; branch > 0; --branch )
    {
        recursiveCall( recursionNumber - 1 );
    }
}
// Runs the recursion with depth 4 and prints the three counters.
int main()
{
    recursiveCall( 4 );

    // Prints one "<count><label>" line and flushes, as each report line does.
    auto const report = []( std::size_t count, char const * label )
    {
        std::cout << count << label << std::endl;
    };
    report( recursiveCallCounter, " recursive calls happened" );
    report( jobCallCounter, " job calls happened" );
    report( jobsDoneCounter, " jobs done" );
}
I understand that the overall complexity is approximately O( J * C^R ), where J = jobSize, C = cycleSize, R = recursionNumber. What I struggle to comprehend is how many recursive calls happen on each step of the base cycle - the cycle from the very first call, where (in this example) recursionNumber = 4.
Also, I'm interested in how to evaluate the number of doJob calls, a.k.a. jobCallCounter.
Thank you!
You can find a recursive formula for that. If the time complexity of the problem with recursion number R and cycle size C (C is not an input to the recursive function) is denoted by T(R), we have the following recurrence:
T(R) = C * T(R-1) + 1
The base case of the recursion is T(0) = J. The 1 in the formula accounts for checking the condition in the code.
To solve the recurrence, you can expand it:
T(R) = C * (C * T(R-2) + 1) + 1 = C^2 * T(R-2) + C + 1
= ... = C^R * T(0) + C^{R-1} + ... + C + 1 =
C^R * J + C^{R-1} + ... + C + 1 = O(C^R * J)
Notice that since C and J do not change their values during the recursion, we did not write the complexity function as T(R, C, J), to keep the recurrence simple to solve. The same expansion counts the calls: level i of the recursion makes C^i calls, so recursiveCallCounter = 1 + C + ... + C^R = (C^{R+1} - 1) / (C - 1) for C > 1 (121 for C = 3, R = 4), jobCallCounter = C^R (81), and jobsDoneCounter = J * C^R (243).
I'm trying to optimize my code using multithreading, and it's not just that the program doesn't run at double speed, as it is supposed to on this dual-core computer: it is SO MUCH SLOWER. I just want to know whether I'm doing something wrong or whether it is normal that multithreading does not help in this case. I made this recreation of how I used multithreading, and on my computer the parallel version takes 4 times as long as the normal version:
#include <chrono>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <vector>
using namespace std;
// One engine per thread. A single engine shared by all threads is mutated by
// every draw, so concurrent callers race on its state and contend for it.
thread_local std::default_random_engine ran;

// Draws one number from the calling thread's engine; true unless the draw is
// a multiple of 3.
inline bool get(){
    return ran() % 3;
}

// Adds to `result` the number of draws, out of `repetitions`, for which get()
// returned true.
//
// The count is accumulated in a local and written to `result` once, so
// threads whose `result` slots are adjacent in memory do not keep
// invalidating each other's cache line.
void normal_serie(unsigned repetitions, unsigned &result){
    unsigned local_sum = 0;
    for (unsigned i = 0; i < repetitions; ++i)
        local_sum += get();
    result += local_sum;
}
// Splits `repetitions` calls' worth of normal_serie work across the hardware
// threads (2 when the count is unknown) and returns the summed result.
//
// The calling thread works too, and also takes the remainder that does not
// divide evenly, so exactly `repetitions` iterations are performed in total.
unsigned parallel_series(unsigned repetitions){
    const unsigned hardware_threads = std::thread::hardware_concurrency();
    std::cout << "Threads in this computer: " << hardware_threads << std::endl;
    const unsigned threads_number = (hardware_threads != 0) ? hardware_threads : 2;
    const unsigned its_per_thread = repetitions / threads_number;
    const unsigned leftover = repetitions % threads_number;

    std::vector<unsigned> results(threads_number, 0u);
    std::vector<std::thread> threads;
    threads.reserve(threads_number - 1);
    for (unsigned i = 0; i + 1 < threads_number; ++i) {
        // Each worker sums into its own local and writes its slot once, so
        // neighbouring slots are not hammered from different cores.
        threads.emplace_back([its_per_thread, &slot = results[i]] {
            unsigned local = 0;
            normal_serie(its_per_thread, local);
            slot = local;
        });
    }
    normal_serie(its_per_thread + leftover, results[threads_number - 1]);
    for (auto &worker : threads)
        worker.join();

    // 0u keeps the summation in unsigned arithmetic.
    return std::accumulate(results.begin(), results.end(), 0u);
}
int main()
{
constexpr unsigned repetitions = 100000000;
auto to = std::chrono::high_resolution_clock::now();
cout << parallel_series(repetitions) << endl;
auto tf = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Parallel duration: " << duration << "ms" << endl;
to = std::chrono::high_resolution_clock::now();
unsigned r = 0;
normal_serie(repetitions, r);
cout << r << endl;
tf = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Normal duration: " << duration << "ms" << endl;
return 0;
}
Things that I already know but left out to keep this code short:
I should set a max_iterations_per_thread, because you don't want to run only 10 iterations per thread; but in this case we are doing one hundred million iterations, so that is not going to happen.
The number of iterations must be divisible by the number of threads, otherwise the code will not do all of the work.
This is the output that i get in my computer:
Threads in this computer: 2
66665160
Parallel duration: 4545ms
66664432
Normal duration: 1019ms
(Partially solved by making these changes:)
// Draws one number from `ran`; true unless the draw is a multiple of 3.
inline bool get(default_random_engine &ran){
    const auto draw = ran();
    return draw % 3 != 0;
}

// Adds to `result` how many of `repetitions` draws from a freshly
// default-constructed engine were not multiples of 3. The engine and the
// running count are local, so `result` is written exactly once.
void normal_serie(unsigned repetitions, unsigned &result){
    default_random_engine generator;
    unsigned hits = 0;
    for (unsigned left = repetitions; left != 0; --left) {
        if (get(generator))
            ++hits;
    }
    result += hits;
}
All your threads are tripping over each other fighting for access to ran which can only perform one operation at a time because it only has one state and each operation advances its state. There is no point in running operations in parallel if the vast majority of each operation involves a choke point that cannot support any concurrency.
All elements of results are likely to share a cache line, which means there is lots of inter-core communication going on.
Try modifying normal_serie to accumulate into a local variable and only write it to results in the end.
I am trying to implement a parallel quadratic sieve using OpenMP. In the sieving phase, I am using log approximations to check divisibility. This is my code.
#pragma omp parallel for schedule (dynamic) num_threads(4)
for (int i = 0; i < factorBase.size(); ++i) {
const uint32_t p = factorBase[i];
const float logp = std::log(factorBase[i]) / std::log(2);
// Sieve first sequence.
while (startIndex.first[i] < intervalEnd) {
logApprox[startIndex.first[i] - intervalStart] -= logp;
startIndex.first[i] += p;
}
if (p == 2)
continue; // a^2 = N (mod 2) only has one root.
// Sieve second sequence.
while (startIndex.second[i] < intervalEnd) {
logApprox[startIndex.second[i] - intervalStart] -= logp;
startIndex.second[i] += p;
}
}
Here factorBase and logApprox are std::vectors initialized as follows:
std::vector<float> logApprox(INTERVAL_LENGTH, 0);
std::vector<uint32_t> factorBase;
Whenever I run this code and compare the running times, there is not much difference between the sequential and the parallel run. What optimizations can be done? I am a beginner in OpenMP and any help is appreciated. Thanks.
Very interesting task you have! Thanks!
Decided to make my own implementation with very many optimizations.
I achieved 20.4x times boost compared to your original code (your code gives 17.86 seconds, my gives 0.87 seconds). Also I used 2x times less memory for sieving compared to your algorithm, while achieving same goal.
To make comparison I simplified your code in such a way that it still does almost same thing and runs exactly same time, but looks much more simple:
// Simplified baseline: each prime factorBase[i] walks its own arithmetic
// progression through logApprox and adds log2(p) to every index it visits.
// NOTE(review): iterations for different i can reach the same logApprox
// entry and the += below is not synchronized — confirm that is acceptable.
#pragma omp parallel for
for (size_t i = 0; i < factorBase.size(); ++i) {
auto const p = factorBase[i];
// log(p) / log(2), i.e. the base-2 logarithm of p.
float const logp = std::log(p) / std::log(2);
while (startIndex[i] < logApprox.size()) {
logApprox[startIndex[i]] += logp;
// Advance to the next index of this prime's progression.
startIndex[i] += p;
}
}
You can see that I left only a single sieve loop; the second one does the same thing and is not necessary for the demonstration, so I removed it. I also removed intervalStart, as it is irrelevant to the speed demonstration. And for simplicity I did += of the logarithm instead of your -=.
One important note regarding your algorithm is that it doesn't do any synchronization, which means that different CPU cores may write to the same entry of the logApprox array and hence give a wrong result.
As I have measured, this wrong result happens once or twice per hundred million entries of the logApprox array. My optimized code overcomes this limitation and synchronizes correctly, besides doing all the speed optimizations.
I did following improvements to gain 20x times speedup:
I split the whole array into blocks of approximately 2^13 elements. Each group of blocks is processed by a separate thread/CPU core, hence no synchronization between threads is needed. Besides avoiding synchronization, what is very important is that a 2^13 block fits fully into the L1 or L2 cache of the CPU, which speeds things up a lot.
Each block of 2^13 elements is processed for all possible primes. To keep track of which offsets of which primes are needed, I created a special ring buffer of size 2^6; this ring buffer is indexed with the block number modulo 2^6 and keeps track of which primes with which offsets are needed for each block (modulo 2^6).
I have as many threads as there are CPU cores. For each thread I precompute the starting offsets of all primes for this thread; these starting offsets are computed through modular arithmetic based on the startIndex array that you provided in your original code.
To speed things up even more, instead of a float logarithm I use an integer logarithm based on uint16_t. This integer logarithm is computed as uint16_t integer_log = uint16_t(std::log2(p) * (1 << 8) + 0.5);. Besides increasing the speed of computing += for integer logarithms, it also halves the occupied memory. If for some reason a uint16_t logarithm is not enough for you, then please replace using ILog2T = u16; with using ILog2T = u32; in my code, but this will double the amount of memory used.
My code output following to console:
time_simple 17.859 sec, time_optimized 0.874 sec, boost 20.434, correct_ratio 0.999999993
Time simple is time of your original code for sieving array of size 2^28, time optimized is my code for same array, boost is how much my code is faster (you can see it is 20x times faster). Correct ratio says if there are any errors in your code, due to absence of multi-core synchronization (as you can see sometimes it is less than 1.0 hence there are some errors).
Full optimized code below:
Try it online!
#include <cstdint>
#include <random>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <thread>
#include <type_traits>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <mutex>
#include <omp.h>
// Throws std::runtime_error naming the failed condition, the source line and
// `msg` when `cond` evaluates to false.
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: '" + std::string(msg) + "'."); }
// ASSERT_MSG with an empty message.
#define ASSERT(cond) ASSERT_MSG(cond, "")
// Expression yielding the std::string produced by streaming `code` (written
// as "<< a << b") into a temporary std::ostringstream.
#define OSTR(code) ([&]{ std::ostringstream ss; ss code; return ss.str(); }())
// Streams `code` to std::cout and flushes, holding cout_mux for the duration.
#define COUT(code) { std::unique_lock<std::mutex> lock(cout_mux); std::cout code; std::cout << std::flush; }
// Prints "LN <current line number>".
#define LN { COUT(<< "LN " << __LINE__ << std::endl); }
// Prints "<expression text> = (<value>)".
#define DUMP(var) { COUT(<< #var << " = (" << (var) << ")" << std::endl); }
// Short fixed-width unsigned integer aliases.
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;
// Type of the integer (fixed-point) logarithms stored in the optimized sieve array.
using ILog2T = u16;
// Type used for primes and for sieve start offsets.
using PrimeT = u32;
// Mutex taken by the COUT macro around its output.
std::mutex cout_mux;
/// Returns all primes strictly less than `end`, in increasing order.
///
/// Primes found so far are kept in a per-thread cache that only ever grows,
/// and the result is a copy of the cached prefix below `end`. The cache is
/// never shrunk, so a call with a small `end` after a call with a large one
/// still returns the correct list, and `end <= 2` yields an empty vector.
///
/// NOTE(review): `T` must be wide enough to hold the first prime >= `end`.
template <typename T>
std::vector<T> GenPrimes(size_t end) {
    thread_local std::vector<T> cache = {2, 3};
    // Extend the cache by trial division until its last prime reaches `end`.
    while (cache.back() < end) {
        for (T p = cache.back() + 2;; p += 2) {
            bool is_prime = true;
            for (auto d: cache) {
                // Divisors above sqrt(p) need not be tested.
                if (std::uint64_t(d) * d > p)
                    break;
                if (p % d == 0) {
                    is_prime = false;
                    break;
                }
            }
            if (is_prime) {
                cache.push_back(p);
                break;
            }
        }
    }
    // Copy out only the primes below `end`; the cache itself stays intact.
    std::vector<T> result;
    for (auto p: cache) {
        if (p >= end)
            break;
        result.push_back(p);
    }
    return result;
}
// Baseline sieve: for every prime in `factorBase`, adds log2(prime) to each
// `logApprox` entry visited by stepping from that prime's start index in
// increments of the prime. `startIndex` is taken by value, so the caller's
// start offsets are left untouched.
// NOTE(review): distinct primes can visit the same entry and the update is
// not synchronized between OpenMP threads.
void SieveA(std::vector<float> & logApprox, std::vector<PrimeT> const & factorBase, std::vector<PrimeT> startIndex) {
    #pragma omp parallel for
    for (size_t idx = 0; idx < factorBase.size(); ++idx) {
        auto const prime = factorBase[idx];
        float const weight = std::log(prime) / std::log(2);
        // `pos` aliases this prime's running offset inside the local copy.
        for (auto & pos = startIndex[idx]; pos < logApprox.size(); pos += prime)
            logApprox[pos] += weight;
    }
}
// Number of worker threads to use: the hardware concurrency, or 1 when the
// library cannot determine it. hardware_concurrency() is allowed to return 0,
// and callers use this value as a divisor and as an OpenMP thread count.
size_t NThreads() {
    //return 1;
    size_t const hw = std::thread::hardware_concurrency();
    return hw != 0 ? hw : 1;
}
// Converts a logarithm to its fixed-point ILog2T form: the value is scaled by
// 2^(bits in ILog2T - 8), 0.5 is added, and the result is converted to ILog2T.
ILog2T LogToI(double x) {
    constexpr size_t scale_bits = sizeof(ILog2T) * 8 - 8;
    double const scaled = x * (1ULL << scale_bits);
    return ILog2T(scaled + 0.5);
}
// Inverse of LogToI: divides the stored integer by the same power of two.
double IToLog(ILog2T x) {
    constexpr size_t scale_bits = sizeof(ILog2T) * 8 - 8;
    double const divisor = double(1ULL << scale_bits);
    return x / divisor;
}
// Seconds elapsed since the first call to Time().
//
// Uses steady_clock: the value is only ever used for intervals, and
// high_resolution_clock is not guaranteed to be monotonic, so differences of
// its readings can jump when the system clock is adjusted.
double Time() {
    using Clock = std::chrono::steady_clock;
    static auto const gtb = Clock::now();
    return std::chrono::duration_cast<std::chrono::duration<double>>(
        Clock::now() - gtb).count();
}
// Formats `x` in fixed notation with `round` digits after the decimal point.
std::string FloatToStr(double x, size_t round = 6) {
    std::ostringstream out;
    out << std::fixed << std::setprecision(round) << x;
    return out.str();
}
// Blocked sieve: adds the fixed-point log2 of every prime to each entry of
// `logs` on that prime's arithmetic progression (first hit starts0[i], step
// primes[i]). `logs` is split into one contiguous range per thread, and each
// range is processed in blocks of 2^13 entries, so a given entry is only
// written by the thread that owns its range.
//
// Returns the seconds spent in the sieving phase (the second parallel loop),
// measured with Time(); the per-thread setup loop is not included.
//
// NOTE(review): offsets are stored as u32 — assumes logs.size() fits in 32
// bits; confirm against callers.
double SieveB(std::vector<ILog2T> & logs, std::vector<PrimeT> const & primes, std::vector<PrimeT> const & starts0) {
auto const nthr = NThreads();
// Per-thread tables indexed by prime: first-hit offset relative to the
// thread's range begin, and the prime's fixed-point logarithm.
std::vector<std::vector<PrimeT>> starts(nthr, std::vector<PrimeT>(primes.size()));
std::vector<std::vector<ILog2T>> plogs(nthr, std::vector<ILog2T>(primes.size()));
// Half-open [begin, end) range of `logs` owned by each thread.
std::vector<std::pair<u64, u64>> ranges(nthr);
size_t constexpr block_log2 = 13, block = 1 << block_log2, ring_log2 = 6, ring_size = 1ULL << ring_log2, ring_mask = ring_size - 1;
// Per-thread ring of ring_size buckets. Bucket (block number & ring_mask)
// holds (prime index, absolute offset of that prime's next hit) pairs.
std::vector<std::vector<std::vector<std::pair<u32, u32>>>> ring(nthr, std::vector<std::vector<std::pair<u32, u32>>>(ring_size));
// Setup phase: every thread computes its range and seeds its own ring.
#pragma omp parallel for
for (size_t ithr = 0; ithr < nthr; ++ithr) {
// Per-thread range length: ceil(size / nthr) rounded up to a multiple of `block`.
size_t const nblock = ((logs.size() + nthr - 1) / nthr + block - 1) / block * block,
begin = ithr * nblock, end = std::min<size_t>(logs.size(), (ithr + 1) * nblock);
ranges[ithr] = {begin, end};
for (size_t i = 0; i < primes.size(); ++i) {
PrimeT const p = primes[i];
// mod = (starts0[i] - begin) modulo p, computed without going negative:
// the distance from `begin` to this prime's first hit at or after it.
size_t const mod0 = begin % p, mod = starts0[i] < mod0 ? p + starts0[i] - mod0 : starts0[i] - mod0;
starts[ithr][i] = mod;
plogs[ithr][i] = LogToI(std::log2(p));
// Queue the first hit in the bucket of the block that contains it.
ring[ithr][((begin + starts[ithr][i]) >> block_log2) & ring_mask].push_back({i, begin + starts[ithr][i]});
}
}
auto tim = Time();
// Sieving phase: each thread walks the blocks of its own range in order.
#pragma omp parallel for
for (size_t ithr = 0; ithr < nthr; ++ithr) {
auto const [begin, end] = ranges[ithr];
// Block numbers covered by this thread: [bbegin, bend).
auto const [bbegin, bend] = std::make_tuple(begin / block, (end - 1) / block + 1);
// NOTE(review): cstarts, cbegin, hit_cnt and miss_cnt are only referenced
// by commented-out code below.
auto const & cstarts = starts.at(ithr);
auto const & cplogs = plogs.at(ithr);
auto & cring = ring[ithr];
std::decay_t<decltype(cring[0])> tmp;
size_t hit_cnt = 0, miss_cnt = 0;
for (size_t iblock = bbegin; iblock < bend; ++iblock) {
size_t const cbegin = iblock << block_log2, cend = std::min<size_t>(end, (iblock + 1) << block_log2);
// Take a copy of this block's bucket and empty it; entries are re-queued below.
auto & ring_cur = cring[iblock & ring_mask];
tmp = ring_cur;
ring_cur.clear();
for (auto [ip, off]: tmp)
if (off >= cend) {
// The next hit lies beyond this block (the bucket is shared by block
// numbers that are equal modulo ring_size): keep it queued here.
//++miss_cnt;
ring_cur.push_back({ip, off});
} else {
//++hit_cnt;
auto const p = primes[ip];
auto const plog = cplogs[ip];
// Add the prime's log to every hit inside the current block.
for (; off < cend; off += p) {
//if (8192 - 10 <= off && off <= 8192 + 10) COUT(<< "logs.size() " << logs.size() << " begin " << begin << " end " << end << " bbegin " << bbegin << " bend " << bend << " cbegin " << cbegin << " cend " << cend << " iblock " << iblock << " off " << off << " p " << p << " plog " << plog << std::endl);
logs[off] += plog;
}
// Re-queue the next hit in the bucket of its block, unless it falls
// outside this thread's range.
if (off < end)
cring[(off >> block_log2) & ring_mask].push_back({ip, off});
}
}
//COUT(<< "hit_ratio " << std::fixed << std::setprecision(6) << double(hit_cnt) / (hit_cnt + miss_cnt) << std::endl);
}
return Time() - tim;
}
// Runs the baseline (SieveA) and the blocked (SieveB) sieve on the same
// primes and pseudo-random start offsets, then prints both timings, their
// ratio, and the fraction of entries on which the two results agree to
// within 0.1.
void Test() {
    constexpr size_t kLen = 1ULL << 28;
    std::mt19937_64 gen{123};

    auto const primes = GenPrimes<PrimeT>(1 << 12);
    std::vector<PrimeT> starts;
    for (size_t k = 0; k < primes.size(); ++k)
        starts.push_back(gen() % primes[k]);
    ASSERT(primes.size() == starts.size());

    std::vector<float> baseline(kLen);
    std::vector<ILog2T> blocked(kLen);

    // Baseline is timed from outside; SieveB reports its own sieving time.
    double const baseline_begin = Time();
    SieveA(baseline, primes, starts);
    double const baseline_sec = Time() - baseline_begin;

    double const blocked_sec = SieveB(blocked, primes, starts);

    size_t agree = 0;
    for (size_t pos = 0; pos != kLen; ++pos) {
        bool const close = std::abs(baseline[pos] - IToLog(blocked[pos])) < 0.1;
        agree += close ? 1 : 0;
    }

    std::cout << std::fixed << std::setprecision(3) << "time_simple " << baseline_sec << " sec, time_optimized " << blocked_sec << " sec, boost " << (baseline_sec / blocked_sec) << ", correct_ratio " << std::setprecision(9) << double(agree) / kLen << std::endl;
}
// Sets the OpenMP thread count and runs Test(); a std::exception is reported
// on stdout and turned into exit code -1.
int main() {
    int exit_code = 0;
    try {
        omp_set_num_threads(NThreads());
        Test();
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        exit_code = -1;
    }
    return exit_code;
}
Output:
time_simple 17.859 sec, time_optimized 0.874 sec, boost 20.434, correct_ratio 0.999999993
In my opinion, you should switch the schedule to static and give it a chunk size (https://software.intel.com/en-us/articles/openmp-loop-scheduling).
A small optimization would be:
outside of the big for loop, declare a constant initialized to 1/std::log(2), and then, inside the for loop, multiply by that constant instead of dividing by std::log(2); division is very expensive in CPU cycles.
I need to do something like this as fast as possible (O(1) would be perfect):
for (int j = 0; j < V; ++j)
{
if(!visited[j]) required[j]=0;
}
I came up with this solution:
for (int j = 0; j < V; ++j)
{
required[j]=visited[j]&required[j];
}
Which made the program run 3 times faster but I believe there is an even better way to do this. Am I right?
Btw. required and visited are dynamically allocated arrays
// Per-index flag arrays of length V, allocated on the heap.
// NOTE(review): `new bool[V]` leaves the elements uninitialized; presumably
// both arrays are filled elsewhere before they are read — confirm.
bool *required;
bool *visited;
required = new bool[V];
visited = new bool[V];
In the case where you're using a list of simple objects, you are most likely best suited using the functionality provided by the C++ Standard Library. Structures like valarray and vectors are recognized and optimized very effectively by all modern compilers.
Much debate exists as to how much you can rely on your compiler, but one guarantee is, your compiler was built alongside the standard library and relying on it for basic functionality (such as your problem) is generally a safe bet.
Never be afraid to run your own time tests and race your compiler! It's a fun exercise and one that is ever increasingly difficult to achieve.
Construct a valarray (highly optimized in c++11 and later):
// Each valarray is constructed as a copy of the first V elements of the raw array.
std::valarray<bool> valRequired(required, V);
std::valarray<bool> valVisited(visited, V);
// Element-wise AND. The result is stored in valRequired (the copy), not in
// the original `required` array.
valRequired &= valVisited;
Alternatively, you could do it with one line using transform:
// std::transform takes iterators, not element values: [required, required + V)
// is the whole input range, `visited` supplies the second operand, and the
// result is written back into `required` in place.
std::transform(required, required + V, visited, required, [](bool r, bool v){ return r && v; });
Edit: while fewer lines is not faster, your compiler will likely vectorize this operation.
I also tested their timing:
// Micro-benchmark of four ways to AND `visited` into `required` on 5-element
// arrays. Prints the elapsed clock ticks of each variant, labelled 1 to 4.
// Every variant gets fresh copies of the same input data.
int main(int argc, const char * argv[]) {
    auto clock = std::chrono::high_resolution_clock{};
    {
        // 1: compound assignment in a plain loop.
        bool visited[5] = {1,0,1,0,0};
        bool required[5] = {1,1,1,0,1};
        auto start = clock.now();
        for (int i = 0; i < 5; ++i) {
            required[i] &= visited[i];
        }
        auto end = clock.now();
        std::cout << "1: " << (end - start).count() << std::endl;
    }
    {
        // 2: explicit AND and assignment in a plain loop.
        bool visited[5] = {1,0,1,0,0};
        bool required[5] = {1,1,1,0,1};
        auto start = clock.now();
        for (int i = 0; i < 5; ++i) {
            required[i] = visited[i] & required[i];
        }
        auto end = clock.now();
        std::cout << "2: " << (end - start).count() << std::endl;
    }
    {
        // 3: std::transform over all 5 elements. The end iterator is one past
        // the last element, so it must be required + 5 to match the other
        // variants.
        bool visited[5] = {1,0,1,0,0};
        bool required[5] = {1,1,1,0,1};
        auto start = clock.now();
        std::transform(required, required + 5, visited, required, [](bool r, bool v){ return r & v; });
        auto end = clock.now();
        std::cout << "3: " << (end - start).count() << std::endl;
    }
    {
        // 4: std::valarray; construction of the copies is not timed.
        bool visited[5] = {1,0,1,0,0};
        bool required[5] = {1,1,1,0,1};
        std::valarray<bool> valVisited(visited, 5);
        std::valarray<bool> valrequired(required, 5);
        auto start = clock.now();
        valrequired &= valVisited;
        auto end = clock.now();
        std::cout << "4: " << (end - start).count() << std::endl;
    }
}
Output:
1: 102
2: 55
3: 47
4: 45
Program ended with exit code: 0
Along the lines of #AlanStokes's suggestion, use packed binary data combined with the AVX-512 intrinsic _mm512_and_epi64, which processes 512 bits at a time. Be prepared for it to mess up your hair.