I know that this question sounds like an easy question and a duplicate of former ones, in which boost.timer and the chrono facility of C++11 are given as answers.
But, what I have in mind is a bit different and I found no answer to it either on StackOverflow or elsewhere:
In my (C++11) program on Ubuntu Linux, I start several threads with std::async and the std::future mechanism.
Inside every thread I measure CPU-Time with boost.timer(). If I start only one thread I get a CPU time of (in my example) ~0.39 sec and an equal WC time of ~0.39 sec.
If I start several threads I get a longer WC time for each, say 0.8 sec for 16 threads and now the CPU-time for each is about 6.4 sec, that is 8 * 0.8 sec (I have a quad-core Xeon CPU).
So the CPU-Time of each thread is seemingly multiplied by the (number of CPU cores) * 2.
Of course(?) I would like to see a CPU-time near 0.39 sec for each thread, as this is probably still the time the thread uses the CPU for its purposes. The longer CPU time shown (multiplied with the "CPU number factor") is not so much of help in gauging the true CPU consumption of each thread separately.
For illustration I append my test program and its output, first for one thread, then for 16 threads.
So my question is: What can I do, which library, function or programming technique can I use, to get the true CPU usage of each thread which should not change much with the number of threads started?
#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <future>
#include <mutex>
#include <chrono>
#include <boost/timer/timer.hpp>
std::mutex mtx; // serialises std::cout so per-thread log lines do not interleave
// Small stopwatch bundling three measurements taken over one interval:
//  - wall-clock time            (std::chrono::system_clock)
//  - a steady-clock interval    (std::chrono::steady_clock)
//  - process CPU time           (boost::timer::cpu_timer, user + system)
// NOTE(review): boost::timer::cpu_timer reports *process-wide* CPU time,
// not per-thread CPU time -- which is exactly the effect the surrounding
// question observes when several threads run at once.
class XTimer
{
public:
XTimer() {};
void start();             // capture start timestamps, reset cached results
void stop();              // compute elapsed values into the *_val members
double cpu_time();        // seconds: steady-clock interval from stop()
double boost_cpu_time();  // seconds: process CPU time (user + system)
double wc_time();         // seconds: wall-clock interval
std::chrono::time_point<std::chrono::system_clock> timestamp_wc;
std::chrono::time_point<std::chrono::steady_clock> timestamp_cpu;
boost::timer::cpu_timer timer_cpu;
double wc_time_val;
double cpu_time_val;
double boost_cpu_time_val;
};
// Begin a new measurement interval: clear the cached results, then capture
// fresh timestamps for all three clocks.
void XTimer::start()
{
    wc_time_val = 0;
    cpu_time_val = 0;
    boost_cpu_time_val = 0;
    timestamp_wc = std::chrono::system_clock::now();
    timestamp_cpu = std::chrono::steady_clock::now();
    timer_cpu.start();
}
// End the measurement interval and cache the elapsed values, in seconds.
void XTimer::stop()
{
    const auto wall_ns = std::chrono::system_clock::now() - timestamp_wc;
    const auto steady_ns = std::chrono::steady_clock::now() - timestamp_cpu;
    const auto elapsed = timer_cpu.elapsed();
    const auto process_cpu_ns = elapsed.system + elapsed.user;
    //std::cout << "boost: cpu elapsed = " << process_cpu_ns << std::endl;
    // All counts here are nanoseconds; convert to floating-point seconds.
    wc_time_val = double(wall_ns.count()) / 1e9;
    cpu_time_val = double(steady_ns.count()) / 1e9;
    boost_cpu_time_val = double(process_cpu_ns) / 1e9;
}
double XTimer::cpu_time()
{
return cpu_time_val;
}
double XTimer::boost_cpu_time()
{
return boost_cpu_time_val;
}
double XTimer::wc_time()
{
return wc_time_val;
}
// Blocks until every future in fuvec is ready and returns how many were
// waited on.
// FIX: the original copied every result into a temporary vector only to
// return its size (also narrowing size_t to int implicitly); we now just
// count as we drain the futures.
template<class T>
int wait_for_all(std::vector<std::future<T>> & fuvec)
{
    int completed = 0;
    for (auto & fu : fuvec) {
        fu.get();   // propagate any stored exception, discard the value
        ++completed;
    }
    return completed;
}
// Worker body for one async task: logs its start (serialised via mtx),
// burns CPU with N sine evaluations, then logs its per-task timings.
int test_thread(int a)
{
    const int N = 10000000;
    double acc = 0;
    XTimer tt;
    {
        // Plain scope keeps the lock held only for the logging line.
        std::lock_guard<std::mutex> lck {mtx};
        std::cout << "start thread: " << a << std::endl;
    }
    tt.start();
    for (int k = 0; k < N; ++k) {
        acc += sin(k);
    }
    tt.stop();
    {
        std::lock_guard<std::mutex> lck {mtx};
        std::cout << "end thread: " << a << std::endl;
        std::cout << "boost cpu = " << tt.boost_cpu_time() << " wc = " << tt.wc_time() << std::endl;
    }
    return 0;
}
// Launches num_threads async tasks running test_thread, waits for all of
// them, and prints aggregate wall-clock and process CPU times.
// Returns the number of tasks that completed.
// FIX: the function is declared int but had no return statement -- flowing
// off the end of a value-returning function is undefined behaviour in C++.
int test_threads_start(int num_threads)
{
    std::vector<std::future<int>> fivec;
    XTimer tt;
    tt.start();
    for(int i = 0; i < num_threads; ++i) {
        fivec.push_back(std::async(test_thread, i));
    }
    int sz = wait_for_all(fivec);
    tt.stop();
    std::cout << std::endl << std::endl;
    std::cout << "all threads finished: total wc time = " << tt.wc_time() << std::endl;
    std::cout << "all threads finished: total boost cpu time = " << tt.boost_cpu_time() << std::endl;
    return sz;
}
// Entry point: an optional argv[1] selects how many worker tasks to start.
int main(int argc, char** argv)
{
    int num_threads = 1;   // default when no argument is given
    if (argc > 1) {
        num_threads = atoi(argv[1]);
    }
    std::cout << "starting " << num_threads << " threads." << std::endl;
    test_threads_start(num_threads);
    std::cout << "end." << std::endl;
    return 0;
}
It can be compiled by
g++ -o testit testit.cpp -L/usr/lib/x86_64-linux-gnu -pthread -lboost_timer -lboost_system -lboost_thread
Sample output with 1 thread
starting 1 threads.
start thread: 0
end thread: 0
boost cpu = 0.37 wc = 0.374107
all threads finished: total wc time = 0.374374
all threads finished: total boost cpu time = 0.37
Sample output with 16 threads
starting 16 threads.
start thread: 0
start thread: 1
start thread: 2
start thread: 3
start thread: 4
start thread: 10
start thread: 5
start thread: 7
start thread: 6
start thread: 11
start thread: 8
start thread: 9
start thread: 13
start thread: 12
start thread: 14
start thread: 15
end thread: 1
boost cpu = 4.67 wc = 0.588818
end thread: 2
boost cpu = 5.29 wc = 0.66638
end thread: 0
boost cpu = 5.72 wc = 0.7206
end thread: 13
boost cpu = 5.82 wc = 0.728717
end thread: 11
boost cpu = 6.18 wc = 0.774979
end thread: 12
boost cpu = 6.17 wc = 0.773298
end thread: 6
boost cpu = 6.32 wc = 0.793143
end thread: 15
boost cpu = 6.12 wc = 0.767049
end thread: 4
boost cpu = 6.7 wc = 0.843377
end thread: 14
boost cpu = 6.74 wc = 0.84842
end thread: 3
boost cpu = 6.91 wc = 0.874065
end thread: 9
boost cpu = 6.83 wc = 0.86342
end thread: 5
boost cpu = 7 wc = 0.896873
end thread: 7
boost cpu = 7.05 wc = 0.917324
end thread: 10
boost cpu = 7.11 wc = 0.930335
end thread: 8
boost cpu = 7.03 wc = 0.940374
all threads finished: total wc time = 0.957748
all threads finished: total boost cpu time = 7.14
end.
Documentation of boost::timer does not mention anything about per-thread measurements. Fortunately boost::chrono contains thread_clock which gives per-thread CPU usage on platforms which support it. It uses the same interface as the std::chrono clocks and measures the CPU time consumed by the calling thread.
After adding following lines to your example code:
// Includes section
#include <boost/chrono.hpp>
// XTimer
boost::chrono::thread_clock::time_point timestamp_thread_wc;
double thread_wc_time_val;
// XTimer::start()
timestamp_thread_wc = boost::chrono::thread_clock::now();
// XTimer::stop()
const auto ns_thread_wc = boost::chrono::thread_clock::now() - timestamp_thread_wc;
thread_wc_time_val = double(ns_thread_wc.count())/1e9;
// test_thread() just after for loop
sleep(1);
// test_thread() in bottom do -> while(0) loop
std::cout << "thread cpu = " << tt.thread_wc_time_val << std::endl;
and compiling with additional -lboost_chrono option I get:
starting 1 threads.
start thread: 0
end thread: 0
boost cpu = 0.16 wc = 1.16715
thread cpu = 0.166943
all threads finished: total wc time = 1.16754
all threads finished: total boost cpu time = 0.16
end.
and:
starting 2 threads.
start thread: 0
start thread: 1
end thread: 1
boost cpu = 0.28 wc = 1.14168
thread cpu = 0.141524
end thread: 0
boost cpu = 0.28 wc = 1.14417
thread cpu = 0.14401
all threads finished: total wc time = 1.14442
all threads finished: total boost cpu time = 0.28
end.
Related
Even without contention, the scalability of std::mutex seems to be horrible. This is a case where every thread is guaranteed to use its own mutex. What is going on?
#include <mutex>
#include <vector>
#include <numeric>
// Spin loop for one test thread: repeatedly locks/unlocks a mutex that is
// local to this thread (so there is never contention) and counts the
// iterations into *pResult.
// NOTE(review): *pbFinished is a plain bool written by another thread with
// no synchronisation -- a data race (UB); the compiler may also hoist the
// load out of the loop. std::atomic<bool> would be the correct type, but
// changing it alters the function's signature for all callers.
void TestThread(bool *pbFinished, int* pResult)
{
std::mutex mtx;
for (; !*pbFinished; (*pResult)++)
{
mtx.lock();
mtx.unlock();
}
}
// Runs coreCnt spinner threads for ~3 seconds and prints the aggregate
// lock/unlock rate in millions of operations per second.
// FIX: the original allocated every std::thread with `new` and never
// deleted it (a memory leak); the threads are now held by value in the
// vector, so they are destroyed automatically after join().
void Test(int coreCnt)
{
    const int ms = 3000;
    bool bFinished = false;   // NOTE(review): unsynchronised stop flag; see TestThread
    std::vector<int> results(coreCnt);
    std::vector<std::thread> threads;
    threads.reserve(coreCnt);
    for (int i = 0; i < coreCnt; i++)
        threads.emplace_back(TestThread, &bFinished, &results[i]);
    std::this_thread::sleep_for(std::chrono::milliseconds(ms));
    bFinished = true;
    for (std::thread& t : threads)
        t.join();
    int sum = std::accumulate(results.begin(), results.end(), 0);
    printf("%d cores: %.03fm ops/sec\n", coreCnt, double(sum)/double(ms)/1000.);
}
// Runs the benchmark once per thread count, from 1 up to the hardware limit.
int main(int argc, char** argv)
{
    const int maxCores = (int)std::thread::hardware_concurrency();
    for (int cores = 1; cores <= maxCores; cores++)
        Test(cores);
    return 0;
}
Results in Windows are very bad:
1 cores: 15.696m ops/sec
2 cores: 12.294m ops/sec
3 cores: 17.134m ops/sec
4 cores: 9.680m ops/sec
5 cores: 13.012m ops/sec
6 cores: 21.142m ops/sec
7 cores: 18.936m ops/sec
8 cores: 18.507m ops/sec
Linux manages to be an even bigger loser:
1 cores: 46.525m ops/sec
2 cores: 15.089m ops/sec
3 cores: 15.105m ops/sec
4 cores: 14.822m ops/sec
5 cores: 14.519m ops/sec
6 cores: 14.544m ops/sec
7 cores: 13.996m ops/sec
8 cores: 13.869m ops/sec
I have also tried using tbb's readers/writer lock, and even rolled my own.
I made my own variant of your test with the following changes:
Each test thread executes for a specific number of iterations instead for a specific amount of time. Each thread returns how long it took to run the number of iterations. (For testing, I used 20 million iterations).
The main thread orchestrating the threads waits for each thread to "signal" that its ready to begin. Then the main thread, upon seeing all threads are "ready", it signals "go" to all the tests. These signals are basically condition_variables. This basically eliminates performance noise of having one thread starting to execute while another thread is getting warmed up the kernel.
No attempt by the thread to access a global variable until the thread exits to return results.
When all the threads have finished, the total number of iterations is computed with the total amount of time each thread took.
used the high resolution clock to measure time used in each thread
// One-shot ready/go latch built from a mutex and a condition variable:
// Signal() sets the flag and wakes everyone; Wait() blocks until it is set.
struct TestSignal
{
    std::mutex mut;
    std::condition_variable cv;
    bool isReady;
    TestSignal() : isReady(false)
    {
    }
    void Signal()
    {
        {
            std::lock_guard<std::mutex> guard(mut);
            isReady = true;
        }
        // Notify after releasing the lock, as the original did.
        cv.notify_all();
    }
    void Wait()
    {
        std::unique_lock<std::mutex> lck(mut);
        cv.wait(lck, [this] { return isReady; });
    }
};
// One benchmark thread: announces readiness, waits at the common start
// line, then times `iterations` uncontended lock/unlock pairs on a local
// mutex. Returns the elapsed time in milliseconds.
// FIX: the loop counter was a plain int compared against a long long bound
// (overflow for counts above INT_MAX), and an unused `diff` local was
// computed; both are corrected.
long long TestThread2(long long iterations, TestSignal& signalReady, TestSignal& signalGo)
{
    std::mutex mtx;
    signalReady.Signal(); // signal to the main thread we're ready to proceed
    signalGo.Wait();      // wait for the main thread to tell us to start
    const auto start = std::chrono::high_resolution_clock::now();
    for (long long i = 0; i < iterations; i++)
    {
        mtx.lock();
        mtx.unlock();
    }
    const auto end = std::chrono::high_resolution_clock::now();
    const auto milli = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    return milli.count(); // return how long it took to execute the iterations
}
// Coordinated benchmark: starts `threadcount` threads, parks them all on a
// shared "go" signal, releases them at once, and sums the per-thread
// lock/unlock rates.
// FIX: removed the unused `totaltime` accumulator and guarded the rate
// division against a 0 ms result (possible on very fast machines, which
// would otherwise produce an infinite rate).
void Test2(unsigned int threadcount)
{
    long long iterations = 20000000; // 20 million lock/unlock pairs per thread
    std::vector<std::thread> threads(threadcount);
    std::vector<TestSignal> readySignals(threadcount);
    std::vector<long long> results(threadcount);
    TestSignal signalGo;
    for (unsigned int i = 0; i < threadcount; i++)
    {
        auto t = std::thread([&results, &readySignals, &signalGo, i, iterations] {results[i] = TestThread2(iterations, readySignals[i], signalGo); });
        readySignals[i].Wait();   // wait until this thread is parked before starting the next
        threads[i] = std::move(t);
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    signalGo.Signal(); // unleash the threads
    for (unsigned int i = 0; i < threadcount; i++)
    {
        threads[i].join();
    }
    double totalrate = 0;
    for (unsigned int i = 0; i < threadcount; i++)
    {
        double denom = (double)results[i];
        if (denom <= 0) denom = 1;                 // guard: sub-millisecond run reported 0 ms
        double rate = iterations / denom;          // operations per millisecond
        totalrate += rate;
    }
    std::cout << threadcount << " threads: " << totalrate/1000 << "m ops/sec (new test)\n";
}
Then a simple main to compare both results 3x times:
// Interleaves the original and the coordinated benchmark three times each,
// so the two measurement styles can be compared back to back.
int main()
{
#ifdef WIN32
    ::SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
#endif
    for (int pass = 0; pass < 3; ++pass)
    {
        Test(std::thread::hardware_concurrency());
        Test2(std::thread::hardware_concurrency());
    }
    return 0;
}
The results are noticably different:
12 cores: 66.343m ops/sec
12 threads: 482.187m ops/sec (new test)
12 cores: 111.061m ops/sec
12 threads: 474.199m ops/sec (new test)
12 cores: 66.758m ops/sec
12 threads: 481.353m ops/sec (new test)
I wrote a program which uses std::thread::hardware_concurrency to find out how many threads my computer can run concurrently. Then I divide the size of the array by N to get N blocks, and I create N threads to calculate the sum of each block. Here is the code
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>
#include <stdlib.h>
// Slowest per-block accumulate time observed so far, in microseconds.
// NOTE(review): written from several worker threads in accumulate_block
// without synchronisation -- a data race; it should be atomic or guarded.
int64_t thread_cost_time = 0;
// Functor run by each worker thread: accumulates [first, last) into
// `result` and records this block's duration into the global maximum.
template <typename Iterator, typename T> struct accumulate_block {
void operator()(Iterator first, Iterator last, T &result) {
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
result = std::accumulate(first, last, result);
auto stop = std::chrono::high_resolution_clock::now();
auto thread_time =
std::chrono::duration_cast<microseconds>(stop - start).count();
// NOTE(review): unsynchronised read-modify-write of the global
// thread_cost_time from multiple threads is a data race; the fix
// belongs at the global's definition (make it atomic), not here.
thread_cost_time = std::max(thread_time, thread_cost_time);
}
};
// Splits [first, last) into `num` equal blocks, sums num-1 of them on
// worker threads and the final block on the calling thread, then reduces
// the partial sums together with `init`.
// Preconditions (unchanged from the original): num >= 1, and
// std::distance(first, last) >= num so block_size is at least 1.
// FIX: removed the dead locals min_per_thread / hardware_threads /
// max_threads, which were only referenced by commented-out code.
template <typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T &init, uint64_t num) {
  uint64_t length = std::distance(first, last);
  const uint64_t num_threads = num;
  const uint64_t block_size = length / num_threads;
  std::vector<T> results(num_threads);
  std::vector<std::thread> threads(num_threads - 1);
  Iterator block_start = first;
  for (uint64_t i = 0; i < num_threads - 1; i++) {
    Iterator block_end = block_start;
    std::advance(block_end, block_size);
    // Sum this block on its own thread.
    threads[i] = std::thread{accumulate_block<Iterator, T>(), block_start,
                             block_end, std::ref(results[i])};
    block_start = block_end;
  }
  // The calling thread takes the last block (it absorbs any remainder).
  accumulate_block<Iterator, T>()(block_start, last, results[num_threads - 1]);
  std::for_each(threads.begin(), threads.end(),
                std::mem_fn(&std::thread::join));
  return std::accumulate(results.begin(), results.end(), init);
}
// Benchmark driver: for each thread count 2..31, runs 10 timed passes of
// parallel vs single-threaded accumulate over a 100M-element array and
// prints the averages and the resulting speed-up factor.
int main(int argc, char *argv[]) {
// constexpr const uint64_t sz = 1000000000;
for (int number = 2; number < 32; number++) {
int64_t parr = 0;
int64_t single = 0;
int64_t thread_trivial = 0;
std::cout
<< "--------------------------------------------------------------"
<< std::endl;
std::cout << "---------------------thread: " << number
<< "-----------------------" << std::endl;
int iter_times = 10;
for (int iter = 0; iter < iter_times; iter++) {
thread_cost_time = 0;
constexpr const uint64_t sz = 100000000 ;
std::vector<uint64_t> arr;
for (uint32_t i = 0; i < sz; i++) {
arr.emplace_back(i);
}
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
uint64_t init = 0;
parallel_accumulate<decltype(arr.begin()), uint64_t>(
arr.begin(), arr.end(), std::ref(init), number);
auto stop = std::chrono::high_resolution_clock::now();
parr += std::chrono::duration_cast<microseconds>(stop - start).count();
// Estimates thread create/destroy overhead: wall time of the parallel
// run minus the slowest block's pure compute time.
thread_trivial +=
std::chrono::duration_cast<microseconds>(stop - start).count() -
thread_cost_time;
uint64_t init_ = 0;
uint64_t arr_sz = arr.size();
// uint64_t block_sz = arr.size() / 2;
start = std::chrono::high_resolution_clock::now();
// NOTE(review): the result of this accumulate is discarded, so an
// optimising compiler may elide the work and skew the "single"
// baseline; consider assigning and using the result.
std::accumulate(arr.begin(), arr.end(), init_);
// std::cout << init_ << std::endl;
stop = std::chrono::high_resolution_clock::now();
single += std::chrono::duration_cast<microseconds>(stop - start).count();
}
std::cout << "parallel " << parr / iter_times<< std::endl;
std::cout << "single thread " << single / iter_times<< std::endl;
std::cout << "parr is "
<< static_cast<double>(single) / static_cast<double>(parr)
<< "X fast" << std::endl;
std::cout << "thread create and destory time " << thread_trivial / iter_times
<< std::endl;
}
}
I record the time of multithread and single thread.
I can only achieve at most 6.57x faster than use only one thread, even though std::thread::hardware_concurrency tell me I have 12 threads could run simultaneously.
There is no lock contention in this program. I also recorded the time spent creating and destroying the threads; even after subtracting it, I still cannot achieve a 12x speedup.
I think maybe thread scheduling slows the multithreaded version down, but with 12 threads it should not top out at only a 6.57x speedup.
I think maybe multithreads will decrease the hit ratio of cache,but I'm not quite sure.
So how can I achieve 12X faster than use only one thread?
Here are the statistics from my program
threads
parallel
single
faster
2
324868
633777
1.95
3
218584
633777
2.87
4
167169
633777
3.77
5
136542
633777
4.64
6
113207
633777
5.48
7
147324
633777
4.27
8
136768
633777
4.67
You could run my code to get the data from 2 threads to 31 threads
Apparently, at least on my Intel core i7, std::thread::hardware_concurrency() returns the number of hardware threads available. On hardware with simultaneous multi-threading typically 2 hardware threads share time on a single hardware core. The hardware core switches transparently between the 2 hardware threads. That means you only get about half the speedup factor that you might expect based on the result of std::thread::hardware_concurrency().
In practice each hardware thread will stall from time to time for various reasons, e.g. waiting for data to arrive from memory, giving the other hardware thread extra processing time. Typically simultaneous multi-threading (or Hyper-threading as Intel calls it) will give you an extra 15% of performance that way, so you may expect a speedup factor of up to (12/2)*(115/100) = 6.9.
Overheads, including the one you mention, but also in my experience the increased working-set size, can further reduce the speed-up factor.
Running a multithreaded program, I noticed that the program was running faster using 1 thread compared to 4 threads, despite the CPU having 4 cores.
After investigating, I found out that the issue appears only when shuffling something.
Below the minimal program I created to reproduce the problem:
#include <math.h>
#include <algorithm>
#include <chrono>
#include <ctime>
#include <future>
#include <iostream>
#include <random>
#include <vector>
#define NB_JOBS 5000.0   // total amount of work, split evenly across the threads
#define MAX_CORES 8      // highest thread count exercised by main()
// CPU-burning baseline: builds a ~10000-element ramp vector nb_jobs times
// and folds it through sin/pow; the boolean result only exists to keep the
// optimiser from discarding the work.
// FIX: the inner ramp loop reused the name `i`, shadowing the job counter,
// and the summing loop compared a signed int against v.size(); both fixed.
static bool _no_shuffle(int nb_jobs){
    bool b = false;
    for(int job = 0; job < nb_jobs; job++){
        std::vector<float> v;
        for(float x = 0; x < 100.0; x += 0.01) v.push_back(x);
        float sum = 0;
        // no meaning, just using CPU
        for(size_t k = 0; k < v.size(); k++) sum += pow(sin(v[k]), 1.1);
        if(sum == 100) b = true;
    }
    return b;
}
// Shuffle-based workload.
// FIX: std::random_shuffle was deprecated in C++14 and removed in C++17,
// and its common implementation draws from the shared, internally-locked
// rand() state -- serialising all threads, which is the very slowdown the
// surrounding question observes. std::shuffle with a thread_local engine
// gives each thread independent random state with no sharing.
static bool _shuffle(int nb_jobs){
    bool b = false;
    for(int job = 0; job < nb_jobs; job++){
        std::vector<float> v;
        for(float x = 0; x < 100.0; x += 0.01) v.push_back(x);
        thread_local std::mt19937 rng{std::random_device{}()};
        std::shuffle(v.begin(), v.end(), rng); // !!!
        if (v[0] == 0.0) b = true;
    }
    return b;
}
// Launches nb_cores async jobs, each doing an equal share of NB_JOBS work,
// and returns the wall-clock seconds until all of them have finished.
static double _duration(int nb_cores){
    const auto started_at = std::chrono::system_clock::now();
    const int nb_jobs_per_core = rint(NB_JOBS / (float)nb_cores);
    std::vector<std::future<bool>> pending;
    for(int core = 0; core < nb_cores; core++){
        pending.push_back(std::async(std::launch::async, _shuffle, nb_jobs_per_core));
    }
    for(auto &job : pending){
        job.get();   // block until this job finishes; its value is irrelevant
    }
    const auto finished_at = std::chrono::system_clock::now();
    const std::chrono::duration<double> elapsed = finished_at - started_at;
    return elapsed.count();
}
int main(){
for(int nb_cores=1 ; nb_cores<=MAX_CORES ; nb_cores++){
double duration = _duration(nb_cores);
std::cout << nb_cores << " threads: " << duration << " seconds\n";
}
return 0;
}
which prints:
1 threads: 1.18503 seconds
2 threads: 9.6502 seconds
3 threads: 3.64973 seconds
4 threads: 9.8834 seconds
5 threads: 10.7937 seconds
6 threads: 11.6447 seconds
7 threads: 11.9236 seconds
8 threads: 12.1254 seconds
Using threads slow down the program !
on the other hand, when replacing:
std::async(std::launch::async,_shuffle,nb_jobs_per_core));
with:
std::async(std::launch::async,_no_shuffle,nb_jobs_per_core));
then:
1 threads: 3.24132 seconds
2 threads: 1.62207 seconds
3 threads: 1.1745 seconds
4 threads: 1.09769 seconds
5 threads: 1.03182 seconds
6 threads: 0.892274 seconds
7 threads: 0.782815 seconds
8 threads: 0.704777 seconds
which seems as expected, indicating shuffling is indeed the issue.
Is shuffling not thread friendly, and if so, how to shuffle a vector in a multi-threaded program ?
I have created a member function that is called by a number of threads at the same time. Inside this function I want to count the total duration of the execution of a function . The problem is that if I create 4 threads for example, the time I get back is 4 times the actual time! How can I get the actual time? My method looks like this:
void Class1::myTask() {
//...code
chrono::steady_clock::time_point start = chrono::steady_clock::now();
theFunction();
chrono::steady_clock::time_point end = chrono::steady_clock::now();
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
mytime = time_span.count(); // mytime is of atomic type
setTheTime(mytime);
//...more code
}
// The method to set the Total Time
// Adds one sample to the accumulated total.
// NOTE(review): `time = time + mTime` on an atomic is a separate load and
// store, not an atomic read-modify-write -- two threads can interleave and
// lose an update (atomic<double> only gains += in C++20).
void Class1::setTheTime(double mTime){
time = time + mTime; // time is of atomic type
}
This method is called for a very large number of times, so everytime the "end - start" returns something like 0.000897442 sec. The total duration is about 11 sec, but time is ending as something like 44 seconds!
Here is an example of code that works so that you can see the problem:
#include <iostream>
#include <cstdlib>
#include <string>
#include <vector>
#include <thread>
#include <chrono>
#include <atomic>
using namespace std;
// Accumulated in-function time across all threads, in seconds.
// NOTE(review): updated below as `time1 = time1 + x`, which is not an
// atomic increment -- concurrent updates can be lost.
atomic<double> time1;
// Most recent single-call duration, in seconds.
atomic<double> mytime;
// Busy-work payload: ten million increments, nothing observable.
void theFunction() {
    int x = 0;
    int i = 0;
    while (i < 10000000) {
        x++;
        ++i;
    }
}
// Adds one per-thread duration (seconds) to the global total and returns
// the updated total.
// FIX: the function is declared to return double but had no return
// statement -- undefined behaviour; it now returns the accumulated total
// (callers that ignore the return value are unaffected).
// NOTE(review): the update itself is a non-atomic read-modify-write on an
// atomic; concurrent callers can lose updates.
double setTheTime(double mTime1) {
    time1 = time1 + mTime1;
    return time1;
}
void countTime() {
chrono::steady_clock::time_point start = chrono::steady_clock::now();
theFunction();
chrono::steady_clock::time_point end = chrono::steady_clock::now();
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
mytime = time_span.count();
setTheTime(mytime);
}
int main(int argc, char** argv) {
vector<thread> threads;
long double mt;
chrono::steady_clock::time_point start = chrono::steady_clock::now();
for (int i = 0; i < 4; i++)
threads.push_back(thread(countTime));
for (auto& thread : threads)
thread.join();
chrono::steady_clock::time_point end = chrono::steady_clock::now();
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
mt = time_span.count(); // mytime is of atomic type
cout << "Time out of the function: " << mt * 1000 << endl;
cout << "Time inside the function: " << time1 * 1000 << endl;
return 0;
}
Let there be N threads, which run in parallel during X seconds natural time.
So for the time S they accumulate
S = N * X
roughly holds.
And 44s indeed equals 4 * 11s .
So what is the problem? :)
I wrote a small program to check on the performance of threading which I found a couple of questions from the result i obtained
(cpu of my laptop is i5 3220M)
1) The time required pumped up for 2 thread every time I ran the program. Is it because the omp timer I use or I have some logical error in the program?
2) Also will it be better if I use cpu cycle to measure the performance instead?
3) The time continue to decrease as the number of thread increase. I know my program is simple enough so probably requires no context switch but where does the extra performance come? Coz the cpu adj itself to the turbo freq? (Normal 2.6MHz, turbo 3.3MHz according to intel website)
Thanks!
Output
Adding 1 for 1000 million times
Average Time Elapsed for 1 threads = 3.11565(Check = 5000000000)
Average Time Elapsed for 2 threads = 4.54309(Check = 5000000000)
Average Time Elapsed for 4 threads = 2.19321(Check = 5000000000)
Average Time Elapsed for 8 threads = 2.48927(Check = 5000000000)
Average Time Elapsed for 16 threads = 1.84427(Check = 5000000000)
Average Time Elapsed for 32 threads = 1.30958(Check = 5000000000)
Average Time Elapsed for 64 threads = 1.08472(Check = 5000000000)
Average Time Elapsed for 128 threads = 0.996898(Check = 5000000000)
Average Time Elapsed for 256 threads = 1.01366(Check = 5000000000)
Average Time Elapsed for 512 threads = 0.951436(Check = 5000000000)
Average Time Elapsed for 1024 threads = 0.973331(Check = 4999997440)
Program
#include <iostream>
#include <thread>
#include <algorithm> // for_each
#include <vector>
#include <omp.h> // omp_get_wtime
// Per-thread counter: operator()(n) resets `sum` to zero and increments it
// n times. The naive loop is deliberate -- it exists to give each thread
// measurable work.
// FIX: `sum` is now initialised in-class, so an Adder whose operator() has
// not yet run reads 0 instead of an indeterminate value, and the stray
// semicolons after the constructor/operator bodies are removed.
class Adder{
public:
    long sum = 0;
    Adder() {}
    void operator()(long endVal_i){
        sum = 0;
        for (long i = 1; i <= endVal_i; i++)
            sum++;
    }
};
// Benchmark driver: for thread counts 1, 2, 4, ..., 1024 runs numSample
// timed passes, each splitting totalCount increments across the threads,
// and prints the average wall time plus a check sum (the check accumulates
// over all samples, so it equals numSample * totalCount when no work is
// lost to integer truncation of totalCount/numThread).
// FIX: `maxThread` is now a compile-time constant -- `Adder
// adderArray[maxThread]` with a runtime bound was a variable-length array,
// which is not standard C++ -- and the inner sample loop no longer shadows
// its loop counter `i`.
int main()
{
    const long totalCount = 1000000000;
    constexpr int maxThread = 1025;
    const int numSample = 5;
    std::vector<std::thread> threads;
    Adder adderArray[maxThread];
    std::cout << "Adding 1 for " << totalCount/1000000 << " million times\n\n";
    for (int numThread = 1; numThread <= maxThread; numThread = numThread*2){
        double avgTime = 0;
        long check = 0;
        for (int sample = 1; sample <= numSample; sample++){
            double startTime = omp_get_wtime();
            long loop = totalCount/numThread;   // truncation explains the 1024-thread check value
            for (int i = 0; i < numThread; i++)
                threads.push_back(std::thread(std::ref(adderArray[i]), loop));
            std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
            double endTime = omp_get_wtime();
            for (int i = 0; i < numThread; i++)
                check += adderArray[i].sum;
            threads.erase(threads.begin(), threads.end());
            avgTime += endTime - startTime;
        }
        std::cout << "Average Time Elapsed for " << numThread << " threads = " << avgTime/numSample << "(Check = " << check << ")\n";
    }
}