c++ multithreading and affinity - c++

I'm writing a simple thread pool for my application, which I test on dual-core processor. Usually it works good, but i noticed that when other processes are using more than 50% of processor, my application almost halts. This made me curious, so i decided to reproduce this situation and created auxiliary application, which simply runs infinite loop (without multithreading), taking 50% of processor. While auxiliary one is running, multithreaded application almost halts, as before (processing speed falls from 300-400 tasks per second to 5-10 tasks per second). But when I changed process affinity of my multithreaded program to use only one core (auxiliary still uses both), it started working, of course using at most 50% processor left. When I disabled multithreading in my application (still processing the same tasks, but without thread pool), it worked like charm, without any slow down from auxiliary, which was still running (and that's how two applications should behave when running on two cores). But when I enable multithreading, the problem comes back.
I've made special code for testing this particular ThreadPool:
header
#ifndef THREADPOOL_H_
#define THREADPOOL_H_
typedef double FloatingPoint;
#include <queue>
#include <vector>
#include <mutex>
#include <atomic>
#include <condition_variable>
#include <thread>
using namespace std;
// One unit of simulated work handed to the pool.
// `size` is the number of integers a worker appends while processing the task.
struct ThreadTask
{
    int size;

    ThreadTask(int s) : size(s) {}

    ~ThreadTask() {}
};
class ThreadPool
{
protected:
queue<ThreadTask*> tasks;
vector<std::thread> threads;
std::condition_variable task_ready;
std::mutex variable_mutex;
std::mutex max_mutex;
std::atomic<FloatingPoint> max;
std::atomic<int> sleeping;
std::atomic<bool> running;
int threads_count;
ThreadTask * getTask();
void runWorker();
void processTask(ThreadTask*);
bool isQueueEmpty();
bool isTaskAvailable();
void threadMethod();
void createThreads();
void waitForThreadsToSleep();
public:
ThreadPool(int);
virtual ~ThreadPool();
void addTask(int);
void start();
FloatingPoint getValue();
void reset();
void clearTasks();
};
#endif /* THREADPOOL_H_ */
and .cpp
#include "stdafx.h"
#include <climits>
#include <float.h>
#include "ThreadPool.h"
// Records the pool configuration; no thread is started here (that happens in start()).
//
// t: requested number of worker threads. Values below 2 select
//    std::thread::hardware_concurrency(), or 2 when that value is not available.
//
// `max` starts at -FLT_MAX. FLT_MIN is the smallest *positive* float, so using it as the
// initial value of a running maximum would hide every result that is zero or negative.
ThreadPool::ThreadPool(int t)
    : max(-FLT_MAX),
      sleeping(0),
      running(true),
      threads_count(t)
{
    if (threads_count < 2) // one worker thread makes no sense
    {
        threads_count = static_cast<int>(std::thread::hardware_concurrency()); // default value
        if (threads_count == 0) // 'If this value is not computable or well defined, the function returns 0'
            threads_count = 2;
    }
    printf("%d worker threads\n", threads_count);
}
// Shuts the pool down: clears the run flag, waits (via reset()) until every worker is
// blocked on the condition variable, wakes them all so they observe the cleared flag,
// and finally joins each worker thread.
ThreadPool::~ThreadPool()
{
    running = false;
    reset();
    task_ready.notify_all();
    for (std::size_t k = 0; k < threads.size(); ++k)
    {
        threads[k].join();
    }
}
void ThreadPool::start()
{
createThreads();
}
// Blocks until the queue is drained and all workers are idle, then reports `max`.
FloatingPoint ThreadPool::getValue()
{
    waitForThreadsToSleep();
    const FloatingPoint current = max;
    return current;
}
// Launches `threads_count` workers, each running threadMethod().
//
// Calling this again while workers exist is a no-op. Clearing the vector instead would
// destroy joinable std::thread objects, which calls std::terminate().
void ThreadPool::createThreads()
{
    if (!threads.empty())
        return;
    threads.reserve(static_cast<std::size_t>(threads_count));
    for (int i = 0; i < threads_count; ++i)
        threads.emplace_back(&ThreadPool::threadMethod, this);
}
// Body of every worker thread: keep serving tasks until the run flag is cleared.
void ThreadPool::threadMethod()
{
    for (;;)
    {
        if (!running)
            break;
        runWorker();
    }
}
// Single worker step: obtain the next task (possibly NULL) and hand it to processTask().
void ThreadPool::runWorker()
{
    processTask(getTask());
}
// Simulates work for `task` by appending task->size integers to a scratch vector,
// then releases the task. A null task is silently ignored.
void ThreadPool::processTask(ThreadTask * task)
{
    if (!task)
        return;

    // do something to simulate processing
    std::vector<int> scratch;
    int next = 0;
    while (next < task->size)
    {
        scratch.push_back(next);
        ++next;
    }
    delete task;
}
void ThreadPool::addTask(int s)
{
ThreadTask * task = new ThreadTask(s);
std::lock_guard<std::mutex> lock(variable_mutex);
tasks.push(task);
task_ready.notify_one();
}
// Removes and returns the task at the front of the queue.
//
// When the queue is empty the caller is counted in `sleeping` and blocks on `task_ready`
// once. If the queue is still empty after waking (pool shutdown via notify_all, or a
// spurious wake-up) NULL is returned so the worker loop can re-check its run flag.
ThreadTask * ThreadPool::getTask()
{
    std::unique_lock<std::mutex> guard(variable_mutex);
    if (tasks.empty())
    {
        sleeping.fetch_add(1);
        task_ready.wait(guard);
        sleeping.fetch_sub(1);
        if (tasks.empty())
            return NULL; // back to the main loop, which repeats the attempt
    }
    ThreadTask * const next = tasks.front();
    tasks.pop();
    return next;
}
// Reports, under the queue mutex, whether no task is currently waiting.
bool ThreadPool::isQueueEmpty()
{
    std::unique_lock<std::mutex> guard(variable_mutex);
    const bool nothing_queued = tasks.empty();
    return nothing_queued;
}
// Convenience negation of isQueueEmpty().
bool ThreadPool::isTaskAvailable()
{
    return isQueueEmpty() == false;
}
void ThreadPool::waitForThreadsToSleep()
{
while(isTaskAvailable())
std::this_thread::yield(); //wait for all tasks to be taken
while(true) //wait for all threads to finish they last tasks
{
if(sleeping == threads_count)
break;
std::this_thread::yield();
}
}
// Discards every task that is still waiting in the queue.
//
// The queue owns its entries: they are allocated with `new` in addTask() and normally
// deleted by processTask(). A task that is dropped here never reaches processTask(), so
// it has to be deleted while it is removed; otherwise it leaks.
void ThreadPool::clearTasks()
{
    std::lock_guard<std::mutex> lock(variable_mutex);
    while (!tasks.empty())
    {
        delete tasks.front();
        tasks.pop();
    }
}
// Returns the pool to an idle state: drops pending tasks, waits for the workers to go to
// sleep and re-initialises `max`.
//
// Must not be called while variable_mutex is already locked by the calling thread,
// because clearTasks() locks that (non-recursive) mutex.
//
// `max` is reset to -FLT_MAX. FLT_MIN is the smallest *positive* float and therefore not a
// valid starting point for a running maximum.
void ThreadPool::reset()
{
    clearTasks();
    waitForThreadsToSleep();
    max = -FLT_MAX;
}
how it's tested:
ThreadPool tp(2);
tp.start();
int iterations = 1000;
int task_size = 1000;
for(int j = 0; j < iterations; ++j)
{
printf("\r%d left", iterations - j);
tp.reset();
for(int i = 0; i < 1000; ++i)
tp.addTask(task_size);
tp.getValue();
}
return 0;
I've build this code with mingw with gcc 4.8.1 (from here) and Visual Studio 2012 (VC11) on Win7 64, both on debug configuration.
Two programs build with mentioned compilers behave totally different.
a) program build with mingw works much faster than one build on VS, when it can take whole processor (system shows almost 100% CPU usage by this process, so i don't think mingw is secretly setting affinity to one core). But when i run auxiliary program (using 50% of CPU), it slows down greatly (about several dozen times). CPU usage in this case is about 50%-50% for main program and auxiliary one.
b) program build with VS 2012, when using whole CPU, is even slower than a) with slowdown (when i set task_size = 1, their speeds were similiar). But when auxiliary is running, main program even takes most of CPU (usage is about 66% main - 33% aux) and resulting slow down is barely noticeable.
When set to use only one core, both programs speed up noticeable (about 1.5 - 2 times), and mingw one stops being vulnerable to competition.
Well, now I don't know what to do. My program behaves differently when built with two different toolsets. Is this a flaw in my code (which I suppose is the case), or something to do with the compilers having problems with C++11?

Related

Thread with expensive operations slows down UI thread - Windows 10, C++

The Problem: I have two threads in a Windows 10 application I'm working on, a UI thread (called the render thread in the code) and a worker thread in the background (called the simulate thread in the code). Every couple of seconds or so, the background thread has to perform a very expensive operation that involves allocating a large amount of memory. For some reason, when this operation happens, the UI thread lags for a split second and becomes unresponsive (this is seen in the application as a camera not moving for a second while the camera movement input is being given).
Maybe I'm misunderstanding something about how threads work on Windows, but I wasn't aware that this was something that should happen. I was under the impression that you use a separate UI thread for this very reason: to keep it responsive while other threads do more time intensive operations.
Things I've tried: I've removed all communication between the two threads, so there are no mutexes or anything of that sort (unless there's something implicit that Windows does that I'm not aware of). I have also tried setting the UI thread to be a higher priority than the background thread. Neither of these helped.
Some things I've noted: While the UI thread lags for a moment, other applications running on my machine are just as responsive as ever. The heavy operation seems to only affect this one process. Also, if I decrease the amount of memory being allocated, it alleviates the issue (however, for the application to work as I want it to, it needs to be able to do this allocation).
The question: My question is two-fold. First, I'd like to understand why this is happening, as it seems to go against my understanding of how multi-threading should work. Second, do you have any recommendations or ideas on how to fix this and get it so the UI doesn't lag.
Abbreviated code: Note the comment about epochs in timeline.h
main.cpp
#include "Renderer/Headers/Renderer.h"
#include "Shared/Headers/Timeline.h"
#include "Simulator/Simulator.h"
#include <iostream>
#include <Windows.h>
unsigned int __stdcall renderThread(void* timelinePtr);
unsigned int __stdcall simulateThread(void* timelinePtr);
int main() {
Timeline timeline;
HANDLE renderHandle = (HANDLE)_beginthreadex(0, 0, &renderThread, &timeline, 0, 0);
if (renderHandle == 0) {
std::cerr << "There was an error creating the render thread" << std::endl;
return -1;
}
SetThreadPriority(renderHandle, THREAD_PRIORITY_HIGHEST);
HANDLE simulateHandle = (HANDLE)_beginthreadex(0, 0, &simulateThread, &timeline, 0, 0);
if (simulateHandle == 0) {
std::cerr << "There was an error creating the simulate thread" << std::endl;
return -1;
}
SetThreadPriority(simulateHandle, THREAD_PRIORITY_IDLE);
WaitForSingleObject(renderHandle, INFINITE);
WaitForSingleObject(simulateHandle, INFINITE);
return 0;
}
// Thread entry point for rendering: builds a Renderer on the shared Timeline passed
// through `timelinePtr` and runs it until run() returns.
unsigned int __stdcall renderThread(void* timelinePtr) {
    Timeline* const shared = static_cast<Timeline*>(timelinePtr);
    Renderer renderer(*shared);
    renderer.run();
    return 0;
}
// Thread entry point for the simulation: builds a Simulator on the shared Timeline passed
// through `timelinePtr` and runs it until run() returns.
unsigned int __stdcall simulateThread(void* timelinePtr) {
    Timeline* const shared = static_cast<Timeline*>(timelinePtr);
    Simulator simulator(*shared);
    simulator.run();
    return 0;
}
simulator.cpp
// abbreviated
// Simulation loop (abbreviated in the original post): never returns and pushes the most
// recent state onto the timeline on every iteration.
void Simulator::run() {
    for (;;) {
        // abbreviated
        timeline->push(latestState);
    }
}
// abbreviated
timeline.h
#ifndef TIMELINE_H
#define TIMELINE_H
#include "WorldState.h"
#include <mutex>
#include <vector>
// History of WorldState snapshots written through push() and read through
// tryGetStateAtFrame(). States are stored in fixed-size chunks ("epochs").
class Timeline {
public:
Timeline();
// Non-blocking lookup: returns false when the lock is busy or the frame lies beyond the
// completed epochs; otherwise stores a pointer to the state in `worldState`.
bool tryGetStateAtFrame(int frame, WorldState*& worldState);
// Copies *worldState into the epoch currently being filled.
void push(WorldState* worldState);
private:
// Author's note: the concept of an Epoch was introduced to help reduce mutex conflicts,
// but right now, since the threads are disconnected, there should be no mutex locks at
// all on the UI thread. However, every 1024 pushes onto the timeline a new Epoch must be
// created. The amount of slowdown largely depends on how much memory the WorldState class
// takes: if WorldState is small there is no noticeable hiccup, but when it is large the
// hiccup becomes noticeable.
class Epoch {
public:
// Number of states held by one epoch.
static const int MAX_SIZE = 1024;
void push(WorldState* worldstate);
int getSize();
WorldState* getAt(int index);
private:
// Number of slots of `states` that have been filled so far.
int size = 0;
// States are stored by value, so constructing an Epoch constructs MAX_SIZE WorldStates.
WorldState states[MAX_SIZE];
};
// Epoch currently being filled by push(); not yet visible to readers.
Epoch* pushEpoch;
// Guards `epochs`.
std::mutex lock;
// Completed (full) epochs, in push order.
std::vector<Epoch*> epochs;
};
#endif // !TIMELINE_H
timeline.cpp
#include "../Headers/Timeline.h"
#include <iostream>
// Starts the timeline with one empty epoch ready to receive pushes.
Timeline::Timeline()
    : pushEpoch(new Epoch()) {
}
// Non-blocking lookup of the state recorded for `frame`.
//
// frame:      zero-based frame number.
// worldState: receives a pointer to the stored state on success; untouched on failure.
// Returns true on success; false when the timeline lock is currently held elsewhere or
// when `frame` is negative or lies beyond the epochs completed so far.
//
// The lock is held through a std::unique_lock so that it is released on every path,
// including when at()/getAt() throw; manual unlock() calls would leave it locked then.
bool Timeline::tryGetStateAtFrame(int frame, WorldState*& worldState) {
    std::unique_lock<std::mutex> guard(lock, std::try_to_lock);
    if (!guard.owns_lock()) {
        return false;
    }
    if (frame < 0) {
        return false;
    }
    const std::size_t wanted = static_cast<std::size_t>(frame);
    const std::size_t perEpoch = static_cast<std::size_t>(Epoch::MAX_SIZE);
    if (wanted >= epochs.size() * perEpoch) {
        return false;
    }
    worldState = epochs.at(wanted / perEpoch)->getAt(static_cast<int>(wanted % perEpoch));
    return true;
}
// Appends a copy of *worldState to the epoch currently being filled. Once that epoch is
// full it is published to `epochs` (under the lock) and a fresh epoch is started.
//
// The lock is taken with std::lock_guard so it is released even if push_back throws.
// The new epoch is allocated after the lock has been released, so readers are not
// blocked while it is constructed.
void Timeline::push(WorldState* worldState) {
    pushEpoch->push(worldState);
    if (pushEpoch->getSize() == Epoch::MAX_SIZE) {
        {
            std::lock_guard<std::mutex> guard(lock);
            epochs.push_back(pushEpoch);
        }
        pushEpoch = new Epoch();
    }
}
// Stores a copy of *worldState in the next free slot.
// Throws std::out_of_range when the epoch already holds MAX_SIZE states.
void Timeline::Epoch::push(WorldState* worldState) {
    if (size == MAX_SIZE) {
        throw std::out_of_range("Pushed too many items to Epoch without clearing");
    }
    WorldState& slot = states[size];
    slot = *worldState;
    ++size;
}
int Timeline::Epoch::getSize() {
return this->size;
}
// Returns a pointer to the state stored at `index`.
// Throws std::out_of_range when `index` does not refer to a filled slot. Negative indices
// are rejected as well; otherwise they would read memory in front of the array.
WorldState* Timeline::Epoch::getAt(int index) {
    if (index < 0 || index >= this->size) {
        throw std::out_of_range("Tried accessing nonexistent element of epoch");
    }
    return &(this->states[index]);
}
Renderer.cpp: loops to call Presenter::update() and some OpenGL rendering tasks.
Presenter.cpp
// abbreviated
// Per-frame update executed on the UI (render) thread; currently only the camera is updated.
void Presenter::update() {
camera->update();
// The timeline read below is deliberately disabled. It is the only place where the UI
// thread touches the timeline and would normally be a potential mutex conflict:
// timeline->tryGetStateAtFrame(Time::getFrames(), worldState);
}
// abbreviated
Any help/suggestions?
I ended up figuring this out!
So as it turns out, the new operator in C++ is threadsafe, which means that once it starts, it has to finish before any other threads can do anything. Why was that a problem in my case? Well, when an Epoch was being initialized, it had to initialize an array of 1024 WorldStates, each of which has 10,000 CellStates that need to be initialized, and each of those had an array of 16 items that needed to be initalized, so we ended up with over 100,000,000 objects needing to be initialized before the new operator could return. That was taking long enough that it caused the UI to hiccup while it was waiting.
The solution was to create a factory function that would build the pieces of the Epoch piecemeal, one constructor at a time and then combine them together and return a pointer to the new epoch.
timeline.h
#ifndef TIMELINE_H
#define TIMELINE_H
#include "WorldState.h"
#include <mutex>
#include <vector>
// Revised Timeline: epochs hold pointers to WorldState objects instead of the objects
// themselves, and are built through the createNew() factory.
class Timeline {
public:
Timeline();
bool tryGetStateAtFrame(int frame, WorldState*& worldState);
void push(WorldState* worldState);
private:
class Epoch {
public:
// Number of states held by one epoch.
static const int MAX_SIZE = 1024;
// Factory: allocates an Epoch and then allocates its WorldStates one at a time.
static Epoch* createNew();
void push(WorldState* worldstate);
int getSize();
WorldState* getAt(int index);
private:
// Private so that epochs can only be obtained through createNew().
Epoch();
int size = 0;
// Pointers to individually allocated states (filled in by createNew()).
WorldState* states[MAX_SIZE];
};
Epoch* pushEpoch;
std::mutex lock;
std::vector<Epoch*> epochs;
};
#endif // !TIMELINE_H
timeline.cpp
// Factory that builds an epoch piecemeal: first the Epoch shell, then one separately
// allocated WorldState per slot. The caller receives the raw pointer to the new epoch.
Timeline::Epoch* Timeline::Epoch::createNew() {
    Epoch* const fresh = new Epoch();
    WorldState** slot = fresh->states;
    WorldState** const past_last = fresh->states + MAX_SIZE;
    while (slot != past_last) {
        *slot = new WorldState();
        ++slot;
    }
    return fresh;
}

C++: Thread pool slower than single threading?

First of all I did look at the other topics on this website and found they don't relate to my problem as those mostly deal with people using I/O operations or thread creation overheads. My problem is that my threadpool or worker-task structure implementation is (in this case) a lot slower than single threading. I'm really confused by this and not sure if it's the ThreadPool, the task itself, how I test it, the nature of threads or something out of my control.
// Sorry for the long code
#include <vector>
#include <queue>
#include <thread>
#include <mutex>
#include <future>
#include "task.hpp"
// Minimal thread pool: worker threads pop Task pointers from a shared queue and call
// execute() on them. The pool does not own the tasks; callers must keep every task alive
// until it has been executed.
//
// Synchronisation: `m_in` and `m_running` are only accessed while `m_in_mutex` is held
// once the workers are running. Setting `m_running` outside the mutex would be a data
// race and could lose the shutdown wake-up, leaving a worker asleep and the destructor
// blocked in join().
//
// Shutdown: the destructor stops the workers as soon as each finishes its current task;
// tasks still waiting in the queue are not executed.
class ThreadPool
{
public:
    ThreadPool()
    {
        // hardware_concurrency() may return 0 ("unknown") or 1. Subtracting 1 from the
        // unsigned result would then wrap around or leave the pool without any worker,
        // so at least one worker is always created.
        const unsigned hw = std::thread::hardware_concurrency();
        const unsigned worker_count = hw > 1 ? hw - 1 : 1;

        m_workers.reserve(worker_count);
        for (unsigned i = 0; i < worker_count; i++)
            m_workers.emplace_back(this, i);

        m_running = true; // no worker thread exists yet, so no lock is required here

        for (auto&& worker : m_workers)
            worker.start();
    }

    ~ThreadPool()
    {
        {
            std::lock_guard<std::mutex> lock(m_in_mutex);
            m_running = false;
        }
        m_task_signal.notify_all();

        for (auto&& worker : m_workers)
            worker.terminate();
    }

    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;

    // Queues `task` for execution and wakes one worker. `task` must stay valid until it ran.
    void add_task(Task* task)
    {
        {
            std::unique_lock<std::mutex> lock(m_in_mutex);
            m_in.push(task);
        }
        m_task_signal.notify_one();
    }

private:
    class Worker
    {
    public:
        Worker(ThreadPool* parent, unsigned id) : m_parent(parent), m_id(id)
        {}

        // Movable so it can live in a std::vector. A worker must only be moved before
        // start(): the running thread keeps a pointer to the object it was started on.
        Worker(Worker&&) = default;

        ~Worker()
        {
            terminate();
        }

        void start()
        {
            m_thread = std::thread(&Worker::work, this);
        }

        // Waits for the worker thread to finish; safe to call more than once.
        void terminate()
        {
            if (m_thread.joinable())
                m_thread.join();
        }

    private:
        void work()
        {
            for (;;)
            {
                Task* task = nullptr;
                {
                    std::unique_lock<std::mutex> lock(m_parent->m_in_mutex);
                    m_parent->m_task_signal.wait(lock, [this]()
                    {
                        return !m_parent->m_in.empty() || !m_parent->m_running;
                    });

                    if (!m_parent->m_running) break;

                    task = m_parent->m_in.front();
                    m_parent->m_in.pop();
                } // the mutex is released here, before the task is executed

                task->execute();
            }
        }

    private:
        ThreadPool* m_parent = nullptr;
        unsigned m_id = 0;
        std::thread m_thread;
    };

private:
    std::vector<Worker> m_workers;

    std::mutex m_in_mutex;
    std::condition_variable m_task_signal;
    std::queue<Task*> m_in;   // guarded by m_in_mutex

    bool m_running = false;   // guarded by m_in_mutex once workers are running
};
// Task that decides whether `m_number` is prime and stores the answer in `m_is_prime`.
class TestTask : public Task
{
public:
    TestTask() {}
    TestTask(unsigned number) : m_number(number) {}

    inline void Set(unsigned number) { m_number = number; }

    // Runs the primality test on m_number and records the result in m_is_prime.
    void execute() override
    {
        m_is_prime = is_prime(m_number);
    }

public:
    unsigned m_number = 0;
    bool m_is_prime = false;

private:
    // Trial division over candidates of the form 6k +/- 1.
    // The loop bound is written as `i <= n / i` rather than `i * i <= n`: for n close to
    // the maximum of `unsigned` the product wraps around, the loop keeps running and the
    // result becomes wrong.
    static bool is_prime(unsigned n)
    {
        if (n <= 3)
            return n > 1;
        if (n % 2 == 0 || n % 3 == 0)
            return false;
        for (unsigned i = 5; i <= n / i; i += 6)
        {
            if (n % i == 0 || n % (i + 2) == 0)
                return false;
        }
        return true;
    }
};
// Benchmark driver: creates one million prime-test tasks and either queues them on the
// pool (MT) or executes them inline.
//
// `tasks` is declared before `pool` on purpose. Locals are destroyed in reverse order, so
// the pool's destructor joins its workers first, while the tasks they point to still
// exist. With the opposite order workers could execute tasks that were already destroyed.
//
// NOTE: in the MT branch the measured interval only covers enqueueing. ThreadPool offers
// no way to wait for completion, so the workers may still be busy when `e` is taken.
int main()
{
    unsigned num_tasks = 1000000;

    std::vector<TestTask> tasks(num_tasks);
    for (auto&& task : tasks)
        task.Set(randint(0, 1000000000));

    ThreadPool pool;

    auto s = std::chrono::high_resolution_clock::now();
#if MT
    for (auto&& task : tasks)
        pool.add_task(&task);
#else
    for (auto&& task : tasks)
        task.execute();
#endif
    auto e = std::chrono::high_resolution_clock::now();
    double seconds = std::chrono::duration_cast<std::chrono::nanoseconds>(e - s).count() / 1000000000.0;
}
Benchmarks with VS2013 Profiler:
10,000,000 tasks:
MT:
13 seconds of wall clock time
93.36% is spent in msvcp120.dll
3.45% is spent in Task::execute() // Not good here
ST:
0.5 seconds of wall clock time
97.31% is spent with Task::execute()
Usual disclaimer in such answers: the only way to tell for sure is to measure it with a profiler tool.
But I will try to explain your results without it. First of all, you have one mutex across all your threads. So only one thread at a time can execute some task. It kills all your gains you might have. In spite of your threads your code is perfectly serial. So at the very least make your task execution out of the mutex. You need to lock the mutex only to get a task out of the queue — you don't need to hold it when the task gets executed.
Next, your tasks are so simple that single thread will execute them in no time. You just can't measure any gains with such tasks. Create some heavy tasks which could produce some more interesting results(some tasks which are closer to the real world, not such contrived).
And the 3rd point: threads are not without their cost — context switching, mutex contention etc. To have real gains, as the previous 2 points say, you need to have tasks which take more time than the overheads threads introduce and the code should be truly parallel instead of waiting on some resource making it serial.
UPD: I looked at the wrong part of the code. The task is complex enough provided you create tasks with sufficiently large numbers.
UPD2: I've played with your code and found a good prime number to show how the MT code is better. Use the following prime number: 1019048297. It will give enough computation complexity to show the difference.
But why your code doesn't produce good results? It is hard to tell without seeing the implementation of randint() but I take it is pretty simple and in a half of the cases it returns even numbers and other cases produce not much of big prime numbers either. So the tasks are so simple that context switching and other things around your particular implementation and threads in general consume more time than the computation itself. Using the prime number I gave you give the tasks no choice but spend time computing — no easy answer since the number is big and actually prime. That's why the big number will give you the answer you seek — better time for the MT code.
You should not hold the mutex while the task is getting executed, otherwise other threads will not be able to get a task:
void work() {
while (m_parent->m_running) {
Task* currentTask = nullptr;
std::unique_lock<std::mutex> lock(m_parent->m_in_mutex);
m_parent->m_task_signal.wait(lock, [&]() {
return !m_parent->m_in.empty() || !m_parent->m_running;
});
if (!m_parent->m_running) continue;
currentTask = m_parent->m_in.front();
m_parent->m_in.pop();
lock.unlock(); //<- Release the lock so that other threads can get tasks
currentTask->execute();
currentTask = nullptr;
}
}
For MT, how much time is spent in each phase of the "overhead": std::unique_lock, m_task_signal.wait, front, pop, unlock?
Based on your results of only 3% useful work, this means the above consumes 97%. I'd get numbers for each part of the above (e.g. add timestamps between each call).
It seems to me, that the code you use to [merely] dequeue the next task pointer is quite heavy. I'd do a much simpler queue [possibly lockless] mechanism. Or, perhaps, use atomics to bump an index into the queue instead of the five step process above. For example:
// Sketch of a worker that claims tasks by atomically bumping a shared index instead of
// locking a queue.
// NOTE: this is just an example, not necessarily the real function
void
work()
{
    while (m_parent->m_running) {
        const int claimed = atomic_increment(&global_index);
        if (claimed >= max_index)
            return;

        m_parent->m_in[claimed]->execute();
    }
}
Also, maybe you should pop [say] ten at a time instead of just one.
You might also be memory bound and/or "task switch" bound. (e.g.) For threads that access an array, more than four threads usually saturates the memory bus. You could also have heavy contention for the lock, such that the threads get starved because one thread is monopolizing the lock [indirectly, even with the new unlock call]
Interthread locking usually involves a "serialization" operation where other cores must synchronize their out-of-order execution pipelines.
Here's a "lockless" implementation:
// Sketch of a worker without shared dequeue state: worker m_id handles the task indices
// m_id, m_id + NUMBER_OF_WORKERS, m_id + 2 * NUMBER_OF_WORKERS, ...
// (assumes m_id is 0,1,2,...)
void
work()
{
    for (int curindex = m_id;
         m_parent->m_running && curindex < max_index;
         curindex += NUMBER_OF_WORKERS) {
        m_parent->m_in[curindex]->execute();
    }
}

Multithreaded not efficient: Debugging False Sharing?

I have the following code, which starts multiple threads (a thread pool) at the very beginning (startWorkers()). Subsequently, at some point I have a container full of myWorkObject instances, which I want to process using multiple worker threads simultaneously. The myWorkObject instances are completely isolated from one another in terms of memory usage. For now, let's assume myWorkObject has a method doWorkIntenseStuffHere() which takes some CPU time to calculate.
When benchmarking the following code, I have noticed that this code does not scale well with the number of threads, and the overhead for initializing/synchronizing the worker threads exceeds the benefit of multithreading unless there are 3-4 threads active. I've looked into this issue and read about the false-sharing problem, and I assume my code suffers from this problem. However, I'd like to debug/profile my code to see whether there is some kind of starvation/false sharing going on. How can I do this? Please feel free to criticize anything about my code, as I'm still learning a lot about memory/CPU and multithreading in particular.
#include <boost/thread.hpp>
class MultiThreadedFitnessProcessingStrategy
{
public:
MultiThreadedFitnessProcessingStrategy(unsigned int numWorkerThreads):
_startBarrier(numWorkerThreads + 1),
_endBarrier(numWorkerThreads + 1),
_started(false),
_shutdown(false),
_numWorkerThreads(numWorkerThreads)
{
assert(_numWorkerThreads > 0);
}
virtual ~MultiThreadedFitnessProcessingStrategy()
{
stopWorkers();
}
void startWorkers()
{
_shutdown = false;
_started = true;
for(unsigned int i = 0; i < _numWorkerThreads;i++)
{
boost::thread* workerThread = new boost::thread(
boost::bind(&MultiThreadedFitnessProcessingStrategy::workerTask, this,i)
);
_threadQueue.push_back(new std::queue<myWorkObject::ptr>());
_workerThreads.push_back(workerThread);
}
}
void stopWorkers()
{
_startBarrier.wait();
_shutdown = true;
_endBarrier.wait();
for(unsigned int i = 0; i < _numWorkerThreads;i++)
{
_workerThreads[i]->join();
}
}
void workerTask(unsigned int id)
{
//Wait until all worker threads have started.
while(true)
{
//Wait for any input to become available.
_startBarrier.wait();
bool queueEmpty = false;
std::queue<SomeClass::ptr >* myThreadq(_threadQueue[id]);
while(!queueEmpty)
{
SomeClass::ptr myWorkObject;
//Make sure queue is not empty,
//Caution: this is necessary if start barrier was triggered without queue input (e.g., shutdown) , which can happen.
//Do not try to be smart and refactor this without knowing what you are doing!
queueEmpty = myThreadq->empty();
if(!queueEmpty)
{
chromosome = myThreadq->front();
assert(myWorkObject);
myThreadq->pop();
}
if(myWorkObject)
{
myWorkObject->doWorkIntenseStuffHere();
}
}
//Wait until all worker threads have synchronized.
_endBarrier.wait();
if(_shutdown)
{
return;
}
}
}
void doWork(const myWorkObject::chromosome_container &refcontainer)
{
if(!_started)
{
startWorkers();
}
unsigned int j = 0;
for(myWorkObject::chromosome_container::const_iterator it = refcontainer.begin();
it != refcontainer.end();++it)
{
if(!(*it)->hasFitness())
{
assert(*it);
_threadQueue[j%_numWorkerThreads]->push(*it);
j++;
}
}
//Start Signal!
_startBarrier.wait();
//Wait for workers to be complete
_endBarrier.wait();
}
unsigned int getNumWorkerThreads() const
{
return _numWorkerThreads;
}
bool isStarted() const
{
return _started;
}
private:
boost::barrier _startBarrier;
boost::barrier _endBarrier;
bool _started;
bool _shutdown;
unsigned int _numWorkerThreads;
std::vector<boost::thread*> _workerThreads;
std::vector< std::queue<myWorkObject::ptr >* > _threadQueue;
};
Sampling-based profiling can give you a pretty good idea whether you're experiencing false sharing. Here's a previous thread that describes a few ways to approach the issue. I don't think that thread mentioned Linux's perf utility. It's a quick, easy and free way to count cache misses that might tell you what you need to know (am I experiencing a significant number of cache misses that correlates with how many times I'm accessing a particular variable?).
If you do find that your threading scheme might be causing a lot of conflict misses, you could try declaring your myWorkObject instances or the data contained within them that you're actually concerned about with __attribute__((aligned(64))) (alignment to 64 byte cache lines).
If you're on Linux, there is a tool called valgrind, with one of the modules doing cache effects simulation (cachegrind). Please take a look at
http://valgrind.org/docs/manual/cg-manual.html

Calling functions at timed intervals using threads

I'm building a simulator to test student code for a very simple robot. I need to run two functions(to update robot sensors and robot position) on separate threads at regular time intervals. My current implementation is highly processor inefficient because it has a thread dedicated to simply incrementing numbers to keep track of the position in the code. My recent theory is that I may be able to use sleep to give the time delay between updating value of the sensor and robot position. My first question is: is this efficient? Second: Is there any way to do a simple thing but measure clock cycles instead of seconds?
Putting a thread to sleep by waiting on a mutex-like object is generally efficient. A common pattern involves waiting on a mutex with a timeout. When the timeout is reached, the interval is up. When the mutex is released, it is the signal for the thread to terminate.
Pseudocode:
// Pseudocode: periodic worker. Each round waits up to 1000 on the mutex; a timeout means
// the interval elapsed and the periodic action runs, a successful wait means the owner
// wants the thread to terminate.
void threadMethod() {
    while(true) {
        if(this->mutex.wait(1000)) {
            return; // Signalled, owner wants us to terminate
        }
        // Timeout, meaning our wait time is up
        doPeriodicAction();
    }
}
// Pseudocode: takes the mutex first, so the worker's timed waits keep timing out, and
// only then launches the worker thread.
void start() {
this->mutex.enter();
this->thread.start(threadMethod);
}
// Pseudocode: releasing the mutex is the termination signal for the worker; afterwards
// the caller waits for the worker thread to finish.
void stop() {
this->mutex.leave();
this->thread.join();
}
On Windows systems, timeouts are generally specified in milliseconds and are accurate to roughly within 16 milliseconds (timeBeginPeriod() may be able to improve this). I do not know of a CPU cycle-triggered synchronization primitive. There are lightweight mutexes called "critical sections" that spin the CPU for a few thousand cycles before delegating to the OS thread scheduler. Within this time they are fairly accurate.
On Linux systems the accuracy may be a bit higher (high frequency timer or tickless kernel) and in addition to mutexes, there are "futexes" (fast mutex) which are similar to Windows' critical sections.
I'm not sure I grasped what you're trying to achieve, but if you want to test student code, you might want to use a virtual clock and control the passing of time yourself. For example by calling a processInputs() and a decideMovements() method that the students have to provide. After each call, 1 time slot is up.
This C++11 code uses std::chrono::high_resolution_clock to measure subsecond timing, and std::thread to run three threads. The std::this_thread::sleep_for() function is used to sleep for a specified time.
#include <iostream>
#include <thread>
#include <vector>
#include <chrono>
// Prints the numbers 0..9, sleeping one second after each and reporting how long the
// sleep actually took according to high_resolution_clock.
void seconds()
{
    typedef std::chrono::high_resolution_clock hr_clock;

    unsigned tick = 0;
    while (tick < 10) {
        std::cout << tick << "\n";
        const hr_clock::time_point before = hr_clock::now();
        std::this_thread::sleep_for(std::chrono::seconds(1));
        const hr_clock::time_point after = hr_clock::now();
        const std::chrono::duration<double> elapsed =
            std::chrono::duration_cast<std::chrono::duration<double> >(after - before);
        std::cout << "\t( " << elapsed.count() << " seconds )\n";
        ++tick;
    }
}
// Runs three threads side by side: one wakes after 3 seconds, one after 7 seconds and one
// executes seconds(). Waits for all of them before returning.
int main()
{
    std::vector<std::thread> workers;

    workers.emplace_back([]() {
        std::this_thread::sleep_for(std::chrono::seconds(3));
        std::cout << "awoke after 3\n";
    });
    workers.emplace_back([]() {
        std::this_thread::sleep_for(std::chrono::seconds(7));
        std::cout << "awoke after 7\n";
    });
    workers.emplace_back(&seconds);

    for (std::size_t k = 0; k < workers.size(); ++k)
        workers[k].join();
}
It's hard to know whether this meets your needs because there are a lot of details missing from the question. Under Linux, compile with:
g++ -Wall -Wextra -pedantic -std=c++11 timers.cpp -o timers -lpthread
Output on my machine:
0
( 1.00014 seconds)
1
( 1.00014 seconds)
2
awoke after 3
( 1.00009 seconds)
3
( 1.00015 seconds)
4
( 1.00011 seconds)
5
( 1.00013 seconds)
6
awoke after 7
( 1.0001 seconds)
7
( 1.00015 seconds)
8
( 1.00014 seconds)
9
( 1.00013 seconds)
Other C++11 standard features that may be of interest include timed_mutex and promise/future.
Yes your theory is correct. You can use sleep to put some delay between execution of a function by thread. Efficiency depends on how wide you can choose that delay to get desired result. You have to explain details of your implementation. For e.g we don't know whether two threads are dependent ( in that case you have to take care of synchronization which would blow up some cycles ).
Here's one way to do it. I'm using C++11 threads, atomics and the high-precision clock. The scheduler calls back a function that takes dt seconds, which is the time elapsed since the last call. The loop can be stopped by calling the stop() method, or it stops by itself if the callback function returns false.
Scheduler code
#include <thread>
#include <chrono>
#include <functional>
#include <atomic>
#include <system_error>
// Calls a callback periodically on a dedicated thread.
//
// The callback receives the time in seconds since the end of the previous call and
// returns false to end the loop. The first period passes without a call. After each call
// the thread sleeps for the rest of the period (if anything is left).
//
// Lifetime: the destructor stops and joins the thread. Destroying a std::thread that is
// still joinable would call std::terminate().
// Thread-safety: the statistics are atomics, so getSleepTimeAvg() and getPeriodCount()
// may be called from another thread while the loop runs. initialize(), start() and stop()
// are meant to be called from the owning thread.
class ScheduledExecutor {
public:
    ScheduledExecutor()
    {}

    // callback: function to call every period; period: length of one period in seconds.
    ScheduledExecutor(const std::function<bool(double)>& callback, double period)
    {
        initialize(callback, period);
    }

    ~ScheduledExecutor()
    {
        stop();
    }

    // (Re)configures the executor. A running loop is stopped first so the callback is
    // never replaced while the executor thread may be invoking it.
    void initialize(const std::function<bool(double)>& callback, double period)
    {
        stop();
        callback_ = callback;
        period_ = period;
        keep_running_ = false;
    }

    // Starts the loop with fresh statistics. A previous run is stopped and joined first;
    // assigning to a joinable std::thread would call std::terminate().
    void start()
    {
        stop();
        keep_running_ = true;
        sleep_time_sum_ = 0.0;
        period_count_ = 0;
        th_ = std::thread(&ScheduledExecutor::executorLoop, this);
    }

    // Asks the loop to end and waits for the thread. Safe to call when nothing is running.
    void stop()
    {
        keep_running_ = false;
        try {
            th_.join();
        }
        catch(const std::system_error& /* e */)
        { } // not joinable (never started / already joined) or called from the callback
    }

    // Average computed sleep time per period in seconds; 0 when no period has completed.
    // Informational: sum and count are read separately and may be one period apart.
    double getSleepTimeAvg()
    {
        const unsigned long periods = period_count_;
        if (periods == 0)
            return 0.0;
        return sleep_time_sum_.load() / periods;
    }

    // Number of periods completed since the last start().
    unsigned long getPeriodCount()
    {
        return period_count_;
    }

private:
    typedef std::chrono::high_resolution_clock clock;
    template <typename T>
    using duration = std::chrono::duration<T>;

    void executorLoop()
    {
        clock::time_point call_end = clock::now();
        while (keep_running_) {
            clock::time_point call_start = clock::now();
            duration<double> since_last_call = call_start - call_end;

            // The very first period is skipped; a callback returning false ends the loop.
            if (period_count_ > 0 && !callback_(since_last_call.count()))
                break;

            call_end = clock::now();
            duration<double> call_duration = call_end - call_start;
            double sleep_for = period_ - call_duration.count();

            // Only this thread writes the statistics, so load + store is sufficient.
            sleep_time_sum_.store(sleep_time_sum_.load() + sleep_for);
            ++period_count_;

            if (sleep_for > MinSleepTime)
                std::this_thread::sleep_for(std::chrono::duration<double>(sleep_for));
        }
    }

private:
    double period_ = 0.0;
    std::thread th_;
    std::function<bool(double)> callback_;
    std::atomic_bool keep_running_{false};
    static constexpr double MinSleepTime = 1E-9;
    std::atomic<double> sleep_time_sum_{0.0};
    std::atomic<unsigned long> period_count_{0};
};
Example usage
// Scheduler callback: advances the world by `dt` (the elapsed time handed over by the
// scheduler) and always returns true, so the scheduler keeps running.
bool worldUpdator(World& w, double dt)
{
w.update(dt);
return true;
}
// Example: drives a World from a ScheduledExecutor every 2 ms and prints the average
// sleep time once per 10000 completed periods.
//
// `main` must return int in standard C++. The last reported count is remembered so each
// multiple of 10000 is printed once; otherwise the same value is printed over and over
// while the counter stands still, including at count 0, before any period has completed.
// NOTE: the polling loop still spins; it only illustrates reading the statistics.
int main() {
    //create world for your simulator
    World w(...);

    //start scheduler loop for every 2ms calls
    ScheduledExecutor exec;
    exec.initialize(
        std::bind(worldUpdator, std::ref(w), std::placeholders::_1),
        2E-3);
    exec.start();

    //main thread just checks on the results every now and then
    unsigned long last_reported = 0;
    while (true) {
        const unsigned long periods = exec.getPeriodCount();
        if (periods != last_reported && periods % 10000 == 0) {
            std::cout << exec.getSleepTimeAvg() << std::endl;
            last_reported = periods;
        }
    }
}
There are also other, related questions on SO.

Determine user & system time used by a thread

We have a qthreads-based workflow engine where worker threads pick up bundles of input as they are placed on a queue, then place their output on another queue for other worker threads to run the next stage; and so on until all the input has been consumed and all the output has been generated.
Typically, several threads will be running the same task and others will be running other tasks at the same time. We want to benchmark performance of these threaded tasks in order to target optimization efforts.
It's easy to get the real (elapsed) time that a given thread, running a given task, has taken. We just look at the difference between the return values of the POSIX times() function at the start and end of the thread's run() procedure. However, I cannot figure out how to get the corresponding user and system time. Getting these from the struct tms that you pass to times() doesn't work, because this structure gives total user and system times of all threads running while the thread in question is active.
Assuming this is on Linux how about getrusage() with RUSAGE_THREAD? Solaris also offers RUSAGE_LWP which is similar and I guess there's probably equivalents for other POSIX-like systems.
Crude example:
#define _GNU_SOURCE
#include <sys/time.h>
#include <sys/resource.h>
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
#include <unistd.h>
/* Per-thread bookkeeping for the getrusage() demonstration. */
struct tinfo {
pthread_t thread;      /* handle filled in by pthread_create() */
int id;                /* index of the thread; selects sleeping vs. busy-waiting */
struct rusage start;   /* resource usage sampled when the thread starts */
struct rusage end;     /* resource usage sampled just before the thread returns */
};
/*
 * Thread body: samples the calling thread's resource usage, spends about ten seconds
 * either busy-waiting (id 0) or sleeping (any other id), then samples the usage again.
 * `arg` points to the thread's struct tinfo. Always returns NULL.
 */
static void *
thread_start(void *arg)
{
  struct tinfo *const self = arg;

  getrusage(RUSAGE_THREAD, &self->start);
  if (self->id == 0) {
    const time_t began = time(NULL);
    while (time(NULL) - began < 10)
      ; // Waste CPU time!
  }
  else {
    sleep(10);
  }
  getrusage(RUSAGE_THREAD, &self->end);
  return NULL;
}
int main() {
static const int nrthr = 2;
struct tinfo status[nrthr];
for (int i = 0; i < nrthr; ++i) {
status[i].id = i;
const int s = pthread_create(&status[i].thread,
NULL, &thread_start,
&status[i]);
assert(!s);
}
for (int i = 0; i < nrthr; ++i) {
const int s = pthread_join(status[i].thread, NULL);
assert(!s);
// Sub-second timing is available too
printf("Thread %d done: %ld (s) user, %ld (s) system\n", status[i].id,
status[i].end.ru_utime.tv_sec - status[i].start.ru_utime.tv_sec,
status[i].end.ru_stime.tv_sec - status[i].start.ru_stime.tv_sec);
}
}
I think something similar is possible on windows using GetProcessTimes()