LuaPlus: How to call Lua function from multithreaded C++? - c++

I have a kind of callback function in my Lua script which I would like to call from different threads on the C++ side (0-100 times per second). So far it basically works, but as soon as I call it multiple times in a very short period of time, the program crashes with errors like:
Assertion failed: 0, file ...LuaFunction.h, line 146 (or other, completely random, errors)
I think this happens when it gets called from the C++ side before it has finished a previous call. The most obvious fix I could think of (mutex-locking all threads during the Lua function call) didn't help at all. :/
If I only call the Lua function about once every 2 seconds, I don't get any errors at all (well, until the clean-up part; if it gets to that point it crashes without a specific error).
Here is my code (I cropped and simplified it as much as possible, and added a lot of comments):
#include "stdafx.hpp"
#include <pthread.h> //for multithreading
#include <windows.h>
#include <iostream>
#include <map>
using namespace std;
unsigned int maxThreads = 100;
map<unsigned int, pthread_t> threads;
map<unsigned int, bool> threadsState;
pthread_mutex_t mutex; //to lock the pthreads (to keep printing from overlapping etc)
LuaPlus::LuaState* pState = LuaPlus::LuaState::Create( true ); //initialize LuaPlus
LuaPlus::LuaObject globals = pState->GetGlobals();
struct argumentStruct { //to pass multiple arguments to the function called when starting a pthread
unsigned int threadId;
int a;
int b;
};
map<unsigned int, struct argumentStruct> argumentMap; //we store the arguments of active threads in here
void *ThreadFunction(void *arguments) { //will be called for every pthread we're going to create
struct argumentStruct* args = (struct argumentStruct*)arguments; //get the argument struct
int threadId = args->threadId; //get variables for each struct field
int a = args->a;
int b = args->b;
Sleep(3000); //since this is a very simplified version of my actual project
int c = a+b;
pthread_mutex_lock(&mutex); //lock pthreads for the next lines
LuaPlus::LuaFunction<int> CPP_OnMyEvent = pState->GetGlobal("LUA_OnMyEvent"); //get the Lua callback function to call on the C++ side
CPP_OnMyEvent(a,b,c); //call to our lua-callback function
pthread_mutex_unlock(&mutex); //unlock pthreads
threadsState[threadId] = false; //mark the thread as finished/ready to get overwritten by a new one
return NULL;
}
bool AddThread(int a, int b) {
for (;;) {
if (threads.size() < maxThreads) { //if our array of threads isn't full yet, create a new thread
int id = threads.size();
argumentMap[id].threadId = threads.size();
argumentMap[id].a = a;
argumentMap[id].b = b;
threadsState[id] = true; //mark the thread as existing/running
pthread_create(&threads[id], NULL, &ThreadFunction, (void *)&argumentMap[id]);
return true;
} else {
unsigned int id;
for (auto thread=threads.begin(); thread!=threads.end(); ++thread) {
id = thread->first;
if(!threadsState[id]) { //if thread with id "id" has finished, create a new thread on its pthread_t
argumentMap[id].threadId = id;
argumentMap[id].a = a;
argumentMap[id].b = b;
threadsState[id] = true; //mark the thread as existing/running
pthread_join(threads[id], NULL);
pthread_create(&threads[id], NULL, &ThreadFunction, (void *)&argumentMap[id]);
return true;
}
}
}
}
return false;
}
int main() {
pthread_mutex_init(&mutex, NULL); //initialize the mutex
//LuaPlus::LuaState* pState = LuaPlus::LuaState::Create( true ); //we already initialized this globally
//LuaPlus::LuaObject globals = pState->GetGlobals();
//pState->DoString("function LUA_OnMyEvent(arg1,arg2) print(arg1..arg2) end"); //it's already in main.lua
globals.RegisterDirect("AddThread", AddThread);
char pPath[ MAX_PATH ];
GetCurrentDirectory(MAX_PATH,pPath);
strcat_s(pPath,MAX_PATH,"\\main.lua");
if( pState->DoFile(pPath) ) { //run our main.lua script which contains the callback function that will run a print
if( pState->GetTop() == 1 )
std::cout << "An error occured: " << pState->CheckString(1) << std::endl;
}
for (auto thread=threads.begin(); thread!=threads.end(); ++thread) { //wait for threads to finish
unsigned int id = thread->first;
if(threadsState[id])
pthread_join(threads[id], NULL);
}
//clean up
LuaPlus::LuaState::Destroy( pState );
pState = nullptr;
pthread_mutex_destroy(&mutex);
getchar(); //keep console from closing
return 0;
}
main.lua
function LUA_OnMyEvent(a,b,c)
print(a.."+"..b.."="..c)
end
for i=1, 999, 1 do
AddThread(i,i*2)
end

I don't know Lua well enough to give you a solution on the Lua side, but this view of the problem may help you reach one.
When you call AddThread() from Lua, something like this will happen:
1. LuaState allocations
2. AddThread() execution
3. LuaState unwinding
Meanwhile, in ThreadFunction():
A. Mutex lock
B. LuaState allocations
C. LUA_OnMyEvent() execution
D. LuaState unwinding
E. Mutex Unlock
There is no mutex control at AddThread, so a race condition can happen between 1/3 and B/D.
However, adding the mutex to AddThread would not solve the problem, because it would still run between 1 and 3.
If AddThread() is called only at the program initialization, then you could block all threads till initialization is done. If it is called frequently during program execution, then I would make those calls from a separate LuaState.
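As a rough sketch of that first idea applied to the question's code (assuming, as the question's own check implies, that a non-zero DoFile return value means an error): hold the same mutex the workers use for the entire main.lua run, so no thread can enter the Lua state while the script is still executing:
pthread_mutex_lock(&mutex); //no worker may touch pState while the script runs
int error = pState->DoFile(pPath); //runs main.lua, including all its AddThread calls
pthread_mutex_unlock(&mutex); //from here on, workers serialize their Lua calls themselves
if (error && pState->GetTop() == 1)
std::cout << "An error occurred: " << pState->CheckString(1) << std::endl;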
[EDIT] 2nd idea: Use a producer/consumer approach. Then C++ threads won't need to run Lua code.
C++ suggestion:
//-- start Task.h --
struct Task{
static list<Task*> runningTasks;
static list<Task*> doneTasks;
static pthread_mutex_t mutex;
list<Task*>::iterator iterator;
virtual ~Task(){}
bool start(){
pthread_mutex_lock(&mutex);
bool hasSpace = runningTasks.size() < 100;
if(hasSpace){
runningTasks.push_front(this);
iterator = runningTasks.begin();
pthread_t unusedID;
pthread_create(&unusedID, NULL, Task::threadBody, this);
}
pthread_mutex_unlock(&mutex);
return hasSpace;
}
virtual void run() = 0;
virtual void processResults() = 0;
protected:
void finish(){
pthread_mutex_lock(&mutex);
runningTasks.erase(iterator);
doneTasks.push_front(this);
pthread_mutex_unlock(&mutex);
}
static void* threadBody(void* instance){
Task* task = static_cast<Task*>(instance);
task->run();
task->finish();
return NULL;
}
};
//-- end Task.h --
//-- start Task.cpp --
//Instantiate Task's static attributes
pthread_mutex_t Task::mutex;
list<Task*> Task::runningTasks;
list<Task*> Task::doneTasks;
//-- end Task.cpp --
struct SumTask: public Task{
int a, b, c;
void run(){
Sleep(3000);
c = a+b;
}
void processResults(){
LuaPlus::LuaFunction<int> CPP_OnMyEvent = pState->GetGlobal("LUA_OnMyEvent");
CPP_OnMyEvent(a,b,c);
}
};
//functions called by Lua
bool runSumTask(int a, int b){
SumTask* task = new SumTask();
task->a = a; task->b = b;
bool ok = task->start();
if(!ok)
delete task;
return ok;
}
int gatherResults(){
pthread_mutex_lock(&Task::mutex);
int totalResults = Task::doneTasks.size();
while(Task::doneTasks.size() > 0){
Task* t = Task::doneTasks.front();
Task::doneTasks.pop_front();
t->processResults();
delete t;
}
pthread_mutex_unlock(&Task::mutex);
return totalResults;
}
int main() {
//Must initialize/destroy Task::mutex
pthread_mutex_init(&Task::mutex, NULL);
//...
pthread_mutex_destroy(&Task::mutex);
}
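The Lua-facing wiring is not shown above; a minimal sketch, assuming RegisterDirect can bind these signatures the same way the question already binds AddThread (which has the same bool(int, int) shape):
globals.RegisterDirect("runSumTask", runSumTask);
globals.RegisterDirect("gatherResults", gatherResults);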
Lua code:
function LUA_OnMyEvent(a,b,c)
print(a.."+"..b.."="..c)
end
local totalRunning = 0;
for i=1, 999, 1 do
if runSumTask(i, i*2) then
totalRunning = totalRunning + 1
end
totalRunning = totalRunning - gatherResults()
end
while totalRunning > 0 do
totalRunning = totalRunning - gatherResults()
mySleepFunction(...) -- placeholder for whatever sleep function you have available
end

Related

C++ pthread deadlock solution

I am trying to use pthreads to print out numbers in order, like:
0
1
2
3
4
Without using a global variable, just local variables. But I ran into a deadlock and I don't know how to fix it yet.
This is my code:
#include <pthread.h>
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <fcntl.h>
static pthread_mutex_t bsem; // Mutex semaphore
static pthread_cond_t waitTurn = PTHREAD_COND_INITIALIZER; // Condition variable to control the turn
//static int turn; // Index to control access to the turn array
static int nthreads; // Number of threads from input
struct SFE{
int turn;
int thread;
};
void *thread_function(void *void_ptr_argv)
{
SFE *threadNum = (SFE *) void_ptr_argv;
pthread_mutex_lock(&bsem);
// if its not our turn then wait
while(threadNum->turn != threadNum->thread){
pthread_cond_wait(&waitTurn, &bsem);
}
pthread_mutex_unlock(&bsem);
std::cout << "I am Thread " << threadNum->turn << std::endl;
pthread_mutex_lock(&bsem);
threadNum->turn++;
pthread_cond_broadcast(&waitTurn);
pthread_mutex_unlock(&bsem);
return nullptr;
}
int main()
{
std::cin >> nthreads;
pthread_mutex_init(&bsem, NULL); // Initialize bsem to 1
pthread_t *tid= new pthread_t[nthreads];
SFE threadNumber;
threadNumber.turn = 0;
for(int i=0;i<nthreads;i++)
{
// initialize the thread number here (remember to follow the rules from the specifications of the assignment)
threadNumber.thread = i;
pthread_create(&tid[i], nullptr, thread_function, (void*)&threadNumber);
}
for(int i = 0; i < nthreads; i++)
{
pthread_join(tid[i], nullptr);
}
return 0;
}
I am hoping for a simple way to solve this problem.
Think about threadNum->thread. threadNum is a single shared object, so each thread reads some unspecified value of threadNum->thread between 0 and nthreads - 1, depending on when the main loop last overwrote it. Only the last thread is guaranteed to get its correct number, nthreads - 1.
You should change the struct so that the shared turn is accessed through a pointer:
struct SFE{
int *turn;
int thread;
};
Allocate an array SFE threadNumber[nthreads] and pass &threadNumber[i] to the i-th thread (thread_function must then dereference the pointer, e.g. *threadNum->turn):
int turn = 0;
for(int i=0;i<nthreads;i++)
{
// initialize the thread number here (remember to follow the rules from the specifications of the assignment)
threadNumber[i].turn = &turn;
threadNumber[i].thread = i;
pthread_create(&tid[i], nullptr, thread_function, &threadNumber[i]);
}
Your code is full of C idioms; without iostream and new it would be plain C. If you use the C++ standard library (std::thread, std::mutex, std::condition_variable), everything becomes much easier.
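For illustration, here is a minimal C++11 sketch of the same exercise using std::thread and std::condition_variable; the turn counter stays local to main, and all names are illustrative:
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
int main()
{
int nthreads;
std::cin >> nthreads;
std::mutex m;
std::condition_variable cv;
int turn = 0; // local turn counter, guarded by m
std::vector<std::thread> threads;
for (int i = 0; i < nthreads; ++i)
{
threads.emplace_back([&, i] {
std::unique_lock<std::mutex> lock(m);
cv.wait(lock, [&] { return turn == i; }); // sleep until it is our turn
std::cout << "I am Thread " << i << std::endl;
++turn;
cv.notify_all(); // wake the next waiter
});
}
for (auto& t : threads) { t.join(); }
return 0;
}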

Vector processing issues in multi threading

I'm implementing multithreaded data processing. I want to process data in class DataProcess and merge the data in class DataStorage.
My problem is that an exception is sometimes thrown when the data is added to the vector.
My guess is that each thread ends up working on a class instance at a different address.
Is it a problem to create a new data-handling class and have each one process its own data?
Here is my code.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread>
#include <vector>
#include <mutex>
using namespace::std;
static std::mutex m;
class DataStorage
{
private :
std::vector<long long> vecData;
public:
DataStorage()
{
}
~DataStorage()
{
}
void SetDataVectorSize(int size)
{
vecData.clear();
vecData.resize(size);
}
void DataInsertLoop(void* Data, int start, int end)
{
m.lock();
std::vector<long long> const * _v1 = static_cast<std::vector<long long> const *>(Data);
long long num = 0;
for (int idx = start; idx < _v1->size(); ++idx)
{
vecData[idx] = _v1->at(idx);
}
m.unlock();
}
};
class DataProcess
{
private:
int m_index;
long long m_startIndex;
long long m_endIndex;
int m_coreNum;
long long num;
DataStorage* m_mainStorage;
std::vector<long long> m_vecData;
public :
DataProcess(int pindex, long long startindex, long long endindex)
: m_index(pindex), m_startIndex(startindex), m_endIndex(endindex),
m_coreNum(0),m_mainStorage(NULL), num(0)
{
m_vecData.clear();
}
~DataProcess()
{
}
void SetMainAdrr(DataStorage* const mainstorage)
{
m_mainStorage = mainstorage;
}
void SetCoreInCPU(int num)
{
m_coreNum = num;
}
void DataRun()
{
for (long long idx = m_startIndex; idx < m_endIndex; ++idx)
{
num += rand();
m_vecData.push_back(num); //<- exception error position
}
m_mainStorage->DataInsertLoop(&m_vecData, m_startIndex, m_endIndex);
}
};
int main()
{
//auto beginTime = std::chrono::high_resolution_clock::now();
clock_t beginTime, endTime;
DataStorage* main = new DataStorage();
beginTime = clock();
long long totalcount = 200000000;
long long halfdata = totalcount / 2;
std::thread t1,t2;
for (int t = 0; t < 2; ++t)
{
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
clsDP->SetCoreInCPU(2);
clsDP->SetMainAdrr(main);
if (t == 0)
{
t1 = std::thread([&]() {clsDP->DataRun(); });
}
else
{
t2 = std::thread([&]() {clsDP->DataRun(); });
}
}
t1.join(); t2.join();
endTime = clock();
double resultTime = (double)(endTime - beginTime);
std::cout << "Multi Thread " << resultTime / 1000 << " sec" << std::endl;
printf("--------------------\n");
int value = getchar();
}
Interestingly, if none of your threads accesses portions of vecData accessed by another thread, DataStorage::DataInsertLoop should not need to be synchronized at all. That should make processing much faster. That is, after all bugs are fixed... This also means you should not need a mutex at all.
There are other issues with your code... The most easily spotted is a memory leak.
In main:
DataStorage* main = new DataStorage(); // you call new, but never call delete...
// that's a memory leak. Avoid caling
// new() directly.
//
// Also: 'main' is kind of a reserved
// name, don't use it except for the
// program entry point.
// How about this, instead ?
DataStorage dataSrc; // DataSrc has a very small footprint (a few pointers).
// ...
std::thread t1,t2; // why not use an array ?
// as in:
std::vector<std::thread> thrds;
// ...
// You forgot to set the size of your data set before starting, by calling:
dataSrc.SetDataVectorSize(200000000);
for (int t = 0; t < 2; ++t)
{
// ...
// Calling new again, and not delete... Use a smart pointer type
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
// Also, fix the start and end indices (NOTE: the code below works for 2 threads,
// but probably not for 3, since halfdata is hardcoded as totalcount / 2)
auto clsDP = std::make_unique<DataProcess>(t, t * halfdata, (t + 1) * halfdata);
// You need to keep a reference to these pointers
// Either by storing them in an array, or by passing them to
// the threads. As in, for example:
thrds.emplace_back([dp = std::move(clsDP)]() { dp->DataRun(); });
}
//...
std::for_each(thrds.begin(), thrds.end(), [](auto& t) { t.join(); });
//...
More...
You create a mutex on your very first line of executable code. That's good... somewhat...
static std::mutex m; // a one letter name is a terrible choice for a variable with
// file scope.
Apart from the name, it's not in the right scope... If you want to use a mutex to protect DataStorage::vecData, this mutex should be declared in the same scope as DataStorage::vecData.
One last thing. Have you considered using iterators (aka pointers) as arguments to DataProcess::DataProcess() ? This would simplify the code quite a bit, and it would very likely run faster.
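A minimal sketch of that idea, assuming the destination vector is fully sized up front and the slices never overlap (so no mutex is needed); fill_slice is a hypothetical stand-in for DataProcess::DataRun:
#include <cstdlib>
#include <thread>
#include <vector>
using Iter = std::vector<long long>::iterator;
void fill_slice(Iter begin, Iter end) //hypothetical replacement for DataProcess::DataRun
{
long long num = 0;
for (Iter it = begin; it != end; ++it)
{
num += std::rand(); //note: rand() itself is not guaranteed thread-safe
*it = num;
}
}
int main()
{
const std::size_t total = 200000000;
std::vector<long long> data(total); //sized once, so no reallocation while threads run
std::thread t1(fill_slice, data.begin(), data.begin() + total / 2);
std::thread t2(fill_slice, data.begin() + total / 2, data.end());
t1.join();
t2.join();
}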

Multiple Shared Work Pools With Boost::Fiber

I have been looking into boost::fibers as a method for dealing with some of my problems with data processing and IO. The shared_work scheduler in particular looks promising, because it would let me spin up one data-processing task for every data source and then let them distribute themselves as needed across a few threads.
However, this brings me to my question: it looks like I can only have one shared_work 'pool' per process. What do I do if I want a set of 12 fibers processing data, shared among 4 threads, while at the same time a different set of 12 fibers is writing the processed data to file, shared among another 4 threads?
Something like:
#include<string>
#include<iostream>
#include<vector>
#include<mutex>
#include<thread>
#include<random>
#include<map>
#include<sstream>
#include<boost/bind.hpp>
#include<boost/fiber/all.hpp>
typedef boost::fibers::fiber FiberType;
typedef std::unique_lock<boost::fibers::mutex> LockType;
static const int fiberIterationCount = 5000;
static const int fiberCount = 12;
static const int threadCount = 4;
static const int distLowerLimit = 50;
static const int distUpperLimit = 500;
static boost::fibers::mutex firstMutex{};
static boost::fibers::mutex secondMutex{};
static boost::fibers::condition_variable firstCondition{};
static boost::fibers::condition_variable secondCondition{};
static boost::fibers::barrier synchronize{2*threadCount};
static int typeOneFibersFinished{0};
static int typeTwoFibersFinished{0};
static std::mt19937 typeOneGenerators[fiberCount];
static std::mt19937 typeTwoGenerators[fiberCount];
static std::mutex typeMapMutex;//lock for writing unnecessary for reads
static std::map<std::thread::id, std::string> threadTypeMap;
//simple function to give a heavy cpu load of variable duration
unsigned long long findPrimeNumber(int n)
{
int count=0;
unsigned long long a = 2;
while(count<n)
{
bool isPrime = true;
for(unsigned long long b = 2; (b * b) <= a; ++b)
{
if((a % b) == 0)
{
isPrime = false;
break;
}
}
if(isPrime)
{
count++;
}
++a;
}
return (a - 1);
}
void fiberTypeOne(int fiberNumber)
{
std::cout<<"Starting Type One Fiber #"<<fiberNumber;
std::uniform_int_distribution<int> dist(distLowerLimit, distUpperLimit);
for(int i=0; i<fiberIterationCount; ++i)
{
//generate a randomish load on this fiber so that it does not take a regular time slice
int tempPrime = dist(typeOneGenerators[fiberNumber]);
unsigned long long temp = findPrimeNumber(tempPrime);
std::cout << "T1 fiber #"<<fiberNumber<<" running on "<<threadTypeMap[std::this_thread::get_id()]
<<"\n Generated: "<<tempPrime<<", "<<temp;
boost::this_fiber::yield();
}
{
LockType lock(firstMutex);
++typeOneFibersFinished;
}
firstCondition.notify_all();
}
void threadTypeOne(int threadNumber)
{
//make a shared work scheduler that associates its fibers with "fiber pool 0"
boost::fibers::use_scheduling_algorithm< multi_pool_scheduler<0> >();
std::cout<<"Starting Type One Thread #"<<threadNumber<<" With Thread ID: "<<std::this_thread::get_id();
{
std::unique_lock<std::mutex> lock{typeMapMutex};
std::ostringstream gen;
gen<<"Thread Type 1 - Number: "<<threadNumber<<" with id: "<<std::this_thread::get_id();
threadTypeMap[std::this_thread::get_id()] = gen.str();
}
if(threadNumber == 0)
{ //if we are thread zero, create the fibers then join them to take ourselves off the "fiber list"
std::cout<<"Spawning Type One Fibers";
for(int fiberNumber=0; fiberNumber<fiberCount; ++fiberNumber)
{//create the fibers and instantly detach them
FiberType(boost::bind(&fiberTypeOne, fiberNumber)).detach();
}
}
synchronize.wait();
std::cout<<"T1 Thread preparing to wait";
//now let the fibers do their thing
LockType lock(firstMutex);
firstCondition.wait(lock, [](){return (typeOneFibersFinished == fiberCount);});
}
void fiberTypeTwo(int fiberNumber)
{
std::cout<<"Starting Type Two Fiber #"<<fiberNumber;
std::uniform_int_distribution<int> dist(distLowerLimit, distUpperLimit);
for(int i=0; i<fiberIterationCount; ++i)
{
//generate a randomish load on this fiber so that it does not take a regular time slice
int tempPrime = dist(typeTwoGenerators[fiberNumber]);
unsigned long long temp = findPrimeNumber(tempPrime);
std::cout << "T2 fiber #"<<fiberNumber<<" running on "<<threadTypeMap[std::this_thread::get_id()]
<<"\n Generated: "<<tempPrime<<", "<<temp;
boost::this_fiber::yield();
}
{
LockType lock(secondMutex);
++typeTwoFibersFinished;
}
secondCondition.notify_all();
}
void threadTypeTwo(int threadNumber)
{
//make a shared work scheduler that associates its fibers with "fiber pool 1"
boost::fibers::use_scheduling_algorithm< multi_pool_scheduler<1> >();
std::cout<<"Starting Type Two Thread #"<<threadNumber<<" With Thread ID: "<<std::this_thread::get_id();
{
std::unique_lock<std::mutex> lock{typeMapMutex};
std::ostringstream gen;
gen<<"Thread Type 2 - Number: "<<threadNumber<<" with id: "<<std::this_thread::get_id();
threadTypeMap[std::this_thread::get_id()] = gen.str();
}
if(threadNumber == 0)
{ //if we are thread zero, create the fibers then join them to take ourselves off the "fiber list"
std::cout<<"Spawning Type Two Fibers";
for(int fiberNumber=0; fiberNumber<fiberCount; ++fiberNumber)
{//create the fibers and instantly detach them
FiberType(boost::bind(&fiberTypeTwo, fiberNumber)).detach();
}
}
synchronize.wait();
std::cout<<"T2 Thread preparing to wait";
//now let the fibers do their thing
LockType lock(secondMutex);
secondCondition.wait(lock, [](){return (typeTwoFibersFinished == fiberCount);});
}
int main(int argc, char* argv[])
{
std::cout<<"Initializing Random Number Generators";
for(unsigned i=0; i<fiberCount; ++i)
{
typeOneGenerators[i].seed(i*500U - 1U);
typeTwoGenerators[i].seed(i*1500U - 1U);
}
std::cout<<"Commencing Main Thread Startup Startup";
std::vector<std::thread> typeOneThreads;
std::vector<std::thread> typeTwoThreads;
for(int i=0; i<threadCount; ++i)
{
typeOneThreads.emplace_back(std::thread(boost::bind(&threadTypeOne, i)));
typeTwoThreads.emplace_back(std::thread(boost::bind(&threadTypeTwo, i)));
}
//now let the threads do their thing and wait for them to finish with join
for(unsigned i=0; i<threadCount; ++i)
{
typeOneThreads[i].join();
}
for(unsigned i=0; i<threadCount; ++i)
{
typeTwoThreads[i].join();
}
std::cout<<"Shutting Down";
return 0;
}
Is this possible without writing your own fiber scheduler? If so, how?
I determined that I did require writing my own scheduler. However, the actual amount of work was minimal. The boost::fibers::shared_work scheduler manages the list of fibers that are shared between threads using a single static queue, guarded by a static mutex. There is another queue that governs the main fiber for each thread (since each thread has its own scheduler) but that is local to the class instance instead of shared between all instances of the class the way the static members are.
The solution, then, to prevent the static queue and lock from being shared between separate sets of threads, is to put a (mostly unused) template parameter on the class. Each thread group then passes a different value for this parameter. Since every specialization of the template is a distinct type, each pool number gets its own set of static variables.
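The mechanism in isolation: every specialization of a class template gets its own copy of the static data members. A tiny illustration:
template <int N>
struct Tagged { static int counter; };
template <int N> int Tagged<N>::counter = 0;
//Tagged<0>::counter and Tagged<1>::counter are two distinct objects, just as
//SharedWorkPool<0> and SharedWorkPool<1> below get distinct queues and mutexes.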
Below is my implementation of this solution (mostly a copy of boost::fibers::shared_work, with a few variables and types more clearly named, and the template parameter added).
#include <condition_variable>
#include <chrono>
#include <deque>
#include <mutex>
#include <boost/config.hpp>
#include <boost/fiber/algo/algorithm.hpp>
#include <boost/fiber/context.hpp>
#include <boost/fiber/detail/config.hpp>
#include <boost/fiber/scheduler.hpp>
#include <boost/assert.hpp>
#include "boost/fiber/type.hpp"
#ifdef BOOST_HAS_ABI_HEADERS
# include BOOST_ABI_PREFIX
#endif
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable:4251)
#endif
/*!
* #class SharedWorkPool
* #brief A scheduler for boost::fibers that operates in a manner similar to the
* shared work scheduler, except that it takes a template parameter determining
* which pool to draw fibers from. In this fashion, one group of threads can share
* a pool of fibers among themselves while another group of threads can work with
* a completely separate pool
* #tparam PoolNumber The index of the pool number for this thread
*/
template <int PoolNumber>
class SharedWorkPool : public boost::fibers::algo::algorithm
{
typedef std::deque<boost::fibers::context * > ReadyQueueType;
typedef boost::fibers::scheduler::ready_queue_type LocalQueueType;
typedef std::unique_lock<std::mutex> LockType;
public:
SharedWorkPool() = default;
~SharedWorkPool() override {}
SharedWorkPool( bool suspend) : suspendable{suspend}{}
SharedWorkPool( SharedWorkPool const&) = delete;
SharedWorkPool( SharedWorkPool &&) = delete;
SharedWorkPool& operator=(const SharedWorkPool&) = delete;
SharedWorkPool& operator=(SharedWorkPool&&) = delete;
void awakened(boost::fibers::context* ctx) noexcept override;
boost::fibers::context* pick_next() noexcept override;
bool has_ready_fibers() const noexcept override
{
LockType lock{readyQueueMutex};
return ((!readyQueue.empty()) || (!localQueue.empty()));
}
void suspend_until(const std::chrono::steady_clock::time_point& timePoint) noexcept override;
void notify() noexcept override;
private:
static ReadyQueueType readyQueue;
static std::mutex readyQueueMutex;
LocalQueueType localQueue{};
std::mutex instanceMutex{};
std::condition_variable suspendCondition{};
bool waitNotifyFlag{false};
bool suspendable{false};
};
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::awakened(boost::fibers::context* ctx) noexcept
{
if(ctx->is_context(boost::fibers::type::pinned_context))
{ // we have been passed the thread's main fiber, never put those in the shared queue
localQueue.push_back(*ctx);
}
else
{//worker fiber, enqueue on shared queue
ctx->detach();
LockType lock{readyQueueMutex};
readyQueue.push_back(ctx);
}
}
template <int PoolNumber>
boost::fibers::context* SharedWorkPool<PoolNumber>::pick_next() noexcept
{
boost::fibers::context * ctx = nullptr;
LockType lock{readyQueueMutex};
if(!readyQueue.empty())
{ //pop an item from the ready queue
ctx = readyQueue.front();
readyQueue.pop_front();
lock.unlock();
BOOST_ASSERT( ctx != nullptr);
boost::fibers::context::active()->attach( ctx); //attach context to current scheduler via the active fiber of this thread
}
else
{
lock.unlock();
if(!localQueue.empty())
{ //nothing in the ready queue, return main or dispatcher fiber
ctx = & localQueue.front();
localQueue.pop_front();
}
}
return ctx;
}
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::suspend_until(const std::chrono::steady_clock::time_point& timePoint) noexcept
{
if(suspendable)
{
if (std::chrono::steady_clock::time_point::max() == timePoint)
{
LockType lock{instanceMutex};
suspendCondition.wait(lock, [this](){return waitNotifyFlag;});
waitNotifyFlag = false;
}
else
{
LockType lock{instanceMutex};
suspendCondition.wait_until(lock, timePoint, [this](){return waitNotifyFlag;});
waitNotifyFlag = false;
}
}
}
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::notify() noexcept
{
if(suspendable)
{
LockType lock{instanceMutex};
waitNotifyFlag = true;
lock.unlock();
suspendCondition.notify_all();
}
}
template <int PoolNumber>
std::deque<boost::fibers::context*> SharedWorkPool<PoolNumber>::readyQueue{};
template <int PoolNumber>
std::mutex SharedWorkPool<PoolNumber>::readyQueueMutex{};
Note that I am not entirely sure what will happen if you try to use the same pool number from declarations in different compilation units. However, under normal circumstances, i.e. if you have written boost::fibers::use_scheduling_algorithm< Threads::Fibers::SharedWorkPool<WorkPoolNumber> >(); in only a single location for each WorkPoolNumber, it works perfectly. Fibers assigned to a given set of threads always run within that set of threads, never being run by a different set.
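For reference, usage would look roughly like this sketch (pool numbers illustrative, namespaces omitted); each thread group installs its own specialization, so the two groups never share a ready queue:
void dataProcessingThread()
{
boost::fibers::use_scheduling_algorithm< SharedWorkPool<0> >();
//... spawn and wait on the data-processing fibers here ...
}
void fileWritingThread()
{
boost::fibers::use_scheduling_algorithm< SharedWorkPool<1> >();
//... spawn and wait on the file-writing fibers here ...
}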

Processing an array of objects with multithreading - invalid use of void expression error

I need to run some number of threads to process an array of objects.
So I've written this piece of code :
unsigned int object_counter = 0;
while(object_counter != (obj_max - left))
{
thread genThread[thread_num];//create thread objects
///launch threads
int thread_index = 0;
for (; thread_index<thread_num; thread_index++)
{
genThread[thread_index] = thread(object[object_counter].gen_maps());//launch a thread
object_counter++;
if(object_counter == (obj_max - left))
{
break;
}
}
///finish threads
for (; thread_index>0; thread_index--)
{
genThread[thread_index].join();
}
}
Basically, there is an array of objects (number of objects = obj_max - left).
Each object has a void function called gen_maps() that generates a terrain.
What I want to do is run all the gen_maps() functions from all objects using multithreading.
A maximum number of threads is stored in thread_num variable.
But when I'm trying to compile this code I'm getting an error:
error: invalid use of void expression
genThread[thread_index] = thread(object[object_counter].gen_maps(), thread_index);//launch a thread
^
How can I fix this issue?
A more extendable way to manage an arbitrarily large number of jobs with a smaller number of threads is to use a thread pool.
Here's a naive implementation (for better efficiency there would be 2 condition variables to manage control and state reporting) which allows the initiator to add an arbitrary number of jobs or threads and wait for all jobs to be complete.
#include <thread>
#include <condition_variable>
#include <mutex>
#include <vector>
#include <functional>
#include <deque>
#include <cassert>
#include <ciso646>
#include <iostream>
struct work_pool
{
std::mutex control_mutex;
std::condition_variable control_cv;
std::deque<std::function<void()>> jobs;
bool terminating = false;
std::size_t running = 0;
std::vector<std::thread> threads;
work_pool(std::size_t n = std::thread::hardware_concurrency())
{
add_threads(n);
}
work_pool(const work_pool&) = delete;
work_pool& operator=(const work_pool&) = delete;
~work_pool()
{
wait();
shutdown();
}
void add_threads(std::size_t n)
{
while (n--)
{
threads.emplace_back([this]{
run_jobs();
});
}
}
void run_jobs()
{
while (1)
{
auto lock = std::unique_lock(control_mutex);
control_cv.wait(lock, [this] {
return terminating or not jobs.empty();
});
if (terminating) return;
++running;
auto job = std::move(jobs.front());
jobs.pop_front();
lock.unlock();
job();
lock.lock();
--running;
lock.unlock();
control_cv.notify_one();
}
}
void shutdown()
{
auto lock = std::unique_lock(control_mutex);
terminating = true;
lock.unlock();
control_cv.notify_all();
for (auto&& t : threads) {
if (t.joinable()) {
t.join();
}
}
threads.clear();
}
void wait()
{
auto lock = std::unique_lock(control_mutex);
control_cv.wait(lock, [this] {
return jobs.empty() and not running;
});
}
template<class F>
void add_work(F&& f)
{
auto lock = std::unique_lock(control_mutex);
assert(not terminating);
jobs.emplace_back(std::forward<F>(f));
lock.unlock();
control_cv.notify_all();
}
};
// dummy function for exposition
void generate_map() {}
int main()
{
work_pool pool;
for(int i = 0 ; i < 100000 ; ++i)
pool.add_work(generate_map);
pool.wait();
// maps are now all generated
std::cout << "done" << std::endl;
}
With object[object_counter].gen_maps() you call the function gen_maps and use its return value as the thread function. Apparently gen_maps is declared to return void, which leads to the error you get.
You need to pass a pointer to the function, and then pass the object it should be called on as an argument to the thread:
thread(&SomeClass::gen_maps, object[object_counter])
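Applied to the loop from the question, that looks roughly like this (SomeClass stands in for the actual element type of object; passing a pointer or std::ref avoids copying the object into the thread):
//run gen_maps on the original object rather than on a copy:
genThread[thread_index] = thread(&SomeClass::gen_maps, &object[object_counter]);
//equivalent, with an explicit reference wrapper (needs <functional>):
genThread[thread_index] = thread(&SomeClass::gen_maps, std::ref(object[object_counter]));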

Extend the life of threads with synchronization (C++11)

I have a program with a function that takes a pointer as its argument, and a main. The main creates n threads, each of them running the function on a different memory area depending on the passed argument. The threads are then joined, the main performs some data mixing between the areas and creates n new threads which do the same operation as the old ones.
To improve the program I would like to keep the threads alive, removing the long time necessary to create them. The threads should sleep when the main is working and be notified when they have to run again. In the same way, the main should wait while the threads are working, as it did with join.
I cannot come up with a solid implementation of this; I always end up in a deadlock.
Here is the simple baseline code; any hints about how to modify this would be much appreciated:
#include <thread>
#include <climits>
...
void myfunc(void * p) {
do_something(p);
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
std::thread mythread[n_threads];
for (unsigned long int j=0; j < ULONG_MAX; j++) {
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i]);
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
mix_data(myp);
}
return 0;
}
Here is a possible approach using only classes from the C++11 Standard Library. Basically, each thread you create has an associated command queue (of std::packaged_task<> objects) which it continuously checks. If the queue is empty, the thread just waits on a condition variable (std::condition_variable).
While data races are avoided through the use of std::mutex and std::unique_lock<> RAII wrappers, the main thread can wait for a particular job to be terminated by storing the std::future<> object associated with each submitted std::packaged_task<> and calling wait() on it.
Below is a simple program that follows this design. Comments should be sufficient to explain what it does:
#include <thread>
#include <iostream>
#include <sstream>
#include <future>
#include <queue>
#include <condition_variable>
#include <mutex>
// Convenience type definition
using job = std::packaged_task<void()>;
// Some data associated to each thread.
struct thread_data
{
int id; // Could use thread::id, but this is filled before the thread is started
std::thread t; // The thread object
std::queue<job> jobs; // The job queue
std::condition_variable cv; // The condition variable to wait for threads
std::mutex m; // Mutex used for avoiding data races
bool stop = false; // When set, this flag tells the thread that it should exit
};
// The thread function executed by each thread
void thread_func(thread_data* pData)
{
std::unique_lock<std::mutex> l(pData->m, std::defer_lock);
while (true)
{
l.lock();
// Wait until the queue won't be empty or stop is signaled
pData->cv.wait(l, [pData] () {
return (pData->stop || !pData->jobs.empty());
});
// Stop was signaled, let's exit the thread
if (pData->stop) { return; }
// Pop one task from the queue...
job j = std::move(pData->jobs.front());
pData->jobs.pop();
l.unlock();
// Execute the task!
j();
}
}
// Function that creates a simple task
job create_task(int id, int jobNumber)
{
job j([id, jobNumber] ()
{
std::stringstream s;
s << "Hello " << id << "." << jobNumber << std::endl;
std::cout << s.str();
});
return j;
}
int main()
{
const int numThreads = 4;
const int numJobsPerThread = 10;
std::vector<std::future<void>> futures;
// Create all the threads (will be waiting for jobs)
thread_data threads[numThreads];
int tdi = 0;
for (auto& td : threads)
{
td.id = tdi++;
td.t = std::thread(thread_func, &td);
}
//=================================================
// Start assigning jobs to each thread...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work to do...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
//=================================================
// Here the main thread does something...
std::cin.get();
// ...done!
//=================================================
//=================================================
// Posts some new tasks...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work to do...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
// Send stop signal to all threads and join them...
for (auto& td : threads)
{
std::unique_lock<std::mutex> l(td.m);
td.stop = true;
td.cv.notify_one();
}
// Join all the threads
for (auto& td : threads) { td.t.join(); }
}
The concept you want is the threadpool. This SO question deals with existing implementations.
The idea is to have a container of thread instances. Each instance is associated with a function which polls a task queue and, when a task is available, pulls it and runs it. Once the task is over (if it terminates, but that's another problem), the thread simply loops back to the task queue.
So you need a synchronized queue, a thread class which implements the loop on the queue, an interface for the task objects, and maybe a class to drive the whole thing (the pool class).
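A minimal sketch of such a synchronized queue, using only C++11 (all names illustrative):
#include <condition_variable>
#include <mutex>
#include <queue>
template <typename T>
class sync_queue
{
std::queue<T> q;
std::mutex m;
std::condition_variable cv;
public:
void push(T item)
{
{
std::lock_guard<std::mutex> lock(m);
q.push(std::move(item));
} //unlock before notifying
cv.notify_one();
}
T pop() //blocks until an item is available
{
std::unique_lock<std::mutex> lock(m);
cv.wait(lock, [this] { return !q.empty(); });
T item = std::move(q.front());
q.pop();
return item;
}
};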
Alternatively, you could make a very specialized thread class for the task it has to perform (with only the memory area as a parameter, for instance). This requires a notification mechanism for the threads to indicate that they are done with the current iteration.
The thread's main function would be a loop on that specific task, and at the end of one iteration, the thread signals its end and waits on condition variables to start the next loop. In essence, you would be inlining the task code within the thread, dropping the need for a queue altogether.
using namespace std;
// semaphore class based on C++11 features
class semaphore {
private:
mutex mMutex;
condition_variable mCond;
int mV;
public:
semaphore(int v): mV(v){}
void signal(int count=1){
unique_lock<mutex> lock(mMutex);
mV+=count;
if (mV >= 0) mCond.notify_all(); //wake waiters once the count is no longer negative
}
void wait(int count = 1){
unique_lock<mutex> lock(mMutex);
mV-= count;
while (mV < 0)
mCond.wait(lock);
}
};
template <typename Task>
class TaskThread {
Task *mTask;
semaphore *mSemStart, *mSemFinished;
volatile bool mRunning;
thread mThread; //declared last, so it starts only after the other members are initialized
public:
TaskThread(Task *task, semaphore *start, semaphore *finish):
mTask(task), mSemStart(start), mSemFinished(finish),
mRunning(true),
mThread(&TaskThread<Task>::psrun, this){}
~TaskThread(){ mThread.join(); }
void run(){
do {
(*mTask)();
mSemFinished->signal();
mSemStart->wait();
} while (mRunning);
}
void finish() { // end the thread after the current loop
mRunning = false;
mSemStart->signal(); //release the thread if it is blocked waiting for the next loop
}
private:
static void psrun(TaskThread<Task> *self){ self->run();}
};
class MyTask {
public:
MyTask(){}
void operator()(){
// some code here
}
};
int main(){
MyTask task1;
MyTask task2;
semaphore start(2), finished(0);
TaskThread<MyTask> t1(&task1, &start, &finished);
TaskThread<MyTask> t2(&task2, &start, &finished);
for (int i = 0; i < 10; i++){
finished.wait(2);
start.signal(2);
}
t1.finish();
t2.finish();
}
The proposed (crude) implementation above relies on the Task type, which must provide an operator() (i.e. a functor-like class). I said earlier that you could incorporate the task code directly into the thread function body, but since I don't know it, I kept things as abstract as I could. There's one semaphore for the start of the threads and one for their end, each encapsulating a condition variable.
Seeing the other answer proposing the use of boost::barrier, I can only support the idea: if possible, replace my semaphore class with that one, the reason being that it is better to rely on well-tested and maintained external code than on a self-implemented solution for the same feature set.
All in all, both approaches are valid, but the former gives up a tiny bit of performance in favor of flexibility. If the task to be performed takes a sufficiently long time, the management and queue synchronization cost becomes negligible.
Update: code fixed and tested. Replaced a simple condition variable by a semaphore.
It can easily be achieved using a barrier (just a convenience wrapper over a condition variable and a counter). It basically blocks until all N threads have reached the "barrier". It then "recycles" again. Boost provides an implementation.
void myfunc(void * p, boost::barrier& start_barrier, boost::barrier& end_barrier) {
while (!stop_condition) // You'll need to tell them to stop somehow
{
start_barrier.wait ();
do_something(p);
end_barrier.wait ();
}
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
boost::barrier start_barrier (n_threads + 1); // child threads + main thread
boost::barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i], std::ref(start_barrier), std::ref(end_barrier)); //std::ref, because barriers are neither copyable nor movable
}
start_barrier.wait (); // first unblock the threads
for (unsigned long int j=0; j < ULONG_MAX; j++) {
end_barrier.wait (); // mix_data must not execute before the threads are done
mix_data(myp);
start_barrier.wait (); // threads must not start new iteration before mix_data is done
}
return 0;
}
The following is simple, compiling, working code that performs some random computations. It implements aleguna's barrier concept. The task length of each thread is different, so a strong synchronization mechanism really is necessary. I will try a pool on the same tasks and benchmark the result, and then maybe a version with futures, as pointed out by Andy Prowl.
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <complex>
#include <cmath> //for M_PI (not standard C++, but provided by most toolchains)
#include <random>
const unsigned int n_threads=4; //varying this will (almost) not change the total amount of work
const unsigned int task_length=30000/n_threads;
const float task_length_variation=task_length/n_threads;
unsigned int rep=1000; //repetitions of tasks
class t_chronometer{
private:
std::chrono::steady_clock::time_point _t;
public:
t_chronometer(): _t(std::chrono::steady_clock::now()) {;}
void reset() {_t = std::chrono::steady_clock::now();}
double get_now() {return std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - _t).count();}
double get_now_ms() {return
std::chrono::duration_cast<std::chrono::duration<double,std::milli>>(std::chrono::steady_clock::now() - _t).count();}
};
class t_barrier {
private:
std::mutex m_mutex;
std::condition_variable m_cond;
unsigned int m_threshold;
unsigned int m_count;
unsigned int m_generation;
public:
t_barrier(unsigned int count):
m_threshold(count),
m_count(count),
m_generation(0) {
}
bool wait() {
std::unique_lock<std::mutex> lock(m_mutex);
unsigned int gen = m_generation;
if (--m_count == 0)
{
m_generation++;
m_count = m_threshold;
m_cond.notify_all();
return true;
}
while (gen == m_generation)
m_cond.wait(lock);
return false;
}
};
using namespace std;
void do_something(complex<double> * c, unsigned int max) {
complex<double> a(1.,0.);
complex<double> b(1.,0.);
for (unsigned int i = 0; i<max; i++) {
a *= polar(1.,2.*M_PI*i/max);
b *= polar(1.,4.*M_PI*i/max);
*(c)+=a+b;
}
}
bool done=false;
void task(complex<double> * c, unsigned int max, t_barrier* start_barrier, t_barrier* end_barrier) {
while (!done) {
start_barrier->wait ();
do_something(c,max);
end_barrier->wait ();
}
cout << "task finished" << endl;
}
int main() {
t_chronometer t;
std::default_random_engine gen;
std::normal_distribution<double> dis(.0,1000.0);
complex<double> cpx[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
cpx[i] = complex<double>(dis(gen), dis(gen));
}
t_barrier start_barrier (n_threads + 1); // child threads + main thread
t_barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
unsigned long int sum=0;
for (unsigned int i=0; i < n_threads; i++) {
unsigned int max = task_length + i * task_length_variation;
cout << i+1 << "th task length: " << max << endl;
mythread[i] = std::thread(task, &cpx[i], max, &start_barrier, &end_barrier);
sum+=max;
}
cout << "total task length " << sum << endl;
complex<double> c(0,0);
for (unsigned long int j=1; j < rep+1; j++) {
start_barrier.wait (); //give to the threads the missing call to start
if (j==rep) done=true;
end_barrier.wait (); //wait for the call from each thread
if (j%100==0) cout << "cycle: " << j << endl;
for (unsigned int i=0; i<n_threads; i++) {
c+=cpx[i];
}
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
cout << "result: " << c << " it took: " << t.get_now() << " s." << endl;
return 0;
}