I am studying the Boost library. I want to use fibers together with a blocking API, so I try to use a thread pool to run the blocking API. When the blocking call has finished, the fiber gets the result. I have created a thread pool of 4 threads, but I found that only one thread is running. Why?
Here is my code.
#include <boost/asio.hpp>
#include <boost/fiber/all.hpp>
#include <boost/fiber/fiber.hpp>
#include <boost/fiber/future/future.hpp>
#include <boost/fiber/future/promise.hpp>
#include <boost/fiber/operations.hpp>
#include <chrono>
#include <ctime>
#include <iostream>
#include <string>
#include <thread>
#include <unistd.h>
using namespace std;
using namespace boost;
// Stand-in for a blocking call: stalls the calling thread for one second and
// then hands back the current clock() reading.
time_t block_api() {
    const std::chrono::seconds stall{1};
    std::this_thread::sleep_for(stall);
    const auto ticks = clock();
    return ticks;
}
// Runs block_api() on an asio thread pool and lets the calling fiber wait for
// the result through a Boost.Fiber promise/future pair.
class AsyncWrapper {
public:
    // `pool` is borrowed, not owned: only its address is stored, so it must
    // outlive this wrapper.
    AsyncWrapper(asio::thread_pool &pool) : pool_(&pool) {}
    ~AsyncWrapper() {}

    // Posts block_api() to the pool, waits on the fiber future until a pool
    // thread has set the value, and returns that value.
    time_t async_api() {
        fibers::promise<time_t> promise;
        boost::fibers::future<time_t> future(promise.get_future());
        // `promise` is captured by reference; it stays alive because this
        // function does not return before future.get() has produced the value
        // set by the task below.
        auto func = [&]() mutable {
            auto ret = block_api();
            // Print the id of the worker *thread*. getpid() would print the
            // process id, which is identical for every thread of the pool.
            std::cout << "run on:" << std::this_thread::get_id() << std::endl;
            promise.set_value(ret);
        };
        asio::post(pool_->get_executor(), func);
        return future.get();
    }

private:
    asio::thread_pool *pool_ = nullptr;
};
// Pool of 4 threads, passed to every AsyncWrapper created in main().
static asio::thread_pool g_pool(4);
// Entry point. Takes exactly one argument, the number of fibers to launch;
// returns -1 when the argument count is wrong.
int main(int argc, char **argv) {
    if (argc != 2) {
        return -1;
    }
    std::string arg = argv[1];
    std::cout << "press enter to start" << std::endl;
    getchar();
    // Parse the count once up front rather than inside the loop condition.
    const int32_t fiber_count = std::stoi(arg);
    for (int32_t i = 0; i < fiber_count; i++) {
        fibers::fiber f([]() {
            AsyncWrapper async_wrapper(g_pool);
            // Only the round trip through the pool matters here; the returned
            // value is not used.
            async_wrapper.async_api();
        });
        f.detach();
    }
    // Yield the main fiber once; presumably this is what lets the detached
    // fibers start running before main returns.
    this_fiber::yield();
    return 0;
}
When I analyze my program with top, I see the following:
top - 17:32:03 up 5:29, 0 users, load average: 0.19, 0.17, 0.20
Threads: 5 total, 1 running, 4 sleeping, 0 stopped, 0 zombie
%Cpu(s): 8.3 us, 0.0 sy, 0.0 ni, 91.6 id, 0.0 wa, 0.0 hi, 0.1 si, 0.0 st
MiB Mem : 7931.7 total, 5037.4 free, 2405.4 used, 488.9 buff/cache
MiB Swap: 2048.0 total, 1259.0 free, 789.0 used. 5229.7 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
10338 wuhaotao 20 0 431880 15732 3424 R 99.7 0.2 0:03.63 with_blocking_a
10339 wuhaotao 20 0 431880 15732 3424 S 0.0 0.2 0:00.00 with_blocking_a
10340 wuhaotao 20 0 431880 15732 3424 S 0.0 0.2 0:00.00 with_blocking_a
10341 wuhaotao 20 0 431880 15732 3424 S 0.0 0.2 0:00.00 with_blocking_a
10342 wuhaotao 20 0 431880 15732 3424 S 0.0 0.2 0:00.00 with_blocking_a
Only the 10338 thread is running......
Related
I'm experimenting with the time-management of linux on a raspberry pi. For that I'm looking at clock_gettime (and clock_getres) for CLOCK_REALTIME.
I noticed that when I check clock_getres, it always says nanoseconds resolution (it returns 0 for tv_sec and 1 for tv_nsec) and also the values returned by clock_gettime point in that direction (e.g. I get values like 1642070078.415542996) but only if the system is fully booted with systemd etc running! When I put my test-program as a replacement for init (e.g. init=/test-clock in cmdline.txt), then suddenly getres still returns 1 for tv_nsec but all values returned by clock_gettime are with microsecond resolution!
test-program:
#include <stdint.h>
#include <stdio.h>
#include <time.h>
// Probes one clock. First spins on clock_gettime() for 2.5 s to measure call
// throughput, then takes 1000 readings and counts how many have a non-zero
// sub-microsecond part, and finally prints one tab-separated row: name,
// clock_getres() resolution, calls per second, that count, and the last reading.
// If clock_getres() fails, an error line goes to stderr instead of the row.
void test_clock(const clockid_t id, const char *const name)
{
    const uint64_t ns_per_sec = 1000000000;
    const uint64_t spin_ns = 2500000000;

    // Phase 1: call clock_gettime() back to back until 2.5 s have elapsed.
    struct timespec first { 0, 0 };
    struct timespec latest { 0, 0 };
    clock_gettime(id, &first);
    const uint64_t first_ns = first.tv_sec * ns_per_sec + first.tv_nsec;

    uint64_t calls = 0;
    for (;;) {
        clock_gettime(id, &latest);
        ++calls;
        const uint64_t latest_ns = latest.tv_sec * ns_per_sec + latest.tv_nsec;
        if (latest_ns - first_ns >= spin_ns)
            break;
    }

    // Phase 2: a reading that is not a multiple of 1000 ns shows the clock
    // delivers better than microsecond granularity.
    struct timespec sample { 0, 0 };
    int sub_us_hits = 0;
    int remaining = 1000;
    while (remaining-- > 0) {
        clock_gettime(id, &sample);
        sub_us_hits += (sample.tv_nsec % 1000) ? 1 : 0;
    }

    // Phase 3: report.
    struct timespec res { 0, 0 };
    const int rc = clock_getres(id, &res);
    if (rc != 0) {
        fprintf(stderr, "clock_getres(%d) failed (%s)\n", id, name);
        return;
    }
    printf("%s:\t%ld.%09ld\t%f/s\t%d\t%ld.%09ld\n", name, res.tv_sec, res.tv_nsec, calls / 2.5, sub_us_hits, sample.tv_sec, sample.tv_nsec);
}
// Prints the table header and then one test_clock() row per clock.
// The command-line arguments are not used.
int main(int argc, char *argv[])
{
    printf("clock\tresolution\tcalls/s\t# with ns\tnow\n");

    struct ClockEntry {
        clockid_t id;
        const char *label;
    };
    const ClockEntry clocks[] = {
        { CLOCK_REALTIME, "CLOCK_REALTIME" },
        { CLOCK_TAI, "CLOCK_TAI" },
        { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" },
        { CLOCK_MONOTONIC_RAW, "CLOCK_MONOTONIC_RAW" },
    };
    for (const ClockEntry &entry : clocks)
        test_clock(entry.id, entry.label);

    return 0;
}
Compile with:
g++ -Ofast -ggdb3 -o test-clock test-clock.cpp -lrt
Expected output:
clock resolution calls/s # with ns now
CLOCK_REALTIME: 0.000000001 48881594.400000/s 1000 1642071062.213603835
CLOCK_TAI: 0.000000001 49500959.200000/s 1000 1642071101.713668922
CLOCK_MONOTONIC: 0.000000001 49248353.200000/s 1000 2402707.303582035
CLOCK_MONOTONIC_RAW: 0.000000001 47072281.600000/s 1000 2402705.604860726
What I see when starting test-clock as init replacement:
clock resolution calls/s # with ns now
CLOCK_REALTIME: 0.000000001 853001.200000/s 0 19.216404000
CLOCK_TAI: 0.000000001 736536.000000/s 0 21.718848000
CLOCK_MONOTONIC: 0.000000001 853367.200000/s 0 24.220166000
CLOCK_MONOTONIC_RAW: 0.000000001 855598.800000/s 0 26.721360000
The 4th column tells me that no readings were with nanosecond resolution.
So what I would like to know is: how can I configure the kernel/glibc/whatever so that it gives me nanosecond resolution at boot as well?
Any ideas?
I try to log with Boost library 1.61, rotating file once size exceeded 5Ko. But, every time program runs, it begins from the first log file, exceeding max size !
That is, it appends text to flog000.txt, then to flog001.txt.
But the desired behaviour was: append to the last file (flog002.txt) then continue with flog003.txt...
Here is my code:
#include <boost/log/common.hpp>
#include <boost/log/sinks/text_ostream_backend.hpp>
#include <boost/log/core.hpp>
#include <boost/log/utility/setup/file.hpp>
#include <boost/log/sources/logger.hpp>
#include <boost/log/utility/setup/common_attributes.hpp>
void init()
{
boost::log::add_file_log
(
boost::log::keywords::file_name = "flog%3N.txt",
boost::log::keywords::open_mode = std::ios_base::app,
boost::log::keywords::rotation_size = 5 * 1024,
boost::log::keywords::auto_flush = true
);
}
int main()
{
init();
boost::log::sources::logger lg;
boost::log::add_common_attributes();
for (int i(0); i < 1000; ++i)
{
BOOST_LOG(lg) << "Hello : " << i << std::endl;
}
return 0;
}
"There's no file collecting" - this is at least a part of the problem.
It does seem impossible to have both scanning and appending to an existing file.
However, if you do scanning, you can at least prevent extra log messages from being wrongly appended to the older log files:
Live On Coliru
#include <boost/log/common.hpp>
#include <boost/log/core.hpp>
#include <boost/log/sinks/text_ostream_backend.hpp>
#include <boost/log/sources/logger.hpp>
#include <boost/log/utility/setup/common_attributes.hpp>
#include <boost/log/utility/setup/file.hpp>
void init() {
auto s = boost::log::add_file_log(
boost::log::keywords::file_name = "flog%3N.txt",
boost::log::keywords::open_mode = std::ios_base::app,
boost::log::keywords::rotation_size = 5 * 1024,
boost::log::keywords::auto_flush = true
);
auto be = s->locked_backend();
be->set_file_collector(boost::log::sinks::file::make_collector(
boost::log::keywords::target = "./", // log file & target have same dir
boost::log::keywords::max_size = 50 * 1024,
boost::log::keywords::min_free_space = 100000
));
be->scan_for_files(boost::log::sinks::file::scan_method::scan_matching, true);
}
int main() {
init();
boost::log::sources::logger lg;
boost::log::add_common_attributes();
for (int i(0); i < 1000; ++i) {
BOOST_LOG(lg) << "Hello : " << i << std::endl;
}
}
After one run:
-rw-rw-r-- 1 sehe sehe 5116 jun 6 00:27 flog000.txt
-rw-rw-r-- 1 sehe sehe 5109 jun 6 00:27 flog001.txt
-rw-rw-r-- 1 sehe sehe 2665 jun 6 00:27 flog002.txt
After two runs:
-rw-rw-r-- 1 sehe sehe 5116 jun 6 00:27 flog003.txt
-rw-rw-r-- 1 sehe sehe 5109 jun 6 00:27 flog004.txt
-rw-rw-r-- 1 sehe sehe 2665 jun 6 00:27 flog005.txt
After three runs:
-rw-rw-r-- 1 sehe sehe 5116 jun 6 00:28 flog006.txt
-rw-rw-r-- 1 sehe sehe 5109 jun 6 00:28 flog007.txt
-rw-rw-r-- 1 sehe sehe 2665 jun 6 00:28 flog008.txt
Wrote a sample C++ multithreaded program that runs with 10 threads, each thread set to high priority and affinity. Compiled and ran this code on dell machine that has 16 cores, running centos 7 (Linux kernel - 3.10.0-229), disabled hyperthreading. After I ran this code, in few seconds, our Linux machine becomes unresponsive, in the sense that, if I open up Eclipse editor, and save a file or save a file in vi editor those applications hang. Interestingly, once I stop this program / process, then all other applications resume from where they left off. Also I don't see this issue if I remove priority from these 10 threads.
Questions:
1) Out of 16 cores, 6 cores still left on the machine (above cpu usage shows, cpu executed 62.9% user space, and was idle for 37.1%. Interestingly 0% cpu usage in kernel space), so ideally kernel should have used those 6 cores to handle other application, what could be the reason that other application does not able to run? How to resolve this issue without introducing sleep / changing priority?
2) I would like to know a better approach to achieve parallelism, other than introducing a sleep or waiting on an event in the thread (either of which introduces some latency due to a kernel context switch).
Ran top command (top -H):
%Cpu(s): 62.9 us, 0.0 sy, 0.0 ni, 37.1 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
1107 arun rt 0 96748 1112 932 R 99.9 0.0 0:25.78 PthreadTest
1115 arun rt 0 96748 1112 932 R 99.9 0.0 0:24.79 PthreadTest
1118 arun rt 0 96748 1112 932 R 99.9 0.0 0:22.79 PthreadTest
1120 arun rt 0 96748 1112 932 R 99.9 0.0 0:20.79 PthreadTest
1123 arun rt 0 96748 1112 932 R 99.9 0.0 0:18.79 PthreadTest
1117 arun rt 0 96748 1112 932 R 94.1 0.0 0:23.78 PthreadTest
1119 arun rt 0 96748 1112 932 R 94.1 0.0 0:21.78 PthreadTest
1122 arun rt 0 96748 1112 932 R 94.1 0.0 0:19.78 PthreadTest
1124 arun rt 0 96748 1112 932 R 94.1 0.0 0:17.78 PthreadTest
1125 arun rt 0 96748 1112 932 R 94.1 0.0 0:16.76 PthreadTest
Code below:
#include <unistd.h>
#include <iostream>
#include <cstdlib>
#include <pthread.h>
using namespace std;
#define NUM_THREADS 10
// Thread entry point: prints the numeric id carried in the void* argument and
// then busy-loops forever. The trailing pthread_exit() is never reached.
void *PrintHello(void *threadid)
{
    const long id = reinterpret_cast<long>(threadid);
    std::cout << "Hello World! Thread ID, " << id << std::endl;
    for (;;)
    {
    }
    pthread_exit(NULL);
}
// Starts NUM_THREADS busy-looping threads. Thread i is pinned to CPU i and
// switched to SCHED_FIFO at the maximum priority; main then joins them all.
int main ()
{
    pthread_t threads[NUM_THREADS];
    pthread_attr_t threads_attr[NUM_THREADS];
    struct sched_param params;
    params.sched_priority = sched_get_priority_max(SCHED_FIFO);
    int rc;
    int i;
    int cpu_num = 0;
    for( i=0; i < NUM_THREADS; i++ ){
        cpu_set_t cpu;
        CPU_ZERO(&cpu);
        CPU_SET(cpu_num, &cpu);
        std::cout << "main() : creating thread, " << i << "cpu_num : "<<cpu_num<<std::endl;
        pthread_attr_init(&threads_attr[i]);
        pthread_attr_setscope(&threads_attr[i], PTHREAD_SCOPE_SYSTEM);
        // Pass the attribute object that was just configured, and widen the
        // index to long before the pointer cast (PrintHello casts it back to long).
        rc = pthread_create(&threads[i], &threads_attr[i],
                            PrintHello, reinterpret_cast<void *>(static_cast<long>(i)));
        // The attribute object is no longer needed once pthread_create() has returned.
        pthread_attr_destroy(&threads_attr[i]);
        if (rc){
            std::cout << "Error:unable to create thread," << rc << std::endl;
            exit(-1);
        }
        sleep(1);
        int ret = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpu);
        if(ret == 0)
        {
            std::cout << "Thread " << i << " affinity set " <<std::endl;
        }
        else
        {
            std::cout << "Error:unable to set affinity," << ret << std::endl;
        }
        ret = pthread_setschedparam(threads[i], SCHED_FIFO, &params);
        if(ret == 0)
        {
            std::cout << "Thread " << i << " priority set " <<std::endl;
        }
        else
        {
            // Typically fails when the process lacks the privilege to use a
            // real-time scheduling policy; report it instead of continuing silently.
            std::cout << "Error:unable to set priority," << ret << std::endl;
        }
        cpu_num++;
    }
    // wait for the other threads
    void *status;
    for( i=0; i < NUM_THREADS; i++ )
    {
        rc = pthread_join(threads[i], &status);
        if (rc){
            std::cout << "Error:unable to join," << rc << std::endl;
            exit(-1);
        }
        std::cout << "Main: completed thread id :" << i ;
        std::cout << " exiting with status :" << status << std::endl;
    }
    pthread_exit(NULL);
}
Compile:
g++ -std=c++14 -g -o PthreadTest busywait.cpp -lpthread
The effects of suddenly depriving the kernel of any use of a live core for an unlimited amount of time are unspecified and unknown. Anything attached to that core before exclusive ownership of it was taken, which could include threads that are waiting to be scheduled on it, is forever lost to the system.
DON'T DO THIS!
A tight loop in a high-priority thread is not a good idea. When PrintHello is just looping without doing anything, introduce a SwitchToThread/yield after a certain number of iterations, and a sleep(0) after a higher iteration count. This gives other threads on the system an opportunity to execute. Eventually, wait on an event handle when there is no work.
#kiran0x1B
I created a simple program to measure thread performance. I ripped out portions of a larger program in order to illustrate my point. Hopefully it's not too terrible to read.
Here is the program:
#include <sstream>
#include <thread>
#include <list>
#include <map>
#include <mutex>
#include <condition_variable>
#include <iostream>
#include <string.h>
// Held by each worker while it prints its result line and adds to m_totalTrans.
std::mutex m_totalTranMutex;
// Sum of the transaction counts reported by all worker threads.
int m_totalTrans = 0;
// Start gate: main() sets this to true after every worker has been created.
bool m_startThreads = false;
// Workers wait on this until m_startThreads is true; main() notifies all.
std::condition_variable m_allowThreadStart;
// Mutex used with m_allowThreadStart and held while m_startThreads is set.
std::mutex m_threadStartMutex;
// Native handle of each worker thread keyed by its thread ID; filled by main().
std::map<int,std::thread::native_handle_type> m_threadNativeHandles;
// Returns a malloc()-allocated copy of the NUL-terminated string `str`, or
// nullptr when the allocation fails. The caller releases the copy with free().
char *my_strdup(const char *str)
{
    const size_t bytes = strlen(str) + 1;  // +1 copies the terminator too
    char *copy = static_cast<char *>(malloc(bytes));
    if (copy != nullptr)
        memcpy(copy, str, bytes);
    return copy;
}
// One unit of benchmark work: writes "12345" at two places in a 50000-byte
// stack buffer, duplicates the string at the start of the buffer twice with
// my_strdup(), and frees both copies.
void DoWork()
{
    char abc[50000];
    std::strcpy(abc, "12345");
    std::strcpy(abc+20000, "12345");

    char *first_copy = my_strdup(abc);
    char *second_copy = my_strdup(abc);

    free(first_copy);
    free(second_copy);
}
// Worker body. Waits for the start gate, then for one minute repeatedly runs a
// "transaction" of 100 x 100 DoWork() calls and counts how many it completed.
// At the end it prints its count and adds it to m_totalTrans under
// m_totalTranMutex.
//
// threadID: label used in the result line.
void WorkerThread(int threadID)
{
    // Block until main() opens the gate.
    {
        std::unique_lock<std::mutex> lk(m_threadStartMutex);
        m_allowThreadStart.wait(lk, []{return m_startThreads;});
    }

    // Only the transaction counter and the deadline are needed here. No
    // per-transaction timing is accumulated and m_threadNativeHandles is not
    // consulted, so workers never touch that map concurrently.
    using Clock = std::chrono::high_resolution_clock;
    int transactionCounter = 0;
    const Clock::time_point end = Clock::now() + std::chrono::minutes(1);

    while(Clock::now() < end)
    {
        for(int loopIndex = 0; loopIndex < 100; ++loopIndex)
        {
            for(int alwaysOneHundred = 0; alwaysOneHundred < 100; ++alwaysOneHundred)
            {
                DoWork();
            }
        }
        ++transactionCounter;
    }

    // Serialize both the output line and the update of the shared total.
    std::lock_guard<std::mutex> tranMutex(m_totalTranMutex);
    std::cout << "Thread " << threadID << " finished with " << transactionCounter << " transaction." << std::endl;
    m_totalTrans += transactionCounter;
}
// Usage: app N. Creates N worker threads, records each native handle, opens
// the start gate, joins every worker and prints the combined transaction
// count. Returns 1 with a usage line when N is missing.
int main(int argc, char *argv[])
{
    // argv[1] must exist before it is handed to atoi().
    if(argc < 2)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "app") << " N" << std::endl;
        return 1;
    }
    const int numthreads = atoi(argv[1]);

    std::list<std::thread> threads;
    int threadIds = 1;
    for(int i = 0; i < numthreads; ++i)
    {
        threads.emplace_back(&WorkerThread, threadIds);
        m_threadNativeHandles.insert(std::make_pair(threadIds, threads.back().native_handle()));
        ++threadIds;
    }

    // Flip the flag under the mutex, then wake every waiting worker.
    {
        std::lock_guard<std::mutex> lk(m_threadStartMutex);
        m_startThreads = true;
    }
    m_allowThreadStart.notify_all();

    //Join until completion
    for(std::thread &th : threads)
    {
        th.join();
    }

    std::cout << "TotalTran" << '\n'
              << m_totalTrans << '\n';
}
Application usage: app N
where app is the name of the application and N is the number of threads to produce. The program runs for 1 minute.
On windows, I build this program with Visual Studio 2012. I execute the program on a quad core I7 (4 cores, 2 threads per core).
I get the following:
simplethread 1
Thread 1 finished with 1667 transaction.
TotalTran
1667
simplethread 2
Thread 1 finished with 1037 transaction.
Thread 2 finished with 1030 transaction.
TotalTran
2067
simplethread 3
Thread 3 finished with 824 transaction.
Thread 2 finished with 830 transaction.
Thread 1 finished with 837 transaction.
TotalTran
2491
simplethread 4
Thread 3 finished with 688 transaction.
Thread 2 finished with 693 transaction.
Thread 1 finished with 704 transaction.
Thread 4 finished with 691 transaction.
TotalTran
2776
simplethread 8
Thread 2 finished with 334 transaction.
Thread 6 finished with 325 transaction.
Thread 7 finished with 346 transaction.
Thread 1 finished with 329 transaction.
Thread 8 finished with 329 transaction.
Thread 3 finished with 338 transaction.
Thread 5 finished with 331 transaction.
Thread 4 finished with 330 transaction.
TotalTran
2662
E:\Development\Projects\Applications\CPUBenchmark\Debug>simplethread 16
Thread 16 finished with 163 transaction.
Thread 15 finished with 169 transaction.
Thread 12 finished with 165 transaction.
Thread 9 finished with 170 transaction.
Thread 10 finished with 166 transaction.
Thread 4 finished with 164 transaction.
Thread 13 finished with 166 transaction.
Thread 8 finished with 165 transaction.
Thread 6 finished with 165 transaction.
Thread 5 finished with 168 transaction.
Thread 2 finished with 161 transaction.
Thread 1 finished with 159 transaction.
Thread 7 finished with 160 transaction.
Thread 11 finished with 161 transaction.
Thread 14 finished with 163 transaction.
Thread 3 finished with 161 transaction.
TotalTran
2626
These numbers look a little poor. I was expecting a lot closer to double going from one thread doing X work to 2 threads doing 2X work on this system. The threads did do about the same amount of work but not as much in one minutes time.
It gets even stranger when I move to Solaris.
On Solaris 11, using GCC 4.8.0, I build this program as follows:
gcc -o simple simpleThreads.cpp -I. -std=c++11 -DSOLARIS=1 -lstdc++ -lm
when i run "./simple 1", i get
Thread 1 finished with 19686 transaction.
TotalTran
19686
for "./simple 2", I get:
Thread 1 finished with 5248 transaction.
Thread 2 finished with 2484 transaction.
TotalTran
7732
On Solaris, the 2 thread case is much slower. I can't figure out what I'm doing wrong. I'm new to c++11 constructs and threads. So it's a double whammy. gcc -v shows the thread model is posix. Any help would be appreciated.
I was learning how to use ptrace and I faced a strange problem:
I wrote a program:
#include <cstdio>
#include <sys/mman.h>
#include <string.h>
#include <errno.h>
// Issues an mmap() call that cannot succeed (the length is -235 converted to
// size_t), then prints the returned value in decimal and as 32 bits, followed
// by the error text for the errno that mmap() left behind.
int main()
{
    void *const mapping = mmap(0, static_cast<size_t>(-235), PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    // Capture errno immediately: the printf() calls below may overwrite it.
    const int saved_errno = errno;

    const long x = reinterpret_cast<long>(mapping);
    printf("Child: x=%ld (",x);
    // Shift an unsigned value so that testing bit 31 is well defined and is
    // not sign-extended when long is wider than 32 bits.
    for(int i=31;i>=0;i--) putchar((static_cast<unsigned long>(x) >> i) & 1UL ? '1' : '0');
    printf(")\n");
    printf("Child errno: %s\n",strerror(saved_errno));
    return 0;
}
It simply makes an mmap syscall with wrong parameter. Then it prints return value (also in binary) and errno.
Here I have this program's output after executing it:
Child: x=-1 (11111111111111111111111111111111)
Child errno: Cannot allocate memory
And I run it with strace:
execve("./nic.e", ["./nic.e"], [/* 35 vars */]) = 0
uname({sys="Linux", node="dom", ...}) = 0
brk(0) = 0x9237000
brk(0x9237cd0) = 0x9237cd0
set_thread_area({entry_number:-1 -> 6, base_addr:0x9237830, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}) = 0
brk(0x9258cd0) = 0x9258cd0
brk(0x9259000) = 0x9259000
mmap2(NULL, 4294967061, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7798000
write(1, "Child: x=-1 (1111111111111111111"..., 47Child: x=-1 (11111111111111111111111111111111)
) = 47
write(1, "Child errno: Cannot allocate mem"..., 36Child errno: Cannot allocate memory
) = 36
exit_group(0) = ?
And strace tells that this wrong mmap returns -1 with error ENOMEM.
Till now everything is OK.
Here my code with ptrace (I cut everything not really needed):
#include <cstdio>
#include <cstdlib>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/reg.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
// Minimal syscall tracer for 32-bit x86 Linux. Forks, runs "nic.e" under
// ptrace, and prints the raw value found in eax at every mmap2 syscall exit
// (decimal plus 32 bits). The command-line arguments are not used.
int main(int argc,char**argv)
{
    const pid_t pid=fork();
    if(pid<0)
    {
        perror("fork");
        return 1;
    }
    if(pid==0)
    {
        ptrace(PTRACE_TRACEME,0,NULL,NULL);
        // Pass a real (minimal) argv and an empty environment; a NULL argv is
        // tolerated by Linux only and is documented as non-portable.
        char *const child_argv[]={const_cast<char*>("nic.e"),NULL};
        char *const child_envp[]={NULL};
        execve("nic.e",child_argv,child_envp);
        exit(1);
    }

    // Syscall stops come in entry/exit pairs; this is true while the entry of
    // an mmap2 call has been seen and its exit is still pending.
    bool mmap_back=false;
    while(true)
    {
        int status=0;
        if(waitpid(pid,&status,0)<0)
        {
            perror("waitpid");
            return 1;
        }
        if(WIFEXITED(status) || WIFSIGNALED(status)) return 0;

        // Signal to hand back to the child when it is resumed (0 = none).
        long deliver=0;
        if(WIFSTOPPED(status))
        {
            const int stop_signal=WSTOPSIG(status);
            if(stop_signal==SIGTRAP)
            {
                user_regs_struct regs;
                ptrace(PTRACE_GETREGS,pid,NULL,&regs);
                if(regs.orig_eax==__NR_mmap2)
                {
                    if(!mmap_back) mmap_back=true;
                    else
                    {
                        mmap_back=false;
                        // On failure eax holds the negated errno, not -1.
                        long x=regs.eax;
                        printf("mmap return: %ld (",x);
                        for(int j=31;j>=0;j--) putchar((static_cast<unsigned long>(x)>>j)&1UL?'1':'0');
                        printf(")\n");
                    }
                }
            }
            else
            {
                // A stop that is not a trace trap is a real signal meant for
                // the child; forward it rather than dropping it.
                deliver=stop_signal;
            }
        }
        ptrace(PTRACE_SYSCALL,pid,NULL,reinterpret_cast<void*>(deliver));
    }
    return 0;
}
It should print the same things that the child prints - the return values of the mmap2 syscalls.
But here's the output:
mmap return: -12 (11111111111111111111111111110100)
mmap return: -1216753664 (10110111011110011101000000000000)
Child: x=-1 (11111111111111111111111111111111)
Child errno: Cannot allocate memory
Why did mmap return -12? Am I capturing the return value incorrectly?
On 32-bit x86 linux %eax contains either the return value or the negated error value on error.
See e.g. 4.4 or 3.3 and 3.4.
A return value of -12 from the syscall means that the function failed and errno should be set to 12, which at least on my system matches ENOMEM.
It appears that strace is helpfully translating this for you. If you want your application to behave as strace, you should perform a test and translation similar to the one in 4.4:
// A raw syscall result in the top 2048 values of the unsigned range is a
// negated errno rather than a valid return value.
if ((unsigned long)(x) >= (unsigned long)(-2048)) {
    printf("syscall failed. errno = %ld\n", -(x));
}