In my test case I only send a "00" message via bufferevent_write.
Case 1: 20,000 TCP connections, sending "00" to each of them every 10 s, costs about 0.15 s.
Case 2: a single TCP connection, sending "00" 20,000 times every 10 s, costs about 0.015 s.
Please give me some suggestions to improve bufferevent_write performance.
I just want it to be as fast as possible, and I wonder: if bufferevent_write is asynchronous, why is sending 20,000 messages to 1 TCP connection so much faster than sending 1 message to each of 20,000 TCP connections?
CPU info:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz
Stepping: 7
CPU MHz: 2500.000
BogoMIPS: 5000.00
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32K
L1i cache: 32K
L2 cache: 1024K
L3 cache: 36608K
NUMA node0 CPU(s): 0-15
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl cpuid tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni
Memory info:
32G
The whole test case:
#include <event2/buffer.h>
#include <event2/bufferevent.h>
#include <event2/event.h>
#include <event2/listener.h>
#include <event2/thread.h>
#include <netinet/tcp.h>
#include <atomic>
#include <cerrno>
#include <csignal>
#include <cstring>
#include <ctime>
#include <deque>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <set>
#include <thread>
using namespace std::chrono_literals;
static event_base *kEventBase{nullptr};
static evconnlistener *kListener{nullptr};
static std::set<bufferevent *> kSessions{};
static std::mutex kSessionsMutex{};
static std::atomic_bool kRunning{false};
static void stop() {
  kRunning = false;
  if (kListener != nullptr) {
    evconnlistener_disable(kListener);
    std::cout << "normal listener stopped" << std::endl;
  }
  struct timeval local_timeval = {1, 0};
  if (kEventBase != nullptr) { event_base_loopexit(kEventBase, &local_timeval); }
}
static void handler(int sig) {
  std::cout << "get signal: " << sig << std::endl;
  stop();
}
static void ReadCallback(bufferevent *event, void *) {
  auto buffer = evbuffer_new();
  evbuffer_add_buffer(buffer, bufferevent_get_input(event));
  auto data_size = evbuffer_get_length(buffer);
  char data[data_size + 1];
  bzero(data, data_size + 1);
  evbuffer_remove(buffer, data, data_size);
  evbuffer_free(buffer);
  std::cout << "get data: " << data << std::endl;
}
static void EventCallback(bufferevent *event, short events, void *) {
  if (events & BEV_EVENT_EOF) {
    std::cout << "socket EOF" << std::endl;
  } else if (events & BEV_EVENT_ERROR) {
    std::cout << "socket error: " << evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR());
  } else if (events & BEV_EVENT_TIMEOUT) {
    std::cout << "socket read/write timeout" << std::endl;
  } else {
    std::cout << "unhandled socket events: " << std::to_string(events) << std::endl;
  }
  {
    std::lock_guard<std::mutex> local_lock_guard{kSessionsMutex};
    kSessions.erase(event);
    bufferevent_free(event);
  }
}
static void listenerCallback(evconnlistener *, evutil_socket_t socket, sockaddr *, int, void *) {
  bufferevent *event =
      bufferevent_socket_new(kEventBase, socket, BEV_OPT_CLOSE_ON_FREE | BEV_OPT_THREADSAFE);
  if (event == nullptr) {
    std::cout << "create buffer event failed" << std::endl;
    return;
  }
  int enable = 1;
  setsockopt(socket, IPPROTO_TCP, TCP_NODELAY, (void *)&enable, sizeof(enable));
  setsockopt(socket, IPPROTO_TCP, TCP_QUICKACK, (void *)&enable, sizeof(enable));
  bufferevent_setcb(event, ReadCallback, nullptr, EventCallback, nullptr);
  bufferevent_enable(event, EV_WRITE | EV_READ);
  // lock kSessions here: the send thread iterates it concurrently
  std::lock_guard<std::mutex> local_lock_guard{kSessionsMutex};
  kSessions.emplace(event);
}
int main(int argc, const char **argv) {
  signal(SIGTERM, handler);
  signal(SIGINT, handler);
  evthread_use_pthreads();
  // init
  kEventBase = event_base_new();
  if (kEventBase == nullptr) {
    std::cout << "cannot create event_base_miner_listener_" << std::endl;
    return -1;
  }
  sockaddr_in local_sin{};
  bzero(&local_sin, sizeof(local_sin));
  local_sin.sin_family = AF_INET;
  local_sin.sin_port = htons(1800u);
  local_sin.sin_addr.s_addr = htonl(INADDR_ANY);
  kListener = evconnlistener_new_bind(kEventBase,
                                      listenerCallback,
                                      nullptr,
                                      LEV_OPT_REUSEABLE | LEV_OPT_CLOSE_ON_FREE,
                                      -1,
                                      reinterpret_cast<sockaddr *>(&local_sin),
                                      static_cast<int>(sizeof(local_sin)));
  if (kListener == nullptr) {
    std::cout << "cannot create normal listener" << std::endl;
    return -1;
  }
  kRunning = true;
  std::thread thread_send_message([]() {
    while (kRunning) {
      {
        // case 1: If send to 20,000 tcp connection, and send "00" for each, it will cost 0.15s.
        std::lock_guard<std::mutex> local_lock_guard{kSessionsMutex};
        std::clock_t clock_start = std::clock();
        for (auto &it : kSessions) { bufferevent_write(it, "00", 2); }
        std::cout << "send message to all done, client count: " << kSessions.size()
                  << ", elapsed: " << std::clock() - clock_start << std::endl;
      }
      {
        // case 2: If send to 1 tcp connection, and send "00" 20,000 times, it will cost 0.015s.
        // std::lock_guard<std::mutex> local_lock_guard{kSessionsMutex};
        // for (auto &it : kSessions) {
        //   std::clock_t clock_start = std::clock();
        //   for (int i = 0; i < 20000; ++i) { bufferevent_write(it, "00", 2); }
        //   std::cout << "send message 20k times done, elapsed: " << std::clock() - clock_start
        //             << std::endl;
        // }
      }
      std::this_thread::sleep_for(10s);
    }
  });
  event_base_dispatch(kEventBase);
  if (thread_send_message.joinable()) { thread_send_message.join(); }
  {
    std::lock_guard<std::mutex> local_lock_guard{kSessionsMutex};
    for (auto &it : kSessions) { bufferevent_free(it); }
    kSessions.clear();
  }
  if (kListener != nullptr) {
    evconnlistener_free(kListener);
    kListener = nullptr;
  }
  if (kEventBase != nullptr) {
    event_base_free(kEventBase);
    kEventBase = nullptr;
  }
}
The minimal reproducible example:
// case 1: 20,000 tcp connections, and send "00" for each every 10s, it will cost 0.15s.
std::clock_t clock_start = std::clock();
for (auto &it : kSessions) { bufferevent_write(it, "00", 2); }
std::cout << "send message to all done, client count: " << kSessions.size()
          << ", elapsed: " << std::clock() - clock_start << std::endl;
// case 2: only 1 tcp connection, and send "00" 20,000 times every 10s, it will cost 0.015s.
for (auto &it : kSessions) {
  std::clock_t clock_start = std::clock();
  for (int i = 0; i < 20000; ++i) { bufferevent_write(it, "00", 2); }
  std::cout << "send message 20k times done, elapsed: " << std::clock() - clock_start
            << std::endl;
}
strace of case 1:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
56.32 29.519892 9 3135415 408444 futex
20.53 10.762191 7 1490532 epoll_ctl
15.25 7.992391 11 715355 writev
3.98 2.086553 45360 46 nanosleep
1.86 0.973074 11 85273 1 epoll_wait
0.62 0.324022 8 39267 19266 accept4
0.58 0.305246 6 48721 read
0.55 0.286858 6 48762 write
0.30 0.154980 4 40004 setsockopt
0.01 0.006486 5 1216 mprotect
0.01 0.002952 21 143 madvise
0.00 0.001018 7 152 brk
0.00 0.000527 6 94 clock_gettime
0.00 0.000023 3 8 openat
0.00 0.000021 21 1 mremap
0.00 0.000010 0 22 mmap
0.00 0.000007 1 9 close
0.00 0.000000 0 8 fstat
0.00 0.000000 0 3 munmap
0.00 0.000000 0 4 rt_sigaction
0.00 0.000000 0 1 rt_sigprocmask
0.00 0.000000 0 1 ioctl
0.00 0.000000 0 1 readv
0.00 0.000000 0 8 8 access
0.00 0.000000 0 1 socket
0.00 0.000000 0 1 bind
0.00 0.000000 0 1 listen
0.00 0.000000 0 1 clone
0.00 0.000000 0 1 execve
0.00 0.000000 0 4 getuid
0.00 0.000000 0 4 getgid
0.00 0.000000 0 4 geteuid
0.00 0.000000 0 4 getegid
0.00 0.000000 0 1 arch_prctl
0.00 0.000000 0 1 set_tid_address
0.00 0.000000 0 2 set_robust_list
0.00 0.000000 0 1 eventfd2
0.00 0.000000 0 1 epoll_create1
0.00 0.000000 0 1 pipe2
0.00 0.000000 0 1 prlimit64
------ ----------- ----------- --------- --------- ----------------
100.00 52.416251 5605075 427719 total
strace of case 2:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
66.66 0.151105 7 22506 3469 futex
9.74 0.022084 6 3709 1 epoll_wait
9.54 0.021624 4 5105 epoll_ctl
9.47 0.021466 8 2550 writev
2.47 0.005598 4 1263 write
1.70 0.003857 3 1246 read
0.18 0.000409 18 23 nanosleep
0.09 0.000197 4 46 clock_gettime
0.03 0.000068 4 16 mprotect
0.02 0.000035 2 21 mmap
0.01 0.000024 8 3 munmap
0.01 0.000019 10 2 1 accept4
0.01 0.000018 5 4 setsockopt
0.01 0.000015 8 2 set_robust_list
0.01 0.000014 4 4 rt_sigaction
0.01 0.000014 4 4 geteuid
0.01 0.000013 3 4 getgid
0.01 0.000012 3 4 getuid
0.01 0.000012 3 4 getegid
0.00 0.000011 1 8 fstat
0.00 0.000010 10 1 socket
0.00 0.000008 8 1 clone
0.00 0.000007 2 3 brk
0.00 0.000007 7 1 pipe2
0.00 0.000006 1 7 openat
0.00 0.000006 6 1 epoll_create1
0.00 0.000005 1 8 8 access
0.00 0.000005 5 1 bind
0.00 0.000005 5 1 eventfd2
0.00 0.000005 5 1 prlimit64
0.00 0.000004 1 7 close
0.00 0.000004 4 1 listen
0.00 0.000003 3 1 rt_sigprocmask
0.00 0.000003 3 1 arch_prctl
0.00 0.000003 3 1 set_tid_address
0.00 0.000000 0 1 execve
------ ----------- ----------- --------- --------- ----------------
100.00 0.226676 36561 3479 total
How to improve libevent bufferevent_write performance
Read the documentation of libevent, study its source code, and consider other event loop libraries like libev, Qt, Wt, libonion, POCO, etc.
Be aware of several points. I assume a modern Linux/x86-64 system.
You could profile your open-source event loop library (e.g. by compiling it from source with a recent GCC using the -pg -O2 flags), then use strace(1) and/or gprof(1) and/or perf(1) and/or time(1) (and also top(1), ps(1), proc(5), netstat(8), ip(8), ifconfig(8), tcpdump(8), xosview) to observe your entire Linux system. Of course, read time(7), epoll(7) and poll(2).
TCP/IP introduces some overhead, IP routing adds more, and a typical Ethernet frame is at least hundreds of bytes (with dozens of bytes of overhead). You certainly want to send(2) or recv(2) several hundred bytes at once. Sending short "00" messages (a couple of bytes of useful payload) is inefficient. Ensure that your application sends messages of hundreds of bytes at once, as sketched just below. You might consider some JSONRPC approach (and of course design your protocol at a higher level, with fewer but bigger messages each triggering more complex behavior) or some MPI one. A way to send fewer but higher-level messages is to embed an interpreter like Guile or Lua and send higher-level script chunks or requests (like NeWS did in the past, and PostgreSQL or exim do today).
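As an illustration of that batching idea, here is a minimal sketch (my own code, not from the question; SendBatched is a hypothetical helper, and it only pays off when several messages really are queued for the same peer): append the small messages to a temporary evbuffer and hand the whole thing to the connection with one bufferevent_write_buffer() call, so the event loop schedules one large write instead of many tiny ones.

#include <event2/buffer.h>
#include <event2/bufferevent.h>
#include <set>

// Hypothetical helper: instead of calling bufferevent_write() once per tiny
// "00" message, stage many messages in one evbuffer and hand the whole batch
// to the bufferevent in a single call.
static void SendBatched(const std::set<bufferevent *> &sessions,
                        const char *msg, size_t msg_len, int repeat) {
  for (auto *bev : sessions) {
    evbuffer *batch = evbuffer_new();        // temporary staging buffer
    for (int i = 0; i < repeat; ++i) {
      evbuffer_add(batch, msg, msg_len);     // append one small message
    }
    // One call moves (drains) the whole batch into the bufferevent's output
    // buffer, so the loop ends up issuing one writev per connection instead
    // of scheduling many tiny writes.
    bufferevent_write_buffer(bev, batch);
    evbuffer_free(batch);
  }
}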
For short and small communications, prefer running a few processes or threads on the same computer and use mq_overview(7), pipe(7), fifo(7), unix(7), avoiding Ethernet; see the local-channel sketch below.
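For illustration, a sketch of two such local channels with libevent (my own code; MakeLocalChannels is a made-up name): a bufferevent pair for communication inside one process, and an AF_UNIX socketpair(2), wrapped in bufferevents, for two threads or two processes created with fork(2) on the same machine.

#include <event2/bufferevent.h>
#include <event2/event.h>
#include <event2/util.h>
#include <sys/socket.h>

// Illustrative only: local channels that bypass the TCP/IP stack entirely.
int MakeLocalChannels(event_base *base) {
  // (1) Same process: a connected bufferevent pair, no socket at all.
  bufferevent *pair[2];
  if (bufferevent_pair_new(base, BEV_OPT_CLOSE_ON_FREE, pair) != 0) return -1;

  // (2) Same machine: an AF_UNIX socketpair wrapped in bufferevents.
  int fds[2];
  if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) != 0) return -1;
  evutil_make_socket_nonblocking(fds[0]);
  evutil_make_socket_nonblocking(fds[1]);
  bufferevent *left  = bufferevent_socket_new(base, fds[0], BEV_OPT_CLOSE_ON_FREE);
  bufferevent *right = bufferevent_socket_new(base, fds[1], BEV_OPT_CLOSE_ON_FREE);
  // Set callbacks with bufferevent_setcb() and bufferevent_enable() as usual.
  (void)left; (void)right;
  return 0;
}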
Most computers in 2020 are multi-core, and with care you could use Pthreads or std::thread (with one thread running on each core, so at least 2 or 4 different threads on a laptop, or a hundred threads on a powerful Linux server); see the sketch below. You'll need some synchronization code (e.g. std::mutex with std::lock_guard, or Pthread mutexes).
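A rough sketch of that idea applied to this server (my own code; RunWorkers, kWorkerCount and worker_bases are illustrative names, and EVLOOP_NO_EXIT_ON_EMPTY needs libevent 2.1+): run one event_base per worker thread and assign each accepted connection to one of them, so the epoll_ctl/writev work for 20,000 sockets is spread across the 16 cores instead of being funnelled through a single loop.

#include <event2/event.h>
#include <event2/thread.h>
#include <thread>
#include <vector>

// Sketch: one event_base per worker thread. The accept callback (not shown)
// would create each new bufferevent on worker_bases[next++ % kWorkerCount],
// so every loop only manages a share of the 20,000 sockets.
static constexpr int kWorkerCount = 8;

int RunWorkers() {
  evthread_use_pthreads();                   // must be called before the bases
  std::vector<event_base *> worker_bases;
  std::vector<std::thread> workers;
  for (int i = 0; i < kWorkerCount; ++i) {
    event_base *base = event_base_new();
    if (base == nullptr) return -1;
    worker_bases.push_back(base);
    workers.emplace_back([base]() {
      // Keep the loop alive even before any event is added (libevent 2.1+);
      // shut it down later with event_base_loopbreak() from another thread.
      event_base_loop(base, EVLOOP_NO_EXIT_ON_EMPTY);
    });
  }
  for (auto &t : workers) t.join();
  for (auto *base : worker_bases) event_base_free(base);
  return 0;
}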
Be aware of the C10K problem, and take inspiration from existing open-source server programs or libraries such as lighttpd, Wt, FLTK, REDIS, Vmime, libcurl, libonion (study their source code and observe their runtime behavior with gdb(1) and/or strace(1) or ltrace(1)).
The network might be the bottleneck (then you won't be able to gain performance by improving your code; you'll need to change your software architecture). Read more about cloud computing, distributed computing, XDR, ASN.1, SOAP, REST, Web services, libssh, π-calculus.
Notice that:
static void handler(int sig) {
  std::cout << "get signal: " << sig << std::endl;
  stop();
}
if registered with signal(2), is against the rules of signal-safety(7) (std::cout is not async-signal-safe), so you might use the pipe(7)-to-self trick suggested by Qt, or consider using the Linux-specific signalfd(2) system call.
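One simple alternative is libevent's own signal events, which are dispatched through the event loop (internally via a socketpair, i.e. the same self-pipe idea). A minimal sketch, assuming it lives in the same file as the stop() function above:

// The callback runs inside the event loop, not in signal context, so calling
// stop() and using std::cout here is fine.
static void SignalCallback(evutil_socket_t sig, short /*events*/, void *) {
  std::cout << "get signal: " << sig << std::endl;
  stop();
}

// In main(), instead of signal(SIGTERM, handler) / signal(SIGINT, handler):
//   event *sigterm = evsignal_new(kEventBase, SIGTERM, SignalCallback, nullptr);
//   event *sigint  = evsignal_new(kEventBase, SIGINT,  SignalCallback, nullptr);
//   event_add(sigterm, nullptr);
//   event_add(sigint, nullptr);
//   ... and event_free() both after event_base_dispatch() returns.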
Read also Advanced Linux Programming then syscalls(2) and socket(7) and tcp(7).
So I have a simple recursive C++ program, very basic:
#include <iostream>

int fibonacciRec(int no) {
  if (no == 0 || no == 1)
    return no;
  else
    return fibonacciRec(no - 1) + fibonacciRec(no - 2);
}

int main(int argc, char **argv) {
  int no = 42;
  for (int i = 1; i <= no; i++) {
    std::cout << fibonacciRec(i - 1) << " ";
  }
  std::cout << std::endl;
  return 0;
}
Now I want to run strace on this program, showing all the system calls. Basically I want to see a lot of mmaps etc., but as soon as the first loop is entered, strace -f stops following the system calls and only shows the last write call. Also, strace -c gives unlikely numbers, since the program takes well more than 4-6 seconds to compute:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
60.47 0.000078 78 1 munmap
26.36 0.000034 11 3 brk
13.18 0.000017 3 6 fstat
0.00 0.000000 0 4 read
0.00 0.000000 0 1 write
0.00 0.000000 0 5 close
0.00 0.000000 0 14 mmap
0.00 0.000000 0 10 mprotect
0.00 0.000000 0 6 6 access
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 arch_prctl
0.00 0.000000 0 5 openat
------ ----------- ----------- --------- --------- ----------------
100.00 0.000129 57 6 total
There's no need for any mmaps or any other system calls when fibonacciRec is running.
The only memory that might be allocated is stack memory for the recursive calls, and there are several reasons why those don't show up in the strace:
It's really not a lot of memory. Your maximum recursion depth is about 42, and you've only got 1 local variable, so the stack frames are small. The total stack allocated during the recursion is probably less than 1 page.
Even if it was a lot of memory, the stack allocation only grows, it never shrinks, so you'd see it grow to its maximum pretty quickly, then stay there for a long time. It wouldn't be a flood.
Stack allocation isn't done with a system call anyway. To ask the kernel for more stack, all you have to do is pretend you already have it. The kernel catches the page fault, notices that the faulting address is near your existing stack, and allocates more. It's so transparent that even strace can't see it.
Apart from calling itself and returning a value, fibonacciRec doesn't do anything but manipulate local variables. There are no system calls.
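To make the page-fault point visible, here is a small, separate sketch (my own illustration, not part of the original program): it forces a few megabytes of stack growth through deep recursion and reports the minor-fault count from getrusage(2); running it under strace -c will still show no mmap/brk calls for that growth.

#include <sys/resource.h>
#include <cstdio>

// Each call burns roughly one page of stack, so a deep recursion touches many
// new stack pages. Those pages are supplied by the kernel's page-fault
// handler, not by mmap/brk, so strace stays silent while ru_minflt climbs.
static long deep(int depth) {
  volatile char pad[4096];               // force ~one page per frame
  pad[0] = static_cast<char>(depth);
  return depth == 0 ? pad[0] : pad[0] + deep(depth - 1);
}

int main() {
  rusage before{}, after{};
  getrusage(RUSAGE_SELF, &before);
  long r = deep(1000);                   // ~4 MB of stack, under the 8 MB default limit
  getrusage(RUSAGE_SELF, &after);
  std::printf("result=%ld, minor faults during recursion: %ld\n",
              r, after.ru_minflt - before.ru_minflt);
  return 0;
}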
I am using a relatively simple code parallelized with OpenMP to familiarize myself with gprof.
My code mainly consists of gathering data from input files, performing some array manipulations, and writing the new data to different output files. I placed some calls to the intrinsic subroutine CPU_TIME to see if gprof was being accurate:
PROGRAM main
USE global_variables
USE fileio, ONLY: read_old_restart, write_new_restart, output_slice, write_solution
USE change_vars
IMPLICIT NONE
REAL(dp) :: t0, t1
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL allocate_data
CALL CPU_TIME(t1)
PRINT*, "Allocate data =", t1 - t0
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL build_grid
CALL CPU_TIME(t1)
PRINT*, "Build grid =", t1 - t0
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL read_old_restart
CALL CPU_TIME(t1)
PRINT*, "Read restart =", t1 - t0
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL regroup_all
CALL CPU_TIME(t1)
PRINT*, "Regroup all =", t1 - t0
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL redistribute_all
CALL CPU_TIME(t1)
PRINT*, "Redistribute =", t1 - t0
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CALL CPU_TIME(t0)
CALL write_new_restart
CALL CPU_TIME(t1)
PRINT*, "Write restart =", t1 - t0
END PROGRAM main
Here is the output:
Allocate data = 1.000000000000000E-003
Build grid = 0.000000000000000E+000
Read restart = 10.7963590000000
Regroup all = 6.65998700000000
Redistribute = 14.3518180000000
Write restart = 53.5218640000000
Therefore, the write_new_restart subroutine is the most time consuming and takes about 62% of the total run time. However, according to gprof, the subroutine redistribute_vars, which is called multiple times by redistribute_all, is the most time consuming, with 70% of the total time. Here is the output from gprof:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
74.40 8.95 8.95 61 0.15 0.15 change_vars_mp_redistribute_vars_
19.12 11.25 2.30 60 0.04 0.04 change_vars_mp_regroup_vars_
6.23 12.00 0.75 63 0.01 0.01 change_vars_mp_fill_last_blocks_
0.08 12.01 0.01 1 0.01 2.31 change_vars_mp_regroup_all_
0.08 12.02 0.01 __intel_ssse3_rep_memcpy
0.08 12.03 0.01 for_open
0.00 12.03 0.00 1 0.00 12.01 MAIN__
0.00 12.03 0.00 1 0.00 0.00 change_vars_mp_build_grid_
0.00 12.03 0.00 1 0.00 9.70 change_vars_mp_redistribute_all_
0.00 12.03 0.00 1 0.00 0.00 fileio_mp_read_old_restart_
0.00 12.03 0.00 1 0.00 0.00 fileio_mp_write_new_restart_
0.00 12.03 0.00 1 0.00 0.00 global_variables_mp_allocate_data_
index % time self children called name
0.00 12.01 1/1 main [2]
[1] 99.8 0.00 12.01 1 MAIN__ [1]
0.00 9.70 1/1 change_vars_mp_redistribute_all_ [3]
0.01 2.30 1/1 change_vars_mp_regroup_all_ [5]
0.00 0.00 1/1 global_variables_mp_allocate_data_ [13]
0.00 0.00 1/1 change_vars_mp_build_grid_ [10]
0.00 0.00 1/1 fileio_mp_read_old_restart_ [11]
0.00 0.00 1/1 fileio_mp_write_new_restart_ [12]
-----------------------------------------------
<spontaneous>
[2] 99.8 0.00 12.01 main [2]
0.00 12.01 1/1 MAIN__ [1]
-----------------------------------------------
0.00 9.70 1/1 MAIN__ [1]
[3] 80.6 0.00 9.70 1 change_vars_mp_redistribute_all_ [3]
8.95 0.00 61/61 change_vars_mp_redistribute_vars_ [4]
0.75 0.00 63/63 change_vars_mp_fill_last_blocks_ [7]
-----------------------------------------------
8.95 0.00 61/61 change_vars_mp_redistribute_all_ [3]
[4] 74.4 8.95 0.00 61 change_vars_mp_redistribute_vars_ [4]
-----------------------------------------------
0.01 2.30 1/1 MAIN__ [1]
[5] 19.2 0.01 2.30 1 change_vars_mp_regroup_all_ [5]
2.30 0.00 60/60 change_vars_mp_regroup_vars_ [6]
-----------------------------------------------
2.30 0.00 60/60 change_vars_mp_regroup_all_ [5]
[6] 19.1 2.30 0.00 60 change_vars_mp_regroup_vars_ [6]
-----------------------------------------------
0.75 0.00 63/63 change_vars_mp_redistribute_all_ [3]
[7] 6.2 0.75 0.00 63 change_vars_mp_fill_last_blocks_ [7]
-----------------------------------------------
<spontaneous>
[8] 0.1 0.01 0.00 for_open [8]
-----------------------------------------------
<spontaneous>
[9] 0.1 0.01 0.00 __intel_ssse3_rep_memcpy [9]
-----------------------------------------------
0.00 0.00 1/1 MAIN__ [1]
[10] 0.0 0.00 0.00 1 change_vars_mp_build_grid_ [10]
-----------------------------------------------
0.00 0.00 1/1 MAIN__ [1]
[11] 0.0 0.00 0.00 1 fileio_mp_read_old_restart_ [11]
-----------------------------------------------
0.00 0.00 1/1 MAIN__ [1]
[12] 0.0 0.00 0.00 1 fileio_mp_write_new_restart_ [12]
-----------------------------------------------
0.00 0.00 1/1 MAIN__ [1]
[13] 0.0 0.00 0.00 1 global_variables_mp_allocate_data_ [13]
-----------------------------------------------
For your information, regroup_all calls regroup_vars multiple times and redistribute_all calls redistribute_vars and fill_last_blocks multiple times.
I am compiling my code with ifort with the -openmp -O2 -pg options.
QUESTION:
Why is gprof not seeing the time my file i/o subroutines take? (read_old_restart, write_new_restart)
gprof specifically does not include I/O time. It only tries to measure CPU time.
That's because it only does two things: 1) sample the program counter on a 1/100 second clock, and the program counter is meaningless during I/O, and 2) count the number of times any function B is called by any function A.
From the call-counts, it tries to guess how much of each function's CPU time can be attributed to each caller.
That's its whole advance over pre-existing profilers.
When you use gprof, you should understand what it does and what it doesn't do.
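As a concrete, if artificial, illustration of that point, the following toy C++ program is my own sketch (not related to the Fortran code above; spin, sleeper and the counts are made up): compiled with -pg, gprof attributes essentially all samples to spin(), while the wall clock shows that the blocking in sleeper() (standing in for I/O waits) dominates the real run time.

// Build and profile with something like:
//   g++ -O0 -pg demo.cpp -o demo && ./demo && gprof ./demo gmon.out
#include <chrono>
#include <cstdio>
#include <thread>

volatile double sink = 0.0;

void spin() {                 // CPU-bound: accumulates gprof samples
  for (long i = 0; i < 200000000L; ++i) sink += i * 0.5;
}

void sleeper() {              // blocked, like I/O: nearly invisible to gprof
  std::this_thread::sleep_for(std::chrono::seconds(5));
}

int main() {
  auto t0 = std::chrono::steady_clock::now();
  spin();
  sleeper();
  std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
  // The wall-clock total is ~5 s larger than anything gprof will report.
  std::printf("wall clock: %.2f s\n", dt.count());
  return 0;
}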
I have a program which opens a large number of files. I am timing the execution of a C++ loop which literally just opens and closes the files, using both a C++ timer and strace. Strangely, the system time and the time logged by C++ (which agree with each other) are orders of magnitude larger than the time strace claims was spent in system calls. How can this be? I have put the source and output below.
This all came about because I found that my application was spending an unreasonable amount of time just to open files. To help me pin down the problem I wrote the following test code (for reference the file "files.csv" is just a list with one filepath per line):
#include <stdio.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <fstream>
#include <string>
#include <vector>
using namespace std;

int main(){
  timespec start, end;
  ifstream fin("files.csv");
  string line;
  vector<string> files;
  while(fin >> line){
    files.push_back(line);
  }
  fin.close();
  clock_gettime(CLOCK_MONOTONIC, &start);
  for(int i=0; i<500; i++){
    int filedesc = open(files[i].c_str(), O_RDONLY);  // int (not size_t), so the < 0 check can fire
    if(filedesc < 0) printf("error in open");
    if(close(filedesc) < 0) printf("error in close");
  }
  clock_gettime(CLOCK_MONOTONIC, &end);
  printf(" %fs elapsed\n", (end.tv_sec-start.tv_sec) + ((float)(end.tv_nsec - start.tv_nsec))/1000000000);
  return 0;
}
And here is what I get when I run it:
-bash$ time strace -ttT -c ./open_stuff
5.162448s elapsed <------ Output from C++ code
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
99.72 0.043820 86 508 open <------output from strace
0.15 0.000064 0 508 close
0.14 0.000061 0 705 read
0.00 0.000000 0 1 write
0.00 0.000000 0 8 fstat
0.00 0.000000 0 25 mmap
0.00 0.000000 0 12 mprotect
0.00 0.000000 0 3 munmap
0.00 0.000000 0 52 brk
0.00 0.000000 0 2 rt_sigaction
0.00 0.000000 0 1 rt_sigprocmask
0.00 0.000000 0 1 1 access
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 getrlimit
0.00 0.000000 0 1 arch_prctl
0.00 0.000000 0 3 1 futex
0.00 0.000000 0 1 set_tid_address
0.00 0.000000 0 1 set_robust_list
------ ----------- ----------- --------- --------- ----------------
100.00 0.043945 1834 2 total
real 0m5.821s <-------output from time
user 0m0.031s
sys 0m0.084s
In theory the reported "elapsed" time from C++ should be the execution time of the calls to open(2) plus the minimal overhead of executing a for loop 500 times. And yet the sum of the total time in open(2) and close(2) calls from strace is 99% shorter. I cannot figure out what is going on.
PS The difference between the C++ elapsed time and the system time is due to the fact that files.csv actually contains tens of thousands of paths, which all get loaded.
Comparing elapsed time with execution time is like comparing apples with orange juice. (One of them is missing the pulp. :) ) To open a file, the system has to find and read the appropriate directory entry... and if the paths are deep, it might need to read a number of directory entries. If the entries are not cached, they will need to be read from disk, which will involve a disk seek. While the disk heads are moving, and while the sector is spinning around to where the disk heads are, the wall clock keeps ticking, but the CPU can be doing other stuff (if there is work to do). So that counts as elapsed time -- the inexorable clock ticks on -- but not as execution time.
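To see that split directly, here is a small sketch in the spirit of the original test (my own code; "/etc/hostname" is just a stand-in path): it times the same open()/close() loop against both CLOCK_MONOTONIC, which keeps ticking while the process is blocked on disk, and CLOCK_PROCESS_CPUTIME_ID, which only advances while the process is actually executing and is closer to what strace's per-syscall accounting adds up.

#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <cstdio>

// Difference between two timespec values, in seconds.
static double seconds(const timespec &a, const timespec &b) {
  return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main() {
  timespec wall0, wall1, cpu0, cpu1;
  clock_gettime(CLOCK_MONOTONIC, &wall0);          // wall clock: includes blocked time
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu0);  // CPU clock: only time spent running
  for (int i = 0; i < 500; ++i) {
    int fd = open("/etc/hostname", O_RDONLY);      // stand-in; the original opened files.csv entries
    if (fd >= 0) close(fd);
  }
  clock_gettime(CLOCK_MONOTONIC, &wall1);
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu1);
  // On a cold cache the wall figure can dwarf the CPU figure; the gap is time
  // spent blocked (seeks, contention), which is invisible to CPU accounting.
  std::printf("wall: %fs  cpu: %fs\n", seconds(wall0, wall1), seconds(cpu0, cpu1));
  return 0;
}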
I was learning to use gprof and then I got weird results for this code:
int one(int a, int b)
{
  int i, r = 0;
  for (i = 0; i < 1000; i++)
  {
    r += b / (a + 1);
  }
  return r;
}

int two(int a, int b)
{
  int i, r = 0;
  for (i = 0; i < 1000; i++)
  {
    r += b / (a + 1);
  }
  return r;
}

int main()
{
  for (int i = 1; i < 50000; i++)
  {
    one(i, i * 2);
    two(i, i * 2);
  }
  return 0;
}
And this is the profiler output:
% cumulative self self total
time seconds seconds calls us/call us/call name
50.67 1.14 1.14 49999 22.80 22.80 two(int, int)
49.33 2.25 1.11 49999 22.20 22.20 one(int, int)
If I reverse the call order, the result is inverted as well: whichever function is called second appears to take more time.
Both are the same function, but the first one called always takes less time than the second.
Why is that?
Note: the assembly code is exactly the same, and the code is being compiled with no optimizations.
I'd guess it is some fluke in run-time optimisation - one uses a register and the other doesn't, or something minor like that.
The system clock probably runs to a precision of 100 nsec. An average call time of 30 nsec or 25 nsec is less than one clock tick. A rounding error of 5% of a clock tick is pretty small. Both times are near enough to zero.
My guess: it is an artifact of the way mcount data gets interpreted. The granularity for mcount (monitor.h) is on the order of a 32 bit longword - 4 bytes on my system. So you would not expect this: I get different reports from prof vs gprof on the EXACT same mon.out file.
Solaris 9 - prof:
%Time  Seconds  Cumsecs     #Calls  msec/call  Name
 46.4     2.35     2.35   59999998     0.0000  .div
 34.8     1.76     4.11  120000025     0.0000  _mcount
 10.1     0.51     4.62          1       510.  main
  5.3     0.27     4.89   29999999     0.0000  one
  3.4     0.17     5.06   29999999     0.0000  two
  0.0     0.00     5.06          1         0.  _fpsetsticky
  0.0     0.00     5.06          1         0.  _exithandle
  0.0     0.00     5.06          1         0.  _profil
  0.0     0.00     5.06         20        0.0  _private_exit, _exit
  0.0     0.00     5.06          1         0.  exit
  0.0     0.00     5.06          4         0.  atexit
gprof
% cumulative self self total
time seconds seconds calls ms/call ms/call name
71.4 0.90 0.90 1 900.00 900.00 key_2_text <cycle 3> [2]
5.6 0.97 0.07 106889 0.00 0.00 _findbuf [9]
4.8 1.03 0.06 209587 0.00 0.00 _findiop [11]
4.0 1.08 0.05 __do_global_dtors_aux [12]
2.4 1.11 0.03 mem_init [13]
1.6 1.13 0.02 102678 0.00 0.00 _doprnt [3]
1.6 1.15 0.02 one [14]
1.6 1.17 0.02 two [15]
0.8 1.18 0.01 414943 0.00 0.00 realloc <cycle 3> [16]
0.8 1.19 0.01 102680 0.00 0.00 _textdomain_u <cycle 3> [21]
0.8 1.20 0.01 102677 0.00 0.00 get_mem [17]
0.8 1.21 0.01 $1 [18]
0.8 1.22 0.01 $2 [19]
0.8 1.23 0.01 _alloc_profil_buf [22]
0.8 1.24 0.01 _mcount (675)
Is it always the first one called that is slightly slower? If that's the case, I would guess it is a CPU cache doing its thing, or it could be lazy paging by the operating system.
BTW: what optimization flags are you compiling with?