Related
My server crashes when I gracefully close a client that is connected to it, while the client is receiving a large chunk of data. I am thinking of a possible lifetime bug as with the most bugs in boost ASIO, however I was not able to point out my mistake myself.
Each client establishes 2 connection with the server, one of them is for syncing, the other connection is long-lived one to receive continuous updates. In the "syncing phase" client receives large data to sync with the server state ("state" is basically DB data in JSON format). After syncing, sync connection is closed. Client receives updates to the DB as it happens (these are of course very small data compared to "syncing data") via the other connection.
These are the relevant files:
connection.h
#pragma once
#include <array>
#include <memory>
#include <string>
#include <boost/asio.hpp>
class ConnectionManager;
/// Represents a single connection from a client.
class Connection : public std::enable_shared_from_this<Connection>
{
public:
Connection(const Connection&) = delete;
Connection& operator=(const Connection&) = delete;
/// Construct a connection with the given socket.
explicit Connection(boost::asio::ip::tcp::socket socket, ConnectionManager& manager);
/// Start the first asynchronous operation for the connection.
void start();
/// Stop all asynchronous operations associated with the connection.
void stop();
/// Perform an asynchronous write operation.
void do_write(const std::string& buffer);
int getNativeHandle();
~Connection();
private:
/// Perform an asynchronous read operation.
void do_read();
/// Socket for the connection.
boost::asio::ip::tcp::socket socket_;
/// The manager for this connection.
ConnectionManager& connection_manager_;
/// Buffer for incoming data.
std::array<char, 8192> buffer_;
std::string outgoing_buffer_;
};
typedef std::shared_ptr<Connection> connection_ptr;
connection.cpp
#include "connection.h"
#include <utility>
#include <vector>
#include <iostream>
#include <thread>
#include "connection_manager.h"
Connection::Connection(boost::asio::ip::tcp::socket socket, ConnectionManager& manager)
: socket_(std::move(socket))
, connection_manager_(manager)
{
}
void Connection::start()
{
do_read();
}
void Connection::stop()
{
socket_.close();
}
Connection::~Connection()
{
}
void Connection::do_read()
{
auto self(shared_from_this());
socket_.async_read_some(boost::asio::buffer(buffer_), [this, self](boost::system::error_code ec, std::size_t bytes_transferred) {
if (!ec) {
std::string buff_str = std::string(buffer_.data(), bytes_transferred);
const auto& tokenized_buffer = split(buff_str, ' ');
if(!tokenized_buffer.empty() && tokenized_buffer[0] == "sync") {
/// "syncing connection" sends a specific text
/// hence I can separate between sycing and long-lived connections here and act accordingly.
const auto& exec_json_strs = getExecutionJsons();
const auto& order_json_strs = getOrdersAsJsons();
const auto& position_json_strs = getPositionsAsJsons();
const auto& all_json_strs = exec_json_strs + order_json_strs + position_json_strs + createSyncDoneJson();
/// this is potentially a very large data.
do_write(all_json_strs);
}
do_read();
} else {
connection_manager_.stop(shared_from_this());
}
});
}
void Connection::do_write(const std::string& write_buffer)
{
outgoing_buffer_ = write_buffer;
auto self(shared_from_this());
boost::asio::async_write(socket_, boost::asio::buffer(outgoing_buffer_, outgoing_buffer_.size()), [this, self](boost::system::error_code ec, std::size_t transfer_size) {
if (!ec) {
/// everything is fine.
} else {
/// what to do here?
/// server crashes once I get error code 32 (EPIPE) here.
}
});
}
connection_manager.h
#pragma once
#include <set>
#include "connection.h"
/// Manages open connections so that they may be cleanly stopped when the server
/// needs to shut down.
class ConnectionManager
{
public:
ConnectionManager(const ConnectionManager&) = delete;
ConnectionManager& operator=(const ConnectionManager&) = delete;
/// Construct a connection manager.
ConnectionManager();
/// Add the specified connection to the manager and start it.
void start(connection_ptr c);
/// Stop the specified connection.
void stop(connection_ptr c);
/// Stop all connections.
void stop_all();
void sendAllConnections(const std::string& buffer);
private:
/// The managed connections.
std::set<connection_ptr> connections_;
};
connection_manager.cpp
#include "connection_manager.h"
ConnectionManager::ConnectionManager()
{
}
void ConnectionManager::start(connection_ptr c)
{
connections_.insert(c);
c->start();
}
void ConnectionManager::stop(connection_ptr c)
{
connections_.erase(c);
c->stop();
}
void ConnectionManager::stop_all()
{
for (auto c: connections_)
c->stop();
connections_.clear();
}
/// this function is used to keep clients up to date with the changes, not used during syncing phase.
void ConnectionManager::sendAllConnections(const std::string& buffer)
{
for (auto c: connections_)
c->do_write(buffer);
}
server.h
#pragma once
#include <boost/asio.hpp>
#include <string>
#include "connection.h"
#include "connection_manager.h"
class Server
{
public:
Server(const Server&) = delete;
Server& operator=(const Server&) = delete;
/// Construct the server to listen on the specified TCP address and port, and
/// serve up files from the given directory.
explicit Server(const std::string& address, const std::string& port);
/// Run the server's io_service loop.
void run();
void deliver(const std::string& buffer);
private:
/// Perform an asynchronous accept operation.
void do_accept();
/// Wait for a request to stop the server.
void do_await_stop();
/// The io_service used to perform asynchronous operations.
boost::asio::io_service io_service_;
/// The signal_set is used to register for process termination notifications.
boost::asio::signal_set signals_;
/// Acceptor used to listen for incoming connections.
boost::asio::ip::tcp::acceptor acceptor_;
/// The connection manager which owns all live connections.
ConnectionManager connection_manager_;
/// The *NEXT* socket to be accepted.
boost::asio::ip::tcp::socket socket_;
};
server.cpp
#include "server.h"
#include <signal.h>
#include <utility>
Server::Server(const std::string& address, const std::string& port)
: io_service_()
, signals_(io_service_)
, acceptor_(io_service_)
, connection_manager_()
, socket_(io_service_)
{
// Register to handle the signals that indicate when the server should exit.
// It is safe to register for the same signal multiple times in a program,
// provided all registration for the specified signal is made through Asio.
signals_.add(SIGINT);
signals_.add(SIGTERM);
#if defined(SIGQUIT)
signals_.add(SIGQUIT);
#endif // defined(SIGQUIT)
do_await_stop();
// Open the acceptor with the option to reuse the address (i.e. SO_REUSEADDR).
boost::asio::ip::tcp::resolver resolver(io_service_);
boost::asio::ip::tcp::endpoint endpoint = *resolver.resolve({address, port});
acceptor_.open(endpoint.protocol());
acceptor_.set_option(boost::asio::ip::tcp::acceptor::reuse_address(true));
acceptor_.bind(endpoint);
acceptor_.listen();
do_accept();
}
void Server::run()
{
// The io_service::run() call will block until all asynchronous operations
// have finished. While the server is running, there is always at least one
// asynchronous operation outstanding: the asynchronous accept call waiting
// for new incoming connections.
io_service_.run();
}
void Server::do_accept()
{
acceptor_.async_accept(socket_,
[this](boost::system::error_code ec)
{
// Check whether the server was stopped by a signal before this
// completion handler had a chance to run.
if (!acceptor_.is_open())
{
return;
}
if (!ec)
{
connection_manager_.start(std::make_shared<Connection>(
std::move(socket_), connection_manager_));
}
do_accept();
});
}
void Server::do_await_stop()
{
signals_.async_wait(
[this](boost::system::error_code /*ec*/, int /*signo*/)
{
// The server is stopped by cancelling all outstanding asynchronous
// operations. Once all operations have finished the io_service::run()
// call will exit.
acceptor_.close();
connection_manager_.stop_all();
});
}
/// this function is used to keep clients up to date with the changes, not used during syncing phase.
void Server::deliver(const std::string& buffer)
{
connection_manager_.sendAllConnections(buffer);
}
So, I am repeating my question: My server crashes when I gracefully close a client that is connected to it, while the client is receiving a large chunk of data and I do not know why.
Edit: Crash happens in async_write function, as soon as I receive EPIPE error. The application is multithreaded. There are 4 threads that call Server::deliver with their own data as it is produced. deliver() is used for keeping clients up to date, it has nothing to do with the initial syncing: syncing is done with persistent data fetched from db.
I had a single io_service, so I thought that I would not need strands. io_service::run is called on main thread, so the main thread is blocking.
Reviewing, adding some missing code bits:
namespace /*missing code stubs*/ {
auto split(std::string_view input, char delim) {
std::vector<std::string_view> result;
boost::algorithm::split(result, input,
boost::algorithm::is_from_range(delim, delim));
return result;
}
std::string getExecutionJsons() { return ""; }
std::string getOrdersAsJsons() { return ""; }
std::string getPositionsAsJsons() { return ""; }
std::string createSyncDoneJson() { return ""; }
}
Now the things I notice are:
you have a single io_service, so a single thread. Okay, so no strands should be required unless you have threads in your other code (main, e.g.?).
A particular reason to suspect that threads are at play is that nobody could possibly call Server::deliver because run() is blocking. This means that whenever you call deliver() now it causes a data race, which leads to Undefined Behaviour
The casual comment
/// this function is used to keep clients up to date with the changes,
/// not used during syncing phase.
does not do much to remove this concern. The code needs to defend against misuse. Comments do not get executed. Make it better:
void Server::deliver(const std::string& buffer) {
post(io_context_,
[this, buffer] { connection_manager_.broadcast(std::move(buffer)); });
}
you do not check that previous writes are completed before accepting a "new" one. This means that calling Connection::do_write results in Undefined Behaviour for two reasons:
modifying outgoing_buffer_ during an ongoing async operation that uses that buffer is UB
having two overlapped async_write on the same IO object is UB (see docs
The typical way to fix that is to have a queue of outgoing messages instead.
using async_read_some is rarely what you want, especially since the reads don't accumulate into a dynamic buffer. This means that if your packets get separated at unexpected boundaries, you may not detect commands at all, or incorrectly.
Instead consider asio::async_read_until with a dynamic buffer (e.g.
read directly into std::string so you don't have to copy the buffer into a string
read into streambuf so you can use std::istream(&sbuf_) to parse instead of tokenizing
Concatenating all_json_strs which clearly have to be owning text containers is wasteful. Instead, use a const-buffer-sequence to combine them all without copying.
Better yet, consider a streaming approach to JSON serialization so not all the JSON needs to be serialized in memory at any given time.
Don't declare empty destructors (~Connection). They're pessimizations
Likewise for empty constructors (ConnectionManager). If you must, consider
ConnectionManager::ConnectionManager() = default;
The getNativeHandle gives me more questions about other code that may interfere. E.g. it may indicate other libraries doing operations, which again can lead to overlapped reads/writes, or it could be a sign of more code living on threads (as Server::run() is by definition blocking)
Connection manager should probably hold weak_ptr, so Connections could eventually terminate. Now, the last reference is by defintion held in the connection manager, meaning nothing ever gets destructed when the peer disconnects or the session fails for some other reason.
This is not idiomatic:
// Check whether the server was stopped by a signal before this
// completion handler had a chance to run.
if (!acceptor_.is_open()) {
return;
}
If you closed the acceptor, the completion handler is called with error::operation_aborted anyways. Simply handle that, e.g. in the final version I'll post later:
// separate strand for each connection - just in case you ever add threads
acceptor_.async_accept(
make_strand(io_context_), [this](error_code ec, tcp::socket sock) {
if (!ec) {
connection_manager_.register_and_start(
std::make_shared<Connection>(std::move(sock),
connection_manager_));
do_accept();
}
});
I notice this comment:
// The server is stopped by cancelling all outstanding asynchronous
// operations. Once all operations have finished the io_service::run()
// call will exit.
In fact you never cancel() any operation on any IO object in your code. Again, comments aren't executed. It's better to indeed do as you say, and let the destructors close the resources. This prevents spurious errors when objects are used-after-close, and also prevents very annoying race conditions when e.g. you closed the handle, some other thread re-opened a new stream on the same filedescriptor and you had given out the handle to a third party (using getNativeHandle)... you see where this leads?
Reproducing The Problem?
Having reviewed this way, I tried to repro the issue, so I created fake data:
std::string getExecutionJsons() { return std::string(1024, 'E'); }
std::string getOrdersAsJsons() { return std::string(13312, 'O'); }
std::string getPositionsAsJsons() { return std::string(8192, 'P'); }
std::string createSyncDoneJson() { return std::string(24576, 'C'); }
With some minor tweaks to the Connection class:
std::string buff_str =
std::string(buffer_.data(), bytes_transferred);
const auto& tokenized_buffer = split(buff_str, ' ');
if (!tokenized_buffer.empty() &&
tokenized_buffer[0] == "sync") {
std::cerr << "sync detected on " << socket_.remote_endpoint() << std::endl;
/// "syncing connection" sends a specific text
/// hence I can separate between sycing and long-lived
/// connections here and act accordingly.
const auto& exec_json_strs = getExecutionJsons();
const auto& order_json_strs = getOrdersAsJsons();
const auto& position_json_strs = getPositionsAsJsons();
const auto& all_json_strs = exec_json_strs +
order_json_strs + position_json_strs +
createSyncDoneJson();
std::cerr << "All json length: " << all_json_strs.length() << std::endl;
/// this is potentially a very large data.
do_write(all_json_strs); // already on strand!
}
We get the server outputting
sync detected on 127.0.0.1:43012
All json length: 47104
sync detected on 127.0.0.1:43044
All json length: 47104
And clients faked with netcat:
$ netcat localhost 8989 <<< 'sync me' > expected
^C
$ wc -c expected
47104 expected
Good. Now let's cause premature disconnect:
netcat localhost 8989 -w0 <<< 'sync me' > truncated
$ wc -c truncated
0 truncated
So, it does lead to early close, but server still says
sync detected on 127.0.0.1:44176
All json length: 47104
Let's instrument do_write as well:
async_write( //
socket_, boost::asio::buffer(outgoing_buffer_, outgoing_buffer_.size()),
[/*this,*/ self](error_code ec, size_t transfer_size) {
std::cerr << "do_write completion: " << transfer_size << " bytes ("
<< ec.message() << ")" << std::endl;
if (!ec) {
/// everything is fine.
} else {
/// what to do here?
// FIXME: probably cancel the read loop so the connection
// closes?
}
});
Now we see:
sync detected on 127.0.0.1:44494
All json length: 47104
do_write completion: 47104 bytes (Success)
sync detected on 127.0.0.1:44512
All json length: 47104
do_write completion: 32768 bytes (Operation canceled)
For one disconnected and one "okay" connection.
No sign of crashes/undefined behaviour. Let's check with -fsanitize=address,undefined: clean record, even adding a heartbeat:
int main() {
Server s("127.0.0.1", "8989");
std::thread yolo([&s] {
using namespace std::literals;
int i = 1;
do {
std::this_thread::sleep_for(5s);
} while (s.deliver("HEARTBEAT DEMO " + std::to_string(i++)));
});
s.run();
yolo.join();
}
Conclusion
The only problem highlighted above that weren't addressed were:
additional threading issues not shown (perhaps via getNativeHandle)
the fact that you can have overlapping writes in the Connection do_write. Fixing that:
void Connection::write(std::string msg) { // public, might not be on the strand
post(socket_.get_executor(),
[self = shared_from_this(), msg = std::move(msg)]() mutable {
self->do_write(std::move(msg));
});
}
void Connection::do_write(std::string msg) { // assumed on the strand
outgoing_.push_back(std::move(msg));
if (outgoing_.size() == 1)
do_write_loop();
}
void Connection::do_write_loop() {
if (outgoing_.size() == 0)
return;
auto self(shared_from_this());
async_write( //
socket_, boost::asio::buffer(outgoing_.front()),
[this, self](error_code ec, size_t transfer_size) {
std::cerr << "write completion: " << transfer_size << " bytes ("
<< ec.message() << ")" << std::endl;
if (!ec) {
outgoing_.pop_front();
do_write_loop();
} else {
socket_.cancel();
// This would ideally be enough to free the connection, but
// since `ConnectionManager` doesn't use `weak_ptr` you need to
// force the issue using kind of an "umbillical cord reflux":
connection_manager_.stop(self);
}
});
}
As you can see I also split write/do_write to prevent off-strand invocation. Same with stop.
Full Listing
A full listing with all the remarks/fixes from above:
File connection.h
#pragma once
#include <boost/asio.hpp>
#include <array>
#include <deque>
#include <memory>
#include <string>
using boost::asio::ip::tcp;
class ConnectionManager;
/// Represents a single connection from a client.
class Connection : public std::enable_shared_from_this<Connection> {
public:
Connection(const Connection&) = delete;
Connection& operator=(const Connection&) = delete;
/// Construct a connection with the given socket.
explicit Connection(tcp::socket socket, ConnectionManager& manager);
void start();
void stop();
void write(std::string msg);
private:
void do_stop();
void do_write(std::string msg);
void do_write_loop();
/// Perform an asynchronous read operation.
void do_read();
/// Socket for the connection.
tcp::socket socket_;
/// The manager for this connection.
ConnectionManager& connection_manager_;
/// Buffer for incoming data.
std::array<char, 8192> buffer_;
std::deque<std::string> outgoing_;
};
using connection_ptr = std::shared_ptr<Connection>;
File connection_manager.h
#pragma once
#include <list>
#include "connection.h"
/// Manages open connections so that they may be cleanly stopped when the server
/// needs to shut down.
class ConnectionManager {
public:
ConnectionManager(const ConnectionManager&) = delete;
ConnectionManager& operator=(const ConnectionManager&) = delete;
ConnectionManager() = default; // could be split across h/cpp if you wanted
void register_and_start(connection_ptr c);
void stop(connection_ptr c);
void stop_all();
void broadcast(const std::string& buffer);
// purge defunct connections, returns remaining active connections
size_t garbage_collect();
private:
using handle = std::weak_ptr<connection_ptr::element_type>;
std::list<handle> connections_;
};
File server.h
#pragma once
#include <boost/asio.hpp>
#include <string>
#include "connection.h"
#include "connection_manager.h"
class Server {
public:
Server(const Server&) = delete;
Server& operator=(const Server&) = delete;
/// Construct the server to listen on the specified TCP address and port,
/// and serve up files from the given directory.
explicit Server(const std::string& address, const std::string& port);
/// Run the server's io_service loop.
void run();
bool deliver(const std::string& buffer);
private:
void do_accept();
void do_await_signal();
boost::asio::io_context io_context_;
boost::asio::any_io_executor strand_{io_context_.get_executor()};
boost::asio::signal_set signals_{strand_};
tcp::acceptor acceptor_{strand_};
ConnectionManager connection_manager_;
};
File connection.cpp
#include "connection.h"
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <thread>
#include <utility>
#include <vector>
#include "connection_manager.h"
using boost::system::error_code;
Connection::Connection(tcp::socket socket, ConnectionManager& manager)
: socket_(std::move(socket))
, connection_manager_(manager) {}
void Connection::start() { // always assumed on the strand (since connection
// just constructed)
do_read();
}
void Connection::stop() { // public, might not be on the strand
post(socket_.get_executor(),
[self = shared_from_this()]() mutable {
self->do_stop();
});
}
void Connection::do_stop() { // assumed on the strand
socket_.cancel(); // trust shared pointer to destruct
}
namespace /*missing code stubs*/ {
auto split(std::string_view input, char delim) {
std::vector<std::string_view> result;
boost::algorithm::split(result, input,
boost::algorithm::is_from_range(delim, delim));
return result;
}
std::string getExecutionJsons() { return std::string(1024, 'E'); }
std::string getOrdersAsJsons() { return std::string(13312, 'O'); }
std::string getPositionsAsJsons() { return std::string(8192, 'P'); }
std::string createSyncDoneJson() { return std::string(24576, 'C'); }
} // namespace
void Connection::do_read() {
auto self(shared_from_this());
socket_.async_read_some(
boost::asio::buffer(buffer_),
[this, self](error_code ec, size_t bytes_transferred) {
if (!ec) {
std::string buff_str =
std::string(buffer_.data(), bytes_transferred);
const auto& tokenized_buffer = split(buff_str, ' ');
if (!tokenized_buffer.empty() &&
tokenized_buffer[0] == "sync") {
std::cerr << "sync detected on " << socket_.remote_endpoint() << std::endl;
/// "syncing connection" sends a specific text
/// hence I can separate between sycing and long-lived
/// connections here and act accordingly.
const auto& exec_json_strs = getExecutionJsons();
const auto& order_json_strs = getOrdersAsJsons();
const auto& position_json_strs = getPositionsAsJsons();
const auto& all_json_strs = exec_json_strs +
order_json_strs + position_json_strs +
createSyncDoneJson();
std::cerr << "All json length: " << all_json_strs.length() << std::endl;
/// this is potentially a very large data.
do_write(all_json_strs); // already on strand!
}
do_read();
} else {
std::cerr << "do_read terminating: " << ec.message() << std::endl;
connection_manager_.stop(shared_from_this());
}
});
}
void Connection::write(std::string msg) { // public, might not be on the strand
post(socket_.get_executor(),
[self = shared_from_this(), msg = std::move(msg)]() mutable {
self->do_write(std::move(msg));
});
}
void Connection::do_write(std::string msg) { // assumed on the strand
outgoing_.push_back(std::move(msg));
if (outgoing_.size() == 1)
do_write_loop();
}
void Connection::do_write_loop() {
if (outgoing_.size() == 0)
return;
auto self(shared_from_this());
async_write( //
socket_, boost::asio::buffer(outgoing_.front()),
[this, self](error_code ec, size_t transfer_size) {
std::cerr << "write completion: " << transfer_size << " bytes ("
<< ec.message() << ")" << std::endl;
if (!ec) {
outgoing_.pop_front();
do_write_loop();
} else {
socket_.cancel();
// This would ideally be enough to free the connection, but
// since `ConnectionManager` doesn't use `weak_ptr` you need to
// force the issue using kind of an "umbellical cord reflux":
connection_manager_.stop(self);
}
});
}
File connection_manager.cpp
#include "connection_manager.h"
void ConnectionManager::register_and_start(connection_ptr c) {
connections_.emplace_back(c);
c->start();
}
void ConnectionManager::stop(connection_ptr c) {
c->stop();
}
void ConnectionManager::stop_all() {
for (auto h : connections_)
if (auto c = h.lock())
c->stop();
}
/// this function is used to keep clients up to date with the changes, not used
/// during syncing phase.
void ConnectionManager::broadcast(const std::string& buffer) {
for (auto h : connections_)
if (auto c = h.lock())
c->write(buffer);
}
size_t ConnectionManager::garbage_collect() {
connections_.remove_if(std::mem_fn(&handle::expired));
return connections_.size();
}
File server.cpp
#include "server.h"
#include <signal.h>
#include <utility>
using boost::system::error_code;
Server::Server(const std::string& address, const std::string& port)
: io_context_(1) // THREAD HINT: single threaded
, connection_manager_()
{
// Register to handle the signals that indicate when the server should exit.
// It is safe to register for the same signal multiple times in a program,
// provided all registration for the specified signal is made through Asio.
signals_.add(SIGINT);
signals_.add(SIGTERM);
#if defined(SIGQUIT)
signals_.add(SIGQUIT);
#endif // defined(SIGQUIT)
do_await_signal();
// Open the acceptor with the option to reuse the address (i.e. SO_REUSEADDR).
tcp::resolver resolver(io_context_);
tcp::endpoint endpoint = *resolver.resolve({address, port});
acceptor_.open(endpoint.protocol());
acceptor_.set_option(tcp::acceptor::reuse_address(true));
acceptor_.bind(endpoint);
acceptor_.listen();
do_accept();
}
void Server::run() {
// The io_service::run() call will block until all asynchronous operations
// have finished. While the server is running, there is always at least one
// asynchronous operation outstanding: the asynchronous accept call waiting
// for new incoming connections.
io_context_.run();
}
void Server::do_accept() {
// separate strand for each connection - just in case you ever add threads
acceptor_.async_accept(
make_strand(io_context_), [this](error_code ec, tcp::socket sock) {
if (!ec) {
connection_manager_.register_and_start(
std::make_shared<Connection>(std::move(sock),
connection_manager_));
do_accept();
}
});
}
void Server::do_await_signal() {
signals_.async_wait([this](error_code /*ec*/, int /*signo*/) {
// handler on the strand_ because of the executor on signals_
// The server is stopped by cancelling all outstanding asynchronous
// operations. Once all operations have finished the io_service::run()
// call will exit.
acceptor_.cancel();
connection_manager_.stop_all();
});
}
bool Server::deliver(const std::string& buffer) {
if (io_context_.stopped()) {
return false;
}
post(io_context_,
[this, buffer] { connection_manager_.broadcast(std::move(buffer)); });
return true;
}
File test.cpp
#include "server.h"
int main() {
Server s("127.0.0.1", "8989");
std::thread yolo([&s] {
using namespace std::literals;
int i = 1;
do {
std::this_thread::sleep_for(5s);
} while (s.deliver("HEARTBEAT DEMO " + std::to_string(i++)));
});
s.run();
yolo.join();
}
According to the documentation:
"The program must ensure that the stream performs no other write operations (such as async_write, the stream's async_write_some function, or any other composed operations that perform writes) until this operation completes."
Does this mean, I cannot call boost::asio::async_write a second time until the handler for the first is called? How does one achieve this and still be asynchronous?
If I have a method Send:
//--------------------------------------------------------------------
void Connection::Send(const std::vector<char> & data)
{
auto callback = boost::bind(&Connection::OnSend, this, boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred);
boost::asio::async_write(m_socket, boost::asio::buffer(data), callback);
}
Do I have to change it to something like:
//--------------------------------------------------------------------
void Connection::Send(const std::vector<char> & data)
{
// Issue a send
std::lock_guard<std::mutex> lock(m_numPostedSocketIOMutex);
++m_numPostedSocketIO;
m_numPostedSocketIOConditionVariable.wait(lock, [this]() {return m_numPostedSocketIO == 0; });
auto callback = boost::bind(&Connection::OnSend, this, boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred);
boost::asio::async_write(m_socket, boost::asio::buffer(data), callback);
}
and if so, then aren't I blocking after the first call again?
The async in async_write() refers to the fact that the function returns immediately while the writing happens in background. There should still be only one outstanding write at any given time.
You need to use a buffer if you have an asynchronous producer to set aside the new chunk of data until the currently active write completes, then issue a new async_write in the completion handler.
That is, Connection::Send must only call async_write once to kick off the process, in subsequent calls it should instead buffer its data, which will be picked up in the completion handler of the currently executing async_write.
For performance reasons you want to avoid copying the data into the buffer, and instead append the new chunk to a list of buffers and use the scatter-gather overload of async_write that accepts a ConstBufferSequence. It is also possible to use one large streambuf as a buffer and append directly into it.
Of course the buffer needs to be synchronized unless both Connection::Send and the io_service run in the same thread. An empty buffer can be reused as an indication that no async_write is in progress.
Here's some code to illustrate what I mean:
struct Connection
{
void Connection::Send(std::vector<char>&& data)
{
std::lock_guard<std::mutex> lock(buffer_mtx);
buffers[active_buffer ^ 1].push_back(std::move(data)); // move input data to the inactive buffer
doWrite();
}
private:
void Connection::doWrite()
{
if (buffer_seq.empty()) { // empty buffer sequence == no writing in progress
active_buffer ^= 1; // switch buffers
for (const auto& data : buffers[active_buffer]) {
buffer_seq.push_back(boost::asio::buffer(data));
}
boost::asio::async_write(m_socket, buffer_seq, [this] (const boost::system::error_code& ec, size_t bytes_transferred) {
std::lock_guard<std::mutex> lock(buffer_mtx);
buffers[active_buffer].clear();
buffer_seq.clear();
if (!ec) {
if (!buffers[active_buffer ^ 1].empty()) { // have more work
doWrite();
}
}
});
}
}
std::mutex buffer_mtx;
std::vector<std::vector<char>> buffers[2]; // a double buffer
std::vector<boost::asio::const_buffer> buffer_seq;
int active_buffer = 0;
. . .
};
The complete working source can be found in this answer.
Yes you need to wait for completion handler before calling async_write again. Are you sure you'll be blocked? Of course it depends on how fast you generate your data, but even if yes there's no way to send it faster than your network can handle it. If it's really an issue consider sending bigger chunks.
Here is a complete, compilable, and tested, example, that I researched and got to work through trial and error after reading the answer and subsequent edits from RustyX.
Connection.h
#pragma once
#include <boost/asio.hpp>
#include <atomic>
#include <condition_variable>
#include <memory>
#include <mutex>
//--------------------------------------------------------------------
class ConnectionManager;
//--------------------------------------------------------------------
class Connection : public std::enable_shared_from_this<Connection>
{
public:
typedef std::shared_ptr<Connection> SharedPtr;
// Ensure all instances are created as shared_ptr in order to fulfill requirements for shared_from_this
static Connection::SharedPtr Create(ConnectionManager * connectionManager, boost::asio::ip::tcp::socket & socket);
//
static std::string ErrorCodeToString(const boost::system::error_code & errorCode);
Connection(const Connection &) = delete;
Connection(Connection &&) = delete;
Connection & operator = (const Connection &) = delete;
Connection & operator = (Connection &&) = delete;
~Connection();
// We have to defer the start until we are fully constructed because we share_from_this()
void Start();
void Stop();
void Send(const std::vector<char> & data);
private:
static size_t m_nextClientId;
size_t m_clientId;
ConnectionManager * m_owner;
boost::asio::ip::tcp::socket m_socket;
std::atomic<bool> m_stopped;
boost::asio::streambuf m_receiveBuffer;
mutable std::mutex m_sendMutex;
std::vector<char> m_sendBuffers[2]; // Double buffer
int m_activeSendBufferIndex;
bool m_sending;
std::vector<char> m_allReadData; // Strictly for test purposes
Connection(ConnectionManager * connectionManager, boost::asio::ip::tcp::socket socket);
void DoReceive();
void DoSend();
};
//--------------------------------------------------------------------
Connection.cpp
#include "Connection.h"
#include "ConnectionManager.h"
#include <boost/bind.hpp>
#include <algorithm>
#include <cstdlib>
//--------------------------------------------------------------------
size_t Connection::m_nextClientId(0);
//--------------------------------------------------------------------
Connection::SharedPtr Connection::Create(ConnectionManager * connectionManager, boost::asio::ip::tcp::socket & socket)
{
return Connection::SharedPtr(new Connection(connectionManager, std::move(socket)));
}
//--------------------------------------------------------------------------------------------------
std::string Connection::ErrorCodeToString(const boost::system::error_code & errorCode)
{
std::ostringstream debugMsg;
debugMsg << " Error Category: " << errorCode.category().name() << ". "
<< " Error Message: " << errorCode.message() << ". ";
// IMPORTANT - These comparisons only work if you dynamically link boost libraries
// Because boost chose to implement boost::system::error_category::operator == by comparing addresses
// The addresses are different in one library and the other when statically linking.
//
// We use make_error_code macro to make the correct category as well as error code value.
// Error code value is not unique and can be duplicated in more than one category.
if (errorCode == boost::asio::error::make_error_code(boost::asio::error::connection_refused))
{
debugMsg << " (Connection Refused)";
}
else if (errorCode == boost::asio::error::make_error_code(boost::asio::error::eof))
{
debugMsg << " (Remote host has disconnected)";
}
else
{
debugMsg << " (boost::system::error_code has not been mapped to a meaningful message)";
}
return debugMsg.str();
}
//--------------------------------------------------------------------
Connection::Connection(ConnectionManager * connectionManager, boost::asio::ip::tcp::socket socket)
:
m_clientId (m_nextClientId++)
, m_owner (connectionManager)
, m_socket (std::move(socket))
, m_stopped (false)
, m_receiveBuffer ()
, m_sendMutex ()
, m_sendBuffers ()
, m_activeSendBufferIndex (0)
, m_sending (false)
, m_allReadData ()
{
printf("Client connection with id %zd has been created.", m_clientId);
}
//--------------------------------------------------------------------
Connection::~Connection()
{
// Boost uses RAII, so we don't have anything to do. Let thier destructors take care of business
printf("Client connection with id %zd has been destroyed.", m_clientId);
}
//--------------------------------------------------------------------
void Connection::Start()
{
DoReceive();
}
//--------------------------------------------------------------------
void Connection::Stop()
{
// The entire connection class is only kept alive, because it is a shared pointer and always has a ref count
// as a consequence of the outstanding async receive call that gets posted every time we receive.
// Once we stop posting another receive in the receive handler and once our owner release any references to
// us, we will get destroyed.
m_stopped = true;
m_owner->OnConnectionClosed(shared_from_this());
}
//--------------------------------------------------------------------
void Connection::Send(const std::vector<char> & data)
{
std::lock_guard<std::mutex> lock(m_sendMutex);
// Append to the inactive buffer
std::vector<char> & inactiveBuffer = m_sendBuffers[m_activeSendBufferIndex ^ 1];
inactiveBuffer.insert(inactiveBuffer.end(), data.begin(), data.end());
//
DoSend();
}
//--------------------------------------------------------------------
void Connection::DoSend()
{
// Check if there is an async send in progress
// An empty active buffer indicates there is no outstanding send
if (m_sendBuffers[m_activeSendBufferIndex].empty())
{
m_activeSendBufferIndex ^= 1;
std::vector<char> & activeBuffer = m_sendBuffers[m_activeSendBufferIndex];
auto self(shared_from_this());
boost::asio::async_write(m_socket, boost::asio::buffer(activeBuffer),
[self](const boost::system::error_code & errorCode, size_t bytesTransferred)
{
std::lock_guard<std::mutex> lock(self->m_sendMutex);
self->m_sendBuffers[self->m_activeSendBufferIndex].clear();
if (errorCode)
{
printf("An error occured while attemping to send data to client id %zd. %s", self->m_clientId, ErrorCodeToString(errorCode).c_str());
// An error occurred
// We do not stop or close on sends, but instead let the receive error out and then close
return;
}
// Check if there is more to send that has been queued up on the inactive buffer,
// while we were sending what was on the active buffer
if (!self->m_sendBuffers[self->m_activeSendBufferIndex ^ 1].empty())
{
self->DoSend();
}
});
}
}
//--------------------------------------------------------------------
void Connection::DoReceive()
{
auto self(shared_from_this());
boost::asio::async_read_until(m_socket, m_receiveBuffer, '#',
[self](const boost::system::error_code & errorCode, size_t bytesRead)
{
if (errorCode)
{
// Check if the other side hung up
if (errorCode == boost::asio::error::make_error_code(boost::asio::error::eof))
{
// This is not really an error. The client is free to hang up whenever they like
printf("Client %zd has disconnected.", self->m_clientId);
}
else
{
printf("An error occured while attemping to receive data from client id %zd. Error Code: %s", self->m_clientId, ErrorCodeToString(errorCode).c_str());
}
// Notify our masters that we are ready to be destroyed
self->m_owner->OnConnectionClosed(self);
// An error occured
return;
}
// Grab the read data
std::istream stream(&self->m_receiveBuffer);
std::string data;
std::getline(stream, data, '#');
data += "#";
printf("Received data from client %zd: %s", self->m_clientId, data.c_str());
// Issue the next receive
if (!self->m_stopped)
{
self->DoReceive();
}
});
}
//--------------------------------------------------------------------
ConnectionManager.h
#pragma once
#include "Connection.h"
// Boost Includes
#include <boost/asio.hpp>
// Standard Includes
#include <thread>
#include <vector>
//--------------------------------------------------------------------
class ConnectionManager
{
public:
ConnectionManager(unsigned port, size_t numThreads);
ConnectionManager(const ConnectionManager &) = delete;
ConnectionManager(ConnectionManager &&) = delete;
ConnectionManager & operator = (const ConnectionManager &) = delete;
ConnectionManager & operator = (ConnectionManager &&) = delete;
~ConnectionManager();
void Start();
void Stop();
void OnConnectionClosed(Connection::SharedPtr connection);
protected:
boost::asio::io_service m_io_service;
boost::asio::ip::tcp::acceptor m_acceptor;
boost::asio::ip::tcp::socket m_listenSocket;
std::vector<std::thread> m_threads;
mutable std::mutex m_connectionsMutex;
std::vector<Connection::SharedPtr> m_connections;
boost::asio::deadline_timer m_timer;
void IoServiceThreadProc();
void DoAccept();
void DoTimer();
};
//--------------------------------------------------------------------
ConnectionManager.cpp
#include "ConnectionManager.h"
#include <boost/bind.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <system_error>
#include <cstdio>
//------------------------------------------------------------------------------
ConnectionManager::ConnectionManager(unsigned port, size_t numThreads)
:
m_io_service ()
, m_acceptor (m_io_service, boost::asio::ip::tcp::endpoint(boost::asio::ip::tcp::v4(), port))
, m_listenSocket(m_io_service)
, m_threads (numThreads)
, m_timer (m_io_service)
{
}
//------------------------------------------------------------------------------
ConnectionManager::~ConnectionManager()
{
Stop();
}
//------------------------------------------------------------------------------
void ConnectionManager::Start()
{
if (m_io_service.stopped())
{
m_io_service.reset();
}
DoAccept();
for (auto & thread : m_threads)
{
if (!thread.joinable())
{
thread.swap(std::thread(&ConnectionManager::IoServiceThreadProc, this));
}
}
DoTimer();
}
//------------------------------------------------------------------------------
void ConnectionManager::Stop()
{
{
std::lock_guard<std::mutex> lock(m_connectionsMutex);
m_connections.clear();
}
// TODO - Will the stopping of the io_service be enough to kill all the connections and ultimately have them get destroyed?
// Because remember they have outstanding ref count to thier shared_ptr in the async handlers
m_io_service.stop();
for (auto & thread : m_threads)
{
if (thread.joinable())
{
thread.join();
}
}
}
//------------------------------------------------------------------------------
void ConnectionManager::IoServiceThreadProc()
{
try
{
// Log that we are starting the io_service thread
{
printf("io_service socket thread starting.");
}
// Run the asynchronous callbacks from the socket on this thread
// Until the io_service is stopped from another thread
m_io_service.run();
}
catch (std::system_error & e)
{
printf("System error caught in io_service socket thread. Error Code: %d", e.code().value());
}
catch (std::exception & e)
{
printf("Standard exception caught in io_service socket thread. Exception: %s", e.what());
}
catch (...)
{
printf("Unhandled exception caught in io_service socket thread.");
}
{
printf("io_service socket thread exiting.");
}
}
//------------------------------------------------------------------------------
void ConnectionManager::DoAccept()
{
m_acceptor.async_accept(m_listenSocket,
[this](const boost::system::error_code errorCode)
{
if (errorCode)
{
printf("An error occured while attemping to accept connections. Error Code: %s", Connection::ErrorCodeToString(errorCode).c_str());
return;
}
// Create the connection from the connected socket
std::lock_guard<std::mutex> lock(m_connectionsMutex);
Connection::SharedPtr connection = Connection::Create(this, m_listenSocket);
m_connections.push_back(connection);
connection->Start();
DoAccept();
});
}
//------------------------------------------------------------------------------
void ConnectionManager::OnConnectionClosed(Connection::SharedPtr connection)
{
std::lock_guard<std::mutex> lock(m_connectionsMutex);
auto itConnection = std::find(m_connections.begin(), m_connections.end(), connection);
if (itConnection != m_connections.end())
{
m_connections.erase(itConnection);
}
}
//------------------------------------------------------------------------------
void ConnectionManager::DoTimer()
{
if (!m_io_service.stopped())
{
// Send messages every second
m_timer.expires_from_now(boost::posix_time::seconds(30));
m_timer.async_wait(
[this](const boost::system::error_code & errorCode)
{
std::lock_guard<std::mutex> lock(m_connectionsMutex);
for (auto connection : m_connections)
{
connection->Send(std::vector<char>{'b', 'e', 'e', 'p', '#'});
}
DoTimer();
});
}
}
main.cpp
#include "ConnectionManager.h"
#include <cstring>
#include <iostream>
#include <string>
int main()
{
// Start up the server
ConnectionManager connectionManager(5000, 2);
connectionManager.Start();
// Pretend we are doing other things or just waiting for shutdown
std::this_thread::sleep_for(std::chrono::minutes(5));
// Stop the server
connectionManager.Stop();
return 0;
}
Could we use 2 strands for this question by posting write(...) as an asynchronous operation to strand1 and handler(...) to strand2?
Your advices on the code would be highly appreciated.
boost::asio::strand<boost::asio::io_context::executor_type> strand1, strand2;
std::vector<char> empty_vector(0);
void Connection::Send(const std::vector<char> & data)
{
boost::asio::post(boost::asio::bind_executor(strand1, std::bind(&Connection::write, this, true, data)));
}
void Connection::write(bool has_data, const std::vector<char> & data)
{
// Append to the inactive buffer
std::vector<char> & inactiveBuffer = m_sendBuffers[m_activeSendBufferIndex ^ 1];
if (has_data)
{
inactiveBuffer.insert(inactiveBuffer.end(), data.begin(), data.end());
}
//
if (!inactiveBuffer.empty() && m_sendBuffers[m_activeSendBufferIndex].empty())
{
m_activeSendBufferIndex ^= 1;
std::vector<char> & activeBuffer = m_sendBuffers[m_activeSendBufferIndex];
boost::asio::async_write(m_socket, boost::asio::buffer(activeBuffer), boost::asio::bind_executor(strand2, std::bind(&Connection::handler, this, boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred)));
}
}
void Connection::handler(const boost::system::error_code & errorCode, size_t bytesTransferred)
{
self->m_sendBuffers[self->m_activeSendBufferIndex].clear();
if (errorCode)
{
printf("An error occured while attemping to send data to client id %zd. %s", self->m_clientId, ErrorCodeToString(errorCode).c_str());
// An error occurred
// We do not stop or close on sends, but instead let the receive error out and then close
return;
}
boost::asio::post(boost::asio::bind_executor(strand1, std::bind(&Connection::write, this, false, empty_vector)));
}
}
I need to write a dynamic library which should export three functions:
bool init_sender(const char* ip_addr, int port);
void cleanup_sender();
void send_command(const char* cmd, int len);
init_sender should connect to server synchronously and return true / false according to whether it was success or not.
cleanup_sender should wait for all commands to be completed and then returns.
send_command should send the specified command to the server asynchronously and return as fast as possible.
So I wrote the following code:
boost::asio::io_service g_io_service;
std::unique_ptr<boost::asio::io_service::work> g_work;
boost::asio::ip::tcp::socket g_sock(g_io_service);
boost::thread g_io_service_th;
void io_service_processor()
{
g_io_service.run();
}
bool __stdcall init_sender(const char* ip_addr, int port)
{
try
{
g_work = std::make_unique<boost::asio::io_service::work>(g_io_service);
boost::asio::ip::tcp::resolver resolver(g_io_service);
boost::asio::connect(g_sock, resolver.resolve({ ip_addr, std::to_string(port) }));
g_io_service_th = boost::thread(io_service_processor);
return true;
}
catch (const std::exception& ex)
{
return false;
}
}
void __stdcall cleanup_sender()
{
g_work.reset();
if (g_io_service_th.joinable())
{
g_io_service_th.join();
}
}
void async_write_cb(
const boost::system::error_code& error,
std::size_t bytes_transferred)
{
// TODO: implement
}
void __stdcall send_command(const char* cmd, int len)
{
boost::asio::async_write(g_sock, boost::asio::buffer(cmd, len), async_write_cb);
}
As far as I knew from boost asio documentation, all my command posted by async_write function call will be executed from one single thread (the one that contains run function call -- g_io_service_th in my case). Am I right? If so, it doesn't seem to be fully asynchronous to me. What could I do to change this behavior and send several commands at the same time from several threads? Should I create boost::thread_group like this
for (int i = 0; i < pool_size; ++i)
{
_thread_group.create_thread(boost::bind(&boost::asio::io_service::run, &_io_service));
}
or is there any other way?
You're asking a bit question and there's a lot to learn. Probably the most important thing to understand is how to use a work object.
edit: reference to async_write restriction:
http://www.boost.org/doc/libs/1_59_0/doc/html/boost_asio/reference/async_write/overload1.html
quoting from the documentation:
This operation is implemented in terms of zero or more calls to the stream's async_write_some function, and is known as a composed operation. The program must ensure that the stream performs no other write operations (such as async_write, the stream's async_write_some function, or any other composed operations that perform writes) until this operation completes.
Your asio thread code should look something like this:
#include <iostream>
#include <vector>
#include <boost/asio.hpp>
#include <thread>
struct service_loop
{
using io_service = boost::asio::io_service;
io_service& get_io_service() {
return _io_service;
}
service_loop(size_t threads = 1)
: _strand(_io_service)
, _work(_io_service)
, _socket(_io_service)
{
for(size_t i = 0 ; i < threads ; ++i)
add_thread();
}
~service_loop() {
stop();
}
// adding buffered sequential writes...
void write(const char* data, size_t length)
{
_strand.dispatch([this, v = std::vector<char>(data, data + length)] {
_write_buffer.insert(std::end(_write_buffer), v.begin(), v.end());
check_write();
});
}
private:
std::vector<char> _write_buffer;
bool _writing;
void check_write()
{
if (!_writing and !_write_buffer.empty()) {
auto pv = std::make_shared<std::vector<char>>(std::move(_write_buffer));
_writing = true;
_write_buffer.clear();
boost::asio::async_write(_socket,
boost::asio::buffer(*pv),
[this, pv] (const boost::system::error_code& ec, size_t written) {
_strand.dispatch(std::bind(&service_loop::handle_write,
this,
ec,
written));
});
}
}
void handle_write(const boost::system::error_code& ec, size_t written)
{
_writing = false;
if (ec) {
// handle error somehow
}
else {
check_write();
}
}
private:
io_service _io_service;
io_service::strand _strand;
io_service::work _work;
std::vector<std::thread> _threads;
boost::asio::ip::tcp::socket _socket;
void add_thread()
{
_threads.emplace_back(std::bind(&service_loop::run_thread, this));
}
void stop()
{
_io_service.stop();
for(auto& t : _threads) {
if(t.joinable()) t.join();
}
}
void run_thread()
{
while(!_io_service.stopped())
{
try {
_io_service.run();
}
catch(const std::exception& e) {
// report exceptions here
}
}
}
};
using namespace std;
auto main() -> int
{
service_loop sl;
sl.write("hello", 5);
sl.write(" world", 6);
std::this_thread::sleep_for(std::chrono::seconds(10));
return 0;
}
I am implementing a small distributed system that consists N machines. Each of them receives some data from some remote server and then propagates the data to other n-1 fellow machines. I am using the Boost Asio async_read and async_write to implement this. I set up a test cluster of N=30 machines. When I tried smaller datesets (receiving 75KB to 750KB per machine), the program always worked. But when I moved on to just a slightly larger dataset (7.5MB), I observed strange behavior: at the beginning, reads and writes happened as expected, but after a while, some machines hanged while others finished, the number of machines that hanged varied with each run. I tried to print out some messages in each handler and found that for those machines that hanged, async_read basically could not successfully read after a while, therefore nothing could proceed afterwards. I checked the remote servers, and they all finished writing. I have tried out using strand to control the order of execution of async reads and writes, and I also tried using different io_services for read and write. None of them solved the problem. I am pretty desperate. Can anyone help me?
Here is the code for the class that does the read and propagation:
const int TRANS_TUPLE_SIZE=15;
const int TRANS_BUFFER_SIZE=5120/TRANS_TUPLE_SIZE*TRANS_TUPLE_SIZE;
class Asio_Trans_Broadcaster
{
private:
char buffer[TRANS_BUFFER_SIZE];
int node_id;
int mpi_size;
int mpi_rank;
boost::asio::ip::tcp::socket* dbsocket;
boost::asio::ip::tcp::socket** sender_sockets;
int n_send;
boost::mutex mutex;
bool done;
public:
Asio_Trans_Broadcaster(boost::asio::ip::tcp::socket* dbskt, boost::asio::ip::tcp::socket** senderskts,
int msize, int mrank, int id)
{
dbsocket=dbskt;
count=0;
node_id=id;
mpi_size=mpi_rank=-1;
sender_sockets=senderskts;
mpi_size=msize;
mpi_rank=mrank;
n_send=-1;
done=false;
}
static std::size_t completion_condition(const boost::system::error_code& error, std::size_t bytes_transferred)
{
int remain=bytes_transferred%TRANS_TUPLE_SIZE;
if(remain==0 && bytes_transferred>0)
return 0;
else
return TRANS_BUFFER_SIZE-bytes_transferred;
}
void write_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
int n=-1;
mutex.lock();
n_send--;
n=n_send;
mutex.unlock();
fprintf(stdout, "~~~~~~ #%d, write_handler: %d bytes, copies_to_send: %d\n",
node_id, bytes_transferred, n);
if(n==0 && !done)
boost::asio::async_read(*dbsocket,
boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
boost::asio::placeholders::error,
boost::asio::placeholders::bytes_transferred));
}
void broadcast_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
fprintf(stdout, "#%d, broadcast_handler: %d bytes, mpi_size:%d, mpi_rank: %d\n", node_id, bytes_transferred, mpi_size, mpi_rank);
if (!ec)
{
int pos=0;
while(pos<bytes_transferred && pos<TRANS_BUFFER_SIZE)
{
int id=-1;
memcpy(&id, &buffer[pos], 4);
if(id<0)
{
done=true;
fprintf(stdout, "#%d, broadcast_handler: done!\n", mpi_rank);
break;
}
pos+=TRANS_TUPLE_SIZE;
}
mutex.lock();
n_send=mpi_size-1;
mutex.unlock();
for(int i=0; i<mpi_size; i++)
if(i!=mpi_rank)
{
boost::asio::async_write(*sender_sockets[i], boost::asio::buffer(buffer, bytes_transferred),
boost::bind(&Asio_Trans_Broadcaster::write_handler, this,
boost::asio::placeholders::error,
boost::asio::placeholders::bytes_transferred));
}
}
else
{
cerr<<mpi_rank<<" error: "<<ec.message()<<endl;
delete this;
}
}
void broadcast()
{
boost::asio::async_read(*dbsocket,
boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
boost::asio::placeholders::error,
boost::asio::placeholders::bytes_transferred));
}
};
Here is the main code running on each machine:
int N=30;
boost::asio::io_service* sender_io_service=new boost::asio::io_service();
boost::asio::io_service::work* p_work=new boost::asio::io_service::work(*sender_io_service);
boost::thread_group send_thread_pool;
for(int i=0; i<NUM_THREADS; i++)
{
send_thread_pool.create_thread( boost::bind( & boost::asio::io_service::run, sender_io_service ) );
}
boost::asio::io_service* receiver_io_service=new boost::asio::io_service();
shared_ptr<boost::asio::io_service::work> p_work2(new boost::asio::io_service::work(*receiver_io_service));
boost::thread_group thread_pool2;
thread_pool2.create_thread( boost::bind( & boost::asio::io_service::run, receiver_io_service) );
boost::asio::ip::tcp::socket* receiver_socket;
//establish nonblocking connection with remote server
AsioConnectToRemote(5000, 1, receiver_io_service, receiver_socket, true);
boost::asio::ip::tcp::socket* send_sockets[N];
//establish blocking connection with other machines
hadoopNodes = SetupAsioConnectionsWIthOthers(sender_io_service, send_sockets, hostFileName, mpi_rank, mpi_size, 3000, false);
Asio_Trans_Broadcaster* db_receiver=new Asio_Trans_Broadcaster(receiver_socket, send_sockets,
mpi_size, mpi_rank, mpi_rank);
db_receiver->broadcast();
p_work2.reset();
thread_pool2.join_all();
delete p_work;
send_thread_pool.join_all();
I don't know what your code is trying to achieve. There are too many missing bits.
Of course, if the task is to asynchronously send/receive traffic on network sockets, Asio is just the thing for that. It's hard to see what's special about your code.
I'd suggest to clean up the more obvious problems:
there's (almost) no error handling (check your error_code-s!)
unless you're on a funny platform, your format strings should use %lu for size_t
why do you mess around with raw arrays, with possibly bad sizes, when you can just have a vector?
never assume the size of objects if you can use sizeof:
memcpy(&id, &trans_buffer[pos], sizeof(id));
come to think of it, it looks like the indexing of buffer is unsafe anyways:
while(pos < bytes_transferred && pos < TRANS_BUFFER_SIZE)
{
int id = -1;
memcpy(&id, &buffer[pos], sizeof(id));
If e.g. pos == TRANS_BUFFER_SIZE-1 here the memcpy invokes Undefined Behavour...
why is there so much new going on? You're inviting a hairy class of bugs into your code. As if memory management wasn't the achilles heel of lowlevel coding. Use values, or shared pointers. Never delete this. Ever[1]
why is there so much repeated code? Why is one thread pool named after sender and the other thread_pool2? Which contains 1 thread. Eh? Why do you have one work item as a raw pointer, the other as a shared_ptr?
You could just just:
struct service_wrap {
service_wrap(int threads) {
while(threads--)
pool.create_thread(boost::bind(&boost::asio::io_service::run, boost::ref(io_service)));
}
~service_wrap() {
io_service.post(boost::bind(&service_wrap::stop, this));
pool.join_all();
}
private: // mind the initialization order!
boost::asio::io_service io_service;
boost::optional<boost::asio::io_service::work> work;
boost::thread_group pool;
void stop() {
work = boost::none;
}
};
So you can simply write:
service_wrap senders(NUM_THREADS);
service_wrap receivers(1);
Wow. Did you see that? No more chance of error. If you fix one pool, you fix the other automatically. No more delete the first, .reset() the second work item. In short: no more messy code, and less complexity.
Use exception safe locking guards:
int local_n_send = -1; // not clear naming
{
boost::lock_guard<boost::mutex> lk(mutex);
n_send--;
local_n_send = n_send;
}
the body of broadcast is completely repeated in write_handler(). Why not just call it:
if(local_n_send == 0 && !done)
broadcast();
I think there's still a race condition - not a data race on the access to n_send itself, but the decision to re-broadcast might be wrong if n_send reaches zero after the the lock is released. Now, since broadcast() does only an async operation, you can just do it under the lock and get rid of the race condition:
void write_handler(const error_code &ec, size_t bytes_transferred) {
boost::lock_guard<boost::mutex> lk(mutex);
if(!(done || --n_send))
broadcast();
}
Woop woop. That's three lines of code now. Less code is less bugs.
My guess would be that if you diligently scrub the code like this, you will inevitably find your clues. Think of it like you would look for a lost wedding-ring: you wouldn't leave a mess lying around. Instead, you'd go from room to room and tidy it all up. Throw everything "out" first if need be.
Iff you can make this thing self-contained /and/ reproducible, I'll even debug it further for you!
Cheers
Here's a starting point that I made while looking at the code: Compiling on Coliru
#include <boost/asio.hpp>
#include <boost/thread.hpp>
#include <boost/array.hpp>
#include <boost/make_shared.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <iostream>
const/*expr*/ int TRANS_TUPLE_SIZE = 15;
const/*expr*/ int TRANS_BUFFER_SIZE = 5120 / TRANS_TUPLE_SIZE * TRANS_TUPLE_SIZE;
namespace AsioTrans
{
using boost::system::error_code;
using namespace boost::asio;
typedef ip::tcp::socket socket_t;
typedef boost::ptr_vector<socket_t> socket_list;
class Broadcaster
{
private:
boost::array<char, TRANS_BUFFER_SIZE> trans_buffer;
int node_id;
int mpi_rank;
socket_t& dbsocket;
socket_list& sender_sockets;
int n_send;
boost::mutex mutex;
bool done;
public:
Broadcaster(
socket_t& dbskt,
socket_list& senderskts,
int mrank,
int id) :
node_id(id),
mpi_rank(mrank),
dbsocket(dbskt),
sender_sockets(senderskts),
n_send(-1),
done(false)
{
// count=0;
}
static size_t completion_condition(const error_code& error, size_t bytes_transferred)
{
// TODO FIXME handler error_code here
int remain = bytes_transferred % TRANS_TUPLE_SIZE;
if(bytes_transferred && !remain)
{
return 0;
}
else
{
return TRANS_BUFFER_SIZE - bytes_transferred;
}
}
void write_handler(const error_code &ec, size_t bytes_transferred)
{
// TODO handle errors
// TODO check bytes_transferred
boost::lock_guard<boost::mutex> lk(mutex);
if(!(done || --n_send))
broadcast();
}
void broadcast_handler(const error_code &ec, size_t bytes_transferred)
{
fprintf(stdout, "#%d, broadcast_handler: %lu bytes, mpi_size:%lu, mpi_rank: %d\n", node_id, bytes_transferred, sender_sockets.size(), mpi_rank);
if(!ec)
{
for(size_t pos = 0; (pos < bytes_transferred && pos < TRANS_BUFFER_SIZE); pos += TRANS_TUPLE_SIZE)
{
int id = -1;
memcpy(&id, &trans_buffer[pos], sizeof(id));
if(id < 0)
{
done = true;
fprintf(stdout, "#%d, broadcast_handler: done!\n", mpi_rank);
break;
}
}
{
boost::lock_guard<boost::mutex> lk(mutex);
n_send = sender_sockets.size() - 1;
}
for(int i = 0; size_t(i) < sender_sockets.size(); i++)
{
if(i != mpi_rank)
{
async_write(
sender_sockets[i],
buffer(trans_buffer, bytes_transferred),
boost::bind(&Broadcaster::write_handler, this, placeholders::error, placeholders::bytes_transferred));
}
}
}
else
{
std::cerr << mpi_rank << " error: " << ec.message() << std::endl;
delete this;
}
}
void broadcast()
{
async_read(
dbsocket,
buffer(trans_buffer),
Broadcaster::completion_condition,
boost::bind(&Broadcaster::broadcast_handler, this,
placeholders::error,
placeholders::bytes_transferred));
}
};
struct service_wrap {
service_wrap(int threads) {
while(threads--)
_pool.create_thread(boost::bind(&io_service::run, boost::ref(_service)));
}
~service_wrap() {
_service.post(boost::bind(&service_wrap::stop, this));
_pool.join_all();
}
io_service& service() { return _service; }
private: // mind the initialization order!
io_service _service;
boost::optional<io_service::work> _work;
boost::thread_group _pool;
void stop() {
_work = boost::none;
}
};
extern void AsioConnectToRemote(int, int, io_service&, socket_t&, bool);
extern void SetupAsioConnectionsWIthOthers(io_service&, socket_list&, std::string, int, bool);
}
int main()
{
using namespace AsioTrans;
// there's no use in increasing #threads unless there are blocking operations
service_wrap senders(boost::thread::hardware_concurrency());
service_wrap receivers(1);
socket_t receiver_socket(receivers.service());
AsioConnectToRemote(5000, 1, receivers.service(), receiver_socket, true);
socket_list send_sockets(30);
/*hadoopNodes =*/ SetupAsioConnectionsWIthOthers(senders.service(), send_sockets, "hostFileName", 3000, false);
int mpi_rank = send_sockets.size();
AsioTrans::Broadcaster db_receiver(receiver_socket, send_sockets, mpi_rank, mpi_rank);
db_receiver.broadcast();
}
[1] No exceptions. Except when there's an exception to the no-exceptions rule. Exception-ception.
I'm novice in boost::asio and have the first own troubles.
I create a simple Host resolver (see full code below).
Problem 1.
In case of lost Internet connection, my host resolver stops resolving after the first enter into deadline_timer.
My assumption, that "localhost" must be resolved at any time. But "localhost" are not resolved after timeout during resolving google.us (for example, we unplugged Ethernet jack).
The same behaviour in case of resolving unexisted TLD (for example, google.usd instead google.us).
Problem 2.
In case of lost Internet connection, destructor io_service runs very long (usually, 5 seconds).
What's wrong?
I use VS2012, boost 1.54
File hostresolver.h
pragma once
#include <set>
#include <boost/system/error_code.hpp>
#include <boost/asio.hpp>
#include <boost/asio/ip/basic_resolver.hpp>
#include <boost/asio/ip/basic_resolver_iterator.hpp>
typedef std::set<unsigned long> hostresolver_result_container;
class hostresolver
{
public:
hostresolver(boost::asio::io_service* io_service);
~hostresolver(void);
boost::asio::io_service* ios_ptr;
boost::asio::ip::tcp::resolver resolver_;
boost::asio::deadline_timer timer_;
volatile bool is_completed;
bool is_timeout;
std::string hostname;
hostresolver_result_container result;
void on_timeout(const boost::system::error_code &err);
void start_resolve(const char* hostname, int timeout_seconds);
void finish_resolve(const boost::system::error_code& err, boost::asio::ip::tcp::resolver::iterator endpoint_iterator);
private:
void stop();
};
File hostresolver.cpp
#include "stdafx.h"
#include "hostresolver.h"
#include <boost/bind.hpp>
hostresolver::hostresolver(boost::asio::io_service* io_service) :
resolver_(*io_service), timer_(*io_service), is_completed(false), is_timeout(false)
{
ios_ptr = io_service;
}
hostresolver::~hostresolver(void)
{
}
void hostresolver::start_resolve(const char* hostname, int timeout_second)
{
this->hostname.assign(hostname);
timer_.expires_from_now(boost::posix_time::seconds(timeout_second));
timer_.async_wait(boost::bind(&hostresolver::on_timeout, this, _1));
boost::asio::ip::tcp::resolver::query query(hostname, "http");
resolver_.async_resolve(query,
boost::bind(&hostresolver::finish_resolve, this,
boost::asio::placeholders::error,
boost::asio::placeholders::iterator));
do
{
ios_ptr->run_one();
}
while (!is_completed);
}
void hostresolver::stop()
{
resolver_.cancel();
timer_.cancel();
is_completed = true;
}
void hostresolver::on_timeout(const boost::system::error_code &err)
{
if ((!err) && (err != boost::asio::error::operation_aborted))
{
is_timeout = true;
stop();
}
}
void hostresolver::finish_resolve(const boost::system::error_code& err, boost::asio::ip::tcp::resolver::iterator endpoint_iterator)
{
if (!err)
{
while (endpoint_iterator != boost::asio::ip::tcp::resolver::iterator())
{
boost::asio::ip::tcp::endpoint endpoint = *endpoint_iterator;
if (endpoint.address().is_v4())
{
result.insert(endpoint.address().to_v4().to_ulong());
}
endpoint_iterator++;
}
}
stop();
}
File main.cpp
#include "stdafx.h"
#include "hostresolver.h"
int _tmain(int argc, _TCHAR* argv[])
{
boost::asio::io_service ios;
for (int i = 0; i < 2; i++)
{
std::cout << "iteration: " << i << std::endl;
{
hostresolver hres(&ios);
hres.start_resolve("localhost", 1);
if (hres.result.size() == 0)
std::cout << "failed" << std::endl;
}
{
hostresolver hres(&ios);
hres.start_resolve("google.usd", 1);
}
}
return 0;
}
After returning from run_once, the io_service would most likely enteres the "stopped" state. Thus, you should call ios_ptr->reset() prior to calling run_once() again. Quoting from run_once reference:
Return Value: The number of handlers that were executed. A zero return
value implies that the io_service object is stopped (the stopped()
function returns true). Subsequent calls to run(), run_one(), poll()
or poll_one() will return immediately unless there is a prior call to
reset().