Is there a way to read from a streambuf without removing the bytes?
I'm reading a 'message size' field from the buffer to check if the whole message was received.
If not, I'm posting another async read to get it, but the handler then has no way to know how long the message was supposed to be - because the size field was removed.
Any help appreciated!
E.g.
boost::asio::streambuf _buffer;

void onReceive(const boost::system::error_code& e, std::size_t bytesTransferred)
{
    if (e) return;

    if (_buffer.size() > 0)
    {
        // Partial message was previously received, but I don't know how long.
    }
    else
    {
        _buffer.commit(bytesTransferred);

        /* Read the size (and remove it from the stream) */
        unsigned short size = 0;
        std::istream in(&_buffer);
        in.read((char*)&size, sizeof(unsigned short));

        /* Got the whole message? */
        if (_buffer.size() > size)
        {
            /* Yes. */
        }
        else
        {
            /* No - read the rest. */
            boost::asio::async_read(/*...*/);
        }
    }
}
You can use async_read to initiate a read using the size of the message header, and then adjust it in a 'completion condition' callback, like so:
typedef boost::system::error_code error_code;

template <typename Stream, typename Message>
void MessageReader<Stream, Message>::startRead()
{
    readBuffer = allocateMsg();
    async_read(stream,
        boost::asio::buffer(readBuffer.get(), sizeof(*readBuffer)),
        boost::bind(&MessageReader<Stream, Message>::bytesToRead, this,
                    boost::asio::placeholders::error,
                    boost::asio::placeholders::bytes_transferred),
        boost::bind(&MessageReader<Stream, Message>::readDone, this,
                    boost::asio::placeholders::error,
                    boost::asio::placeholders::bytes_transferred));
}
template <typename Stream, typename Message>
size_t MessageReader<Stream, Message>::bytesToRead(const error_code& error,
                                                   size_t bytes_read)
{
    size_t result;
    if (error)
        result = 0; // error - stop reading
    else if (bytes_read < sizeof(CmnMessageHeader))
        result = sizeof(CmnMessageHeader) - bytes_read; // read rest of header
    else if (readBuffer->header.byteCount > sizeof(*readBuffer))
        result = 0; // bad byte count
    else
        result = readBuffer->header.byteCount - bytes_read; // read message body
    return result;
}
template <typename Stream, typename Message>
void MessageReader<Stream, Message>::readDone(const error_code& error,
                                              size_t bytes_read)
{
    if (error)
    {
        // Covers both no_such_file_or_directory and any other real error.
        if (error.value() != boost::system::errc::operation_canceled)
        {
            notifyStop();
        }
        // else the operation was cancelled, thus no stop notification is
        // needed and we can merely return
    }
    else if (bytes_read != readBuffer->header.byteCount)
    {
        LOG4CXX_ERROR(logger, "Message byte count mismatch");
        notifyStop();
    }
    else
    {
        handleMsg(readBuffer);
        startRead();
    }
}
EDIT: Added typedef for error_code.
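For context, here's a minimal sketch of the declarations these snippets assume. The names are taken from the code above, but the exact layout is a guess (in particular, Message is presumed to begin with a CmnMessageHeader whose byteCount holds the total message length, and the smart-pointer type behind readBuffer/allocateMsg() is assumed):
struct CmnMessageHeader
{
    uint16_t byteCount; // total message length, header included (assumed)
    // ... other header fields
};

template <typename Stream, typename Message>
class MessageReader
{
public:
    explicit MessageReader(Stream& s) : stream(s) {}
    void startRead();

private:
    size_t bytesToRead(const error_code& error, size_t bytes_read);
    void readDone(const error_code& error, size_t bytes_read);
    boost::shared_ptr<Message> allocateMsg(); // returns a fresh Message-sized buffer
    void handleMsg(boost::shared_ptr<Message> msg);
    void notifyStop();

    Stream& stream;
    boost::shared_ptr<Message> readBuffer;
};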
I did this yesterday, so I thought I'd offer my solution...
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <boost/asio.hpp>
#include <boost/asio/streambuf.hpp>
void ReadFromStreambuf()
{
    boost::asio::streambuf mybuffer;

    // write some data to the buffer
    std::ostream o2buffer(&mybuffer);
    o2buffer << "hello stackoverflow";

    // get buffer size
    size_t nBufferSize = boost::asio::buffer_size(mybuffer.data());

    // get const buffer
    std::stringstream ssOut;
    boost::asio::streambuf::const_buffers_type constBuffer = mybuffer.data();

    // copy const buffer to stringstream, then output
    std::copy(
        boost::asio::buffers_begin(constBuffer),
        boost::asio::buffers_begin(constBuffer) + nBufferSize,
        std::ostream_iterator<char>(ssOut));

    std::cout << ssOut.str() << "\n";
}

int main(int argc, char const *argv[])
{
    ReadFromStreambuf();
    return 0;
}
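The same data()/buffers_begin() technique answers the original question directly: you can peek at a binary size field without consuming it, and only consume once the whole message has arrived. A small sketch (same includes as above; the 2-byte size prefix matches the question's example):
unsigned short peekSize(boost::asio::streambuf& buf)
{
    unsigned short size = 0;
    if (buf.size() >= sizeof size)
    {
        boost::asio::streambuf::const_buffers_type data = buf.data();
        std::copy(boost::asio::buffers_begin(data),
                  boost::asio::buffers_begin(data) + sizeof size,
                  reinterpret_cast<char*>(&size)); // the bytes stay in the streambuf
    }
    return size;
}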
There are two approaches you can adopt:
Issue a single read to read the number of bytes for the size (say 4), then issue a read for the required size.
Use the read_some call, and buffer the bytes in your code, say in a vector, and analyse them that way.
I would go for option 2. It does mean copying the buffer, but I would hazard that it is cheaper than multiple read_some calls. A sketch of that approach follows.
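Here's a rough sketch of option 2, assuming the 2-byte size prefix from the question's example; _recvBuf, _accum, and handleMessage are hypothetical names, not part of the question's code:
#include <array>
#include <cstring>
#include <vector>
#include <boost/asio.hpp>

std::array<char, 4096> _recvBuf; // hypothetical: the target of each read
std::vector<char> _accum;        // hypothetical: bytes accumulated so far

void handleMessage(const char* body, unsigned short size); // hypothetical

void onReceive(const boost::system::error_code& e, std::size_t n)
{
    if (e) return;
    _accum.insert(_accum.end(), _recvBuf.data(), _recvBuf.data() + n);

    // Drain as many complete messages as the accumulated bytes contain.
    while (_accum.size() >= sizeof(unsigned short))
    {
        unsigned short size = 0;
        std::memcpy(&size, _accum.data(), sizeof size); // peek, don't consume

        if (_accum.size() < sizeof(unsigned short) + size)
            break; // partial message: keep the size field around for next time

        handleMessage(_accum.data() + sizeof(unsigned short), size);
        _accum.erase(_accum.begin(),
                     _accum.begin() + sizeof(unsigned short) + size);
    }
    // ... post the next async read into _recvBuf here
}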
Related
I'm working on an RS485 communication class and I'm trying to make a function that reads until a certain char is on the line, but with a timeout. The problem is that my system timer returns immediately, no matter which timeout I enter.

I tried changing the timer to be a member variable of the class, so it doesn't go out of scope, but that wasn't the problem. I tried different implementations of timers (mostly deadline_timer), but that didn't help. If I remove the timer from the code, the read succeeds, but when I add it, even with a timeout of 10 seconds (which should be waay more than enough), it responds with an immediate timeout.
I tried making a simple version of the class here, but I guess that the options mostly depend on the type of machine you're talking to:
class RS485CommunicationLayer final {
public:
    RS485CommunicationLayer(
        const std::string& path,
        /* options */
    ) : io(), port(io), timer(port.get_io_service()) {
        open(/* options */);
    };

    std::size_t write(const char* const buffer, const size_t size) {
        /*impl*/
    }

    // THIS FUNCTION --v
    void readUntil(std::vector<char>& buffer, char delim, std::chrono::microseconds timeout) {
        boost::optional<boost::system::error_code> timer_result;
        boost::optional<boost::system::error_code> read_result;

        port.get_io_service().reset();

        timer.expires_from_now(timeout);

        boost::asio::async_read_until(port, asio::dynamic_buffer(buffer), delim,
            [&read_result](const boost::system::error_code& error, size_t) { read_result.reset(error); });

        timer.async_wait(
            [&timer_result](const boost::system::error_code& error) { timer_result.reset(error); });

        while (port.get_io_service().run_one())
        {
            if (read_result)
                timer.cancel();
            else if (timer_result) {
                port.cancel();
            }
        }

        if (read_result)
            throw boost::system::system_error(*read_result);
    };

private:
    asio::io_context io;
    asio::serial_port port;
    boost::asio::system_timer timer;

    void open(/*args*/) {
        port.open(path);
        /*set options*/
    }
};
Edit:
I also tried the following implementation after finding out that run_for() exists. But then the buffer stays empty weirdly enough.
void RS485CommunicationLayer::readUntil(std::vector<char>& buffer, char delim, std::chrono::microseconds timeout) {
    boost::optional<boost::system::error_code> read_result;

    boost::asio::async_read_until(port, asio::dynamic_buffer(buffer), delim,
        [&read_result](const boost::system::error_code& error, size_t) { read_result.reset(error); });

    port.get_io_service().run_for(timeout);

    if (read_result)
        throw boost::system::system_error(*read_result);
}
First off, get_io_service() indicates a Very Old(TM) boost version. Also, it just returns io.
Secondly, why so complicated? I don't even really have the energy to see whether there is a subtle problem with the run_one() loop (it looks fine at a glance).
I'd simplify:
size_t readUntil(std::vector<char>& buffer, char delim,
                 std::chrono::microseconds timeout) {
    error_code read_result;
    size_t     msglen = 0;

    io.reset();
    asio::system_timer timer(io, timeout);

    asio::async_read_until(port, asio::dynamic_buffer(buffer), delim,
                           [&](error_code ec, size_t n) {
                               timer.cancel();
                               read_result = ec;
                               msglen      = n;
                           });

    timer.async_wait([&](error_code ec) { if (!ec) port.cancel(); });

    io.run();

    if (read_result)
        boost::throw_with_location(boost::system::system_error(read_result),
                                   read_result.location());
    return msglen;
}
You can just cancel the complementary IO object from the respective completion handlers.
The timer is per-op and local to the readUntil, so it doesn't have to be a member.
Let's also throw in the write side, which is all of:
size_t write(char const* const data, const size_t size) {
    return asio::write(port, asio::buffer(data, size));
}
And I can demo it working:
Live On Coliru
#include <boost/asio.hpp>
#include <iomanip>
#include <iostream>

namespace asio = boost::asio;
using boost::system::error_code;
using namespace std::chrono_literals;

class RS485CommunicationLayer final {
public:
    RS485CommunicationLayer(std::string const& path) : io(), port(io) { open(path); }

    size_t write(char const* const data, const size_t size) {
        return asio::write(port, asio::buffer(data, size));
    }

    size_t readUntil(std::vector<char>& buffer, char delim,
                     std::chrono::microseconds timeout) {
        error_code read_result;
        size_t     msglen = 0;

        io.reset();
        asio::system_timer timer(io, timeout);

        asio::async_read_until(port, asio::dynamic_buffer(buffer), delim,
                               [&](error_code ec, size_t n) {
                                   timer.cancel();
                                   read_result = ec;
                                   msglen      = n;
                               });

        timer.async_wait([&](error_code ec) { if (!ec) port.cancel(); });

        io.run();

        if (read_result)
            boost::throw_with_location(boost::system::system_error(read_result),
                                       read_result.location());
        return msglen;
    }

private:
    asio::io_context io;
    asio::serial_port port;

    void open(std::string path) {
        port.open(path);
        /*set options*/
    }
    void close();
};

int main(int argc, char** argv) {
    RS485CommunicationLayer comm(argc > 1 ? argv[1] : "");

    comm.write("Hello world\n", 12);

    for (std::vector<char> response_buffer;
         auto len = comm.readUntil(response_buffer, '\n', 100ms);) //
    {
        std::cout << "Received " << response_buffer.size() << " bytes, next "
                  << quoted(std::string_view(response_buffer.data(), len - 1))
                  << std::endl;

        // consume
        response_buffer.erase(begin(response_buffer), begin(response_buffer) + len);
    }
}
Demo locally with a socat PTS tunnel:
socat -d -d pty,raw,echo=0 pty,raw,echo=0
And throwing dictionaries at the other end:
while true; do cat /etc/dictionaries-common/words ; done | pv > /dev/pts/10
I have been working on an HTTP server. When I try to read the body part of the request, it just gets stuck. Here are snippets of my code.
This part reads up to the end of the headers and calls the parser:
void Connection::start_operation() {
    parser_ = new Parser(shared_from_this());
    if (sock_.is_open()) {
        //! TODO limit header size
        asio::async_read_until(
            sock_, request_stream_, "\r\n\r\n",
            [me = shared_from_this()](system::error_code ec, const size_t bytes) {
                if (!ec && bytes) {
                    std::unique_ptr<char> req(new char[bytes]);
                    me->input_stream.read(req.get(), bytes);
                    me->parser_->parser_init(std::string(req.get()));
                } else {
                    error::print(ec);
                    return;
                }
            });
    }
}
The parser calls read_body when it parses the Content-Length field:
template <typename Callback>
void Connection::internal_read_body(std::string &req, size_t bytes, Callback call) {
    std::unique_ptr<char> buff_space(new char[BUFFER_SIZE]);
    if (sock_.is_open()) {
        std::cout << "Here" << std::endl;
        sock_.async_read_some(
            boost::asio::buffer(buff_space.get(), BUFFER_SIZE),
            [me_ = shared_from_this(), bytes, &req, buff_space = std::move(buff_space), call](
                boost::system::error_code ec, const size_t &bytes_read) mutable {
                std::cout << "$$$ " << buff_space.get() << std::endl;
                req.append(buff_space.get(), bytes_read);
                bytes -= bytes_read;
                if (bytes > 0)
                    me_->internal_read_body(req, bytes, call);
                else
                    call();
            });
    }
}

void Connection::read_left_over(std::string &req, size_t &bytes) {
    std::string extra_data;
    input_stream >> extra_data;
    bytes -= size_t(extra_data.size());
    req.append(std::move(extra_data));
}

template <typename Callback>
void read_body(std::string &req, size_t bytes, Callback call) {
    // reserve so much space in the string
    req.reserve(bytes);
    // reading from remaining data in the streambuf
    read_left_over(req, bytes);
    ioc_.dispatch(
        [this, &req, bytes, call] {
            internal_read_body(req, bytes, call);
        });
}
The code gets stuck in async_read_some after printing "Here". I'm not sure what I'm doing wrong or how to debug this issue. Any help or hint on the cause of the error would be appreciated. Thanks!
I've got a problem where boost::asio::async_read fails on the second call, in a strange way:
std::atomic<error_code> ec(asio::error::would_block);
size_t len = 0;

// 1st call
asio::async_read(socket,
    asio::buffer(buffer+X),
    asio::transfer_exactly(512-X),
    [&](error_code const& err, size_t bytesTransferred)
    {
        len = bytesTransferred;
        ec.store(err, std::memory_order_release);
    }
);

/////// ... wait for read to complete ...

// 2nd call
asio::async_read(socket,
    asio::buffer(buffer),
    asio::transfer_exactly(512),
    [&](error_code const& err, size_t bytesTransferred)
    {
        len = bytesTransferred;
        ec.store(err, std::memory_order_release);
    }
);
The constant X is because I have some data already that I got in another way,
so the first read is smaller. Say X=364, then bytesTransferred will be 148 the first time around. My problem however is that the second read again returns 148 bytes, even though that read was for 512 bytes exactly.
I'm baffled. The second call doesn't have an error condition (I checked err). bytesTransferred is an argument passed by async_read to me, and it's 148 bytes twice. The first time, it matches the asio::transfer_exactly(148) higher up on the stack. The second time the call stack clearly has an asio::transfer_exactly(512). What is going on here?
It's particular to that second call, though. The third call again reads 512 bytes, and also gets 512 bytes.
[MCVE]
#include <iostream>
#include <atomic>
#include <vector>
#include <stdexcept>
#include <boost/asio/buffer.hpp>
#include <boost/asio/ip/tcp.hpp>
#include <boost/asio/write.hpp>
#include <boost/asio/read_until.hpp>
#include <boost/asio/read.hpp>
#include <boost/asio/deadline_timer.hpp>

// Minimal example, code that works has error checking removed. Class members turned into globals etc.
namespace {
    boost::asio::io_service io_service;
    boost::asio::ip::tcp::resolver resolver(io_service);
    boost::asio::ip::tcp::socket sock(io_service);
    std::vector<char> data(512);
    boost::asio::mutable_buffers_1 buffer(&data[0], data.size());
    unsigned read_counter = 1;
    std::atomic<unsigned> read_timeout;
}

boost::system::error_code openSocket(const std::string &server,
                                     const std::string &port)
{
    boost::system::error_code error = boost::asio::error::host_not_found;
    using boost::asio::ip::tcp;
    tcp::resolver::query query(server, port);
    tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
    tcp::resolver::iterator end;
    while (error && endpoint_iterator != end)
    {
        sock.close();
        sock.connect(*endpoint_iterator++, error);
    }
    if (error)
    {
        std::cerr << "No route\n";
        sock.close(); // Would be wrong to leave it open.
    }
    return error;
}

int read(size_t bytesNeeded)
{
    size_t buffer_len = boost::asio::buffer_size(buffer);
    size_t byteShift = buffer_len - bytesNeeded; // Read into back of buffer.

    const int timeoutSeconds = 10;
    boost::asio::deadline_timer deadline(io_service);
    deadline.expires_from_now(boost::posix_time::seconds(timeoutSeconds)); // This will reset any outstanding timer

    read_counter += 2; // If we'd use +1, after 4 billion cycles it would reset to 0
    read_timeout.store(0, std::memory_order_release); // 0 = no timeout.
    unsigned read_counter_copy = read_counter; // Can't capture global.
    deadline.async_wait([read_counter_copy](boost::system::error_code const&) {
        // read_timeout is very intentionally captured by value - timeout events are numbered
        read_timeout.store(read_counter_copy, std::memory_order_release);
    });

    // Start reading "asynchronously", wait for completion or timeout:
    std::atomic<boost::system::error_code> ec(boost::asio::error::would_block);
    size_t len = 0;
    boost::asio::async_read(sock, boost::asio::buffer(buffer + byteShift), boost::asio::transfer_exactly(bytesNeeded),
        [&, bytesNeeded](boost::system::error_code const& err, size_t bytesTransferred)
        {
            if (bytesTransferred != bytesNeeded) {
                std::cout << bytesTransferred << " , " << err.message() << std::endl;
            }
            len = bytesTransferred;
            ec.store(err, std::memory_order_release);
        }
    );

    do {
        io_service.run_one();
    } while (read_timeout.load(std::memory_order_acquire) != read_counter && // Continue if the **last** read didn't time out
             (ec.load(std::memory_order_acquire) == boost::asio::error::would_block) && // ec.store() not called,
             !io_service.stopped()); // and program still running.

    deadline.cancel(); // This will set read_timeout, if it wasn't set yet. But we ignore it from now on.

    if (ec.load(std::memory_order_acquire))
    {
        std::cerr << "oops\n"; // Real error handling omitted.
        throw std::runtime_error("");
    }
    else if (read_timeout == read_counter)
    {
        std::cerr << "timeout\n";
    }
    else if (len != bytesNeeded)
    {
        // This is the real problem.
        std::cerr << "Asked " << bytesNeeded << " got " << len;
    }
    return (int)len;
}

int main(int argc, char* argv[])
{
    do try {
        ::openSocket("192.168.6.30", "80");
        read(148); // Assume that data[] already has 364 bytes on the first call.
        for (;;)
        {
            read(512); // Full buffers on every subsequent call.
            // Do something with data[] here.
        }
    }
    catch (std::runtime_error) { } while (true);
}
The do try catch while is necessary because the error only happens after I unplug the other side. After the second call to read(148), the next read(512) fails.
[update]
It's not just transfer_exactly. With transfer_at_least(512) I also get the same problem, one superfluous 148 byte read. (The two should behave the same, as reading at least 512 bytes into a buffer that's only 512 bytes cannot read more or less bytes)
"Solved" it for now by ignoring the incorrect read operation. I was lucky in that I could deal with an unknown amount of missing data, and resync with the stream later on. But it looks like I will have to drop Boost::Asio in the future when I can no longer tolerate missed data.
I'd like to use the method "read_some()" of boost::asio::ip::tcp::socket to fill a buffer represented as a char*.
Here is my method implementation so far:
template<class charType>
int receive_some(charType* buffer, int size)
{
    int total_received = 0;
    boost::array<charType, size> buf;
    while (1)
    {
        int received = 0;
        boost::system::error_code error;
        received = _socket.read_some(boost::asio::buffer(buf), error);
        if (error == boost::asio::error::eof)
        {
            break;
        }
        std::cout.write(buf.data(), received);
        total_received += received;
    }
    return total_received;
}
My problem is I don't see how to convert my charType* buffer into the boost::array buf. It seems expensive to iterate over the elements of my boost::array at the end of the process just to fill in the buffer object...
Any idea?
template<class charType>
int receive_some(charType* buffer, int size)
{
    int total_received = 0;
    while (1)
    {
        int received = 0;
        boost::system::error_code error;
        received = _socket.read_some(boost::asio::buffer(buffer, size), error);
        if (error == boost::asio::error::eof)
        {
            break;
        }
        std::cout.write(buffer, received);
        total_received += received;
    }
    return total_received;
}
The boost::asio::buffer function has a lot of overloads that let you create an asio buffer from different types of sources.
It's worth noting that size has to be the number of bytes to read into buffer, not the number of charType elements (see the illustration below).
Bonus tip: as comments pointed out, that template is suspicious. The best you could do with it is write directly into wide strings, but that probably belongs somewhere other than a read_some function (actually, it might be better nowhere). In a network function you deal with bytes, not characters, so you'd better take a plain char* or even void* as the type of the buffer parameter.
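To illustrate the byte-count point with a contrived wide-character example (illustrative only, not from the code above):
wchar_t wbuf[128];

// The array overload deduces the size in bytes: 128 * sizeof(wchar_t).
auto whole = boost::asio::buffer(wbuf);

// With a raw pointer you must pass the byte count yourself,
// not the element count:
auto manual = boost::asio::buffer(static_cast<void*>(wbuf),
                                  128 * sizeof(wchar_t));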
I am implementing a small distributed system that consists of N machines. Each of them receives some data from some remote server and then propagates the data to the other n-1 fellow machines. I am using the Boost Asio async_read and async_write to implement this. I set up a test cluster of N=30 machines. When I tried smaller datasets (receiving 75KB to 750KB per machine), the program always worked. But when I moved on to just a slightly larger dataset (7.5MB), I observed strange behavior: at the beginning, reads and writes happened as expected, but after a while, some machines hung while others finished; the number of machines that hung varied with each run.

I tried to print out some messages in each handler and found that for the machines that hung, async_read basically could not read successfully after a while, so nothing could proceed afterwards. I checked the remote servers, and they all finished writing. I have tried using a strand to control the order of execution of the async reads and writes, and I also tried using different io_services for reading and writing. Neither solved the problem. I am pretty desperate. Can anyone help me?
Here is the code for the class that does the read and propagation:
const int TRANS_TUPLE_SIZE=15;
const int TRANS_BUFFER_SIZE=5120/TRANS_TUPLE_SIZE*TRANS_TUPLE_SIZE;

class Asio_Trans_Broadcaster
{
private:
    char buffer[TRANS_BUFFER_SIZE];
    int node_id;
    int mpi_size;
    int mpi_rank;
    boost::asio::ip::tcp::socket* dbsocket;
    boost::asio::ip::tcp::socket** sender_sockets;
    int n_send;
    boost::mutex mutex;
    bool done;

public:
    Asio_Trans_Broadcaster(boost::asio::ip::tcp::socket* dbskt, boost::asio::ip::tcp::socket** senderskts,
                           int msize, int mrank, int id)
    {
        dbsocket=dbskt;
        count=0;
        node_id=id;
        mpi_size=mpi_rank=-1;
        sender_sockets=senderskts;
        mpi_size=msize;
        mpi_rank=mrank;
        n_send=-1;
        done=false;
    }

    static std::size_t completion_condition(const boost::system::error_code& error, std::size_t bytes_transferred)
    {
        int remain=bytes_transferred%TRANS_TUPLE_SIZE;
        if(remain==0 && bytes_transferred>0)
            return 0;
        else
            return TRANS_BUFFER_SIZE-bytes_transferred;
    }

    void write_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
    {
        int n=-1;
        mutex.lock();
        n_send--;
        n=n_send;
        mutex.unlock();

        fprintf(stdout, "~~~~~~ #%d, write_handler: %d bytes, copies_to_send: %d\n",
                node_id, bytes_transferred, n);

        if(n==0 && !done)
            boost::asio::async_read(*dbsocket,
                boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
                Asio_Trans_Broadcaster::completion_condition,
                boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
                            boost::asio::placeholders::error,
                            boost::asio::placeholders::bytes_transferred));
    }

    void broadcast_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
    {
        fprintf(stdout, "#%d, broadcast_handler: %d bytes, mpi_size:%d, mpi_rank: %d\n", node_id, bytes_transferred, mpi_size, mpi_rank);
        if (!ec)
        {
            int pos=0;
            while(pos<bytes_transferred && pos<TRANS_BUFFER_SIZE)
            {
                int id=-1;
                memcpy(&id, &buffer[pos], 4);
                if(id<0)
                {
                    done=true;
                    fprintf(stdout, "#%d, broadcast_handler: done!\n", mpi_rank);
                    break;
                }
                pos+=TRANS_TUPLE_SIZE;
            }

            mutex.lock();
            n_send=mpi_size-1;
            mutex.unlock();

            for(int i=0; i<mpi_size; i++)
                if(i!=mpi_rank)
                {
                    boost::asio::async_write(*sender_sockets[i], boost::asio::buffer(buffer, bytes_transferred),
                        boost::bind(&Asio_Trans_Broadcaster::write_handler, this,
                                    boost::asio::placeholders::error,
                                    boost::asio::placeholders::bytes_transferred));
                }
        }
        else
        {
            cerr<<mpi_rank<<" error: "<<ec.message()<<endl;
            delete this;
        }
    }

    void broadcast()
    {
        boost::asio::async_read(*dbsocket,
            boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
            Asio_Trans_Broadcaster::completion_condition,
            boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
                        boost::asio::placeholders::error,
                        boost::asio::placeholders::bytes_transferred));
    }
};
Here is the main code running on each machine:
int N=30;
boost::asio::io_service* sender_io_service=new boost::asio::io_service();
boost::asio::io_service::work* p_work=new boost::asio::io_service::work(*sender_io_service);

boost::thread_group send_thread_pool;
for(int i=0; i<NUM_THREADS; i++)
{
    send_thread_pool.create_thread( boost::bind( & boost::asio::io_service::run, sender_io_service ) );
}

boost::asio::io_service* receiver_io_service=new boost::asio::io_service();
shared_ptr<boost::asio::io_service::work> p_work2(new boost::asio::io_service::work(*receiver_io_service));

boost::thread_group thread_pool2;
thread_pool2.create_thread( boost::bind( & boost::asio::io_service::run, receiver_io_service) );

boost::asio::ip::tcp::socket* receiver_socket;
//establish nonblocking connection with remote server
AsioConnectToRemote(5000, 1, receiver_io_service, receiver_socket, true);

boost::asio::ip::tcp::socket* send_sockets[N];
//establish blocking connection with other machines
hadoopNodes = SetupAsioConnectionsWIthOthers(sender_io_service, send_sockets, hostFileName, mpi_rank, mpi_size, 3000, false);

Asio_Trans_Broadcaster* db_receiver=new Asio_Trans_Broadcaster(receiver_socket, send_sockets,
                                                               mpi_size, mpi_rank, mpi_rank);

db_receiver->broadcast();
p_work2.reset();
thread_pool2.join_all();
delete p_work;
send_thread_pool.join_all();
I don't know what your code is trying to achieve. There are too many missing bits.
Of course, if the task is to asynchronously send/receive traffic on network sockets, Asio is just the thing for that. It's hard to see what's special about your code.
I'd suggest to clean up the more obvious problems:
there's (almost) no error handling (check your error_code-s!)
unless you're on a funny platform, your format strings should use %lu for size_t
why do you mess around with raw arrays, with possibly bad sizes, when you can just have a vector?
never assume the size of objects if you can use sizeof:
memcpy(&id, &trans_buffer[pos], sizeof(id));
come to think of it, it looks like the indexing of buffer is unsafe anyways:
while(pos < bytes_transferred && pos < TRANS_BUFFER_SIZE)
{
    int id = -1;
    memcpy(&id, &buffer[pos], sizeof(id));
If e.g. pos == TRANS_BUFFER_SIZE-1 here, the memcpy invokes Undefined Behaviour. A possible fix is sketched below:
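One way to fix the bound (a sketch; only the loop condition changes, the rest of the loop body is elided):
size_t pos = 0;
while(pos + sizeof(int) <= bytes_transferred && pos + sizeof(int) <= TRANS_BUFFER_SIZE)
{
    int id = -1;
    memcpy(&id, &buffer[pos], sizeof(id)); // now always reads in-bounds
    // ...
    pos += TRANS_TUPLE_SIZE;
}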
why is there so much new going on? You're inviting a hairy class of bugs into your code. As if memory management wasn't the Achilles heel of low-level coding. Use values, or shared pointers. Never delete this. Ever[1]
why is there so much repeated code? Why is one thread pool named after sender and the other thread_pool2? Which contains 1 thread. Eh? Why do you have one work item as a raw pointer, the other as a shared_ptr?
You could just write:
struct service_wrap {
    service_wrap(int threads)
        : work(boost::asio::io_service::work(io_service)) // engage work so run() keeps going until stop()
    {
        while(threads--)
            pool.create_thread(boost::bind(&boost::asio::io_service::run, boost::ref(io_service)));
    }

    ~service_wrap() {
        io_service.post(boost::bind(&service_wrap::stop, this));
        pool.join_all();
    }

private: // mind the initialization order!
    boost::asio::io_service io_service;
    boost::optional<boost::asio::io_service::work> work;
    boost::thread_group pool;

    void stop() {
        work = boost::none;
    }
};
So you can simply write:
service_wrap senders(NUM_THREADS);
service_wrap receivers(1);
Wow. Did you see that? No more chance of error. If you fix one pool, you fix the other automatically. No more delete the first, .reset() the second work item. In short: no more messy code, and less complexity.
Use exception-safe locking guards:
int local_n_send = -1; // not clear naming
{
    boost::lock_guard<boost::mutex> lk(mutex);
    n_send--;
    local_n_send = n_send;
}
the body of broadcast is completely repeated in write_handler(). Why not just call it:
if(local_n_send == 0 && !done)
    broadcast();
I think there's still a race condition - not a data race on the access to n_send itself, but the decision to re-broadcast might be wrong if n_send reaches zero after the lock is released. Now, since broadcast() does only an async operation, you can just do it under the lock and get rid of the race condition:
void write_handler(const error_code &ec, size_t bytes_transferred) {
    boost::lock_guard<boost::mutex> lk(mutex);
    if(!(done || --n_send))
        broadcast();
}
Woop woop. That's three lines of code now. Less code is less bugs.
My guess would be that if you diligently scrub the code like this, you will inevitably find your clues. Think of it like you would look for a lost wedding-ring: you wouldn't leave a mess lying around. Instead, you'd go from room to room and tidy it all up. Throw everything "out" first if need be.
Iff you can make this thing self-contained /and/ reproducible, I'll even debug it further for you!
Cheers
Here's a starting point that I made while looking at the code: Compiling on Coliru
#include <boost/asio.hpp>
#include <boost/thread.hpp>
#include <boost/array.hpp>
#include <boost/make_shared.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <iostream>
const/*expr*/ int TRANS_TUPLE_SIZE = 15;
const/*expr*/ int TRANS_BUFFER_SIZE = 5120 / TRANS_TUPLE_SIZE * TRANS_TUPLE_SIZE;
namespace AsioTrans
{
    using boost::system::error_code;
    using namespace boost::asio;

    typedef ip::tcp::socket socket_t;
    typedef boost::ptr_vector<socket_t> socket_list;

    class Broadcaster
    {
    private:
        boost::array<char, TRANS_BUFFER_SIZE> trans_buffer;
        int node_id;
        int mpi_rank;
        socket_t& dbsocket;
        socket_list& sender_sockets;
        int n_send;
        boost::mutex mutex;
        bool done;

    public:
        Broadcaster(
            socket_t& dbskt,
            socket_list& senderskts,
            int mrank,
            int id) :
            node_id(id),
            mpi_rank(mrank),
            dbsocket(dbskt),
            sender_sockets(senderskts),
            n_send(-1),
            done(false)
        {
            // count=0;
        }

        static size_t completion_condition(const error_code& error, size_t bytes_transferred)
        {
            // TODO FIXME handle error_code here
            int remain = bytes_transferred % TRANS_TUPLE_SIZE;
            if(bytes_transferred && !remain)
            {
                return 0;
            }
            else
            {
                return TRANS_BUFFER_SIZE - bytes_transferred;
            }
        }

        void write_handler(const error_code &ec, size_t bytes_transferred)
        {
            // TODO handle errors
            // TODO check bytes_transferred
            boost::lock_guard<boost::mutex> lk(mutex);
            if(!(done || --n_send))
                broadcast();
        }
        void broadcast_handler(const error_code &ec, size_t bytes_transferred)
        {
            fprintf(stdout, "#%d, broadcast_handler: %lu bytes, mpi_size:%lu, mpi_rank: %d\n", node_id, bytes_transferred, sender_sockets.size(), mpi_rank);
            if(!ec)
            {
                for(size_t pos = 0; (pos < bytes_transferred && pos < TRANS_BUFFER_SIZE); pos += TRANS_TUPLE_SIZE)
                {
                    int id = -1;
                    memcpy(&id, &trans_buffer[pos], sizeof(id));
                    if(id < 0)
                    {
                        done = true;
                        fprintf(stdout, "#%d, broadcast_handler: done!\n", mpi_rank);
                        break;
                    }
                }

                {
                    boost::lock_guard<boost::mutex> lk(mutex);
                    n_send = sender_sockets.size() - 1;
                }

                for(int i = 0; size_t(i) < sender_sockets.size(); i++)
                {
                    if(i != mpi_rank)
                    {
                        async_write(
                            sender_sockets[i],
                            buffer(trans_buffer, bytes_transferred),
                            boost::bind(&Broadcaster::write_handler, this, placeholders::error, placeholders::bytes_transferred));
                    }
                }
            }
            else
            {
                std::cerr << mpi_rank << " error: " << ec.message() << std::endl;
                delete this;
            }
        }
        void broadcast()
        {
            async_read(
                dbsocket,
                buffer(trans_buffer),
                Broadcaster::completion_condition,
                boost::bind(&Broadcaster::broadcast_handler, this,
                            placeholders::error,
                            placeholders::bytes_transferred));
        }
    };
    struct service_wrap {
        service_wrap(int threads)
            : _work(io_service::work(_service)) // engage work so run() keeps going until stop()
        {
            while(threads--)
                _pool.create_thread(boost::bind(&io_service::run, boost::ref(_service)));
        }

        ~service_wrap() {
            _service.post(boost::bind(&service_wrap::stop, this));
            _pool.join_all();
        }

        io_service& service() { return _service; }

    private: // mind the initialization order!
        io_service _service;
        boost::optional<io_service::work> _work;
        boost::thread_group _pool;

        void stop() {
            _work = boost::none;
        }
    };

    extern void AsioConnectToRemote(int, int, io_service&, socket_t&, bool);
    extern void SetupAsioConnectionsWIthOthers(io_service&, socket_list&, std::string, int, bool);
}
int main()
{
    using namespace AsioTrans;

    // there's no use in increasing #threads unless there are blocking operations
    service_wrap senders(boost::thread::hardware_concurrency());
    service_wrap receivers(1);

    socket_t receiver_socket(receivers.service());
    AsioConnectToRemote(5000, 1, receivers.service(), receiver_socket, true);

    socket_list send_sockets(30);
    /*hadoopNodes =*/ SetupAsioConnectionsWIthOthers(senders.service(), send_sockets, "hostFileName", 3000, false);

    int mpi_rank = send_sockets.size();
    AsioTrans::Broadcaster db_receiver(receiver_socket, send_sockets, mpi_rank, mpi_rank);
    db_receiver.broadcast();
}
[1] No exceptions. Except when there's an exception to the no-exceptions rule. Exception-ception.