Streaming file from aws s3 through boost socket - c++

I'm trying to connect via boost ssl socket to aws s3.
It works but when I read, I had several problem.
Corrupted files on the same file but not others.
File not corrupted (md5filter got all the data) but the data sent to the buffer are not good. Meaning there is a problem between the different layers of read somewhere but can't figure out where.
Sometimes the program get stuck in the S3_client::read function and loop thousands of times in the do-while loop calling read. But it never reaches md5filter read.
It get stuck between filterStream.read() and md5filter.read() which is not called. I don't know if it gzip or filterStream. But it only happens if there is no call to the lower layers of read for a while.
Can you help spot the problem in my code ?
#ifndef BTLOOP_AWSCLIENT_H
#define BTLOOP_AWSCLIENT_H
#include "boost/iostreams/filter/gzip.hpp"
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/categories.hpp>
#include <boost/iostreams/stream.hpp>
#include <string>
#include <set>
#include <map>
#include <openssl/md5.h>
#include <sstream>
#include <fstream>
#include <iostream>
#include <boost/asio.hpp>
#include <boost/asio/ssl.hpp>
#include "Logger.h"
namespace io = boost::iostreams;
namespace asio = boost::asio;
namespace ssl = boost::asio::ssl;
typedef ssl::stream<asio::ip::tcp::socket> ssl_socket;
namespace S3Reader
{
class MD5Filter
{
public:
typedef char char_type;
struct category :
io::multichar_input_filter_tag{};
MD5Filter( std::streamsize n );
~MD5Filter();
template<typename Source>
std::streamsize read( Source& src, char* s, std::streamsize n );
void setBigFileMode() { _bigFileMode = true; }
std::string close();
void setFileName( std::string fileName ) { _fileName = fileName; };
inline std::streamsize writtenBytes() {std::streamsize res = _writtenBytes; _writtenBytes= 0; return res;};
inline bool eof(){return _eof;};
private:
void computeMd5( char* buffer, size_t size, bool force = false );
private:
bool _bigFileMode;
int _blockCount;
std::vector<unsigned char> _bufferMD5;
MD5_CTX _mdContext;
unsigned char _hashMd5[MD5_DIGEST_LENGTH];
std::string _fileName;
std::streamsize _writtenBytes;
int _totalSize;
bool _eof;
};
class Ssl_wrapper : public io::device<io::bidirectional>
{
public:
Ssl_wrapper( ssl_socket* sock, std::streamsize n ) :
_sock( sock ),_totalSize(0) { };
std::streamsize read( char_type* s, std::streamsize n )
{
boost::system::error_code ec;
size_t rval = _sock->read_some( asio::buffer( s, n ), ec );
_totalSize +=rval;
LOG_AUDIT( " wrapperR: " << rval << " " << _totalSize << " "<<ec.message());
if ( !ec )
{
return rval;
}
else if ( ec == asio::error::eof )
return -1;
else
throw boost::system::system_error( ec, "Wrapper read_some" );
}
std::streamsize write( const char* s, std::streamsize n )
{
boost::system::error_code ec;
size_t rval = _sock->write_some( asio::buffer( s, n ), ec );
if ( !ec )
{
return rval;
}
else if ( ec == asio::error::eof )
return -1;
else
throw boost::system::system_error( ec, " Wrapper read_some" );
}
private:
ssl_socket* _sock;
int _totalSize;
};
class S3_client
{
public:
S3_client( const std::string& key_id, const std::string& key_secret, const std::string& bucket );
virtual ~S3_client();
bool open( const std::string& fileName );
int read( char* buffer, size_t size );
int readLine( char* buffer, size_t size );
void close();
bool eof() { return _filterStream.eof(); }
std::string authorize( const std::string request );
bool connectSocket( std::string url, std::string port, std::string auth );
private :
std::string _key_id;
std::string _key_secret;
std::string _bucket;
std::string _fileName;
io::gzip_decompressor _gzip;
MD5Filter _md5Filter;
boost::posix_time::seconds _timeout;
ssl_socket* _sock;
Ssl_wrapper* _wrapper;
io::stream<Ssl_wrapper>* _sockstream;
std::map<std::string, std::string> _headerMap;
io::filtering_istream _filterStream;
int _totalSize;
};
}
#endif //BTLOOP_AWSCLIENT_H
S3Client.cpp
#include "S3_client.h"
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/counter.hpp>
#include <boost/exception/diagnostic_information.hpp>
#include <system/ArmError.h>
namespace io = boost::iostreams;
namespace asio = boost::asio;
namespace ssl = boost::asio::ssl;
namespace S3Reader
{
static const size_t s3_block_size = 8 * 1024 * 1024;
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64( unsigned char c )
{
return (isalnum( c ) || (c == '+') || (c == '/'));
}
std::string url_encode( const std::string& value )
{
std::ostringstream escaped;
escaped.fill( '0' );
escaped << std::hex;
for ( auto i = value.begin(), n = value.end(); i != n; ++i )
{
auto c = *i;
if ( isalnum( c ) || c == '-' || c == '_' || c == '.' || c == '~' )
{
escaped << c;
continue;
}
escaped << std::uppercase;
escaped << '%' << std::setw( 2 ) << int((unsigned char) c );
escaped << std::nouppercase;
}
return escaped.str();
}
std::string base64_encode( unsigned char const* bytes_to_encode, unsigned int in_len )
{
std::string ret;
int i = 0;
int j = 0;
unsigned char char_array_3[3];
unsigned char char_array_4[4];
while ( in_len-- )
{
char_array_3[i++] = *(bytes_to_encode++);
if ( i == 3 )
{
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
char_array_4[3] = char_array_3[2] & 0x3f;
for ( i = 0; (i < 4); i++ )
ret += base64_chars[char_array_4[i]];
i = 0;
}
}
if ( i )
{
for ( j = i; j < 3; j++ )
char_array_3[j] = '\0';
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
char_array_4[3] = char_array_3[2] & 0x3f;
for ( j = 0; (j < i + 1); j++ )
ret += base64_chars[char_array_4[j]];
while ((i++ < 3))
ret += '=';
}
return ret;
}
std::string to_hex( const uint8_t* buffer, size_t buffer_size )
{
std::stringstream sst;
for ( uint i = 0; i < buffer_size; i++ )
{
sst << std::setw( 2 ) << std::setfill( '0' ) << std::hex << int( buffer[i] );
}
return sst.str();
}
std::string getDateForHeader( bool amzFormat )
{
time_t lt;
time( &lt );
struct tm* tmTmp;
tmTmp = gmtime( &lt );
char buf[50];
if ( amzFormat )
{
strftime( buf, 50, "Date: %a, %d %b %Y %X +0000", tmTmp );
return std::string( buf );
}
else
{
tmTmp->tm_hour++;
//strftime( buf, 50, "%a, %d %b %Y %X +0000", tmTmp );
std::stringstream ss;
ss << mktime( tmTmp );
return ss.str();
}
}
MD5Filter::MD5Filter( std::streamsize n ) :
_bigFileMode( false ), _blockCount( 0 ), _writtenBytes(0), _totalSize( 0 )
{
MD5_Init( &_mdContext );
memset( _hashMd5, 0, MD5_DIGEST_LENGTH );
}
MD5Filter::~MD5Filter()
{
close();
}
template<typename Source>
std::streamsize MD5Filter::read( Source& src, char* s, std::streamsize n )
{
int result =0;
try
{
if ((result = io::read( src, s, n )) == -1 )
{
_eof=true;
LOG_AUDIT( _fileName << " md5R: " << result << " " << _totalSize );
return -1;
}
}
catch ( boost::exception& ex)
{
LOG_ERROR( _fileName <<" "<< boost::diagnostic_information(ex)<< " " << result );
}
computeMd5( s, (size_t) result );
_totalSize += result;
_writtenBytes = result;
LOG_AUDIT( _fileName << " md5R: " << result << " " << _totalSize );
return result;
}
void MD5Filter::computeMd5( char* buffer, size_t size, bool force )
{
size_t realSize = s3_block_size;
uint8_t blockMd5[MD5_DIGEST_LENGTH];
if ( !_bigFileMode )
{
MD5_Update( &_mdContext, buffer, size );
return;
}
if ( size > 0 )
{
_bufferMD5.insert( _bufferMD5.end(), &buffer[0], &buffer[size] );
}
if ((_bufferMD5.size() < s3_block_size) && !force )
return;
if ( force )
realSize = _bufferMD5.size();
MD5( &_bufferMD5[0], realSize, blockMd5 );
MD5_Update( &_mdContext, blockMd5, MD5_DIGEST_LENGTH );
_blockCount++;
if ( _bufferMD5.size() == s3_block_size )
{
_bufferMD5.clear();
return;
}
if ( force )
return;
memcpy( &_bufferMD5[0], &_bufferMD5[s3_block_size], _bufferMD5.size() - s3_block_size );
_bufferMD5.erase( _bufferMD5.begin() + s3_block_size, _bufferMD5.end());
}
std::string MD5Filter::close()
{
std::string mdOutput;
computeMd5( NULL, 0, true );
MD5_Final( _hashMd5, &_mdContext );
mdOutput = to_hex( _hashMd5, MD5_DIGEST_LENGTH );
if ( _bigFileMode )
{
mdOutput += "-" + boost::lexical_cast<std::string>( _blockCount );
}
return mdOutput;
}
std::string S3_client::authorize( const std::string request )
{
unsigned char* digest;
digest = HMAC( EVP_sha1(), _key_secret.c_str(), (int) _key_secret.size(), (unsigned char*) request.c_str(), (int) request.size(), NULL, NULL );
std::string signature( url_encode( base64_encode( digest, 20 )));
return "?AWSAccessKeyId=" + _key_id + "&Expires=" + getDateForHeader( false ) + "&Signature=" + signature;
}
S3_client::S3_client( const std::string& key_id, const std::string& key_secret, const std::string& bucket ) :
_key_id( key_id ), _key_secret( key_secret ), _bucket( bucket ), _gzip( io::gzip::default_window_bits, 1024 * 1024 )
, _md5Filter( s3_block_size ), _timeout( boost::posix_time::seconds( 1 )), _totalSize( 0 ) { }
S3_client::~S3_client()
{
close();
}
bool S3_client::connectSocket( std::string url, std::string port, std::string auth )
{
std::string amzDate = getDateForHeader( true );
std::string host = "url";
boost::asio::io_service io_service;
boost::asio::ip::tcp::resolver resolver( io_service );
boost::asio::ip::tcp::resolver::query query( url, "https" );
auto endpoint = resolver.resolve( query );
// Context with default path
ssl::context ctx( ssl::context::sslv23 );
ctx.set_default_verify_paths();
_sock = new ssl_socket( io_service, ctx );
boost::asio::socket_base::keep_alive option( true );
_wrapper = new Ssl_wrapper( _sock, s3_block_size );
_sockstream = new io::stream<Ssl_wrapper>( boost::ref( *_wrapper ));
asio::connect( _sock->lowest_layer(), endpoint );
_sock->set_verify_mode( ssl::verify_peer );
_sock->set_verify_callback( ssl::rfc2818_verification( url ));
_sock->handshake( ssl_socket::client );
_sock->lowest_layer().set_option( option );
std::stringstream ss;
ss << "GET " << _fileName << auth << " HTTP/1.1\r\n" << "Host: " << host << "\r\nAccept: */*\r\n\r\n";
_sockstream->write( ss.str().c_str(), ss.str().size());
_sockstream->flush();
std::string http_version;
int status_code = 0;
(*_sockstream) >> http_version;
(*_sockstream) >> status_code;
if ( !_sockstream || http_version.substr( 0, 5 ) != "HTTP/" )
{
std::cout << "Invalid response: " << http_version << " " << status_code << std::endl;
return false;
}
if ( status_code != 200 )
{
std::cout << "Response returned with status code " << http_version << " " << status_code << std::endl;
return false;
}
return true;
}
bool S3_client::open( const std::string& fileName )
{
std::string port = "443";
std::string url = "bucket";
std::stringstream authRequest;
std::string date = getDateForHeader( false );
_fileName = fileName;
authRequest << "GET\n\n\n" << date << "\n/" << _bucket << "" << fileName;
std::string auth = authorize( authRequest.str());
if ( !connectSocket( url, port, auth ))
THROW( "Failed to open socket" );
std::string header;
while ( std::getline( *_sockstream, header ) && header != "\r" )
{
std::vector<std::string> vectLine;
boost::split( vectLine, header, boost::is_any_of( ":" ));
if ( vectLine.size() < 2 )
continue;
boost::erase_all( vectLine[1], "\"" );
boost::erase_all( vectLine[1], "\r" );
boost::erase_all( vectLine[1], " " );
_headerMap[vectLine[0]] = vectLine[1];
}
if ( _headerMap.find( "Content-Length" ) == _headerMap.end())
return false;
if ( _headerMap.find( "Content-Type" ) == _headerMap.end())
return false;
if ((uint) std::atoi( _headerMap.at( "Content-Length" ).c_str()) > s3_block_size )
_md5Filter.setBigFileMode();
_md5Filter.setFileName( _fileName );
if ( _headerMap["Content-Type"] == "binary/octet-stream" )
_filterStream.push( _gzip, s3_block_size );
_filterStream.push( boost::ref( _md5Filter ), s3_block_size );
_filterStream.push( boost::ref( *_sockstream ), s3_block_size );
return true;
}
void S3_client::close()
{
std::string localMD5 = _md5Filter.close();
std::string headerMD5 = _headerMap["ETag"];
if ( localMD5 != headerMD5 )
THROW ( "Corrupted file " << _fileName << " " << localMD5 << " " << headerMD5 << "." );
else
LOG_AUDIT( "Close S3: " << _fileName << " " << localMD5 << " " << headerMD5 << "." );
}
int S3_client::readLine( char* buffer, size_t size )
{
_filterStream.getline( buffer, size );
return _filterStream.gcount();
}
int S3_client::read( char* buffer, size_t size )
{
std::streamsize sizeRead = 0;
do
{
_filterStream.read( buffer, size );
sizeRead = _md5Filter.writtenBytes();
_totalSize += sizeRead;
LOG_AUDIT( _fileName << " s3R: " << sizeRead << " " << _totalSize );
}
while( sizeRead ==0 && !_md5Filter.eof() && !_sockstream->eof() && _filterStream.good() && _sock->next_layer().is_open());
return sizeRead;
}
}
int main( int argc, char** argv )
{
S3Reader::S3_client client( key_id, key_secret, s3_bucket );
client.open("MyFile");
while (client.read(buffer, bufferSize) >0 ) {}
}

Related

Boost Asio, console pinger throws exception on socket.send (...)

I'm trying to figure out how ICMP and Boost Asio work. There was a problem sending the packet to the endpoint. The entered url is translated into ip and a socket connection is made. The problem is that when a packet is sent via socket.send (...), an exception is thrown.
Exception: send: Bad address
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <memory>
#include <tuple>
//BOOST
#include <boost/asio.hpp>
#include <boost/program_options.hpp>
#include <boost/log/trivial.hpp>
//CONSTANTS
#define BUFFER_SIZE_64KB 65536
#define TTL_DEFAULT 64
#define ICMP_HDR_SIZE 8
#define LINUX_PAYLOAD_SIZE 56
#define TIME_BYTE_SIZE 4
#define FILL_BYTE 0X8
template <typename T, typename flag_type = int>
using flagged = std::tuple<flag_type, T>;
using namespace boost::asio;
typedef boost::system::error_code error_code;
typedef unsigned char byte;
enum ICMP : uint8_t {
ECHO_REPLY = 0,
UNREACH = 3,
TIME_EXCEEDED = 11,
ECHO_REQUEST = 8
};
enum class IPtype {IPV4, IPV6, BOTH};
struct icmp_header_t {
uint8_t type;
uint8_t code;
uint16_t checksum;
uint16_t id;
uint16_t seq_num;
};
struct ip_header_t {
uint8_t ver_ihl;
uint8_t tos;
uint16_t total_length;
uint16_t id;
uint16_t flags_fo;
uint8_t ttl;
uint8_t protocol;
uint16_t checksum;
uint32_t src_addr;
uint32_t dst_addr;
};
ip_header_t ip_load(std::istream& stream, bool ntoh ) {
ip_header_t header;
stream.read((char*)&header.ver_ihl, sizeof(header.ver_ihl));
stream.read((char*)&header.tos, sizeof(header.tos));
stream.read((char*)&header.total_length, sizeof(header.total_length));
stream.read((char*)&header.id, sizeof(header.id));
stream.read((char*)&header.flags_fo, sizeof(header.flags_fo));
stream.read((char*)&header.ttl, sizeof(header.ttl));
stream.read((char*)&header.protocol, sizeof(header.protocol));
stream.read((char*)&header.checksum, sizeof(header.checksum));
stream.read((char*)&header.src_addr, sizeof(header.src_addr));
stream.read((char*)&header.dst_addr, sizeof(header.dst_addr));
if (ntoh) {
header.total_length = ntohs(header.total_length);
header.id = ntohs(header.id);
header.flags_fo = ntohs(header.flags_fo);
header.checksum = ntohs(header.checksum);
header.src_addr = ntohl(header.src_addr);
header.dst_addr = ntohl(header.dst_addr);
}
return header;
}
icmp_header_t icmp_load(std::istream& stream) {
icmp_header_t header;
stream.read((char*)&header.type, sizeof(header.type));
stream.read((char*)&header.code, sizeof(header.code));
stream.read((char*)&header.checksum, sizeof(header.checksum));
stream.read((char*)&header.id, sizeof(header.id));
stream.read((char*)&header.seq_num, sizeof(header.seq_num));
return header;
}
flagged<ip::icmp::endpoint> sync_icmp_solver(io_service& ios, std::string host,
IPtype type = IPtype::BOTH) noexcept {
ip::icmp::resolver::query query(host, "");
ip::icmp::resolver resl(ios);
ip::icmp::endpoint ep;
error_code ec;
auto it = resl.resolve(query, ec);
if (ec != boost::system::errc::errc_t::success) {
std::cerr << "Error message = " << ec.message() << std::endl;
return std::make_tuple(ec.value(), ep);
}
ip::icmp::resolver::iterator it_end;
//Finds first available ip.
while (it != it_end) {
ip::icmp::endpoint ep = (it++)->endpoint();
auto addr = ep.address();
switch(type) {
case IPtype::IPV4:
if (addr.is_v4()) return std::make_tuple(0, ep);
break;
case IPtype::IPV6:
if(addr.is_v6()) return std::make_tuple(0, ep);
break;
case IPtype::BOTH:
return std::make_tuple(0, ep);
break;
}
}
return std::make_tuple(-1, ep);
}
unsigned short checksum(void *b, int len) {
unsigned short* buf = reinterpret_cast<unsigned short*>(b);
unsigned int sum = 0;
unsigned short result;
for (sum = 0; len > 1; len -= 2 ) {
sum += *buf++;
}
if (len == 1) sum += *(byte*) buf;
sum = (sum >> 16) + (sum & 0xFFFF);
sum += (sum >> 16);
result = ~sum;
return result;
}
unsigned short get_identifier() {
#if defined(BOOST_WINDOWS)
return static_cast<unsigned short>(::GetCurrentProcessId());
#else
return static_cast<unsigned short>(::getpid());
#endif
}
struct PingInfo {
unsigned short seq_num = 0;
size_t time_out;
size_t reply_time = 1;
size_t payload_size = LINUX_PAYLOAD_SIZE;
size_t packets_rec = 0;
size_t packets_trs = 0;
size_t reps = 0;
};
class PingConnection {
private:
ip::icmp::socket sock;
io_service* ios_ptr;
PingInfo* pi_ptr;
ip::icmp::endpoint dst;
boost::posix_time::ptime timestamp;
streambuf input_buf;
deadline_timer deadtime;
//TODO: Check for memleaks.
void write_icmp_req(std::ostream& os) {
byte* pckt = new byte[ICMP_HDR_SIZE + pi_ptr->payload_size];
unsigned short pid = get_identifier();
pckt[0] = 0x8;
pckt[1] = 0x0;
pckt[2] = 0x0;
pckt[3] = 0x0;
pckt[4] = (byte)((pid & 0xF0) >> 4);
pckt[5] = (byte)(pid & 0x0F);
for (size_t i = ICMP_HDR_SIZE; i < ICMP_HDR_SIZE + pi_ptr->payload_size; i++) {
pckt[i] = FILL_BYTE;
}
pckt[6] = (byte)((pi_ptr->seq_num & 0xF0) >> 4);
pckt[7] = (byte)((pi_ptr->seq_num)++ & 0x0F);
unsigned short cs = checksum(pckt, ICMP_HDR_SIZE);
pckt[2] = (byte)((cs & 0xF0) >> 4);
pckt[3] = (byte)(cs & 0x0F);
os << pckt;
delete [] pckt;
}
void pckt_send() {
streambuf buf;
std::ostream os(&buf);
write_icmp_req(os);
timestamp = boost::posix_time::microsec_clock::universal_time();
std::cout << "begin" << std::endl;
sock.send(buf.data());
std::cout << "sock.send(buf.data())" << std::endl;
deadtime.expires_at(timestamp + boost::posix_time::seconds(pi_ptr->time_out));
deadtime.async_wait(std::bind(&PingConnection::req_timeout_callback, this));
}
void req_timeout_callback() {
if (pi_ptr->reps == 0) {
std::cout << "Time Out:echo req" << std::endl;
}
deadtime.expires_at(timestamp + boost::posix_time::seconds(pi_ptr->reply_time));
deadtime.async_wait(std::bind(&PingConnection::pckt_send, this));
}
void pckt_recv() {
std::cout << "pckt_recv" << std::endl;
input_buf.consume(input_buf.size());
sock.async_receive(input_buf.prepare(BUFFER_SIZE_64KB),
std::bind(&PingConnection::recv_timeout_callback, this, std::placeholders::_2));
}
void recv_timeout_callback(size_t sz) {
std::cout << "recv_timeout_callback" << std::endl;
input_buf.commit(sz);
std::istream is(&input_buf);
ip_header_t iph = ip_load(is, false);
icmp_header_t icmph = icmp_load(is);
if (is &&
icmph.type == ECHO_REQUEST &&
icmph.id == get_identifier() &&
icmph.seq_num == pi_ptr->seq_num) {
// If this is the first reply, interrupt the five second timeout.
if (pi_ptr->reps++ == 0) deadtime.cancel();
boost::posix_time::ptime now = boost::posix_time::microsec_clock::universal_time();
std::cout << sz - iph.total_length
<< " bytes from " << iph.src_addr
<< ": icmp_seq=" << icmph.seq_num
<< ", ttl=" << iph.ttl
<< ", time=" << (now - timestamp).total_milliseconds() << " ms"
<< std::endl;
}
pckt_recv();
}
public:
PingConnection(io_service& ios, PingInfo& pi_add) : deadtime(ios), sock(ios) {
pi_ptr = &pi_add;
ios_ptr = &ios;
}
void ping(std::string host) {
int err_flag;
error_code error;
std::tie(err_flag, dst) = sync_icmp_solver(*ios_ptr, host);
if (err_flag) return;
std::cout << dst << std::endl;
sock.connect(dst, error);
if(error) {
return;
}
std::cout << "sock.connect(dst)" << error.message() <<std::endl;
pckt_send();
pckt_recv();
}
};
int main(int argc, char** argv) {
try
{
if (argc < 2) {
std::cerr << "Usage: ping [args]* destination\n";
return -1;
}
io_service ios;
PingInfo pi;
pi.time_out = 56;
PingConnection ping(ios, pi);
ping.ping(argv[1]);
ios.run();
} catch(std::exception& e) {
std::cerr << "Exception: " << e.what() << std::endl;
}
}
socket.send() is called in pckt_send()
For development I use WSL2 and Ubuntu image.

How to remove the � UFT8 character from a char* string? [duplicate]

On Python, there is this option errors='ignore' for the open Python function:
open( '/filepath.txt', 'r', encoding='UTF-8', errors='ignore' )
With this, reading a file with invalid UTF8 characters will replace them with nothing, i.e., they are ignored. For example, a file with the characthers Føö»BÃ¥r is going to be read as FøöBår.
If a line as Føö»BÃ¥r is read with getline() from stdio.h, it will be read as Føö�Bår:
FILE* cfilestream = fopen( "/filepath.txt", "r" );
int linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
while( true )
{
if( getline( &readline, &linebuffersize, cfilestream ) != -1 ) {
std::cerr << "readline=" readline << std::endl;
}
else {
break;
}
}
How can I make stdio.h getline() read it as FøöBår instead of Føö�Bår, i..e, ignoring invalid UTF8 characters?
One overwhelming solution I can think of it do iterate throughout all characters on each line read and build a new readline without any of these characters. For example:
FILE* cfilestream = fopen( "/filepath.txt", "r" );
int linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
int index;
int charsread;
int invalidcharsoffset;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
if( readline[index] != '�' ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
Related questions:
Fixing invalid UTF8 characters
Replacing non UTF8 characters
python replace unicode characters
Python unicode: how to replace character that cannot be decoded using utf8 with whitespace?
You are confusing what you see with what is really going on. The getline function does not do any replacement of characters. [Note 1]
You are seeing a replacement character (U+FFFD) because your console outputs that character when it is asked to render an invalid UTF-8 code. Most consoles will do that if they are in UTF-8 mode; that is, the current locale is UTF-8.
Also, saying that a file contains the "characters Føö»BÃ¥r" is at best imprecise. A file does not really contain characters. It contains byte sequences which may be interpreted as characters -- for example, by a console or other user presentation software which renders them into glyphs -- according to some encoding. Different encodings produce different results; in this particular case, you have a file which was created by software using the Windows-1252 encoding (or, roughly equivalently, ISO 8859-15), and you are rendering it on a console using UTF-8.
What that means is that the data read by getline contains an invalid UTF-8 sequence, but it (probably) does not contain the replacement character code. Based on the character string you present, it contains the hex character \xbb, which is a guillemot (») in Windows code page 1252.
Finding all the invalid UTF-8 sequences in a string read by getline (or any other C library function which reads files) requires scanning the string, but not for a particular code sequence. Rather, you need to decode UTF-8 sequences one at a time, looking for the ones which are not valid. That's not a simple task, but the mbtowc function can help (if you have enabled a UTF-8 locale). As you'll see in the linked manpage, mbtowc returns the number of bytes contained in a valid "multibyte sequence" (which is UTF-8 in a UTF-8 locale), or -1 to indicate an invalid or incomplete sequence. In the scan, you should pass through the bytes in a valid sequence, or remove/ignore the single byte starting an invalid sequence, and then continue the scan until you reach the end of the string.
Here's some lightly-tested example code (in C):
#include <stdlib.h>
#include <string.h>
/* Removes in place any invalid UTF-8 sequences from at most 'len' characters of the
* string pointed to by 's'. (If a NUL byte is encountered, conversion stops.)
* If the length of the converted string is less than 'len', a NUL byte is
* inserted.
* Returns the length of the possibly modified string (with a maximum of 'len'),
* not including the NUL terminator (if any).
* Requires that a UTF-8 locale be active; since there is no way to test for
* this condition, no attempt is made to do so. If the current locale is not UTF-8,
* behaviour is undefined.
*/
size_t remove_bad_utf8(char* s, size_t len) {
char* in = s;
/* Skip over the initial correct sequence. Avoid relying on mbtowc returning
* zero if n is 0, since Posix is not clear whether mbtowc returns 0 or -1.
*/
int seqlen;
while (len && (seqlen = mbtowc(NULL, in, len)) > 0) { len -= seqlen; in += seqlen; }
char* out = in;
if (len && seqlen < 0) {
++in;
--len;
/* If we find an invalid sequence, we need to start shifting correct sequences. */
for (; len; in += seqlen, len -= seqlen) {
seqlen = mbtowc(NULL, in, len);
if (seqlen > 0) {
/* Shift the valid sequence (if one was found) */
memmove(out, in, seqlen);
out += seqlen;
}
else if (seqlen < 0) seqlen = 1;
else /* (seqlen == 0) */ break;
}
*out++ = 0;
}
return out - s;
}
Notes
Aside from the possible line-end transformation of the underlying I/O library, which will replace CR-LF with a single \n on systems like Windows where the two character CR-LF sequence is used as a line-end indication.
As #rici well explains in his answer, there can be several invalid UTF-8 sequences in a byte sequence.
Possibly iconv(3) could be worth a look, e.g. see https://linux.die.net/man/3/iconv_open.
When the string "//IGNORE" is appended to tocode, characters that cannot be represented in the target character set will be silently discarded.
Example
This byte sequence, if interpreted as UTF-8, contains some invalid UTF-8:
"some invalid\xFE\xFE\xFF\xFF stuff"
If you display this you would see something like
some invalid���� stuff
When this string passes through the remove_invalid_utf8 function in the following C program, the invalid UTF-8 bytes are removed using the iconv function mentioned above.
So the result is then:
some invalid stuff
C Program
#include <stdio.h>
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
char *remove_invalid_utf8(char *utf8, size_t len) {
size_t inbytes_len = len;
char *inbuf = utf8;
size_t outbytes_len = len;
char *result = calloc(outbytes_len + 1, sizeof(char));
char *outbuf = result;
iconv_t cd = iconv_open("UTF-8//IGNORE", "UTF-8");
if(cd == (iconv_t)-1) {
perror("iconv_open");
}
if(iconv(cd, &inbuf, &inbytes_len, &outbuf, &outbytes_len)) {
perror("iconv");
}
iconv_close(cd);
return result;
}
int main() {
char *utf8 = "some invalid\xFE\xFE\xFF\xFF stuff";
char *converted = remove_invalid_utf8(utf8, strlen(utf8));
printf("converted: %s to %s\n", utf8, converted);
free(converted);
return 0;
}
I also managed to fix it by trailing/cutting down all Non-ASCII characters.
This one takes about 2.6 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
int index;
int charsread;
int invalidcharsoffset;
unsigned int fixedchar;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "index " << std::setw(3) << index
// << " readline " << std::setw(10) << fixedchar
// << " -> '" << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
fixedreadline[index-invalidcharsoffset] = '\0';
// std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Alternative and slower version using memcpy
Using menmove does not improve much speed, so you could either one.
This one takes about 3.1 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
char* finalresult;
int index;
int lastcopy;
int charsread;
int charstocopy;
int invalidcharsoffset;
bool hasignoredbytes;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
hasignoredbytes = false;
source = readline;
destination = fixedreadline;
lastcopy = 0;
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "fixedchar " << std::setw(10)
// << fixedchar << " -> '"
// << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
if( hasignoredbytes ) {
charstocopy = index - lastcopy - invalidcharsoffset;
memcpy( destination, source, charstocopy );
source += index - lastcopy;
lastcopy = index;
destination += charstocopy;
invalidcharsoffset = 0;
hasignoredbytes = false;
}
}
else {
++invalidcharsoffset;
hasignoredbytes = true;
}
}
if( destination != fixedreadline ) {
charstocopy = charsread - static_cast<int>( source - readline )
- invalidcharsoffset;
memcpy( destination, source, charstocopy );
destination += charstocopy - 1;
if( *destination == '\n' ) {
*destination = '\0';
}
else {
*++destination = '\0';
}
finalresult = fixedreadline;
}
else {
finalresult = readline;
}
// std::cerr << "finalresult=" << finalresult << std::endl;
}
else {
break;
}
}
std::cerr << "finalresult=" << finalresult << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Optimized solution using iconv
This takes about 4.6 seconds to parse 319MB of text.
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
// Compile it with:
// g++ -o main test.cpp -O3 -liconv
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
int charsread;
size_t inchars;
size_t outchars;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
iconv_t conversiondescriptor = iconv_open("UTF-8//IGNORE", "UTF-8");
if( conversiondescriptor == (iconv_t)-1 ) {
perror( "iconv_open conversiondescriptor" );
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
source = readline;
inchars = charsread;
destination = fixedreadline;
outchars = charsread;
if( iconv( conversiondescriptor, &source, &inchars, &destination, &outchars ) )
{
perror( "iconv" );
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
free( readline );
free( fixedreadline );
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
if( iconv_close( conversiondescriptor ) ) {
perror( "iconv_close conversiondescriptor" );
}
return 0;
}
Slowest solution ever using mbtowc
This takes about 24.2 seconds to parse 319MB of text.
If you comment out the line fixedchar = mbtowc(NULL, source, charsread); and uncomment the line charsread -= fixedchar; (breaking the invalid characters removal) this will take 1.9 seconds instead of 24.2 seconds (also compiled with -O3 optimization level).
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; )
{
// fixedchar = 1;
fixedchar = mbtowc(NULL, source, charsread);
charsread -= fixedchar;
// std::ostringstream contents;
// for( int index = 0; index < fixedchar; ++index )
// contents << source[index];
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '"
// << contents.str().c_str() << "'" << std::endl;
if( fixedchar > 0 ) {
memmove( destination, source, fixedchar );
source += fixedchar;
destination += fixedchar;
}
else if( fixedchar < 0 ) {
source += 1;
// std::cerr << "errno=" << strerror( errno ) << std::endl;
}
else {
break;
}
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Fastest version from all my others above using memmove
You cannot use memcpy here because the memory regions overlap!
This takes about 2.4 seconds to parse 319MB.
If you comment out the lines *destination = *source and memmove( destination, source, 1 ) (breaking the invalid characters removal) the performance still almost the same as when memmove is being called. Here in, calling memmove( destination, source, 1 ) is a little slower than directly doing *destination = *source;
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Bonus
You can also use Python C Extensions (API).
It takes about 2.3 seconds to parse 319MB without converting them to cached version UTF-8 char*
And takes about 3.2 seconds to parse 319MB converting them to UTF-8 char*.
And also takes about 3.2 seconds to parse 319MB converting them to cached ASCII char*.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
typedef struct
{
PyObject_HEAD
}
PyFastFile;
static PyModuleDef fastfilepackagemodule =
{
// https://docs.python.org/3/c-api/module.html#c.PyModuleDef
PyModuleDef_HEAD_INIT,
"fastfilepackage", /* name of module */
"Example module that wrapped a C++ object", /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module, or
-1 if the module keeps state in global variables. */
NULL, /* PyMethodDef* m_methods */
NULL, /* inquiry m_reload */
NULL, /* traverseproc m_traverse */
NULL, /* inquiry m_clear */
NULL, /* freefunc m_free */
};
// initialize PyFastFile Object
static int PyFastFile_init(PyFastFile* self, PyObject* args, PyObject* kwargs) {
char* filepath;
if( !PyArg_ParseTuple( args, "s", &filepath ) ) {
return -1;
}
int linecount = 0;
PyObject* iomodule;
PyObject* openfile;
PyObject* fileiterator;
iomodule = PyImport_ImportModule( "builtins" );
if( iomodule == NULL ) {
std::cerr << "ERROR: FastFile failed to import the io module '"
"(and open the file " << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openfunction = PyObject_GetAttrString( iomodule, "open" );
if( openfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module open "
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
openfile = PyObject_CallFunction(
openfunction, "ssiss", filepath, "r", -1, "ASCII", "ignore" );
if( openfile == NULL ) {
std::cerr << "ERROR: FastFile failed to open the file'"
<< filepath << "'!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* iterfunction = PyObject_GetAttrString( openfile, "__iter__" );
Py_DECREF( openfunction );
if( iterfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator"
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openiteratorobject = PyObject_CallObject( iterfunction, NULL );
Py_DECREF( iterfunction );
if( openiteratorobject == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator object"
<< " (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
fileiterator = PyObject_GetAttrString( openfile, "__next__" );
Py_DECREF( openiteratorobject );
if( fileiterator == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator "
<< "object (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* readline;
while( ( readline = PyObject_CallObject( fileiterator, NULL ) ) != NULL ) {
linecount += 1;
PyUnicode_AsUTF8( readline );
Py_DECREF( readline );
// std::cerr << "linecount " << linecount << " readline '" << readline
// << "' '" << PyUnicode_AsUTF8( readline ) << "'" << std::endl;
}
std::cerr << "linecount " << linecount << std::endl;
// PyErr_PrintEx(100);
PyErr_Clear();
PyObject* closefunction = PyObject_GetAttrString( openfile, "close" );
if( closefunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the close file function for '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* closefileresult = PyObject_CallObject( closefunction, NULL );
Py_DECREF( closefunction );
if( closefileresult == NULL ) {
std::cerr << "ERROR: FastFile failed close open file '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
Py_DECREF( closefileresult );
Py_XDECREF( iomodule );
Py_XDECREF( openfile );
Py_XDECREF( fileiterator );
return 0;
}
// destruct the object
static void PyFastFile_dealloc(PyFastFile* self) {
Py_TYPE(self)->tp_free( (PyObject*) self );
}
static PyTypeObject PyFastFileType =
{
PyVarObject_HEAD_INIT( NULL, 0 )
"fastfilepackage.FastFile" /* tp_name */
};
// create the module
PyMODINIT_FUNC PyInit_fastfilepackage(void)
{
PyObject* thismodule;
// https://docs.python.org/3/c-api/typeobj.html
PyFastFileType.tp_new = PyType_GenericNew;
PyFastFileType.tp_basicsize = sizeof(PyFastFile);
PyFastFileType.tp_dealloc = (destructor) PyFastFile_dealloc;
PyFastFileType.tp_flags = Py_TPFLAGS_DEFAULT;
PyFastFileType.tp_doc = "FastFile objects";
PyFastFileType.tp_init = (initproc) PyFastFile_init;
if( PyType_Ready( &PyFastFileType) < 0 ) {
return NULL;
}
thismodule = PyModule_Create(&fastfilepackagemodule);
if( thismodule == NULL ) {
return NULL;
}
// Add FastFile class to thismodule allowing the use to create objects
Py_INCREF( &PyFastFileType );
PyModule_AddObject( thismodule, "FastFile", (PyObject*) &PyFastFileType );
return thismodule;
}
To built it, create the file source/fastfilewrappar.cpp with the contents of the above file and the setup.py with the following contents:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup, Extension
myextension = Extension(
language = "c++",
extra_link_args = ["-std=c++11"],
extra_compile_args = ["-std=c++11"],
name = 'fastfilepackage',
sources = [
'source/fastfilewrapper.cpp'
],
include_dirs = [ 'source' ],
)
setup(
name = 'fastfilepackage',
ext_modules= [ myextension ],
)
To run example, use following Python script:
import time
import datetime
import fastfilepackage
testfile = './test.txt'
timenow = time.time()
iterable = fastfilepackage.FastFile( testfile )
fastfile_time = time.time() - timenow
timedifference = datetime.timedelta( seconds=fastfile_time )
print( 'FastFile timedifference', timedifference, flush=True )
Example:
user#user-pc$ /usr/bin/pip3.6 install .
Processing /fastfilepackage
Building wheels for collected packages: fastfilepackage
Building wheel for fastfilepackage (setup.py) ... done
Stored in directory: /pip-ephem-wheel-cache-j313cpzc/wheels/e5/5f/bc/52c820
Successfully built fastfilepackage
Installing collected packages: fastfilepackage
Found existing installation: fastfilepackage 0.0.0
Uninstalling fastfilepackage-0.0.0:
Successfully uninstalled fastfilepackage-0.0.0
Successfully installed fastfilepackage-0.0.0
user#user-pc$ /usr/bin/python3.6 fastfileperformance.py
linecount 820800
FastFile timedifference 0:00:03.204614
Using std::getline
This takes about 4.7 seconds to parse 319MB.
If you remove the UTF-8 removal algorithm borrowed from the fastest benchmark using stdlib.h getline(), it takes 1.7 seconds to run.
#include <stdlib.h>
#include <iostream>
#include <locale>
#include <fstream>
#include <iomanip>
int main(int argc, char const *argv[])
{
unsigned int fixedchar;
int linecount = -1;
char* source;
char* lineend;
char* destination;
if( ( source = setlocale( LC_ALL, "en_US.ascii" ) ) == NULL ) {
perror( "setlocale" );
return -1;
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
std::ifstream fileifstream{ "./test.txt" };
if( fileifstream.fail() ) {
std::cerr << "ERROR: FastFile failed to open the file!" << std::endl;
return -1;
}
size_t linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
while( true )
{
if( !fileifstream.eof() )
{
linecount += 1;
fileifstream.getline( readline, linebuffersize );
lineend = readline + fileifstream.gcount();
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "linecount='" << linecount << "'" << std::endl;
if( fileifstream.is_open() ) {
fileifstream.close();
}
free( readline );
return 0;
}
Resume
2.6 seconds trimming UTF-8 using two buffers with indexing
3.1 seconds trimming UTF-8 using two buffers with memcpy
4.6 seconds removing invalid UTF-8 with iconv
24.2 seconds removing invalid UTF-8 with mbtowc
2.4 seconds trimming UTF-8 using one buffer with pointer direct assigning
Bonus
2.3 seconds removing invalid UTF-8 without converting them to a cached UTF-8 char*
3.2 seconds removing invalid UTF-8 converting them to a cached UTF-8 char*
3.2 seconds trimming UTF-8 and caching as ASCII char*
4.7 seconds trimming UTF-8 with std::getline() using one buffer with pointer direct assigning
The used file ./text.txt had 820.800 lines where each line was equal to:
id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char\r\n
And all versions where compiled with
g++ (GCC) 7.4.0
iconv (GNU libiconv 1.14)
g++ -o main test.cpp -O3 -liconv && time ./main

Segmentation Fault C++, Linked List

I am going through and comparing a bunch of DNA sequences to find if it is a subset of another. I remove those that are subsets of another.
I'm using a linked list and I keep getting a segmentation fault somewhere around the output of the data back to the output file.
I'd also greatly appreciate feedback on overall code structure. I know its rather messy so I figured someone could point out some things that should be improved on.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
using namespace std;
/*
* Step 1. Load all sequences and their metadata into structures.
*
* Step 2. Start n^2 operation to compare sequences.
*
* Step 3. Output file back to a different fasta file.
*/
typedef struct sequence_structure sequence_structure;
struct sequence_structure
{
char *sequence;
char *id;
char *header;
sequence_structure *next_sequence_structure;
sequence_structure *previous_sequence_structure;
int length;
};
int main(int argc, char *argv[])
{
FILE *input_file;
ofstream output_file;
/* this is the TAIL of the linked list. This is a reversed linked list. */
sequence_structure *sequences;
int first_sequence = 0;
char *line = (char*) malloc( sizeof( char ) * 1024 );
if( argc != 3 )
{
printf("This program requires a input file and output file as its argument!\n");
return 0;
}
else
{
/* let's read the input file. */
input_file = fopen( argv[1], "r" );
}
while( !feof(input_file) )
{
string string_line;
fgets( line, 2048, input_file );
string_line = line;
if( string_line.length() <= 2 )
break;
if( string_line.at( 0 ) == '>' )
{
sequence_structure *new_sequence = (sequence_structure *) malloc( sizeof( sequence_structure ) );
new_sequence->id = (char *) malloc( sizeof( char ) * ( 14 + 1 ) );
string_line.copy( new_sequence->id, 14, 1 );
(new_sequence->id)[14] = '\0';
stringstream ss ( string_line.substr( 23, 4 ) );
ss >> new_sequence->length;
new_sequence->header = (char *) malloc( sizeof(char) * ( string_line.length() + 1 ) );
string_line.copy( new_sequence->header, string_line.length(), 0 );
(new_sequence->header)[string_line.length()] = '\0';
fgets( line, 2048, input_file );
string_line = line;
new_sequence->sequence = (char *) malloc( sizeof(char) * ( string_line.length() + 1 ) );
string_line.copy( new_sequence->sequence, string_line.length(), 0 );
(new_sequence->sequence)[string_line.length()] = '\0';
if( first_sequence == 0 )
{
sequences = new_sequence;
sequences->previous_sequence_structure = NULL;
first_sequence = 1;
}
else
{
sequences->next_sequence_structure = new_sequence;
new_sequence->previous_sequence_structure = sequences;
sequences = new_sequence;
}
}
else
{
cout << "Error: input file reading error." << endl;
}
}
fclose( input_file );
free( line );
sequence_structure *outer_sequence_node = sequences;
while( outer_sequence_node != NULL )
{
sequence_structure *inner_sequence_node = sequences;
string outer_sequence ( outer_sequence_node->sequence );
while( inner_sequence_node != NULL )
{
string inner_sequence ( inner_sequence_node->sequence );
if( outer_sequence_node->length > inner_sequence_node->length )
{
if( outer_sequence.find( inner_sequence ) != std::string::npos )
{
cout << "Deleting the sequence with id: " << inner_sequence_node->id << endl;
cout << inner_sequence_node->sequence << endl;
cout << "Found within the sequence with id: " << outer_sequence_node->id << endl;
cout << outer_sequence_node->sequence << endl;
sequence_structure *previous_sequence = inner_sequence_node->previous_sequence_structure;
sequence_structure *next_sequence = inner_sequence_node->next_sequence_structure;
free( inner_sequence_node->id );
free( inner_sequence_node->sequence );
free( inner_sequence_node->header );
if( next_sequence != NULL )
next_sequence->previous_sequence_structure = previous_sequence;
if( previous_sequence != NULL )
{
inner_sequence_node = previous_sequence;
free( previous_sequence->next_sequence_structure );
previous_sequence->next_sequence_structure = next_sequence;
}
}
}
inner_sequence_node = inner_sequence_node->previous_sequence_structure;
}
outer_sequence_node = outer_sequence_node->previous_sequence_structure;
}
output_file.open( argv[2], ios::out );
while( sequences->previous_sequence_structure != NULL )
{
sequences = sequences->previous_sequence_structure;
}
sequence_structure *current_sequence = sequences;
while( current_sequence->next_sequence_structure != NULL )
{
output_file << current_sequence->header;
output_file << current_sequence->sequence;
current_sequence = current_sequence->next_sequence_structure;
}
output_file << current_sequence->header;
output_file << current_sequence->sequence;
output_file.close();
while( sequences != NULL )
{
cout << "Freeing sequence with this id: " << sequences->id << endl;
free( sequences->id );
free( sequences->header );
free( sequences->sequence );
if( sequences->next_sequence_structure != NULL )
{
sequences = sequences->next_sequence_structure;
free( sequences->previous_sequence_structure );
}
else
{
sequences = NULL;
}
}
return 0;
}

C++ Logger Runtime Threshold

namespace Log {
#include <ctime>
#include <string>
#include <boost\scoped_ptr.hpp>
#ifndef LOG_PREPEND_TIMESTAMP_DEFAULT
#define LOG_PREPEND_TIMESTAMP_DEFAULT false
#endif
enum LogLevel_t { logFatal = 0, logError = 1, logWarning = 2, logVerbose = 3, logDebug = 4 };
std::string LogLevelToString( const LogLevel_t level ) {
static const char* const buffer[] = { "Fatal", "Error", "Warning", "Verbose", "Debug" };
return buffer[level];
}
class Log: public boost::noncopyable {
protected:
boost::scoped_ptr< std::ostringstream > Output_String;
bool m_Prepending_Timestamp;
size_t m_LinesOutputted;
public:
Log():
Output_String( new std::ostringstream ),
m_Prepending_Timestamp( LOG_PREPEND_TIMESTAMP_DEFAULT ),
m_LinesOutputted( 0 ) {}
void UsingTimestamp( bool Prepending_Timestamp = true ) {
m_Prepending_Timestamp = Prepending_Timestamp;
}
std::ostringstream& Get( const LogLevel_t level ) {
// Write line number
++m_LinesOutputted;
*Output_String << m_LinesOutputted << " | ";
if( m_Prepending_Timestamp == true ) {
//prepare a timestamp
time_t now = time( NULL );
std::string formatted_time( asctime( localtime( &now ) ) ); /* &now can be replaced with the above call when r-value references work (maybe) saving a stack var */
formatted_time.erase(( formatted_time.length( ) - 1 ), 1 ); /* Removes the \n(newline) that asctime adds for better formatting */
// Write timestamp to stream
*Output_String << formatted_time << " || ";
}
// Write Logging level(severity) to stream
*Output_String << LogLevelToString( level ) << " || ";
return *Output_String;
}
void Flush() {
*Output_String << std::endl;
fprintf( stdout, "%s", Output_String->str( ).c_str( ) );
fflush( stdout );
Output_String.reset( new std::ostringstream ); /* streams have lots of internal state clean streams are good */
}
~Log() {
*Output_String << std::endl;
fprintf( stdout, "%s", Output_String->str( ).c_str( ) );
fflush( stdout );
}
};
How would I implement a threshold without a macro?
If I wrap the body of get with a check I still have to return the stringstream.
I could change the return type to a pointer instead of a reference then return NULL but then every logger statement would have to have a null check for the returned stringstream

C++ - Detours WinSock Hooking

What I am trying to do is use the Detours library to hook into an applications WinSock2 send() and recv() functions (a packet logger).
While it does work for the send() function, it does not, however, work for the recv() function.
Here is my relevant code:
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <string>
#include <windows.h>
#include <detours.h>
#pragma comment( lib, "Ws2_32.lib" )
#pragma comment( lib, "detours.lib" )
#pragma comment( lib, "detoured.lib" )
#pragma comment( lib, "Mswsock.lib" )
std::ofstream Logger;
std::string NowToString() {
time_t rawtime;
tm *timeinfo = new tm();
char buffer[32];
time( &rawtime );
localtime_s( timeinfo, &rawtime );
strftime( buffer, 32, "%m/%d/%Y %I:%M:%S %p", timeinfo );
delete timeinfo;
return std::string( buffer );
}
std::string TimeToString() {
time_t rawtime;
tm *timeinfo = new tm();
char buffer[32];
time( &rawtime );
localtime_s( timeinfo, &rawtime );
strftime( buffer, 32, "%I:%M:%S %p", timeinfo );
delete timeinfo;
return std::string( buffer );
}
void LogPacket( const char *buf, int len ) {
Logger << " 0 1 2 3 4 5 6 7 8 9 A B C D E F\n";
Logger << " -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --\n";
Logger << "0000 ";
for ( int i = 0; i < len; ++i ) {
if ( i != 0 && i % 16 == 0 ) {
Logger << " ";
int line = ( i / 16 ) - 1;
for ( int j = 0; j < 16; ++j ) {
char c = buf[line * 16 + j];
if ( c >= 32 && c <= 126 ) {
Logger << c;
} else {
Logger << '.';
}
}
Logger << "\n" << std::hex << std::setw( 4 ) << std::setfill( '0' ) << i << std::dec << std::setw( 0 ) << " ";
} else if ( i % 16 == 8 ) {
Logger << ' ';
}
Logger << std::hex << std::setw( 2 ) << std::setfill( '0' ) << ( int( buf[i] ) & 0xFF ) << ' ';
Logger << std::dec << std::setw( 0 );
if ( i == len - 1 ) {
int remaining = 16 - ( len % 16 );
int fill = ( remaining * 3 ) + 2;
if ( remaining >= 8 ) {
++fill;
}
for ( int j = 0; j < fill; ++j ) {
Logger << ' ';
}
int line = ( i - ( ( len % 16 ) - 1 ) ) / 16 ;
for ( int k = 0; k < ( len % 16 ); ++k ) {
char c = buf[line * 16 + k];
if ( c >= 32 && c <= 126 ) {
Logger << c;
} else {
Logger << '.';
}
}
}
}
Logger << "\n\n";
}
int ( WINAPI *Real_Send )( SOCKET s, const char *buf, int len, int flags ) = send;
int ( WINAPI *Real_Recv )( SOCKET s, char *buf, int len, int flags ) = recv;
int ( WINAPI *Real_RecvFrom )( SOCKET s, char *buf, int len, int flags, sockaddr *from, int *fromlen ) = recvfrom;
int ( WINAPI *Real_WSARecvEx )( SOCKET s, char *buf, int len, int *flags ) = WSARecvEx;
int WINAPI Mine_Send( SOCKET s, const char* buf, int len, int flags );
int WINAPI Mine_Recv( SOCKET s, char *buf, int len, int flags );
int WINAPI Mine_RecvFrom( SOCKET s, char *buf, int len, int flags, sockaddr *from, int *fromlen );
int WINAPI Mine_WSARecvEx( SOCKET s, char *buf, int len, int *flags );
int WINAPI Mine_Send( SOCKET s, const char *buf, int len, int flags ) {
Logger << TimeToString() << ": Client -> Server (Length: " << len << " bytes)\n\n";
LogPacket( buf, len );
Logger << std::endl;
return Real_Send( s, buf, len, flags );
}
int WINAPI Mine_Recv( SOCKET s, char *buf, int len, int flags ) {
Logger << TimeToString() << ": Server -> Client (Length: " << len << " bytes)\n\n";
LogPacket( buf, len );
Logger << std::endl;
return Real_Recv( s, buf, len, flags );
}
int WINAPI Mine_RecvFrom( SOCKET s, char *buf, int len, int flags, sockaddr *from, int *fromlen ) {
Logger << TimeToString() << ": Server -> Client (Length: " << len << " bytes)*\n\n";
LogPacket( buf, len );
Logger << std::endl;
return Real_RecvFrom( s, buf, len, flags, from, fromlen );
}
int WINAPI Mine_WSARecvEx( SOCKET s, char *buf, int len, int *flags ) {
Logger << TimeToString() << ": Server -> Client (Length: " << len << " bytes)**\n\n";
LogPacket( buf, len );
Logger << std::endl;
return Real_WSARecvEx( s, buf, len, flags );
}
BOOL WINAPI DllMain( HINSTANCE, DWORD dwReason, LPVOID ) {
switch ( dwReason ) {
case DLL_PROCESS_ATTACH:
Logger.open( "C:\\Packets.txt", std::ios::out | std::ios::app | std::ios::ate );
if ( Logger.tellp() > 0 ) {
Logger << "\n\n\n";
}
Logger << "##\n## Logging Started (" << NowToString() << ")\n##\n\n\n";
DetourTransactionBegin();
DetourUpdateThread( GetCurrentThread() );
DetourAttach( &(PVOID &)Real_Send, Mine_Send );
DetourAttach( &(PVOID &)Real_Recv, Mine_Recv );
DetourAttach( &(PVOID &)Real_RecvFrom, Mine_RecvFrom );
DetourAttach( &(PVOID &)Real_WSARecvEx, Mine_WSARecvEx );
DetourTransactionCommit();
break;
case DLL_PROCESS_DETACH:
Logger << "##\n## Logging Stopped (" << NowToString() << ")\n##";
Logger.close();
DetourTransactionBegin();
DetourUpdateThread( GetCurrentThread() );
DetourDetach( &(PVOID &)Real_Send, Mine_Send );
DetourDetach( &(PVOID &)Real_Recv, Mine_Recv );
DetourDetach( &(PVOID &)Real_RecvFrom, Mine_RecvFrom );
DetourDetach( &(PVOID &)Real_WSARecvEx, Mine_WSARecvEx );
DetourTransactionCommit();
break;
}
return TRUE;
}
Any ideas?
EDIT: So I've hooked recvfrom(), and WSARecvEx() as well, and it still doesn't log the outgoing packets! I've updated my code with my exact code.
To use hooking effectively you need to make sure you really hook all of the relevant endpoints or atleast a common denominator that is ensured to be called eventually from all of them.
In the case of recv() I think its possible that the application actually calls WSARecv() instead.
You can use depends.exe to find out exactly what functions does the application/library import from Ws2_32.dll to know which ones you need to hook.
Well, a few months later I figured it out: I was hooking WinSock 2 functions when I should have been hooking WinSock 1.1's send()/recv()!