Efficient parsing of mmap file - c++

The following is the code for creating a memory-mapped file using Boost.
boost::iostreams::mapped_file_source file;
boost::iostreams::mapped_file_params param;
param.path = "\\..\\points.pts"; //! Filepath
file.open(param, fileSize);
if(file.is_open())
{
//! Access the buffer and populate the ren point buffer
const char* pData = file.data();
char* pData1 = const_cast<char*>(pData); //! this gives me all the data from Mmap file
std::vector<RenPoint> readPoints;
ParseData( pData1, readPoints);
}
The implementation of ParseData is as follows
void ParseData(char* pbuffer, std::vector<RenPoint>& readPoints)
{
if(!pbuffer)
throw std::logic_error("no Data in memory mapped file");
stringstream strBuffer;
strBuffer << pbuffer;
//! Get the max number of points in the pts file
std::string strMaxPts;
std::getline(strBuffer,strMaxPts,'\n');
auto nSize = strMaxPts.size();
unsigned nMaxNumPts = GetValue<unsigned>(strMaxPts);
readPoints.clear();
//! Offset buffer
pbuffer += nSize;
strBuffer << pbuffer;
std::string cur_line;
while(std::getline(strBuffer, cur_line,'\n'))
{
//! How do I read the data from mmap file directly and populate my renpoint structure
int yy = 0;
}
//! Working but very slow
/*while (std::getline(strBuffer,strMaxPts,'\n'))
{
std::vector<string> fragments;
istringstream iss(strMaxPts);
copy(istream_iterator<string>(iss),
istream_iterator<string>(),
back_inserter<vector<string>>(fragments));
//! Logic to populate the structure after getting data back from fragments
readPoints.push_back(pt);
}*/
}
I have, say, a minimum of 1 million points in my data structure and I want to optimize my parsing. Any ideas?

read in header information to get the number of points
reserve space in a std::vector for N*num_points (N=3 assuming only X,Y,Z, 6 with normals, 9 with normals and rgb)
load the remainder of the file into a string
boost::spirit::qi::phrase_parse into the vector.
The code below can parse a file with 40M points (> 1 GB) in about 14 s on my 2-year-old MacBook:
#include <boost/spirit/include/qi.hpp>
#include <fstream>
#include <iostream>
#include <vector>
template <typename Iter>
bool parse_into_vec(Iter p_it, Iter p_end, std::vector<float>& vf) {
using boost::spirit::qi::phrase_parse;
using boost::spirit::qi::float_;
using boost::spirit::qi::ascii::space;
bool ret = phrase_parse(p_it, p_end, *float_, space, vf);
return p_it != p_end ? false : ret;
}
int main(int argc, char **args) {
if(argc < 2) {
std::cerr << "need a file" << std::endl;
return -1;
}
std::ifstream in(args[1]);
size_t numPoints;
in >> numPoints;
std::istreambuf_iterator<char> eos;
std::istreambuf_iterator<char> it(in);
std::string strver(it, eos);
std::vector<float> vf;
vf.reserve(3 * numPoints);
if(!parse_into_vec(strver.begin(), strver.end(), vf)) {
std::cerr << "failed during parsing" << std::endl;
return -1;
}
return 0;
}

AFAICT, you're currently copying the entire contents of the file into strBuffer.
What I think you want to do is use boost::iostreams::stream with your mapped_file_source instead.
Here's an untested example, based on the linked documentation:
// Create the stream
boost::iostreams::stream<boost::iostreams::mapped_file_source> str("some/path/file");
// Alternately, you can create the mapped_file_source separately and tell the stream to open it (using a copy of your mapped_file_source)
boost::iostreams::stream<boost::iostreams::mapped_file_source> str2;
str2.open(file);
// Now you can use std::getline as you normally would.
std::getline(str, strMaxPts);
As an aside, I'll note that by default mapped_file_source maps the entire file, so there's no need to pass the size explicitly.
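Putting those pieces together, here is a rough, untested sketch of how the original loop could look on top of the mapped stream. RenPoint and GetValue<> are the types/helpers from the question; the path and the per-line parsing are placeholders:
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <string>
#include <vector>

void ReadPoints(std::vector<RenPoint>& readPoints)
{
    // Map the whole file and wrap it in a std::istream-compatible interface
    boost::iostreams::stream<boost::iostreams::mapped_file_source> str("points.pts");

    // The first line holds the point count
    std::string strMaxPts;
    std::getline(str, strMaxPts);
    const unsigned nMaxNumPts = GetValue<unsigned>(strMaxPts);

    readPoints.clear();
    readPoints.reserve(nMaxNumPts);

    // Read the remaining lines directly from the mapped stream
    std::string cur_line;
    while (std::getline(str, cur_line))
    {
        // parse cur_line (X, Y, Z, ...) into a RenPoint and push_back here
    }
}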

You can go with something like this (just a fast concept, you'll need to add some additional error checking etc.):
#include "boost/iostreams/stream.hpp"
#include "boost/iostreams/device/mapped_file.hpp"
#include "boost/filesystem.hpp"
#include "boost/lexical_cast.hpp"
double parse_double(const std::string & str)
{
double value = 0;
bool decimal = false;
double divisor = 1.0;
for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
{
switch (*it)
{
case '.':
case ',':
decimal = true;
break;
default:
{
const int x = *it - '0';
value = value * 10 + x;
if (decimal)
divisor *= 10;
}
break;
}
}
return value / divisor;
}
void process_value(const bool initialized, const std::string & str, std::vector< double > & values)
{
if (!initialized)
{
// convert the value count and prepare the output vector
const size_t count = boost::lexical_cast< size_t >(str);
values.reserve(count);
}
else
{
// convert the value
//const double value = 0; // ~ 0:20 min
const double value = parse_double(str); // ~ 0:35 min
//const double value = atof(str.c_str()); // ~ 1:20 min
//const double value = boost::lexical_cast< double >(str); // ~ 8:00 min ?!?!?
values.push_back(value);
}
}
bool load_file(const std::string & name, std::vector< double > & values)
{
const int granularity = boost::iostreams::mapped_file_source::alignment();
const boost::uintmax_t chunk_size = ( (256 /* MB */ << 20 ) / granularity ) * granularity;
boost::iostreams::mapped_file_params in_params(name);
in_params.offset = 0;
boost::uintmax_t left = boost::filesystem::file_size(name);
std::string value;
bool whitespace = true;
bool initialized = false;
while (left > 0)
{
in_params.length = static_cast< size_t >(std::min(chunk_size, left));
boost::iostreams::mapped_file_source in(in_params);
if (!in.is_open())
return false;
const boost::iostreams::mapped_file_source::size_type size = in.size();
const char * data = in.data();
for (boost::iostreams::mapped_file_source::size_type i = 0; i < size; ++i, ++data)
{
const char c = *data;
if (strchr(" \t\n\r", c))
{
// c is whitespace
if (!whitespace)
{
whitespace = true;
// finished previous value
process_value(initialized, value, values);
initialized = true;
// start a new value
value.clear();
}
}
else
{
// c is not whitespace
whitespace = false;
// append the char to the value
value += c;
}
}
if (size < chunk_size)
break;
in_params.offset += chunk_size;
left -= chunk_size;
}
if (!whitespace)
{
// convert the last value
process_value(initialized, value, values);
}
return true;
}
Note that your main problem will be the conversion from string to float, which is very slow (insanely slow in the case of boost::lexical_cast). With the custom parse_double function it is faster, but it only handles one specific format (e.g. you'll need to add sign detection if negative values are allowed, etc. - or you can just go with atof if all possible formats are needed).
If you want to parse the file even faster, you'll probably need to go for multithreading - for example one thread only splitting out the string values and one or more other threads converting the loaded string values to floats. In that case you probably won't even need the memory-mapped file, as a regular buffered file read might suffice (the file will be read only once anyway).
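A minimal sketch of one way to parallelise this, assuming the whole file has already been read (or mapped) into a std::string. It is a variation of the idea above: instead of a tokenising thread feeding converter threads, each thread tokenises and converts its own chunk, with chunk boundaries moved forward to the next whitespace so no value straddles two chunks. Names are illustrative and error handling is omitted:
#include <cctype>
#include <cstdlib>
#include <functional>
#include <string>
#include <thread>
#include <vector>

static void parse_chunk(const char* begin, const char* end, std::vector<double>& out)
{
    const char* p = begin;
    for (;;)
    {
        while (p < end && std::isspace((unsigned char)*p))
            ++p;                        // skip separators, but never run past `end`
        if (p >= end)
            break;
        char* next = 0;
        const double value = std::strtod(p, &next);
        if (next == p)
            break;                      // not a number; real code should report this
        out.push_back(value);
        p = next;
    }
}

std::vector<double> parse_parallel(const std::string& data, unsigned num_threads)
{
    std::vector<std::vector<double> > partial(num_threads);
    std::vector<std::thread> threads;
    const std::size_t approx = data.size() / num_threads;
    std::size_t begin = 0;
    for (unsigned t = 0; t < num_threads; ++t)
    {
        std::size_t end = (t + 1 == num_threads) ? data.size() : begin + approx;
        while (end < data.size() && !std::isspace((unsigned char)data[end]))
            ++end;                      // don't split a value in half
        threads.push_back(std::thread(parse_chunk, data.data() + begin,
                                      data.data() + end, std::ref(partial[t])));
        begin = end;
    }
    std::vector<double> values;
    for (unsigned t = 0; t < num_threads; ++t)
    {
        threads[t].join();
        values.insert(values.end(), partial[t].begin(), partial[t].end());
    }
    return values;
}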

A few quick comments on your code:
1) You're not reserving space for your vector, so it reallocates every time you add a value. You have already read the number of points from the file, so call reserve(N) after the clear().
2) You're forcing a map of the entire file in one hit, which will work on 64-bit but is probably slow, AND strBuffer << pbuffer; forces another allocation of the same amount of memory.
http://www.boost.org/doc/libs/1_53_0/doc/html/interprocess/sharedmemorybetweenprocesses.html#interprocess.sharedmemorybetweenprocesses.mapped_file.mapped_file_mapping_regions shows how to getRegion
Use a loop through getRegion to load an estimated chunk of data containing many lines. You are going to have to handle partial buffers - each getRegion will likely end with part of a line that you need to preserve and join to the start of the next region's buffer (a sketch of this carry-over follows below).
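A rough, untested sketch of that chunked approach, reusing boost::iostreams::mapped_file_source with an explicit offset/length (as in the answer further above) rather than the interprocess API. RenPoint is the structure from the question and ParseLine is a hypothetical helper that parses one complete text line into it; the 64 MB chunk size is arbitrary:
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/filesystem.hpp>
#include <algorithm>
#include <string>
#include <vector>

void LoadChunked(const std::string& path, std::vector<RenPoint>& points)
{
    const boost::uintmax_t file_size = boost::filesystem::file_size(path);
    const int granularity = boost::iostreams::mapped_file_source::alignment();
    const boost::uintmax_t chunk = ((64 << 20) / granularity) * granularity; // offsets stay aligned

    std::string carry;                       // partial line left over from the previous chunk
    for (boost::uintmax_t offset = 0; offset < file_size; offset += chunk)
    {
        boost::iostreams::mapped_file_params params(path);
        params.offset = offset;
        params.length = static_cast<std::size_t>(std::min(chunk, file_size - offset));
        boost::iostreams::mapped_file_source in(params);

        const char* begin = in.data();
        const char* end = begin + in.size();
        while (begin < end)
        {
            const char* eol = std::find(begin, end, '\n');
            if (eol == end) {                // no newline left: keep the fragment for the next chunk
                carry.append(begin, end);
                break;
            }
            carry.append(begin, eol);
            ParseLine(carry, points);        // hypothetical: parse one complete line
            carry.clear();
            begin = eol + 1;
        }
    }
    if (!carry.empty())
        ParseLine(carry, points);            // last line if the file doesn't end with '\n'
}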

Related

C++, Qt - splitting a QByteArray as fast as possible

I'm trying to split a massive QByteArray which contains UTF-8 encoded plain text (using whitespace as delimiter) with the best performance possible. I found that I can achieve much better results if I convert the array to a QString first. I tried using the QString::split function with a regexp, but the performance was horrendous. This code turned out to be way faster:
QMutex mutex;
QSet<QString> split(QByteArray body)
{
QSet<QString> slova;
QString s_body = QTextCodec::codecForMib(106)->toUnicode(body);
QString current;
for(int i = 0; i< body.size(); i++){
if(s_body[i] == '\r' || s_body[i] == '\n' || s_body[i] == '\t' || s_body[i] == ' '){
mutex.lock();
slova.insert(current);
mutex.unlock();
current.clear();
current.reserve(40);
} else {
current.push_back(s_body[i]);
}
}
return slova;
}
"Slova" is a QSet<QString> currently, but I could use a std::set or any other format. This code is supposed to find how many unique words there are in the array, with the best performance possible.
Unfortunately, this code runs far from fast enough. I'm looking to squeeze the absolute maximum out of this.
Using callgrind, I found that the most gluttonous internal functions were:
QString::reallocData (18% absolute cost)
QString::append (10% absolute cost)
QString::operator= (8% absolute cost)
QTextCodec::toUnicode (8% absolute cost)
Obviously, this has to do with memory allocation stemming from the push_back function. What is the optimal way to solve this? It doesn't necessarily have to be a Qt solution - pure C or C++ is also acceptable.
Minimise the amount of copying you need to do. Keep the input buffer in UTF-8, and don't store std::string or QString in your set; instead, create a small class to reference the existing UTF-8 data:
#include <QString>
class stringref {
const char *start;
size_t length;
public:
stringref(const char *start, const char *end);
operator QString() const;
bool operator<(const stringref& other) const;
};
This can encapsulate a substring of the UTF-8 input. You'll need to ensure that it doesn't outlive the input string; you could do this by clever use of std::shared_ptr, but if the code is reasonably self-contained, then it should be tractable enough to reason about the lifetime.
We can construct it from a pair of pointers into our UTF-8 data, and convert it to QString when we want to actually use it:
stringref::stringref(const char *start, const char *end)
: start(start), length(end-start)
{}
stringref::operator QString() const
{
return QString::fromUtf8(start, length);
}
You need to define operator< so you can use it in a std::set.
#include <cstring>
bool stringref::operator<(const stringref& other) const
{
return length == other.length
? std::strncmp(start, other.start, length) < 0
: length < other.length;
}
Note that we sort by length before dereferencing pointers, to reduce cache impact.
Now we can write the split method:
#include <set>
#include <QByteArray>
std::set<stringref> split(const QByteArray& a)
{
std::set<stringref> words;
// start and end
const auto s = a.data(), e = s + a.length();
// current word
auto w = s;
for (auto p = s; p <= e; ++p) {
switch (*p) {
default: break;
case ' ': case '\r': case '\n': case '\t': case '\0':
if (w != p)
words.insert({w, p});
w = p+1;
}
}
return words;
}
The algorithm is pretty much yours, with the addition of the w!=p test so that runs of whitespace don't get counted.
Let's test it, and time the important bit:
#include <QDebug>
#include <chrono>
int main()
{
QByteArray body{"foo bar baz\n foo again\nbar again "};
// make it a million times longer
for (int i = 0; i < 20; ++i)
body.append(body);
using namespace std::chrono;
const auto start = high_resolution_clock::now();
auto words = split(body);
const auto end = high_resolution_clock::now();
qDebug() << "Split"
<< body.length()
<< "bytes in"
<< duration_cast<duration<double>>(end - start).count()
<< "seconds";
for (auto&& word: words)
qDebug() << word;
}
I get:
Split 35651584 bytes in 1.99142 seconds
"bar"
"baz"
"foo"
"again"
Compiling with -O3 reduced that time to 0.6188 seconds, so don't forget to beg the compiler for help!
If that's still not fast enough, it's probably time to start looking at parallelising the task. You'll want to split the string into roughly equal lengths, but advance each split point to the next whitespace so that no word straddles two threads' worth of work. Each thread should create its own set of results, and the reduction step is then to merge the result sets. I won't provide a full solution for this, as that's another question in its own right.
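That said, here is a sketch of the shape it could take (untested), reusing the stringref class and split() from above: split points are chosen naively and then advanced to the next whitespace, each chunk is processed by its own task, and the per-task sets are merged at the end.
#include <cstring>
#include <future>
#include <set>
#include <vector>
#include <QByteArray>

std::set<stringref> split_parallel(const QByteArray& a, int nthreads = 4)
{
    const char* s = a.data();
    const char* e = s + a.length();

    // choose rough chunk boundaries, then advance each to the next whitespace
    std::vector<const char*> cuts{ s };
    for (int i = 1; i < nthreads; ++i) {
        const char* p = s + (a.length() / nthreads) * i;
        while (p < e && !std::strchr(" \r\n\t", *p)) ++p;
        cuts.push_back(p);
    }
    cuts.push_back(e);

    // one task per chunk, each building its own set (no locking needed)
    std::vector<std::future<std::set<stringref>>> futures;
    for (int i = 0; i < nthreads; ++i)
        futures.push_back(std::async(std::launch::async, [&, i] {
            QByteArray chunk = QByteArray::fromRawData(
                cuts[i], static_cast<int>(cuts[i + 1] - cuts[i]));
            return split(chunk);             // the split() defined above
        }));

    // reduction step: merge the per-task sets
    std::set<stringref> words;
    for (auto& f : futures) {
        auto part = f.get();
        words.insert(part.begin(), part.end());
    }
    return words;
}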
Your largest cost, as suspected, is in push_back causing frequent reallocations as you append one character at a time. Why not search ahead, then append all of the data at once using QString::mid():
slova.insert(s_body.mid(beginPos, i - beginPos));
Where beginPos holds the index of the start of the current substring. Instead of appending each character to current before it is inserted into slova, the copy happens all at once. After copying a substring, search ahead for the next valid (not a separator) character and set beginPos equal to that index.
In (rough) code:
QString s_body = ...
//beginPos tells us the index of the current substring we are working
//with. -1 means the previous character was a separator
int beginPos = -1;
for (...) {
//basically your if statement provided in the question as a function
if (isSeparator(s_body[i])) {
//ignore double white spaces, etc.
if (beginPos != -1) {
mutex.lock();
slova.insert(s_body.mid(beginPos, i - beginPos));
mutex.unlock();
beginPos = -1; //reset so the next separator doesn't insert again
}
} else if (beginPos == -1)
//if beginPos is not valid and we are not on a separator, we
//are at the start of a new substring.
beginPos = i;
}
This approach will drastically reduce your overhead in heap allocations and eliminate QString::push_back() calls.
One final note: QByteArray also provides a mid() function. You can skip the conversion to QString entirely and work directly with the byte array.
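A rough sketch of that variant (untested): stay in the QByteArray, use QByteArray::mid() to copy each word in one go, and only convert to QString at insertion time (assuming Qt 5's QString::fromUtf8(QByteArray) overload). The whitespace set matches the original question:
#include <QByteArray>
#include <QSet>
#include <QString>

QSet<QString> split(const QByteArray& body)
{
    QSet<QString> slova;
    int beginPos = -1;                          // -1 means "currently inside separators"
    for (int i = 0; i < body.size(); ++i) {
        const char c = body[i];
        if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
            if (beginPos != -1) {
                // copy the whole word at once instead of appending char by char
                slova.insert(QString::fromUtf8(body.mid(beginPos, i - beginPos)));
                beginPos = -1;
            }
        } else if (beginPos == -1) {
            beginPos = i;                       // start of a new word
        }
    }
    if (beginPos != -1)                         // trailing word without a final separator
        slova.insert(QString::fromUtf8(body.mid(beginPos)));
    return slova;
}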
The first thing I'd do if I were you is modify your code so it isn't locking and unlocking a QMutex for every word it inserts into the QSet -- that's pure overhead. Either lock the QMutex only once, at the beginning of the loop, and unlock it again after the loop terminates; or better yet, insert into a QSet that isn't accessible from any other thread, so that you don't need to lock any QMutexes at all.
With that out of the way, the second thing to do is eliminate as many heap allocations as possible. Ideally you'd execute the entire parse without ever allocating or freeing any dynamic memory at all; my implementation below does that (well, almost -- the unordered_set might do some internal allocations, but it probably won't). On my computer (a 2.7GHz Mac Mini) I measure a processing speed of around 11 million words per second, using the Gutenberg ASCII text of Moby Dick as my test input.
Note that due to the backward-compatible encoding that UTF-8 uses, this program will work equally well with either UTF-8 or ASCII input.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unordered_set>
// Loads in a text file from disk into an in-memory array
// Expected contents of the file are ASCII or UTF8 (doesn't matter which).
// Note that this function appends a space to the end of the returned array
// That way the parsing function doesn't have to include a special case
// since it is guaranteed that every word in the array ends with whitespace
static char * LoadFile(const char * fileName, unsigned long * retArraySizeBytes)
{
char * ret = NULL;
*retArraySizeBytes = 0;
FILE * fpIn = fopen(fileName, "r");
if (fpIn)
{
if (fseek(fpIn, 0L, SEEK_END) == 0)
{
const unsigned long fileSizeBytes = ftell(fpIn);
const unsigned long arraySizeBytes = *retArraySizeBytes = fileSizeBytes+1; // +1 because I'm going to append a space to the end
rewind(fpIn);
ret = new char[arraySizeBytes];
if (fread(ret, 1, fileSizeBytes, fpIn) == fileSizeBytes)
{
ret[fileSizeBytes] = ' '; // appending a space allows me to simplify the parsing step
}
else
{
perror("fread");
delete [] ret;
ret = NULL;
}
}
else perror("fseek");
fclose(fpIn);
}
return ret;
}
// Gotta provide our own equality-testing function otherwise unordered_set will just compare pointer values
struct CharPointersEqualityFunction : public std::binary_function<char *, char *,bool>
{
bool operator() (char * s1, char * s2) const {return strcmp(s1, s2) == 0;}
};
// Gotta provide our own hashing function otherwise unordered_set will just hash the pointer values
struct CharPointerHashFunction
{
int operator() (char * str) const
{
// djb2 by Dan Bernstein -- fast enough and simple enough
unsigned long hash = 5381;
int c; while((c = *str++) != 0) hash = ((hash << 5) + hash) + c;
return (int) hash;
}
};
typedef std::unordered_set<char *, CharPointerHashFunction, CharPointersEqualityFunction > CharPointerUnorderedSet;
int main(int argc, char ** argv)
{
if (argc < 2)
{
printf("Usage: ./split_words filename\n");
return 10;
}
unsigned long arraySizeBytes;
char * buf = LoadFile(argv[1], &arraySizeBytes);
if (buf == NULL)
{
printf("Unable to load input file [%s]\n", argv[1]);
return 10;
}
CharPointerUnorderedSet set;
set.reserve(100000); // trying to size (set) big enough that no reallocations will be necessary during the parse
struct timeval startTime;
gettimeofday(&startTime, NULL);
// The actual parsing of the text is done here
int wordCount = 0;
char * wordStart = buf;
char * wordEnd = buf;
char * bufEnd = &buf[arraySizeBytes];
while(wordEnd < bufEnd)
{
if (isspace(*wordEnd))
{
if (wordEnd > wordStart)
{
*wordEnd = '\0';
set.insert(wordStart);
wordCount++;
}
wordStart = wordEnd+1;
}
wordEnd++;
}
struct timeval endTime;
gettimeofday(&endTime, NULL);
unsigned long long startTimeMicros = (((unsigned long long)startTime.tv_sec)*1000000) + startTime.tv_usec;
unsigned long long endTimeMicros = (((unsigned long long) endTime.tv_sec)*1000000) + endTime.tv_usec;
double secondsElapsed = ((double)(endTimeMicros-startTimeMicros))/1000000.0;
printf("Parsed %i words (%zu unique words) in %f seconds, aka %.0f words/second\n", wordCount, set.size(), secondsElapsed, wordCount/secondsElapsed);
//for (const auto& elem: set) printf("word=[%s]\n", elem);
delete [] buf;
return 0;
}

C++ efficient parse

I am programming some automated test equipment (ATE) and I'm trying to extract the following values out of an example response from the ATE:
DCRE? 1,
DCRE P, 10.3, (pin1)
DCRE F, 200.1, (pin2)
DCRE P, 20.4, (pin3)
From each line, I only care about the pin and the measured result value. So for the case above, I want to store the following pieces of information in a map<std::string, double> results;
results["pin1"] = 50.3;
results["pin2"] = 30.8;
results["pin3"] = 70.3;
I made the following code to parse the response:
void parseResultData(map<Pin*, double> &pinnametoresult, string &datatoparse) {
char *p = strtok((char*) datatoparse.c_str(), " \n");
string lastread;
string current;
while (p) {
current = p;
if(current.find('(') != string::npos) {
string substring = lastread.substr(1);
const char* last = substring.c_str();
double value = strtod(last, NULL);
unsigned short number = atoi(current.substr(4, current.size()-2).c_str());
pinnametoresult[&pinlookupmap[number]] = value;
}
lastread = p;
p = strtok(NULL, " \n");
}
}
It works, but it's not very efficient. Is there a way to make the function more efficient for this specific case? I don't care about the DCRE or P/F value on each line. I thought about using the Boost regex library, but I'm not sure whether that would be more efficient.
In order to make this a bit more efficient, try to avoid copying. In particular, calls to substr, assignments etc. can cause havoc on the performance. If you look at your code, you will see that the content of datatoparse is repeatedly assigned to lastread and current, each time with one line less at the beginning. So, on average you copy half of the original string times the number of lines, making just that part an O(n^2) algorithm. This isn't relevant if you have three or four lines (not even on 100 lines!) but if you have a few more, performance degrades rapidly.
Try this approach instead:
string::size_type p0 = 0;
string::size_type p1 = input.find('\n', p0);
while (p1 != string::npos) {
// extract the line
string line = input.substr(p0, p1 - p0);
// move to the next line
p0 = p1 + 1;
p1 = input.find('\n', p0);
}
Notes:
Note that the algorithm still copies all input once, but each line only once, making it O(n).
Since you have a copy of the line, you can insert '\0' as artificial separator in order to give a substring to e.g. atoi() or strtod().
I'm not 100% sure of the order of parameters for string::find() and too lazy to look it up, but the idea is to start searching at a certain position. Look at the various overloads of find-like functions.
When handling a line, search for the indices of the parts you need and then extract and parse them (see the sketch below).
If you have line fragments (i.e. a partial line without a newline) at the end, you will have to modify the loop slightly. Create tests!
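Following up on those notes, a sketch of what handling one extracted line could look like (untested; it uses the map<std::string, double> from the question rather than the Pin* lookup, and the indices follow the sample "DCRE P, 10.3, (pin1)" format):
#include <cstdlib>
#include <map>
#include <string>

// Parse one line of the form "DCRE P, 10.3, (pin1)" and store the result.
// Lines without a '(' (such as "DCRE? 1,") are simply ignored.
void parseLine(const std::string& line, std::map<std::string, double>& results)
{
    const std::string::size_type paren = line.find('(');
    if (paren == std::string::npos)
        return;

    // the measured value sits between the first and second commas
    const std::string::size_type c1 = line.find(',');
    const std::string::size_type c2 = line.find(',', c1 + 1);
    if (c1 == std::string::npos || c2 == std::string::npos)
        return;
    const double value = std::strtod(line.c_str() + c1 + 1, NULL);

    // the pin name sits between the parentheses
    const std::string::size_type close = line.find(')', paren);
    if (close == std::string::npos)
        return;
    results[line.substr(paren + 1, close - paren - 1)] = value;
}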
This is what I did:
#include <cstdlib>
#include <string>
#include <vector>
#include <unordered_map>
#include <sstream>
#include <iostream>
using namespace std;
struct Pin {
string something;
Pin() {}
};
vector<Pin*> pins = { new Pin(), new Pin(), new Pin() };
typedef unordered_map<Pin*, double> CONT_T;
inline bool OfInterest(const string& line) {
return line.find("(") != string::npos;
}
void parseResultData(CONT_T& pinnametoresult, const string& datatoparse)
{
istringstream is(datatoparse);
string line;
while (getline(is, line)) {
if (OfInterest(line)) {
double d = 0.0;
unsigned int pinid;
size_t firstComma = line.find(",")+2; // skip space
size_t secondComma = line.find(",", firstComma);
istringstream is2(line.substr(firstComma, secondComma-firstComma));
is2 >> d;
size_t paren = line.find("(")+4; // skip pin
istringstream is3(line.substr(paren, (line.length()-paren)-1));
is3 >> pinid;
--pinid;
Pin* pin = pins[pinid];
pinnametoresult[pin] = d;
}
}
}
/*
*
*/
int main(int argc, char** argv) {
string datatoparse = "DCRE? 1, \n"
"DCRE P, 10.3, (pin1)\n"
"DCRE F, 200.1, (pin2)\n"
"DCRE P, 20.4, (pin3)\n";
CONT_T results;
parseResultData(results, datatoparse);
return 0;
}
Here's my final result. Does not involve any copying, but it will destroy the string.
void parseResultData3(map<std::string, double> &pinnametoresult, std::string &datatoparse) {
char* str = (char*) datatoparse.c_str();
int length = datatoparse.size();
double lastdouble = 0.0;
char* startmarker = NULL; //beginning of next pin to parse
for(int pos = 0; pos < length; pos++, str++) {
if(str[0] == '(') {
startmarker = str + 1;
//get previous value
bool triggered = false;
for(char* lookback = str - 1; ; lookback--) {
if(!triggered && (isdigit(lookback[0]) || lookback[0] == '.')) {
triggered = true;
*(lookback + 1) = '\0';
}
else if(triggered && (!isdigit(lookback[0]) && lookback[0] != '.')) {
lastdouble = strtod(lookback, NULL);
break;
}
}
}
else if(startmarker != NULL) {
if(str[0] == ')') {
str[0] = '\0';
pinnametoresult[startmarker] = lastdouble;
startmarker = NULL;
}
if(str[0] == ',') {
str[0] = '\0';
pinnametoresult[startmarker] = lastdouble;
startmarker = str + 1;
}
}
}
}

How to update struct item in binary files

I have a binary file that I write some struct items to. Now I want to find and update a specific item among the items in the file.
Note that my struct contains a vector, so its size is not constant.
my struct:
struct mapItem
{
string term;
vector<int> pl;
};
The code that writes struct items to the file:
if (it==hashTable.end())//didn't find
{
vector <int> posting;
posting.push_back(position);
hashTable.insert ( pair<string,vector <int> >(md,posting ) );
mapItem* mi = new mapItem();
mi->term = md;
mi->pl = posting;
outfile.write((char*)mi, sizeof(mi));
}
else //found
{
}
In the else block I want to find and update the item by its term (the term is unique).
Now I have changed my code like this to serialize my vector.
if (it==hashTable.end())//didn't find
{
vector <int> posting;
posting.push_back(position);
hashTable.insert ( pair<string,vector <int> >(md,posting ) );
mapItem* mi = new mapItem();
mi->term = md;
mi->pl = posting;
if(!outfile.is_open())
outfile.open("sample.dat", ios::binary | ios::app);
size_t size = mi->term.size() + 1;
outfile.write((char*)&size, sizeof(size) );
outfile.write((char*)mi->term.c_str(), size);
size = (int)mi->pl.size() * sizeof(int);
outfile.write((char*)&size, sizeof(size) );
outfile.write((char*)&mi->pl[0], size );
outfile.close();
}
else //found
{
(it->second).push_back(position);
mapItem* mi = new mapItem();
size_t size;
if(!infile.is_open())
{
infile.open("sample.dat", ios::binary | ios::in);
}
do{
infile.read((char*)&size, sizeof(size) ); // string size
mi->term.resize(size - 1); // make string the right size
infile.read((char*)mi->term.c_str(), size); // may need const_cast
infile.read((char*)&size, sizeof(size) ); // vector size
mi->pl.resize(size / sizeof(int));
infile.read((char*)&mi->pl[0], size );
}while(mi->term != md);
infile.close();
}
Well, my main question still remains: how can I update the data that I found?
Is there a better way to find them?
I evaluated the following solutions:
update in a new file, rename it to the old one in the end
update in the same file with a stream with two file positions, read & write, but I didn't rapidly find support for such a thing
update in the same file with two streams, read & write, but the risk of underlying overwrite is too big for me (even if protected outside against overlaps)
So I chose the first one, the most straightforward anyway.
#include <string>
#include <vector>
#include <fstream>
#include <iterator>
#include <cstdio>
#include <assert.h>
I added the following function to your struct:
size_t SizeWrittenToFile() const
{
return 2*sizeof(size_t)+term.length()+pl.size()*sizeof(int);
}
The read & write functions are basically the same as yours, except I chose not to write into the string::c_str() pointer (although that ugly solution should work on every known compiler).
bool ReadNext(std::istream& in, mapItem& item)
{
size_t size;
in.read(reinterpret_cast<char*>(&size), sizeof(size_t));
if (!in)
return false;
std::istreambuf_iterator<char> itIn(in);
std::string& out = item.term;
out.reserve(size);
out.clear(); // this is necessary if the string is not empty
for (std::insert_iterator<std::string> itOut(out, out.begin());
in && (out.length() < size); itIn++, itOut++)
*itOut = *itIn;
assert(in);
if (!in)
return false;
in.read(reinterpret_cast<char*>(&size), sizeof(size_t));
if (!in)
return false;
std::vector<int>& out2 = item.pl;
out2.resize(size); // unfortunately reserve doesn't work here
in.read(reinterpret_cast<char*>(&out2[0]), size * sizeof(int));
assert(in);
return true;
}
// a "header" should be added to mark complete data (to write "atomically")
bool WriteNext(std::ostream& out, const mapItem& item)
{
size_t size = item.term.length();
out.write(reinterpret_cast<const char*>(&size), sizeof(size_t));
if (!out)
return false;
out.write(item.term.c_str(), size);
if (!out)
return false;
size = item.pl.size();
out.write(reinterpret_cast<const char*>(&size), sizeof(size_t));
if (!out)
return false;
out.write(reinterpret_cast<const char*>(&item.pl[0]), size * sizeof(int));
if (!out)
return false;
return true;
}
The update functions look like this:
bool UpdateItem(std::ifstream& in, std::ofstream& out, const mapItem& item)
{
mapItem it;
bool result;
for (result = ReadNext(in, it); result && (it.term != item.term);
result = ReadNext(in, it))
if (!WriteNext(out, it))
return false;
if (!result)
return false;
// write the new item content
assert(it.term == item.term);
if (!WriteNext(out, item))
return false;
for (result = ReadNext(in, it); result; result = ReadNext(in, it))
if (!WriteNext(out, it))
return false;
// failure or just the end of the file?
return in.eof();
}
bool UpdateItem(const char* filename, const mapItem& item)
{
std::ifstream in(filename, std::ios::binary);
assert(in);
std::string filename2(filename);
filename2 += ".tmp";
std::ofstream out(filename2.c_str(), std::ios::binary);
assert(out);
bool result = UpdateItem(in, out, item);
// close them before delete
in.close();
out.close();
int err = 0;
if (result)
{
err = remove(filename);
assert(!err && "remov_140");
result = !err;
}
if (!result)
{
err = remove(filename2.c_str());
assert(!err && "remov_147");
}
else
{
err = rename(filename2.c_str(), filename);
assert(!err && "renam_151");
result = !err;
}
return result;
}
Questions?
This:
outfile.write((char*)mi, sizeof(mi));
Does not make sense. sizeof(mi) is only the size of a pointer, and even sizeof(*mi) would just write the raw bits of the std::string and std::vector implementations directly to disk. Some of those bits are extremely likely to be pointers. Pointers written to a file on disk are not useful, because they point to an address space belonging to the process which wrote the file, but won't work in another process reading the same file.
You need to "serialize" your data to the file, e.g. in a for loop writing each element.
You can serialize the struct to a file this way:
write the length of the string.
write the string data itself.
write the length of the vector (in bytes is easier to parse later).
write the vector data. &vec[0] is the address of the first element; you can write all elements in one shot since this buffer is contiguous.
Write:
size_t size = mi->term.size() + 1;
outfile.write((char*)&size, sizeof(size) );
outfile.write((char*)mi->term.c_str(), size);
size = (int)mi->pl.size() * sizeof(int);
outfile.write((char*)&size, sizeof(size) );
outfile.write((char*)&mi->pl[0], size );
Read:
infile.read((char*)&size, sizeof(size) ); // string size
mi->term.resize(size - 1); // make string the right size
infile.read((char*)mi->term.c_str(), size); // may need const_cast
infile.read((char*)&size, sizeof(size) ); // vector size
mi->pl.resize(size / sizeof(int));
infile.read((char*)&mi->pl[0], size );

String manipulation, at a complete loss

I am trying to grab sub-strings out of a larger string, and I have got it to work in a small program, but when I try to run it in the real program it just goes wrong. I am building off someone else's function and got it to work for my purpose, but cannot get it to work in the main program I need it in. I will trim the program down to where I think the error is occurring.
Problem: I pass the same value into the function findStats(std::string sString) but get different results.
Case I:
stats = findStats("^9dff9d[Attribute 0% Active Defense 0]\r^f2f3f2Mana: 1411 ^00ff00(+1975)\r^f2f3f2^9dff9d[Attribute 0% Active Mana 0]\r^f2f3f2^ffc000Fortify Level: 12/12\r^f2f3f2^006effIdentified Attribute: + 6% Crit Damage\rIdentified Attribute: + 6 Accuracy\r^f2f3f2^006eff^O053Sacrifice Elapse(6/8)\r^00ff00 ^O041Desollar's Shadow\rÌÌÌÌÌÌÌÌL«");
The above case outputs correctly and stores the \r offsets correctly.
Case II:
stats = findStats((std::string)((char*)&buffer));
Case II is the case I need to work. It has the same value as Case I above at the start of findStats, but the offsets for \r are not stored, for whatever reason, even though sString has the same value at the start of the function.
//Function that finds positioning of \r
void calc_z (std::string &s, std::vector<int> & z)
{
int len = s.size();
z.resize (len);
int l = 0, r = 0;
for (int i=1; i<len; ++i)
if (z[i-l]+i <= r)
z[i] = z[i-l];
else
{
l = i;
if (i > r) r = i;
for (z[i] = r-i; r<len; ++r, ++z[i])
if (s[r] != s[z[i]])
break;
--r;
}
}
std::vector<std::string> findStats(std::string sString){
//sString is exactly the same in value for both cases of stats at this point
int offSet = 0;
int sOffsets[100] = {};
std::vector<std::string> t1;
std::string main_string = sString;
std::string substring = "\r";
std::string working_string = substring + main_string;
std::vector<int> z;
calc_z(working_string, z);
for(int i = substring.size(); i < working_string.size(); ++i){
if(z[i] >=substring.size()){
sOffsets[offSet] = i;
offSet++;
}
}
.... code .... the problem occurs right above: the offsets are not stored for \r
}
void main()
{
std::vector<std::string> stats;
std::string buffer[10];
...code...
...code to find string and store in buffer...
stats = findStats((std::string)((char*)&buffer));
//stats = findStats("^9dff9d[Attribute 0% Active Defense 0]\r^f2f3f2Mana: 1411 ^00ff00(+1975)\r^f2f3f2^9dff9d[Attribute 0% Active Mana 0]\r^f2f3f2^ffc000Fortify Level: 12/12\r^f2f3f2^006effIdentified Attribute: + 6% Crit Damage\rIdentified Attribute: + 6 Accuracy\r^f2f3f2^006eff^O053Sacrifice Elapse(6/8)\r^00ff00 ^O041Desollar's Shadow\rÌÌÌÌÌÌÌÌL«");
for( std::vector<std::string>::const_iterator i = stats.begin(); i != stats.end(); ++i)std::cout << *i << ' ' << std::endl;
std::cin.get();
}
This statement: (std::string)((char*)&buffer) does not do what you think it does.
std::vector is not a simple array.
If you take address of std::vector, that won't be the address of the first element within std::vector.
You can't just cast a const char* or char* into std::string. You can, however, construct a new std::string from a provided const char* or char* C-style string: const char *str = "asdf"; std::string s = std::string(str);.
So, to summarize:
If you want to pass several strings at once in std::vector, pass the buffer by const reference
typedef std::vector<std::string> StringVector;
void test(const StringVector& v){
for (StringVector::const_iterator i = v.begin(); i != v.end(); i++)
std::cout << *i << std::endl;
}
...
StringVector strings;
test(strings);
If you want to WRITE something into std::vector, pass it by reference:
typedef std::vector<std::string> StringVector;
void test(StringVector& out){
out.push_back("test");
}
...
StringVector strings;
test(strings);
If you want to pass a single string from the vector, just pass the element itself (by reference, const reference, or by value, depending on what you want to do with it), without casts.
typedef std::vector<std::string> StringVector;
void test(const std::string& s){
std::cout << s << std::endl;
}
...
StringVector strings;
strings.push_back("test");
test(strings[0]);
--edit--
In addition to that:
std::vector<std::string> findStats(std::string sString){
//sString is exactly the same in value for both cases of stats at this point
int offSet = 0;
int sOffsets[100] = {};//<<here's a problem
Using an array with a fixed size is a bad idea in this case. Your array is small, and it WILL overflow on any input that produces more than 100 matches, breaking/crashing your program. You can simply store the results in a std::vector, make a vector of structs, or use a std::map, depending on your goals.
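For example, a minimal sketch of the vector variant - it slots straight into findStats from the question in place of the fixed array and the offSet counter (untested):
std::vector<int> sOffsets;                    // grows as needed, no 100-element limit
for (int i = substring.size(); i < (int)working_string.size(); ++i) {
    if (z[i] >= (int)substring.size())
        sOffsets.push_back(i);                // record every offset at which "\r" matches
}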

Converting from char string to an array of uint8_t?

I'm reading a string from a file so it's in the form of a char array. I need to tokenize the string and save each char array token as a uint8_t hex value in an array.
char* starting = "001122AABBCC";
// ...
uint8_t[] ending = {0x00,0x11,0x22,0xAA,0xBB,0xCC}
How can I convert from starting to ending? Thanks.
Here is a complete working program. It is based on Rob I's solution, but fixes several problems and has been tested to work.
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
const char* starting = "001122AABBCC";
int main()
{
std::string starting_str = starting;
std::vector<unsigned char> ending;
ending.reserve( starting_str.size());
for (int i = 0 ; i < starting_str.length() ; i+=2) {
std::string pair = starting_str.substr( i, 2 );
ending.push_back(::strtol( pair.c_str(), 0, 16 ));
}
for(int i=0; i<ending.size(); ++i) {
printf("0x%X\n", ending[i]);
}
}
strtoul will convert text in any base you choose into bytes. You have to do a little work to chop the input string into individual digits, or you can convert 32 or 64 bits at a time (see the sketch below).
PS: uint8_t[] ending = {0x00,0x11,0x22,0xAA,0xBB,0xCC}
doesn't mean anything; you aren't storing the data in a uint8_t as 'hex', you are storing bytes - it's up to you (or your debugger) how the binary data is interpreted.
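For illustration, a sketch of the "several digits at a time" idea (untested): consume up to 8 hex digits (32 bits) per strtoul call and then peel the bytes off most-significant first. It assumes an even number of valid hex digits and does no validation:
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

std::vector<uint8_t> hex_to_bytes(const char* hex)
{
    std::vector<uint8_t> out;
    const std::size_t len = std::strlen(hex);
    for (std::size_t i = 0; i < len; ) {
        const std::size_t take = (len - i < 8) ? (len - i) : 8;   // hex digits in this chunk
        const unsigned long v = std::strtoul(std::string(hex + i, take).c_str(), NULL, 16);
        const std::size_t nbytes = take / 2;
        for (std::size_t j = 0; j < nbytes; ++j)                  // most significant byte first
            out.push_back((uint8_t)((v >> ((nbytes - 1 - j) * 8)) & 0xFF));
        i += take;
    }
    return out;
}
// e.g. hex_to_bytes("001122AABBCC") yields {0x00, 0x11, 0x22, 0xAA, 0xBB, 0xCC}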
With C++11, you may use std::stoi for that:
std::vector<uint8_t> convert(const std::string& s)
{
if (s.size() % 2 != 0) {
throw std::runtime_error("Bad size argument");
}
std::vector<uint8_t> res;
res.reserve(s.size() / 2);
for (std::size_t i = 0, size = s.size(); i != size; i += 2) {
std::size_t pos = 0;
res.push_back(std::stoi(s.substr(i, 2), &pos, 16));
if (pos != 2) {
throw std::runtime_error("bad character in argument");
}
}
return res;
}
Live example.
I think any canonical answer (w.r.t. the bounty notes) would involve some distinct phases in the solution:
Error checking for valid input
Length check and
Data content check
Element conversion
Output creation
Given the usefulness of such conversions, the solution should probably include some flexibility w.r.t. the types being used and the locale required.
From the outset, given the date of the request for a "more canonical answer" (circa August 2014) liberal use of C++11 will be applied.
An annotated version of the code, with types corresponding to the OP:
std::vector<std::uint8_t> convert(std::string const& src)
{
// error check on the length
if ((src.length() % 2) != 0) {
throw std::invalid_argument("conversion error: input is not even length");
}
auto ishex = [] (decltype(*src.begin()) c) {
return std::isxdigit(c, std::locale()); };
// error check on the data contents
if (!std::all_of(std::begin(src), std::end(src), ishex)) {
throw std::invalid_argument("conversion error: input values are not not all xdigits");
}
// allocate the result, initialised to 0 and size it to the correct length
std::vector<std::uint8_t> result(src.length() / 2, 0);
// run the actual conversion
auto str = src.begin(); // track the location in the string
std::for_each(result.begin(), result.end(), [&str](decltype(*result.begin())& element) {
element = static_cast<std::uint8_t>(std::stoul(std::string(str, str + 2), nullptr, 16));
std::advance(str, 2); // next two elements
});
return result;
}
The template version of the code adds flexibility;
template <typename Int /*= std::uint8_t*/,
typename Char = char,
typename Traits = std::char_traits<Char>,
typename Allocate = std::allocator<Char>,
typename Locale = std::locale>
std::vector<Int> basic_convert(std::basic_string<Char, Traits, Allocate> const& src, Locale locale = Locale())
{
using string_type = std::basic_string<Char, Traits, Allocate>;
auto ishex = [&locale] (decltype(*src.begin()) c) {
return std::isxdigit(c, locale); };
if ((src.length() % 2) != 0) {
throw std::invalid_argument("conversion error: input is not even length");
}
if (!std::all_of(std::begin(src), std::end(src), ishex)) {
throw std::invalid_argument("conversion error: input values are not not all xdigits");
}
std::vector<Int> result(src.length() / 2, 0);
auto str = std::begin(src);
std::for_each(std::begin(result), std::end(result), [&str](decltype(*std::begin(result))& element) {
element = static_cast<Int>(std::stoul(string_type(str, str + 2), nullptr, 16));
std::advance(str, 2);
});
return result;
}
The convert() function can then be based on the basic_convert() as follows:
std::vector<std::uint8_t> convert(std::string const& src)
{
return basic_convert<std::uint8_t>(src, std::locale());
}
Live sample.
uint8_t is typically no more than a typedef of an unsigned char. If you're reading characters from a file, you should be able to read them into an unsigned char array just as easily as a signed char array, and an unsigned char array is a uint8_t array.
I'd try something like this:
std::string starting_str = starting;
uint8_t[] ending = new uint8_t[starting_str.length()/2];
for (int i = 0 ; i < starting_str.length() ; i+=2) {
std::string pair = starting_str.substr( i, i+2 );
ending[i/2] = ::strtol( pair.c_str(), 0, 16 );
}
Didn't test it but it looks good to me...
You may add your own conversion from set of char { '0','1',...'E','F' } to uint8_t:
uint8_t ctoa(char c)
{
if( c >= '0' && c <= '9' ) return c - '0';
else if( c >= 'a' && c <= 'f' ) return 0xA + c - 'a';
else if( c >= 'A' && c <= 'F' ) return 0xA + c - 'A';
else return 0;
}
Then it will be easy to convert a string into an array:
uint32_t endingSize = strlen(starting)/2;
uint8_t* ending = new uint8_t[endingSize];
for( uint32_t i=0; i<endingSize; i++ )
{
ending[i] = ( ctoa( starting[i*2] ) << 4 ) + ctoa( starting[i*2+1] );
}
This simple solution should work for your problem:
char* starting = "001122AABBCC";
uint8_t ending[6];
// This algo will work for any (even) size of starting.
// However, you have to make sure that ending has enough space.
int i = 0;
while (i < strlen(starting))
{
    // copy the next pair of characters into a small null-terminated string
    char str[3] = { starting[i], starting[i+1], '\0' };
    // convert the pair from base 16
    ending[i/2] = (uint8_t)strtol(str, NULL, 16);
    i += 2;
}
uint8_t* ending = reinterpret_cast<uint8_t*>(starting); // note: this only reinterprets the characters, it does not decode the hex digits