I have a CSV file formatted as below:
1,50,a,46,50,b
2, 20,s,56,30,f
3,35,b,5,67,s
...
How can I turn that to a 2D array so that I could do some calculations?
void Core::parseCSV(){
std::ifstream data("test.csv");
std::string line;
while(std::getline(data,line))
{
std::stringstream lineStream(line);
std::string cell;
while(std::getline(lineStream,cell,','))
{
//not sure how to create the 2d array here
}
}
};
Try this,
void Core::parseCSV()
{
std::ifstream data("test.csv");
std::string line;
std::vector<std::vector<std::string> > parsedCsv;
while(std::getline(data,line))
{
std::stringstream lineStream(line);
std::string cell;
std::vector<std::string> parsedRow;
while(std::getline(lineStream,cell,','))
{
parsedRow.push_back(cell);
}
parsedCsv.push_back(parsedRow);
}
};
Following code is working. hope it can help you. It has a CSV file istream_iterator-like class. It is a template so that it can read strings, ints, doubles, etc. Its constructors accept a char delimiter, so that it may be used for more than strictly comma-delimited files. It also has a specialization for strings so that white space characters could be retained.
#include <iostream>
#include <sstream>
#include <fstream>
#include <iterator>
using namespace std;
template <class T>
class csv_istream_iterator: public iterator<input_iterator_tag, T>
{
istream * _input;
char _delim;
string _value;
public:
csv_istream_iterator( char delim = ',' ): _input( 0 ), _delim( delim ) {}
csv_istream_iterator( istream & in, char delim = ',' ): _input( &in ), _delim( delim ) { ++*this; }
const T operator *() const {
istringstream ss( _value );
T value;
ss >> value;
return value;
}
istream & operator ++() {
if( !( getline( *_input, _value, _delim ) ) )
{
_input = 0;
}
return *_input;
}
bool operator !=( const csv_istream_iterator & rhs ) const {
return _input != rhs._input;
}
};
template <>
const string csv_istream_iterator<string>::operator *() const {
return _value;
}
int main( int argc, char * args[] )
{
{ // test for integers
ifstream fin( "data.csv" );
if( fin )
{
copy( csv_istream_iterator<int>( fin ),
csv_istream_iterator<int>(),
ostream_iterator<int>( cout, " " ) );
fin.close();
}
}
cout << endl << "----" << endl;
{ // test for strings
ifstream fin( "data.csv" );
if( fin )
{
copy( csv_istream_iterator<string>( fin ),
csv_istream_iterator<string>(),
ostream_iterator<string>( cout, "|" ) );
fin.close();
}
}
return 0;
}
I would go for something like this (untested, incomplete) and eventually refine operator >>, if you have strings instead of chars, or floats instead of ints.
struct data_t
{
int a ;
int b ;
char c ;
int d ;
int e ;
char f ;
} ;
std::istream &operator>>(std::istream &ist, data_t &data)
{
char comma ;
ist >> data.a >> comma
>> data.b >> comma
>> data.c >> comma
>> data.d >> comma
>> data.e >> comma
>> data.f
;
return ist ;
}
void Core::parseCSV(){
std::ifstream data("test.csv");
std::string line;
std::vector<data_t> datavect ;
while(std::getline(data,line))
{
data_t data ;
std::stringstream lineStream(line);
lineStream >> data ;
datavect.push_back(data) ;
}
};
Related
Let's say I have the following string that I want to tokenize as per the delimiter '>':
std::string veg = "orange>kiwi>apple>potato";
I want every item in the string to be placed in a structure that has the following format:
struct pack_item
{
std::string it1;
std::string it2;
std::string it3;
std::string it4;
};
I know how to do it this way:
pack_item pitem;
std::stringstream veg_ss(veg);
std::string veg_item;
std::getline(veg_ss, veg_item, '>')
pitem.it1 = veg_item;
std::getline(veg_ss, veg_item, '>')
pitem.it2 = veg_item;
std::getline(veg_ss, veg_item, '>')
pitem.it3 = veg_item;
std::getline(veg_ss, veg_item, '>')
pitem.it4 = veg_item;
Is there a better and one-liner kind of way to do it?
Something like this:
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
std::string veg = "orange>kiwi>apple>potato";
typedef std::vector<std::string> it_vec;
int main(int argc, char* argv[]) {
it_vec vec;
std::stringstream veg_ss(veg);
std::string veg_item;
while (std::getline(veg_ss, veg_item, '>')) {
vec.push_back(veg_item);
}
for (const std::string& vec_item : vec) {
std::cout << vec_item << std::endl;
}
}
You don't need an intermediate variable.
pack_item pitem;
std::stringstream veg_ss(veg);
std::getline(veg_ss, pitem.it1, '>');
std::getline(veg_ss, pitem.it2, '>');
std::getline(veg_ss, pitem.it3, '>');
std::getline(veg_ss, pitem.it4, '>');
You might want to make that a function, e.g. operator >> (with a similar operator <<)
std::istream& operator >>(std::istream& is, pack_item & pitem) {
std::getline(is, pitem.it1, '>');
std::getline(is, pitem.it2, '>');
std::getline(is, pitem.it3, '>');
std::getline(is, pitem.it4, '>');
return is;
}
std::ostream& operator <<(std::ostream& os, pack_item & pitem) {
return os << pitem.it1 << '>'
<< pitem.it2 << '>'
<< pitem.it3 << '>'
<< pitem.it4 << '>';
}
int main() {
std::stringstream veg_ss("orange>kiwi>apple>potato>");
pack_item pitem;
veg_ss >> pitem;
}
Is there a better and one-liner kind of way to do it?
You can make a type who's >> reads in a string up to a delimiter, and read all four elements in one statement. Is that really "better"?
template <bool is_const>
struct delimited_string;
template<>
struct delimited_string<true> {
const std::string & string;
char delim;
};
template<>
struct delimited_string<false> {
std::string & string;
char delim;
};
delimited_string(const std::string &, char) -> delimited_string<true>;
delimited_string(std::string &, char) -> delimited_string<false>;
std::istream& operator >>(std::istream& is, delimited_string<false> s) {
return std::getline(is, s.string, s.delim);
}
template <bool is_const>
std::ostream& operator <<(std::ostream& os, delimited_string<is_const> s) {
return os << s.string << s.delim;
}
std::istream& operator >>(std::istream& is, pack_item & pitem) {
return is >> delimited_string { pitem.it1, '>' }
>> delimited_string { pitem.it2, '>' }
>> delimited_string { pitem.it3, '>' }
>> delimited_string { pitem.it4, '>' };
}
std::ostream& operator <<(std::ostream& os, const pack_item & pitem) {
return os << delimited_string { pitem.it1, '>' }
<< delimited_string { pitem.it2, '>' }
<< delimited_string { pitem.it3, '>' }
<< delimited_string { pitem.it4, '>' };
}
As suggested in the comments, you could use a for loop as such:
pack_item a;
std::array<std::reference_wrapper<std::string>, 4> arr{a.it1, a.it2, a.it3, a.it4};
constexpr std::string_view veg = "orange>kiwi>apple>potato";
std::istringstream ss(veg.data());
std::string str;
for(std::size_t idx = 0; std::getline(ss, str, '>'); ++idx){
arr[idx].get() = std::move(str);
}
If you meant "one-liner" in its true sense, then you could be nasty and use:
std::getline(std::getline(std::getline(std::getline(ss, a.it1, '>'), a.it2, '>'), a.it3, '>'), a.it4, '>');
Indeed:
#include <iostream>
#include <sstream>
#include <string>
struct pack_item
{
std::string it1;
std::string it2;
std::string it3;
std::string it4;
};
pack_item pack( const std::string & s )
{
pack_item p;
getline(getline(getline(getline(std::istringstream(s), p.it1,'>'), p.it2,'>'), p.it3,'>'), p.it4);
return p;
}
int main()
{
auto pitem = pack( "orange>kiwi>apple>potato" );
std::cout << pitem.it4 << "<" << pitem.it3 << "<" << pitem.it2 << "<" << pitem.it1 << "\n";
}
BTW, there is nothing wrong with multiple lines of code. The quest for the one-liner is often a distraction to doing things the Right Way™.
What I would do is to create a constructor with std::string_view as argument (the second, which is predefined, would be the separator), and use the find function.
The reason of using std::string_view is posted here: How exactly is std::string_view faster than const std::string&?
struct pack_item
{
std::string it1;
std::string it2;
std::string it3;
std::string it4;
pack_item():it1(){}
pack_item(std::string_view in, char sep = '>'){
auto ptr = in.begin();
auto l_ptr = ptr;
ptr = std::find(ptr, in.end(), sep);
it1 = std::string(l_ptr, ptr++);
l_ptr = ptr;
ptr = std::find(ptr, in.end(), sep);
it2 = std::string(l_ptr, ptr++);
l_ptr = ptr;
ptr = std::find(ptr, in.end(), sep);
it3 = std::string(l_ptr, ptr++);
l_ptr = ptr;
ptr = std::find(ptr, in.end(), sep);
it4 = std::string(l_ptr, ptr++);
}
};
You can see here that this can be easily converted into a loop if you want and stop it by checking:
if(ptr == in.end()) break;
I am writing a program which is suppose to be able to open a file, read each line in and separate a LAST NAME, First name: 3 test scores. the format of the file being read is Mark Titan: 80 80 85. I am suppose to output TITAN, Mark: 80 80 85. We are using strings, and so far using my teachers code i have gotten the file to separate completely. it would show the test scores in order from 1-100(100 comes first because it starts with 1 but i can fix that after) and then alphabetically the names. I need help creating a substr of the line, creating a string of just the full name and splitting that into first and last and then sort them correctly. I have been messing with .find but im not sure how to split this vector into smaller vectors. please help and thank you.
#include <iostream>
#include <fstream>
#include <vector>
#include <sstream>
using namespace std;
void openFile(ifstream &in);
void processFile(ifstream &in, vector<string> &list);
void display(const vector<string> &list);
void sort(vector<string> &list);
int main(int argc, char *argv[])
{
ifstream in;
string line;
vector<string> words;
openFile(in);
processFile(in, words);
display(words);
return 0;
}
void openFile(ifstream &in)
{
string fileName;
bool again;
do
{
again = false;
cout<<"What is the name of the file to you wish to sort? (.txt will be added if no extension is included): ";
cin>>fileName;
if(fileName.find('.') >= fileName.size() )
fileName += ".txt";
in.open(fileName.c_str());
if(in.fail())
{
in.close();
in.clear();
again = true;
cout<<"That file does not exist! Please re-enter"<<endl;
}
}while(again);
}
void processFile(ifstream &in, vector<string> &list)
{
string line, word;
int s1,s2,s3;
stringstream ss;
while(getline(in, line, ':'))
{
ss<<line.substr(line.find(' ') + 1);
while(ss>>word)
list.push_back(word);
ss.clear();
}
sort(list);
}
void sort(vector<string> &list)
{
for(unsigned int i = 0; i < list.size(); ++i)
for(unsigned int j = 0; j < list.size(); ++j)
if(list[i] < list[j])
{
string temp = list[i];
list[i] = list[j];
list[j] = temp;
}
}
void display(const vector<string> &list)
{
cout<<"The file read had "<<list.size()<<" words in it"
<<endl<<"In sorted order, they are:"<<endl;
for(unsigned int i = 0; i < list.size();++i)
cout<<list[i]<<endl;}
You can Tokenize the line twice. First, split on the colon, then split the first token on the space. Also, your teacher will throw empty and malformed lines at you. For instance missing colon or missing name (or missing first/last name). Handle those errors by counting the tokens returned when you split a string or sub string.
void Tokenize(const std::string& theSourceString, std::vector<std::string>& theTokens, const std::string& theDelimiter)
{
// If no delimiter is passed, tokenize on all white space.
if (theDelimiter.empty())
{
std::string aBuffer; // Have a buffer string
std::stringstream ss(theSourceString); // Insert the string into a stream
while (ss >> aBuffer)
{
theTokens.push_back(aBuffer);
}
return; //?
}
// Skip delimiters at beginning.
std::string::size_type aLastPosition = theSourceString.find_first_not_of(theDelimiter, 0);
// Find first "non-delimiter".
std::string::size_type aPosition = theSourceString.find_first_of(theDelimiter, aLastPosition);
while (aPosition != std::string::npos || aLastPosition != std::string::npos)
{
// Found a token, add it to the vector.
theTokens.push_back(theSourceString.substr(aLastPosition, aPosition - aLastPosition));
// Skip delimiters. Note the "not_of"
aLastPosition = theSourceString.find_first_not_of(theDelimiter, aPosition);
// Find next "non-delimiter"
aPosition = theSourceString.find_first_of(theDelimiter, aLastPosition);
}
}
Example use:
std::vector<std::string> tokens;
Tokenize("{ 1, 2, 3, 4, 5 }", tokens, "{}, " );
I have this Utility class that has a bunch of methods for string manipulation. I will show the class function for splitting strings with a delimiter. This class has private constructor so you can not create an instance of this class. All the methods are static methods.
Utility.h
#ifndef UTILITY_H
#define UTILITY_h
// Library Includes Here: vector, string etc.
class Utility {
public:
static std::vector<std::string> splitString( const std::string& strStringToSplit,
const std::string& strDelimiter,
const bool keepEmpty = true );
private:
Utility();
};
Utility.cpp
std::vector<std::string> Utility::splitString( const std::string& strStringToSplit,
const std::string& strDelimiter,
const bool keepEmpty ) {
std::vector<std::string> vResult;
if ( strDelimiter.empty() ) {
vResult.push_back( strStringToSplit );
return vResult;
}
std::string::const_iterator itSubStrStart = strStringToSplit.begin(), itSubStrEnd;
while ( true ) {
itSubStrEnd = search( itSubStrStart, strStringToSplit.end(), strDelimiter.begin(), strDelimiter.end() );
std::string strTemp( itSubStrStart, itSubStrEnd );
if ( keepEmpty || !strTemp.empty() ) {
vResult.push_back( strTemp );
}
if ( itSubStrEnd == strStringToSplit.end() ) {
break;
}
itSubStrStart = itSubStrEnd + strDelimiter.size();
}
return vResult;
}
Main.cpp -- Usage
#include <string>
#include <vector>
#include "Utility.h"
int main() {
std::string myString( "Hello World How Are You Today" );
std::vector<std::string> vStrings = Utility::splitString( myString, " " );
// Check Vector Of Strings
for ( unsigned n = 0; n < vStrings.size(); ++n ) {
std::cout << vStrings[n] << " ";
}
std::cout << std::endl;
// The Delimiter is also not restricted to just a single character
std::string myString2( "Hello, World, How, Are, You, Today" );
// Clear Out Vector
vStrings.clear();
vStrings = Utility::splitString( myString2, ", " ); // Delimiter = Comma & Space
// Test Vector Again
for ( unsigned n = 0; n < vStrings.size(); ++n ) {
std::cout << vStrings[n] << " ";
}
std::cout << std::endl;
return 0;
}
I have a file, at the end of each line there is possibly a newline:
111\n
100\n
101
In C++ you can load the lines of a file into an array of byte strings like this:
auto lines_from( istream& is )
-> vector<string>
{
string line;
vector<string> result;
while( getline( is, line ) )
{
result.push_back( line );
}
return result;
}
auto main() -> int
{
vector<string> const lines = lines_from( cin );
// Use it.
}
Here string is std::string from the <string> header, getline is std::getline from the same header, and vector is std::vector from the <vector> header. I chose to use a descriptive name for the function, lines_from. However, it's commonly named readall.
Where you absolutely need a char**, presumably with an assumption of some given buffer size for each string, then you can use a vector of pointers, pointing to buffers that e.g. are managed by a class like this:
class C_strings
{
private:
vector<string> buffers_;
vector<char*> pointers_;
int bufsize_;
C_strings( C_strings const& ) = delete;
auto operator=( C_strings const& ) -> C_strings& = delete;
public:
auto pointer() -> char** { return pointers_.data(); }
auto bufsize() const -> int { return bufsize_; }
C_strings( vector<string> const& strings, int const bufsize )
: buffers_( strings )
, bufsize_( bufsize )
{
pointers_.reserve( buffers_.size() + 1 );
for( string& s : buffers_ )
{
s.reserve( bufsize );
if( s.empty() or s.back() != '\0' ) { s += '\0'; }
pointers_.push_back( &s[0] );
}
pointers_.push_back( nullptr );
}
C_strings( C_strings&& other )
: buffers_( move( other.buffers_ ) )
, pointers_( move( other.pointers_ ) )
{}
};
Then let's say you want to call a double-star function like this:
void doublestarfunc( char** const lines )
{
using std::cout;
for( char** pps = lines; *pps != nullptr; ++pps )
{
if( strlen( *pps ) < 40 ) { strcat( *pps, " < Oh la la!" ); }
cout << *pps << '\n';
}
cout << '\n';
}
It can be done very simply:
using namespace std; // cin, cout
int const columns = 80;
int const cstring_bufsize = columns + 1;
auto c_strings = C_strings( lines_from( cin ), cstring_bufsize );
doublestarfunc( c_strings.pointer() );
But is it a good idea? No, except when you have to relate to an existing C style API. For C++ code, better restructure it to use C++ std::string throughout.
The array is set up like so:
string * str = new string[11];
Where the content of the string looks like:
str[0]=AAAAAAAA,BBBBBBBB,CCCCCCCC,DDDDDDDD,EEEE,FFFFFFFF,GGGGGGGG,HHHH,IIII,JJJJ,KKKK
str[1]=AAAAAAAA,BBBBBBBB,CCCCCCCC,DDDDDDDD,EEEE,FFFFFFFF,GGGGGGGG,HHHH,IIII,JJJJ,KKKK
str[2]=AAAAAAAA,BBBBBBBB,CCCCCCCC,DDDDDDDD,EEEE,FFFFFFFF,GGGGGGGG,HHHH,IIII,JJJJ,KKKK
...
str[12]=AAAAAAAA,BBBBBBBB,CCCCCCCC,DDDDDDDD,EEEE,FFFFFFFF,GGGGGGGG,HHHH,IIII,JJJJ,KKKK
Another array looks like:
string * type = new string[11];
Where the content is:
type[0]="1";
type[1]="1";
type[2]="1";
type[3]="1";
type[4]="2";
type[5]="1";
type[6]="1";
type[7]="2";
type[8]="2";
type[9]="2";
type[10]="2";
These types correspond to each value in the string, so, for the first string:
1=float , 2=integer
AAAAAAAA would be 1; or an float
BBBBBBBB would be 1; or an float
CCCCCCCC would be 1; or an float
DDDDDDDD would be 1; or an float
EEEE would be 2; or a integer
FFFFFFFF would be 1; or an float
GGGGGGGG would be 1; or an float
HHHH would be 2; or a integer
IIII would be 2; or a integer
JJJJ would be 2; or a integer
KKKK would be 2; or a integer
In addition the single type array works for all strings in the str array.
Now for my question:
How do i use the above information to extract each individual values from the string and convert it to an integer or a float based on the value in the type array.
BE AWARE:
Boost is not available to me
The conversion functions look like: (The other is formatted similarly except for an integer)
unsigned int BinaryParser::hexToFloat(std::string hexInput)
{
std::stringstream ss (hexInput);
unsigned int floatOutput;
ss >> hex >> floatOutput;
return reinterpret_cast<float&>(floatOutput);
}
OK, first part: extract the comma-separated strings. One way would be:
std::vector<std::string> split( std::string s ){
std::vector<std::string> vec;
int pos = 0;
while( std::string::npos != (pos = s.find( ',', pos ) ) ){
vec.push_back( s.substr( 0, pos ) );
s = s.substr( pos + 1 );
}
vec.push_back( s );
return vec;
}
Depends on the input string being "well-behaved".
This converts an int from hex digits:
int convInt( std::string hexInput ){
std::istringstream iss (hexInput);
uint16_t intOutput;
iss >> std::hex >> intOutput;
return intOutput;
}
Float cannot be read using std::hex, so we assume the HHHHHHHH is a float's bytes interpreted as an int32_t.
float convFloat( std::string & hexInput ){
std::istringstream iss (hexInput);
uint32_t intOutput;
iss >> std::hex >> intOutput;
return reinterpret_cast<float&>(intOutput);
}
For storing the results we can use:
enum TypeTag { eInt, eFloat };
class IntOrFloat {
public:
IntOrFloat( int i ) : typeTag(eInt),integer(i),floating(0) { }
IntOrFloat( float f ) : typeTag(eFloat),integer(0),floating(f) { }
virtual ~IntOrFloat(){}
int getInt() const { return integer; }
float getFloat() const { return floating; }
TypeTag getTypeTag() const { return typeTag; }
private:
TypeTag typeTag;
int integer;
float floating;
};
std::ostream& operator<< (std::ostream& os, const IntOrFloat& iof){
switch( iof.getTypeTag() ){
case eInt:
os << iof.getInt();
break;
case eFloat:
os << iof.getFloat();
break;
}
return os;
}
To convert one comma-separated string according to the type vector:
std::vector<IntOrFloat> convert( const std::vector<std::string> t, const std::string s ){
std::vector<IntOrFloat> results;
std::vector<std::string> hexes = split( s );
for( int i = 0; i < hexes.size(); i++ ){
if( t[i] == "1" ){
results.push_back( IntOrFloat( convFloat( hexes[i] ) ) );
} else {
results.push_back( IntOrFloat( convInt( hexes[i] ) ) );
}
}
return results;
}
That's it, then. - I've been using vector instead of the arrays. You can easily convert, e.g.
std::vector<std::string> fromArray( std::string strs[], int n ){
std::vector<std::string> strings;
for( int i = 0; i < n; i++ ) strings.push_back( std::string( strs[i] ) );
return strings;
}
#define fromArray(a) fromArray( a, (sizeof(a)/sizeof(a[0])) )
And here is my test program:
#define LENGTH(a) (sizeof(a)/sizeof(a[0]))
int main(){
std::string t[] = {"2","1","1","2"};
std::string s[] = {
"8000,4048f5c3,bf000000,FFFF",
"0001,42f6e979,c44271ba,7FFF",
"1234,00000000,447a0000,5678"
};
std::vector<std::string> types = fromArray( t );
std::vector<std::string> strings = fromArray( s );
for( std::vector<std::string>::iterator it = strings.begin() ; it != strings.end(); ++it ){
std::vector<IntOrFloat> results = convert( types, *it );
std::cout << "converting string " << *it << ", " << results.size() << " values:" << std::endl;
for( std::vector<IntOrFloat>::iterator iof = results.begin() ; iof != results.end(); ++iof ){
std::cout << " " << *iof << std::endl;
}
}
}
I'm working on a C/C++ app (in Visual Studio 2010) where I need to tokenize a comma delimited string and I would like this to be as fast as possible. Currently I'm using strtok_s. I ran some tests of strtok_s versus sscanf and it seemed like strtok_s was faster (unless I wrote a terrible implementation :) ) but I was wondering if anyone could suggest a faster alternative.
For sheer runtime speed, boost.spirit.qi is an excellent candidate.
The best you can do is to make sure you go through the string only once, and build the output on the fly. Start pulling off chars in to a temp buffer, and when you encounter the delimiter save the temp buffer to the output collection, clear the temp buffer, rinse and repeat.
Here's a basic implementation that does this.
template<class C=char>
struct basic_token
{
typedef std::basic_string<C> token_string;
typedef unsigned long size_type;
token_string token_, delim_;
basic_token(const token_string& token, const token_string& delim = token_string());
};
template<class C>
basic_token<C>::basic_token(const token_string& token, const token_string& delim)
: token_(token),
delim_(delim)
{
}
typedef basic_token<char> token;
template<class Char, class Iter> void tokenize(const std::basic_string<Char>& line, const Char* delims, Iter itx)
{
typedef basic_token<Char> Token;
typedef std::basic_string<Char> TString;
for( TString::size_type tok_begin = 0, tok_end = line.find_first_of(delims, tok_begin);
tok_begin != TString::npos; tok_end = line.find_first_of(delims, tok_begin) )
{
if( tok_end == TString::npos )
{
(*itx++) = Token(TString(&line[tok_begin]));
tok_begin = tok_end;
}
else
{
(*itx++) = Token(TString(&line[tok_begin], &line[tok_end]), TString(1, line[tok_end]));
tok_begin = tok_end + 1;
}
}
}
template<class Char, class Iter> void tokenize(const Char* line, const Char* delim, Iter itx)
{
tokenize(std::basic_string<Char>(line), delim, itx);
}
template<class Stream, class Token> Stream& operator<<(Stream& os, const Token& tok)
{
os << tok.token_ << "\t[" << tok.delim_ << "]";
return os;
}
...which you would use like this:
string raw = "35=BW|49=TEST|1346=REQ22|1355=2|1182=88500|1183=88505|10=087^";
vector<stoken> tokens;
tokenize(raw, "|", back_inserter(tokens));
copy(tokens.begin(), tokens.end(), ostream_iterator<stoken>(cout, "\n"));
Output is:
35=BW [|]
49=TEST [|]
1346=REQ22 [|]
1355=2 [|]
1182=88500 [|]
1183=88505 [|]
10=087^ []
I would remind you that there is a risk with strtok and its ilk
that you can get back a different number of tokens than you might want.
one|two|three would yield 3 tokens
while
one|||three would yield 2.
The test of mmhmm haven't make use of spirit correctly, his grammars are flaw.
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/io.hpp>
#include <boost/spirit/include/qi.hpp>
/****************************strtok_r************************/
typedef struct sTokenDataC {
char *time;
char *symb;
float bid;
float ask;
int bidSize;
int askSize;
} tokenDataC;
tokenDataC parseTick( char *line, char *parseBuffer )
{
tokenDataC tokenDataOut;
tokenDataOut.time = strtok_r( line,",", &parseBuffer );
tokenDataOut.symb = strtok_r( nullptr,",", &parseBuffer );
tokenDataOut.bid = atof(strtok_r( nullptr,",", &parseBuffer ));
tokenDataOut.ask = atof(strtok_r( nullptr , ",", &parseBuffer ));
tokenDataOut.bidSize = atoi(strtok_r( nullptr,",", &parseBuffer ));
tokenDataOut.askSize = atoi(strtok_r( nullptr, ",", &parseBuffer ));
return tokenDataOut;
}
void test_strcpy_s(int iteration)
{
char *testStringC = new char[64];
char *lineBuffer = new char[64];
printf("test_strcpy_s....\n");
strcpy(testStringC,"09:30:00,TEST,13.24,15.32,10,14");
{
timeEstimate<> es;
tokenDataC tokenData2;
for(int i = 0; i < iteration; i++)
{
strcpy(lineBuffer, testStringC);//this is more realistic since this has to happen because I want to preserve the line
tokenData2 = parseTick(lineBuffer, testStringC);
//std::cout<<*tokenData2.time<<", "<<*tokenData2.symb<<",";
//std::cout<<tokenData2.bid<<", "<<tokenData2.ask<<", "<<tokenData2.bidSize<<", "<<tokenData2.askSize<<std::endl;
}
}
delete[] lineBuffer;
delete[] testStringC;
}
/****************************strtok_r************************/
/****************************spirit::qi*********************/
namespace qi = boost::spirit::qi;
struct tokenDataCPP
{
std::string time;
std::string symb;
float bid;
float ask;
int bidSize;
int askSize;
void clearTimeSymb(){
time.clear();
symb.clear();
}
};
BOOST_FUSION_ADAPT_STRUCT(
tokenDataCPP,
(std::string, time)
(std::string, symb)
(float, bid)
(float, ask)
(int, bidSize)
(int, askSize)
)
void test_spirit_qi(int iteration)
{
std::string const strs("09:30:00,TEST,13.24,15.32,10,14");
tokenDataCPP data;
auto myString = *~qi::char_(",");
auto parser = myString >> "," >> myString >> "," >> qi::float_ >> "," >> qi::float_ >> "," >> qi::int_ >> "," >> qi::int_;
{
std::cout<<("test_spirit_qi....\n");
timeEstimate<> es;
for(int i = 0; i < iteration; ++i){
qi::parse(std::begin(strs), std::end(strs), parser, data);
//std::cout<<data.time<<", "<<data.symb<<", ";
//std::cout<<data.bid<<", "<<data.ask<<", "<<data.bidSize<<", "<<data.askSize<<std::endl;
data.clearTimeSymb();
}
}
}
/****************************spirit::qi*********************/
int main()
{
int const ITERATIONS = 500 * 10000;
test_strcpy_s(ITERATIONS);
test_spirit_qi(ITERATIONS);
}
Since clang++ don't have strtok_s, I use strtok_r to replace it
Iterate 500 * 10k, the times are
test_strcpy_s : 1.40951
test_spirit_qi : 1.34277
Their times are almost the same, not much different.
compiler, clang++ 3.2, -O2
codes of timeEstime
This should be quite fast, no temp buffers, it allocates empty tokes too.
template <class char_t, class char_traits_t,
class char_allocator_t, class string_allocator_t>
inline void _tokenize(
const std::basic_string<char_t, char_traits_t, char_allocator_t>& _Str,
const char_t& _Tok,
std::vector<std::basic_string<char_t, char_traits_t, char_allocator_t>,
string_allocator_t>& _Tokens,
const size_t& _HintSz=10)
{
_Tokens.reserve(_HintSz);
const char_t* _Beg(&_Str[0]), *_End(&_Str[_Str.size()]);
for (const char_t* _Ptr=_Beg; _Ptr<_End; ++_Ptr)
{
if (*_Ptr == _Tok)
{
_Tokens.push_back(
std::basic_string<char_t, char_traits_t,
char_allocator_t>(_Beg, _Ptr));
_Beg = 1+_Ptr;
}
}
_Tokens.push_back(
std::basic_string<char_t, char_traits_t,
char_allocator_t>(_Beg, _End));
}
After testing and timing each suggested candidate, the result is that strtok is clearly the fastest. Although I'm not surprised my love of testing dictated it was worth exploring other options. [Note: Code was thrown together edits welcome :) ]
Given:
typedef struct sTokenDataC {
char *time;
char *symb;
float bid;
float ask;
int bidSize;
int askSize;
} tokenDataC;
tokenDataC parseTick( char *line, char *parseBuffer )
{
tokenDataC tokenDataOut;
tokenDataOut.time = strtok_s( line,",", &parseBuffer );
tokenDataOut.symb = strtok_s( null,",", &parseBuffer );
tokenDataOut.bid = atof(strtok_s( null,",", &parseBuffer ));
tokenDataOut.ask = atof(strtok_s( null , ",", &parseBuffer ));
tokenDataOut.bidSize = atoi(strtok_s( null,",", &parseBuffer ));
tokenDataOut.askSize = atoi(strtok_s( null, ",", &parseBuffer ));
return tokenDataOut;
}
char *testStringC = new char[64];
strcpy(testStringC,"09:30:00,TEST,13.24,15.32,10,14");
int _tmain(int argc, _TCHAR* argv[])
{
char *lineBuffer = new char[64];
printf("Testing method2....\n");
for(int i = 0; i < ITERATIONS; i++)
{
strcpy(lineBuffer,testStringC);//this is more realistic since this has to happen because I want to preserve the line
tokenData2 = parseTick(lineBuffer,parseBuffer);
}
}
vs calling John Diblings impl via:
struct sTokenDataCPP
{
std::basic_string<char> time;
std::basic_string<char> symb;
float bid;
float ask;
int bidSize;
int askSize;
};
std::vector<myToken> tokens1;
tokenDataCPP tokenData;
printf("Testing method1....\n");
for(int i = 0; i < ITERATIONS; i++)
{
tokens1.clear();
tokenize(raw, ",", std::back_inserter(tokens1));
tokenData.time.assign(tokens1.at(0).token_);
tokenData.symb.assign(tokens1.at(1).token_);
tokenData.ask = atof(tokens1.at(2).token_.c_str());
tokenData.bid = atof(tokens1.at(3).token_.c_str());
tokenData.askSize = atoi(tokens1.at(4).token_.c_str());
tokenData.bidSize = atoi(tokens1.at(5).token_.c_str());
}
vs a simple boost.spirit.qi implementation defining the grammer as follows:
template <typename Iterator>
struct tick_parser : grammar<Iterator, tokenDataCPP(), boost::spirit::ascii::space_type>
{
tick_parser() : tick_parser::base_type(start)
{
my_string %= lexeme[+(boost::spirit::ascii::char_ ) ];
start %=
my_string >> ','
>> my_string >> ','
>> float_ >> ','
>> float_ >> ','
>> int_ >> ','
>> int_
;
}
rule<Iterator, std::string(), boost::spirit::ascii::space_type> my_string;
rule<Iterator, sTokenDataCPP(), boost::spirit::ascii::space_type> start;
};
with ITERATIONS set to 500k:
strtok version: 2s
john's version: 115s
boost: 172s
I can post the full code is people want this, I just didn't want to take up a huge amt of space