Efficient Parsing FIX Message c++ - c++

I need to parse a file which contains the financial FIX protocol. A sample is below:
1128=99=24535=X49=CME75=2017040934=82452=2017040920070508394791460=201704092007050800000005799=10000000268=2279=0269=B48=900655=ESM783=23271=1473460731=100000005796=17263279=0269=C48=900655=ESM783=24271=2861528731=100000005796=1726310=219
My application will load many files each with many millions of rows of historical data so performance needs to be considered.
I have reviewed similar questions online around FIX parsing, as well as explored the QuickFix library (specifically using FIX::Message(string) to crack the message) but i aim to have a throughput better than what i was able to achieve using quickfix.
I wrote up a mock for the most common of the message types (Market Data Incremental Refresh) to see the kinds of speed i was achieving, and am most unimpressed with the result of ~60,000 messages / second including the file parsing of a 3m line file.
This is my first c++ application so i'm expecting there to be many flaws in my approach and any advice on how to improve its performance would be greatly appreciated.
Currently the flow is file->string->MDIncrementalRefresh. An MDIncrementalRefresh has two optional repeating groups which i'm using a vector to store as they are of unknown size from message to message.
I'm guessing the fact that i'm reconstructing MDIncrementalRefresh upon every update is causing unnecessary overhead compared to if i were to re-use the object by updating the contents of the previous MDIncrementalRefresh?
Thanks in Advance
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
using namespace std;
/// Splits `s` at every occurrence of `delimiter`.
///
/// The result always contains at least one element: text without any
/// delimiter yields a single element holding the whole input, and a trailing
/// delimiter yields a trailing empty element.
///
/// @param s          text to split (taken by reference; it is never modified,
///                   so copying it on every call was pure overhead)
/// @param delimiter  single character that separates tokens
/// @return the tokens in order of appearance, without the delimiters
std::vector<std::string> string_split(const std::string& s, const char delimiter)
{
    std::vector<std::string> output;
    std::size_t start = 0;
    for (;;)
    {
        const std::size_t end = s.find(delimiter, start);
        if (end == std::string::npos)
        {
            // Last token: everything from `start` to the end of the input.
            output.emplace_back(s, start);
            break;
        }
        output.emplace_back(s, start, end - start);
        start = end + 1;
    }
    return output;
}
// Separators used when splitting a raw line: one between fields, one between
// the tag and the value inside a field.
const char FIX_FIELD_DELIMITER = '\x01';
const char FIX_KEY_DELIMITER = '=';
// Index of the first character of a value string; used where a field is
// stored as a single char.
const int STR_TO_CHAR = 0;
// Positions of the tag and of the value in the vector produced by splitting
// one "tag=value" field.
const int KEY = 0;
const int VALUE = 1;
// Tag numbers kept as strings so they can be compared directly with the text
// found before the '=' of each field.
const string Field_TransactTime = "60";
const string Field_MatchEventIndicator = "5799";
const string Field_NoMDEntries = "268";
const string Field_MDUpdateAction = "279";
const string Field_MDEntryType = "269";
const string Field_SecurityID = "48";
const string Field_RptSeq = "83";
const string Field_MDEntryPx = "270";
const string Field_MDEntrySize = "271";
const string Field_NumberOfOrders = "346";
const string Field_MDPriceLevel = "1023";
const string Field_OpenCloseSettlFlag = "286";
const string Field_AggressorSide = "5797";
const string Field_TradingReferenceDate = "5796";
const string Field_HighLimitPrice = "1149";
const string Field_LowLimitPrice = "1148";
const string Field_MaxPriceVariation = "1143";
const string Field_ApplID = "1180";
const string Field_NoOrderIDEntries = "37705";
const string Field_OrderID = "37";
const string Field_LastQty = "32";
const string Field_SettlPriceType= "731";
/// One element of the order-id repeating group of a refresh message.
class OrderIdEntry {
public:
    std::string OrderID;  ///< raw text of the order id field
    int LastQty = 0;      ///< stays 0 until a quantity field is seen for this entry
};
/// One element of the market-data repeating group of a refresh message.
///
/// Every member has a default so that a default-initialized entry never holds
/// indeterminate values, whichever fields the message actually carries.
struct MDEntry {
public:
    char MDUpdateAction = '\0';
    char MDEntryType = '\0';
    int SecurityID = 0;
    int RptSeq = 0;
    double MDEntryPx = 0.0;
    int MDEntrySize = 0;
    int NumberOfOrders = 0;
    int MDPriceLevel = 0;
    int OpenCloseSettlFlag = 0;
    std::string SettlPriceType = "";
    int AggressorSide = 0;
    std::string TradingReferenceDate = "";
    double HighLimitPrice = 0.0;
    double LowLimitPrice = 0.0;
    double MaxPriceVariation = 0.0;
    int ApplID = 0;
};
class MDIncrementalRefresh {
public:
string TransactTime;
string MatchEventIndicator;
int NoMDEntries;
int NoOrderIDEntries = 0;
vector<MDEntry> MDEntries;
vector<OrderIdEntry> OrderIdEntries;
MDIncrementalRefresh(const string& message)
{
MDEntry* currentMDEntry = nullptr;
OrderIdEntry* currentOrderIDEntry = nullptr;
for (auto fields : string_split(message, FIX_FIELD_DELIMITER))
{
vector<string> kv = string_split(fields, FIX_KEY_DELIMITER);
// Header :: MDIncrementalRefresh
if (kv[KEY] == Field_TransactTime) this->TransactTime = kv[VALUE];
else if (kv[KEY] == Field_MatchEventIndicator) this->MatchEventIndicator = kv[VALUE];
else if (kv[KEY] == Field_NoMDEntries) this->NoMDEntries = stoi(kv[VALUE]);
else if (kv[KEY] == Field_NoOrderIDEntries) this->NoOrderIDEntries = stoi(kv[VALUE]);
// Repeating Group :: MDEntry
else if (kv[KEY] == Field_MDUpdateAction)
{
MDEntries.push_back(MDEntry());
currentMDEntry = &MDEntries.back(); // use pointer for fast lookup on subsequent repeating group fields
currentMDEntry->MDUpdateAction = kv[VALUE][STR_TO_CHAR];
}
else if (kv[KEY] == Field_MDEntryType) currentMDEntry->MDEntryType = kv[VALUE][STR_TO_CHAR];
else if (kv[KEY] == Field_SecurityID) currentMDEntry->SecurityID = stoi(kv[VALUE]);
else if (kv[KEY] == Field_RptSeq) currentMDEntry->RptSeq = stoi(kv[VALUE]);
else if (kv[KEY] == Field_MDEntryPx) currentMDEntry->MDEntryPx = stod(kv[VALUE]);
else if (kv[KEY] == Field_MDEntrySize) currentMDEntry->MDEntrySize = stoi(kv[VALUE]);
else if (kv[KEY] == Field_NumberOfOrders) currentMDEntry->NumberOfOrders = stoi(kv[VALUE]);
else if (kv[KEY] == Field_MDPriceLevel) currentMDEntry->MDPriceLevel = stoi(kv[VALUE]);
else if (kv[KEY] == Field_OpenCloseSettlFlag) currentMDEntry->OpenCloseSettlFlag = stoi(kv[VALUE]);
else if (kv[KEY] == Field_SettlPriceType) currentMDEntry->SettlPriceType= kv[VALUE];
else if (kv[KEY] == Field_AggressorSide) currentMDEntry->AggressorSide = stoi(kv[VALUE]);
else if (kv[KEY] == Field_TradingReferenceDate) currentMDEntry->TradingReferenceDate = kv[VALUE];
else if (kv[KEY] == Field_HighLimitPrice) currentMDEntry->HighLimitPrice = stod(kv[VALUE]);
else if (kv[KEY] == Field_LowLimitPrice) currentMDEntry->LowLimitPrice = stod(kv[VALUE]);
else if (kv[KEY] == Field_MaxPriceVariation) currentMDEntry->MaxPriceVariation = stod(kv[VALUE]);
else if (kv[KEY] == Field_ApplID) currentMDEntry->ApplID = stoi(kv[VALUE]);
// Repeating Group :: OrderIDEntry
else if (kv[KEY] == Field_OrderID) {
OrderIdEntries.push_back(OrderIdEntry());
currentOrderIDEntry = &OrderIdEntries.back();
currentOrderIDEntry->OrderID = kv[VALUE];
}
else if (kv[KEY] == Field_LastQty) currentOrderIDEntry->LastQty = stol(kv[VALUE]);
}
}
};
int main() {
//std::string filename = "test/sample";
std::string line;
std::ifstream file (filename);
int count = 0;
if (file.is_open())
{
while ( std::getline( file, line ) )
{
MDIncrementalRefresh md(line);
if (md.TransactTime != "") {
count++;
}
}
file.close();
}
cout << count << endl;
return 0;
}

For those who are interested, the majority of the time spent in the code above was in the string_split function. The large number of calls to string_split resulted in many (expensive) heap allocations.
An alternative implementation, string_split_optim, re-uses a pre-allocated vector. This prevents unnecessary heap allocation/expansion on every call. The sample below, running 1.5m iterations, suggests a 3.4x speed improvement. Because vector.clear() does not itself release the vector's allocated memory back to the heap, subsequent calls to string_split_optim whose resulting vector size is <= the previous one need no additional allocations for the vector.
#include <string>
#include <vector>
/// Splits `s` at every `delimiter` into `output`, replacing whatever `output`
/// held before. The caller owns `output` and can pass the same vector again
/// on later calls.
///
/// As with string_split, the result always has at least one element and a
/// trailing delimiter produces a trailing empty element.
void string_split_optim(std::vector<std::string>& output, const std::string &s, const char delimiter)
{
    output.clear();
    std::size_t tokenBegin = 0;
    for (std::size_t hit = s.find(delimiter); ; hit = s.find(delimiter, tokenBegin))
    {
        if (hit == std::string::npos)
        {
            // Whatever is left after the last delimiter is the final token.
            output.emplace_back(s.substr(tokenBegin));
            return;
        }
        output.emplace_back(s.substr(tokenBegin, hit - tokenBegin));
        tokenBegin = hit + 1;
    }
}
/// Times NUM_RUNS calls of string_split against NUM_RUNS calls of
/// string_split_optim on the same message and prints both durations.
int main()
{
    const int NUM_RUNS = 1500000;
    const std::string s = "1128=9\u00019=174\u000135=X\u000149=CME\u000175=20170403\u000134=1061\u000152=20170402211926965794928\u000160=20170402211926965423233\u00015799=10000100\u0001268=1\u0001279=1\u0001269=1\u000148=9006\u000155=ESM7\u000183=118\u0001270=236025.0\u0001271=95\u0001346=6\u00011023=9\u000110=088\u0001";
    std::vector<std::string> vec;

    // Pass 1: a new vector is returned and assigned on every call.
    const clock_t freshStart = clock();
    int run = 0;
    while (run < NUM_RUNS)
    {
        vec = string_split(s, '=');
        ++run;
    }
    const double freshSeconds = (double) (clock() - freshStart) / CLOCKS_PER_SEC;
    printf("Time taken: %.2fs\n", freshSeconds);

    // Pass 2: the same vector is handed back in on every call.
    const clock_t reuseStart = clock();
    for (run = 0; run != NUM_RUNS; ++run)
    {
        string_split_optim(vec, s, '=');
        vec.clear();
    }
    const double reuseSeconds = (double) (clock() - reuseStart) / CLOCKS_PER_SEC;
    printf("Time taken: %.2fs\n", reuseSeconds);
}
The result on my macbook was a 3.4x improvement.
Time taken: 6.60s
Time taken: 1.94s
Additionally, the MDIncrementalRefresh object was being repeatedly constructed (on the stack, but its vector members were also being expanded on the heap). In line with the above findings on string_split, I decided to re-use the temporary object and simply clear its previous state, resulting in another significant performance increase.

Related

How can I speed up parsing of large strings?

So I've made a program that reads in various config files. Some of these config files can be small, some can be semi-large (largest one is 3,844 KB).
The read in file is stored in a string (in the program below it's called sample).
I then have the program extract information from the string based on various formatting rules. This works well, the only issue is that when reading larger files it is very slow....
I was wondering if there was anything I could do to speed up the parsing or if there was an existing library that does what I need (extract string up until a delimiter & extract string string in between 2 delimiters on the same level). Any assistance would be great.
Here's my code & a sample of how it should work...
#include "stdafx.h"
#include <string>
#include <vector>
/// Removes and returns the leading part of `original_string` that ends at the
/// `delimiters_to_skip`-th occurrence of `delimiter`.
///
/// The returned text excludes that delimiter; the text and the whole
/// delimiter (all of its characters) are erased from `original_string`.
/// One leading and one trailing double quote are stripped from the result.
///
/// @param original_string    buffer to consume from (modified in place)
/// @param delimiter          separator text; may be longer than one character
/// @param delimiters_to_skip which occurrence ends the extracted text. When
///        fewer occurrences exist, the last one found is used. Values <= 0
///        extract nothing and leave `original_string` untouched as long as the
///        delimiter is present.
/// @return the extracted text; the entire input (which is then cleared) when
///         `delimiter` does not occur at all
std::string ExtractStringUntilDelimiter(
    std::string& original_string,
    const std::string& delimiter,
    const int delimiters_to_skip = 1)
{
    std::string needle;
    // How far to move past a match. An empty delimiter matches everywhere, so
    // step by one character to keep consuming input.
    const std::size_t step = delimiter.empty() ? 1 : delimiter.size();

    std::size_t cut = original_string.find(delimiter);
    if (cut == std::string::npos)
    {
        needle = original_string;
        original_string.clear();
    }
    else if (delimiters_to_skip > 0)
    {
        // Advance to later occurrences, each search starting after the
        // previous match rather than at the beginning again.
        for (int found = 1; found < delimiters_to_skip; ++found)
        {
            const std::size_t next = original_string.find(delimiter, cut + step);
            if (next == std::string::npos)
                break;
            cut = next;
        }
        needle = original_string.substr(0, cut);
        // Remove the found string and the complete delimiter from the input.
        original_string.erase(0, cut + step);
    }

    if (!needle.empty() && needle[0] == '\"')
    {
        needle.erase(0, 1);
    }
    if (!needle.empty() && needle[needle.length() - 1] == '\"')
    {
        needle.pop_back();
    }
    return needle;
}
/// Removes every leading `delimiter` character from `original_string`.
///
/// The leading run is removed with a single erase instead of one erase per
/// character, so the cost is linear in the string length.
void ExtractInitialDelimiter(
    std::string& original_string,
    const char delimiter)
{
    // find_first_not_of returns npos when the string is empty or consists only
    // of delimiters; erase(0, npos) then clears the whole string.
    original_string.erase(0, original_string.find_first_not_of(delimiter));
}
/// Removes every leading and every trailing `delimiter` character from
/// `original_string`.
void ExtractInitialAndFinalDelimiters(
    std::string& original_string,
    const char delimiter)
{
    ExtractInitialDelimiter(original_string, delimiter);

    const std::size_t lastKept = original_string.find_last_not_of(delimiter);
    if (lastKept == std::string::npos)
    {
        // Nothing but delimiters (or nothing at all) is left.
        original_string.clear();
        return;
    }
    original_string.erase(lastKept + 1);
}
/// Removes the first balanced `opening_delimiter` ... `closing_delimiter`
/// section from `original_string` and returns the text between the two.
///
/// Nested pairs are counted, so the section ends at the closing delimiter
/// that matches the first opening one. Leading and trailing '\n' characters
/// are stripped from both the returned text and what remains of
/// `original_string`.
///
/// @return the enclosed text, or "" when there is no opening delimiter or no
///         matching closing delimiter (the input is then left unchanged)
std::string ExtractStringBetweenDelimiters(
    std::string& original_string,
    const std::string& opening_delimiter,
    const std::string& closing_delimiter)
{
    const std::size_t first_delimiter = original_string.find(opening_delimiter);
    if (first_delimiter == std::string::npos)
    {
        return "";
    }

    // True when `token` is non-empty and occurs in the input exactly at `pos`.
    const auto matches_at = [&original_string](std::size_t pos, const std::string& token)
    {
        return !token.empty() && original_string.compare(pos, token.size(), token) == 0;
    };

    const std::size_t opening_index = first_delimiter + opening_delimiter.size();
    int total_open = 1;
    for (std::size_t i = opening_index; i < original_string.size(); ++i)
    {
        if (matches_at(i, opening_delimiter))
        {
            ++total_open;
        }
        if (matches_at(i, closing_delimiter))
        {
            --total_open;
        }
        if (total_open != 0)
        {
            continue;
        }

        std::string needle = original_string.substr(opening_index, i - opening_index);
        // erase takes a count, not an end index: remove exactly the span from
        // the opening delimiter through the end of the closing delimiter.
        original_string.erase(first_delimiter, i + closing_delimiter.size() - first_delimiter);
        // Remove new line symbols
        ExtractInitialAndFinalDelimiters(needle, '\n');
        ExtractInitialAndFinalDelimiters(original_string, '\n');
        return needle;
    }
    return "";
}
/// Demonstrates the extraction helpers on a small nested sample and prints
/// the four extracted lines.
int main()
{
    std::string sample = "{\nLine1\nLine2\n{\nSubLine1\nSubLine2\n}\n}";

    // Outer block first, then its two plain lines.
    std::string outer = ExtractStringBetweenDelimiters(sample, "{", "}");
    const std::string firstLine = ExtractStringUntilDelimiter(outer, "\n");
    const std::string secondLine = ExtractStringUntilDelimiter(outer, "\n");

    // What is left of the outer block is the nested one.
    std::string inner = ExtractStringBetweenDelimiters(outer, "{", "}");
    const std::string firstSubLine = ExtractStringUntilDelimiter(inner, "\n");
    const std::string secondSubLine = ExtractStringUntilDelimiter(inner, "\n");

    // Just for testing...
    printf("LineOne: %s\n", firstLine.c_str());
    printf("LineTwo: %s\n", secondLine.c_str());
    printf("\tSubLineOne: %s\n", firstSubLine.c_str());
    printf("\tSubLineTwo: %s\n", secondSubLine.c_str());
    system("pause");
}
Use string_view or a hand rolled one.
Don't modify the string loaded.
original_string.erase(0, occurance_index + 1);
is code smell and going to be expensive with a large original string.
If you are going to modify something, do it in one pass. Don't repeatedly delete from the front of it -- that is O(n^2). Instead, proceed along it and shove "finished" stuff into an output accumulator.
This will involve changing how your code works.
You're reading your data into a string. "Length of string" should not be a problem. So far, so good...
You're using "string.find().". That's not necessarily a bad choice.
You're using "string.erase()". That's probably the main source of your problem.
SUGGESTIONS:
Treat the original string as "read-only". Don't call erase(), don't modify it.
Personally, I'd consider reading your text into a C string (a text buffer), then parsing the text buffer, using strstr().
Here is a more efficient version of ExtractStringBetweenDelimiters. Note that this version does not mutate the original buffer. You would perform subsequent queries on the returned string.
/// Returns `buffer` without its leading and trailing `what` characters.
/// A buffer made up only of `what` (or an empty one) gives an empty string.
std::string trim(std::string buffer, char what)
{
    const std::size_t head = buffer.find_first_not_of(what);
    if (head == std::string::npos)
    {
        return std::string();
    }
    const std::size_t tail = buffer.find_last_not_of(what);
    return buffer.substr(head, tail - head + 1);
}
/// Returns the text between the first `opening_delimiter` and the last
/// `closing_delimiter` of `buffer`, with leading and trailing '\n' removed.
/// `buffer` itself is not modified.
///
/// Neither delimiter is part of the result. An empty string is returned when
/// either delimiter is missing or the last closing delimiter does not come
/// after the first opening one.
std::string ExtractStringBetweenDelimiters(
    std::string const& buffer,
    const char opening_delimiter,
    const char closing_delimiter)
{
    const std::size_t open = buffer.find(opening_delimiter);
    if (open == std::string::npos)
    {
        return std::string();
    }
    const std::size_t close = buffer.rfind(closing_delimiter);
    if (close == std::string::npos || close <= open)
    {
        return std::string();
    }

    // Half-open range [first, last) of the enclosed text; `last` is the
    // position of the closing delimiter, which therefore stays excluded.
    std::size_t first = open + 1;
    std::size_t last = close;
    while (first < last && buffer[first] == '\n')
    {
        ++first;
    }
    while (last > first && buffer[last - 1] == '\n')
    {
        --last;
    }
    return buffer.substr(first, last - first);
}
If you have access to string_view (c++17 for std::string_view or boost::string_view) you could return one of these from both functions for extra efficiency.
It's worth mentioning that this method of parsing a structured file is going to cause you problems down the line if any of the serialised strings contains a delimiter, such as a '{'.
In the end you'll want to write or use someone else's parser.
The boost::spirit library is a little complicated to learn, but creates very efficient parsers for this kind of thing.

Using a loop with std::strcmp to load lots of settings

In my game I keep track of unlocked levels with a vector std::vector<bool> lvlUnlocked_;.
The simple function to save the progress is this:
/// Writes one "lvl<index>=<0|1>" line per element of lvlUnlocked_ to
/// ./progress.txt, replacing the file's previous content.
/// Does nothing when the file cannot be opened.
void save() {
    std::ofstream ofile("./progress.txt");
    if (!ofile.good()) {
        return;
    }
    // Stream straight into the file; the file is closed when ofile goes out
    // of scope.
    for (std::size_t i = 0; i < lvlUnlocked_.size(); ++i) {
        ofile << "lvl" << i << "=" << (lvlUnlocked_[i] ? "1" : "0") << '\n';
    }
}
This works and is nice since I can just use a loop to dump the info.
Now to the part where I am stuck, the lower part of my load function (see comment in code below):
/// Reads ./progress.txt and applies every well-formed "lvl<index>=<value>"
/// line to lvlUnlocked_: the level becomes unlocked when the value is "1" and
/// locked otherwise.
///
/// Lines without '=', without the "lvl" prefix, with a non-numeric index, or
/// with an index outside lvlUnlocked_ are skipped. Does nothing when the file
/// cannot be opened.
void load() {
    std::ifstream ifile("./progress.txt");
    if (!ifile.good()) {
        return;
    }
    const std::string prefix = "lvl";
    std::string line;
    while (std::getline(ifile, line)) {
        // Tolerate files saved with CRLF line endings.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        const std::size_t eq = line.find('=');
        if (eq == std::string::npos || eq <= prefix.size()
            || line.compare(0, prefix.size(), prefix) != 0) {
            continue;
        }
        // Parse the digits between the prefix and '='. Stopping as soon as
        // the number leaves the valid range also rules out overflow.
        std::size_t level = 0;
        bool usable = true;
        for (std::size_t i = prefix.size(); i < eq && usable; ++i) {
            const char c = line[i];
            if (c < '0' || c > '9') {
                usable = false;
                break;
            }
            level = level * 10 + static_cast<std::size_t>(c - '0');
            usable = level < lvlUnlocked_.size();
        }
        if (!usable) {
            continue;
        }
        lvlUnlocked_[level] = (line.compare(eq + 1, std::string::npos, "1") == 0);
    }
}
This works fine, but...
the problem is that I have 100+ levels and I want it to be dynamic based on the size of my lvlUnlocked_ vector instead of having to type it all like in the code above.
Is there a way to somehow make use of a loop like in my save function to check all levels?
If you parse your key to extract a suitable integer value, you can just index into the bit-vector with that:
// For every "lvl<index>=<value>" line, set lvlUnlocked_[index] from the value.
// Malformed or out-of-range lines are skipped rather than throwing.
while (std::getline(ifile, line)) {
    const size_t eq = line.find('=');
    if (eq == std::string::npos)
        // no equals sign
        continue;
    auto stringKey = line.substr(0, eq);
    auto stringValue = line.substr(eq+1);
    if (stringKey.compare(0, 3, "lvl") != 0)
        // doesn't begin with lvl
        continue;
    // strip off "lvl"
    stringKey.erase(0, 3);
    if (stringKey.empty() || stringKey.find_first_not_of("0123456789") != std::string::npos)
        // not a valid level number
        continue;
    // Accumulate the digits by hand; stopping once the index leaves the
    // vector's range also prevents overflow on very long digit strings.
    std::vector<bool>::size_type index = 0;
    bool inRange = true;
    for (const char digit : stringKey) {
        index = index * 10 + static_cast<std::vector<bool>::size_type>(digit - '0');
        if (index >= lvlUnlocked_.size()) {
            inRange = false;
            break;
        }
    }
    if (!inRange)
        // out of range
        continue;
    // Set it :-)
    lvlUnlocked_[index] = stringValue=="1";
}
(I've also updated your parsing for "key=value" strings to more idiomatic C++.)

C++ efficient parse

I am programming some automated test equipment (ATE) and I'm trying to extract the following values out of an example response from the ATE:
DCRE? 1,
DCRE P, 10.3, (pin1)
DCRE F, 200.1, (pin2)
DCRE P, 20.4, (pin3)
From each line, I only care about the pin and the measured result value. So for the case above, I want to store the following pieces of information in a map<std::string, double> results;
results["pin1"] = 10.3;
results["pin2"] = 200.1;
results["pin3"] = 20.4;
I made the following code to parse the response:
// Walks datatoparse token by token (tokens are separated by spaces and
// newlines). Whenever a token contains '(', the token seen just before it is
// converted to a double and stored in pinnametoresult under the Pin looked up
// from the number found in the current token.
// Side effect: strtok writes terminators into the buffer it scans, so the
// contents of datatoparse are modified by this call.
void parseResultData(map<Pin*, double> &pinnametoresult, string &datatoparse) {
// The cast removes const from c_str() so that strtok can write into the string's storage.
char *p = strtok((char*) datatoparse.c_str(), " \n");
string lastread;
string current;
while (p) {
current = p;
if(current.find('(') != string::npos) {
// NOTE(review): substr(1) drops the first character of the previous token and
// throws std::out_of_range if that token is empty (no token seen yet) - confirm
// the real response format has a leading character to skip.
string substring = lastread.substr(1);
const char* last = substring.c_str();
double value = strtod(last, NULL);
// Skips the first four characters of the token (presumably "(pin" - verify);
// atoi stops at the first non-digit, so a trailing ')' is ignored.
unsigned short number = atoi(current.substr(4, current.size()-2).c_str());
// pinlookupmap is defined outside this snippet.
pinnametoresult[&pinlookupmap[number]] = value;
}
lastread = p;
p = strtok(NULL, " \n");
}
}
It works, but it's not very efficient. Is there a way to make the function more efficient for this specific case? I don't care about the DCRE or P/F value on each line. I thought about using Boost regex library, but not sure if that would be more efficient.
In order to make this a bit more efficient, try to avoid copying. In particular, calls to substr, assignments etc. can wreak havoc on performance. If you look at your code, you will see that the contents of datatoparse are repeatedly assigned to lastread and current, each time with one line less at the beginning. So, on average you copy half of the original string times the number of lines, making just that part an O(n^2) algorithm. This isn't relevant if you have three or four lines (not even with 100 lines!), but if you have a few more, performance degrades rapidly.
Try this approach instead:
// Skeleton: visit `input` one newline-terminated line at a time.
// p0 is the start of the current line, p1 the position of its '\n'.
string::size_type p0 = 0;
string::size_type p1 = input.find('\n', p0);
// Text after the last '\n' is never visited, because the loop stops as soon
// as no further newline is found.
while (p1 != string::npos) {
// extract the line
string line = input.substr(p0, p1 - p0);
// move to the next line
p0 = p1 + 1;
p1 = input.find('\n', p0);
}
Notes:
Note that the algorithm still copies all input once, but each line only once, making it O(n).
Since you have a copy of the line, you can insert '\0' as artificial separator in order to give a substring to e.g. atoi() or strtod().
I'm not 100% sure of the order of parameters for string::find() and too lazy to look it up, but the idea is to start searching at a certain position. Look at the various overloads of find-like functions.
When handling a line, search the indices of the parts you need and then extract and parse them.
If you have line fragments (i.e. a partial line without a newline) at the end, you will have to modify the loop slightly. Create tests!
This is what I did:
#include <cstdlib>
#include <string>
#include <vector>
#include <unordered_map>
#include <sstream>
#include <iostream>
using namespace std;
/// Stand-in for the real pin type; it only carries a string payload.
struct Pin {
    std::string something;

    /// Creates a pin with an empty payload.
    Pin() : something() {}
};
// Lookup table from zero-based pin index to Pin object. The three objects are
// allocated with new and are not deleted anywhere in the visible code.
vector<Pin*> pins = { new Pin(), new Pin(), new Pin() };
// Result container: maps a Pin (by address) to a double.
typedef unordered_map<Pin*, double> CONT_T;
/// Tells whether `line` contains an opening parenthesis.
inline bool OfInterest(const std::string& line) {
    const std::size_t parenPos = line.find('(');
    return parenPos != std::string::npos;
}
/// Fills `pinnametoresult` from a multi-line response.
///
/// For every line that contains '(' the number following the first comma is
/// read as the value, and the number following the first four characters of
/// the parenthesised part (the "(pin" of the sample data) is read as a
/// one-based pin id. Lines that lack a comma, have no digits for the pin id,
/// or name a pin id outside `pins` are skipped.
void parseResultData(CONT_T& pinnametoresult, const std::string& datatoparse)
{
    std::istringstream is(datatoparse);
    std::string line;
    while (std::getline(is, line)) {
        if (!OfInterest(line)) {
            continue;
        }
        const std::size_t comma = line.find(',');
        const std::size_t paren = line.find('(');
        // Four characters are skipped after the parenthesis position, so they
        // must exist before the pointer below is formed.
        if (comma == std::string::npos || paren == std::string::npos
            || line.size() - paren < 4) {
            continue;
        }

        // strtod skips the blank after the comma and stops at the next comma,
        // so no temporary substring or stream is needed.
        const double d = std::strtod(line.c_str() + comma + 1, nullptr);

        const char* const idBegin = line.c_str() + paren + 4;
        char* idEnd = nullptr;
        const unsigned long pinid = std::strtoul(idBegin, &idEnd, 10);
        if (idEnd == idBegin || pinid == 0 || pinid > pins.size()) {
            continue;
        }
        // Pin ids in the text are one-based, the table is zero-based.
        pinnametoresult[pins[pinid - 1]] = d;
    }
}
/*
*
*/
/// Runs parseResultData once on a fixed four-line sample response.
int main(int argc, char** argv) {
    const std::string datatoparse =
        "DCRE? 1, \nDCRE P, 10.3, (pin1)\nDCRE F, 200.1, (pin2)\nDCRE P, 20.4, (pin3)\n";
    CONT_T results;
    parseResultData(results, datatoparse);
    return 0;
}
Here's my final result. Does not involve any copying, but it will destroy the string.
/// Extracts "(name)" / "(name1,name2)" groups from `datatoparse` and stores,
/// for each name, the number that precedes the group.
///
/// The parse is destructive: terminators are written into `datatoparse` after
/// each number and over each ')' or ',' that ends a name, so the string is
/// not usable as text afterwards.
///
/// The number is found by scanning backwards from '(' over digits and '.'.
/// The scan never leaves the buffer: a number that starts at the very
/// beginning of the input is still parsed, and when no digit precedes the
/// '(' the previously parsed value (initially 0.0) is kept.
void parseResultData3(std::map<std::string, double> &pinnametoresult, std::string &datatoparse) {
    if (datatoparse.empty()) {
        return;
    }
    char* const buf = &datatoparse[0];
    const std::size_t length = datatoparse.size();
    double lastdouble = 0.0;
    char* startmarker = NULL; // beginning of next pin to parse
    for (std::size_t pos = 0; pos < length; ++pos) {
        if (buf[pos] == '(') {
            startmarker = buf + pos + 1;
            // get previous value
            bool triggered = false; // a digit or '.' has been seen
            bool parsed = false;    // the character before the number was reached
            for (std::size_t back = pos; back-- > 0; ) {
                const unsigned char c = static_cast<unsigned char>(buf[back]);
                const bool numeric = isdigit(c) || c == '.';
                if (!triggered && numeric) {
                    triggered = true;
                    buf[back + 1] = '\0'; // terminate the number for strtod
                }
                else if (triggered && !numeric) {
                    lastdouble = strtod(buf + back, NULL);
                    parsed = true;
                    break;
                }
            }
            if (triggered && !parsed) {
                // The number runs all the way to the start of the buffer.
                lastdouble = strtod(buf, NULL);
            }
        }
        else if (startmarker != NULL) {
            if (buf[pos] == ')') {
                buf[pos] = '\0';
                pinnametoresult[startmarker] = lastdouble;
                startmarker = NULL;
            }
            else if (buf[pos] == ',') {
                buf[pos] = '\0';
                pinnametoresult[startmarker] = lastdouble;
                startmarker = buf + pos + 1;
            }
        }
    }
}

Compare two strings containing float values

I am given two strings which contain a floating point number. I need to compare them. Can I directly compare the strings using std::string::compare and will this always give correct results? My current approach is to convert the string to float using std::stof, however I would prefer to avoid C++11 library functions.
simply comparing strings won't help you in cases like
a = "0.43"
b = "0.4300"
if you need to compare first parse them into float and then compare them
// Both inputs are narrow strings; std::stof turns each into a float that can
// then be compared numerically.
std::string s1 = "0.6";
std::string s2 = "0.7";
float d1 = std::stof(s1);
float d2 = std::stof(s2);
and then compare them
here is a full program
#include <iostream> // std::cout
#include <string> // std::string, std::stof
/// Converts two decimal strings to float and prints whether the resulting
/// values are equal.
int main ()
{
    std::string s1 = "0.6";
    std::string s2 = "0.7";
    float d1 = std::stof(s1);
    float d2 = std::stof(s2);
    // Exact comparison is intended here: strings such as "0.43" and "0.4300"
    // convert to the same float.
    if(d1 == d2)
        std::cout << "Equals!";
    else
        std::cout << "Not Equals!";
    return 0;
}
click here for more reading on stof
What about writing some ugly codes? It may not be good practice but ...
/// Compares two decimal strings character by character, treating missing
/// trailing characters as '0' (so "0.43" equals "0.4300").
///
/// @return 1 when str1 is greater, -1 when it is smaller, 0 when equal.
///
/// An empty string counts as zero: it is greater than a string starting with
/// '-' and otherwise goes through the same trailing-zero rule as any other
/// shorter input.
///
/// NOTE(review): the comparison is purely textual. It is only meaningful for
/// non-negative numbers whose integer parts have the same number of digits
/// ("5" and "50" compare equal, "10" compares below "9").
int compare (const std::string &str1, const std::string &str2) {
    // s1 is always the shorter string; isReverse undoes the swap in the result.
    const std::string *s1 = &str1, *s2 = &str2;
    int isReverse = 1;
    if (str1.length() > str2.length()) {
        s1 = &str2;
        s2 = &str1;
        isReverse = -1;
    }
    const std::size_t len1 = s1->length();
    const std::size_t len2 = s2->length();
    if (len1 == 0 && len2 != 0 && (*s2)[0] == '-')
        return 1*isReverse;
    std::size_t i = 0;
    for (; i < len1; ++i) {
        if ((*s1)[i] > (*s2)[i])
            return 1*isReverse;
        else if ((*s1)[i] < (*s2)[i])
            return -1*isReverse;
    }
    // Common prefix is identical: the longer string is greater unless all of
    // its remaining characters are zeros.
    for (; i < len2; ++i) {
        if ((*s2)[i] != '0')
            return -1*isReverse;
    }
    return 0;
}

Complex algorithm to extract numbers/number range from a string

I am working on an algorithm that should produce the following output:
Given values/Inputs:
char *Var = "1-5,10,12,15-16,25-35,67,69,99-105";
int size = 29;
Here "1-5" depicts a range value, i.e. it will be understood as "1,2,3,4,5" while the values with just "," are individual values.
I was writing an algorithm where end output should be such that it will give complete range of output as:
int list[]=1,2,3,4,5,10,12,15,16,25,26,27,28,29,30,31,32,33,34,35,67,69,99,100,101,102,103,104,105;
If anyone is familiar with this issue then the help would be really appreciated.
Thanks in advance!
My initial code approach was as:
/* Handles grp_range in one of two mutually exclusive ways: if it contains a
 * '-' anywhere it is treated as a single "start-end" range, otherwise, if it
 * contains a ',', as a plain comma-separated list. A string mixing both forms
 * therefore only ever takes the first branch.
 * grp_range, grp_list, i, result, start_index, end_index and end_ptr are
 * declared outside this fragment. */
if(NULL != strchr((char *)grp_range, '-'))
{
int_u8 delims[] = "-";
/* strtok modifies grp_range in place. */
result = (int_u8 *)strtok((char *)grp_range, (char *)delims);
if(NULL != result)
{
start_index = strtol((char*)result, (char **)&end_ptr, 10);
result = (int_u8 *)strtok(NULL, (char *)delims);
}
/* Each remaining token overwrites end_index, so the last one wins. */
while(NULL != result)
{
end_index = strtol((char*)result, (char**)&end_ptr, 10);
result = (int_u8 *)strtok(NULL, (char *)delims);
}
/* Expand the range into consecutive list entries. */
while(start_index <= end_index)
{
grp_list[i++] = start_index;
start_index++;
}
}
else if(NULL != strchr((char *)grp_range, ','))
{
int_u8 delims[] = ",";
/* NOTE(review): this cast uses unison_u8 while every other cast in the
 * fragment uses int_u8 - confirm which type is intended. */
result = (unison_u8 *)strtok((char *)grp_range, (char *)delims);
while(result != NULL)
{
grp_list[i++] = strtol((char*)result, (char**)&end_ptr, 10);
result = (int_u8 *)strtok(NULL, (char *)delims);
}
}
But it only works if I have either "0-5" or "0,10,15". I am looking forward to make it more versatile.
Here is a C++ solution for you to study.
#include <vector>
#include <string>
#include <sstream>
#include <iostream>
using namespace std;
/// Reads an int from the start of `str`.
/// When no int can be read, an error is written to std::cerr and the program
/// is terminated with abort().
int ConvertString2Int(const std::string& str)
{
    std::stringstream reader(str);
    int parsed;
    reader >> parsed;
    if (reader.fail())
    {
        std::cerr << "Error converting " << str << " to integer" << std::endl;
        abort();
    }
    return parsed;
}
/// Splits `str` at every `splitter` character by reading it like a stream of
/// `splitter`-terminated lines. An empty input gives an empty vector, and a
/// trailing `splitter` does not add a trailing empty token.
std::vector<std::string> SplitStringToArray(const std::string& str, char splitter)
{
    std::vector<std::string> pieces;
    std::stringstream source(str);
    for (std::string piece; std::getline(source, piece, splitter); )
    {
        pieces.push_back(piece);
    }
    return pieces;
}
/// Expands a comma-separated list of numbers and "low-high" ranges into the
/// individual integers, in input order.
/// An item that splits into more than two parts on '-' (or into none) is
/// reported on std::cerr and terminates the program with abort().
std::vector<int> ParseData(const std::string& data)
{
    std::vector<int> expanded;
    const std::vector<std::string> items = SplitStringToArray(data, ',');
    for (std::size_t idx = 0; idx < items.size(); ++idx)
    {
        const std::string& item = items[idx];
        const std::vector<std::string> bounds = SplitStringToArray(item, '-');
        switch (bounds.size())
        {
        case 1:
            // Single value.
            expanded.push_back(ConvertString2Int(bounds[0]));
            break;
        case 2:
        {
            // Inclusive range; nothing is added when low > high.
            const int low = ConvertString2Int(bounds[0]);
            const int high = ConvertString2Int(bounds[1]);
            for (int value = low; value <= high; value++)
            {
                expanded.push_back(value);
            }
            break;
        }
        default:
            std::cerr << "Error parsing token " << item << std::endl;
            abort();
        }
    }
    return expanded;
}
/// Expands the sample range list and prints every value followed by a space,
/// then a newline.
int main()
{
    const std::vector<int> result = ParseData("1-5,10,12,15-16,25-35,67,69,99-105");
    for (std::size_t k = 0; k < result.size(); ++k)
    {
        std::cout << result[k] << " ";
    }
    std::cout << std::endl;
}
Live example
http://ideone.com/2W99Tt
This is my boost approach :
This won't give you array of ints, instead a vector of ints
Algorithm used: (nothing new)
Split string using ,
Split the individual string using -
Make a range low and high
Push it into vector with help of this range
Code:-
#include<iostream>
#include<vector>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
// Expands the sample range list into a vector of ints and prints the values
// separated by spaces.
int main(){
std::string line("1-5,10,12,15-16,25-35,67,69,99-105");
std::vector<std::string> strs,r;
std::vector<int> v;
int low,high,i;
// First split: one element per comma-separated item.
boost::split(strs,line,boost::is_any_of(","));
for (auto it:strs)
{
// Second split: one part for a single value, two for "low-high".
boost::split(r,it,boost::is_any_of("-"));
auto x = r.begin();
// A single value is handled as the range [value, value].
low = high =boost::lexical_cast<int>(r[0]);
x++;
// A second part, when present, is the upper bound.
if(x!=r.end())
high = boost::lexical_cast<int>(r[1]);
for(i=low;i<=high;++i)
v.push_back(i);
}
for(auto x:v)
std::cout<<x<<" ";
return 0;
}
Your issue seems to be a misunderstanding of how strtok works. Have a look at this.
#include <string.h>
#include <stdio.h>
/* Splits the sample list on spaces and commas and prints every value it
 * stands for, one per line: a plain token is printed as is, a "low-high"
 * token is expanded into all integers from low to high inclusive. */
int main()
{
    int j;
    char delims[] = " ,";
    char str[] = "1-5,6,7";
    char *tok;
    int rstart, rend;
    for(tok = strtok(str, delims); tok != NULL; tok = strtok(NULL, delims)) {
        /* Search from the second character so that a leading '-' is read as
         * a sign rather than as a range separator. strtok never returns an
         * empty token, so tok + 1 is always inside the token. */
        char *dash = strchr(tok + 1, '-');
        if(dash == NULL) {
            printf("%s\n", tok);
            continue;
        }
        /* Cut the token in two at the dash; both halves are then properly
         * terminated strings, so no temporary buffer is needed. */
        *dash = '\0';
        rstart = atoi(tok);
        rend = atoi(dash + 1);
        for(j = rstart; j <= rend; ++j)
            printf("%d\n", j);
    }
    return 0;
}
Don't search. Just go through the text one character at a time. As long as you're seeing digits, accumulate them into a value. If the digits are followed by a - then you're looking at a range, and need to parse the next set of digits to get the upper bound of the range and put all the values into your list. If the value is not followed by a - then you've got a single value; put it into your list.
Stop and think about it: what you actually have is a comma
separated list of ranges, where a range can be either a single
number, or a pair of numbers separated by a '-'. So you
probably want to loop over the ranges, using recursive descent
for the parsing. (This sort of thing is best handled by an
istream, so that's what I'll use.)
std::vector<int> results;
std::istringstream parser( std::string( var ) );
processRange( results, parser );
while ( isSeparator( parser, ',' ) ) {
processRange( results, parser );
}
with:
/// Skips whitespace and consumes the next character of `source` if it is
/// `separ`.
///
/// @return true when the separator was found and consumed; false otherwise,
///         in which case no non-whitespace character is consumed.
///
/// Reaching the end of the input is not treated as an error: the stream's
/// failbit is left alone, so a value read just before the probe is still
/// seen as successfully read by the caller.
bool
isSeparator( std::istream& source, char separ )
{
    // Any input operation on a stream that is not good() would set failbit,
    // so check before every step.
    if ( !source.good() ) {
        return false;
    }
    source >> std::ws;
    if ( !source.good() ) {
        return false;
    }
    if ( source.peek() != std::istream::traits_type::to_int_type( separ ) ) {
        return false;
    }
    source.get();
    return true;
}
and
/// Reads one range from `source` - either a single number or "first-last" -
/// and appends every value of it to `results`.
///
/// Nothing is appended when a number cannot be read. A range whose upper
/// bound is below its lower bound sets failbit on `source` and appends
/// nothing.
void
processRange( std::vector<int>& results, std::istream& source )
{
    int first = 0;
    source >> first;
    if ( !source ) {
        return;
    }
    int last = first;
    if ( isSeparator( source, '-' ) ) {
        source >> last;
    } else if ( source.eof() && source.fail() ) {
        // `first` was read successfully, so this failure can only come from
        // probing for '-' at the end of the input. Clear just the failbit so
        // a single number at the very end is not dropped.
        source.clear( source.rdstate() & ~std::ios_base::failbit );
    }
    if ( last < first ) {
        source.setstate( std::ios_base::failbit );
    }
    if ( source ) {
        // Written so that `first` is never incremented past `last`.
        while ( first != last ) {
            results.push_back( first );
            ++ first;
        }
        results.push_back( first );
    }
}
The isSeparator function will, in fact, probably be useful in
other projects in the future, and should be kept in your
toolbox.
First divide whole string into numbers and ranges (using strtok() with "," delimiter), save strings in array, then, search through array looking for "-", if it present than use sscanf() with "%d-%d" format, else use sscanf with single "%d" format.
Usage of these functions is easy to look up.
One approach:
You need a parser that identifies 3 kinds of tokens: ',', '-', and numbers. That raises the level of abstraction so that you are operating at a level above characters.
Then you can parse your token stream to create a list of ranges and constants.
Then you can parse that list to convert the ranges into constants.
Some code that does part of the job:
#include <stdio.h>
// Prints a comma after the last digit. You will need to fix that up.
/* Prints every integer from a to b inclusive, each followed by ", ".
 * Prints nothing when a > b. */
void print(int a, int b) {
    int current = a;
    while (current <= b) {
        printf("%d, ", current);
        ++current;
    }
}
/* Walks a hand-built token stream for "1-5,10," and prints the numbers it
 * expands to. */
int main() {
    enum { DASH, COMMA, NUMBER };
    struct token {
        int type;
        int value;
    };
    /* Sample input stream; the trailing COMMA acts as a sentinel. */
    struct token tokStream[] = {
        { NUMBER, 1 },
        { DASH, 0 },
        { NUMBER, 5 },
        { COMMA, 0 },
        { NUMBER, 10 },
        { COMMA, 0 } };
    const size_t tokenCount = sizeof(tokStream)/sizeof(struct token);
    /* Well-formed input is assumed: every number is followed either by a
     * COMMA, or by DASH NUMBER COMMA. */
    size_t i = 0;
    while (i < tokenCount) {
        const int single = (tokStream[i+1].type == COMMA);
        /* The upper bound sits two tokens further on when a DASH follows. */
        const int upper = single ? tokStream[i].value : tokStream[i+2].value;
        print(tokStream[i].value, upper);
        i += single ? 2 : 4; /* skip to next number */
    }
    return 0;
}