Related
I have to process badly mismanaged text with creative indentation. I want to remove the empty (or whitespace) lines at the beginning and end of my text without touching anything else; meaning that if the first or last actual lines respectively begin or end with whitespace, these will stay.
For example, this:
<lines, empty or with whitespaces ...>
<text, maybe preceded by whitespace>
<lines with or without text...>
<text, maybe followed by whitespace>
<lines, empty or with whitespaces ...>
turns to
<text, maybe preceded by whitespace>
<lines with or without text...>
<text, maybe followed by whitespace>
preserving the spaces at the beginning and the end of the actual text lines (the text might also be entirely whitespace)
A regex replacing (\A\s*(\r\n|\Z)|\r\n\s*\Z) by emptiness does exactly what I want, but regex is kind of overkill, and I fear it might cost me some time when processing texts with a lot of lines but not much to trim.
On the other hand, an explicit algorithm is easy to make (just read until a non-whitespace/the end while remembering the last line feed, then truncate, and do the same backwards) but it feels like I'm missing something obvious.
How can I do this?
As you can see from this discussion, trimming whitespace requires a lot of work in C++. This should definitely be included in the standard library.
Anyway, I've checked how to do it as simply as possible, but nothing comes near the compactness of RegEx. For speed, it's a different story.
In the following you can find three versions of a program which does the required task. With regex, with std functions and with just a couple of indexes. The last one can be also made faster because you can avoid copying altogether, but I left it for fair comparison:
#include <string>
#include <sstream>
#include <chrono>
#include <iostream>
#include <regex>
#include <exception>
struct perf {
std::chrono::steady_clock::time_point start_;
perf() : start_(std::chrono::steady_clock::now()) {}
double elapsed() const {
auto stop = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = stop - start_;
return elapsed_seconds.count();
}
};
std::string Generate(size_t line_len, size_t empty, size_t nonempty) {
std::string es(line_len, ' ');
es += '\n';
for (size_t i = 0; i < empty; ++i) {
es += es;
}
std::string nes(line_len - 1, ' ');
es += "a\n";
for (size_t i = 0; i < nonempty; ++i) {
nes += nes;
}
return es + nes + es;
}
int main()
{
std::string test;
//test = " \n\t\n \n \tTEST\n\tTEST\n\t\t\n TEST\t\n \t\n \n ";
std::cout << "Generating...";
std::cout.flush();
test = Generate(1000, 8, 10);
std::cout << " done." << std::endl;
std::cout << "Test 1...";
std::cout.flush();
perf p1;
std::string out1;
std::regex re(R"(^\s*\n|\n\s*$)");
try {
out1 = std::regex_replace(test, re, "");
}
catch (std::exception& e) {
std::cout << e.what() << std::endl;
}
std::cout << " done. Elapsed time: " << p1.elapsed() << "s" << std::endl;
std::cout << "Test 2...";
std::cout.flush();
perf p2;
std::stringstream is(test);
std::string line;
while (std::getline(is, line) && line.find_first_not_of(" \t\n\v\f\r") == std::string::npos);
std::string out2 = line;
size_t end = out2.size();
while (std::getline(is, line)) {
out2 += '\n';
out2 += line;
if (line.find_first_not_of(" \t\n\v\f\r") != std::string::npos) {
end = out2.size();
}
}
out2.resize(end);
std::cout << " done. Elapsed time: " << p2.elapsed() << "s" << std::endl;
if (out1 == out2) {
std::cout << "out1 == out2\n";
}
else {
std::cout << "out1 != out2\n";
}
std::cout << "Test 3...";
std::cout.flush();
perf p3;
static bool whitespace_table[] = {
1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
};
size_t sfl = 0; // Start of first line
for (size_t i = 0, end = test.size(); i < end; ++i) {
if (test[i] == '\n') {
sfl = i + 1;
}
else if (whitespace_table[(unsigned char)test[i]]) {
break;
}
}
size_t ell = test.size(); // End of last line
for (size_t i = test.size(); i-- > 0;) {
if (test[i] == '\n') {
ell = i;
}
else if (whitespace_table[(unsigned char)test[i]]) {
break;
}
}
std::string out3 = test.substr(sfl, ell - sfl);
std::cout << " done. Elapsed time: " << p3.elapsed() << "s" << std::endl;
if (out1 == out3) {
std::cout << "out1 == out3\n";
}
else {
std::cout << "out1 != out3\n";
}
return 0;
}
Running it on C++ Shell you get these timings:
Generating... done.
Test 1... done. Elapsed time: 4.2288s
Test 2... done. Elapsed time: 0.0077323s
out1 == out2
Test 3... done. Elapsed time: 0.000695783s
out1 == out3
If performance is important, it's better to really test it with the real files.
As a side note, this regex doesn't work on MSVC, because I couldn't find a way of avoiding ^ and $ to match the start and end of lines, that is disable the multiline mode of operation. If you run this, it throws an exception saying regex_error(error_complexity): The complexity of an attempted match against a regular expression exceeded a pre-set level.
I think I'll ask how to cope with this!
If whitespace in front of the first line or after the last non-whitespace-only line can be removed then this answer https://stackoverflow.com/a/217605/14258355 will suffice.
However, due to this constraint and if you do not want to use regex, I would propose to convert the string into lines and then build the string back up again from the first to the last non-whitespace-only line.
Here is a working example: https://godbolt.org/z/rozxj6saj
Convert the string to lines:
std::vector<std::string> StringToLines(const std::string &s) {
// Create vector with lines (not using input stream to keep line break
// characters)
std::vector<std::string> result;
std::string line;
for (auto c : s) {
line.push_back(c);
// Check for line break
if (c == '\n' || c == '\r') {
result.push_back(line);
line.clear();
}
}
// add last bit
result.push_back(line);
return result;
}
Build the string from the first to the last non-whitespace-only line:
bool IsNonWhiteSpaceString(const std::string &s) {
return s.end() != std::find_if(s.begin(), s.end(), [](unsigned char uc) {
return !std::isspace(uc);
});
}
std::string TrimVectorEmptyEndsIntoString(const std::vector<std::string> &v) {
std::string result;
// Find first non-whitespace line
auto it_begin = std::find_if(v.begin(), v.end(), [](const std::string &s) {
return IsNonWhiteSpaceString(s);
});
// Find last non-whitespace line
auto it_end = std::find_if(v.rbegin(), v.rend(), [](const std::string &s) {
return IsNonWhiteSpaceString(s);
});
// Build the string
for (auto it = it_begin; it != it_end.base(); std::advance(it, 1)) {
result.append(*it);
}
return result;
}
Usage example:
// Create a test string
std::string test_string(
" \n\t\n \n TEST\n\tTEST\n\t\tTEST\n TEST\t\n \t");
// Output result
std::cout << TrimVectorEmptyEndsIntoString(StringToLines(test_string));
Output showing whitespace:
Lets say I have three strings like that
"RES-003 :"
"RES 007 :"
" RES-015 :"
I want them to look like that when I show them
"RES-003:"
"RES 007:"
"RES-015:"
I was trying to solve that, however I can't get it right because when I try to clean up the spaces in between the colon and the numbers I delete the space in the second string "RES 007 :" so it changes it to "RES007:".
My trim function looks like that.
std::string& Reservation::trim(std::string& s) {
bool valid = true;
s.erase(0, s.find_first_not_of(' '));
s.erase(s.find_last_not_of(' ') + 1);
while (valid)
{
if (s.find(" ") != std::string::npos) {
s.erase(s.find(" "), 1);
valid = true;
if (s.find(" ") != std::string::npos) {
s.erase(s.find(" "), 1);
valid = true;
}
}
else
valid = false;
}
return s;
}
What can I do to improve it or I have to completely replace it?
Making the assumption, based on the examples you provided, that the format of your string is:
\s*[A-Z]{3}[\s\-][0-9]{3}\s*:\s*
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
int main()
{
auto trimStr = [](const auto& str) -> std::string
{
auto first = std::find_if(str.begin(), str.end(),
[](unsigned char c){ return !std::isspace(c); });
return std::string(first, first+7) + ":";
};
std::vector<std::string> examples =
{
"RES-003 :",
"RES 007 :",
" RES-015 :"
};
for(const auto& elem : examples)
{
std::cout << trimStr(elem) << "\n";
}
}
Godbolt
What would be easiest method to split a string using c++11?
I've seen the method used by this post, but I feel that there ought to be a less verbose way of doing it using the new standard.
Edit: I would like to have a vector<string> as a result and be able to delimitate on a single character.
std::regex_token_iterator performs generic tokenization based on a regex. It may or may not be overkill for doing simple splitting on a single character, but it works and is not too verbose:
std::vector<std::string> split(const string& input, const string& regex) {
// passing -1 as the submatch index parameter performs splitting
std::regex re(regex);
std::sregex_token_iterator
first{input.begin(), input.end(), re, -1},
last;
return {first, last};
}
Here is a (maybe less verbose) way to split string (based on the post you mentioned).
#include <string>
#include <sstream>
#include <vector>
std::vector<std::string> split(const std::string &s, char delim) {
std::stringstream ss(s);
std::string item;
std::vector<std::string> elems;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
// elems.push_back(std::move(item)); // if C++11 (based on comment from #mchiasson)
}
return elems;
}
Here's an example of splitting a string and populating a vector with the extracted elements using boost.
#include <boost/algorithm/string.hpp>
std::string my_input("A,B,EE");
std::vector<std::string> results;
boost::algorithm::split(results, my_input, boost::is_any_of(","));
assert(results[0] == "A");
assert(results[1] == "B");
assert(results[2] == "EE");
Another regex solution inspired by other answers but hopefully shorter and easier to read:
std::string s{"String to split here, and here, and here,..."};
std::regex regex{R"([\s,]+)"}; // split on space and comma
std::sregex_token_iterator it{s.begin(), s.end(), regex, -1};
std::vector<std::string> words{it, {}};
I don't know if this is less verbose, but it might be easier to grok for those more seasoned in dynamic languages such as javascript. The only C++11 features it uses is auto and range-based for loop.
#include <string>
#include <cctype>
#include <iostream>
#include <vector>
using namespace std;
int main()
{
string s = "hello how are you won't you tell me your name";
vector<string> tokens;
string token;
for (const auto& c: s) {
if (!isspace(c))
token += c;
else {
if (token.length()) tokens.push_back(token);
token.clear();
}
}
if (token.length()) tokens.push_back(token);
return 0;
}
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
using namespace std;
vector<string> split(const string& str, int delimiter(int) = ::isspace){
vector<string> result;
auto e=str.end();
auto i=str.begin();
while(i!=e){
i=find_if_not(i,e, delimiter);
if(i==e) break;
auto j=find_if(i,e, delimiter);
result.push_back(string(i,j));
i=j;
}
return result;
}
int main(){
string line;
getline(cin,line);
vector<string> result = split(line);
for(auto s: result){
cout<<s<<endl;
}
}
My choice is boost::tokenizer but I didn't have any heavy tasks and test with huge data.
Example from boost doc with lambda modification:
#include <iostream>
#include <boost/tokenizer.hpp>
#include <string>
#include <vector>
int main()
{
using namespace std;
using namespace boost;
string s = "This is, a test";
vector<string> v;
tokenizer<> tok(s);
for_each (tok.begin(), tok.end(), [&v](const string & s) { v.push_back(s); } );
// result 4 items: 1)This 2)is 3)a 4)test
return 0;
}
This is my answer. Verbose, readable and efficient.
std::vector<std::string> tokenize(const std::string& s, char c) {
auto end = s.cend();
auto start = end;
std::vector<std::string> v;
for( auto it = s.cbegin(); it != end; ++it ) {
if( *it != c ) {
if( start == end )
start = it;
continue;
}
if( start != end ) {
v.emplace_back(start, it);
start = end;
}
}
if( start != end )
v.emplace_back(start, end);
return v;
}
#include <string>
#include <vector>
#include <sstream>
inline vector<string> split(const string& s) {
vector<string> result;
istringstream iss(s);
for (string w; iss >> w; )
result.push_back(w);
return result;
}
Here is a C++11 solution that uses only std::string::find(). The delimiter can be any number of characters long. Parsed tokens are output via an output iterator, which is typically a std::back_inserter in my code.
I have not tested this with UTF-8, but I expect it should work as long as the input and delimiter are both valid UTF-8 strings.
#include <string>
template<class Iter>
Iter splitStrings(const std::string &s, const std::string &delim, Iter out)
{
if (delim.empty()) {
*out++ = s;
return out;
}
size_t a = 0, b = s.find(delim);
for ( ; b != std::string::npos;
a = b + delim.length(), b = s.find(delim, a))
{
*out++ = std::move(s.substr(a, b - a));
}
*out++ = std::move(s.substr(a, s.length() - a));
return out;
}
Some test cases:
void test()
{
std::vector<std::string> out;
size_t counter;
std::cout << "Empty input:" << std::endl;
out.clear();
splitStrings("", ",", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, empty delimiter:" << std::endl;
out.clear();
splitStrings("Hello, world!", "", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", no delimiter in string:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxya", "xyz", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string:" << std::endl;
out.clear();
splitStrings("abxycdxy!!xydefxya", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string"
", input contains blank token:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxya", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string"
", nothing after last delimiter:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxy", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", only delimiter exists string:" << std::endl;
out.clear();
splitStrings("xy", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
}
Expected output:
Empty input:
0:
Non-empty input, empty delimiter:
0: Hello, world!
Non-empty input, non-empty delimiter, no delimiter in string:
0: abxycdxyxydefxya
Non-empty input, non-empty delimiter, delimiter exists string:
0: ab
1: cd
2: !!
3: def
4: a
Non-empty input, non-empty delimiter, delimiter exists string, input contains blank token:
0: ab
1: cd
2:
3: def
4: a
Non-empty input, non-empty delimiter, delimiter exists string, nothing after last delimiter:
0: ab
1: cd
2:
3: def
4:
Non-empty input, non-empty delimiter, only delimiter exists string:
0:
1:
One possible way of doing this is finding all occurrences of the split string and storing locations to a list. Then count input string characters and when you get to a position where there is a 'search hit' in the position list then you jump forward by 'length of the split string'. This approach takes a split string of any length. Here is my tested and working solution.
#include <iostream>
#include <string>
#include <list>
#include <vector>
using namespace std;
vector<string> Split(string input_string, string search_string)
{
list<int> search_hit_list;
vector<string> word_list;
size_t search_position, search_start = 0;
// Find start positions of every substring occurence and store positions to a hit list.
while ( (search_position = input_string.find(search_string, search_start) ) != string::npos) {
search_hit_list.push_back(search_position);
search_start = search_position + search_string.size();
}
// Iterate through hit list and reconstruct substring start and length positions
int character_counter = 0;
int start, length;
for (auto hit_position : search_hit_list) {
// Skip over substrings we are splitting with. This also skips over repeating substrings.
if (character_counter == hit_position) {
character_counter = character_counter + search_string.size();
continue;
}
start = character_counter;
character_counter = hit_position;
length = character_counter - start;
word_list.push_back(input_string.substr(start, length));
character_counter = character_counter + search_string.size();
}
// If the search string is not found in the input string, then return the whole input_string.
if (word_list.size() == 0) {
word_list.push_back(input_string);
return word_list;
}
// The last substring might be still be unprocessed, get it.
if (character_counter < input_string.size()) {
word_list.push_back(input_string.substr(character_counter, input_string.size() - character_counter));
}
return word_list;
}
int main() {
vector<string> word_list;
string search_string = " ";
// search_string = "the";
string text = "thetheThis is some text to test with the split-thethe function.";
word_list = Split(text, search_string);
for (auto item : word_list) {
cout << "'" << item << "'" << endl;
}
cout << endl;
}
What would be easiest method to split a string using c++11?
I've seen the method used by this post, but I feel that there ought to be a less verbose way of doing it using the new standard.
Edit: I would like to have a vector<string> as a result and be able to delimitate on a single character.
std::regex_token_iterator performs generic tokenization based on a regex. It may or may not be overkill for doing simple splitting on a single character, but it works and is not too verbose:
std::vector<std::string> split(const string& input, const string& regex) {
// passing -1 as the submatch index parameter performs splitting
std::regex re(regex);
std::sregex_token_iterator
first{input.begin(), input.end(), re, -1},
last;
return {first, last};
}
Here is a (maybe less verbose) way to split string (based on the post you mentioned).
#include <string>
#include <sstream>
#include <vector>
std::vector<std::string> split(const std::string &s, char delim) {
std::stringstream ss(s);
std::string item;
std::vector<std::string> elems;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
// elems.push_back(std::move(item)); // if C++11 (based on comment from #mchiasson)
}
return elems;
}
Here's an example of splitting a string and populating a vector with the extracted elements using boost.
#include <boost/algorithm/string.hpp>
std::string my_input("A,B,EE");
std::vector<std::string> results;
boost::algorithm::split(results, my_input, boost::is_any_of(","));
assert(results[0] == "A");
assert(results[1] == "B");
assert(results[2] == "EE");
Another regex solution inspired by other answers but hopefully shorter and easier to read:
std::string s{"String to split here, and here, and here,..."};
std::regex regex{R"([\s,]+)"}; // split on space and comma
std::sregex_token_iterator it{s.begin(), s.end(), regex, -1};
std::vector<std::string> words{it, {}};
I don't know if this is less verbose, but it might be easier to grok for those more seasoned in dynamic languages such as javascript. The only C++11 features it uses is auto and range-based for loop.
#include <string>
#include <cctype>
#include <iostream>
#include <vector>
using namespace std;
int main()
{
string s = "hello how are you won't you tell me your name";
vector<string> tokens;
string token;
for (const auto& c: s) {
if (!isspace(c))
token += c;
else {
if (token.length()) tokens.push_back(token);
token.clear();
}
}
if (token.length()) tokens.push_back(token);
return 0;
}
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
using namespace std;
vector<string> split(const string& str, int delimiter(int) = ::isspace){
vector<string> result;
auto e=str.end();
auto i=str.begin();
while(i!=e){
i=find_if_not(i,e, delimiter);
if(i==e) break;
auto j=find_if(i,e, delimiter);
result.push_back(string(i,j));
i=j;
}
return result;
}
int main(){
string line;
getline(cin,line);
vector<string> result = split(line);
for(auto s: result){
cout<<s<<endl;
}
}
My choice is boost::tokenizer but I didn't have any heavy tasks and test with huge data.
Example from boost doc with lambda modification:
#include <iostream>
#include <boost/tokenizer.hpp>
#include <string>
#include <vector>
int main()
{
using namespace std;
using namespace boost;
string s = "This is, a test";
vector<string> v;
tokenizer<> tok(s);
for_each (tok.begin(), tok.end(), [&v](const string & s) { v.push_back(s); } );
// result 4 items: 1)This 2)is 3)a 4)test
return 0;
}
This is my answer. Verbose, readable and efficient.
std::vector<std::string> tokenize(const std::string& s, char c) {
auto end = s.cend();
auto start = end;
std::vector<std::string> v;
for( auto it = s.cbegin(); it != end; ++it ) {
if( *it != c ) {
if( start == end )
start = it;
continue;
}
if( start != end ) {
v.emplace_back(start, it);
start = end;
}
}
if( start != end )
v.emplace_back(start, end);
return v;
}
#include <string>
#include <vector>
#include <sstream>
inline vector<string> split(const string& s) {
vector<string> result;
istringstream iss(s);
for (string w; iss >> w; )
result.push_back(w);
return result;
}
Here is a C++11 solution that uses only std::string::find(). The delimiter can be any number of characters long. Parsed tokens are output via an output iterator, which is typically a std::back_inserter in my code.
I have not tested this with UTF-8, but I expect it should work as long as the input and delimiter are both valid UTF-8 strings.
#include <string>
template<class Iter>
Iter splitStrings(const std::string &s, const std::string &delim, Iter out)
{
if (delim.empty()) {
*out++ = s;
return out;
}
size_t a = 0, b = s.find(delim);
for ( ; b != std::string::npos;
a = b + delim.length(), b = s.find(delim, a))
{
*out++ = std::move(s.substr(a, b - a));
}
*out++ = std::move(s.substr(a, s.length() - a));
return out;
}
Some test cases:
void test()
{
std::vector<std::string> out;
size_t counter;
std::cout << "Empty input:" << std::endl;
out.clear();
splitStrings("", ",", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, empty delimiter:" << std::endl;
out.clear();
splitStrings("Hello, world!", "", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", no delimiter in string:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxya", "xyz", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string:" << std::endl;
out.clear();
splitStrings("abxycdxy!!xydefxya", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string"
", input contains blank token:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxya", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", delimiter exists string"
", nothing after last delimiter:" << std::endl;
out.clear();
splitStrings("abxycdxyxydefxy", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
std::cout << "Non-empty input, non-empty delimiter"
", only delimiter exists string:" << std::endl;
out.clear();
splitStrings("xy", "xy", std::back_inserter(out));
counter = 0;
for (auto i = out.begin(); i != out.end(); ++i, ++counter) {
std::cout << counter << ": " << *i << std::endl;
}
}
Expected output:
Empty input:
0:
Non-empty input, empty delimiter:
0: Hello, world!
Non-empty input, non-empty delimiter, no delimiter in string:
0: abxycdxyxydefxya
Non-empty input, non-empty delimiter, delimiter exists string:
0: ab
1: cd
2: !!
3: def
4: a
Non-empty input, non-empty delimiter, delimiter exists string, input contains blank token:
0: ab
1: cd
2:
3: def
4: a
Non-empty input, non-empty delimiter, delimiter exists string, nothing after last delimiter:
0: ab
1: cd
2:
3: def
4:
Non-empty input, non-empty delimiter, only delimiter exists string:
0:
1:
One possible way of doing this is finding all occurrences of the split string and storing locations to a list. Then count input string characters and when you get to a position where there is a 'search hit' in the position list then you jump forward by 'length of the split string'. This approach takes a split string of any length. Here is my tested and working solution.
#include <iostream>
#include <string>
#include <list>
#include <vector>
using namespace std;
vector<string> Split(string input_string, string search_string)
{
list<int> search_hit_list;
vector<string> word_list;
size_t search_position, search_start = 0;
// Find start positions of every substring occurence and store positions to a hit list.
while ( (search_position = input_string.find(search_string, search_start) ) != string::npos) {
search_hit_list.push_back(search_position);
search_start = search_position + search_string.size();
}
// Iterate through hit list and reconstruct substring start and length positions
int character_counter = 0;
int start, length;
for (auto hit_position : search_hit_list) {
// Skip over substrings we are splitting with. This also skips over repeating substrings.
if (character_counter == hit_position) {
character_counter = character_counter + search_string.size();
continue;
}
start = character_counter;
character_counter = hit_position;
length = character_counter - start;
word_list.push_back(input_string.substr(start, length));
character_counter = character_counter + search_string.size();
}
// If the search string is not found in the input string, then return the whole input_string.
if (word_list.size() == 0) {
word_list.push_back(input_string);
return word_list;
}
// The last substring might be still be unprocessed, get it.
if (character_counter < input_string.size()) {
word_list.push_back(input_string.substr(character_counter, input_string.size() - character_counter));
}
return word_list;
}
int main() {
vector<string> word_list;
string search_string = " ";
// search_string = "the";
string text = "thetheThis is some text to test with the split-thethe function.";
word_list = Split(text, search_string);
for (auto item : word_list) {
cout << "'" << item << "'" << endl;
}
cout << endl;
}
I'm trying to replicate the same functionally as JavaScript's sub string to extract string from string by given the sub string from / to pos's . How should I continue this function after I gut the location from and to?
std::string& UT::extractid(std::string& url)
{
std::vector< std::string > tokens;
std::string id = "";
tokens = split(url,'v=');
video_id = tokens.at(1);
if(video_id.length()>2)
{
string::size_type ampersandPosition = video_id.find('&', 0 );
if( ampersandPosition != string::npos ) {
cout << "Found at " << loc << endl;
} else {
cout << "Didn't find " << endl;
}
}
return video_id;
}
Instead of
return video_id;
do this:
size_t start = video_id.find_first_of("/");
if (start == string::npos) start = 0;
assert(ampersandPosition > start);
return video_id.substr(start, ampersandPosition - start);