There is a very useful function in Python called strip(). Any similar ones in C++?
I use this:
#include <string>
#include <cctype>
/// Returns a copy of `inpt` with leading and trailing whitespace removed.
///
/// Whitespace is whatever std::isspace reports. An empty or all-whitespace
/// input yields an empty string.
std::string strip(const std::string &inpt)
{
    // std::isspace has undefined behaviour for negative arguments, so each
    // character is converted to unsigned char before it is classified.
    const auto is_space = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    auto start_it = inpt.begin();
    auto stop_it = inpt.end();

    // Both scans are bounded by the other iterator, so an empty or
    // all-whitespace string never dereferences end() or walks past begin().
    while (start_it != stop_it && is_space(*start_it))
        ++start_it;
    while (stop_it != start_it && is_space(*(stop_it - 1)))
        --stop_it;

    return std::string(start_it, stop_it);
}
There's nothing built-in; I used to use something like the following:
// Predicate functor: returns true for characters that do NOT belong to the
// ctype category `mask` in the locale supplied at construction (the global
// locale by default).
template <std::ctype_base::mask mask>
class IsNot
{
public:
    IsNot( std::locale const& l = std::locale() )
        : localeCopy( l )
        , facet( &std::use_facet<std::ctype<char> >( l ) )
    {
    }

    bool operator()( char ch ) const
    {
        bool const inCategory = facet->is( mask, ch );
        return ! inCategory;
    }

private:
    std::locale localeCopy;         // held so the facet below stays alive
    std::ctype<char> const* facet;  // ctype facet obtained from the locale
};

// Predicate that is true for every non-whitespace character.
using IsNotSpace = IsNot<std::ctype_base::space>;
// Returns a copy of `original` without leading and trailing whitespace,
// where "whitespace" is decided by the IsNotSpace predicate.
std::string
trim( std::string const& original )
{
    typedef std::string::const_reverse_iterator ReverseIter;

    // Search backwards for the last character to keep; base() of that
    // reverse iterator is the forward position one past it.
    ReverseIter const lastKept = std::find_if( original.rbegin(), original.rend(), IsNotSpace() );
    std::string::const_iterator const stop = lastKept.base();

    // Search forwards, but never beyond `stop`, for the first character to keep.
    std::string::const_iterator const start = std::find_if( original.begin(), stop, IsNotSpace() );

    return std::string( start, stop );
}
which works pretty well. (I now have a significantly more complex
version which handles UTF-8 correctly.)
/// Removes leading and trailing whitespace from `str` in place.
///
/// Only ' ', '\n', '\t' and '\r' count as whitespace. A string that is empty
/// or consists solely of those characters becomes empty.
void strip(std::string &str)
{
    static const char kWhitespace[] = " \n\t\r";

    const std::string::size_type first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos)
    {
        // Nothing but whitespace (or nothing at all): there is no character
        // to keep, so do not touch front()/back() on an empty string.
        str.clear();
        return;
    }

    const std::string::size_type last = str.find_last_not_of(kWhitespace);

    // Cut the tail first so that `first` still refers to the same character.
    str.erase(last + 1);
    str.erase(0, first);
}
This is on top of the answer provided by Ferdi Kedef to make it safer.
/// Strips leading and trailing whitespace (as reported by std::isspace) from
/// `str` in place. Empty and all-whitespace strings end up empty.
void strip(std::string& str)
{
    // std::isspace must not be called with a negative value, which a plain
    // char can hold for bytes >= 0x80; convert to unsigned char first.
    const auto is_space = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    // Work with size_type indices instead of narrowing iterator
    // differences to int.
    std::string::size_type first = 0;
    std::string::size_type last = str.size();

    while (first < last && is_space(str[first])) {
        ++first;
    }
    while (last > first && is_space(str[last - 1])) {
        --last;
    }

    // Trim the tail before the head so `first` remains a valid offset.
    str.erase(last);
    str.erase(0, first);
}
Related
Is there a way to split a string into smaller parts and store them in a vector?
For example:
A string: str = "(a b c) d e f [[g h i]]". The output expected is:
(a b c)
d e f
[[g h i]]
Sample code:
// Attempt from the question: cut `str` at every character listed in
// `bracketS` and collect the pieces in `token`.
vector<string> token;
string str = "(a b c)d e f[[g h i]]";
string bracketS = "()[]";
istringstream ss(str);
string section;
string tok;
while (getline(ss,section)) {
size_t start = 0;
size_t end = section.find_first_of(bracketS);
while (end != string::npos) {
// The range [start, end) stops just before the bracket found at `end`, so
// the bracket character itself is never copied into `tok`. An empty `tok`
// is pushed when a bracket is the first character of the line or directly
// follows another bracket.
tok = section.substr(start, end - start);
token.push_back(tok);
// Resume one past the bracket. Any text after the last bracket of a line
// is never pushed, because the loop ends as soon as no bracket is found.
start = end + 1;
end = section.find_first_of(bracketS, start);
}
}
And output is without the brackets:
a b c
d e f
g h i
Tried to adjust my section.substr(start-1, end - start+2)
Then my output is:
(a b c)
) d e f [
[g h i]
Why is the middle element of the vector wrong?
I also tried strtok, but the output is the same as the first one.
Is there any other way to do it?
This is a possible solution with a stack for parsing and throwing a parsing_error if there are opening brackets missing closing brackets or the closing bracket mismatches the opening one.
#include <iostream>
#include <stack>
#include <string>
#include <vector>
// Supported bracket pairs, stored as (opening, closing).
const auto Brackets = { std::make_pair('(', ')'), std::make_pair('[', ']') };

// True when `c` is the opening character of one of the pairs in Brackets.
const auto is_opening_bracket = [](const char c) {
    return std::any_of(Brackets.begin(), Brackets.end(),
                       [c](const auto& entry) { return c == entry.first; });
};

// True when `c` is the closing character of one of the pairs in Brackets.
const auto is_closing_bracket = [](const char c) {
    return std::any_of(Brackets.begin(), Brackets.end(),
                       [c](const auto& entry) { return c == entry.second; });
};

// Maps a closing bracket to its opening counterpart. Returns the character
// '0' (digit zero) when `c` is not a closing bracket listed in Brackets.
const auto get_opening_bracket = [](const char c) {
    for (const auto& entry : Brackets) {
        if (entry.second == c)
            return entry.first;
    }
    return '0';
};

// Exception type thrown when the bracket structure of the input is invalid.
struct parsing_error {};
// Splits a sample string into top-level segments: every bracketed group
// (including nested brackets) is one token, and each run of text outside any
// bracket is one token. Prints one token per line.
//
// Throws parsing_error when a closing bracket has no matching opening
// bracket, when it closes the wrong kind of bracket, or when a bracket is
// still open at the end of the input.
int main() {
    const std::string str = "(a b c)d e f[[g h i]]";
    std::stack<char> brackets;          // currently open brackets, innermost on top
    std::vector<std::string> tokens;
    std::string token;                  // token being accumulated

    for (const auto c : str) {
        if (is_opening_bracket(c)) {
            // A bracket opening at top level terminates the plain-text token
            // that precedes it.
            if (!token.empty() && brackets.empty()) {
                tokens.push_back(token);
                token.clear();
            }
            brackets.push(c);
            token += c;
        } else if (is_closing_bracket(c)) {
            // Test for an empty stack first: calling top() on an empty
            // std::stack is undefined behaviour, and a closing bracket with
            // nothing open is a parse error anyway.
            if (brackets.empty() || brackets.top() != get_opening_bracket(c))
                throw parsing_error();
            brackets.pop();
            token += c;
            if (brackets.empty()) {
                tokens.push_back(token);
                token.clear();
            }
        } else {
            token += c;
        }
    }

    if (!brackets.empty())
        throw parsing_error();

    // Text that follows the last bracket group is a token as well.
    if (!token.empty())
        tokens.push_back(token);

    for (const auto& piece : tokens)
        std::cout << piece << '\n';
    return 0;
}
I'm attempting to tokenize a scripting language in C++ and am struggling currently with including further delimiters as tokens.
#ifndef TOKENIZER_H
#define TOKENIZER_H
#include <regex>
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <cctype>
using namespace std;
// Separator pattern: one or more consecutive whitespace characters.
std::regex re("[\\s]+");

// Splits `input` at runs of whitespace and returns the pieces in order.
// Leading whitespace is skipped so the first piece is never empty.
// Returns an empty vector when `input` is empty or contains only whitespace.
std::vector<std::string> deconstructDelimit(const std::string &input) {
    std::vector<std::string> decons;

    const std::string::size_type first = input.find_first_not_of(" \t\f\v\n\r");
    if (first == std::string::npos) {
        // Nothing to tokenize. Passing npos to substr() would throw
        // std::out_of_range, so bail out before that.
        return decons;
    }

    const std::string trimmed = input.substr(first);
    const std::sregex_token_iterator reg_end;
    // Submatch index -1 selects the text between separator matches.
    for (std::sregex_token_iterator it(trimmed.begin(), trimmed.end(), re, -1); it != reg_end; ++it) {
        decons.push_back(it->str());
    }
    return decons;
}
vector<string> tokenize(const string &input) {
vector<string> whitespace;
string currToken;
for (auto it = input.begin(); it != input.end(); ++it) {
if (*it == '\'') {
if (currToken.length()) {
vector<string> decons = deconstructDelimit(currToken);
whitespace.insert(whitespace.end(), decons.begin(), decons.end());
currToken.clear();
}
whitespace.push_back("\'");
++it;
while (*it != '\'' && it != input.end()) {
currToken += *it;
++it;
}
if (currToken.length()) whitespace.push_back(currToken);
whitespace.push_back("\'");
currToken.clear();
} else if (*it == '\"') {
if (currToken.length()) {
vector<string> decons = deconstructDelimit(currToken);
whitespace.insert(whitespace.end(), decons.begin(), decons.end());
currToken.clear();
}
whitespace.push_back("\"");
++it;
while (*it != '\"' && it != input.end()) {
currToken += *it;
++it;
}
if (currToken.length()) whitespace.push_back(currToken);
whitespace.push_back("\"");
currToken.clear();
} else {
currToken += *it;
}
}
if (currToken.length()) {
vector<string> decons = deconstructDelimit(currToken);
whitespace.insert(whitespace.end(), decons.begin(), decons.end());
}
return whitespace;
}
#endif
So far, it is able to convert this code:
i = 1
while(i <= 10) {
print i + " " then i++
}
into these tokens:
i
=
1
while(i
<=
10)
{
print
i
+
"
"
then
i++
}
However, I want to then split this string vector of tokens by other delimiters, such as operators (++, =, <=, +, etc.), keywords (while, then, etc.), and other grammar like parentheses and brackets, preferably without using boost. What would be the best way for me to achieve this, given the string vector output of my current progress?
Edit:
For example, the result of further tokenization would be:
i
=
1
while(i -> while, (, i
<=
10) -> 10, )
{
print
i
+
"
"
then
i++ -> i, ++
}
Which, expanded, would be:
i
=
1
while
(
i
<=
10
)
{
print
i
+
"
"
then
i
++
}
I had the exact same problem as you when I tried to separate items of a math expression using a regex. I successfully found a well working way to do it :
// Splits `s` into elements separated by matches of the regular expression
// `rg_str`, keeping the separators themselves as elements.
//
// Elements (and separators) that are empty or consist only of whitespace
// are dropped. A single space is appended to the input before scanning, so
// the final element is only emitted if `rg_str` matches that trailing space.
std::vector<std::string> resplit(const std::string& s, std::string rg_str = "\\s+"){
    // Compile both patterns once; they do not change while scanning.
    const std::regex piece_re("(.*?)(" + rg_str + ")");
    const std::regex blank_re("\\s*");

    const std::string str = s + " ";
    std::vector<std::string> elements;
    std::cmatch cm;

    std::size_t a = 0;  // start of the window currently being examined
    std::size_t b = 1;  // one past the end of that window
    while (b <= str.length()) {
        // Grow the window one character at a time until it consists of some
        // text followed by a separator.
        const std::string subs = str.substr(a, b - a);
        if (std::regex_match(subs.c_str(), cm, piece_re)) {
            // Group 1 is the text, group 2 the separator.
            for (std::size_t i = 1; i < cm.size(); ++i) {
                const std::string cmi = cm[i].str();
                // Adapt this test to keep whitespace or empty strings.
                if (!std::regex_match(cmi, blank_re)) {
                    elements.push_back(cmi);
                }
            }
            a = b;  // the next window starts right after the separator
        }
        ++b;
    }
    return elements;
}
When I use it on resplit("sin(x^2) + 1", "[^0-9a-zPI.]|[ \\(\\)]");, I get : ["sin", "(", "x", "^", "2", ")", "+", "1"].
Don't forget to change :
if(!std::regex_match(cmi.c_str(), std::regex("\\s*"))){
elements.push_back(std::string(cm[i]));
}
into :
if(!std::regex_match(cmi.c_str(), std::regex(""))){
elements.push_back(std::string(cm[i]));
}
if you want to include spaces (it will remove empty strings though, but this is preferable).
I hope it's useful to someone. Have a nice day.
I had the same problem and here is my complete solution which consists of few helper functions:
#include <regex>
#include <string>
#include <iostream>
#include <algorithm>
// Removes leading whitespace (as reported by std::isspace) from `str` in place.
void ltrim(std::string& str) {
    // The predicate takes unsigned char: std::isspace has undefined
    // behaviour for the negative values a plain char can hold.
    const auto firstKept = std::find_if(str.begin(), str.end(), [](unsigned char character) {
        return !std::isspace(character);
    });
    str.erase(str.begin(), firstKept);
}
// Removes trailing whitespace (as reported by std::isspace) from `str` in place.
void rtrim(std::string& str) {
    // The predicate takes unsigned char: std::isspace has undefined
    // behaviour for the negative values a plain char can hold.
    const auto lastKept = std::find_if(str.rbegin(), str.rend(), [](unsigned char character) {
        return !std::isspace(character);
    });
    // base() converts the reverse iterator to the forward position just
    // after the last character to keep.
    str.erase(lastKept.base(), str.end());
}
// Strips whitespace from both ends of `str` in place by delegating to
// rtrim() and ltrim(). Each helper touches only its own end of the string,
// so the order of the two calls does not affect the result.
void trim(std::string& str) {
    rtrim(str);
    ltrim(str);
}
// True when `str` has no characters or consists solely of ' ' characters.
bool is_empty(std::string const& str) {
    for (std::string::size_type i = 0; i < str.size(); ++i) {
        if (str[i] != ' ') {
            return false;
        }
    }
    return true;
}
// Splits `str` around matches of the regular expression `pattern`.
// Both the text between matches and the matches themselves are returned, in
// order of appearance; every piece is trimmed and pieces that are then empty
// are left out.
std::vector<std::string> split(std::string const& str, std::string const& pattern) {
    const std::regex separator(pattern);
    std::vector<std::string> result;

    // Submatch -1 selects the text between matches, 0 the match itself.
    const std::sregex_token_iterator finished;
    for (std::sregex_token_iterator piece(str.begin(), str.end(), separator, {-1, 0});
         piece != finished; ++piece) {
        std::string token = piece->str();
        trim(token);
        if (!is_empty(token)) {
            result.push_back(token);
        }
    }
    return result;
}
// Demo: splits a small math expression and prints one piece per line.
int main() {
    const std::vector<std::string> pieces = split("sin(x^2) + 1", "[^0-9a-zPI.]|[ \\(\\)]");
    for (std::vector<std::string>::const_iterator it = pieces.begin(); it != pieces.end(); ++it) {
        std::cout << *it << '\n';
    }
    return 0;
}
The key thing I used is std::sregex_token_iterator. As the last argument to its constructor I passed {-1, 0} where -1 represents the parts that are not matched and 0 represents the entire match.
The result of the above code snippet is:
sin
(
x
^
2
)
+
1
So basically I want to create a format function that accepts a string and replaces words in that string with whatever the user wants them replaced with. At first I had some issues with non-dereferenceable iterators, until I realized that changing the size of a string can invalidate any iterators into it. It doesn't throw any exceptions anymore, but now the output is the same as the input. Any advice?
// Replaces, in place, every whole word of `s` that equals `oldWord` with
// `newWord` and returns `s`.
//
// Words are maximal runs of characters for which isblank() is false; the
// blanks between words are left untouched. An empty `oldWord` matches
// nothing.
std::string& formatFn(std::string& s, std::string& oldWord, std::string& newWord)
{
    if (oldWord.empty())
    {
        return s;
    }

    // Offsets are used instead of iterators because replace() may reallocate
    // the string, which would invalidate any iterator into it.
    std::string::size_type wordStart = 0;
    while (wordStart < s.size())
    {
        // Skip the blanks in front of the next word.
        while (wordStart < s.size() && isblank(static_cast<unsigned char>(s[wordStart])))
        {
            ++wordStart;
        }

        // Find the end of the word that starts here.
        std::string::size_type wordEnd = wordStart;
        while (wordEnd < s.size() && !isblank(static_cast<unsigned char>(s[wordEnd])))
        {
            ++wordEnd;
        }

        const bool sameLength = (wordEnd - wordStart) == oldWord.size();
        if (sameLength && s.compare(wordStart, oldWord.size(), oldWord) == 0)
        {
            s.replace(wordStart, oldWord.size(), newWord);
            // Continue after the inserted text so it is not scanned again.
            wordEnd = wordStart + newWord.size();
        }
        wordStart = wordEnd;
    }
    return s;
}
If you already use std::string, why not use all of its methods?
for (auto it = text.find(o_text); it != string::npos; it = text.find(o_text)){
text.replace(it, o_text.size(), n_text);
}
string::iterator beg = iter1 - word.size();
I'm not sure what word actually does. You are trying to delete oldWord, right? Then it should be:
string::iterator beg = iter1 - oldWord.size();
EDIT : This is an improved version of your code:
// Returns a copy of `s` in which every blank-delimited word equal to
// `oldWord` is replaced by `newWord`. Blank characters (as reported by
// isblank) are copied through unchanged.
std::string formatFn(const std::string& s, const std::string& oldWord, const std::string& newWord) {
    std::string result;  // the string being built
    std::string word;    // the word currently being collected from `s`
    result.reserve(s.size());

    // Appends the collected word, or `newWord` if it equals `oldWord`, and
    // resets `word` for the next one.
    const auto flushWord = [&]() {
        result += (word == oldWord) ? newWord : word;
        word.clear();
    };

    for (const char ch : s) {
        // isblank must not receive a negative value, so convert first.
        if (!isblank(static_cast<unsigned char>(ch))) {
            word += ch;
        } else {
            // A blank marks the end of the current word.
            flushWord();
            result += ch;
        }
    }

    // The input may not end with a blank, so the last word is flushed here.
    flushWord();
    return result;
}
You are over-complicating it:
// Returns `s` with every occurrence of `sOld` replaced by `sNew`.
// Occurrences are found left to right and replaced text is not searched
// again. An empty `sOld` leaves `s` unchanged.
std::string replace_all(std::string s, const std::string& sOld, const std::string& sNew)
{
    // find("") succeeds at every position, which would insert sNew forever.
    if (sOld.empty())
    {
        return s;
    }

    std::size_t p = s.find(sOld);
    while (p != std::string::npos)
    {
        s.replace(p, sOld.length(), sNew);
        // Skip past the inserted text before searching again.
        p = s.find(sOld, p + sNew.length());
    }
    return s;
}
If you are looking to replace whole words only (which your current attempt will not do):
#include <iostream>
#include <string>
// True when the occurrence of `sOld` at offset `pos` in `s` is a whole word,
// i.e. it is neither preceded nor followed by an alphabetic character.
bool test(const std::string& s, const std::string& sOld, std::size_t pos)
{
    // ::isalpha must not receive a negative value, so convert to unsigned
    // char before classifying.
    const auto is_letter = [](char ch) {
        return ::isalpha(static_cast<unsigned char>(ch)) != 0;
    };

    const bool startsWord = (pos == 0) || !is_letter(s[pos - 1]);
    if (!startsWord)
    {
        return false;
    }

    // Check the bound before touching the character after the match.
    const std::size_t after = pos + sOld.length();
    return after >= s.length() || !is_letter(s[after]);
}
// Returns the offset of the first occurrence of `sOld` in `s`, at or after
// `pos`, that test() accepts. Returns std::string::npos when the search
// runs out of occurrences.
std::size_t find_word(const std::string& s, const std::string& sOld, std::size_t pos)
{
    for (pos = s.find(sOld, pos); pos != std::string::npos; pos = s.find(sOld, pos + 1))
    {
        // Stop at the first occurrence that passes the word test, or once
        // the offset has reached the end of the string.
        if (test(s, sOld, pos) || pos >= s.length())
        {
            break;
        }
    }
    return pos;
}
// Returns `s` with every occurrence of `sOld` that find_word() reports
// replaced by `sNew`. An empty `sOld` leaves `s` unchanged.
std::string replace_all(std::string s, const std::string& sOld, const std::string& sNew)
{
    // An empty search string matches at every position, so each replacement
    // would just insert sNew and the string could grow without bound.
    if (sOld.empty())
    {
        return s;
    }

    std::size_t p = find_word(s, sOld, 0);
    while (p != std::string::npos && p < s.length())
    {
        s.replace(p, sOld.length(), sNew);
        // Continue after the inserted text so it is not examined again.
        p = find_word(s, sOld, p + sNew.length());
    }
    return s;
}
// Demo: replaces the whole word "eat" with "ate" in a sample sentence and
// prints the result.
int main()
{
    const std::string sOrig("eat Heat eat beat sweat cheat eat");
    const std::string sOld("eat");
    const std::string sNew("ate");

    std::cout << "Result: " << replace_all(sOrig, sOld, sNew) << std::endl;
    // Output: "ate Heat ate beat sweat cheat ate"
    return 0;
}
The below function is working as expected. But I think I can do it in an efficient way.
input = "Hello' Main's World";
Function Return Value "Hello'' Main''s World";
// Returns a copy of `input` in which every single quote (') is doubled ('').
//
// The result is built in a std::string, so inputs of any length are handled
// and the input is never interpreted as a printf format string.
std::string ReplaceSingleQuote(std::string input)
{
    std::string escaped;
    escaped.reserve(input.size());  // lower bound; grows when quotes are present

    for (const char ch : input)
    {
        if (ch == '\'')
        {
            escaped += '\'';  // the extra quote
        }
        escaped += ch;
    }
    return escaped;
}
boost::replace_all(str, "'", "''");
http://www.boost.org/doc/libs/1_49_0/doc/html/boost/algorithm/replace_all.html
James Kanze's answer is a fine one. Just for the heck of it, though, I'll offer one that's a little more C++11ish.
// Returns a copy of `value` in which every single quote (') appears twice.
string DoubleQuotes(string value)
{
    string doubled;
    for (string::size_type i = 0; i < value.size(); ++i)
    {
        const char current = value[i];
        // A quote is appended twice, every other character once.
        doubled.append(current == '\'' ? 2 : 1, current);
    }
    return doubled;
}
Maybe use a std::stringstream:
// Returns a copy of `input` in which every single quote (') is doubled (''),
// built with a std::stringstream.
std::string ReplaceSingleQuote(std::string input)
{
    std::stringstream s;
    // Stop before input.length(): index length() is the terminating NUL,
    // which must not be streamed into the result.
    for (std::string::size_type i = 0; i < input.length(); i++)
    {
        s << input[i];
        if ( input[i] == '\'' )
            s << '\'';
    }
    return s.str();
}
A possibility (that will modify input) is to use std::string::replace() and std::string::find():
size_t pos = 0;
while (std::string::npos != (pos = input.find("'", pos)))
{
input.replace(pos, 1, "\'\'", 2);
pos += 2;
}
The obvious solution is:
// Returns a copy of `original` in which every single quote (') is doubled.
std::string
replaceSingleQuote( std::string const& original )
{
    std::string results;
    for ( std::string::size_type index = 0; index != original.size(); ++ index ) {
        char const ch = original[index];
        // A quote contributes two characters, anything else one.
        results.append( ch == '\'' ? 2 : 1, ch );
    }
    return results;
}
Small variations might improve performance:
// Returns a copy of `original` in which every single quote (') is doubled.
// The result is allocated at its final size up front and then filled in.
std::string
replaceSingleQuote( std::string const& original )
{
    std::string::size_type const quoteCount =
        std::count( original.begin(), original.end(), '\'' );

    // Start from a buffer consisting entirely of quotes, so the extra quote
    // of each pair is already in place and only has to be skipped.
    std::string results( original.size() + quoteCount, '\'' );

    std::string::size_type out = 0;
    for ( std::string::size_type in = 0; in != original.size(); ++ in ) {
        if ( original[in] == '\'' ) {
            ++ out;     // keep the pre-filled quote as the first of the pair
        }
        results[out] = original[in];
        ++ out;
    }
    return results;
}
might be worth trying, for example. But only if you find the original
version to be a bottleneck in your code; there's no point in making
something more complicated than necessary.
How can you strip non-ASCII characters from a string?
I like to know how we can achieve this in c++
Maybe something like:
// Predicate: true for characters that isprint() does not classify as
// printable.
struct InvalidChar
{
    bool operator()(char c) const {
        // isprint expects a value representable as unsigned char.
        const unsigned char asUnsigned = static_cast<unsigned char>(c);
        return isprint(asUnsigned) == 0;
    }
};
// Example use of InvalidChar on a string.
std::string s;
// Placeholder for whatever fills `s`; presumably it reads a file into the string.
HypoteticalReadFileToString(&s);
// Erase-remove idiom: remove_if moves the characters to keep to the front
// and returns the new logical end; erase then drops everything after it.
s.erase(std::remove_if(s.begin(),s.end(),InvalidChar()), s.end());
It's nicer to define a reusable function for the erase-remove idiom:
// Removes from container `c` every element for which `predicate` returns
// true (erase-remove idiom).
template <typename C, typename P>
void erase_remove_if(C& c, P predicate) {
    // remove_if moves the kept elements to the front and returns the
    // position where the leftover tail begins.
    const auto tailBegin = std::remove_if(c.begin(), c.end(), predicate);
    c.erase(tailBegin, c.end());
}
...
erase_remove_if(s, InvalidChar());
// Overwrites, in place, every character of `s` whose code is below 32 or
// above 127 with a space.
void stringPurifier ( std::string& s )
{
    for ( std::string::size_type i = 0; i != s.size(); ++i )
    {
        // Converting to unsigned int makes negative char values very large,
        // so they fall into the "above 127" case.
        const unsigned int code = static_cast<unsigned int>( s[i] );
        const bool keep = ( code >= 32 && code <= 127 );
        if ( !keep )
        {
            s[i] = ' ';
        }
    }
}
// Appends `source` to `dest`, substituting a space for every character whose
// code is below 32 or above 127. Existing content of `dest` is kept.
void stringPurifier ( std::string& dest, const std::string& source )
{
    dest.reserve(source.size());
    const std::string::size_type count = source.size();
    for ( std::string::size_type i = 0; i < count; ++i )
    {
        // Converting to unsigned int makes negative char values very large,
        // so they fall into the "above 127" case.
        const unsigned int code = static_cast<unsigned int>( source[i] );
        dest.push_back( ( code < 32 || code > 127 ) ? ' ' : source[i] );
    }
}
Strip everything that is greater than 127, or see http://www.asciitable.com/ and create a more specific range
while (CrtChar)
{
if (*CrtChar<35 || *CrtChar>127)
*CrtChar = ' ';
}