How can you strip non-ASCII characters from a string?
I would like to know how to achieve this in C++.
Maybe something like:
/// Predicate for remove_if: flags every character that isprint() does not accept.
struct InvalidChar
{
    /// @param c character to classify
    /// @return true when @p c should be removed, false when it is printable
    bool operator()(char c) const {
        // isprint expects a value representable as unsigned char, hence the conversion.
        const unsigned char byte = static_cast<unsigned char>(c);
        if (isprint(byte)) {
            return false;
        }
        return true;
    }
};
// Fill s through the helper (not shown here; presumably reads a file), then
// drop every character that InvalidChar flags.
std::string s;
HypoteticalReadFileToString(&s);
// remove_if moves the kept characters to the front and returns the new logical end;
// erase then cuts off the leftover tail.
s.erase(std::remove_if(s.begin(),s.end(),InvalidChar()), s.end());
It's nicer to define a reusable function for the erase-remove idiom:
/// Removes from container @p c every element for which @p predicate returns true.
/// @param c         container offering begin(), end() and erase(first, last)
/// @param predicate unary callable; elements it accepts are removed
template <typename C, typename P>
void erase_remove_if(C& c, P predicate) {
    // remove_if compacts the survivors to the front and reports where the stale tail starts.
    auto stale_begin = std::remove_if(c.begin(), c.end(), predicate);
    c.erase(stale_begin, c.end());
}
...
// Remove from s every character that InvalidChar flags, via the helper.
erase_remove_if(s, InvalidChar());
/// Replaces, in place, every character of @p s whose code is below 32 or above 127
/// with a single space. The length of the string never changes.
void stringPurifier ( std::string& s )
{
    for ( char& ch : s )
    {
        // A negative char converts to a very large unsigned value, so it falls
        // into the "above 127" case.
        const unsigned int code = static_cast<unsigned int>(ch);
        const bool keep = ( code >= 32 && code <= 127 );
        if ( !keep )
        {
            ch = ' ';
        }
    }
}
/// Appends a purified copy of @p source to @p dest: characters whose code is below 32
/// or above 127 are appended as a space, all others are appended unchanged.
/// Existing content of @p dest is kept.
void stringPurifier ( std::string& dest, const std::string& source )
{
    dest.reserve(source.size());
    const std::string::size_type count = source.size();
    for ( std::string::size_type i = 0; i < count; ++i )
    {
        const char ch = source[i];
        // Negative chars convert to large unsigned values and are therefore replaced too.
        const unsigned int code = static_cast<unsigned int>(ch);
        dest.push_back( ( code < 32 || code > 127 ) ? ' ' : ch );
    }
}
Strip everything that is greater than 127, or see http://www.asciitable.com/ and create a more specific range
// Walk the NUL-terminated buffer that CrtChar points into and blank out every
// character outside the accepted range.
// The loop condition now checks the pointee (and guards against a null pointer),
// and the pointer is advanced on every iteration; previously the loop tested only
// the pointer and never moved it, so it could not terminate.
// On exit CrtChar points at the terminating NUL (or is still null).
// NOTE(review): the lower bound 35 also blanks ' ', '!' and '"'; 32 may have been intended.
while (CrtChar && *CrtChar)
{
    if (*CrtChar<35 || *CrtChar>127)
        *CrtChar = ' ';
    ++CrtChar;
}
Related
To remove leading and trailing whitespace, Qt offers QByteArray::trimmed(). But it unnecessarily makes a copy of the underlying string every time.
Is there any generic and efficient way (preferably using templates) to trim whitespace for QByteArray, QString and std::string?
You can do similar things for each of them, but they don't use the same interface.
/// Removes leading and trailing whitespace (as classified by std::isspace) from
/// @p str in place.
/// @param str string to trim; becomes empty when it holds only whitespace
void trim(std::string & str)
{
    const auto is_space = [](unsigned char c){ return std::isspace(c) != 0; };
    // One past the last non-space character (begin() when there is none).
    const auto last = std::find_if_not(str.rbegin(), str.rend(), is_space).base();
    // Search only up to `last`, so first <= last always holds.
    const auto first = std::find_if_not(str.begin(), last, is_space);
    auto new_end = last;
    if (first != str.begin())
    {
        // Shift the kept characters to the front; the returned iterator marks where
        // the kept content now ends, which is where erasing must start.
        new_end = std::move(first, last, str.begin());
    }
    str.erase(new_end, str.end());
}
/// Trims whitespace from both ends of @p str in place: first removes the leading
/// run, then shrinks the string to end after its last non-space character.
void trim(QString & str)
{
    const auto is_space = [](unsigned char c){ return std::isspace(c); };

    // Leading run: everything before the first non-space character.
    const auto leading_end = std::find_if_not(str.begin(), str.end(), is_space);
    str.remove(0, leading_end - str.begin());

    // Trailing run: computed after the removal above, so it refers to the updated string.
    const auto trailing_begin = std::find_if_not(str.rbegin(), str.rend(), is_space).base();
    str.resize(trailing_begin - str.begin());
}
One simplistic way:
/// Removes leading and trailing ' ' characters from @p value in place.
/// T must provide size(), operator[], erase(pos) and erase(pos, n).
/// Both scans are bounded by size(), so no element at or past size() is read and
/// no index wraparound is relied on.
/// @return reference to @p value, to allow chaining
template<typename T>
T& Trim (T& value)
{
  // Trailing spaces: `keep` is the number of leading characters that survive.
  auto keep = value.size();
  while(keep != 0 and value[keep - 1] == ' ')
    --keep;
  if(keep < value.size())
    value.erase(keep);

  // Leading spaces: if anything is left, its last character is not a space,
  // so this scan stops before reaching size().
  decltype(keep) lead = 0;
  while(lead < value.size() and value[lead] == ' ')
    ++lead;
  if(lead > 0)
    value.erase(0, lead);
  return value;
}
The above supports every type that provides the following std::string-like members:
size()
operator[] const
erase(pos, n)
Qt just lacks the erase() (name is remove()) unfortunately. To support the Qt types QString and QByteArray, we can have following wrapper:
/// Adapter that gives a Qt-style string the size()/erase()/operator[] interface
/// used by Trim. It holds a reference, so every edit goes to the wrapped object.
template<class String>
struct QtWrap
{
  String& m_Value;  // wrapped string; not owned

  /// Element access through the wrapped string's data() buffer.
  auto& operator[] (const int index) const { return *(m_Value.data() + index); }

  /// Number of elements, as reported by the wrapped string.
  auto size () const { return m_Value.size(); }

  /// erase(pos): keep only the first @p from elements (maps to truncate()).
  auto erase (int from) { return m_Value.truncate(from); }

  /// erase(pos, n): remove @p count elements starting at @p from (maps to remove()).
  auto erase (int from, int count) { return m_Value.remove(from, count); }
};
Usage:
// Wrap a QByteArray by reference so the generic Trim can edit it through the
// size()/erase()/operator[] interface that QtWrap exposes.
QByteArray s;
QtWrap<QByteArray> qs{s};
Trim(qs); // Tested OK in QtCreator: Modifies the underlying `s`
I have a function that, given a block of text, should remove all punctuation characters, and make all letters lowercase, and eventually, should shift them according to a mono alphabetic cipher. The code below works:
/// Holds a keyword and the alphabet derived from it, and prepares text for enciphering.
class Cipher {
public:
    string keyword;                    // passed to Alphabet::cipherLetters by encipher()
    string decipheredText;
    deque<string> encipheredAlphabet;  // overwritten on every encipher() call

    /// @return true for the characters encipher() strips: . , ! ' ? and space
    static bool is_punctuation (char c) {
        return c == '.' || c == ',' || c == '!' || c == '\''|| c == '?' || c
            == ' ';
    }

    /// Strips the characters matched by is_punctuation from @p text and lower-cases
    /// what remains.
    /// Side effect: encipheredAlphabet is replaced by Alphabet::cipherLetters(keyword).
    /// @param text input text, taken by value and edited locally
    /// @return the stripped, lower-cased text
    string encipher(string text) {
        Alphabet a;
        encipheredAlphabet = a.cipherLetters(keyword);
        text.erase( remove_if(text.begin(), text.end(), is_punctuation),
                    text.end() );
        for (char& ch : text) {
            // tolower requires a value representable as unsigned char; passing a
            // negative char directly is undefined behaviour.
            ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
            // encipher text according to shift
        }
        return text;
    }
};
The problem is, it currently makes two passes over the string, one to remove the punctuation, and one to do all the other stuff. This seems inefficient, since it seems like all the transformations could be accomplished in one pass through the string somehow. Is there a clean way to incorporate the erase-remove idiom with other loop conditions?
With range-v3, you might create (lazy) view:
// Pipeline over text: keep the characters that are not punctuation, then map each
// kept character through to_lower.
// NOTE(review): to_lower is not defined in the visible code — presumably a wrapper
// around tolower; confirm before use.
return text | ranges::view::filter([](char c){ return !is_punctuation(c); })
| ranges::view::transform([](char c) -> char { return to_lower(c); });
You could do it by using std::accumulate and an iterator as init value that insert into an output std::string
// filter(pred)(map) builds a binary operation for std::accumulate whose running
// value is an output iterator: for each element c, map(c) is written through the
// iterator when pred(c) holds, and the iterator is advanced either way.
auto filter = [](auto pred) {
    return [=](auto map) {
        return [=](auto out, auto ch) {
            if (pred(ch)) {
                *out = map(ch);
            }
            // Advanced even for rejected elements, exactly one step per input.
            ++out;
            return out;
        };
    };
};
// Build the accumulate operation: accept characters for which is_punctuation is
// false, and lower-case each accepted character.
auto accumulator = filter(std::not_fn(is_punctuation))
([](auto c) {
return std::tolower(c);
});
std::string in = "insIsjs.|s!js";
std::string out;
// The running value is a back_inserter into `out`, so accepted characters are appended to it.
std::accumulate(std::begin(in), std::end(in), std::back_inserter(out), accumulator);
See demo
Copy and/or modify characters, then truncate the string :
/// Single-pass variant: copies the lower-cased form of every non-punctuation
/// character towards the front of @p text, then truncates the unused tail.
/// @param text input, taken by value and edited locally
/// @return the stripped, lower-cased text
string encipher(string text)
{
    auto write = text.begin();  // next slot for a kept character; never ahead of `read`
    for (auto read = text.begin(); read != text.end(); ++read)
    {
        if (!is_punctuation(*read))
        {
            // tolower needs a value representable as unsigned char; a negative
            // char passed directly is undefined behaviour.
            *write = static_cast<char>(tolower(static_cast<unsigned char>(*read)));
            ++write;
        }
    }
    text.erase(write, text.end());
    return text;
}
If you don't want to do two loops because you've measured and found that it's slower, write a custom algorithm:
/// Writes the lower-cased form of every element of [begin, end) that is not
/// punctuation (per is_punctuation) to @p out.
/// @p out may equal @p begin for in-place use, since writes never overtake reads.
/// @return the output iterator one past the last element written
template <typename Iter, typename OutIter>
OutIter lowercased_without_punctuation(Iter begin, Iter end, OutIter out) {
    while (begin != end) {
        // Ignoring things like std::move_iterator for brevity.
        if (!is_punctuation(*begin)) {
            // Convert through unsigned char: tolower is undefined for negative
            // values other than EOF.
            *out = static_cast<char>(tolower(static_cast<unsigned char>(*begin)));
            ++out;
        }
        // Use `++iter` rather than `iter++` when possible
        ++begin;
    }
    return out;
}
// ...
/// Refreshes encipheredAlphabet from keyword, then strips punctuation from @p text
/// and lower-cases it in one pass via lowercased_without_punctuation.
/// @return the processed text
string encipher(string text) {
    Alphabet a;
    encipheredAlphabet = a.cipherLetters(keyword);

    // In-place run: the result marks the end of the kept characters.
    const auto kept_end =
        lowercased_without_punctuation(text.begin(), text.end(), text.begin());
    text.erase(kept_end, text.end());
    return text;
}
If you think about it some more, lowercased_without_punctuation is actually a special-case of a more general algorithm which might be called transform_if (relevant Q&A):
/// For every element of [begin, end) accepted by @p p, writes t(element) to @p out.
/// @param p unary predicate selecting the elements to transform
/// @param t unary transformation applied to the selected elements
/// @return the output iterator one past the last element written
template <typename Iter, typename OutIter, typename Pred, typename Transf>
OutIter transform_if(Iter begin, Iter end, OutIter out, Pred p, Transf t) {
    for (; begin != end; ++begin) {
        if (!p(*begin)) {
            continue;  // rejected elements produce no output
        }
        *out = t(*begin);
        ++out;
    }
    return out;
}
// ...
/// Refreshes encipheredAlphabet from keyword, then strips punctuation from @p text
/// and lower-cases it in one pass using transform_if.
/// @return the processed text
string encipher(string text) {
    Alphabet a;
    encipheredAlphabet = a.cipherLetters(keyword);
    text.erase(
        transform_if(text.begin(), text.end(), text.begin(),
                     [](char c) { return !is_punctuation(c); },
                     // tolower is undefined for negative char values, so go
                     // through unsigned char and return a char explicitly.
                     [](char c) {
                         return static_cast<char>(tolower(static_cast<unsigned char>(c)));
                     }),
        text.end());
    return text;
}
I'm attempting to tokenize a scripting language in C++ and am struggling currently with including further delimiters as tokens.
#ifndef TOKENIZER_H
#define TOKENIZER_H
#include <regex>
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <cctype>
using namespace std;
// Token separator: one or more whitespace characters.
regex re("[\\s]+");

/// Splits @p input on runs of whitespace.
/// @param input text to split; leading whitespace is skipped
/// @return the whitespace-separated pieces in order; empty when @p input is empty
///         or contains only whitespace
vector<string> deconstructDelimit(const string &input) {
    vector<string> decons;
    const string::size_type start = input.find_first_not_of(" \t\f\v\n\r");
    if (start == string::npos) {
        // Nothing but whitespace: substr(npos) would throw std::out_of_range.
        return decons;
    }
    // Tokenize directly from the first non-space character; -1 selects the text
    // between separator matches.
    sregex_token_iterator it(input.begin() + start, input.end(), re, -1);
    sregex_token_iterator reg_end;
    for (; it != reg_end; ++it) {
        decons.push_back(it->str());
    }
    return decons;
}
/// Splits @p input into tokens. Text outside quotes is split on whitespace via
/// deconstructDelimit. A single- or double-quoted literal yields the opening quote,
/// its raw content as one token (omitted when empty), and the closing quote.
/// An unterminated literal yields the opening quote and the content up to the end
/// of the input, with no closing quote token.
/// @return the tokens in input order
vector<string> tokenize(const string &input) {
    vector<string> whitespace;
    string currToken;

    // Splits the pending unquoted text on whitespace and appends the pieces.
    const auto flushPending = [&]() {
        // Whitespace-only text produces no tokens; skip the split entirely.
        if (currToken.find_first_not_of(" \t\f\v\n\r") != string::npos) {
            const vector<string> decons = deconstructDelimit(currToken);
            whitespace.insert(whitespace.end(), decons.begin(), decons.end());
        }
        currToken.clear();
    };

    for (auto it = input.begin(); it != input.end(); ++it) {
        const char ch = *it;
        if (ch != '\'' && ch != '\"') {
            currToken += ch;
            continue;
        }

        // Opening quote: emit what was collected so far, then the quote itself.
        flushPending();
        const string quote(1, ch);
        whitespace.push_back(quote);

        // Collect the literal; the end check comes before the dereference.
        ++it;
        while (it != input.end() && *it != ch) {
            currToken += *it;
            ++it;
        }
        if (currToken.length()) whitespace.push_back(currToken);
        currToken.clear();

        if (it == input.end()) {
            // Unterminated literal: stop here so the loop's ++it cannot step past end().
            break;
        }
        whitespace.push_back(quote);
    }

    flushPending();
    return whitespace;
}
#endif
So far, it is able to convert this code:
i = 1
while(i <= 10) {
print i + " " then i++
}
into these tokens:
i
=
1
while(i
<=
10)
{
print
i
+
"
"
then
i++
}
However, I want to then split this string vector of tokens by other delimiters, such as operators (++, =, <=, +, etc.), keywords (while, then, etc.), and other grammar like parentheses and brackets, preferably without using boost. What would be the best way for me to achieve this, given the string vector output of my current progress?
Edit:
For example, the result of further tokenization would be:
i
=
1
while(i -> while, (, i
<=
10) -> 10, )
{
print
i
+
"
"
then
i++ -> i, ++
}
Which, expanded, would be:
i
=
1
while
(
i
<=
10
)
{
print
i
+
"
"
then
i
++
}
I had the exact same problem as you when I tried to separate items of a math expression using a regex. I successfully found a well working way to do it :
/// Splits @p s into pieces separated by matches of @p rg_str, keeping each separator
/// as its own element. Pieces and separators that are empty or whitespace-only are
/// dropped.
/// @param s      text to split
/// @param rg_str regular expression describing one separator (default: whitespace run)
/// @return the non-blank pieces and separators, in input order
std::vector<std::string> resplit(const std::string& s, std::string rg_str = "\\s+"){
    // Both patterns are loop-invariant, so they are compiled once up front.
    const std::regex piece_re("(.*?)(" + rg_str + ")");
    const std::regex blank_re("\\s*");
    // A trailing space guarantees the last piece is followed by a separator candidate.
    const std::string str = s + " ";

    std::vector<std::string> elements;
    std::string::size_type a = 0;  // start of the window being examined
    std::string::size_type b = 1;  // one past the end of that window
    std::smatch sm;
    while(b <= str.length()){
        // Grow the window one character at a time until it is exactly
        // "piece followed by separator".
        const std::string subs = str.substr(a, b-a);
        if(std::regex_match(subs, sm, piece_re)){
            for(std::size_t i = 1; i < sm.size(); i++){
                const std::string piece = sm[i].str();
                // The following part can be adapted if you want to include whitespaces or empty strings
                if(!std::regex_match(piece, blank_re)){
                    elements.push_back(piece);
                }
            }
            a = b;
        }
        ++b;
    }
    return elements;
}
When I use it on resplit("sin(x^2) + 1", "[^0-9a-zPI.]|[ \\(\\)]");, I get : ["sin", "(", "x", "^", "2", ")", "+", "1"].
Don't forget to change :
// Keep the capture only if it is not empty and not made solely of whitespace.
if(!std::regex_match(cmi.c_str(), std::regex("\\s*"))){
elements.push_back(std::string(cm[i]));
}
into :
// Keep the capture unless it is the empty string; whitespace-only captures are kept.
if(!std::regex_match(cmi.c_str(), std::regex(""))){
elements.push_back(std::string(cm[i]));
}
if you want to include spaces (it will remove empty strings though, but this is preferable).
I hope it's useful to someone. Have a nice day.
I had the same problem and here is my complete solution which consists of few helper functions:
#include <regex>
#include <string>
#include <iostream>
#include <algorithm>
/// Removes leading whitespace (per std::isspace) from @p str in place.
void ltrim(std::string& str) {
    // The parameter is unsigned char so that std::isspace never receives a negative
    // value, which would be undefined behaviour.
    const auto first_kept = std::find_if(str.begin(), str.end(), [](unsigned char character) {
        return !std::isspace(character);
    });
    str.erase(str.begin(), first_kept);
}
/// Removes trailing whitespace (per std::isspace) from @p str in place.
void rtrim(std::string& str) {
    // The parameter is unsigned char so that std::isspace never receives a negative
    // value, which would be undefined behaviour.
    const auto last_kept = std::find_if(str.rbegin(), str.rend(), [](unsigned char character) {
        return !std::isspace(character);
    });
    // base() of the reverse iterator is one past the last non-space character.
    str.erase(last_kept.base(), str.end());
}
/// Trims @p str in place on both ends by applying ltrim, then rtrim.
void trim(std::string& str) {
ltrim(str);
rtrim(str);
}
/// @return true when @p str has no characters or consists only of ' ' characters
bool is_empty(std::string const& str) {
    for (const char ch : str) {
        if (ch != ' ') {
            return false;
        }
    }
    return true;
}
/// Splits @p str around matches of the regular expression @p pattern, keeping the
/// matched separators as tokens. Every token is trimmed, and tokens that is_empty
/// reports as empty are dropped.
/// @return the remaining tokens in input order
std::vector<std::string> split(std::string const& str, std::string const& pattern) {
    std::regex regex(pattern);
    std::vector<std::string> result;

    // Submatch -1 yields the text between matches, 0 yields the match itself.
    const std::sregex_token_iterator last;
    for (std::sregex_token_iterator it(str.begin(), str.end(), regex, {-1, 0}); it != last; ++it) {
        std::string token = it->str();
        trim(token);
        if (!is_empty(token)) {
            result.push_back(token);
        }
    }
    return result;
}
/// Demo: splits a sample expression and prints each token on its own line.
int main() {
    const std::vector<std::string> tokens = split("sin(x^2) + 1", "[^0-9a-zPI.]|[ \\(\\)]");
    for (const std::string& token : tokens) {
        std::cout << token << '\n';
    }
    return 0;
}
The key thing I used is std::sregex_token_iterator. As the last argument to its constructor I passed {-1, 0} where -1 represents the parts that are not matched and 0 represents the entire match.
The result of the above code snippet is:
sin
(
x
^
2
)
+
1
I have
std::vector<std::string> vec;
std::string myString;
and I need to find out whether myString is in vec using case-insensitive comparisons.
I know I can use
find(vec.begin(), vec.end(), myString) != vec.end())
to answer the question "is myString in vec?" but that will do case sensitive comparisons. I need case insensitive comparisons.
The position is not important, I just want to know if myString is in vec or not.
You need to use std::tolower and std::find_if:
std::vector<std::string> vec = {"ALF", "B"};
std::string toSearch = "Alf";
// Find the first element equal to toSearch when letter case is ignored.
auto itr = std::find_if(vec.begin(), vec.end(),
    [&](auto &s) {
        if ( s.size() != toSearch.size() )
            return false;
        // Every position must match; a single differing character rejects the
        // element. Two empty strings compare equal.
        for (size_t i = 0; i < s.size(); ++i)
            // Convert through unsigned char: tolower is undefined for negative values.
            if (::tolower(static_cast<unsigned char>(s[i])) !=
                ::tolower(static_cast<unsigned char>(toSearch[i])))
                return false;
        return true;
    }
);
if ( itr != vec.end()) {
    std::cout << *itr << std::endl;
}
Or, for a much smaller and easier-to-read solution, Boost!
// Required headers:
// #include <algorithm>
// #include <boost/algorithm/string/predicate.hpp>
// Find the first element of vec for which boost::iequals(myString, element) is true.
const auto it = std::find_if(
std::begin(vec),
std::end(vec),
[&myString](const auto& str) { return boost::iequals(myString, str); }
);
// True when such an element exists.
const bool found = (it != std::end(vec));
You need to use std::find_if and provide a custom comparator. To achieve case insensitive comparison I would advise you to convert both strings you want to compare to a common case: lower or upper. That would lead to a code like the following:
// Iterator to the first element of vec equal to myString ignoring case, or vec.end().
auto ret = std::find_if(vec.begin(), vec.end(),
    [&myString](const std::string& s) {
        // Cheap early-out before the character-wise comparison.
        if (s.size() != myString.size())
            return false;
        return std::equal(s.cbegin(), s.cend(), myString.cbegin(), myString.cend(),
            // Parameters are unsigned char because toupper is undefined for
            // negative values.
            [](unsigned char c1, unsigned char c2) { return std::toupper(c1) == std::toupper(c2); });
    });
This will return an iterator which will be vec.end() if no occurrence of myString was found. You can do whatever you please with that iterator (including comparing it to vec.end() to know if you found your string).
Bonus: running minimal example on Coliru
You may use std::find_if, an inline lambda and std::tolower to make the comparison:
//Computing the lower version of mystring
std::string my_string_lower;
my_string_lower.reserve(mystring.size());
std::transform(mystring.begin(), mystring.end(), std::back_inserter(my_string_lower), ::tolower);
// Checking if it is exist in the vector:
auto is_exist = std::find_if(vec.begin(), vec.end(), [&my_string_lower](std::string item){
//Transform the each vector item to lower temporally
std::transform(item.begin(), item.end(), item.begin(), ::tolower);
return mystring==item;
}) != vec.end();
If you are going to search the same vector of strings many times, it is better to compute its lower-case version once:
// Compute the lower-case version of the whole vector once.
std::vector<std::string> vec_lower;
vec_lower.reserve(vec.size());
std::transform(vec.begin(), vec.end(), std::back_inserter(vec_lower),[](std::string item){
    // item is a by-value copy; lower-case it and hand it back.
    std::transform(item.begin(), item.end(), item.begin(),
        // Through unsigned char: tolower is undefined for negative values.
        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return item;
});
// Compute the lower-case version of mystring.
std::string my_string_lower;
my_string_lower.reserve(mystring.size());
std::transform(mystring.begin(), mystring.end(), std::back_inserter(my_string_lower),
    [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
// Check whether it exists in the lower-case version of the vector.
auto is_exist = std::find_if(vec_lower.begin(), vec_lower.end(), [&my_string_lower](const std::string& item){
    // Compare against the lower-cased needle (the captured variable), not the original.
    return my_string_lower==item;
}) != vec_lower.end();
// Private helpers for VecFindIgnoreCase.
namespace vec_find_detail {

// Case-insensitive equality of two NUL-terminated narrow strings.
inline bool EqualsNoCase( const char* a, const char* b )
{
    for ( ; *a != '\0' && *b != '\0'; ++a, ++b )
        if ( std::tolower( static_cast<unsigned char>(*a) ) !=
             std::tolower( static_cast<unsigned char>(*b) ) )
            return false;
    return *a == *b;  // equal only when both strings ended together
}

// Case-insensitive equality of two NUL-terminated wide strings.
inline bool EqualsNoCase( const wchar_t* a, const wchar_t* b )
{
    for ( ; *a != L'\0' && *b != L'\0'; ++a, ++b )
        if ( std::towlower( static_cast<std::wint_t>(*a) ) !=
             std::towlower( static_cast<std::wint_t>(*b) ) )
            return false;
    return *a == *b;  // equal only when both strings ended together
}

// Index of the first element whose c_str() equals sFind ignoring case, or -1.
template <class T, class Ch>
long FindNoCase( const std::vector< T >& vec, const Ch* sFind )
{
    long index = 0;
    for ( const T& item : vec ) {
        if ( EqualsNoCase( item.c_str(), sFind ) )
            return index;
        ++index;
    }
    return -1;
}

} // namespace vec_find_detail

/// Case-insensitive search for a narrow C string.
/// All four overloads forward to the private helper rather than to each other, so
/// the result does not depend on declaration order, and the comparison uses only
/// the standard library.
/// @param vec   elements to search; T must provide c_str()
/// @param sFind NUL-terminated text to look for
/// @return zero-based index of the first match, or -1 when there is none
template <class T>
long VecFindIgnoreCase( const std::vector< T >& vec, const char* sFind )
{
    return vec_find_detail::FindNoCase( vec, sFind );
}

/// Case-insensitive search for a std::string.
/// @return zero-based index of the first match, or -1 when there is none
template <class T>
long VecFindIgnoreCase( const std::vector< T >& vec, const std::string& sFind ) {
    return vec_find_detail::FindNoCase( vec, sFind.c_str() );
}

/// Case-insensitive search for a wide C string.
/// @return zero-based index of the first match, or -1 when there is none
template <class T>
long VecFindIgnoreCase( const std::vector< T >& vec, const wchar_t* sFind )
{
    return vec_find_detail::FindNoCase( vec, sFind );
}

/// Case-insensitive search for a std::wstring.
/// @return zero-based index of the first match, or -1 when there is none
template <class T>
long VecFindIgnoreCase( const std::vector< T >& vec, const std::wstring& sFind ) {
    return vec_find_detail::FindNoCase( vec, sFind.c_str() );
}
Use:
#include <cctype>
#include <cwctype>
#include <string>
#include <vector>
/// Usage demo for VecFindIgnoreCase with narrow and wide string vectors.
void TestCode()
{
    std::vector< std::string > strvecA = { "abc", "def", "ghi" };
    std::vector< std::wstring > strvecW = { L"abc", L"def", L"ghi" };

    long ind;
    // Narrow C-string lookups.
    ind = VecFindIgnoreCase( strvecA, "ABC" ); // ind = 0 found
    ind = VecFindIgnoreCase( strvecA, "ghI" ); // ind = 2 found
    ind = VecFindIgnoreCase( strvecA, "Xyz" ); // ind = -1 not found
    // Wide C-string lookups.
    ind = VecFindIgnoreCase( strvecW, L"aBc" ); // ind = 0 found
    ind = VecFindIgnoreCase( strvecW, L"DEF" ); // ind = 1 found
    ind = VecFindIgnoreCase( strvecW, L"xyZ" ); // ind = -1 not found

    // Lookup with a std::string needle.
    std::string sFind( "mno" );
    ind = VecFindIgnoreCase( strvecA, sFind );
    if ( ind >= 0 ) {
        // found at strvecA[ind]
    } else {
        // not found
    }
}
Since the performance of std::find is better than std::count, I have to implement a function template to search in std::vector :
/// Linear search over [first, last) for the first element whose c_str() compares
/// equal to @p value under StrCmpIA.
/// @return iterator to the first match, or @p last when nothing matches
template <class Iterator>
Iterator Find(Iterator first, Iterator last, const char *value)
{
    for (; first != last; ++first)
    {
        // StrCmpIA returning 0 means the two strings compare equal.
        if (StrCmpIA((*first).c_str(), value) == 0)
        {
            return first;
        }
    }
    return last;
}
Now you can use the function template like this:
// Look up "saeeD" in vecStr with Find and report the outcome; Find returns the
// end iterator when there is no match.
vector<string> vecStr = {"ali", "reza", "hamid", "saeed"};
if (Find(vecStr.begin(), vecStr.end(), "saeeD") != vecStr.end())
{
cout << "found" << endl;
}
else
{
cout << "not found" << endl;
}
There is a very useful function in Python called strip(). Any similar ones in C++?
I use this:
#include <string>
#include <cctype>
/// Returns a copy of @p inpt without leading and trailing whitespace (per std::isspace).
/// @param inpt string to strip; may be empty or consist only of whitespace
/// @return the stripped copy, empty when nothing but whitespace was given
std::string strip(const std::string &inpt)
{
    // unsigned char parameter: std::isspace is undefined for negative values.
    const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };

    auto first = inpt.begin();
    auto last = inpt.end();
    // Both scans are bounded, so empty and all-whitespace inputs never read out of range.
    while (first != last && is_space(*first))
        ++first;
    while (last != first && is_space(*(last - 1)))
        --last;
    return std::string(first, last);
}
There's nothing built-in; I used to use something like the following:
/// Predicate that is true for characters NOT classified under @p mask by the
/// ctype<char> facet of a given locale.
template <std::ctype_base::mask mask>
class IsNot
{
    std::locale myLocale;               // held so the facet below stays alive
    std::ctype<char> const* myCType;    // facet obtained from myLocale
public:
    /// @param l locale whose ctype<char> facet performs the classification
    ///          (defaults to a default-constructed std::locale)
    IsNot( std::locale const& l = std::locale() )
        : myLocale( l )
        , myCType( &std::use_facet<std::ctype<char> >( myLocale ) )
    {
    }

    /// @return true when @p ch does not match @p mask
    bool operator()( char ch ) const
    {
        const bool matches = myCType->is( mask, ch );
        return !matches;
    }
};

/// Predicate that is true for every non-whitespace character.
using IsNotSpace = IsNot<std::ctype_base::space>;
std::string
trim( std::string const& original )
{
std::string::const_iterator right = std::find_if( original.rbegin(), original.rend(), IsNotSpace() ).base();
std::string::const_iterator left = std::find_if(original.begin(), right, IsNotSpace() );
return std::string( left, right );
}
which works pretty well. (I now have a significantly more complex
version which handles UTF-8 correctly.)
/// Removes leading and trailing ' ', '\n', '\t' and '\r' characters from @p str in place.
/// @param str string to strip; left empty when it holds only those characters
void strip(std::string &str)
{
    static const char kStripped[] = " \n\t\r";

    const std::string::size_type last = str.find_last_not_of(kStripped);
    if (last == std::string::npos)
    {
        // Empty or made only of stripped characters: nothing survives.
        str.clear();
        return;
    }
    // Cut the tail first; a non-stripped character then certainly exists, so the
    // search for the first one cannot return npos.
    str.erase(last + 1);
    str.erase(0, str.find_first_not_of(kStripped));
}
This is on top of the answer provided by Ferdi Kedef to make it safer.
/// Removes leading and trailing whitespace (per std::isspace) from @p str in place.
/// @param str string to strip; left empty when it holds only whitespace
void strip(std::string& str)
{
    // unsigned char parameter: std::isspace is undefined for negative values.
    const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };

    // Scan from the back first, then from the front up to that point, so the two
    // ends can never cross and no index arithmetic is needed.
    auto last = str.end();
    while (last != str.begin() && is_space(*(last - 1))) {
        --last;
    }
    auto first = str.begin();
    while (first != last && is_space(*first)) {
        ++first;
    }
    // Build the result as a temporary before assigning, since the iterators point into str.
    str = std::string(first, last);
}