Related
I tried to write a script that removes extra white spaces but I didn't manage to finish it.
Basically I want to transform abc sssd g g sdg gg gf into abc sssd g g sdg gg gf.
In languages like PHP or C#, it would be very easy, but not in C++, I see. This is my code:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <unistd.h>
#include <string.h>
char* trim3(char* s) {
int l = strlen(s);
while(isspace(s[l - 1])) --l;
while(* s && isspace(* s)) ++s, --l;
return strndup(s, l);
}
char *str_replace(char * t1, char * t2, char * t6)
{
char*t4;
char*t5=(char *)malloc(10);
memset(t5, 0, 10);
while(strstr(t6,t1))
{
t4=strstr(t6,t1);
strncpy(t5+strlen(t5),t6,t4-t6);
strcat(t5,t2);
t4+=strlen(t1);
t6=t4;
}
return strcat(t5,t4);
}
void remove_extra_whitespaces(char* input,char* output)
{
char* inputPtr = input; // init inputPtr always at the last moment.
int spacecount = 0;
while(*inputPtr != '\0')
{
char* substr;
strncpy(substr, inputPtr+0, 1);
if(substr == " ")
{
spacecount++;
}
else
{
spacecount = 0;
}
printf("[%p] -> %d\n",*substr,spacecount);
// Assume the string last with \0
// some code
inputPtr++; // After "some code" (instead of what you wrote).
}
}
int main(int argc, char **argv)
{
printf("testing 2 ..\n");
char input[0x255] = "asfa sas f f dgdgd dg ggg";
char output[0x255] = "NO_OUTPUT_YET";
remove_extra_whitespaces(input,output);
return 1;
}
It doesn't work. I tried several methods. What I am trying to do is to iterate the string letter by letter and dump it in another string as long as there is only one space in a row; if there are two spaces, don't write the second character to the new string.
How can I solve this?
There are already plenty of nice solutions. I propose you an alternative based on a dedicated <algorithm> meant to avoid consecutive duplicates: unique_copy():
void remove_extra_whitespaces(const string &input, string &output)
{
output.clear(); // unless you want to add at the end of existing sring...
unique_copy (input.begin(), input.end(), back_insert_iterator<string>(output),
[](char a,char b){ return isspace(a) && isspace(b);});
cout << output<<endl;
}
Here is a live demo. Note that I changed from c style strings to the safer and more powerful C++ strings.
Edit: if keeping c-style strings is required in your code, you could use almost the same code but with pointers instead of iterators. That's the magic of C++. Here is another live demo.
Here's a simple, non-C++11 solution, using the same remove_extra_whitespace() signature as in the question:
#include <cstdio>
void remove_extra_whitespaces(char* input, char* output)
{
int inputIndex = 0;
int outputIndex = 0;
while(input[inputIndex] != '\0')
{
output[outputIndex] = input[inputIndex];
if(input[inputIndex] == ' ')
{
while(input[inputIndex + 1] == ' ')
{
// skip over any extra spaces
inputIndex++;
}
}
outputIndex++;
inputIndex++;
}
// null-terminate output
output[outputIndex] = '\0';
}
int main(int argc, char **argv)
{
char input[0x255] = "asfa sas f f dgdgd dg ggg";
char output[0x255] = "NO_OUTPUT_YET";
remove_extra_whitespaces(input,output);
printf("input: %s\noutput: %s\n", input, output);
return 1;
}
Output:
input: asfa sas f f dgdgd dg ggg
output: asfa sas f f dgdgd dg ggg
Since you use C++, you can take advantage of standard-library features designed for that sort of work. You could use std::string (instead of char[0x255]) and std::istringstream, which will replace most of the pointer arithmetic.
First, make a string stream:
std::istringstream stream(input);
Then, read strings from it. It will remove the whitespace delimiters automatically:
std::string word;
while (stream >> word)
{
...
}
Inside the loop, build your output string:
if (!output.empty()) // special case: no space before first word
output += ' ';
output += word;
A disadvantage of this method is that it allocates memory dynamically (including several reallocations, performed when the output string grows).
There are plenty of ways of doing this (e.g., using regular expressions), but one way you could do this is using std::copy_if with a stateful functor remembering whether the last character was a space:
#include <algorithm>
#include <string>
#include <iostream>
struct if_not_prev_space
{
// Is last encountered character space.
bool m_is = false;
bool operator()(const char c)
{
// Copy if last was not space, or current is not space.
const bool ret = !m_is || c != ' ';
m_is = c == ' ';
return ret;
}
};
int main()
{
const std::string s("abc sssd g g sdg gg gf into abc sssd g g sdg gg gf");
std::string o;
std::copy_if(std::begin(s), std::end(s), std::back_inserter(o), if_not_prev_space());
std::cout << o << std::endl;
}
You can use std::unique which reduces adjacent duplicates to a single instance according to how you define what makes two elements equal is.
Here I have defined elements as equal if they are both whitespace characters:
inline std::string& remove_extra_ws_mute(std::string& s)
{
s.erase(std::unique(std::begin(s), std::end(s), [](unsigned char a, unsigned char b){
return std::isspace(a) && std::isspace(b);
}), std::end(s));
return s;
}
inline std::string remove_extra_ws_copy(std::string s)
{
return remove_extra_ws_mute(s);
}
std::unique moves the duplicates to the end of the string and returns an iterator to the beginning of them so they can be erased.
Additionally, if you must work with low level strings then you can still use std::unique on the pointers:
char* remove_extra_ws(char const* s)
{
std::size_t len = std::strlen(s);
char* buf = new char[len + 1];
std::strcpy(buf, s);
// Note that std::unique will also retain the null terminator
// in its correct position at the end of the valid portion
// of the string
std::unique(buf, buf + len + 1, [](unsigned char a, unsigned char b){
return (a && std::isspace(a)) && (b && std::isspace(b));
});
return buf;
}
for in-place modification you can apply erase-remove technic:
#include <string>
#include <iostream>
#include <algorithm>
#include <cctype>
int main()
{
std::string input {"asfa sas f f dgdgd dg ggg"};
bool prev_is_space = true;
input.erase(std::remove_if(input.begin(), input.end(), [&prev_is_space](unsigned char curr) {
bool r = std::isspace(curr) && prev_is_space;
prev_is_space = std::isspace(curr);
return r;
}), input.end());
std::cout << input << "\n";
}
So you first move all extra spaces to the end of the string and then truncate it.
The great advantage of C++ is that is universal enough to port your code to plain-c-static strings with only few modifications:
void erase(char * p) {
// note that this ony works good when initial array is allocated in the static array
// so we do not need to rearrange memory
*p = 0;
}
int main()
{
char input [] {"asfa sas f f dgdgd dg ggg"};
bool prev_is_space = true;
erase(std::remove_if(std::begin(input), std::end(input), [&prev_is_space](unsigned char curr) {
bool r = std::isspace(curr) && prev_is_space;
prev_is_space = std::isspace(curr);
return r;
}));
std::cout << input << "\n";
}
Interesting enough remove step here is string-representation independent. It will work with std::string without modifications at all.
I have the sinking feeling that good ol' scanf will do (in fact, this is the C school equivalent to Anatoly's C++ solution):
void remove_extra_whitespaces(char* input, char* output)
{
int srcOffs = 0, destOffs = 0, numRead = 0;
while(sscanf(input + srcOffs, "%s%n", output + destOffs, &numRead) > 0)
{
srcOffs += numRead;
destOffs += strlen(output + destOffs);
output[destOffs++] = ' '; // overwrite 0, advance past that
}
output[destOffs > 0 ? destOffs-1 : 0] = '\0';
}
We exploit the fact that scanf has magical built-in space skipping capabilities. We then use the perhaps less known %n "conversion" specification which gives us the amount of chars consumed by scanf. This feature frequently comes in handy when reading from strings, like here. The bitter drop which makes this solution less-than-perfect is the strlen call on the output (there is no "how many bytes have I actually just written" conversion specifier, unfortunately).
Last not least use of scanf is easy here because sufficient memory is guaranteed to exist at output; if that were not the case, the code would become more complex due to buffering and overflow handling.
Since you are writing c-style, here's a way to do what you want.
Note that you can remove '\r' and '\n' which are line breaks (but of course that's up to you if you consider those whitespaces or not).
This function should be as fast or faster than any other alternative and no memory allocation takes place even when it's called with std::strings (I've overloaded it).
char temp[] = " alsdasdl gasdasd ee";
remove_whitesaces(temp);
printf("%s\n", temp);
int remove_whitesaces(char *p)
{
int len = strlen(p);
int new_len = 0;
bool space = false;
for (int i = 0; i < len; i++)
{
switch (p[i])
{
case ' ': space = true; break;
case '\t': space = true; break;
case '\n': break; // you could set space true for \r and \n
case '\r': break; // if you consider them spaces, I just ignore them.
default:
if (space && new_len > 0)
p[new_len++] = ' ';
p[new_len++] = p[i];
space = false;
}
}
p[new_len] = '\0';
return new_len;
}
// and you can use it with strings too,
inline int remove_whitesaces(std::string &str)
{
int len = remove_whitesaces(&str[0]);
str.resize(len);
return len; // returning len for consistency with the primary function
// but u can return std::string instead.
}
// again no memory allocation is gonna take place,
// since resize does not not free memory because the length is either equal or lower
If you take a brief look at the C++ Standard library, you will notice that a lot C++ functions that return std::string, or other std::objects are basically a wrapper to a well written extern "C" function. So don't be afraid to use C functions in C++ applications, if they are well written and you can overload them to support std::strings and such.
For example, in Visual Studio 2015, std::to_string is written exactly like this:
inline string to_string(int _Val)
{ // convert int to string
return (_Integral_to_string("%d", _Val));
}
inline string to_string(unsigned int _Val)
{ // convert unsigned int to string
return (_Integral_to_string("%u", _Val));
}
and _Integral_to_string is a wrapper to a C function sprintf_s
template<class _Ty> inline
string _Integral_to_string(const char *_Fmt, _Ty _Val)
{ // convert _Ty to string
static_assert(is_integral<_Ty>::value,
"_Ty must be integral");
char _Buf[_TO_STRING_BUF_SIZE];
int _Len = _CSTD sprintf_s(_Buf, _TO_STRING_BUF_SIZE, _Fmt, _Val);
return (string(_Buf, _Len));
}
Well here is a longish(but easy) solution that does not use pointers.
It can be optimized further but hey it works.
#include <iostream>
#include <string>
using namespace std;
void removeExtraSpace(string str);
int main(){
string s;
cout << "Enter a string with extra spaces: ";
getline(cin, s);
removeExtraSpace(s);
return 0;
}
void removeExtraSpace(string str){
int len = str.size();
if(len==0){
cout << "Simplified String: " << endl;
cout << "I would appreciate it if you could enter more than 0 characters. " << endl;
return;
}
char ch1[len];
char ch2[len];
//Placing characters of str in ch1[]
for(int i=0; i<len; i++){
ch1[i]=str[i];
}
//Computing index of 1st non-space character
int pos=0;
for(int i=0; i<len; i++){
if(ch1[i] != ' '){
pos = i;
break;
}
}
int cons_arr = 1;
ch2[0] = ch1[pos];
for(int i=(pos+1); i<len; i++){
char x = ch1[i];
if(x==char(32)){
//Checking whether character at ch2[i]==' '
if(ch2[cons_arr-1] == ' '){
continue;
}
else{
ch2[cons_arr] = ' ';
cons_arr++;
continue;
}
}
ch2[cons_arr] = x;
cons_arr++;
}
//Printing the char array
cout << "Simplified string: " << endl;
for(int i=0; i<cons_arr; i++){
cout << ch2[i];
}
cout << endl;
}
I don't know if this helps but this is how I did it on my homework. The only case where it might break a bit is when there is spaces at the beginning of the string EX " wor ds " In that case, it will change it to " wor ds"
void ShortenSpace(string &usrStr){
char cha1;
char cha2;
for (int i = 0; i < usrStr.size() - 1; ++i) {
cha1 = usrStr.at(i);
cha2 = usrStr.at(i + 1);
if ((cha1 == ' ') && (cha2 == ' ')) {
usrStr.erase(usrStr.begin() + 1 + i);
--i;//edit: was ++i instead of --i, made code not work properly
}
}
}
I ended up here for a slighly different problem. Since I don't know where else to put it, and I found out what was wrong, I share it here. Don't be cross with me, please.
I had some strings that would print additional spaces at their ends, while showing up without spaces in debugging. The strings where formed in windows calls like VerQueryValue(), which besides other stuff outputs a string length, as e.g. iProductNameLen in the following line converting the result to a string named strProductName:
strProductName = string((LPCSTR)pvProductName, iProductNameLen)
then produced a string with a \0 byte at the end, which did not show easily in de debugger, but printed on screen as a space. I'll leave the solution of this as an excercise, since it is not hard at all, once you are aware of this.
How do I find a substring from the string path "/user/desktop/abc/post/" using C/C++? I want to check if folder "abc" is present or not in that path.
Path is character pointer char *ptr = "/user/desktop/abc/post/";
Use std::string and find.
std::string str = "/user/desktop/abc/post/";
bool exists = str.find("/abc/") != std::string::npos;
In C, use the strstr() standard library function:
const char *str = "/user/desktop/abc/post/";
const int exists = strstr(str, "/abc/") != NULL;
Take care to not accidentally find a too-short substring (this is what the starting and ending slashes are for).
Example using std::string find method:
#include <iostream>
#include <string>
int main (){
std::string str ("There are two needles in this haystack with needles.");
std::string str2 ("needle");
size_t found = str.find(str2);
if(found!=std::string::npos){
std::cout << "first 'needle' found at: " << found << '\n';
}
return 0;
}
Result:
first 'needle' found at: 14.
Use strstr(const char *s , const char *t)
and include<string.h>
You can write your own function which behaves same as strstr and you can modify according to your requirement also
char * str_str(const char *s, const char *t)
{
int i, j, k;
for (i = 0; s[i] != '\0'; i++)
{
for (j=i, k=0; t[k]!='\0' && s[j]==t[k]; j++, k++);
if (k > 0 && t[k] == '\0')
return (&s[i]);
}
return NULL;
}
As user1511510 has identified, there's an unusual case when abc is at the end of the file name. We need to look for either /abc/ or /abc followed by a string-terminator '\0'. A naive way to do this would be to check if either /abc/ or /abc\0 are substrings:
#include <stdio.h>
#include <string.h>
int main() {
const char *str = "/user/desktop/abc";
const int exists = strstr(str, "/abc/") || strstr(str, "/abc\0");
printf("%d\n",exists);
return 0;
}
but exists will be 1 even if abc is not followed by a null-terminator. This is because the string literal "/abc\0" is equivalent to "/abc". A better approach is to test if /abc is a substring, and then see if the character after this substring (indexed using the pointer returned by strstr()) is either a / or a '\0':
#include <stdio.h>
#include <string.h>
int main() {
const char *str = "/user/desktop/abc", *substr;
const int exists = (substr = strstr(str, "/abc")) && (substr[4] == '\0' || substr[4] == '/');
printf("%d\n",exists);
return 0;
}
This should work in all cases.
If you are utilizing arrays too much then you should include cstring.h because it has too many functions including finding substrings.
I am trying to do some array manipulations.
I am doing char array sorting and duplicates removal here.
Your comments are welcome. Havent done much testing and error handling here though.
#include<stdafx.h>
#include<stdlib.h>
#include<stdio.h>
#include<string>
using namespace std;
void sort(char *& arr)
{
char temp;
for(int i=0;i<strlen(arr);i++)
{
for(int j=i+1;j<strlen(arr);j++)
{
if(arr[i] > arr[j])
{
temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
}
}
}
bool ispresent(char *uniqueArr, char * arr)
{
bool isfound = false;
for(int i=0;i<strlen(arr);i++)
{
for(int j=0;j<=strlen(uniqueArr);j++)
{
if(arr[i]== uniqueArr[j])
{
isfound = true;
return isfound;
}
else
isfound = false;
}
}
return isfound;
}
char * removeduplicates(char *&arr)
{
char * uniqqueArr = strdup(""); // To make this char array modifiable
int index = 0;
bool dup = false;
while(*arr!=NULL)
{
dup = ispresent(uniqqueArr, arr);
if(dup == true)
{}//do nothing
else// copy the char to new char array.
{
uniqqueArr[index] = *arr;
index++;
}
arr++;
}
return uniqqueArr;
}
int main()
{
char *arr = strdup("saaangeetha");
// if strdup() is not used , access violation writing to
//location occurs at arr[i] = arr[j].
//This makes the constant string modifiable
sort(arr);
char * uniqueArr = removeduplicates(arr);
}
If you use std::string, your code (which is actually C-Style) can be written in C++ Style in just these lines:
#include <iostream>
#include <string>
#include <algorithm>
int main() {
std::string s= "saaangeetha";
std::sort(s.begin(), s.end());
std::string::iterator it = std::unique (s.begin(), s.end());
s.resize( it - s.begin());
std::cout << s ;
return 0;
}
Output: (all duplicates removed)
aeghnst
Demo : http://ideone.com/pHpPh
If you want char* at the end, then you can do this:
const char *uniqueChars = s.c_str(); //after removing the duplicates!
If I were doing it, I think I'd do the job quite a bit differently. If you can afford to ignore IBM mainframes, I'd do something like this:
unsigned long bitset = 0;
char *arr = "saaangeetha";
char *pos;
for (pos=arr; *pos; ++pos)
if (isalpha(*pos))
bitset |= 1 << (tolower(*pos)-'a');
This associates one bit in bitset with each possible letter. It then walks through the string and for each letter in the string, sets the associated bit in bitset. To print out the letters once you're done, you'd walk through bitset and print out the associated letter if that bit was set.
If you do care about IBM mainframes, you can add a small lookup table:
static char const *letters = "abcdefghijklkmnopqrstuvwxyz";
and use strchr to find the correct position for each letter.
Edit: If you're using C++ rather than C (as the tag said when I wrote what's above), you can simplify the code a bit at the expense of using some extra storage (and probably being minutely slower):
std::string arr = "saaangeetha";
std::set<char> letters((arr.begin()), arr.end());
std::copy(letters.begin(), letters.end(), std::ostream_iterator<char>(std::cout, " "));
Note, however, that while these appear the same for the test input, they can behave differently -- the previous version screens out anything but letters (and converts them all to lower case), but this distinguishes upper from lower case, and shows all non-alphabetic characters in the output as well.
char *arr = "saangeetha";
arr is pointing to read only section where string literal saangeetha is stored. So, it cannot be modified and is the reason for access violation error. Instead you need to do -
char arr[] = "sangeetha"; // Now, the string literal can be modified because a copy is made.
How can I count the number of "_" in a string like "bla_bla_blabla_bla"?
#include <algorithm>
std::string s = "a_b_c";
std::string::difference_type n = std::count(s.begin(), s.end(), '_');
Pseudocode:
count = 0
For each character c in string s
Check if c equals '_'
If yes, increase count
EDIT: C++ example code:
int count_underscores(string s) {
int count = 0;
for (int i = 0; i < s.size(); i++)
if (s[i] == '_') count++;
return count;
}
Note that this is code to use together with std::string, if you're using char*, replace s.size() with strlen(s).
Also note: I can understand you want something "as small as possible", but I'd suggest you to use this solution instead. As you see you can use a function to encapsulate the code for you so you won't have to write out the for loop everytime, but can just use count_underscores("my_string_") in the rest of your code. Using advanced C++ algorithms is certainly possible here, but I think it's overkill.
Old-fashioned solution with appropriately named variables. This gives the code some spirit.
#include <cstdio>
int _(char*__){int ___=0;while(*__)___='_'==*__++?___+1:___;return ___;}int main(){char*__="_la_blba_bla__bla___";printf("The string \"%s\" contains %d _ characters\n",__,_(__));}
Edit: about 8 years later, looking at this answer I'm ashamed I did this (even though I justified it to myself as a snarky poke at a low-effort question). This is toxic and not OK. I'm not removing the post; I'm adding this apology to help shifting the atmosphere on StackOverflow. So OP: I apologize and I hope you got your homework right despite my trolling and that answers like mine did not discourage you from participating on the site.
Using the lambda function to check the character is "_" then the only count will be incremented else not a valid character
std::string s = "a_b_c";
size_t count = std::count_if( s.begin(), s.end(), []( char c ){return c =='_';});
std::cout << "The count of numbers: " << count << std::endl;
#include <boost/range/algorithm/count.hpp>
std::string str = "a_b_c";
int cnt = boost::count(str, '_');
You name it... Lambda version... :)
using namespace boost::lambda;
std::string s = "a_b_c";
std::cout << std::count_if (s.begin(), s.end(), _1 == '_') << std::endl;
You need several includes... I leave you that as an exercise...
Count character occurrences in a string is easy:
#include <bits/stdc++.h>
using namespace std;
int main()
{
string s="Sakib Hossain";
int cou=count(s.begin(),s.end(),'a');
cout<<cou;
}
There are several methods of std::string for searching, but find is probably what you're looking for. If you mean a C-style string, then the equivalent is strchr. However, in either case, you can also use a for loop and check each character—the loop is essentially what these two wrap up.
Once you know how to find the next character given a starting position, you continually advance your search (i.e. use a loop), counting as you go.
I would have done it this way :
#include <iostream>
#include <string>
using namespace std;
int main()
{
int count = 0;
string s("Hello_world");
for (int i = 0; i < s.size(); i++)
{
if (s.at(i) == '_')
count++;
}
cout << endl << count;
cin.ignore();
return 0;
}
You can find out occurrence of '_' in source string by using string functions.
find() function takes 2 arguments , first - string whose occurrences we want to find out and second argument takes starting position.While loop is use to find out occurrence till the end of source string.
example:
string str2 = "_";
string strData = "bla_bla_blabla_bla_";
size_t pos = 0,pos2;
while ((pos = strData.find(str2, pos)) < strData.length())
{
printf("\n%d", pos);
pos += str2.length();
}
The range based for loop comes in handy
int countUnderScores(string str)
{
int count = 0;
for (char c: str)
if (c == '_') count++;
return count;
}
int main()
{
string str = "bla_bla_blabla_bla";
int count = countUnderScores(str);
cout << count << endl;
}
I would have done something like that :)
const char* str = "bla_bla_blabla_bla";
char* p = str;
unsigned int count = 0;
while (*p != '\0')
if (*p++ == '_')
count++;
Try
#include <iostream>
#include <string>
using namespace std;
int WordOccurrenceCount( std::string const & str, std::string const & word )
{
int count(0);
std::string::size_type word_pos( 0 );
while ( word_pos!=std::string::npos )
{
word_pos = str.find(word, word_pos );
if ( word_pos != std::string::npos )
{
++count;
// start next search after this word
word_pos += word.length();
}
}
return count;
}
int main()
{
string sting1="theeee peeeearl is in theeee riveeeer";
string word1="e";
cout<<word1<<" occurs "<<WordOccurrenceCount(sting1,word1)<<" times in ["<<sting1 <<"] \n\n";
return 0;
}
public static void main(String[] args) {
char[] array = "aabsbdcbdgratsbdbcfdgs".toCharArray();
char[][] countArr = new char[array.length][2];
int lastIndex = 0;
for (char c : array) {
int foundIndex = -1;
for (int i = 0; i < lastIndex; i++) {
if (countArr[i][0] == c) {
foundIndex = i;
break;
}
}
if (foundIndex >= 0) {
int a = countArr[foundIndex][1];
countArr[foundIndex][1] = (char) ++a;
} else {
countArr[lastIndex][0] = c;
countArr[lastIndex][1] = '1';
lastIndex++;
}
}
for (int i = 0; i < lastIndex; i++) {
System.out.println(countArr[i][0] + " " + countArr[i][1]);
}
}
This is a question in my paper test today, the function signature is
int is_match(char* pattern,char* string)
The pattern is limited to only ASCII chars and the quantification * and ?, so it is relatively simple. is_match should return 1 if matched, otherwise 0.
How do I do this?
Brian Kernighan provided a short article on A Regular Expression Matcher that Rob Pike wrote as a demonstration program for a book they were working on. The article is a very nice read explaining a bit about the code and regular expressions in general.
I have played with this code, making a few changes to experiment with some extensions such as to also return where in the string the pattern matches so that the substring matching the pattern can be copied from the original text.
From the article:
I suggested to Rob that we needed to find the smallest regular
expression package that would illustrate the basic ideas while still
recognizing a useful and non-trivial class of patterns. Ideally, the
code would fit on a single page.
Rob disappeared into his office, and at least as I remember it now,
appeared again in no more than an hour or two with the 30 lines of C
code that subsequently appeared in Chapter 9 of TPOP. That code
implements a regular expression matcher that handles these constructs:
c matches any literal character c
. matches any single character
^ matches the beginning of the input string
$ matches the end of the input string
* matches zero or more occurrences of the previous character
This is quite a useful class; in my own experience of using regular
expressions on a day-to-day basis, it easily accounts for 95 percent
of all instances. In many situations, solving the right problem is a
big step on the road to a beautiful program. Rob deserves great credit
for choosing so wisely, from among a wide set of options, a very small
yet important, well-defined and extensible set of features.
Rob's implementation itself is a superb example of beautiful code:
compact, elegant, efficient, and useful. It's one of the best examples
of recursion that I have ever seen, and it shows the power of C
pointers. Although at the time we were most interested in conveying
the important role of a good notation in making a program easier to
use and perhaps easier to write as well, the regular expression code
has also been an excellent way to illustrate algorithms, data
structures, testing, performance enhancement, and other important
topics.
The actual C source code from the article is very very nice.
/* match: search for regexp anywhere in text */
int match(char *regexp, char *text)
{
if (regexp[0] == '^')
return matchhere(regexp+1, text);
do { /* must look even if string is empty */
if (matchhere(regexp, text))
return 1;
} while (*text++ != '\0');
return 0;
}
/* matchhere: search for regexp at beginning of text */
int matchhere(char *regexp, char *text)
{
if (regexp[0] == '\0')
return 1;
if (regexp[1] == '*')
return matchstar(regexp[0], regexp+2, text);
if (regexp[0] == '$' && regexp[1] == '\0')
return *text == '\0';
if (*text!='\0' && (regexp[0]=='.' || regexp[0]==*text))
return matchhere(regexp+1, text+1);
return 0;
}
/* matchstar: search for c*regexp at beginning of text */
int matchstar(int c, char *regexp, char *text)
{
do { /* a * matches zero or more instances */
if (matchhere(regexp, text))
return 1;
} while (*text != '\0' && (*text++ == c || c == '.'));
return 0;
}
See This Question for a solution you can not submit. See this paper for a description of how to implement a more readable one.
Here is recursive extendable implementation. Tested for first order of pattern complexity.
#include <string.h>
#include <string>
#include <vector>
#include <iostream>
struct Match {
Match():_next(0) {}
virtual bool match(const char * pattern, const char * input) const {
return !std::strcmp(pattern, input);
}
bool next(const char * pattern, const char * input) const {
if (!_next) return false;
return _next->match(pattern, input);
}
const Match * _next;
};
class MatchSet: public Match {
typedef std::vector<Match *> Set;
Set toTry;
public:
virtual bool match(const char * pattern, const char * input) const {
for (Set::const_iterator i = toTry.begin(); i !=toTry.end(); ++i) {
if ((*i)->match(pattern, input)) return true;
}
return false;
}
void add(Match * m) {
toTry.push_back(m);
m->_next = this;
}
~MatchSet() {
for (Set::const_iterator i = toTry.begin(); i !=toTry.end(); ++i)
if ((*i)->_next==this) (*i)->_next = 0;
}
};
struct MatchQuestion: public Match {
virtual bool match(const char * pattern, const char * input) const {
if (pattern[0] != '?')
return false;
if (next(pattern+1, input))
return true;
if (next(pattern+1, input+1))
return true;
return false;
}
};
struct MatchEmpty: public Match {
virtual bool match(const char * pattern, const char * input) const {
if (pattern[0]==0 && input[0]==0)
return true;
return false;
}
};
struct MatchAsterisk: public Match {
virtual bool match(const char * pattern, const char * input) const {
if (pattern[0] != '*')
return false;
if (pattern[1] == 0) {
return true;
}
for (int i = 0; input[i] != 0; ++i) {
if (next(pattern+1, input+i))
return true;
}
return false;
}
};
struct MatchSymbol: public Match {
virtual bool match(const char * pattern, const char * input) const {
// TODO: consider cycle here to prevent unnecessary recursion
// Cycle should detect special characters and call next on them
// Current implementation abstracts from that
if (pattern[0] != input[0])
return false;
return next(pattern+1, input+1);
}
};
class DefaultMatch: public MatchSet {
MatchEmpty empty;
MatchQuestion question;
MatchAsterisk asterisk;
MatchSymbol symbol;
public:
DefaultMatch() {
add(&empty);
add(&question);
add(&asterisk);
add(&symbol);
}
void test(const char * p, const char * input) const {
testOneWay(p, input);
if (!std::strcmp(p, input)) return;
testOneWay(input, p);
}
bool testOneWay(const char * p, const char * input) const {
const char * eqStr = " == ";
bool rv = match(p, input);
if (!rv) eqStr = " != ";
std::cout << p << eqStr << input << std::endl;
return rv;
}
};
int _tmain(int argc, _TCHAR* argv[])
{
using namespace std;
typedef vector<string> Strings;
Strings patterns;
patterns.push_back("*");
patterns.push_back("*hw");
patterns.push_back("h*w");
patterns.push_back("hw*");
patterns.push_back("?");
patterns.push_back("?ab");
patterns.push_back("a?b");
patterns.push_back("ab?");
patterns.push_back("c");
patterns.push_back("cab");
patterns.push_back("acb");
patterns.push_back("abc");
patterns.push_back("*this homework?");
patterns.push_back("Is this homework?");
patterns.push_back("This is homework!");
patterns.push_back("How is this homework?");
patterns.push_back("hw");
patterns.push_back("homework");
patterns.push_back("howork");
DefaultMatch d;
for (unsigned i = 0; i < patterns.size(); ++i)
for (unsigned j =i; j < patterns.size(); ++j)
d.test(patterns[i].c_str(), patterns[j].c_str());
return 0;
}
If something is unclear, ask.
Cheat. Use #include <boost/regex/regex.hpp>.
try to make a list of interesting test cases:
is_match("dummy","dummy") should
return true;
is_match("dumm?y","dummy") should
return true;
is_match("dum?y","dummy")
should return false;
is_match("dum*y","dummy") should
return true;
and so on ...
then see how to make the easier test pass, then the next one ...
Didn't test this, actually code it, or debug it, but this might get you a start...
for each character in the pattern
if pattern character after the current one is *
// enter * state
while current character from target == current pattern char, and not at end
get next character from target
skip a char from the pattern
else if pattern character after the current one is ?
// enter ? state
if current character from target == current pattern char
get next char from target
skip a char from the pattern
else
// enter character state
if current character from target == current pattern character
get next character from target
else
return false
return true
The full power of regular expressions and finite state machines are not needed to solve this problem. As an alternative there is a relatively simple dynamic programming solution.
Let match(i, j) be 1 if it is possible to match the the sub-string string[i..n-1] with the sub-pattern pattern[j, m - 1], where n and m are the lengths of string and pattern respectively. Otherwise let match(i, j) be 0.
The base cases are:
match(n, m) = 1, you can match an empty string with an empty pattern;
match(i, m) = 0, you can't match a non-empty string with an empty pattern;
The transition is divided into 3 cases depending on whether the current sub-pattern starts with a character followed by a '*', or a character followed by a '?' or just starts with a character with no special symbol after it.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int is_match(char* pattern, char* string)
{
int n = strlen(string);
int m = strlen(pattern);
int i, j;
int **match;
match = (int **) malloc((n + 1) * sizeof(int *));
for(i = 0; i <= n; i++) {
match[i] = (int *) malloc((m + 1) * sizeof(int));
}
for(i = n; i >= 0; i--) {
for(j = m; j >= 0; j--) {
if(i == n && j == m) {
match[i][j] = 1;
}
else if(i < n && j == m) {
match[i][j] = 0;
}
else {
match[i][j] = 0;
if(pattern[j + 1] == '*') {
if(match[i][j + 2]) match[i][j] = 1;
if(i < n && pattern[j] == string[i] && match[i + 1][j]) match[i][j] = 1;
}
else if(pattern[j + 1] == '?') {
if(match[i][j + 2]) match[i][j] = 1;
if(i < n && pattern[j] == string[i] && match[i + 1][j + 2]) match[i][j] = 1;
}
else if(i < n && pattern[j] == string[i] && match[i + 1][j + 1]) {
match[i][j] = 1;
}
}
}
}
int result = match[0][0];
for(i = 0; i <= n; i++) {
free(match[i]);
}
free(match);
return result;
}
int main(void)
{
printf("is_match(dummy, dummy) = %d\n", is_match("dummy","dummy"));
printf("is_match(dumm?y, dummy) = %d\n", is_match("dumm?y","dummy"));
printf("is_match(dum?y, dummy) = %d\n", is_match("dum?y","dummy"));
printf("is_match(dum*y, dummy) = %d\n", is_match("dum*y","dummy"));
system("pause");
return 0;
}
The time complexity of this approach is O(n * m). The memory complexity is also O(n * m) but with a simple modification can be reduced to O(m).
Simple recursive implementation. It's slow but easy to understand:
int is_match(char *pattern, char *string)
{
if (!pattern[0]) {
return !string[0];
} else if (pattern[1] == '?') {
return (pattern[0] == string[0] && is_match(pattern+2, string+1))
|| is_match(pattern+2, string);
} else if (pattern[1] == '*') {
size_t i;
for (i=0; string[i] == pattern[0]; i++)
if (is_match(pattern+2, string+i)) return 1;
return 0;
} else {
return pattern[0] == string[0] && is_match(pattern+1, string+1);
}
}
Hope I got it all right.
A C program to find the index,from where the sub-string in the main string is going to start.
enter code here
#include<stdio.h>
int mystrstr (const char *,const char *);
int mystrcmp(char *,char *);
int main()
{
char *s1,*s2;//enter the strings, s1 is main string and s2 is substring.
printf("Index is %d\n",mystrstr(s1,s2));
//print the index of the string if string is found
}
//search for the sub-string in the main string
int mystrstr (const char *ps1,const char *ps2)
{
int i=0,j=0,c=0,l,m;char *x,*y;
x=ps1;
y=ps2;
while(*ps1++)i++;
while(*ps2++)j++;
ps1=x;
ps2=y;
char z[j];
for(l=0;l<i-j;l++)
{
for(m=l;m<j+l;m++)
//store the sub-string of similar size from main string
z[c++]=ps1[m];
z[c]='\0'
c=0;
if(mystrcmp(z,ps2)==0)
break;
}
return l;
}
int mystrcmp(char *ps3,char *ps4) //compare two strings
{
int i=0;char *x,*y;
x=ps3;y=ps4;
while((*ps3!=0)&&(*ps3++==*ps4++))i++;
ps3=x;ps4=y;
if(ps3[i]==ps4[i])
return 0;
if(ps3[i]>ps4[i])
return +1;
else
return -1;
}