Bit Operation For Finding String Difference - c++

The following string of mine tried to find difference between two strings.
But it's horribly slow as it iterate the length of string:
#include <string>
#include <vector>
#include <iostream>
using namespace std;
int hd(string s1, string s2) {
// hd stands for "Hamming Distance"
int dif = 0;
for (unsigned i = 0; i < s1.size(); i++ ) {
string b1 = s1.substr(i,1);
string b2 = s2.substr(i,1);
if (b1 != b2) {
dif++;
}
}
return dif;
}
int main() {
string string1 = "AAAAA";
string string2 = "ATATT";
string string3 = "AAAAA";
int theHD12 = hd(string1,string2);
cout << theHD12 << endl;
int theHD13 = hd(string1,string3);
cout << theHD13 << endl;
}
Is there a fast alternative to do that?
In Perl we can have the following approach:
sub hd {
return ($_[0] ^ $_[1]) =~ tr/\001-\255//;
}
which is much2 faster than iterating the position.
I wonder what's the equivalent of it in C++?

Try to replace the for loop by:
for (unsigned i = 0; i < s1.size(); i++ ) {
if (b1[i] != b2[i]) {
dif++;
}
}
This should be a lot faster because no new strings are created.

Fun with the STL:
#include <numeric> //inner_product
#include <functional> //plus, equal_to, not2
#include <string>
#include <stdexcept>
unsigned int
hd(const std::string& s1, const std::string& s2)
{
// TODO: What should we do if s1.size() != s2.size()?
if (s1.size() != s2.size()){
throw std::invalid_argument(
"Strings passed to hd() must have the same lenght"
);
}
return std::inner_product(
s1.begin(), s1.end(), s2.begin(),
0, std::plus<unsigned int>(),
std::not2(std::equal_to<std::string::value_type>())
);
}

Use iterators:
int GetHammingDistance(const std::string &a, const std::string &b)
{
// Hamming distance is not defined for strings of different lengths.
ASSERT(a.length() == b.length());
std::string::const_iterator a_it = a.begin();
std::string::const_iterator b_it = b.begin();
std::string::const_iterator a_end = a.end();
std::string::const_iterator b_end = b.end();
int distance = 0;
while (a_it != a_end && b_it != b_end)
{
if (*a_it != *b_it) ++distance;
++a_it; ++b_it;
}
return distance;
}

Choice 1: Modify your original code to be as effecient as possable.
int hd(string const& s1, string const& s2)
{
// hd stands for "Hamming Distance"
int dif = 0;
for (std::string::size_type i = 0; i < s1.size(); i++ )
{
char b1 = s1[i];
char b2 = s2[i];
dif += (b1 != b2)?1:0;
}
return dif;
}
Second option use some of the STL algorithms to do the heavy lifting.
struct HammingFunc
{
inline int operator()(char s1,char s2)
{
return s1 == s2?0:1;
}
};
int hd(string const& s1, string const& s2)
{
int diff = std::inner_product(s1.begin(),s1.end(),
s2.begin(),
0,
std::plus<int>(),HammingFunc()
);
return diff;
}

Some obvious points that might make it faster:
Pass the strings as const references, not by value
Use the indexing operator [] to get characters, not a method call
Compile with optimization on

You use strings.
As explained here
The hunt for the fastest Hamming Distance C implementation
if you can use char* my experiements conclude that for Gcc 4.7.2 on an Intel Xeon X5650 the fastest general purpose hamming distance calculating function for small strings (char arrays) is:
// na = length of both strings
unsigned int HammingDistance(const char* a, unsigned int na, const char* b) {
unsigned int num_mismatches = 0;
while (na) {
if (*a != *b)
++num_mismatches;
--na;
++a;
++b;
}
return num_mismatches;
}
If your problem allows you to set an upper distance limit, so that you don't care for greater distances and this limit is always less than the strings' length, the above example can be furhterly optimized to:
// na = length of both strings, dist must always be < na
unsigned int HammingDistance(const char* const a, const unsigned int na, const char* const b, const unsigned int dist) {
unsigned int i = 0, num_mismatches = 0;
while(i <= dist)
{
if (a[i] != b[i])
++num_mismatches;
++i;
}
while(num_mismatches <= dist && i < na)
{
if (a[i] != b[i])
++num_mismatches;
++i;
}
return num_mismatches;
}
I am not sure if const does anything regarding speed, but i use it anyways...

Related

Avoiding or improving brute force method: Counting character repetition from all words in a dictionary text file

I wrote this utility function that will take the contents of a alpha dictionary file and will add up the repetition count of each letter or character of the alphabet.
This is what I have so far:
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
// this function just generates a map of each of the alphabet's
// character position within the alphabet.
void initCharIndexMap( std::map<unsigned, char>& index ) {
char c = 'a';
for ( unsigned i = 1; i < 27; i++ ) {
index[i] = c;
c++;
}
}
void countCharacterRepetition( std::vector<std::string>& words, const std::map<unsigned, char> index, std::map<char, unsigned>& weights ) {
unsigned count = 0;
for ( auto& s : words ) {
std::transform(s.begin(), s.end(), s.begin(), ::tolower );
for ( std::size_t i = 0; i < s.length(); i++ ) {
using It = std::map<unsigned, char>::const_iterator;
for ( It it = index.cbegin(); it != index.cend(); ++it ) {
if ( s[i] == it->second ) {
count++;
weights[it->second] += count;
}
count = 0;
}
}
}
}
int main() {
std::vector<std::string> words;
std::string line;
std::ifstream file;
file.open( "words_alpha.txt" );
while( std::getline( file, line )
words.push_back(line);
std::map<unsigned, char> index;
initCharIndexMap(index);
std::map<char, unsigned> weights;
countCharRepetition(words, index, weights);
for (auto& w : weights)
std::cout << w.first << ' ' << w.second << '\n';
return EXIT_SUCCESS;
}
It gives me this output which appears to be valid at first glance:
a 295794
b 63940
c 152980
d 113190
e 376455
f 39238
g 82627
h 92369
i 313008
j 5456
k 26814
l 194915
m 105208
n 251435
o 251596
p 113662
q 5883
r 246141
s 250284
t 230895
u 131495
v 33075
w 22407
x 10493
y 70578
z 14757
The dictionary text file that I am using can be found from this github page.
This appears to be working. It took about 3 minutes to process on my current machine which isn't horrible, however, this seems like a brute force approach. Is there a more efficient way of doing a task like this?
If you're just counting how many times each character appears, then all you need is this:
int frequency[26] = {};
for (auto const& str : words) {
for (int i=0; i<str.size(); i++) {
frequency[tolower(str[i]) - 'a']++;
}
}
for (int i=0; i<26; i++) {
cout << char(i + 'a') << " " << frequency[i] << endl;
}
If you want to include upper and lowercase characters, change the array size to 90, remove the tolower call, and change your loop so that it prints only if i is between a and z or A and Z.
If you are just going for performance, I would say you still have to read in the file char by char - but I think all the searching is processing that could be optimised.
I would say the following pseudo code should be faster (I'll try and knock up an example later):
void read_dictionary(char *fileName)
{
// Pre-sized array (faster access)
std::array<int, 26> alphabet_count = {0};
// Open the file
FILE *file = fopen(fileName, "r");
if (file == NULL)
return; //could not open file
// Read through the file
char c;
while ((c = fgetc(file)) != EOF)
{
// If it is a letter a-z
if ( ((c >= 'a') && (c <= 'z')) ||
{
// Increment the array value for that letter
++alphabet_count[c - 'a'];
}
// else if letter A-Z
else if ( ((c >= 'A') && (c <= 'Z')) ||
{
// Increment the array value for that letter
++alphabet_count[c - 'A'];
}
}
}
The point here is that we are not searching for matches we are using the char value to index into the array to increment the alphabet letter
All of the aforementioned answers assume continuity between a and z, and history will tell you that is not always the case. A solution doesn't need to assume this, and can still be efficient.
#include <iostream>
#include <fstream>
#include <iterator>
#include <climits>
#include <cctype>
int main(int argc, char *argv[])
{
if (argc < 2)
return EXIT_FAILURE;
unsigned int count[1U << CHAR_BIT] {};
std::ifstream inp(argv[1]);
for (std::istream_iterator<char> it(inp), it_eof; it != it_eof; ++it)
++count[ std::tolower(static_cast<unsigned char>(*it)) ];
for (unsigned i=0; i<(1U << CHAR_BIT); ++i)
{
if (std::isalpha(i) && count[i])
std::cout << static_cast<char>(i) << ' ' << count[i] << '\n';
}
}
Output
[~ user]$ clang++ --std=c++14 -O2 -o main main.cpp
[~ user] time ./main /usr/share/dict/words
a 199554
b 40433
c 103440
d 68191
e 235331
f 24165
g 47094
h 64356
i 201032
j 3167
k 16158
l 130463
m 70680
n 158743
o 170692
p 78163
q 3734
r 160985
s 139542
t 152831
u 87353
v 20177
w 13864
x 6932
y 51681
z 8460
real 0m0.085s
user 0m0.073s
sys 0m0.005s
That would probably be sufficiently fast enough for your application, whatever it is.
#include <array>
#include <fstream>
#include <iostream>
int main()
{
std::ifstream file;
file.open( "words_alpha.txt" );
char c;
std::array<std::size_t, 26> counts {};
while( file >> c)
++counts[c-'a'];
for(char c = 0; c<26;++c)
std::cout<<'('<<c+'a'<<','<<counts[c]<<")\n";
}
Your version keeps track of words unnecessarily: you're simply counting characters in a file. The separation into words and lines doesn't matter. It's also unnecessary to store the words.
You could aim for readable high-level code and write something like this:
// https://github.com/KubaO/stackoverflown/tree/master/questions/letter-count-56498637
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <limits>
#include <utility>
#include <vector>
//*
int main() {
Histogram<char, 'a', 'z'> counts;
std::ifstream file;
file.open("words_alpha.txt");
for (auto ch : make_range<char>(file)) counts.count(tolower(ch));
for (auto c : std::as_const(counts)) std::cout << c.value << ' ' << c.count << '\n';
}
This is the bare minimum of how modern C++ code should look
This requires the Histogram class, and a make_range adapter for input streams. You can't merely implement std::begin and std::end for std::ifstream, because the member end() function takes precedence and interferes (see this answer). The code below is the fragment marked //* above.
template <typename T>
void saturating_inc(T &val) {
if (val < std::numeric_limits<T>::max()) val++;
}
template <typename T, T min, T max>
class Histogram {
using counter_type = unsigned;
using storage_type = std::vector<counter_type>;
storage_type counts;
public:
template <typename U>
void count(U val) {
if (val >= min && val <= max) saturating_inc(counts[size_t(val - min)]);
}
Histogram() : counts(1 + max - min) {}
struct element {
T value;
counter_type count;
};
class const_iterator {
T val;
storage_type::const_iterator it;
public:
const_iterator(T val, storage_type::const_iterator it) : val(val), it(it) {}
const_iterator &operator++() {
++val;
++it;
return *this;
}
bool operator!=(const const_iterator &o) const { return it != o.it; }
element operator*() const { return {val, *it}; }
};
const_iterator begin() const { return {min, counts.begin()}; }
const_iterator end() const { return {0, counts.end()}; }
};
template <class C, class T>
class istream_range {
C &ref;
public:
istream_range(C &ref) : ref(ref) {}
std::istream_iterator<T> begin() { return {ref}; }
std::istream_iterator<T> end() { return {}; }
};
template <class T, class C>
istream_range<C, T> make_range(C &ref) {
return {ref};
}
This concludes the example.

Comparator case insensitive string set

I was just checking effective STL(book) example for set comparator to implement case insensitive set, but i am facing problems(visual studio says comaparator not valid but it worked in ideone). I am sure there is something wrong in my implementation even then it is little confusing what is right.
Below is the code
#include <iostream>
#include <set>
#include <map>
#include <algorithm>
using namespace std;
inline int ciCharCompare(char c1, char c2) // case-insensitively compare chars
{ // c1 and c2, returning -1 if c1 < c2,
//0 if c1==c2, and 1 if c1 > c2
int Ic1 = tolower(static_cast<unsigned char>(c1)); //see below for
int Ic2 = tolower(static_cast<unsigned char>(c2)); // info on these
if (Ic1 < Ic2) return -1;
if (Ic1 > Ic2) return 1;
return 0;
}
struct CiStringCompare : public std::binary_function<string, string, bool>
{
int ciStringCompareImpl(const string& s1, const string& s2) const
{
auto p = mismatch( //see below for an
s1.begin(), s1.end(), //explanation of why
s2.begin(), //we need not2;see
not2(ptr_fun(ciCharCompare))); //Item 41 for why we
// need ptr_fun
if (p.first == s1.end()) { //if true, either s1 and
if (p.second == s2.end()) return 0; // s2 are equal or
else return -1; //s1 is shorter than s2
}
return ciCharCompare(*p.first, *p.second);
}
int xxxx(const string& s1, const string& s2) const
{
if (s1.size() <= s2.size()) return ciStringCompareImpl(s1, s2);
else return -ciStringCompareImpl(s2, s1);
}
int operator()(const string& str1, const string& str2) const {
return xxxx(str1, str2);
}
};
class DifferenceBetweenEquivalenceandEquality
{
// Comparator functor
set<string, CiStringCompare> s;
public:
DifferenceBetweenEquivalenceandEquality(const vector<string>& v)
{
for (const auto& x : v)
s.insert(x);
}
void print() {
for(const auto& x: s)
cout << x << endl;
}
};
int main()
{
DifferenceBetweenEquivalenceandEquality dbee({ "STL","stl","aaa","bbb" });
dbee.print();
return 0;
}
It is working fine in ideone though, so im confused as to what makes a comparator valid and how the equality, equivalence and ordering effect by each of these
For example :: In normal set i get
STL stl aaa bbb
but using the below comparator i get
aaa bbb stl STL
inline string toLowerCase(const string& str) {
string res(str);
int i;
for (i = 0; i < (int)res.size(); i++)
res[i] = (char)tolower(res[i]);
return res;
}
class NormalComparator
{
public:
bool operator()(const string& s1, const string& s2)
{
return toLowerCase(s1) < toLowerCase(s2) ||
!(toLowerCase(s2) < toLowerCase(s1)) && s1 < s2;
}
};

What's wrong with strcmp and c_str() here?

The following code is my solution for the Largest Number. However it will crash.
If I directly compare tampa and tempb in cmp() by using tempa > tempb instead of strcmp(), it is OK. So what is wrong here?
#include <iostream>
#include <string>
#include <stack>
#include <vector>
#include <climits>
#include <cstdio>
#include <algorithm>
#include <sstream>
#include <cstring>
using namespace std;
bool cmp(string a, string b) {
string tempa = a + b;
string tempb = b + a;
int res = strcmp(tempa.c_str(), tempb.c_str());
if (res < 0) {
return false;
} else return true;
}
class Solution {
public:
string largestNumber(vector<int>& nums) {
vector<string> str;
string res = "0";
if (nums.empty()) {
return res;
}
for (int i = 0; i < nums.size(); ++i) {
stringstream ss;
ss << nums[i];
str.push_back(ss.str());
}
sort(str.begin(), str.end(), cmp);
res = "";
for (int i = 0; i < str.size(); ++i) {
res += str[i];
}
return res[0] == '0' ? "0" : res;
}
};
int main()
{
Solution sol;
int data[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
vector<int> nums(data, data + sizeof(data) / sizeof(data[0]));
string res = sol.largestNumber(nums);
cout << res << endl;
return 0;
}
Your comparison is not equivalent to tempa > tempb. It is equivalent to !(tempa < tempb) or tempa >= tempb. And a "greater than or equal" comparison does not satisfy the requirements of std::sort. Specifically, the comparison needs to be irreflexive, which is to say that for an operand A, cmp(A, A) should be false, but with >=, it is true.
If you want the strcmp equivalent of tempa > tempb, then do:
return strcmp(tempa.c_str(), tempb.c_str()) > 0;
On closer inspection, your comparison is fundamentally broken. Why are you concatenating a and b together to form temporary strings for comparison? As one simple example of what can go wrong there, an empty string will compare equal to every other string, because:
(string("anything") + "") == (string("") + "anything")

Code review, C++, Anagram method

I'm doing some practice questions from the book "Cracking the coding interview" and wanted to get some people to review my code for bugs and optimizations. Any feedback would be greatly appreciated.
Question: Write a method to decide if two strings are anagrams or not.
/*
Time complexity: O(n^2)
Space complexity: O(n)
*/
bool IsAnagram(std::string str1, std::string str2)
{
if(str1.length() != str2.length())
return false;
for(int i = 0; i < str1.length();i++)
{
bool found = false;
int j = 0;
while(!found && j < str2.length())
{
if(str1[i] == str2[j])
{
found = true;
str2[j] = NULL;
}
j++;
}
if(!found)
return false;
}
return true;
}
This is more efficient generally
#include <iostream>
#include <string>
#include <algorithm>
bool IsAnagram(std::string& str1, std::string& str2)
{
if(str1.length() != str2.length())
return false;
std::sort(str1.begin(), str1.end());
std::sort(str2.begin(), str2.end());
return str1.compare(str2) == 0;
}
int main(int argc, char* argv[])
{
std::string an1("army");
std::string an2("mary");
if(IsAnagram(an1, an2))
std::cout << "Hooray!\n";
return 0;
}
For those who dislike the mutating strings then maybe this is a better option. Could either remove reference to parameters 1 and 2 or make a copy inside function as here. This way, parameters can be const.
bool IsAnagram2(const std::string& str1, const std::string& str2)
{
if(str1.length() != str2.length())
return false;
std::string cpy1(str1), cpy2(str2);
std::sort(cpy1.begin(), cpy1.end());
std::sort(cpy2.begin(), cpy2.end());
return cpy1.compare(cpy2) == 0;
}
O(n) algorithm. Instead of sorting (which is O(n lg n)), count up the character occurrences in s1 and compare it to the character occurrences in s2.
#include <string>
#include <iostream>
#include <limits>
bool IsAnagram(const std::string& s1, const std::string& s2)
{
if (s1.size() != s2.size()) {
return false;
}
int count[std::numeric_limits<char>::max() + (std::size_t)1] = {};
for (auto c : s1) {
count[c]++;
}
for (auto c : s2) {
if (!count[c]) {
return false;
}
count[c]--;
}
return true;
}
int main(int argc, char **argv)
{
std::cout << IsAnagram(argv[1], argv[2]) << std::endl;
return 0;
}
There is already standard algorithm std::is_permutation that allows to perform the task simply
#include <iostream>
#include <iomanip>
#include <string>
#include <algorithm>
int main()
{
std::string s( "aab" );
std::string t( "aba" );
std::cout << std::boolalpha
<< ( s.size() == t.size() &&
std::is_permutation( s.begin(), s.end(), t.begin() ) )
<< std::endl;
return 0;
}
The output is
true
So all ypu need is to see how the algorithm is realized.:)
If you want a separate function then it will look like
bool IsAnagram( const std::string &s1, const std::string &s2 )
{
return s1.size() == s2.size() &&
std::is_permutation( s1.begin(), s1.end(), s2.begin() );
}
To use std::sort is not a good approach because original strings will be changed or you have to pass them to the function by value.

C++: case-insensitive first-n-characters string comparison

My question is similar to this, but I have two strings (as char *) and the task is to replace strnicmp function (avaible only for MS VC) with something like boost::iequals.
Note strnicmp is not stricmp - it only compares first n characters.
Is there any solution simplier than this:
void foo(const char *s1, const char *s2)
{
...
std::string str1 = s1;
std::string str2 = s2;
int n = 7;
if (boost::iequals(str1.substr(0, n), str2)) {
...
}
}
If it's really necessary, write your own function:
bool mystrnicmp(char const* s1, char const* s2, int n){
for(int i=0; i < n; ++i){
unsigned char c1 = static_cast<unsigned char>(s1[i]);
unsigned char c2 = static_cast<unsigned char>(s2[i]);
if(tolower(c1) != tolower(c2))
return false;
if(c1 == '\0' || c2 == '\0')
break;
}
return true;
}
For case insensitivity, you need a custom comparison function
(or functor):
struct EqIgnoreCase
{
bool operator()( char lhs, char rhs ) const
{
return ::tolower( static_cast<unsigned char>( lhs ) )
== ::tolower( static_cast<unsigned char>( rhs ) );
}
};
If I understand correctly, you're checking for a prefix. The
simplest way to do this is:
bool
isPrefix( std::string const& s1, std::string const& s2 )
{
return s1.size() <= s2.size()
&& std::equals( s1.begin(), s1.end(), s2.begin(), EqIgnoreCase() );
}
(Note the check of the sizes. s1 can't be a prefix of s2 if
it it longer than s2. And of course, std::equals will
encounter undefined behavior if called with s1 longer than
s2.)
For a function defined in terms of C strings (character pointers) going "up" to STL strings seems horribly inefficient, but maybe that's totally premature thinking on my part.
I would consider a straight C solution "simpler", but again that depends on one's perspective.
#include <ctype.h>
void foo(const char *s1, const char *s2)
{
size_t i, n = 7;
for(i = 0; i < n; i++)
{
if(tolower(s1[i]) != tolower(s2[i]))
return;
if(s[i] == '\0' && s2[i] == '\0')
break;
}
/* Strings are equal, do the work. */
...
}
This assumes that if both strings end before the length of the prefix has been exhausted, it's a match.
Of course the above assumes ASCII strings where tolower() makes sense.
I suggest to write the function yourselfs, like this:
bool strnicmp2(const char *s, const char *t, size_t n) {
while (n > 0 && *s && *t && tolower(*s) == tolower(*t)) {
++s;
++t;
--n;
}
return n == 0 || !*s || !*t;
}
something like this ought to work..
#include <iostream>
#include <string>
#include <cctype>
#include <cstring>
#include <algorithm>
struct isequal
{
bool operator()(int l, int r) const
{
return std::tolower(l) == std::tolower(r);
}
};
bool istrncmp(const char* s1, const char* s2, size_t n)
{
size_t ls1 = std::strlen(s1);
size_t ls2 = std::strlen(s2);
// this is strict, but you can change
if (ls1 < n || ls2 < n)
return false;
return std::equal(s1, s1 + n, s2, isequal());
}
int main(void)
{
std::cout << istrncmp("fooB", "fooA", 3) << std::endl;
std::cout << istrncmp("fooB", "fooA", 5) << std::endl;
std::cout << istrncmp("fooB", "f1oA", 3) << std::endl;
return 0;
}
I don't know if this counts as simpler or not, but it has fewer lines and speed should be pretty good.
#include <boost/iterator/transform_iterator.hpp>
#include <algorithm>
#include <cctype>
bool equal_insensitive_n( char const *a, char const *b, size_t n ) {
n = std::min( n, std::min( ::strlen( a ) + 1, ::strlen( b ) + 1 ) );
#define tilc(S) boost::make_transform_iterator( (S), ::tolower )
return std::equals( tilc(a), tilc(a) + n, tilc(b) );
#undef tilc
}