Fastest way to do a case-insensitive substring search in C/C++? - c++

Note
The question below was asked in 2008 about some code from 2003. As the OP's update shows, this entire post has been obsoleted by vintage 2008 algorithms and persists here only as a historical curiosity.
I need to do a fast case-insensitive substring search in C/C++. My requirements are as follows:
Should behave like strstr() (i.e. return a pointer to the match point).
Must be case-insensitive (doh).
Must support the current locale.
Must be available on Windows (MSVC++ 8.0) or easily portable to Windows (i.e. from an open source library).
Here is the current implementation I am using (taken from the GNU C Library):
/* Return the offset of one string within another.
Copyright (C) 1994,1996,1997,1998,1999,2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
/*
* My personal strstr() implementation that beats most other algorithms.
* Until someone tells me otherwise, I assume that this is the
* fastest implementation of strstr() in C.
* I deliberately chose not to comment it. You should have at least
* as much fun trying to understand it, as I had to write it :-).
*
* Stephen R. van den Berg, berg#pool.informatik.rwth-aachen.de */
/*
* Modified to use table lookup instead of tolower(), since tolower() isn't
* worth s*** on Windows.
*
* -- Anders Sandvig (anders#wincue.org)
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <ctype.h>
#include <string.h>
typedef unsigned chartype;
char char_table[256];
void init_stristr(void)
{
int i;
char string[2];
string[1] = '\0';
for (i = 0; i < 256; i++)
{
string[0] = i;
_strlwr(string);
char_table[i] = string[0];
}
}
#define my_tolower(a) ((chartype) char_table[a])
char *
my_stristr (phaystack, pneedle)
const char *phaystack;
const char *pneedle;
{
register const unsigned char *haystack, *needle;
register chartype b, c;
haystack = (const unsigned char *) phaystack;
needle = (const unsigned char *) pneedle;
b = my_tolower (*needle);
if (b != '\0')
{
haystack--; /* possible ANSI violation */
do
{
c = *++haystack;
if (c == '\0')
goto ret0;
}
while (my_tolower (c) != (int) b);
c = my_tolower (*++needle);
if (c == '\0')
goto foundneedle;
++needle;
goto jin;
for (;;)
{
register chartype a;
register const unsigned char *rhaystack, *rneedle;
do
{
a = *++haystack;
if (a == '\0')
goto ret0;
if (my_tolower (a) == (int) b)
break;
a = *++haystack;
if (a == '\0')
goto ret0;
shloop:
;
}
while (my_tolower (a) != (int) b);
jin:
a = *++haystack;
if (a == '\0')
goto ret0;
if (my_tolower (a) != (int) c)
goto shloop;
rhaystack = haystack-- + 1;
rneedle = needle;
a = my_tolower (*rneedle);
if (my_tolower (*rhaystack) == (int) a)
do
{
if (a == '\0')
goto foundneedle;
++rhaystack;
a = my_tolower (*++needle);
if (my_tolower (*rhaystack) != (int) a)
break;
if (a == '\0')
goto foundneedle;
++rhaystack;
a = my_tolower (*++needle);
}
while (my_tolower (*rhaystack) == (int) a);
needle = rneedle; /* took the register-poor approach */
if (a == '\0')
break;
}
}
foundneedle:
return (char*) haystack;
ret0:
return 0;
}
Can you make this code faster, or do you know of a better implementation?
Note: I noticed that the GNU C Library now has a new implementation of strstr(), but I am not sure how easily it can be modified to be case-insensitive, or if it is in fact faster than the old one (in my case). I also noticed that the old implementation is still used for wide character strings, so if anyone knows why, please share.
Update
Just to make things clear—in case it wasn't already—I didn't write this function, it's a part of the GNU C Library. I only modified it to be case-insensitive.
Also, thanks for the tip about strcasestr() and checking out other implementations from other sources (like OpenBSD, FreeBSD, etc.). It seems to be the way to go. The code above is from 2003, which is why I posted it here in hope for a better version being available, which apparently it is. :)

The code you posted is about half as fast as strcasestr.
$ gcc -Wall -o my_stristr my_stristr.c
steve#solaris:~/code/tmp
$ gcc -Wall -o strcasestr strcasestr.c
steve#solaris:~/code/tmp
$ ./bench ./my_stristr > my_stristr.result ; ./bench ./strcasestr > strcasestr.result;
steve#solaris:~/code/tmp
$ cat my_stristr.result
run 1... time = 6.32
run 2... time = 6.31
run 3... time = 6.31
run 4... time = 6.31
run 5... time = 6.32
run 6... time = 6.31
run 7... time = 6.31
run 8... time = 6.31
run 9... time = 6.31
run 10... time = 6.31
average user time over 10 runs = 6.3120
steve#solaris:~/code/tmp
$ cat strcasestr.result
run 1... time = 3.82
run 2... time = 3.82
run 3... time = 3.82
run 4... time = 3.82
run 5... time = 3.82
run 6... time = 3.82
run 7... time = 3.82
run 8... time = 3.82
run 9... time = 3.82
run 10... time = 3.82
average user time over 10 runs = 3.8200
steve#solaris:~/code/tmp
The main function was:
int main(void)
{
char * needle="hello";
char haystack[1024];
int i;
for(i=0;i<sizeof(haystack)-strlen(needle)-1;++i)
{
haystack[i]='A'+i%57;
}
memcpy(haystack+i,needle, strlen(needle)+1);
/*printf("%s\n%d\n", haystack, haystack[strlen(haystack)]);*/
init_stristr();
for (i=0;i<1000000;++i)
{
/*my_stristr(haystack, needle);*/
strcasestr(haystack,needle);
}
return 0;
}
It was suitably modified to test both implementations. I notice as I am typing this up I left in the init_stristr call, but it shouldn't change things too much. bench is just a simple shell script:
#!/bin/bash
function bc_calc()
{
echo $(echo "scale=4;$1" | bc)
}
time="/usr/bin/time -p"
prog="$1"
accum=0
runs=10
for a in $(jot $runs 1 $runs)
do
echo -n "run $a... "
t=$($time $prog 2>&1| grep user | awk '{print $2}')
echo "time = $t"
accum=$(bc_calc "$accum+$t")
done
echo -n "average user time over $runs runs = "
echo $(bc_calc "$accum/$runs")

You can use StrStrI function which finds the first occurrence of a substring within a string. The comparison is not case-sensitive.
Don't forget to include its header - Shlwapi.h.
Check this out: http://msdn.microsoft.com/en-us/library/windows/desktop/bb773439(v=vs.85).aspx

use boost string algo. It is available, cross platform, and only a header file (no library to link in). Not to mention that you should be using boost anyway.
#include <boost/algorithm/string/find.hpp>
const char* istrstr( const char* haystack, const char* needle )
{
using namespace boost;
iterator_range<char*> result = ifind_first( haystack, needle );
if( result ) return result.begin();
return NULL;
}

For platform independent use:
const wchar_t *szk_wcsstri(const wchar_t *s1, const wchar_t *s2)
{
if (s1 == NULL || s2 == NULL) return NULL;
const wchar_t *cpws1 = s1, *cpws1_, *cpws2;
char ch1, ch2;
bool bSame;
while (*cpws1 != L'\0')
{
bSame = true;
if (*cpws1 != *s2)
{
ch1 = towlower(*cpws1);
ch2 = towlower(*s2);
if (ch1 == ch2)
bSame = true;
}
if (true == bSame)
{
cpws1_ = cpws1;
cpws2 = s2;
while (*cpws1_ != L'\0')
{
ch1 = towlower(*cpws1_);
ch2 = towlower(*cpws2);
if (ch1 != ch2)
break;
cpws2++;
if (*cpws2 == L'\0')
return cpws1_-(cpws2 - s2 - 0x01);
cpws1_++;
}
}
cpws1++;
}
return NULL;
}

Why do you use _strlwr(string); in init_stristr()? It's not a standard function. Presumably it's for locale support, but as it's not standard, I'd just use:
char_table[i] = tolower(i);

I'd advice you to take some of the common strcasestr implementation that already exists. For example of glib, glibc, OpenBSD, FreeBSD, etc. You can search for more with google.com/codesearch. You can then make some performance measurements and compare the different implementation.

Assuming both input strings are already lowercase.
int StringInStringFindFirst(const char* p_cText, const char* p_cSearchText)
{
int iTextSize = strlen(p_cText);
int iSearchTextSize = strlen(p_cSearchText);
char* p_cFound = NULL;
if(iTextSize >= iSearchTextSize)
{
int iCounter = 0;
while((iCounter + iSearchTextSize) <= iTextSize)
{
if(memcmp( (p_cText + iCounter), p_cSearchText, iSearchTextSize) == 0)
return iCounter;
iCounter ++;
}
}
return -1;
}
You could also, try using masks... if for example most of the strings you are going to compare only contains chars from a to z, maybe it's worth to do something like this.
long GetStringMask(const char* p_cText)
{
long lMask=0;
while(*p_cText != '\0')
{
if (*p_cText>='a' && *p_cText<='z')
lMask = lMask | (1 << (*p_cText - 'a') );
else if(*p_cText != ' ')
{
lMask = 0;
break;
}
p_cText ++;
}
return lMask;
}
Then...
int main(int argc, char* argv[])
{
char* p_cText = "this is a test";
char* p_cSearchText = "test";
long lTextMask = GetStringMask(p_cText);
long lSearchMask = GetStringMask(p_cSearchText);
int iFoundAt = -1;
// If Both masks are Valid
if(lTextMask != 0 && lSearchMask != 0)
{
if((lTextMask & lSearchMask) == lSearchMask)
{
iFoundAt = StringInStringFindFirst(p_cText, p_cSearchText);
}
}
else
{
iFoundAt = StringInStringFindFirst(p_cText, p_cSearchText);
}
return 0;
}

This will not consider the locale, but If you can change the IS_ALPHA and TO_UPPER you can make it to consider it.
#define IS_ALPHA(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
#define TO_UPPER(c) ((c) & 0xDF)
char * __cdecl strstri (const char * str1, const char * str2){
char *cp = (char *) str1;
char *s1, *s2;
if ( !*str2 )
return((char *)str1);
while (*cp){
s1 = cp;
s2 = (char *) str2;
while ( *s1 && *s2 && (IS_ALPHA(*s1) && IS_ALPHA(*s2))?!(TO_UPPER(*s1) - TO_UPPER(*s2)):!(*s1-*s2))
++s1, ++s2;
if (!*s2)
return(cp);
++cp;
}
return(NULL);
}

If you want to shed CPU cycles, you might consider this - let's assume that we're dealing with ASCII and not Unicode.
Make a static table with 256 entries. Each entry in the table is 256 bits.
To test whether or not two characters are equal, you do something like this:
if (BitLookup(table[char1], char2)) { /* match */ }
To build the table, you set a bit everywhere in table[char1] where you consider it a match for char2. So in building the table you would set the bits at the index for 'a' and 'A' in the 'a'th entry (and the 'A'th entry).
Now this is going to be slowish to do the bit lookup (bit look up will be a shift, mask and add most likely), so you could use instead a table of bytes so you use 8 bits to represent 1 bit. This will take 32K - so hooray - you've hit a time/space trade-off! We might want to make the table more flexible, so let's say we do this instead - the table will define congruences instead.
Two characters are considered congruent if and only if there is a function that defines them as equivalent. So 'A' and 'a' are congruent for case insensitivity. 'A', 'À', 'Á' and 'Â' are congruent for diacritical insensitivity.
So you define bitfields that correspond to your congruencies
#define kCongruentCase (1 << 0)
#define kCongruentDiacritical (1 << 1)
#define kCongruentVowel (1 << 2)
#define kCongruentConsonant (1 << 3)
Then your test is something like this:
inline bool CharsAreCongruent(char c1, char c2, unsigned char congruency)
{
return (_congruencyTable[c1][c2] & congruency) != 0;
}
#define CaseInsensitiveCharEqual(c1, c2) CharsAreCongruent(c1, c2, kCongruentCase)
This kind of bit fiddling with ginormous tables is the heart of ctype, by the by.

If you can control the needle string so that it is always in lower case, then you can write a modified version of stristr() to avoid the lookups for that, and thus speed up the code. It isn't as general, but it can be faster - slightly faster. Similar comments apply to the haystack, but you are more likely to be reading the haystack from sources outside your control for you cannot be certain that the data meets the requirement.
Whether the gain in performance is worth it is another question altogether. For 99% of applications, the answer is "No, it is not worth it". Your application might be one of the tiny minority where it matters. More likely, it is not.

Related

Searching for an exact string match in a (arbitrary large) stream - C++

I am building a simple multi-server for string matching. I handle multiple clients at the same time by using sockets and select. The only job that the server does is this: a client connects to a server and sends a needle (of size less than 10 GB) and a haystack (of arbitrary size) as a stream through a network socket. Needle and haystack are an arbitrary binary data.
Server needs to search the haystack for all occurrences of the needle (as an exact string match) and sends a number of needle matches back to the client. Server needs to process clients on the fly and be able to handle any input in a reasonable time (that is a search algorithm have to have a linear time complexity).
To do this I obviously need to split the haystack into a small parts (possibly smaller than the needle) in order to process them as they are coming through the network socket. That is I would need a search algorithm that is able to handle a string, that is split into parts and search in it, the same way as strstr(...) does.
I could not find any standard C or C++ library function nor a Boost library object that could handle a string by parts. If I am not mistaken, algorithms in strstr(), string.find() and Boost searching/knuth_morris_pratt.hpp are only able to handle the search, when a whole haystack is in a continuous block of memory. Or is there some trick, that I could use to search a string by parts that I am missing? Do you guys know of any C/C++ library, that is able to cope with such a large needles and haystacks resp. that is able to handle haystack streams or search in haystack by parts?
I did not find any useful library by googling and hence I was forced to create my own variation of Knuth Morris Pratt algorithm, that is able to remember its own state (shown bellow). However I do not find it to be an optimal solution, as a well tuned string searching algorithm would surely perform better in my opinion, and it would be a less worry for a debugging later.
So my question is:
Is there some more elegant way to search in a large haystack stream by parts, other than creating my own search algorithm? Is there any trick how to use a standard C string library for this? Is there some C/C++ library that is specialized for a this kind of task?
Here is a (part of) code of my midified KMP algorithm:
#include <cstdlib>
#include <cstring>
#include <cstdio>
class knuth_morris_pratt {
const char* const needle;
const size_t needle_len;
const int* const lps; // a longest proper suffix table (skip table)
// suffix_len is an ofset of a longest haystack_part suffix matching with
// some prefix of the needle. suffix_len myst be shorter than needle_len.
// Ofset is defined as a last matching character in a needle.
size_t suffix_len;
size_t match_count; // a number of needles found in haystack
public:
inline knuth_morris_pratt(const char* needle, size_t len) :
needle(needle), needle_len(len),
lps( build_lps_array() ), suffix_len(0),
match_count(len == 0 ? 1 : 0) { }
inline ~knuth_morris_pratt() { free((void*)lps); }
void search_part(const char* haystack_part, size_t hp_len); // processes a given part of the haystack stream
inline size_t get_match_count() { return match_count; }
private:
const int* build_lps_array();
};
// Worst case complexity: linear space, linear time
// see: https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/
// see article: KNUTH D.E., MORRIS (Jr) J.H., PRATT V.R., 1977, Fast pattern matching in strings
void knuth_morris_pratt::search_part(const char* haystack_part, size_t hp_len) {
if(needle_len == 0) {
match_count += hp_len;
return;
}
const char* hs = haystack_part;
size_t i = 0; // index for txt[]
size_t j = suffix_len; // index for pat[]
while (i < hp_len) {
if (needle[j] == hs[i]) {
j++;
i++;
}
if (j == needle_len) {
// a needle found
match_count++;
j = lps[j - 1];
}
else if (i < hp_len && needle[j] != hs[i]) {
// Do not match lps[0..lps[j-1]] characters,
// they will match anyway
if (j != 0)
j = lps[j - 1];
else
i = i + 1;
}
}
suffix_len = j;
}
const int* knuth_morris_pratt::build_lps_array() {
int* const new_lps = (int*)malloc(needle_len);
// check_cond_fatal(new_lps != NULL, "Unable to alocate memory in knuth_morris_pratt(..)");
// length of the previous longest prefix suffix
size_t len = 0;
new_lps[0] = 0; // lps[0] is always 0
// the loop calculates lps[i] for i = 1 to M-1
size_t i = 1;
while (i < needle_len) {
if (needle[i] == needle[len]) {
len++;
new_lps[i] = len;
i++;
}
else // (pat[i] != pat[len])
{
// This is tricky. Consider the example.
// AAACAAAA and i = 7. The idea is similar
// to search step.
if (len != 0) {
len = new_lps[len - 1];
// Also, note that we do not increment
// i here
}
else // if (len == 0)
{
new_lps[i] = 0;
i++;
}
}
}
return new_lps;
}
int main()
{
const char* needle = "lorem";
const char* p1 = "sit voluptatem accusantium doloremque laudantium qui dolo";
const char* p2 = "rem ipsum quia dolor sit amet";
const char* p3 = "dolorem eum fugiat quo voluptas nulla pariatur?";
knuth_morris_pratt searcher(needle, strlen(needle));
searcher.search_part(p1, strlen(p1));
searcher.search_part(p2, strlen(p2));
searcher.search_part(p3, strlen(p3));
printf("%d \n", (int)searcher.get_match_count());
return 0;
}
You can have a look at BNDM, which has same performances as KMP:
O(m) for preprocessing
O(n) for matching.
It is used for nrgrep, the sources of which can be found here which containts C sources.
C source for BNDM algo are here.
See here for more information.
If I have well understood your problem, you want to search if a large std::string received part by part contains a substring.
If it is the case, I think you can store for each iteration the overlapping section between two contiguous received packets. And then you just have to check for each iteration that either the overlap or the packet contains the desired pattern to find.
In the example below, I consider the following contains() function to search a pattern in a std::string:
bool contains(const std::string & str, const std::string & pattern)
{
bool found(false);
if(!pattern.empty() && (pattern.length() < str.length()))
{
for(size_t i = 0; !found && (i <= str.length()-pattern.length()); ++i)
{
if((str[i] == pattern[0]) && (str.substr(i, pattern.length()) == pattern))
{
found = true;
}
}
}
return found;
}
Example:
std::string pattern("something"); // The pattern we want to find
std::string end_of_previous_packet(""); // The first part of overlapping section
std::string beginning_of_current_packet(""); // The second part of overlapping section
std::string overlap; // The string to store the overlap at each iteration
bool found(false);
while(!found && !all_data_received()) // stop condition
{
// Get the current packet
std::string packet = receive_part();
// Set the beginning of the current packet
beginning_of_current_packet = packet.substr(0, pattern.length());
// Build the overlap
overlap = end_of_previous_packet + beginning_of_current_packet;
// If the overlap or the packet contains the pattern, we found a match
if(contains(overlap, pattern) || contains(packet, pattern))
found = true;
// Set the end of previous packet for the next iteration
end_of_previous_packet = packet.substr(packet.length()-pattern.length());
}
Of course, in this example I made the assumption that the method receive_part() already exists. Same thing for the all_data_received() function. It is just an example to illustrate the idea.
I hope it will help you to find a solution.

How to walk along UTF-16 codepoints?

I have the following definition of varying ranges which correspond to codepoints and surrogate pairs:
https://en.wikipedia.org/wiki/UTF-16#Description
My code is based on ConvertUTF.c from the Clang implementation.
I'm currently struggling with wrapping my head around how to do this.
The code which is most relevant from LLVM's implementation that I'm trying to understand is:
unsigned short bytesToWrite = 0;
const u32char_t byteMask = 0xBF;
const u32char_t byteMark = 0x80;
u8char_t* target = *targetStart;
utf_result result = kConversionOk;
const u16char_t* source = *sourceStart;
while (source < sourceEnd) {
u32char_t ch;
const u16char_t* oldSource = source; /* In case we have to back up because of target overflow. */
ch = *source++;
/* If we have a surrogate pair, convert to UTF32 first. */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
/* If the 16 bits following the high surrogate are in the source buffer... */
if (source < sourceEnd) {
u32char_t ch2 = *source;
/* If it's a low surrogate, convert to UTF32. */
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
++source;
} else if (flags == kStrictConversion) { /* it's an unpaired high surrogate */
--source; /* return to the illegal value itself */
result = kSourceIllegal;
break;
}
} else { /* We don't have the 16 bits following the high surrogate. */
--source; /* return to the high surrogate */
result = kSourceExhausted;
break;
}
} else if (flags == kStrictConversion) {
/* UTF-16 surrogate values are illegal in UTF-32 */
if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
--source; /* return to the illegal value itself */
result = kSourceIllegal;
break;
}
}
...
Specifically they say in the comments:
If we have a surrogate pair, convert to UTF32 first.
and then:
If it's a low surrogate, convert to UTF32.
I'm getting lost along the lines of "if we have.." and "if it's.." and my response being while reading the comments: "what do we have?" and "what is it?"
I believe ch and ch2 is the first char16 and the next char16 (if one exists), checking to see if the second is part of a surrogate pair, and then walking along each char16 (or do you walk along pairs of chars?) until the end.
I'm getting lost along the lines of how they are using UNI_SUR_HIGH_START, UNI_SUR_HIGH_END, UNI_SUR_LOW_START, UNI_SUR_LOW_END, and their use of halfShift and halfBase.
Wikipedia also notes:
There was an attempt to rename "high" and "low" surrogates to "leading" and "trailing" due to their numerical values not matching their names. This appears to have been abandoned in recent Unicode standards.
Making note of "leading" and "trailing" in any responses may help clarify things as well.
ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END checks if ch is in the range where high surrogates are, that is, [D800-DBFF]. That's it. Then the same is done for checking if ch2 is in the range where low surrogates are, meaning [DC00-DFFF].
halfShift and halfBase are just used as prescribed by the UTF-16 decoding algorithm, which turns a pair of surrogates into the scalar value they represent. There's nothing special being done here; it's the textbook implementation of that algorithm, without any tricks.

incompatible implicit declaration of built-in function 'malloc'

My program is
#include <iostream>
char * grabNumber ( char * begin )
{
// Interpret *begin as the start of a double and add the characters to a
// string retstr
char * begincpy = begin;
int foundDot = 0;
while ((*begin >= '0' && *begin <= '9') || *begin == '.')
{
if (*begin == '.')
{
if (foundDot == 0) foundDot = 1;
else break;
}
++begin;
}
long n = begin - begincpy; // # of characters parsed
char * retstr = malloc(sizeof(char) * (n + 1)); // string to be returned
for (long k = 0; k < n; ++k) retstr[k] = *begincpy++;
retstr[n] = '\0';
return retstr;
}
int main()
{
char str [] = "abc3.14def";
std::cout << grabNumber(str+3); // should print "3.14"
return 0;
}
and the errors I'm getting are
Line 20: warning: incompatible implicit declaration of built-in
function 'malloc' Line 21: error: 'for' loop initial declaration used
outside C99 mode
corresponding to the 2 lines
char * retstr = malloc(sizeof(char) * (n + 1)); // string to be returned
for (long k = 0; k < n; ++k) retstr[k] = *begincpy++;
See: http://codepad.org/c2tNGFEo
Also, is there a way that I can cut down on the redundancy of my algorithm, because it's checking for a . twice in each iteration of the while loop, and yet I can't think of a cleaner way to handle the fact that I need to stop the loop if we've run into a second .
I'm guessing you are trying to write C++ as you have included iostream and used std::cout. However the error message shows you are using a C compiler. I also guess that you wrote gcc myprogram.c. To get C++ compilation you either need to write g++ instead of gcc, or rename your file to have a .cc extension. (Preferably both).
To use malloc you need #include <cstdlib>.
Also you may need using namespace std; or using std::malloc; after that; and you will need to cast the value returned by malloc because C++ does not implicitly convert from void * to other pointer types.
However malloc is rarely used in C++ as it does not initialize non-trivial objects properly. Consider changing this code to:
char * retstr = new char[n+1];
then you won't need any extra includes.
But this is still a weak design as you are now relying on the caller to free the memory. In fact your main function has a memory leak as it does not free the memory.
In C++ it is better style to have memory managed by a container class that knows about memory management; so the programmer can't make any mistakes. (Incase you are wondering, this usually doesn't introduce any inefficiency and may even speed things up).
A much better approach would be to #include <string>, make the function return std::string, and change the last five lines of your function to:
return { begincpy, begin };
or if using a pre-C++11 compiler,
return std::string(begincpy, begin);
Let's start by observing that you are not writing C, you are writing C++. You should fix your compilation/project settings so you compile your files using the C++ compiler instead of the C compiler. This will fix the compilation error about the for loop also, as that is not valid in C before C-99.
Secondly, the first warning is actually due to a missing include. In C you would #include <stdlib.h> in C++ you'd #include <cstdlib> to get the definitions from the C standard library.

Constant time password digest compares using Crypto++

I'm writing a program which hashes passwords with the pbkdf2 method using cryptopp.
I have problems with validating the passwords. I have tried to compare the output in "length-constant" time but it always fails and returns false.
// a and b are std strings containing the output of the DeriveKey function
unsigned diff = a.length() ^ b.length();
for(unsigned i = 0; i < a.length() && i < b.length(); i++)
{
diff |= (unsigned)a[i] ^ (unsigned)b[i];
}
bool equal = diff == 0;
Is using "slow equals" even the right way to validate pbkdf2 passwords? I am a bit confused on this.
I'm writing a program which hashes passwords with the pbkdf2 method using cryptopp.
You linked to the Crypto++ main page, and not a your particular use of PBKDF. Here's some code just in case (it uses the IETF test vectors from RFC 6070):
int main(int argc, char* argv[])
{
byte password[] ="password";
size_t plen = strlen((const char*)password);
byte salt[] = "salt";
size_t slen = strlen((const char*)salt);
int c = 1;
byte derived[20];
PKCS5_PBKDF2_HMAC<CryptoPP::SHA1> pbkdf2;
pbkdf2.DeriveKey(derived, sizeof(derived), 0, password, plen, salt, slen, c);
string result;
HexEncoder encoder(new StringSink(result));
encoder.Put(derived, sizeof(derived));
encoder.MessageEnd();
cout << "Derived: " << result << endl;
return 0;
}
I have tried to compare the output in "length-constant" time but it always fails and returns false.
Crypto++ has a constant time compare built in. Use VerifyBufsEqual from misc.h. The source is available in misc.cpp.
$ cd cryptopp
$ grep -R VerifyBufsEqual *
cryptlib.cpp: return VerifyBufsEqual(digest, digestIn, digestLength);
default.cpp: if (!VerifyBufsEqual(check, check+BLOCKSIZE, BLOCKSIZE))
fipstest.cpp: if (!VerifyBufsEqual(expectedModuleMac, actualMac, macSize))
fipstest.cpp: if (VerifyBufsEqual(expectedModuleMac, actualMac, macSize))
misc.cpp:bool VerifyBufsEqual(const byte *buf, const byte *mask, size_t count)
misc.h:CRYPTOPP_DLL bool CRYPTOPP_API VerifyBufsEqual(const byte *buf1, const byte *buf2, size_t count);
pssr.cpp: valid = VerifyBufsEqual(representative + representativeByteLength - u, hashIdentifier.first, hashIdentifier.second) && valid;
pubkey.cpp: return VerifyBufsEqual(representative, computedRepresentative, computedRepresentative.size());
secblock.h: return m_size == t.m_size && VerifyBufsEqual(m_ptr, t.m_ptr, m_size*sizeof(T));
What I'm not clear about: VerifyBufsEqual is predicated upon buffers of equal lengths. I'm not sure if its OK to overlook the "not-equal length" case.
There's also a question on the Information Stack Exchange that may be relevant: Timing attacks on password hashes. But I'm not certain if/how it generalizes to arbitrary buffer compares.
The question piqued my interest in an answer to the general problem (the question has always been there): Constant time compares when array sizes are not equal?. That should tell us if we have the proper tools in VerifyBufsEqual (Crypto++), CRYPTO_memcmp (OpenSSL), etc.

Memory comparison (with difference position)

Is there a way to compare two blocks of memory, and know at which point they differ (memcmp() does not meet this requirement)? I wouldn't want to perform costly loops. Thanks in advance.
Regards, Neo_b
std::mismatch will do that for you in conjunction std::distance.
Compared to whatever else you are doing, a loop is cheap: the big cost will be retrieving the data from ram (or disk!) in the first place.
You can't avoid looping with memory comparison of more than a few bytes. Write the algorithm as you can imagine it. It's simple enough and you might be amazed how well the compiler optimizes code like this.
memcmp simply does a "costly loop", byte for byte. For example, here is Microsoft's implementation:
EXTERN_C int __cdecl memcmp(const void *Ptr1, const void *Ptr2, size_t Count)
{
INT v = 0;
BYTE *p1 = (BYTE *)Ptr1;
BYTE *p2 = (BYTE *)Ptr2;
while(Count-- > 0 && v == 0) {
v = *(p1++) - *(p2++);
}
return v;
}
Most other implementations do the exact same thing. For your needs, you could do something like this:
long my_memcmp(const void *Ptr1, const void *Ptr2, size_t Count)
{
INT v = 0;
long pos = 0;
BYTE *p1 = (BYTE *)Ptr1;
BYTE *p2 = (BYTE *)Ptr2;
while(Count-- > 0 && v == 0)
{
v = *(p1++) - *(p2++);
if (v == 0)
pos++;
else
break;
}
return pos;
}
If there was a better way of comparing two blocks of memory, memcmp would be reimplemented to do that.
Having said that often, memcmp has a default portable implementation in the standard C library but there are is often implemented by the compiler itself as a builtin function. This builtin function should be highly optimized for the target architecture.So take the library implementation with a pinch of salt.
You will always need a loop. But you could benchmark if looping by 4 bytes (cast to int*) or by 8 bytes (uint64_t or long long int) is faster than the naive per-byte solution.
Even better, depending on the length (say, >1kb) you could unroll the loop, meaning you check e.g. per 8 int/uint64_t and on a mismatch pinpoint the first differing byte.
uint64_t *bigsteps1 = (uint64_t*)m1;
uint64_t *bigsteps2 = (uint64_t*)m2;
int steps = min(m1_len,m2_len)/sizeof(uint64_t);
int i;
for ( i=0; i<steps; i+=8 )
{
if ( bigsteps1[i] != bigsteps2[i]
|| bigsteps1[i+1] != bigsteps2[i+1]
/// ....
|| bigsteps1[i+7] != bigsteps2[i+7] ) break;
}
// i<steps tells if we found a difference
// end game is trivial, left as an excercise to the reader.
The loop unroll may also backfire, for you have all these +N things in there and the i+=8 as well. Benchmark to be sure.
ps also check memory alignment: this will be fastest when m1&0xff == m2&0xff == 0