char[] to hex string exercise - c++

Below is my current char* to hex string function. I wrote it as an exercise in bit manipulation. It takes ~7ms on a AMD Athlon MP 2800+ to hexify a 10 million byte array. Is there any trick or other way that I am missing?
How can I make this faster?
Compiled with -O3 in g++
static const char _hex2asciiU_value[256][2] =
{ {'0','0'}, {'0','1'}, /* snip..., */ {'F','E'},{'F','F'} };
std::string char_to_hex( const unsigned char* _pArray, unsigned int _len )
{
std::string str;
str.resize(_len*2);
char* pszHex = &str[0];
const unsigned char* pEnd = _pArray + _len;
clock_t stick, etick;
stick = clock();
for( const unsigned char* pChar = _pArray; pChar != pEnd; pChar++, pszHex += 2 ) {
pszHex[0] = _hex2asciiU_value[*pChar][0];
pszHex[1] = _hex2asciiU_value[*pChar][1];
}
etick = clock();
std::cout << "ticks to hexify " << etick - stick << std::endl;
return str;
}
Updates
Added timing code
Brian R. Bondy: replace the std::string with a heap alloc'd buffer and change ofs*16 to ofs << 4 - however the heap allocated buffer seems to slow it down? - result ~11ms
Antti Sykäri:replace inner loop with
int upper = *pChar >> 4;
int lower = *pChar & 0x0f;
pszHex[0] = pHex[upper];
pszHex[1] = pHex[lower];
result ~8ms
Robert: replace _hex2asciiU_value with a full 256-entry table, sacrificing memory space but result ~7ms!
HoyHoy: Noted it was producing incorrect results

This assembly function (based off my previous post here, but I had to modify the concept a bit to get it to actually work) processes 3.3 billion input characters per second (6.6 billion output characters) on one core of a Core 2 Conroe 3Ghz. Penryn is probably faster.
%include "x86inc.asm"
SECTION_RODATA
pb_f0: times 16 db 0xf0
pb_0f: times 16 db 0x0f
pb_hex: db 48,49,50,51,52,53,54,55,56,57,65,66,67,68,69,70
SECTION .text
; int convert_string_to_hex( char *input, char *output, int len )
cglobal _convert_string_to_hex,3,3
movdqa xmm6, [pb_f0 GLOBAL]
movdqa xmm7, [pb_0f GLOBAL]
.loop:
movdqa xmm5, [pb_hex GLOBAL]
movdqa xmm4, [pb_hex GLOBAL]
movq xmm0, [r0+r2-8]
movq xmm2, [r0+r2-16]
movq xmm1, xmm0
movq xmm3, xmm2
pand xmm0, xmm6 ;high bits
pand xmm2, xmm6
psrlq xmm0, 4
psrlq xmm2, 4
pand xmm1, xmm7 ;low bits
pand xmm3, xmm7
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm3
pshufb xmm4, xmm0
pshufb xmm5, xmm2
movdqa [r1+r2*2-16], xmm4
movdqa [r1+r2*2-32], xmm5
sub r2, 16
jg .loop
REP_RET
Note it uses x264 assembly syntax, which makes it more portable (to 32-bit vs 64-bit, etc). To convert this into the syntax of your choice is trivial: r0, r1, r2 are the three arguments to the functions in registers. Its a bit like pseudocode. Or you can just get common/x86/x86inc.asm from the x264 tree and include that to run it natively.
P.S. Stack Overflow, am I wrong for wasting time on such a trivial thing? Or is this awesome?

At the cost of more memory you can create a full 256-entry table of the hex codes:
static const char _hex2asciiU_value[256][2] =
{ {'0','0'}, {'0','1'}, /* ..., */ {'F','E'},{'F','F'} };
Then direct index into the table, no bit fiddling required.
const char *pHexVal = pHex[*pChar];
pszHex[0] = pHexVal[0];
pszHex[1] = pHexVal[1];

Faster C Implmentation
This runs nearly 3x faster than the C++ implementation. Not sure why as it's pretty similar. For the last C++ implementation that I posted it took 6.8 seconds to run through a 200,000,000 character array. The implementation took only 2.2 seconds.
#include <stdio.h>
#include <stdlib.h>
char* char_to_hex(const unsigned char* p_array,
unsigned int p_array_len,
char** hex2ascii)
{
unsigned char* str = malloc(p_array_len*2+1);
const unsigned char* p_end = p_array + p_array_len;
size_t pos=0;
const unsigned char* p;
for( p = p_array; p != p_end; p++, pos+=2 ) {
str[pos] = hex2ascii[*p][0];
str[pos+1] = hex2ascii[*p][1];
}
return (char*)str;
}
int main()
{
size_t hex2ascii_len = 256;
char** hex2ascii;
int i;
hex2ascii = malloc(hex2ascii_len*sizeof(char*));
for(i=0; i<hex2ascii_len; i++) {
hex2ascii[i] = malloc(3*sizeof(char));
snprintf(hex2ascii[i], 3,"%02X", i);
}
size_t len = 8;
const unsigned char a[] = "DO NOT WANT";
printf("%s\n", char_to_hex((const unsigned char*)a, len, (char**)hex2ascii));
}

Operate on 32 bits at a time (4 chars), then deal with the tail if needed. When I did this exercise with url encoding a full table lookup for each char was slightly faster than logic constructs, so you may want to test this in context as well to take caching issues into account.

It works for me with unsigned char:
unsigned char c1 = byteVal >> 4;
unsigned char c2 = byteVal & 0x0f;
c1 += c1 <= 9 ? '0' : ('a' - 10);
c2 += c2 <= 9 ? '0' : ('a' - 10);
std::string sHex(" ");
sHex[0] = c1 ;
sHex[1] = c2 ;
//sHex - contain what we need. For example "0f"

For one, instead of multiplying by 16 do a bitshift << 4
Also don't use the std::string, instead just create a buffer on the heap and then delete it. It will be more efficient than the object destruction that is needed from the string.

not going to make a lot of difference... *pChar-(ofs*16) can be done with [*pCHar & 0x0F]

This is my version, which, unlike the OP's version, doesn't assume that std::basic_string has its data in contiguous region:
#include <string>
using std::string;
static char const* digits("0123456789ABCDEF");
string
tohex(string const& data)
{
string result(data.size() * 2, 0);
string::iterator ptr(result.begin());
for (string::const_iterator cur(data.begin()), end(data.end()); cur != end; ++cur) {
unsigned char c(*cur);
*ptr++ = digits[c >> 4];
*ptr++ = digits[c & 15];
}
return result;
}

I assume this is Windows+IA32.
Try to use short int instead of the two hexadecimal letters.
short int hex_table[256] = {'0'*256+'0', '1'*256+'0', '2'*256+'0', ..., 'E'*256+'F', 'F'*256+'F'};
unsigned short int* pszHex = &str[0];
stick = clock();
for (const unsigned char* pChar = _pArray; pChar != pEnd; pChar++)
*pszHex++ = hex_table[*pChar];
etick = clock();

Changing
ofs = *pChar >> 4;
pszHex[0] = pHex[ofs];
pszHex[1] = pHex[*pChar-(ofs*16)];
to
int upper = *pChar >> 4;
int lower = *pChar & 0x0f;
pszHex[0] = pHex[upper];
pszHex[1] = pHex[lower];
results in roughly 5% speedup.
Writing the result two bytes at time as suggested by Robert results in about 18% speedup. The code changes to:
_result.resize(_len*2);
short* pszHex = (short*) &_result[0];
const unsigned char* pEnd = _pArray + _len;
const char* pHex = _hex2asciiU_value;
for(const unsigned char* pChar = _pArray;
pChar != pEnd;
pChar++, ++pszHex )
{
*pszHex = bytes_to_chars[*pChar];
}
Required initialization:
short short_table[256];
for (int i = 0; i < 256; ++i)
{
char* pc = (char*) &short_table[i];
pc[0] = _hex2asciiU_value[i >> 4];
pc[1] = _hex2asciiU_value[i & 0x0f];
}
Doing it 2 bytes at a time or 4 bytes at a time will probably result in even greater speedups, as pointed out by Allan Wind, but then it gets trickier when you have to deal with the odd characters.
If you're feeling adventurous, you might try to adapt Duff's device to do this.
Results are on an Intel Core Duo 2 processor and gcc -O3.
Always measure that you actually get faster results — a pessimization pretending to be an optimization is less than worthless.
Always test that you get the correct results — a bug pretending to be an optimization is downright dangerous.
And always keep in mind the tradeoff between speed and readability — life is too short for anyone to maintain unreadable code.
(Obligatory reference to coding for the violent psychopath who knows where you live.)

Make sure your compiler optimization is turned on to the highest working level.
You know, flags like '-O1' to '-03' in gcc.

I have found that using an index into an array, rather than a pointer, can speed things up a tick. It all depends on how your compiler chooses to optimize. The key is that the processor has instructions to do complex things like [i*2+1] in a single instruction.

If you're rather obsessive about speed here, you can do the following:
Each character is one byte, representing two hex values. Thus, each character is really two four-bit values.
So, you can do the following:
Unpack the four-bit values to 8-bit values using a multiplication or similar instruction.
Use pshufb, the SSSE3 instruction (Core2-only though). It takes an array of 16 8-bit input values and shuffles them based on the 16 8-bit indices in a second vector. Since you have only 16 possible characters, this fits perfectly; the input array is a vector of 0 through F characters, and the index array is your unpacked array of 4-bit values.
Thus, in a single instruction, you will have performed 16 table lookups in fewer clocks than it normally takes to do just one (pshufb is 1 clock latency on Penryn).
So, in computational steps:
A B C D E F G H I J K L M N O P (64-bit vector of input values, "Vector A") -> 0A 0B 0C 0D 0E 0F 0G 0H 0I 0J 0K 0L 0M 0N 0O 0P (128-bit vector of indices, "Vector B"). The easiest way is probably two 64-bit multiplies.
pshub [0123456789ABCDEF], Vector B

I'm not sure doing it more bytes at a time will be better... you'll probably just get tons of cache misses and slow it down significantly.
What you might try is to unroll the loop though, take larger steps and do more characters each time through the loop, to remove some of the loop overhead.

Consistently getting ~4ms on my Athlon 64 4200+ (~7ms with original code)
for( const unsigned char* pChar = _pArray; pChar != pEnd; pChar++) {
const char* pchars = _hex2asciiU_value[*pChar];
*pszHex++ = *pchars++;
*pszHex++ = *pchars;
}

The function as it is shown when I'm writing this produces incorrect output even when _hex2asciiU_value is fully specified. The following code works, and on my 2.33GHz Macbook Pro runs in about 1.9 seconds for 200,000,000 million characters.
#include <iostream>
using namespace std;
static const size_t _h2alen = 256;
static char _hex2asciiU_value[_h2alen][3];
string char_to_hex( const unsigned char* _pArray, unsigned int _len )
{
string str;
str.resize(_len*2);
char* pszHex = &str[0];
const unsigned char* pEnd = _pArray + _len;
const char* pHex = _hex2asciiU_value[0];
for( const unsigned char* pChar = _pArray; pChar != pEnd; pChar++, pszHex += 2 ) {
pszHex[0] = _hex2asciiU_value[*pChar][0];
pszHex[1] = _hex2asciiU_value[*pChar][1];
}
return str;
}
int main() {
for(int i=0; i<_h2alen; i++) {
snprintf(_hex2asciiU_value[i], 3,"%02X", i);
}
size_t len = 200000000;
char* a = new char[len];
string t1;
string t2;
clock_t start;
srand(time(NULL));
for(int i=0; i<len; i++) a[i] = rand()&0xFF;
start = clock();
t1=char_to_hex((const unsigned char*)a, len);
cout << "char_to_hex conversion took ---> " << (clock() - start)/(double)CLOCKS_PER_SEC << " seconds\n";
}

Related

OpenMP Strange Behavior - differences in performance

I want to speedup image processing code using OpenMP and I found some strange behavior in my code. I'm using Visual Studio 2019 and I also tried Intel C++ compiler with same result.
I'm not sure why is the code with OpenMP in some situations much slower than in the others. For example function divideImageDataWithParam() or difference between copyFirstPixelOnRow() and copyFirstPixelOnRowUsingTSize() using struct TSize as parameter of image data size. Why is performance of boxFilterRow() and boxFilterRow_OpenMP() so different a why isn't it with different radius size in program?
I created github repository for this little testing project:
https://github.com/Tb45/OpenMP-Strange-Behavior
Here are all results summarized:
https://github.com/Tb45/OpenMP-Strange-Behavior/blob/master/resuts.txt
I didn't find any explanation why is this happening or what am I doing wrong.
Thanks for your help.
I'm working on faster box filter and others for image processing algorithms.
typedef intptr_t int_t;
struct TSize
{
int_t width;
int_t height;
};
void divideImageDataWithParam(
const unsigned char * src, int_t srcStep, unsigned char * dst, int_t dstStep, TSize size, int_t param)
{
for (int_t y = 0; y < size.height; y++)
{
for (int_t x = 0; x < size.width; x++)
{
dst[y*dstStep + x] = src[y*srcStep + x]/param;
}
}
}
void divideImageDataWithParam_OpenMP(
const unsigned char * src, int_t srcStep, unsigned char * dst, int_t dstStep, TSize size, int_t param, bool parallel)
{
#pragma omp parallel for if(parallel)
for (int_t y = 0; y < size.height; y++)
{
for (int_t x = 0; x < size.width; x++)
{
dst[y*dstStep + x] = src[y*srcStep + x]/param;
}
}
}
Results of divideImageDataWithParam():
generateRandomImageData :: 3840x2160
numberOfIterations = 100
With Visual C++ 2019:
32bit 64bit
336.906ms 344.251ms divideImageDataWithParam
1832.120ms 6395.861ms divideImageDataWithParam_OpenMP single-thread parallel=false
387.152ms 1204.302ms divideImageDataWithParam_OpenMP multi-threaded parallel=true
With Intel C++ 19:
32bit 64bit
15.162ms 8.927ms divideImageDataWithParam
266.646ms 294.134ms divideImageDataWithParam_OpenMP single-threaded parallel=false
239.564ms 1195.556ms divideImageDataWithParam_OpenMP multi-threaded parallel=true
Screenshot from Intel VTune Amplifier, where divideImageDataWithParam_OpenMP() with parallel=false take most of the time in instruction mov to dst memory.
648trindade is right; it has to do with optimizations that cannot be done with openmp. But its not loop-unrolling or vectorization, its inlining which allows for a smart substitution.
Let me explain: Integer divisions are incredibly slow (64bit IDIV: ~40-100 Cycles). So whenever possible people (and compilers) try to avoid divisions. One trick you can use is to substitute a division with a multiplication and a shift. That only works if the divisor is known at compile time. This is the case because your function divideImageDataWithParam is inlined and PARAM is known. You can verify this by prepending it with __declspec(noinline). You will get the timings that you expected.
The openmp parallelization does not allow this trick because the function cannot be inlined and therefore param is not known at compile time and an expensive IDIV-instruction is generated.
Compiler output of divideImageDataWithParam (WIN10, MSVC2017, x64):
0x7ff67d151480 <+ 336> movzx ecx,byte ptr [r10+r8]
0x7ff67d151485 <+ 341> mov rax,r12
0x7ff67d151488 <+ 344> mul rax,rcx <------- multiply
0x7ff67d15148b <+ 347> shr rdx,3 <------- shift
0x7ff67d15148f <+ 351> mov byte ptr [r8],dl
0x7ff67d151492 <+ 354> lea r8,[r8+1]
0x7ff67d151496 <+ 358> sub r9,1
0x7ff67d15149a <+ 362> jne test!main+0x150 (00007ff6`7d151480)
And the openmp-version:
0x7ff67d151210 <+ 192> movzx eax,byte ptr [r10+rcx]
0x7ff67d151215 <+ 197> lea rcx,[rcx+1]
0x7ff67d151219 <+ 201> cqo
0x7ff67d15121b <+ 203> idiv rax,rbp <------- idiv
0x7ff67d15121e <+ 206> mov byte ptr [rcx-1],al
0x7ff67d151221 <+ 209> lea rax,[r8+rcx]
0x7ff67d151225 <+ 213> mov rdx,qword ptr [rbx]
0x7ff67d151228 <+ 216> cmp rax,rdx
0x7ff67d15122b <+ 219> jl test!divideImageDataWithParam$omp$1+0xc0 (00007ff6`7d151210)
Note 1) If you try out the compiler explorer (https://godbolt.org/) you will see that some compilers do the substitution for the openmp version too.
Note 2) As soon as the parameter is not known at compile time this optimization cannot be done anyway. So if you put your function into a library it will be slow. I'd do something like precomputing the division for all possible values and then do a lookup. This is even faster because the lookup table fits into 4-5 cache lines and L1 latency is only 3-4 cycles.
void divideImageDataWithParam(
const unsigned char * src, int_t srcStep, unsigned char * dst, int_t dstStep, TSize size, int_t param)
{
uint8_t tbl[256];
for(int i = 0; i < 256; i++) {
tbl[i] = i / param;
}
for (int_t y = 0; y < size.height; y++)
{
for (int_t x = 0; x < size.width; x++)
{
dst[y*dstStep + x] = tbl[src[y*srcStep + x]];
}
}
}
Also thanks for the interesting question, I learned a thing or two along the way! ;-)
This behavior is explained by the use of compiler optimizations: when enabled, divideImageDataWithParam sequential code will be subjected to a series of optimizations (loop-unrolling, vectorization, etc.) that divideImageDataWithParam_OpenMP parallel code probably is not, as it is certainly uncharacterized after the process of outlining parallel regions by the compiler.
If you compile this same code without optimizations, you will find that the runtime version of the sequential version is very similar to that of the parallel version with only one thread.
The maximum speedup of parallel version in this case is limited by the division of the original workload without optimizations. Optimizations in this case need to be writed manually.

convert to string with and without reinterpret_cast

I have been looking for code to get CPUID in Linux and came across several good examples. These are specific questions between difference in implementation. Below are unsigned integers and it uses reinterpret_cast with size_t as 12
struct CPUVendorID{
unsigned int ebx;
unsigned int edx;
unsigned int ecx;
string toString() const {
return string(reinterpret_cast<const char *>(this), 12);
}
};
...
CPUVendorID vendorID { .ebx = ebx, .edx = edx, .ecx = ecx };
string vendor = vendorID.toString();
another form of to get the same output with size_t as 4 is given below:
string vendor;
vendor += string((const char *)&cpuID.EBX(), 4);
vendor += string((const char *)&cpuID.EDX(), 4);
vendor += string((const char *)&cpuID.ECX(), 4);
cout << "CPU vendor = " << vendor << endl;
Both output 12 character string. Can someone explain me what is happening in the reinterpret_cast statement above ? I find this way of implementation very elegant, but I don't know why its working obviously 4*3=12. But, how does it manage to concatenate data from the 3 ebx, edx and ecx?
CPU's manufacturer ID string – a twelve-character ASCII string stored in EBX, EDX, ECX (in that order) CPUID wiki
Can someone explain me what is happening in the reinterpret_cast statement above?
The structure address this coincides with the address of its first member, which is int ebx. Since the following members are of the same type, there is no padding between them, so that this code treats the three int members as an int[3] array.

Fastest way to determine if two strings differ by a single character

I am writing a C++ algorithm that takes two strings and returns true if you can mutate from string a to string b by changing a single character to another.
The two strings must equal in size, and can only have one difference.
I also need to have access to the index that changed, and the character of strA that was altered.
I found a working algorithm, but it iterates through every single pair of words and is running way too slow on any large amount of input.
bool canChange(std::string const& strA, std::string const& strB, char& letter)
{
int dif = 0;
int position = 0;
int currentSize = (int)strA.size();
if(currentSize != (int)strB.size())
{
return false;
}
for(int i = 0; i < currentSize; ++i)
{
if(strA[i] != strB[i])
{
dif++;
position = i;
if(dif > 1)
{
return false;
}
}
}
if(dif == 1)
{
letter = strA[position];
return true;
}
else return false;
}
Any advice on optimization?
It's a bit hard to get away from examining all the characters in the strings, unless you can accept the occasional incorrect result.
I suggest using features of the standard library, and not trying to count the number of mismatches. For example;
#include <string>
#include <algorithm>
bool canChange(std::string const& strA, std::string const& strB, char& letter, std::size_t &index)
{
bool single_mismatch = false;
if (strA.size() == strB.size())
{
typedef std::string::const_iterator ci;
typedef std::pair<ci, ci> mismatch_result;
ci begA(strA.begin()), endA(strA.end());
mismatch_result result = std::mismatch(begA, endA, strB.begin());
if (result.first != endA) // found a mismatch
{
letter = *(result.first);
index = std::distance(begA, result.first);
// now look for a second mismatch
std::advance(result.first, 1);
std::advance(result.second, 1);
single_mismatch = (std::mismatch(result.first, endA, result.second).first == endA);
}
}
return single_mismatch;
}
This works for all versions. It can be simplified a little in C++11.
If the above returns true, then a single mismatch was found.
If the return value is false, then either the strings are different sizes, or the number of mismatches is not equal to 1 (either the strings are equal, or have more than one mismatch).
letter and index are unchanged if the strings are of different lengths or are exactly equal, but otherwise identify the first mismatch (value of the character in strA, and index).
If you want to optimize for mostly-identical strings, you could use x86 SSE/AVX vector instructions. Your basic idea looks fine: break as soon as you detect a second difference.
To find and count character differences, a sequence like PCMPEQB / PMOVMSKB / test-and-branch is probably good. (Use C/C++ intrinsic functions to get those vector instructions). When your vector loop detects non-zero differences in the current block, POPCNT the bitmask to see if you just found the first difference, or if you found two differences in the same block.
I threw together an untested and not-fully-fleshed out AVX2 version of what I'm describing. This code assumes string lengths are a multiple of 32. Stopping early and handling the last chunk with a cleanup epilogue is left as an exercise for the reader.
#include <immintrin.h>
#include <string>
// not tested, and doesn't avoid reading past the end of the string.
// TODO: epilogue to handle the last up-to-31 left-over bytes separately.
bool canChange_avx2_bmi(std::string const& strA, std::string const& strB, char& letter) {
size_t size = strA.size();
if (size != strB.size())
return false;
int diffs = 0;
size_t diffpos = 0;
size_t pos = 0;
do {
uint32_t diffmask = 0;
while( pos < size ) {
__m256i vecA = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(& strA[pos]));
__m256i vecB = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(& strB[pos]));
__m256i vdiff = _mm256_cmpeq_epi8(vecA, vecB);
diffmask = _mm256_movemask_epi8(vdiff);
pos += 32;
if (diffmask) break; // gcc makes worse code if you include && !diffmask in the while condition, instead of this break
}
if (diffmask) {
diffpos = pos + _tzcnt_u32(diffmask); // position of the lowest set bit. Could safely use BSF rather than TZCNT here, since we only run when diffmask is non-zero.
diffs += _mm_popcnt_u32(diffmask);
}
} while(pos < size && diffs <= 1);
if (diffs == 1) {
letter = strA[diffpos];
return true;
}
return false;
}
The ugly break instead of including that in the while condition apparently helps gcc generate better code. The do{}while() also matches up with how I want the asm to come out. I didn't try using a for or while loop to see what gcc would do.
The inner loop is really tight this way:
.L14:
cmp rcx, r8
jnb .L10 # the while(pos<size) condition
.L6: # entry point for first iteration, because gcc duplicates the pos<size test ahead of the loop
vmovdqu ymm0, YMMWORD PTR [r9+rcx] # tmp118,* pos
vpcmpeqb ymm0, ymm0, YMMWORD PTR [r10+rcx] # tmp123, tmp118,* pos
add rcx, 32 # pos,
vpmovmskb eax, ymm0 # tmp121, tmp123
test eax, eax # tmp121
je .L14 #,
In theory, this should run at one iteration per 2 clocks (Intel Haswell). There are 7 fused-domain uops in the loop. (Would be 6, but 2-reg addressing modes apparently can't micro-fuse on SnB-family CPUs.) Since two of the uops are loads, not ALU, this throughput might be achievable on SnB/IvB as well.
This should be exceptionally good for flying over regions where the two strings are identical. The overhead of correctly handling arbitrary string lengths will make this potentially slower than a simple scalar function if strings are short, and/or have multiple differences early on.
How big is your input?
I'd think that the strA[i], strB[i] has function call overhead unless it's inlined. So make sure you do your performance test with inlining turned on and compiled with release. Otherwise, try getting the bytes as a char* with strA.c_str().
If all that fails and it's still not fast enough, try breaking you string into chunks and using memcmp or strncmp on the chunks. If no difference, move to the next chunk until you reach the end or find a difference. If a difference is found, do your trivial byte by byte compare until you find the difference. I suggest this route because memcmp is often faster than your trivial implementations as they can make use of the processor SSE extensions and so forth to do very fast compares.
Also, there is a problem with your code. You're assuming strA is longer than strB and only checking the length of A for the array accessors.

How to fill memory fast with a `int32_t` value?

Is there a function (SSEx intrinsics is OK) which will fill the memory with a specified int32_t value? For instance, when this value is equal to 0xAABBCC00 the result memory should look like:
AABBCC00AABBCC00AABBCC00AABBCC00AABBCC00
AABBCC00AABBCC00AABBCC00AABBCC00AABBCC00
AABBCC00AABBCC00AABBCC00AABBCC00AABBCC00
AABBCC00AABBCC00AABBCC00AABBCC00AABBCC00
...
I could use std::fill or simple for-loop, but it is not fast enough.
Resizing of a vector performed only once in the beginning of program, this is not an issue. The bottleneck is filling the memory.
Simplified code:
struct X
{
typedef std::vector<int32_t> int_vec_t;
int_vec_t buffer;
X() : buffer( 5000000 ) { /* some more action */ }
~X() { /* some code here */ }
// the following function is called 25 times per second
const int_vec_t& process( int32_t background, const SOME_DATA& data );
};
const X::int_vec_t& X::process( int32_t background, const SOME_DATA& data )
{
// the following one string takes 30% of total time of #process function
std::fill( buffer.begin(), buffer.end(), background );
// some processing
// ...
return buffer;
}
This is how I would do it (please excuse the Microsoft-ness of it):
VOID FillInt32(__out PLONG M, __in LONG Fill, __in ULONG Count)
{
__m128i f;
// Fix mis-alignment.
if ((ULONG_PTR)M & 0xf)
{
switch ((ULONG_PTR)M & 0xf)
{
case 0x4: if (Count >= 1) { *M++ = Fill; Count--; }
case 0x8: if (Count >= 1) { *M++ = Fill; Count--; }
case 0xc: if (Count >= 1) { *M++ = Fill; Count--; }
}
}
f.m128i_i32[0] = Fill;
f.m128i_i32[1] = Fill;
f.m128i_i32[2] = Fill;
f.m128i_i32[3] = Fill;
while (Count >= 4)
{
_mm_store_si128((__m128i *)M, f);
M += 4;
Count -= 4;
}
// Fill remaining LONGs.
switch (Count & 0x3)
{
case 0x3: *M++ = Fill;
case 0x2: *M++ = Fill;
case 0x1: *M++ = Fill;
}
}
I have to ask: Have you definitely profiled std::fill and shown it to be the performance bottleneck? I would guess it to be implemented in a pretty efficient manner, such that the compiler can automatically generate the appropriate instructions (for example -march on gcc).
If it is the bottleneck, it may still be possible to get better benefit from an algorithmic redesign (if possible) to avoid setting so much memory (apparently over and over) such that it doesn't matter anymore which fill mechanism you use.
Thanks to everyone for your answers. I've checked wj32's solution , but it shows very similar time as std::fill do. My current solution works 4 times faster (in Visual Studio 2008) than std::fill with help of the function memcpy:
// fill the first quarter by the usual way
std::fill(buffer.begin(), buffer.begin() + buffer.size()/4, background);
// copy the first quarter to the second (very fast)
memcpy(&buffer[buffer.size()/4], &buffer[0], buffer.size()/4*sizeof(background));
// copy the first half to the second (very fast)
memcpy(&buffer[buffer.size()/2], &buffer[0], buffer.size()/2*sizeof(background));
In the production code one needs to add check if buffer.size() is divisible by 4 and add appropriate handling for that.
Have you considered using
vector<int32_t> myVector;
myVector.reserve( sizeIWant );
and then use std::fill? Or perhaps the constructor of a std::vector which takes as an argument the number of items held and the value to initialize them at?
Assuming you have a limited amount of values in your background parameter (or even better, only on), maybe you should try to allocate a static vector, and simply use memcpy.
const int32_t sBackground = 1234;
static vector <int32_t> sInitalizedBuffer(n, sBackground);
const X::int_vec_t& X::process( const SOME_DATA& data )
{
// the following one string takes 30% of total time of #process function
std::memcpy( (void*) data[0], (void*) sInitalizedBuffer[0], n * sizeof(sBackground));
// some processing
// ...
return buffer;
}
I just tested std::fill with g++ with full optimizations (SSE etc.. enabled):
#include <algorithm>
#include <inttypes.h>
int32_t a[5000000];
int main(int argc,char *argv[])
{
std::fill(a,a+5000000,0xAABBCC00);
return a[3];
}
and the inner loop looked like:
L2:
movdqa %xmm0, -16(%eax)
addl $16, %eax
cmpl %edx, %eax
jne L2
Looks like 0xAABBCC00 x 4 was loaded into xmm0 and is being moved 16-bytes at a time.
Not totally sure how you set 4 bytes in a row, but if you want to fill memory with just one byte over an over again, you can use memset.
void * memset ( void * ptr, int value, size_t num );
Fill block of memory
Sets the first num bytes of the block of memory pointed by ptr to the specified value (interpreted as an unsigned char).
the vs2013 and vs2015 can optimize a plain for-loop to a rep stos instruction. It's the fastest way to fill a buffer. You can specify the std::fill for your type like this:
namespace std {
inline void fill(vector<int>::iterator first, vector<int>::iterator last, int value){
for (size_t i = 0; i < last - first; i++)
first[i] = value;
}
}
BTW. To have the compiler do the optimization, the buffer must be accessed by the subscript operator.
It will not work on the gcc and clang. They both will compile the code to a conditional jump loop. It runs as slow as the original std::fill. And though the wchar_t is 32-bit, the wmemset does not have an assemble implement likes the memset. So you have to write assemble code to do the optimization.
It might be a bit non portable but you could use an overlapping memory copy.
Fill the first four bytes with the pattern you want and use memcpy().
int32* p = (int32*) malloc( size );
*p = 1234;
memcpy( p + 4, p, size - 4 );
don't think you can get much faster

Replacement for vsscanf on msvc

I've run into an issue porting a codebase from linux (gcc) to windows (msvc). It seems like the C99 function vsscanf isn't available and has no obvious replacement.
I've read about a solution using the internal function _input_l and linking statically to the crt runtime, but unfortunately I cannot link statically since it would mess with all the plugins (as dlls) being loaded by the application.
So is there any replacement or a way to write a wrapper for vsscanf?
Update 2016-02-24:
When this was first asked there was no native replacement but since then MSVC has implemented support for this and much more.
VS2013 and later implements vsscanf and friends.
C++11 includes support as well.
A hack that should work:
int vsscanf(const char *s, const char *fmt, va_list ap)
{
void *a[20];
int i;
for (i=0; i<sizeof(a)/sizeof(a[0]); i++) a[i] = va_arg(ap, void *);
return sscanf(s, fmt, a[0], a[1], a[2], a[3], a[4], a[5], a[6], /* etc... */);
}
Replace 20 with the max number of args you think you might need. This code isn't terribly portable but it's only intended to be used on one particular broken system missing vsscanf so that shouldn't matter so much.
A quick search turned up several suggestions, including http://www.flipcode.net/archives/vsscanf_for_Win32.shtml
As this is tagged C++ have you considered just biting the bullet and moving away from the scanf line of functions completely? The C++ idiomatic way would be to use a std::istringstream. Rewriting to make use of that instead of looking for a vsscanf replacement would possibly be easier and more portable, not to mention having much greater type safety.
Funny it never came up for me before today. I could've sworn I'd used the function in the past. But anyway, here's a solution that works and is as safe as your arguments and format string:
template < size_t _NumArgs >
int VSSCANF_S(LPCTSTR strSrc, LPCTSTR ptcFmt, INT_PTR (&arr)[_NumArgs]) {
class vaArgs
{
vaArgs() {}
INT_PTR* m_args[_NumArgs];
public:
vaArgs(INT_PTR (&arr)[_NumArgs])
{
for(size_t nIndex=0;nIndex<_NumArgs;++nIndex)
m_args[nIndex] = &arr[nIndex];
}
};
return sscanf_s(strSrc, ptcFmt, vaArgs(arr));
}
///////////////////////////////////////////////////////////////////////////////
int _tmain(int, LPCTSTR argv[])
{
INT_PTR args[3];
int nScanned = VSSCANF_S(_T("-52 Hello 456 #"), _T("%d Hello %u %c"), args);
return printf(_T("Arg1 = %d, arg2 = %u, arg3 = %c\n"), args[0], args[1], args[2]);
}
Out:
Arg1 = -52, arg2 = 456, arg3 = #
Press any key to continue . . .
Well I can't get the formatting right but you get the idea.
if you want to wrap sscanf and you are using C++11, you can do this:
template<typename... Args>
int mysscanf(const char* str, const char* fmt, Args... args) {
//...
return sscanf(str, fmt, args...);
}
to make this work on msvc, you need to download this update:
http://www.microsoft.com/en-us/download/details.aspx?id=35515
modified from :
http://www.gamedev.net/topic/310888-no-vfscanf-in-visual-studio/
#if defined(_WIN32) && (_MSC_VER <= 1500)
static int vsscanf(
const char *buffer,
const char *format,
va_list argPtr
)
{
// Get an upper bound for the # of args
size_t count = 0;
const char* p = format;
while(1)
{
char c = *(p++);
if (c == 0)
break;
if (c == '%' && (p[0] != '*' && p[0] != '%'))
++count;
}
if (count <= 0)
return 0;
int result;
// copy stack pointer
_asm
{
mov esi, esp;
}
// push variable parameters pointers on stack
for (int i = count - 1; i >= 0; --i)
{
_asm
{
mov eax, dword ptr[i];
mov ecx, dword ptr [argPtr];
mov edx, dword ptr [ecx+eax*4];
push edx;
}
}
int stackAdvance = (2 + count) * 4;
_asm
{
// now push on the fixed params
mov eax, dword ptr [format];
push eax;
mov eax, dword ptr [buffer];
push eax;
// call sscanf, and more the result in to result
call dword ptr [sscanf];
mov result, eax;
// restore stack pointer
mov eax, dword ptr[stackAdvance];
add esp, eax;
}
return result;
}
#endif // _WIN32 / _MSC_VER <= 1500
tested only on Visual Studio 2008