Best way to XOR two 128 bit values in MSVC? - c++

I'm trying to XOR the 128 bit Initialization Vector with the Plaintext as seen here
In linux x86-64 gcc 12.2, there's a one liner
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );
For example, https://godbolt.org/z/sc8e66qeo
#include <stdio.h>
#include <stdint.h>
/* Demo of the 128-bit XOR via the GCC `unsigned __int128` extension: the IV is
   XORed into the plaintext in place, then the 16 result bytes are printed as
   space-separated uppercase hex. */
int main()
{
    uint8_t plaintext[16] = {'t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x'};
    uint8_t ivvectext[16] = {'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w'};

    /* plaintext ^= ivvectext, done as one 128-bit operation. */
    *(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );

    const uint8_t *const end = plaintext + sizeof(plaintext);
    for (const uint8_t *cursor = plaintext; cursor != end; ++cursor) {
        printf("%02X ", (unsigned char)*cursor);
    }
    return 0;
}
Question
In MSVC, what's the preferred method to XOR these 128 bit values?
Update
As noted in one of the answers, use the compiler intrinsic _mm_xor_si128
#include <stdint.h>
#include <immintrin.h>
#include <iostream>
#include <ios>
#include <iomanip>
// XORs the 16-byte IV with the 16-byte plaintext using SSE2 intrinsics and
// prints the 16 result bytes as two-digit uppercase hex, space separated.
int main() {
    const uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    const uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: the byte arrays are not declared with 16-byte alignment.
    const __m128i plain = _mm_loadu_si128(reinterpret_cast<const __m128i*>(plaintext));
    const __m128i ivvec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ivvectext));

    // Spill the XOR result into a byte array so it can be printed per byte.
    uint8_t xored_bytes[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(xored_bytes), _mm_xor_si128(plain, ivvec));

    // These manipulators are sticky; only the field width resets after each insertion.
    std::cout << std::uppercase << std::setfill('0') << std::hex;
    for (const uint8_t byte : xored_bytes) {
        std::cout << std::setw(2) << static_cast<int>(byte) << " ";
    }
    std::cout << std::endl;
    return 0;
}
The output matches linux
03 09 54 43 1A 0B 10 1A 0F 04 0C 04 1D 17 1A 0F
However, other answers suggest more readable code
// Byte-wise plaintext ^= ivvectext. The index is size_t so that the comparison
// against sizeof() does not mix signed and unsigned operands.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}
and let the compiler optimizations figure out the internal assembly code. :)

If your goal is to optimize your code, then leave this task to the compiler. (Of course you might have to enable optimization.)
You can write a simple loop like
// Byte-wise plaintext ^= ivvectext. The index is size_t so that the comparison
// against sizeof() does not mix signed and unsigned operands.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}
and let the compiler optimize this.
For example, x86 msvc v19.latest with option -O2 creates SSE2 instructions from this loop resulting in a single 128-bit operation.
_main PROC ; COMDAT
sub esp, 36 ; 00000024H
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+36], eax
mov DWORD PTR _plaintext$[esp+36], 1902471284 ; 71656874H
mov DWORD PTR _plaintext$[esp+40], 1801677173 ; 6b636975H
mov DWORD PTR _plaintext$[esp+44], 2003792482 ; 776f7262H
mov DWORD PTR _plaintext$[esp+48], 2020566638 ; 786f666eH
movups xmm1, XMMWORD PTR _plaintext$[esp+36]
mov DWORD PTR _ivvectext$[esp+36], 842097015 ; 32316177H
mov DWORD PTR _ivvectext$[esp+40], 1903387247 ; 7173626fH
mov DWORD PTR _ivvectext$[esp+44], 1935898221 ; 7363766dH
mov DWORD PTR _ivvectext$[esp+48], 2004185459 ; 77757173H
movups xmm0, XMMWORD PTR _ivvectext$[esp+36]
push esi
xor esi, esi
pxor xmm1, xmm0
movups XMMWORD PTR _plaintext$[esp+40], xmm1
...
See https://godbolt.org/z/afTPK5von
Additional hints from comments:
Even if you determine that you need to hand-optimize the code and use the intrinsic functions explicitly (e.g., the optimizer doesn't use them for some reason, sad panda), I recommend also keeping the straightforward implementation as a reference implementation for development & debugging purposes. (Eljay's comment)
Sometimes the MS compiler won't optimize what looks like a simple loop, in this case you can enable Vectorizer and parallelizer messages which can give you hints as to why it didn't. (user20716902's comment)

As mentioned in my comment, I would use intrinsics. Here is how to do it in MSVC:
#include <stdint.h>
#include <immintrin.h>
// Minimal MSVC example: XOR the 16-byte IV into the 16-byte plaintext with one
// SSE2 operation.
int main() {
    uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: the byte arrays are not declared with 16-byte alignment.
    const __m128i plain = _mm_loadu_si128(reinterpret_cast<const __m128i*>(plaintext));
    const __m128i ivvec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ivvectext));
    const __m128i xored = _mm_xor_si128(plain, ivvec);

    // Store the result back so the example really performs
    // plaintext ^= ivvectext instead of discarding the XOR.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(plaintext), xored);
    return 0;
}

Related

Less aggressive loop optimization when using printf than cout

This question is a followup to this one:
Question on Undefined Behaviour (UB) and loop optimization
When using https://godbolt.org/ online compiler (g++ 7.2, x86_64) with -O2 setting, the loop condition gets optimized out when the loop contains std::cout but not with an identical loop using printf.
Any idea why? Both code versions are compiled with the C++ compiler. Both versions produce an UB warning, but no warning without -O2, even though the code is still in UB land.
FWIW, I also tried the MIPS g++ compiler, that one does not seem to optimize out the loop condition even with std::cout code version and -O2.
I can provide the compiler outputs if necessary, but the std::cout version is quite long.
#include <stdio.h>
// Repro from the question: prints i and i*12345678 for i = 0..299.
// With a 32-bit int the product exceeds INT_MAX from i = 174 onwards
// (174 * 12345678 = 2148147972), i.e. signed overflow. That is the undefined
// behaviour the question refers to.
int main()
{
for (int i = 0; i < 300; i++)
printf("%d %d", i, i*12345678);
}
/*#include <iostream>
int main()
{
for (int i = 0; i < 300; i++)
std::cout << i << " " << i * 12345678 << std::endl;
}*/
UPDATE: On suggestion from the comments, I removed the UB, then even the printf version removes the loop condition, instead jumping out of the loop when i is 11 (very unsurprising), see below:
#include <stdio.h>
// Variant with the overflow removed: the multiplier is small and the loop
// breaks as soon as i*123 exceeds 1230, which first happens at i == 11
// (10 * 123 == 1230 does not satisfy the strict '>').
int main()
{
for (int i = 0; i < 300; i++) {
printf("%d %d", i, i*123);
// Leaves the loop long before the nominal bound of 300 is reached.
if (i * 123 > 1230) break;
}
}
// Generated assembly:
# The label must be .LC0 (leading dot) to match the OFFSET FLAT:.LC0 references below.
.LC0:
    .string "%d %d"
main:
    push rbp
    push rbx
    xor edx, edx                  # 3rd printf argument: i*123 == 0
    xor esi, esi                  # 2nd printf argument: i == 0
    mov edi, OFFSET FLAT:.LC0     # 1st printf argument: format string
    xor eax, eax                  # al = 0 for the variadic call
    sub rsp, 8
    mov ebp, 123                  # ebp = i*123 for the next iteration
    xor ebx, ebx                  # ebx = i = 0
    call printf                   # iteration i == 0, emitted ahead of the loop
.L2:
    add ebx, 1                    # ++i
    mov edx, ebp                  # 3rd argument: i*123
    xor eax, eax
    mov esi, ebx                  # 2nd argument: i
    mov edi, OFFSET FLAT:.LC0
    add ebp, 123                  # running product replaces the multiplication
    call printf
    cmp ebx, 11                   # the only exit test: stop after printing i == 11
    jne .L2
    add rsp, 8
    xor eax, eax                  # return 0
    pop rbx
    pop rbp
    ret

When linking ASM to C++ file, there is an access violation

To give you context for the code, this is a dot product computation of two vectors using pointer arithmetic (looping with pointers as well). I have linked it in my main.cpp, but for some reason, when the function is called (in this case, my ASM file), I get an access violation error. Here are the two files. Thank you for your help!
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <algorithm>
#include <iostream>
using namespace std;
extern "C" int dpp_pointerr(int *v, int *u, int n); //ASM FILE
void main(void) {
const int N = 10;
static int A[10];
static int B[10];
printf("Array A: ");
for (int i = 0; i < N; i++) { A[i] = rand() % 10; /*printf("%d ", A[i]); */ }
printf("\n\nArray B: ");
for (int j = 0; j < N; j++) { B[j] = rand() % 10;/* printf("%d ", B[j]);*/ }
printf("\n");
int result2 = dpp_pointerr(A, B, N);
printf("\nResult after POINTER dot product: %d\n", result2);
__int64 ctr1 = 0, ctr2 = 0, freq = 0;
int acc = 0, i = 0;
if (QueryPerformanceCounter((LARGE_INTEGER *)&ctr1) != 0) {
/****************CODE TO BE TIMED HERE**********************/
//int result3= dot_product_index(A, B, N);
int result2 = dpp_pointerr(A, B, N);
/**********************************************************/
QueryPerformanceCounter((LARGE_INTEGER *)&ctr2);
cout << "Start Value: " << ctr1 << endl;
cout << "End Value: " << ctr2 << endl;
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
// freq is number of counts per second. It approximates the CPU frequency
printf("QueryPerformanceCounter minimum resolution: 1/%I64u Seconds.\n", freq);
printf("ctr2 - ctr1: %f counts.\n", ((ctr2 - ctr1) * 1.0 / 1.0));
cout << "65536 Increments by 1 computation time: " << ((ctr2 - ctr1) * 1.0 / freq) << " seconds\n";
}
else {
DWORD dwError = GetLastError();
printf("Error value = %d", dwError);
}
cout << endl;
cout << "Press ENTER to finish";
system("pause");
}
ASM FILE
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.11.25547.0
TITLE C:\Users\Patrick\source\repos\dot_product_legit\dot_product_legit\dpp_pointerr.cpp
.686P
.XMM
include listing.inc
.model flat, C
PUBLIC dpp_pointerr
_TEXT SEGMENT
_v$ = 8 ; size = 4, first argument (int *v)
_u$ = 12 ; size = 4, second argument (int *u)
_n$ = 16 ; size = 4, third argument (int n)
; int dpp_pointerr(int *v, int *u, int n)
; cdecl: arguments are read from the caller's stack at [ebp+8..16], the result
; is returned in eax, and the caller removes the arguments.
; Returns v[0]*u[0] + ... + v[n-1]*u[n-1], or 0 when n <= 0.
; There is exactly one prologue and one matching epilogue, so the argument
; offsets above stay valid and ret pops the real return address.
dpp_pointerr PROC ; dot_product_pointer
    push ebp
    mov ebp, esp
    push esi                        ; esi and edi are callee-saved
    push edi
    mov esi, DWORD PTR _v$[ebp]     ; esi -> current element of v
    mov edi, DWORD PTR _u$[ebp]     ; edi -> current element of u
    mov ecx, DWORD PTR _n$[ebp]     ; ecx = elements still to process
    xor eax, eax                    ; result = 0
    test ecx, ecx
    jle SHORT dpp_done              ; n <= 0: nothing to accumulate
dpp_next:
    mov edx, DWORD PTR [esi]        ; load *v (the element, not the pointer)
    imul edx, DWORD PTR [edi]       ; edx = *v * *u
    add eax, edx                    ; result += product
    add esi, 4                      ; advance both pointers by one int
    add edi, 4
    dec ecx
    jnz SHORT dpp_next
dpp_done:
    pop edi
    pop esi
    pop ebp
    ret 0
dpp_pointerr ENDP ; dot_product_pointer
_TEXT ENDS
END

Incorrect results when using SSE intrinsics in Visual Studio 2010/2012 and Release mode

I'm computing the mean and variance of an array using SSE intrinsics. Basically, this is the summation of the values and its squares which can be illustrated in the following program:
// Repro program: fills an aligned float array with 1..stSize, accumulates the
// sum and the sum of squares four lanes at a time with SSE intrinsics, and
// prints both results next to their closed-form expected values.
int main( int argc, const char* argv[] )
{
// Gives per-lane float access to an __m128 for the final reduction.
union u
{
__m128 m;
float f[4];
} x;
// Allocate memory and initialize data: [1, 2, 3, ..., stSize]
const size_t stSize = 1024;
// Requested alignment is 32 bytes.
float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32);
for ( size_t s = 0; s < stSize; ++s ) {
pData[s] = s+1;
}
// Sum and sum of squares
{
// Accumulation using SSE intrinsics
__m128 mEX = _mm_set_ps1(0.f);
__m128 mEXX = _mm_set_ps1(0.f);
// Each iteration consumes four consecutive floats.
for ( size_t s = 0; s < stSize; s+=4 )
{
__m128 m = _mm_load_ps(pData + s);
// NOTE: the order of the next two statements is the subject of the
// question (swapping them changes the Release-mode result reported there).
mEX = _mm_add_ps(mEX, m);
mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
}
// Final reduction: add the four lanes of each accumulator
x.m = mEX;
double dEX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
x.m = mEXX;
double dEXX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
// Expected values: n(n+1)/2 and n(n+1)(2n+1)/6 with n = stSize
std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl;
std::cout << "EX: " << dEX << std::endl;
std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl;
std::cout << "EXX: " << dEXX << std::endl;
}
// Clean up
_aligned_free(pData);
}
Now when I compile and run the program in Debug mode I get the following (and correct) output:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
However, compiling and running the program in Release mode the following (and wrong) results are produced:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.49272e+012
Changing the order of accumulation, i.e. EXX is updated before EX, the results are OK:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Looks like a 'counterproductive' compiler optimization or why is the order of execution relevant? Is this a known bug?
EDIT:
I just looked at the assembler output. Here is what I get (only the relevant parts).
For the release build with /arch:AVX compiler flag we have:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
vmovaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
vmovaps xmm2, xmm1
npad 12
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
vmovaps xmm0, xmm1
; 76 : mEX = _mm_add_ps(mEX, m);
vaddps xmm1, xmm1, XMMWORD PTR [rax]
add rax, 16
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
vmulps xmm0, xmm0, xmm0
vaddps xmm2, xmm0, xmm2
dec rcx
jne SHORT $LL3#main
This is clearly wrong, as this (1) saves the accumulated EX result (xmm1) in xmm0, (2) accumulates EX with the current value (XMMWORD PTR [rax]), and (3) accumulates in EXX (xmm2) the square of the accumulated EX result previously saved in xmm0.
In contrast, the version without the /arch:AVX looks fine and as expected:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
movaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
movaps xmm2, xmm1
npad 10
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
movaps xmm0, XMMWORD PTR [rax]
add rax, 16
dec rcx
; 76 : mEX = _mm_add_ps(mEX, m);
addps xmm1, xmm0
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
mulps xmm0, xmm0
addps xmm2, xmm0
jne SHORT $LL3#main
This really looks like a bug. Can anyone confirm or refute this issue with a different compiler version? (I currently do not have permission to update the compiler.)
Instead of manually performing the horizontal addition, I'd recommend using the corresponding SSE3 intrinsic _mm_hadd_ps (declared in <pmmintrin.h>):
// Final reduction
__m128 sum1 = _mm_hadd_ps(mEX, mEXX);
// == {EX[0]+EX[1], EX[2]+EX[3], EXX[0]+EXX[1], EXX[2]+EXX[3]}
// final sum and conversion to double:
__m128d sum2 = _mm_cvtps_pd(_mm_hadd_ps(sum1, sum1));
// result vector:
double dEX_EXX[2]; // (I don't know MSVC syntax for stack aligned arrays)
// store register to stack: (should be _mm_store_pd, if the array is aligned)
_mm_storeu_pd(dEX_EXX, sum2);
std::cout << "EX: " << dEX_EXX[0] << "\nEXX: " << dEX_EXX[1] << std::endl;

Replacing IF statement (random condition) with boolean logic- execution time identical?

(Setup: Win 7 64, MSVC, 3rd Generation Core i7, 64-bit compilation, -O2 enabled)
The below code has three functions- one has an IF statement which executes different code depending on whether a condition has been met. I replaced this IF statement with some boolean logic. However the timings are identical.... I was expecting the lack of branch prediction to yield faster code:
#include <iostream>
unsigned long long iterations = 1000000000;
// Branching variant of the benchmark: on every iteration stores 4 or 5 into c
// through an if/else, selected by the parity of the time-stamp counter.
void test1(){
// volatile: every store to c is an observable access
volatile int c = 0;
for(int i=0; i<iterations; i++){
// true when the value returned by __rdtsc() is even
bool condition = __rdtsc() % 2 == 0;
if(condition){
c = 4;
}
else{
c = 5;
}
}
}
// Arithmetic variant of the benchmark: stores the same 4-or-5 value as the
// if/else version, but computes it from the boolean instead of branching.
void test2(){
// volatile: every store to c is an observable access
volatile int c = 0;
for(int i=0; i<iterations; i++){
// true when the value returned by __rdtsc() is even
bool condition = __rdtsc() % 2 == 0;
// exactly one of the two terms is non-zero: 4 when condition, else 5
c = (4 * condition) + (5 * !condition);
}
}
// Times test1() and test2() with the time-stamp counter and prints the number
// of elapsed ticks for each.
int main(){
    // Output slots required by __rdtscp; their values are not used here.
    unsigned int x = 0;
    unsigned int y = 0;

    // Each reading is stored in the same variable that is later subtracted,
    // so the printed differences are the real elapsed tick counts.
    const unsigned long long start = __rdtscp(&x);
    test1();
    const unsigned long long finish = __rdtscp(&y);

    const unsigned long long start2 = __rdtscp(&x);
    test2();
    const unsigned long long finish2 = __rdtscp(&y);

    std::cout << "1: " << finish - start << std::endl;
    std::cout << "2: " << finish2 - start2 << std::endl;
}
UPDATE asm:
int main(){
push rbp
push rsi
push rdi
push r14
sub rsp,20h
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned long long start2 = 0;
unsigned long long finish2 = 0;
unsigned long long start3 = 0;
unsigned long long finish3 = 0;
unsigned int x = 0;
xor r8d,r8d
mov dword ptr [x],r8d
unsigned int y = 0;
mov dword ptr [y],r8d
start = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov dword ptr [r9],ecx
or rax,rdx
test1();
mov dword ptr [rsp+60h],r8d
mov ecx,r8d
start = __rdtscp(&x);
mov r10,rax
nop word ptr [rax+rax]
test1();
rdtsc
shl rdx,20h
or rax,rdx
xor al,0FFh
and al,1
neg al
sbb eax,eax
inc ecx
add eax,5
mov dword ptr [rsp+60h],eax
movsxd rax,ecx
cmp rax,3E8h
test1();
jb main+40h (013FFE1280h)
finish = __rdtscp(&y);
rdtscp
lea r9,[y]
shl rdx,20h
or rax,rdx
mov dword ptr [r9],ecx
mov rbp,rax
start2 = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov dword ptr [r9],ecx
or rax,rdx
test2();
mov dword ptr [rsp+60h],r8d
mov r9d,r8d
start2 = __rdtscp(&x);
mov r14,rax
nop word ptr [rax+rax]
test2();
rdtsc
shl rdx,20h
inc r9d
or rax,rdx
xor al,0FFh
and al,1
test2();
movzx ecx,al
lea eax,[rcx+rcx*8]
mov dword ptr [rsp+60h],eax
movsxd rax,r9d
cmp rax,3E8h
jb main+0A0h (013FFE12E0h)
finish2 = __rdtscp(&y);
The generated code doesn't contain any internal branches for either function, which is why there is no mis-prediction penalty.
In the first one it converts the boolean to either zero or -1 (around sbb eax,eax) and adds it to 5. This is a pretty standard optimisation when working with booleans.
In the second one it multiplies by nine (rcx+rcx*8), because you have 5 * condition not 5 * !condition.

Can C++ compilers optimize repeated virtual function calls on the same pointer? [duplicate]

This question already has answers here:
Hoisting the dynamic type out of a loop (a.k.a. doing Java the C++ way)
(4 answers)
Closed 10 years ago.
Suppose I have the following code
// Calls p->virtualMethod(something) 1000 times on the same pointer.
void f(PolymorphicType *p)
{
    for (int remaining = 1000; remaining > 0; --remaining)
    {
        p->virtualMethod(something);
    }
}
Will the compiler's generated code dereference p's vtable entry for virtualMethod 1 or 1000 times? I am using Microsoft's compiler.
edit
here is the generated assembly for the real-world case I'm looking at. line->addPoint() is the virtual method of concern. I have no assembly experience, so I'm going over it slowly...
; 369 : for (int i = 0; i < numPts; ++i)
test ebx, ebx
je SHORT $LN1#RDS_SCANNE
lea edi, DWORD PTR [ecx+32]
npad 2
$LL3#RDS_SCANNE:
; 370 : {
; 371 : double *pts = pPoints[i].SystemXYZ;
; 372 : line->addPoint(pts[0], pts[1], pts[2]);
fld QWORD PTR [edi+8]
mov eax, DWORD PTR [esi]
mov edx, DWORD PTR [eax+16]
sub esp, 24 ; 00000018H
fstp QWORD PTR [esp+16]
mov ecx, esi
fld QWORD PTR [edi]
fstp QWORD PTR [esp+8]
fld QWORD PTR [edi-8]
fstp QWORD PTR [esp]
call edx
add edi, 96 ; 00000060H
dec ebx
jne SHORT $LL3#RDS_SCANNE
$LN314#RDS_SCANNE:
; 365 : }
In general, no, it is not possible. The function could destroy *this and placement-new some other object derived from the same base in that space.
Edit: even easier, the function could just change p. The compiler cannot possibly know who has the address of p, unless it is local to the optimization unit in question.
Impossible in general, but there are special cases that can be optimized, especially with inter-procedural analysis. VS2012 with full optimizations and whole-program optimization compiles this program:
#include <iostream>
using namespace std;
namespace {
    // Base type with one virtual function that prints which class handled the call.
    struct A {
        virtual void foo() { std::cout << "A::foo\n"; }
    };

    // Derived type overriding foo().
    struct B : A {
        virtual void foo() { std::cout << "B::foo\n"; }
    };

    // Calls a.foo() 100 times through the base-class reference.
    void test(A& a) {
        int remaining = 100;
        while (remaining-- > 0) {
            a.foo();
        }
    }
}
int main() {
B b;
test(b);
}
to:
01251221 mov esi,64h
01251226 jmp main+10h (01251230h)
01251228 lea esp,[esp]
0125122F nop
01251230 mov ecx,dword ptr ds:[1253044h]
01251236 mov edx,12531ACh
0125123B call std::operator<<<std::char_traits<char> > (012516B0h)
01251240 dec esi
01251241 jne main+10h (01251230h)
so it's effectively optimized the loop to:
// Equivalent source for the optimized code: the body of B::foo, 100 times.
for(int i = 0; i < 100; ++i)
    cout << "B::foo\n";