Related
I'm trying to XOR the 128 bit Initialization Vector with the Plaintext as seen here
In linux x86-64 gcc 12.2, there's a one liner
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );
For example, https://godbolt.org/z/sc8e66qeo
#include <stdio.h>
#include <stdint.h>
/*
 * XORs the 16-byte IV into the 16-byte plaintext block with a single 128-bit
 * operation (GCC/Clang extension: unsigned __int128) and prints the result
 * as space-separated upper-case hex bytes.
 */
int main()
{
    /*
     * Accessing uint8_t storage through a plain `unsigned __int128 *` violates
     * strict aliasing and assumes a 16-byte alignment that a byte array does
     * not guarantee. `may_alias` plus `aligned(1)` make the same one-line
     * 128-bit access well defined for the compiler.
     */
    typedef unsigned __int128 u128_bytes __attribute__((may_alias, aligned(1)));

    uint8_t plaintext[16] = {'t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x'};
    uint8_t ivvectext[16] = {'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w'};

    *(u128_bytes *)plaintext ^= *(const u128_bytes *)ivvectext;

    /* Unsigned index: sizeof yields an unsigned value, so avoid a signed/unsigned compare. */
    for (unsigned i = 0; i < sizeof(plaintext); i++) { printf("%02X ", (unsigned)plaintext[i]); }
    return 0;
}
Question
In MSVC, what's the preferred method to XOR these 128 bit values?
Update
As noted in one of the answers, use the compiler intrinsic _mm_xor_si128
#include <stdint.h>
#include <immintrin.h>
#include <iostream>
#include <ios>
#include <iomanip>
// XORs a 16-byte plaintext block with a 16-byte IV using the SSE2 intrinsic
// _mm_xor_si128 and prints the 16 result bytes as upper-case hex pairs.
int main() {
    uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: the byte arrays carry no 16-byte alignment guarantee.
    const __m128i block = _mm_loadu_si128((__m128i*)plaintext);
    const __m128i iv = _mm_loadu_si128((__m128i*)ivvectext);
    const __m128i result = _mm_xor_si128(block, iv);

    // Read the vector's 16 bytes back in memory order.
    const uint8_t* bytes = (const uint8_t*)&result;

    // uppercase, fill and hex are sticky; only the field width must be set per value.
    std::cout << std::uppercase << std::setfill('0') << std::hex;
    for (int idx = 0; idx != 16; ++idx) {
        std::cout << std::setw(2) << (int)bytes[idx] << " ";
    }
    std::cout << std::endl;
    return 0;
}
The output matches linux
03 09 54 43 1A 0B 10 1A 0F 04 0C 04 1D 17 1A 0F
However, other answers suggest more readable code
for (int i = 0; i < sizeof(plaintext); i++)
{
plaintext[i] ^= ivvectext[i];
}
and let the compiler optimizations figure out the internal assembly code. :)
If your goal is to optimize your code, then leave this task to the compiler. (Of course you might have to enable optimization.)
You can write a simple loop like
for (int i = 0; i < sizeof(plaintext); i++)
{
plaintext[i] ^= ivvectext[i];
}
and let the compiler optimize this.
For example, x86 msvc v19.latest with option -O2 creates SSE2 instructions from this loop resulting in a single 128-bit operation.
_main PROC ; COMDAT
sub esp, 36 ; 00000024H
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+36], eax
mov DWORD PTR _plaintext$[esp+36], 1902471284 ; 71656874H
mov DWORD PTR _plaintext$[esp+40], 1801677173 ; 6b636975H
mov DWORD PTR _plaintext$[esp+44], 2003792482 ; 776f7262H
mov DWORD PTR _plaintext$[esp+48], 2020566638 ; 786f666eH
movups xmm1, XMMWORD PTR _plaintext$[esp+36]
mov DWORD PTR _ivvectext$[esp+36], 842097015 ; 32316177H
mov DWORD PTR _ivvectext$[esp+40], 1903387247 ; 7173626fH
mov DWORD PTR _ivvectext$[esp+44], 1935898221 ; 7363766dH
mov DWORD PTR _ivvectext$[esp+48], 2004185459 ; 77757173H
movups xmm0, XMMWORD PTR _ivvectext$[esp+36]
push esi
xor esi, esi
pxor xmm1, xmm0
movups XMMWORD PTR _plaintext$[esp+40], xmm1
...
See https://godbolt.org/z/afTPK5von
Additional hints from comments:
Even if you determine that you need to hand-optimize the code and use the intrinsic functions explicitly (e.g., the optimizer doesn't use them for some reason, sad panda), I recommend also keeping the straightforward implementation as a reference implementation for development & debugging purposes. (Eljay's comment)
Sometimes the MS compiler won't optimize what looks like a simple loop, in this case you can enable Vectorizer and parallelizer messages which can give you hints as to why it didn't. (user20716902's comment)
As mentioned in my comment, I would use intrinsics. Here is how to do it in MSVC:
#include <stdint.h>     // uint8_t
#include <immintrin.h>  // __m128i, _mm_loadu_si128, _mm_xor_si128, _mm_storeu_si128

// XORs the 16-byte IV into the 16-byte plaintext block, in place, using SSE2 intrinsics.
int main() {
    uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    const uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: byte arrays are not guaranteed to be 16-byte aligned.
    const __m128i plain = _mm_loadu_si128((const __m128i*)plaintext);
    const __m128i ivvec = _mm_loadu_si128((const __m128i*)ivvectext);
    const __m128i xored = _mm_xor_si128(plain, ivvec);

    // Write the result back; without this store the XOR is computed and then discarded.
    _mm_storeu_si128((__m128i*)plaintext, xored);
    return 0;
}
I'm trying to check the even numbers from an array inside a struct, but I don't think I wrote something right. When debugging (e.g. count = 3, v[] = {1,2,4}), after it reaches "cmp eax,[ebp+12]" and "je outt;", it jumps to outt: and that's it.
s is supposed to hold the sum of all even numbers; eax inside int suma(test *) is the index into the array, and edx holds the sum before it is moved into s.
What am I doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
// Input record for suma(): a fixed-capacity int array plus the number of
// entries in use. The inline assembly in this file addresses it through a raw
// pointer, so the member order and types matter.
struct test {
int v[10]; // element storage, capacity 10
short count; // number of entries of v that main() reads from cin (16-bit field)
};
test a; // global instance filled in by main()
int s = 6; // global that main() overwrites with the value left in eax after calling suma(), then prints
int suma(test *)
{
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
loop:
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
outt:
mov eax, edx;
}
return s;
}
int main()
{
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
}
cout << s;
return 0;
}
I'm computing the mean and variance of an array using SSE intrinsics. Basically, this is the summation of the values and its squares which can be illustrated in the following program:
// Reproduction case: accumulates the sum (EX) and the sum of squares (EXX) of
// the values 1..stSize with SSE intrinsics, four floats per iteration, and
// prints both next to their closed-form expected values.
int main( int argc, const char* argv[] )
{
// Gives element-wise access to the four float lanes of an __m128.
union u
{
__m128 m;
float f[4];
} x;
// Allocate memory and initialize data: [1, 2, 3, ..., stSize]
const size_t stSize = 1024;
// 32-byte aligned buffer, so the aligned loads below are valid.
// NOTE(review): the allocation result is not checked for failure.
float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32);
for ( size_t s = 0; s < stSize; ++s ) {
pData[s] = s+1;
}
// Sum and sum of squares
{
// Accumulation using SSE intrinsics: each accumulator holds four partial sums.
__m128 mEX = _mm_set_ps1(0.f);
__m128 mEXX = _mm_set_ps1(0.f);
// Advances four floats at a time; stSize is a multiple of 4.
for ( size_t s = 0; s < stSize; s+=4 )
{
__m128 m = _mm_load_ps(pData + s);
// The order of the next two statements is the subject of the question:
// EX is updated first, then EXX with the square of the same loaded value m.
mEX = _mm_add_ps(mEX, m);
mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
}
// Final reduction: add the four lanes of each accumulator through the union.
x.m = mEX;
double dEX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
x.m = mEXX;
double dEXX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
// Expected values: n(n+1)/2 and n(n+1)(2n+1)/6 with n = stSize.
std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl;
std::cout << "EX: " << dEX << std::endl;
std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl;
std::cout << "EXX: " << dEXX << std::endl;
}
// Clean up
_aligned_free(pData);
}
Now when I compile and run the program in Debug mode I get the following (and correct) output:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
However, compiling and running the program in Release mode the following (and wrong) results are produced:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.49272e+012
Changing the order of accumulation, i.e. EXX is updated before EX, the results are OK:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Looks like a 'counterproductive' compiler optimization or why is the order of execution relevant? Is this a known bug?
EDIT:
I just looked at the assembler output. Here is what I get (only the relevant parts).
For the release build with /arch:AVX compiler flag we have:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
vmovaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
vmovaps xmm2, xmm1
npad 12
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
vmovaps xmm0, xmm1
; 76 : mEX = _mm_add_ps(mEX, m);
vaddps xmm1, xmm1, XMMWORD PTR [rax]
add rax, 16
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
vmulps xmm0, xmm0, xmm0
vaddps xmm2, xmm0, xmm2
dec rcx
jne SHORT $LL3#main
This is clearly wrong, as this (1) saves the accumulated EX result (xmm1) in xmm0, (2) accumulates EX with the current value (XMMWORD PTR [rax]), and (3) accumulates in EXX (xmm2) the square of the accumulated EX result previously saved in xmm0, rather than the square of the value just loaded.
In contrast, the version without the /arch:AVX looks fine and as expected:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
movaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
movaps xmm2, xmm1
npad 10
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
movaps xmm0, XMMWORD PTR [rax]
add rax, 16
dec rcx
; 76 : mEX = _mm_add_ps(mEX, m);
addps xmm1, xmm0
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
mulps xmm0, xmm0
addps xmm2, xmm0
jne SHORT $LL3#main
This really looks like a bug. Can anyone confirm or refute this issue with a different compiler version? (I currently do not have permission to update the compiler.)
Instead of manually performing the horizontal addition, I'd recommend using the corresponding SSE instruction _mm_hadd_ps
// Final reduction: collapse the four-lane accumulators mEX and mEXX into two
// scalars using horizontal adds instead of a union.
__m128 sum1 = _mm_hadd_ps(mEX, mEXX);
// == {EX[0]+EX[1], EX[2]+EX[3], EXX[0]+EXX[1], EXX[2]+EXX[3]}
// A second horizontal add leaves the full EX sum in lane 0 and the full EXX
// sum in lane 1; _mm_cvtps_pd then converts those two low lanes to double.
__m128d sum2 = _mm_cvtps_pd(_mm_hadd_ps(sum1, sum1));
// result vector: [0] = EX, [1] = EXX
double dEX_EXX[2]; // not explicitly aligned, hence the unaligned store below
// store register to stack: (should be _mm_store_pd, if the array is aligned)
_mm_storeu_pd(dEX_EXX, sum2);
std::cout << "EX: " << dEX_EXX[0] << "\nEXX: " << dEX_EXX[1] << std::endl;
I am really struggling to get my head around this task which I've been given and I'm going out of my mind right now.
I have been given an example of the following loop:
#include <iostream> // for cout
using namespace std;
// The marks to average, and how many of them there are.
const int array_size = 5;
int marks [] = {45,56,99,19,21};

// Adds up all marks and prints their integer (truncated) average.
int main()
{
    int total = 0;
    int index = 0;
    while (index < array_size)
    {
        total += marks[index];
        ++index;
    }

    const int average = total / array_size;
    cout << "\nAverage value = " << average << "\n\n";
    return 0;
}
in assembly:
#include <iostream> // for cout
using namespace std;
const int array_size = 5;
int marks [] = {45,56,99,19,21}; //declare an array of 'marks'.
// Inline-assembly version of the averaging program: the loop and the division
// are written in an __asm block that reads and writes the C++ locals by name.
int main()
{int index, average, total;
__asm {
mov total,0 ;total = 0;
; ************************** This part is the FOR loop *************************
mov index,0 ;for ( index = 0; index < array_size; index++)
jmp checkend ;first pass: go straight to the condition, skipping the increment
forloop1:
mov eax,index ;add 1 to index
add eax,1
mov index,eax
checkend:
cmp index,5 ;check if 5 ('array_size') loops have been done (the bound is hard-coded)
jge endfor1 ;jump if greater than or equal to (remember?)
mov ecx,index ;ecx = index, used to address the array element
mov edx,total ; total =
add edx,[ecx*4+marks] ; total + marks [index]; (each int is 4 bytes)
mov total,edx
jmp forloop1 ;back to the increment step
; ******************************************************************************
endfor1:
mov eax,total ;average = total/array_size;
cdq ;convert EAX to quadword (uses EDX register)
mov ecx,5 ;get array_size as divisor
idiv ecx ;divides EDX:EAX pair by ECX, EAX=answer
mov average,eax ;save answer in variable 'average'
} //end of assembly section
cout << "\nAverage value = " << average << "\n\n";
return(0);
}
However, my problem is this: I need to convert the following loop into assembly, what is the best method of doing this based on the example I've been given? I'm really struggling with assigning the temp_char variable to an array variable.
By following loop I mean the encrypt_chars function loop
#define MAXCHARS 6 // size of each global character array below
#define dollarchar '$' // string terminator
// "Soon!" is five characters plus the terminating NUL, which fills DChars exactly.
char OChars[MAXCHARS],
EChars[MAXCHARS],
DChars[MAXCHARS] = "Soon!"; // Global Original, Encrypted, Decrypted character strings
//----------------------------- C++ Functions ----------------------------------------------------------
// Reads one character from cin into a_character. A character is accepted when
// it lies in the range '0'..'z' or equals dollarchar; otherwise the user is
// prompted and another character is read.
void get_char(char& a_character)
{
    for (;;)
    {
        cin >> a_character;

        const bool in_range = (a_character >= '0') && (a_character <= 'z');
        if (in_range || a_character == dollarchar)
        {
            return;
        }
        cout << "Alphanumeric characters only, please try again > ";
    }
}
//-------------------------------------------------------------------------------------------------------------
// Fills OChars from user input, stopping at dollarchar or once MAXCHARS
// characters are stored, and reports the number stored through `length`.
// NOTE(review): no terminating NUL is written here; when MAXCHARS characters
// are stored, OChars is completely full. Confirm how callers print it.
void get_original_chars(int& length)
{
    char next_char;
    length = 0;

    for (get_char(next_char);
         (length < MAXCHARS) && (next_char != dollarchar);
         get_char(next_char))
    {
        OChars[length] = next_char;
        ++length;
    }
}
// Encrypts the first `length` characters of OChars into EChars, one character
// at a time, by calling the assembly subroutine encrypt4 that is embedded at
// the end of this function. EKey is a by-value copy; encrypt4 modifies that
// copy in place on every call, so the key changes from character to character.
void encrypt_chars(int length, char EKey)
{
char temp_char; // char temporary store
for (int i = 0; i < length; i++) // encrypt characters one at a time
{
temp_char = OChars[i]; // temp_char now holds the i-th original character
__asm
{
push eax // Save the registers this block overwrites
push ecx
movzx ecx, temp_char // ecx = character, zero-extended to 32 bits
push ecx // Push argument #2: the character to encrypt
lea eax, EKey // eax = address of this function's copy of EKey
push eax // Push argument #1: pointer to the key
call encrypt4
add esp, 8 // Remove the two 4-byte arguments from the stack
mov temp_char, al // Store the encrypted character returned in AL
pop ecx // Restore the saved registers
pop eax
}
EChars[i] = temp_char; // Store encrypted char in the encrypted chars array
}
return; // Leave before the subroutine body below, which is only entered via 'call encrypt4'
// encrypt4 subroutine (cdecl-style: arguments on the stack, caller cleans up).
// Inputs: [ebp + 8] = 32-bit address of EKey,
// [ebp + 12] = the character to be encrypted (in the low 8 bits).
// Output: register EAX = the encrypted value of the source character (in the low 8-bit field, AL).
// Side effect: the key byte in memory is replaced by (~key + 4).
__asm
{
encrypt4:
push ebp // Save the caller's base pointer
mov ebp, esp // Set up the base pointer
mov eax, [ebp + 8] // eax = parameter 1, the address of EKey
mov ecx, [ebp + 12] // ecx = parameter 2, the character
push edi // Preserve EDI, used below as scratch for the key
push ecx // Park the character on the stack
not byte ptr[eax] // Invert every bit of the key byte in memory
add byte ptr[eax], 0x04 // Adds hex 4 to EKey
movzx edi, byte ptr[eax] // edi = updated key byte, zero-extended
pop eax // eax = the character parked above
xor eax, edi // XOR the character with the updated key
pop edi // Restore the original EDI
rol al, 1 // Rotates the encrypted value of source by 1 bit (left)
rol al, 1 // Rotates the encrypted value of source by 1 bit (left) again
add al, 0x04 // Adds hex 4 to encrypted value of source
mov esp, ebp // Restore the stack pointer
pop ebp // Restore the base pointer
ret
}
//--- End of Assembly code
}
// Driver: reads the key and the source characters, then prints the original,
// encrypted and decrypted strings, each followed by its bytes in hex.
int main(void)
{
    int char_count;   // number of characters actually entered (at most MAXCHARS)
    char EKey;        // encryption key

    cout << "\nPlease enter your Encryption Key (EKey) letter: ";
    get_char(EKey);

    cout << "\nNow enter upto " << MAXCHARS << " alphanumeric characters:\n";
    get_original_chars(char_count);

    // hex and the fill character stay in effect on cout for the later loops;
    // the field width applies to this loop's values only.
    cout << "\n\nOriginal source string = " << OChars << "\tHex = ";
    for (int k = 0; k < char_count; ++k)
    {
        cout << hex << setw(2) << setfill('0') << ((int(OChars[k])) & 0xFF) << " ";
    }

    encrypt_chars(char_count, EKey);
    cout << "\n\nEncrypted string = " << EChars << "\tHex = ";
    for (int k = 0; k < char_count; ++k)
    {
        cout << ((int(EChars[k])) & 0xFF) << " ";
    }

    decrypt_chars(char_count, EKey);
    cout << "\n\nDecrypted string = " << DChars << "\tHex = ";
    for (int k = 0; k < char_count; ++k)
    {
        cout << ((int(DChars[k])) & 0xFF) << " ";
    }

    cout << "\n\nPress a key to end...";
    while (!_kbhit())
    {
        // hold the screen until a key is pressed
    }
    return 0;
}
offering a cookie to the best explained (step-by-step) answer/ solution!!
EDIT: Disassembly code generated from the loop etc:
http://pastebin.com/u5MgJ2SW
notice line 32 where it says mov cl,byte ptr [eax+0CC0320h] Unless you
can tell me how to convert that into 'normal asm code'
In mov cl,byte ptr [eax+0CC0320h], eax at this point contains the value of i, and 0CC0320h is the address of OChars, i.e. its beginning. So the "normal" asm code is as follows:
// temp_char = OChars[i], written as inline assembly.
__asm {
mov eax, i // eax = index i
mov cl, byte ptr OChars[eax] // cl = the byte at OChars + eax
mov temp_char, cl // store that byte in temp_char
}
I have a code snippet. The snippet just loads 2 arrays and calculates dot product between them using SSE.
Code here:
using namespace std;
long long size = 3200000;
// Allocates `size` floats with new[] and fills each one with
// rand() / (RAND_MAX + 1.0), narrowed to float. The caller owns the array.
float* _random()
{
    float* const values = new float[size];
    for (int idx = 0; idx < size; ++idx) {
        const float sample = rand() / (RAND_MAX + 1.0);
        values[idx] = sample;
    }
    return values;
}
float _dotProductVectorSSE(float *s1, float *s2)
{
float prod;
int i;
__m128 X, Y, Z;
for(i=0; i<size; i+=4)
{
X = _mm_load_ps(&s1[i]);
Y = _mm_load_ps(&s2[i]);
X = _mm_mul_ps(X, Y);
Z = _mm_add_ps(X, Z);
}
float *v = new float[4];
_mm_store_ps(v,Z);
for(i=0; i<4; i++)
{
// prod += Z[i];
std::cout << v[i] << endl;
}
return prod;
}
// Benchmark driver: runs the SSE dot product on freshly generated random
// vectors a fixed number of times and prints the average CPU time per run.
int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);

    const int runs = 100;
    double total_time = 0;

    for (int i = 0; i < runs; i++)
    {
        float* s1 = _random();
        float* s2 = _random();

        // clock() returns clock_t, not time_t.
        const clock_t start = clock();
        float sse_product = _dotProductVectorSSE(s1, s2);
        const clock_t stop = clock();
        (void)sse_product;   // only the timing is of interest here

        total_time += ((double) stop-start) / CLOCKS_PER_SEC;

        // _random() allocates with new[]; release both arrays every iteration
        // instead of leaking two full vectors per run.
        delete[] s1;
        delete[] s2;
    }

    std::cout << "Averagely used " << total_time/runs << " seconds." << endl;
    return a.exec();
}
When I run it, I get a segmentation fault. Here is the backtrace:
(gdb) bt
0 0x0804965f in _mm_load_ps (__P=0xb6b56008) at /usr/lib/gcc/i586-suse-linux/4.6/include/xmmintrin.h:899
1 _dotProductVectorSSE (s1=0xb6b56008, s2=0xb5f20008) at ../simd/simd.cpp:37
2 0x0804987f in main (argc=1, argv=0xbfffee84) at ../simd/simd.cpp:80
Disassembly:
0x8049b30 push %ebp
0x8049b31 <+0x0001> push %edi
0x8049b32 <+0x0002> push %esi
0x8049b33 <+0x0003> push %ebx
0x8049b34 <+0x0004> sub $0x2c,%esp
0x8049b37 <+0x0007> mov 0x804c0a4,%esi
0x8049b3d <+0x000d> mov 0x40(%esp),%edx
0x8049b41 <+0x0011> mov 0x44(%esp),%ecx
0x8049b45 <+0x0015> mov 0x804c0a0,%ebx
0x8049b4b <+0x001b> cmp $0x0,%esi
0x8049b4e <+0x001e> jl 0x8049b7a <_Z20_dotProductVectorSSEPfS_+74>
0x8049b50 <+0x0020> jle 0x8049c10 <_Z20_dotProductVectorSSEPfS_+224>
0x8049b56 <+0x0026> add $0xffffffff,%ebx
0x8049b59 <+0x0029> adc $0xffffffff,%esi
0x8049b5c <+0x002c> xor %eax,%eax
0x8049b5e <+0x002e> shrd $0x2,%esi,%ebx
0x8049b62 <+0x0032> add $0x1,%ebx
0x8049b65 <+0x0035> shl $0x2,%ebx
**0x8049b68 <+0x0038> movaps (%edx,%eax,4),%xmm0**
0x8049b6c <+0x003c> mulps (%ecx,%eax,4),%xmm0
0x8049b70 <+0x0040> add $0x4,%eax
0x8049b73 <+0x0043> cmp %ebx,%eax
0x8049b75 <+0x0045> addps %xmm0,%xmm1
0x8049b78 <+0x0048> jne 0x8049b68 <_Z20_dotProductVectorSSEPfS_+56>
0x8049b7a <+0x004a> movaps %xmm1,0x10(%esp)
0x8049b7f <+0x004f> xor %ebx,%ebx
I am using QtCreator and defined in .pro file:
QMAKE_CXXFLAGS += -msse -msse2
DEFINES += __SSE__
DEFINES += __SSE2__
DEFINES += __MMX__
Please tell me how to fix that problem !
You are not ensuring that your data is 16 byte aligned (malloc/new are not sufficient in general) - you will either need to use _mm_loadu_ps instead of _mm_load_ps to deal with your potentially misaligned data, or preferably use a suitable method to allocate aligned memory (e.g. posix_memalign on Linux).
Note that you should use _mm_load_ps with 16-byte-aligned memory if you possibly can; otherwise use _mm_loadu_ps, but note that this may reduce performance significantly on some (older) CPUs.
Try the link below.
http://flyeater.wordpress.com/2010/11/29/memory-allocation-and-data-alignment-custom-mallocfree/
You basically allocate a bit more memory than you need, then calculate the address which is modulo 16 and use memory beginning from that address to load/store data.
Take care of pointer arithmetic.
Most of the code here ideone.com/fXKQhR is taken from the above link, sample usage.
I think _mm_malloc may be helpful to you.