Strange SIMD instruction behavior - c++

SSE2 instruction (paddd xmm, m128) works really strange. Code tells all.
#include <iostream>
using namespace std;
int main()
{
int * v0 = new int [80];
for (int i=0; i<80; ++i)
v0[i] = i;
int * v1 = new int [80];
for (int i=0; i<80; ++i)
v1[i] = i;
asm(
".intel_syntax noprefix;"
"mov rcx , 20;"
"mov rax , %0;"
"mov rbx , %1;"
"m_start:;"
"cmp rcx , 0;"
"je m_end;"
"movdqu xmm0 , [rax];"
"paddd xmm0 , [rbx];"
"movdqu [rax] , xmm0;"
"add rbx , 16;" /* WTF?? If I put there 128, it's work really bad */
"add rax , 16;" /* but why?? I must add 128 because XMM width is 128 bits ... */
"dec rcx;"
"jmp m_start;"
"m_end:;"
".att_syntax noprefix;"
: //
: "r"(v0) , "r"(v1)
: //
);
for (int i=1; i<81; ++i)
{
cout << v0[i-1] << (char*)((i%10==0) ? "\n" : ", ");
}
return 0;
}

You must add 16 because 128 bits is 16 bytes.
Additional notes: you forgot to tell the compiler that you clobber some registers and you are not supposed to switch syntax without telling the compiler either (use -masm=intel switch instead).

Related

Best way to XOR two 128 bit values in MSVC?

I'm trying to XOR the 128 bit Initialization Vector with the Plaintext as seen here
In linux x86-64 gcc 12.2, there's a one liner
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );
For example, https://godbolt.org/z/sc8e66qeo
#include <stdio.h>
#include <stdint.h>
int main()
{
uint8_t plaintext[16] = {'t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x'};
uint8_t ivvectext[16] = {'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w'};
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );
for (int i = 0; i < sizeof(plaintext); i++) { printf("%02X ", (unsigned char)plaintext[i]); }
return 0;
}
Question
In MSVC, what's the preferred method to XOR these 128 bit values?
Update
As noted in one of the answers, use the compiler intrinsic _mm_xor_si128
#include <stdint.h>
#include <immintrin.h>
#include <iostream>
#include <ios>
#include <iomanip>
int main() {
uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };
__m128i plain = _mm_loadu_si128((__m128i*)plaintext);
__m128i ivvec = _mm_loadu_si128((__m128i*)ivvectext);
__m128i xored = _mm_xor_si128(plain, ivvec);
uint8_t* xored_array = (uint8_t*)&xored;
for (int i = 0; i < 16; i++) {
std::cout << std::uppercase << std::setw(2) << std::setfill('0') << std::hex << (int)xored_array[i] << " ";
}
std::cout << std::endl;
return 0;
}
The output matches linux
03 09 54 43 1A 0B 10 1A 0F 04 0C 04 1D 17 1A 0F
However, other answers suggest more readable code
for (int i = 0; i < sizeof(plaintext); i++)
{
plaintext[i] ^= ivvectext[i];
}
and let the compiler optimizations figure out the internal assembly code. :)
If your goal is to optimize your code, then leave this task to the compiler. (Of course you might have to enable optimization.)
You can write a simple loop like
for (int i = 0; i < sizeof(plaintext); i++)
{
plaintext[i] ^= ivvectext[i];
}
and let the compiler optimize this.
For example, x86 msvc v19.latest with option -O2 creates SSE2 instructions from this loop resulting in a single 128-bit operation.
_main PROC ; COMDAT
sub esp, 36 ; 00000024H
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+36], eax
mov DWORD PTR _plaintext$[esp+36], 1902471284 ; 71656874H
mov DWORD PTR _plaintext$[esp+40], 1801677173 ; 6b636975H
mov DWORD PTR _plaintext$[esp+44], 2003792482 ; 776f7262H
mov DWORD PTR _plaintext$[esp+48], 2020566638 ; 786f666eH
movups xmm1, XMMWORD PTR _plaintext$[esp+36]
mov DWORD PTR _ivvectext$[esp+36], 842097015 ; 32316177H
mov DWORD PTR _ivvectext$[esp+40], 1903387247 ; 7173626fH
mov DWORD PTR _ivvectext$[esp+44], 1935898221 ; 7363766dH
mov DWORD PTR _ivvectext$[esp+48], 2004185459 ; 77757173H
movups xmm0, XMMWORD PTR _ivvectext$[esp+36]
push esi
xor esi, esi
pxor xmm1, xmm0
movups XMMWORD PTR _plaintext$[esp+40], xmm1
...
See https://godbolt.org/z/afTPK5von
Additional hints from comments:
Even if you determine that you need to hand-optimize the code and use the intrinsic functions explicitly (e.g., the optimizer doesn't use them for some reason, sad panda), I recommend also keeping the straightforward implementation as a reference implementation for development & debugging purposes. (Eljay's comment)
Sometimes the MS compiler won't optimize what looks like a simple loop, in this case you can enable Vectorizer and parallelizer messages which can give you hints as to why it didn't. (user20716902's comment)
As mentioned in my comment I would use intrincis: Here is how to do it in MSVC:
#include <immintrin.h>
int main() {
uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };
__m128i plain = _mm_loadu_si128((__m128i*)plaintext);
__m128i ivvec = _mm_loadu_si128((__m128i*)ivvectext);
__m128i xored = _mm_xor_si128(plain, ivvec);
return 0;
}

Working on Array inside Struct , with Assembly

Im trying to check the even numbers from an array inside a struct, but i dont think i wrote something right. When debugging, (i.e. count = 3 v[] = {1,2,4}), after it reaches " cmp eax,[ebp+12] and je outt; " , it goes to outt: and thats it.
s is supposed to keep the sum of all even numbers, eax inside int suma(test *) is index for array, and edx keeps the sum before moving it in s
what am i doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
struct test {
int v[10];
short count;
};
test a;
int s = 6;
int suma(test *)
{
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
loop:
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
outt:
mov eax, edx;
}
return s;
}
int main()
{
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
}
cout << s;
return 0;
}

Incorrect results when using SSE instrinsics in Visual Studio 2010/2012 and Release mode

I'm computing the mean and variance of an array using SSE intrinsics. Basically, this is the summation of the values and its squares which can be illustrated in the following program:
int main( int argc, const char* argv[] )
{
union u
{
__m128 m;
float f[4];
} x;
// Allocate memory and initialize data: [1,2,3,...stSize+1]
const size_t stSize = 1024;
float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32);
for ( size_t s = 0; s < stSize; ++s ) {
pData[s] = s+1;
}
// Sum and sum of squares
{
// Accumlation using SSE intrinsics
__m128 mEX = _mm_set_ps1(0.f);
__m128 mEXX = _mm_set_ps1(0.f);
for ( size_t s = 0; s < stSize; s+=4 )
{
__m128 m = _mm_load_ps(pData + s);
mEX = _mm_add_ps(mEX, m);
mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
}
// Final reduction
x.m = mEX;
double dEX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
x.m = mEXX;
double dEXX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl;
std::cout << "EX: " << dEX << std::endl;
std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl;
std::cout << "EXX: " << dEXX << std::endl;
}
// Clean up
_aligned_free(pData);
}
Now when I compile and run the program in Debug mode I get the following (and correct) output:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
However, compiling and running the program in Release mode the following (and wrong) results are produced:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.49272e+012
Changing the order of accumulation, i.e. EXX is updated before EX, the results are OK:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Looks like a 'counterproductive' compiler optimization or why is the order of execution relevant? Is this a known bug?
EDIT:
I just looked at the assembler output. Here is what I get (only the relevant parts).
For the release build with /arch:AVX compiler flag we have:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
vmovaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
vmovaps xmm2, xmm1
npad 12
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
vmovaps xmm0, xmm1
; 76 : mEX = _mm_add_ps(mEX, m);
vaddps xmm1, xmm1, XMMWORD PTR [rax]
add rax, 16
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
vmulps xmm0, xmm0, xmm0
vaddps xmm2, xmm0, xmm2
dec rcx
jne SHORT $LL3#main
This is clearly wrong as this (1) saves the accumulated EX result (xmm1) in xmm0 (2) accumulates EX with the current value (XMMWORD PTR [rax]) and (3) accumulates in EXX (xmm2) the square of the accumulated EX result previously save in xmm0.
In contrast, the version without the /arch:AVX looks fine and as expected:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
movaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
movaps xmm2, xmm1
npad 10
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
movaps xmm0, XMMWORD PTR [rax]
add rax, 16
dec rcx
; 76 : mEX = _mm_add_ps(mEX, m);
addps xmm1, xmm0
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
mulps xmm0, xmm0
addps xmm2, xmm0
jne SHORT $LL3#main
This really looks like a bug. Can anyone comfirm or refute this issue with a different compiler version? (I currently do not have permission to update the compiler)
Instead of manually performing the horizontal addition, I'd recommend using the corresponding SSE instruction _mm_hadd_ps
// Final reduction
__m128 sum1 = _mm_hadd_ps(mEX, mEXX);
// == {EX[0]+EX[1], EX[2]+EX[3], EXX[0]+EXX[1], EXX[2]+EXX[3]}
// final sum and conversion to double:
__m128d sum2 = _mm_cvtps_pd(_mm_hadd_ps(sum1, sum1));
// result vector:
double dEX_EXX[2]; // (I don't know MSVC syntax for stack aligned arrays)
// store register to stack: (should be _mm_store_pd, if the array is aligned)
_mm_storeu_pd(dEX_EXX, sum2);
std::cout << "EX: " << dEX_EXX[0] << "\nEXX: " << dEX_EXX[1] << std::endl;

How to create a loop in ASM?

I am really struggling to get my head around this task which I've been given and I'm going out of my mind right now.
I have been given an example of the following loop:
#include <iostream> // for cout
using namespace std;
//declare an array of 'marks'.
const int array_size = 5;
int marks [] = {45,56,99,19,21};
int main()
{int index, average, total;
total = 0;
for ( index = 0; index < array_size; index++)
{
total = total + marks [index];
}
average = total/array_size;
cout << "\nAverage value = " << average << "\n\n";
return(0);
}
in assembly:
#include <iostream> // for cout
using namespace std;
const int array_size = 5;
int marks [] = {45,56,99,19,21}; //declare an array of 'marks'.
int main()
{int index, average, total;
__asm {
mov total,0 ;total = 0;
; ************************** This part is the FOR loop *************************
mov index,0 ;for ( index = 0; index < array_size; index++)
jmp checkend
forloop1:
mov eax,index ;add 1 to index
add eax,1
mov index,eax
checkend:
cmp index,5 ;check if 5 ('array_size') loops have been done
jge endfor1 ;jump if greater than or equal to (remember?)
mov ecx,index
mov edx,total ; total =
add edx,[ecx*4+marks] ; total + marks [index];
mov total,edx
jmp forloop1
; ******************************************************************************
endfor1:
mov eax,total ;average = total/array_size;
cdq ;convert EAX to quadword (uses EDX register)
mov ecx,5 ;get array_size as divisor
idiv ecx ;divides EDX:EAX pair by ECX, EAX=answer
mov average,eax ;save answer in variable 'average'
} //end of assembly section
cout << "\nAverage value = " << average << "\n\n";
return(0);
}
However, my problem is this: I need to convert the following loop into assembly, what is the best method of doing this based on the example I've been given? I'm really struggling with assigning the temp_char variable to an array variable.
By following loop I mean the encrypt_chars function loop
#define MAXCHARS 6
#define dollarchar '$' // string terminator
char OChars[MAXCHARS],
EChars[MAXCHARS],
DChars[MAXCHARS] = "Soon!"; // Global Original, Encrypted, Decrypted character strings
//----------------------------- C++ Functions ----------------------------------------------------------
void get_char(char& a_character)
{
cin >> a_character;
while (((a_character < '0') | (a_character > 'z')) && (a_character != dollarchar))
{
cout << "Alphanumeric characters only, please try again > ";
cin >> a_character;
}
}
//-------------------------------------------------------------------------------------------------------------
void get_original_chars(int& length)
{
char next_char;
length = 0;
get_char(next_char);
while ((length < MAXCHARS) && (next_char != dollarchar))
{
OChars[length++] = next_char;
get_char(next_char);
}
}
void encrypt_chars(int length, char EKey)
{
char temp_char; // char temporary store
for (int i = 0; i < length; i++) // encrypt characters one at a time
{
temp_char = OChars[i]; // temp_char now contains the address values of the individual character
__asm
{
push eax // Save values contained within register to stack
push ecx
movzx ecx, temp_char
push ecx // Push argument #2
lea eax, EKey
push eax // Push argument #1
call encrypt4
add esp, 8 // Clean parameters of stack
mov temp_char, al // Move the temp character into a register
pop ecx
pop eax
}
EChars[i] = temp_char; // Store encrypted char in the encrypted chars array
}
return;
// Inputs: register EAX = 32-bit address of Ekey,
// ECX = the character to be encrypted (in the low 8-bit field, CL).
// Output: register EAX = the encrypted value of the source character (in the low 8-bit field, AL).
__asm
{
encrypt4:
push ebp // Set stack
mov ebp, esp // Set up the base pointer
mov eax, [ebp + 8] // Move value of parameter 1 into EAX
mov ecx, [ebp + 12] // Move value of parameter 2 into ECX
push edi // Used for string and memory array copying
push ecx // Loop counter for pushing character onto stack
not byte ptr[eax] // Negation
add byte ptr[eax], 0x04 // Adds hex 4 to EKey
movzx edi, byte ptr[eax] // Moves value of EKey into EDI using zeroes
pop eax // Pop the character value from stack
xor eax, edi // XOR character to give encrypted value of source
pop edi // Pop original address of EDI from the stack
rol al, 1 // Rotates the encrypted value of source by 1 bit (left)
rol al, 1 // Rotates the encrypted value of source by 1 bit (left) again
add al, 0x04 // Adds hex 4 to encrypted value of source
mov esp, ebp // Deallocate values
pop ebp // Restore the base pointer
ret
}
//--- End of Assembly code
}
int main(void)
{
int char_count; // The number of actual characters entered (upto MAXCHARS limit).
char EKey; // Encryption key.
cout << "\nPlease enter your Encryption Key (EKey) letter: "; get_char(EKey);
cout << "\nNow enter upto " << MAXCHARS << " alphanumeric characters:\n";
get_original_chars(char_count);
cout << "\n\nOriginal source string = " << OChars << "\tHex = ";
for (int i = 0; i<char_count; i++) cout << hex << setw(2) << setfill('0') << ((int(OChars[i])) & 0xFF) << " ";
encrypt_chars(char_count, EKey);
cout << "\n\nEncrypted string = " << EChars << "\tHex = ";
for (int i = 0; i<char_count; i++) cout << ((int(EChars[i])) & 0xFF) << " ";
decrypt_chars(char_count, EKey);
cout << "\n\nDecrypted string = " << DChars << "\tHex = ";
for (int i = 0; i<char_count; i++) cout << ((int(DChars[i])) & 0xFF) << " ";
cout << "\n\nPress a key to end...";
while (!_kbhit()); //hold the screen until a key is pressed
return (0);
}
offering a cookie to the best explained (step-by-step) answer/ solution!!
EDIT: Disassembly code generated from the loop etc:
http://pastebin.com/u5MgJ2SW
notice line 32 where it says mov cl,byte ptr [eax+0CC0320h] Unless you
can tell me how to convert that into 'normal asm code'
mov cl,byte ptr [eax+0CC0320h]: eax at this point contains the value of i, 0CC0320h is the address of OChars, i.e. its begining. So, the "normal" asm code is as follows:
__asm {
mov eax, i
mov cl, byte ptr OChars[eax]
mov temp_char, cl
}

_mm_load_ps caused segment fault

I have a code snippet. The snippet just loads 2 arrays and calculates dot product between them using SSE.
Code here:
using namespace std;
long long size = 3200000;
float* _random()
{
unsigned int seed = 123;
// float *t = malloc(size*sizeof(float));
float *t = new float[size];
int i;
float num = 0.0;
for(i=0; i < size; i++) {
num = rand()/(RAND_MAX+1.0);
t[i] = num;
}
return t;
}
float _dotProductVectorSSE(float *s1, float *s2)
{
float prod;
int i;
__m128 X, Y, Z;
for(i=0; i<size; i+=4)
{
X = _mm_load_ps(&s1[i]);
Y = _mm_load_ps(&s2[i]);
X = _mm_mul_ps(X, Y);
Z = _mm_add_ps(X, Z);
}
float *v = new float[4];
_mm_store_ps(v,Z);
for(i=0; i<4; i++)
{
// prod += Z[i];
std::cout << v[i] << endl;
}
return prod;
}
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
time_t start, stop;
double avg_time = 0;
double cur_time;
float* s1 = NULL;
float* s2 = NULL;
for(int i = 0; i < 100; i++)
{
s1 = _random();
s2 = _random();
start = clock();
float sse_product = _dotProductVectorSSE(s1, s2);
stop = clock();
cur_time = ((double) stop-start) / CLOCKS_PER_SEC;
avg_time += cur_time;
}
std::cout << "Averagely used " << avg_time/100 << " seconds." << endl;
return a.exec();
}
When I run, I got segment fault. Here is the backtrace:
(gdb) bt
0 0x0804965f in _mm_load_ps (__P=0xb6b56008) at /usr/lib/gcc/i586-suse-linux/4.6/include/xmmintrin.h:899
1 _dotProductVectorSSE (s1=0xb6b56008, s2=0xb5f20008) at ../simd/simd.cpp:37
2 0x0804987f in main (argc=1, argv=0xbfffee84) at ../simd/simd.cpp:80
Diassembler:
0x8049b30 push %ebp
0x8049b31 <+0x0001> push %edi
0x8049b32 <+0x0002> push %esi
0x8049b33 <+0x0003> push %ebx
0x8049b34 <+0x0004> sub $0x2c,%esp
0x8049b37 <+0x0007> mov 0x804c0a4,%esi
0x8049b3d <+0x000d> mov 0x40(%esp),%edx
0x8049b41 <+0x0011> mov 0x44(%esp),%ecx
0x8049b45 <+0x0015> mov 0x804c0a0,%ebx
0x8049b4b <+0x001b> cmp $0x0,%esi
0x8049b4e <+0x001e> jl 0x8049b7a <_Z20_dotProductVectorSSEPfS_+74>
0x8049b50 <+0x0020> jle 0x8049c10 <_Z20_dotProductVectorSSEPfS_+224>
0x8049b56 <+0x0026> add $0xffffffff,%ebx
0x8049b59 <+0x0029> adc $0xffffffff,%esi
0x8049b5c <+0x002c> xor %eax,%eax
0x8049b5e <+0x002e> shrd $0x2,%esi,%ebx
0x8049b62 <+0x0032> add $0x1,%ebx
0x8049b65 <+0x0035> shl $0x2,%ebx
**0x8049b68 <+0x0038> movaps (%edx,%eax,4),%xmm0**
0x8049b6c <+0x003c> mulps (%ecx,%eax,4),%xmm0
0x8049b70 <+0x0040> add $0x4,%eax
0x8049b73 <+0x0043> cmp %ebx,%eax
0x8049b75 <+0x0045> addps %xmm0,%xmm1
0x8049b78 <+0x0048> jne 0x8049b68 <_Z20_dotProductVectorSSEPfS_+56>
0x8049b7a <+0x004a> movaps %xmm1,0x10(%esp)
0x8049b7f <+0x004f> xor %ebx,%ebx
I am using QtCreator and defined in .pro file:
QMAKE_CXXFLAGS += -msse -msse2
DEFINES += __SSE__
DEFINES += __SSE2__
DEFINES += __MMX__
Please tell me how to fix that problem !
You are not ensuring that your data is 16 byte aligned (malloc/new are not sufficient in general) - you will either need to use _mm_loadu_ps instead of _mm_load_ps to deal with your potentially misaligned data, or preferably use a suitable method to allocate aligned memory (e.g. posix_memalign on Linux).
Note that you should _mm_load_ps and 16 byte aligned memory if you possibly can, otherwise use _mm_loadu_ps but note that this may reduce performance signficantly on some (older) CPUs.
Try the link below.
http://flyeater.wordpress.com/2010/11/29/memory-allocation-and-data-alignment-custom-mallocfree/
You basically allocate a bit more memory than you need, then calculate the address which is modulo 16 and use memory beginning from that address to load/store data.
Take care of pointer arithmetic.
Most of the code here ideone.com/fXKQhR is taken from the above link, sample usage.
I think, the _mm_malloc maybe helpful with you.