_mm_aesimc_si128 not compiled correctly with MSVC - c++

I was reading the AES-NI White Paper and wanted to try it out myself by writing a simple demo program based on the code provided by Intel, but I was getting some weird results. It works in Debug/Release x86 and Debug x64, but in Release x64 I get garbage. I also tried it with GCC and had no such problem. After some digging, it seems that MSVC confuses the source and destination of the AESIMC instruction: it emits code like aesimc xmm3,xmmword ptr [rsp+20h] and then behaves as if xmm3 were the source and [rsp+20h] the destination, i.e. as if the spilled key had been transformed in place. In x86 mode it generates correct code, aesimc xmm0,xmm5 followed by movaps xmmword ptr [K4],xmm0 (two instructions are needed, since something like aesimc xmmword ptr [K4],xmm5 is not valid, I think).
I'm not sure if this is indeed a compiler bug or there's something wrong with my code.
Release x64 disassembly: (check below for complete code)
K11 = _mm_aesimc_si128(K11);
K12 = _mm_aesimc_si128(K12);
00007FF6C0A717C6 66 0F 38 DB 5C 24 20 aesimc xmm3,xmmword ptr [rsp+20h]
00007FF6C0A717CD 66 0F 6F 1C 24 movdqa xmm3,xmmword ptr [rsp]
auto K14 = AES256_GENKEY_1(K12, K13, 0x40);
00007FF6C0A717D2 66 44 0F EF F9 pxor xmm15,xmm1
K13 = _mm_aesimc_si128(K13);
00007FF6C0A717D7 66 0F 38 DB 54 24 10 aesimc xmm2,xmmword ptr [rsp+10h]
auto blocks = size >> 4;
auto feedback = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
00007FF6C0A717DE F3 0F 6F 12 movdqu xmm2,xmmword ptr [rdx]
00007FF6C0A717E2 66 45 0F 38 DB F6 aesimc xmm14,xmm14
00007FF6C0A717E8 66 45 0F 38 DB ED aesimc xmm13,xmm13
00007FF6C0A717EE 66 45 0F 38 DB E4 aesimc xmm12,xmm12
00007FF6C0A717F4 66 45 0F 38 DB DB aesimc xmm11,xmm11
00007FF6C0A717FA 66 45 0F 38 DB D2 aesimc xmm10,xmm10
00007FF6C0A71800 66 45 0F 38 DB C9 aesimc xmm9,xmm9
00007FF6C0A71806 66 45 0F 38 DB C0 aesimc xmm8,xmm8
00007FF6C0A7180C 66 0F 38 DB FF aesimc xmm7,xmm7
00007FF6C0A71811 66 0F 38 DB F6 aesimc xmm6,xmm6
00007FF6C0A71816 66 0F 38 DB ED aesimc xmm5,xmm5
00007FF6C0A7181B 66 0F 38 DB E4 aesimc xmm4,xmm4
{
auto lastIn = _mm_loadu_si128(static_cast<const __m128i *>(input) + i);
00007FF6C0A71820 F3 41 0F 6F 0C 00 movdqu xmm1,xmmword ptr [r8+rax]
00007FF6C0A71826 48 8D 40 10 lea rax,[rax+10h]
auto m = _mm_xor_si128(lastIn, K14);
00007FF6C0A7182A 66 0F 6F C1 movdqa xmm0,xmm1
00007FF6C0A7182E 66 41 0F EF C7 pxor xmm0,xmm15
m = _mm_aesdec_si128(m, K13);
00007FF6C0A71833 66 0F 38 DE 44 24 10 aesdec xmm0,xmmword ptr [K13]
m = _mm_aesdec_si128(m, K12);
00007FF6C0A7183A 66 0F 38 DE 44 24 20 aesdec xmm0,xmmword ptr [K12]
m = _mm_aesdec_si128(m, K11);
Complete code: (should work with both MSVC and GCC)
#include <cstdio>
#include <cstring>
#include <cstdint>
#include <cstddef>
#include <wmmintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <cpuid.h>
#else
#error compiler not supported
#endif
static int check_aes_support()
{
#if defined(_MSC_VER)
int info[4];
__cpuid(info, 0x01);
return info[2] & 0x2000000;
#else
unsigned int eax, ebx, ecx, edx;
__get_cpuid(0x01, &eax, &ebx, &ecx, &edx);
return ecx & 0x2000000;
#endif
}
static inline __m128i aes256_key_assist_1(__m128i key1, __m128i key2)
{
key2 = _mm_shuffle_epi32(key2, _MM_SHUFFLE(3, 3, 3, 3));
key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
return _mm_xor_si128(key1, key2);
}
static inline __m128i aes256_key_assist_2(__m128i key1, __m128i key2)
{
key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
return _mm_xor_si128(key2, _mm_shuffle_epi32(_mm_aeskeygenassist_si128(key1, 0x00),
_MM_SHUFFLE(2, 2, 2, 2)));
}
#define AES256_GENKEY_1(K1, K2, C) aes256_key_assist_1(K1, _mm_aeskeygenassist_si128(K2, C))
#define AES256_GENKEY_2(K1, K2) aes256_key_assist_2(K1, K2)
static int aes256_cbc_encrypt(const void *key, const void *iVec,
const void *input, std::size_t size, void *output)
{
if (!size || size & 0xF)
return 1;
auto K0 = _mm_loadu_si128(static_cast<const __m128i *>(key));
auto K1 = _mm_loadu_si128(static_cast<const __m128i *>(key) + 1);
auto K2 = AES256_GENKEY_1(K0, K1, 0x01);
auto K3 = AES256_GENKEY_2(K2, K1);
auto K4 = AES256_GENKEY_1(K2, K3, 0x02);
auto K5 = AES256_GENKEY_2(K4, K3);
auto K6 = AES256_GENKEY_1(K4, K5, 0x04);
auto K7 = AES256_GENKEY_2(K6, K5);
auto K8 = AES256_GENKEY_1(K6, K7, 0x08);
auto K9 = AES256_GENKEY_2(K8, K7);
auto K10 = AES256_GENKEY_1(K8, K9, 0x10);
auto K11 = AES256_GENKEY_2(K10, K9);
auto K12 = AES256_GENKEY_1(K10, K11, 0x20);
auto K13 = AES256_GENKEY_2(K12, K11);
auto K14 = AES256_GENKEY_1(K12, K13, 0x40);
auto blocks = size >> 4;
auto m = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
for (decltype(blocks) i = 0; i < blocks; i++)
{
m = _mm_xor_si128(m, _mm_loadu_si128(static_cast<const __m128i *>(input) + i));
m = _mm_xor_si128(m, K0);
m = _mm_aesenc_si128(m, K1);
m = _mm_aesenc_si128(m, K2);
m = _mm_aesenc_si128(m, K3);
m = _mm_aesenc_si128(m, K4);
m = _mm_aesenc_si128(m, K5);
m = _mm_aesenc_si128(m, K6);
m = _mm_aesenc_si128(m, K7);
m = _mm_aesenc_si128(m, K8);
m = _mm_aesenc_si128(m, K9);
m = _mm_aesenc_si128(m, K10);
m = _mm_aesenc_si128(m, K11);
m = _mm_aesenc_si128(m, K12);
m = _mm_aesenc_si128(m, K13);
m = _mm_aesenclast_si128(m, K14);
_mm_storeu_si128(static_cast<__m128i *>(output) + i, m);
}
return 0;
}
static int aes256_cbc_decrypt(const void *key, const void *iVec,
const void *input, std::size_t size, void *output)
{
if (!size || size & 0xF)
return 1;
auto K0 = _mm_loadu_si128(static_cast<const __m128i *>(key));
auto K1 = _mm_loadu_si128(static_cast<const __m128i *>(key) + 1);
auto K2 = AES256_GENKEY_1(K0, K1, 0x01);
auto K3 = AES256_GENKEY_2(K2, K1);
auto K4 = AES256_GENKEY_1(K2, K3, 0x02);
auto K5 = AES256_GENKEY_2(K4, K3);
auto K6 = AES256_GENKEY_1(K4, K5, 0x04);
auto K7 = AES256_GENKEY_2(K6, K5);
auto K8 = AES256_GENKEY_1(K6, K7, 0x08);
auto K9 = AES256_GENKEY_2(K8, K7);
auto K10 = AES256_GENKEY_1(K8, K9, 0x10);
auto K11 = AES256_GENKEY_2(K10, K9);
auto K12 = AES256_GENKEY_1(K10, K11, 0x20);
auto K13 = AES256_GENKEY_2(K12, K11);
auto K14 = AES256_GENKEY_1(K12, K13, 0x40);
K1 = _mm_aesimc_si128(K1);
K2 = _mm_aesimc_si128(K2);
K3 = _mm_aesimc_si128(K3);
K4 = _mm_aesimc_si128(K4);
K5 = _mm_aesimc_si128(K5);
K6 = _mm_aesimc_si128(K6);
K7 = _mm_aesimc_si128(K7);
K8 = _mm_aesimc_si128(K8);
K9 = _mm_aesimc_si128(K9);
K10 = _mm_aesimc_si128(K10);
K11 = _mm_aesimc_si128(K11);
K12 = _mm_aesimc_si128(K12);
K13 = _mm_aesimc_si128(K13);
auto blocks = size >> 4;
auto feedback = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
for (decltype(blocks) i = 0; i < blocks; i++)
{
auto lastIn = _mm_loadu_si128(static_cast<const __m128i *>(input) + i);
auto m = _mm_xor_si128(lastIn, K14);
m = _mm_aesdec_si128(m, K13);
m = _mm_aesdec_si128(m, K12);
m = _mm_aesdec_si128(m, K11);
m = _mm_aesdec_si128(m, K10);
m = _mm_aesdec_si128(m, K9);
m = _mm_aesdec_si128(m, K8);
m = _mm_aesdec_si128(m, K7);
m = _mm_aesdec_si128(m, K6);
m = _mm_aesdec_si128(m, K5);
m = _mm_aesdec_si128(m, K4);
m = _mm_aesdec_si128(m, K3);
m = _mm_aesdec_si128(m, K2);
m = _mm_aesdec_si128(m, K1);
m = _mm_aesdeclast_si128(m, K0);
m = _mm_xor_si128(m, feedback);
_mm_storeu_si128(static_cast<__m128i *>(output) + i, m);
feedback = lastIn;
}
return 0;
}
int main()
{
auto aesSupport = check_aes_support();
std::printf("AES: %s\n", aesSupport ? "yes" : "no");
if (!aesSupport)
return -1;
std::uint64_t data[] = {0x1122334455667788, 0xAABBCCDDEEFFBBAA, 0xAAAAAAAAAAAAAAAA, 0x4444333333333333};
std::uint64_t key[] = {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x1111111111111111, 0x1111111111111111};
std::uint64_t iVec[] = {0x123456789ABCDEF0, 0x0FEDCBA987654321};
std::uint64_t cipher[4] = {0};
aes256_cbc_encrypt(key, iVec, data, sizeof(data), cipher);
std::printf("0x%016llX 0x%016llX 0x%016llX 0x%016llX\n", cipher[0], cipher[1], cipher[2], cipher[3]);
std::memset(data, 0, sizeof(data));
aes256_cbc_decrypt(key, iVec, cipher, sizeof(data), data);
std::printf("0x%016llX 0x%016llX 0x%016llX 0x%016llX\n", data[0], data[1], data[2], data[3]);
}
It should output:
0xCF8A4156843F0A3E 0x04D4BB63524324E6 0xAAB88C080DB40B2F 0xCC346B02BA6B16E8
0x1122334455667788 0xAABBCCDDEEFFBBAA 0xAAAAAAAAAAAAAAAA 0x4444333333333333
But in Release x64 mode I get something random for the decrypted data:
0xCF8A4156843F0A3E 0x04D4BB63524324E6 0xAAB88C080DB40B2F 0xCC346B02BA6B16E8
0xEE64C4650D902107 0x0D03C7FA41AA930B 0x257F65FF49A99474 0xFACB372EDED13BAA
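One way to narrow it down (a debugging sketch, not a fix; dump_key is a made-up helper) is to print K11..K13 right after the _mm_aesimc_si128 calls and diff the Debug x64 and Release x64 output. Keep in mind that adding the prints changes register allocation, so the symptom may move or disappear:
#include <cstdio>
#include <emmintrin.h>

// Hypothetical helper: print a __m128i as 16 hex bytes.
static void dump_key(const char *name, __m128i k)
{
    unsigned char b[16];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(b), k);
    std::printf("%s:", name);
    for (int i = 0; i < 16; ++i)
        std::printf(" %02X", b[i]);
    std::printf("\n");
}

// e.g. right after the _mm_aesimc_si128 calls in aes256_cbc_decrypt:
//   dump_key("K11", K11); dump_key("K12", K12); dump_key("K13", K13);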

Related

Fastest way for wrapping a value in an interval

I am curious about the ways to wrap a floating-point value x in a semi-closed interval [0; a[.
For instance, I could have an arbitrary real number, say x = 354638.515, that I wish to fold into [0; 2π[ because I have a good sin approximation for that range.
The standard C fmod function shows up quite high in my benchmarks, and by checking the source code of various libc implementations I can understand why: it is fairly branchy, likely in order to handle a lot of IEEE 754-specific issues:
glibc: https://github.com/bminor/glibc/blob/master/sysdeps/ieee754/flt-32/e_fmodf.c
Apple: https://opensource.apple.com/source/Libm/Libm-315/Source/ARM/fmod.c.auto.html
musl: https://git.musl-libc.org/cgit/musl/tree/src/math/fmod.c
Running with -ffast-math causes GCC to generate code that goes through the x87 FPU on x86/x86_64, which comes with its own set of problems (80-bit doubles, FP state, and other fun things). I would like the implementation to vectorize at least semi-correctly, and preferably to stay in vector registers rather than go through the x87 FPU, since the rest of my code ends up vectorized by the compiler, even if not in an optimal way.
This one looks much simpler: https://github.com/KnightOS/libc/blob/master/src/fmod.c
In my case, I am only concerned about usual real values, not NaNs, not infinity. My range is also known at compile time and sane (a common occurrence being π/2), so the checks for "special cases" such as range == 0 are unnecessary.
Thus, what would be good implementations of fmod for that specific use case?
Assuming that the range is constant and positive, you can compute its reciprocal to avoid a costly division.
void fast_fmod(float * restrict dst, const float * restrict src, size_t n, float divisor) {
float reciprocal = 1.0f / divisor;
for (size_t i = 0; i < n; ++i)
dst[i] = src[i] - divisor * (int)(src[i] * reciprocal);
}
The final code with a simple demo is:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
void fast_fmod(float * restrict dst, const float * restrict src, size_t n, float divisor) {
float reciprocal = 1.0f / divisor;
for (size_t i = 0; i < n; ++i)
dst[i] = src[i] - divisor * (int)(src[i] * reciprocal);
}
int main() {
float src[9] = {-4, -3, -2, -1, 0, 1, 2, 3, 4};
float dst[9];
float div = 3;
fast_fmod(dst, src, 9, div);
for (int i = 0; i < 9; ++i) {
printf("fmod(%g, %g) = %g vs %g\n", src[i], div, dst[i], fmod(src[i], div));
}
}
produces the expected output:
fmod(-4, 3) = -1 vs -1
fmod(-3, 3) = 0 vs -0
fmod(-2, 3) = -2 vs -2
fmod(-1, 3) = -1 vs -1
fmod(0, 3) = 0 vs 0
fmod(1, 3) = 1 vs 1
fmod(2, 3) = 2 vs 2
fmod(3, 3) = 0 vs 0
fmod(4, 3) = 1 vs 1
Compilation with GCC with command:
$ gcc prog.c -o prog -O3 -march=haswell -lm -fopt-info-vec
prog.c:8:4: optimized: loop vectorized using 32 byte vectors
prog.c:8:4: optimized: loop vectorized using 32 byte vectors
prog.c:8:30: optimized: basic block part vectorized using 32 byte vectors
Thus the code was nicely vectorized.
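One caveat worth adding (hedged, not part of the original answer): the (int) cast truncates toward zero, so negative inputs keep their sign exactly like fmod, as the demo output shows. If the goal is really to fold into [0; a[ as asked, a floor-based variant of the same idea does that; with SSE4.1 or AVX available, floor maps to a single round instruction (vroundps), so it should still vectorize:
#include <cmath>
#include <cstddef>

// Floor-based variant of fast_fmod above: folds into [0, divisor) instead of
// mirroring fmod's sign behaviour. (__restrict is the C++ spelling GCC/MSVC accept.)
void fast_wrap(float *__restrict dst, const float *__restrict src,
               std::size_t n, float divisor) {
    float reciprocal = 1.0f / divisor;
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = src[i] - divisor * std::floor(src[i] * reciprocal);
}
For example, where the table above shows fmod(-4, 3) = -1, fast_wrap gives 2.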
EDIT
It looks like Clang does an even better job vectorizing this code:
401170: c5 fc 10 24 8e vmovups (%rsi,%rcx,4),%ymm4
401175: c5 fc 10 6c 8e 20 vmovups 0x20(%rsi,%rcx,4),%ymm5
40117b: c5 fc 10 74 8e 40 vmovups 0x40(%rsi,%rcx,4),%ymm6
401181: c5 fc 10 7c 8e 60 vmovups 0x60(%rsi,%rcx,4),%ymm7
401187: c5 6c 59 c4 vmulps %ymm4,%ymm2,%ymm8
40118b: c5 6c 59 cd vmulps %ymm5,%ymm2,%ymm9
40118f: c5 6c 59 d6 vmulps %ymm6,%ymm2,%ymm10
401193: c5 6c 59 df vmulps %ymm7,%ymm2,%ymm11
401197: c4 41 7e 5b c0 vcvttps2dq %ymm8,%ymm8
40119c: c4 41 7e 5b c9 vcvttps2dq %ymm9,%ymm9
4011a1: c4 41 7e 5b d2 vcvttps2dq %ymm10,%ymm10
4011a6: c4 41 7e 5b db vcvttps2dq %ymm11,%ymm11
4011ab: c4 41 7c 5b c0 vcvtdq2ps %ymm8,%ymm8
4011b0: c4 41 7c 5b c9 vcvtdq2ps %ymm9,%ymm9
4011b5: c4 41 7c 5b d2 vcvtdq2ps %ymm10,%ymm10
4011ba: c4 41 7c 5b db vcvtdq2ps %ymm11,%ymm11
4011bf: c5 3c 59 c3 vmulps %ymm3,%ymm8,%ymm8
4011c3: c5 34 59 cb vmulps %ymm3,%ymm9,%ymm9
4011c7: c5 2c 59 d3 vmulps %ymm3,%ymm10,%ymm10
4011cb: c5 24 59 db vmulps %ymm3,%ymm11,%ymm11
4011cf: c4 c1 5c 5c e0 vsubps %ymm8,%ymm4,%ymm4
4011d4: c4 c1 54 5c e9 vsubps %ymm9,%ymm5,%ymm5
4011d9: c4 c1 4c 5c f2 vsubps %ymm10,%ymm6,%ymm6
4011de: c4 c1 44 5c fb vsubps %ymm11,%ymm7,%ymm7
4011e3: c5 fc 11 24 8f vmovups %ymm4,(%rdi,%rcx,4)
4011e8: c5 fc 11 6c 8f 20 vmovups %ymm5,0x20(%rdi,%rcx,4)
4011ee: c5 fc 11 74 8f 40 vmovups %ymm6,0x40(%rdi,%rcx,4)
4011f4: c5 fc 11 7c 8f 60 vmovups %ymm7,0x60(%rdi,%rcx,4)
4011fa: 48 83 c1 20 add $0x20,%rcx
4011fe: 48 39 c8 cmp %rcx,%rax
401201: 0f 85 69 ff ff ff jne 401170 <fast_fmod+0x40>
This is an fmod() alternative without precision loss that I wrote.
The computation can take very long if the counter (the numerator) has a very high exponent and the denominator a very low exponent, but it is still faster than the current GNU C library implementation:
#include <stdint.h>
#include <string.h>
#include <fenv.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#define likely(x) __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
#define MAX_EXP (0x7FF)
#define SIGN_BIT ((uint64_t)1 << 63)
#define EXP_MASK ((uint64_t)MAX_EXP << 52)
#define IMPLICIT_BIT ((uint64_t)1 << 52)
#define MANT_MASK (IMPLICIT_BIT - 1)
#define QNAN_BIT (IMPLICIT_BIT >> 1)
inline uint64_t bin( double d )
{
uint64_t u;
memcpy( &u, &d, sizeof d );
return u;
}
inline double dbl( uint64_t u )
{
double d;
memcpy( &d, &u, sizeof u );
return d;
}
inline double NaN()
{
feraiseexcept( FE_INVALID );
return dbl( SIGN_BIT | EXP_MASK | QNAN_BIT );
}
inline void normalize( uint64_t *mant, int *exp )
{
#if defined(__GNUC__) || defined(__clang__)
unsigned bits = __builtin_clzll( *mant ) - 11; // clzll, since *mant is 64-bit
*mant <<= bits;
#elif defined(_MSC_VER)
unsigned long bits;
_BitScanReverse64( &bits, *mant );
bits = (bits ^ 63) - 11;
*mant <<= bits;
#else
unsigned bits = 0;
for( ; !(*mant & IMPLICIT_BIT); *mant <<= 1, ++bits );
#endif
*exp -= bits;
}
double myFmodC( double counter, double denominator )
{
uint64_t const
bCounter = bin( counter ),
bDenom = bin( denominator );
uint64_t const sign = bCounter & SIGN_BIT;
if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
// +/-[Inf|QNaN|SNaN] % ... = -QNaN
return NaN();
if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
// +/-x % +/-[Inf|QNan|SNaN]
if( likely(!(bDenom & MANT_MASK)) )
// +/-x % +/-Inf = -/+x
return dbl( sign | bCounter & ~SIGN_BIT );
else
// +/-x % +/-[QNaN|SNaN] = -NaN
return NaN();
int
counterExp = bCounter >> 52 & MAX_EXP,
denomExp = bDenom >> 52 & MAX_EXP;
uint64_t
counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
if( unlikely(!counterExp) )
// counter is denormal
if( likely(!counterMant) )
// counter == +/-0.0
if( likely(denomMant) )
// +/-0.0 % +/-x = -/+0.0
return dbl( sign );
else
// +/-0.0 % +/-0.0 = -QNaN
return NaN();
else
// normalize counter
normalize( &counterMant, &counterExp ),
++counterExp;
if( unlikely(!denomExp) )
// denominator is denormal
if( likely(!denomMant) )
// +/-x % +/-0.0 = -/+QNaN
return NaN();
else
// normalize denominator
normalize( &denomMant, &denomExp ),
++denomExp;
int remExp = counterExp;
uint64_t remMant = counterMant;
for( ; ; )
{
int below = remMant < denomMant;
if( unlikely(remExp - below < denomExp) )
break;
remExp -= below;
remMant <<= below;
if( unlikely(!(remMant -= denomMant)) )
{
remExp = 0;
break;
}
normalize( &remMant, &remExp );
};
if( unlikely(remExp <= 0) )
// denormal result
remMant >>= -remExp + 1,
remExp = 0;
return dbl( sign | (uint64_t)remExp << 52 | remMant & MANT_MASK );
}
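A quick sanity check against the library fmod (a minimal sketch, compiled together with the listing above; the test values are arbitrary, the first pair being the x and 2π mentioned in the question):
#include <cstdio>
#include <cmath>

double myFmodC( double counter, double denominator );   // from the listing above

int main()
{
    const double tests[][2] = {
        { 354638.515, 6.283185307179586 },   // the x and 2*pi from the question
        { -5.0, 3.0 },
        { 1e18, 3.0 }
    };
    for( const auto &t : tests )
        std::printf( "myFmodC(%g, %g) = %.17g, fmod = %.17g\n",
                     t[0], t[1], myFmodC( t[0], t[1] ), std::fmod( t[0], t[1] ) );
    return 0;
}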

APDU Write block commands on mifare card

I have been trying to write some data to my MIFARE Classic cards. First I send these two commands, which return 90 00:
Load Mifare Keys:
FF 82 20 01 06 FF FF FF FF FF FF
Authenticate:
FF 86 00 03 05 01 00 05 60 00
Now I write to sector 0, block 3 and block 4, with these commands:
APDU_WRITE_data_1 : FF D6 00 03 10 <16 data bytes>
APDU_WRITE_data_2 : FF D6 00 04 10 <16 data bytes>
// writedata1 in block 3 ...
//
if (nres == SM_SUCCESS)// &&
//bAPDURes )
{
nlenrcv = sizeof(btRcv);
nlencmd = 0;
btCmd[nlencmd++] = 0xFF; // CLA
btCmd[nlencmd++] = 0xD6; // INS
btCmd[nlencmd++] = 0x00; // P1, Mifare Block Number MSB
btCmd[nlencmd++] = 0x03; // P2, Mifare Block Number LSB
btCmd[nlencmd++] = 16; // Lc, Data Length
memcpy(btCmd + nlencmd, btWrite_1, 16);
nlencmd += 16;
nres = m_Smart.RFTransmit(DEV_INTERNALRF, nlencmd, btCmd, (DWORD*)&nlenrcv, btRcv_1);
// writedata2 in block 4 ...
if (nres == SM_SUCCESS)// &&
//bAPDURes )
{
nlenrcv = sizeof(btRcv);
nlencmd = 0;
btCmd[nlencmd++] = 0xFF; // CLA
btCmd[nlencmd++] = 0xD6; // INS
btCmd[nlencmd++] = 0x00; // P1, Mifare Block Number MSB
btCmd[nlencmd++] = 0x04; // P2, Mifare Block Number LSB
btCmd[nlencmd++] = 16; // Lc, Data Length
memcpy(btCmd + nlencmd, btWrite_2, 16);
nlencmd += 16;
nres = m_Smart.RFTransmit(DEV_INTERNALRF, nlencmd, btCmd, (DWORD*)&nlenrcv, btRcv_2);
}
}
But it doesn't work. Is the APDU for writing wrong?
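For reference, the two command APDUs the code above assembles are laid out below (a sketch of the byte layout only). Two things worth double-checking before suspecting the APDU format itself, offered as hedged observations: on MIFARE Classic, block 3 of a sector is the sector trailer holding the keys and access bits, and the authenticate data field 01 00 05 60 00 appears to address block 05, which belongs to a different sector than block 03.
// UPDATE BINARY header as built above: CLA INS P1 P2 Lc, followed by 16 data bytes
const unsigned char apduWriteBlock3[] = { 0xFF, 0xD6, 0x00, 0x03, 0x10 };  // block 3: sector trailer of sector 0
const unsigned char apduWriteBlock4[] = { 0xFF, 0xD6, 0x00, 0x04, 0x10 };  // block 4: first data block of sector 1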

Add+Mul become slower with Intrinsics - where am I wrong?

Having this array:
alignas(16) double c[voiceSize][blockSize];
This is the function I'm trying to optimize:
inline void Process(int voiceIndex, int blockSize) {
double *pC = c[voiceIndex];
double value = start + step * delta;
double deltaValue = rate * delta;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
pC[sampleIndex] = value + deltaValue * sampleIndex;
}
}
And this is my intrinsics (SSE2) attempt:
inline void Process(int voiceIndex, int blockSize) {
double *pC = c[voiceIndex];
double value = start + step * delta;
double deltaValue = rate * delta;
__m128d value_add = _mm_set1_pd(value);
__m128d deltaValue_mul = _mm_set1_pd(deltaValue);
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2) {
__m128d result_mul = _mm_setr_pd(sampleIndex, sampleIndex + 1);
result_mul = _mm_mul_pd(result_mul, deltaValue_mul);
result_mul = _mm_add_pd(result_mul, value_add);
_mm_store_pd(pC + sampleIndex, result_mul);
}
}
This is slower than the original "scalar" code (even if that is auto-optimized), unfortunately :)
Where's the bottleneck in your opinion? Where am I wrong?
I'm using MSVC, Release/x86, with the /O2 optimization flag (Favor fast code).
EDIT: doing this (suggested by @wim), it seems that performance becomes better than the C version:
inline void Process(int voiceIndex, int blockSize) {
double *pC = c[voiceIndex];
double value = start + step * delta;
double deltaValue = rate * delta;
__m128d value_add = _mm_set1_pd(value);
__m128d deltaValue_mul = _mm_set1_pd(deltaValue);
__m128d sampleIndex_acc = _mm_set_pd(-1.0, -2.0);
__m128d sampleIndex_add = _mm_set1_pd(2.0);
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2) {
sampleIndex_acc = _mm_add_pd(sampleIndex_acc, sampleIndex_add);
__m128d result_mul = _mm_mul_pd(sampleIndex_acc, deltaValue_mul);
result_mul = _mm_add_pd(result_mul, value_add);
_mm_store_pd(pC + sampleIndex, result_mul);
}
}
Why? Is _mm_setr_pd expensive?
Somewhat; it takes at least a shuffle. More importantly in this case, computing each scalar operand is expensive, and as #spectras' answer shows, gcc at least fails to auto-vectorize that into paddd / cvtdq2pd. Instead it re-computes each operand from a scalar integer, doing the int->double conversion separately, then shuffles those together.
This is the function I'm trying to optimize:
You're simply filling an array with a linear function. You're re-multiplying every time inside the loop. That avoids a loop-carried dependency on anything except the integer loop counter, but you run into throughput bottlenecks from doing so much work inside the loop.
i.e. you're computing a[i] = c + i*scale separately for every step. But instead you can strength-reduce that to a[i+n] = a[i] + (n*scale). So you only have one addpd instruction per vector of results.
This will introduce some rounding error that accumulates vs. redoing the computation from scratch, but double is probably overkill for what you're doing anyway.
It also comes at the cost of introducing a serial dependency on an FP add instead of integer. But you already have a loop-carried FP add dependency chain in your "optimized" version that uses sampleIndex_acc = _mm_add_pd(sampleIndex_acc, sampleIndex_add); inside the loop, using FP += 2.0 instead of re-converting from integer.
So you'll want to unroll with multiple vectors to hide that FP latency, and keep at least 3 or 4 FP additions in flight at once. (Haswell: 3 cycle latency, one per clock throughput. Skylake: 4 cycle latency, 2 per clock throughput.) See also Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? for more about unrolling with multiple accumulators for a similar problem with loop-carried dependencies (a dot product).
void Process(int voiceIndex, int blockSize) {
double *pC = c[voiceIndex];
double val0 = start + step * delta;
double deltaValue = rate * delta;
__m128d vdelta2 = _mm_set1_pd(2 * deltaValue);
__m128d vdelta4 = _mm_add_pd(vdelta2, vdelta2);
__m128d v0 = _mm_setr_pd(val0, val0 + deltaValue);
__m128d v1 = _mm_add_pd(v0, vdelta2);
__m128d v2 = _mm_add_pd(v0, vdelta4);
__m128d v3 = _mm_add_pd(v1, vdelta4);
__m128d vdelta8 = _mm_mul_pd(vdelta2, _mm_set1_pd(4.0));
double *endp = pC + blockSize - 7; // stop if there's only room for 7 or fewer doubles
// or use -8 and have your cleanup handle lengths of 1..8
// since the inner loop always calculates results for next iteration
for (; pC < endp ; pC += 8) {
_mm_store_pd(pC, v0);
v0 = _mm_add_pd(v0, vdelta8);
_mm_store_pd(pC+2, v1);
v1 = _mm_add_pd(v1, vdelta8);
_mm_store_pd(pC+4, v2);
v2 = _mm_add_pd(v2, vdelta8);
_mm_store_pd(pC+6, v3);
v3 = _mm_add_pd(v3, vdelta8);
}
// if (blockSize % 8 != 0) ... store final vectors
}
The choice of whether to add or multiply when building up vdelta4 / vdelta8 is not very significant; I tried to avoid too long a dependency chain before the first stores can happen. Since v0 through v3 need to be calculated as well, it seemed to make sense to create a vdelta4 instead of just making a chain of v2 = v1+vdelta2. Maybe it would have been better to create vdelta4 with a multiply from 4.0*delta, and double it to get vdelta8. This could be relevant for very small block size, especially if you cache-block your code by only generating small chunks of this array as needed, right before it will be read.
Anyway, this compiles to a very efficient inner loop with gcc and MSVC (on the Godbolt compiler explorer).
;; MSVC -O2
$LL4@Process: ; do {
movups XMMWORD PTR [rax], xmm5
movups XMMWORD PTR [rax+16], xmm0
movups XMMWORD PTR [rax+32], xmm1
movups XMMWORD PTR [rax+48], xmm2
add rax, 64 ; 00000040H
addpd xmm5, xmm3 ; v0 += vdelta8
addpd xmm0, xmm3 ; v1 += vdelta8
addpd xmm1, xmm3 ; v2 += vdelta8
addpd xmm2, xmm3 ; v3 += vdelta8
cmp rax, rcx
jb SHORT $LL4@Process ; }while(pC < endp)
This has 4 separate dependency chains, through xmm0, 1, 2, and 5. So there's enough instruction-level parallelism to keep 4 addpd instructions in flight. This is more than enough for Haswell, but half of what Skylake can sustain.
Still, with a store throughput of 1 vector per clock, more than 1 addpd per clock isn't useful. In theory this can run at about 16 bytes per clock cycle and saturate store throughput, i.e. 1 vector / 2 doubles per clock.
AVX with wider vectors (4 doubles) could still go at 1 vector per clock on Haswell and later, i.e. 32 bytes per clock. (Assuming the output array is hot in L1d cache or possibly even L2.)
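For illustration, a hedged sketch of the AVX form of the same fill (signature simplified to take the destination pointer directly; assumes -mavx or /arch:AVX and a block size that is a multiple of 4, and is shown without the 4-accumulator unrolling above, so it is latency-bound; unroll it the same way for full speed):
#include <immintrin.h>

// Minimal AVX version of the ramp fill; blockSize assumed to be a multiple of 4.
inline void ProcessAVX(double *pC, int blockSize, double val0, double deltaValue) {
    __m256d v = _mm256_setr_pd(val0, val0 + deltaValue,
                               val0 + 2 * deltaValue, val0 + 3 * deltaValue);
    __m256d vdelta4 = _mm256_set1_pd(4 * deltaValue);
    for (int i = 0; i < blockSize; i += 4) {
        _mm256_storeu_pd(pC + i, v);    // one 32-byte store per iteration
        v = _mm256_add_pd(v, vdelta4);  // serial FP add; use more accumulators
                                        // (as in the SSE2 version above) to hide latency
    }
}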
Even better: don't store this data in memory at all; re-generate on the fly.
Generate it on the fly when it's needed, if the code consuming it only reads it a few times, and is also manually vectorized.
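And a hedged sketch of what "re-generate on the fly" can look like: keep the ramp in registers and apply it directly in the consumer instead of storing it to c[] and reading it back (the gain-applying consumer here is invented purely for illustration; blockSize assumed even and buf assumed 16-byte aligned):
#include <emmintrin.h>

inline void ApplyRampGain(double *buf, int blockSize, double value, double deltaValue) {
    __m128d v   = _mm_setr_pd(value, value + deltaValue); // {v, v + d}
    __m128d vd2 = _mm_set1_pd(2.0 * deltaValue);          // ramp step per vector
    for (int i = 0; i < blockSize; i += 2) {
        __m128d x = _mm_load_pd(buf + i);
        _mm_store_pd(buf + i, _mm_mul_pd(x, v));          // consume the ramp in place
        v = _mm_add_pd(v, vd2);                           // next two ramp values
    }
}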
On my system, compiling with g++ test.cpp -march=native -O2 -c -o test, the normal version's loop body comes out as:
30: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0
34: c5 fb 2a c0 vcvtsi2sd %eax,%xmm0,%xmm0
38: c4 e2 f1 99 c2 vfmadd132sd %xmm2,%xmm1,%xmm0
3d: c5 fb 11 04 c2 vmovsd %xmm0,(%rdx,%rax,8)
42: 48 83 c0 01 add $0x1,%rax
46: 48 39 c8 cmp %rcx,%rax
49: 75 e5 jne 30 <_Z11ProcessAutoii+0x30>
And for the intrinsics version:
88: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0
8c: 8d 50 01 lea 0x1(%rax),%edx
8f: c5 f1 57 c9 vxorpd %xmm1,%xmm1,%xmm1
93: c5 fb 2a c0 vcvtsi2sd %eax,%xmm0,%xmm0
97: c5 f3 2a ca vcvtsi2sd %edx,%xmm1,%xmm1
9b: c5 f9 14 c1 vunpcklpd %xmm1,%xmm0,%xmm0
9f: c4 e2 e9 98 c3 vfmadd132pd %xmm3,%xmm2,%xmm0
a4: c5 f8 29 04 c1 vmovaps %xmm0,(%rcx,%rax,8)
a9: 48 83 c0 02 add $0x2,%rax
ad: 48 39 f0 cmp %rsi,%rax
b0: 75 d6 jne 88 <_Z11ProcessSSE2ii+0x38>
So in short: the compiler automatically generates AVX code from the C version.
Edit, after playing a bit more with flags to restrict both cases to SSE2 only:
g++ test.cpp -msse2 -O2 -c -o test
The compiler still does something different from what you generate with intrinsics. Compiler version:
30: 66 0f ef c0 pxor %xmm0,%xmm0
34: f2 0f 2a c0 cvtsi2sd %eax,%xmm0
38: f2 0f 59 c2 mulsd %xmm2,%xmm0
3c: f2 0f 58 c1 addsd %xmm1,%xmm0
40: f2 0f 11 04 c2 movsd %xmm0,(%rdx,%rax,8)
45: 48 83 c0 01 add $0x1,%rax
49: 48 39 c8 cmp %rcx,%rax
4c: 75 e2 jne 30 <_Z11ProcessAutoii+0x30>
Intrinsics version:
88: 66 0f ef c0 pxor %xmm0,%xmm0
8c: 8d 50 01 lea 0x1(%rax),%edx
8f: 66 0f ef c9 pxor %xmm1,%xmm1
93: f2 0f 2a c0 cvtsi2sd %eax,%xmm0
97: f2 0f 2a ca cvtsi2sd %edx,%xmm1
9b: 66 0f 14 c1 unpcklpd %xmm1,%xmm0
9f: 66 0f 59 c3 mulpd %xmm3,%xmm0
a3: 66 0f 58 c2 addpd %xmm2,%xmm0
a7: 0f 29 04 c1 movaps %xmm0,(%rcx,%rax,8)
ab: 48 83 c0 02 add $0x2,%rax
af: 48 39 f0 cmp %rsi,%rax
b2: 75 d4 jne 88 <_Z11ProcessSSE2ii+0x38>
The compiler does not unroll the loop here. It might be better or worse depending on many things. You might want to benchmark both versions.
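A minimal timing harness for such a benchmark (a sketch; the iteration count and the (0, 256) arguments are placeholders, and ProcessAuto/ProcessSSE2 are assumed to be the two variants whose mangled names appear in the disassembly above):
#include <chrono>
#include <cstdio>

// The two variants under test, assumed to be defined in another translation unit
// so the calls cannot be optimized away.
void ProcessAuto(int voiceIndex, int blockSize);
void ProcessSSE2(int voiceIndex, int blockSize);

template <class F>
static double millis(F f, int iters) {
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i)
        f();
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

int main() {
    const int iters = 200000;
    std::printf("auto: %.1f ms\n", millis([] { ProcessAuto(0, 256); }, iters));
    std::printf("sse2: %.1f ms\n", millis([] { ProcessSSE2(0, 256); }, iters));
}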

C++ Performance: Two shorts vs int bit operations

Which of these options has the best performance:
Use two shorts for two separate pieces of information, or use one int and bit operations to retrieve each half?
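To make the two options concrete (a sketch; the helper names are invented, and the shift and mask match the ones used in the code below):
// Option 1: two separate shorts
struct PairShorts { short first; short second; };

// Option 2: one int, packed/unpacked with bit operations
inline int pack(short hi, short lo) {
    // assemble in unsigned arithmetic to avoid left-shifting a negative value
    return static_cast<int>((static_cast<unsigned>(static_cast<unsigned short>(hi)) << 16)
                            | static_cast<unsigned short>(lo));
}
inline short unpackHi(int v) { return static_cast<short>(v >> 16); }
inline short unpackLo(int v) { return static_cast<short>(v & 0xFFFF); }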
This may vary depending on architecture and compiler, but generally using one int plus bit operations will perform slightly worse. The difference is so minimal, though, that I have never yet had to write code needing that level of optimization; I rely on the compiler to do these kinds of optimizations for me.
Now let us check the C++ code below, which simulates the behaviour:
int main()
{
int x = 100;
short a = 255;
short b = 127;
short p = x >> 16;
short q = x & 0xffff;
short y = a;
short z = b;
return 0;
}
The corresponding assembly code on an x86_64 system (from GNU g++) is shown below:
00000000004004ed <main>:
int main()
{
4004ed: 55 push %rbp
4004ee: 48 89 e5 mov %rsp,%rbp
int x = 100;
4004f1: c7 45 fc 64 00 00 00 movl $0x64,-0x4(%rbp)
short a = 255;
4004f8: 66 c7 45 f0 ff 00 movw $0xff,-0x10(%rbp)
short b = 127;
4004fe: 66 c7 45 f2 7f 00 movw $0x7f,-0xe(%rbp)
short p = x >> 16;
400504: 8b 45 fc mov -0x4(%rbp),%eax
400507: c1 f8 10 sar $0x10,%eax
40050a: 66 89 45 f4 mov %ax,-0xc(%rbp)
short q = x & 0xffff;
40050e: 8b 45 fc mov -0x4(%rbp),%eax
400511: 66 89 45 f6 mov %ax,-0xa(%rbp)
short y = a;
400515: 0f b7 45 f0 movzwl -0x10(%rbp),%eax
400519: 66 89 45 f8 mov %ax,-0x8(%rbp)
short z = b;
40051d: 0f b7 45 f2 movzwl -0xe(%rbp),%eax
400521: 66 89 45 fa mov %ax,-0x6(%rbp)
return 0;
400525: b8 00 00 00 00 mov $0x0,%eax
}
As we can see, "short p = x >> 16" is the slowest, since it needs an extra (relatively expensive) right-shift operation, while all the other assignments are equal in terms of cost.

Does compiler optimize away intermediaries [closed]

I am trying to optimize an exaggerated version of the following code.
int a = startvalue1;
int b = startvalue2;
int c = startvalue3;
int ta = a + b;
int tb = b + c;
int tc = c + a;
int tta = ta * tb;
int ttb = tb * tc;
int ttc = tc * ta;
int finalvalue1 = tta - ttc;
int finalvalue2 = ttb - ttc;
int finalvalue3 = ttc + tta;
Will the compiler automatically get rid of the intermediaries? If not, will my code run faster if I get rid of them?
I have profiled my program and I need to optimize my discrete Fourier transform. But getting rid of the intermediaries would be tedious, and I'd like to avoid it if the compiler is going to do it for me.
You need to look at the disassembly output for your compiler while compiling with optimizations on.
If you use Visual Studio, set a breakpoint and press Ctrl+F11.
If you use gcc, use the -S flag to output the assembly.
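For example (file name made up):
g++ -O2 -S dft.cpp -o dft.s
cl /O2 /FA dft.cpp
The first writes the optimized assembly to dft.s; the second asks MSVC for an assembly listing (dft.asm) via /FA.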
On my system, Visual Studio 2012 removes all of the intermediate steps.
Here is the test program I wrote:
#include <iostream>
int main (void)
{
int startvalue1 = 10 ;
int startvalue2 = 15 ;
int startvalue3 = 20 ;
int a = startvalue1;
int b = startvalue2;
int c = startvalue3;
int ta = a + b;
int tb = b + c;
int tc = c + a;
int tta = ta * tb;
int ttb = tb * tc;
int ttc = tc * ta;
int finalvalue1 = tta - ttc;
int finalvalue2 = ttb - ttc;
int finalvalue3 = ttc + tta;
// This line is only here make sure everything isn't optimized out!
std::cout << finalvalue1 << finalvalue2 << finalvalue3 ;
return 0 ;
}
Here is the optimized assembly:
01291270 8B 0D 30 30 29 01 mov ecx,dword ptr ds:[1293030h]
01291276 68 59 06 00 00 push 659h
0129127B 68 2C 01 00 00 push 12Ch
01291280 6A 7D push 7Dh
01291282 FF 15 38 30 29 01 call dword ptr ds:[1293038h]
01291288 8B C8 mov ecx,eax
0129128A FF 15 38 30 29 01 call dword ptr ds:[1293038h]
01291290 8B C8 mov ecx,eax
01291292 FF 15 38 30 29 01 call dword ptr ds:[1293038h]
01291298 33 C0 xor eax,eax
0129129A C3 ret
This is roughly equivalent to:
#include <iostream>
int main (void)
{
std::cout << 125 << 300 << 1625 ;
return 0 ;
}