C++ Optimize Memory Read Speed - c++

I'm creating an int (32 bit) vector with 1024 * 1024 * 1024 elements like so:
std::vector<int> nums;
for (size_t i = 0; i < 1024 * 1024 * 1024; i++) {
nums.push_back(rand() % 1024);
}
which holds 4 GB of random data at that point. And then I'm simply summing up all the elements in the vector like so:
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn++) {
total += *cn;
}
This takes about ~0.18 seconds which means the data is processed at around 22.2 GB/s. I'm running this on an M1 with a much higher memory bandwidth of about 60GB/s. Is there a way to make the above code run faster on a single core?
EDIT:
Manual SIMD version:
int32x4_t simd_total = vmovq_n_s32(0);
for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) {
const int32_t v[4] = {cn[0], cn[1], cn[2], cn[3]}
simd_total = vaddq_s32(simd_total, vld1q_s32(v));
}
return vaddvq_s32(simd_total);
The SIMD version has the same performance as the non-manual-SIMD version.
EDIT 2:
Alright, so I changed the vector elements to uint32_t and also changed the result type to uint32_t(as suggested by #Peter Cordes):
uint32_t sum_ints_32(const std::vector<uint32_t>& nums) {
uint32_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn++) {
total += *cn;
}
return total;
}
This runs much faster (~45 GB/s). This is the disassembly:
0000000100002218 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100002218: a940200c ldp x12, x8, [x0]
10000221c: eb08019f cmp x12, x8
100002220: 54000102 b.cs 100002240 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28> // b.hs, b.nlast
100002224: aa2c03e9 mvn x9, x12
100002228: 8b090109 add x9, x8, x9
10000222c: f1006d3f cmp x9, #0x1b
100002230: 540000c8 b.hi 100002248 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30> // b.pmore
100002234: 52800000 mov w0, #0x0 // #0
100002238: aa0c03e9 mov x9, x12
10000223c: 14000016 b 100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c>
100002240: 52800000 mov w0, #0x0 // #0
100002244: d65f03c0 ret
100002248: d342fd29 lsr x9, x9, #2
10000224c: 9100052a add x10, x9, #0x1
100002250: 927ded4b and x11, x10, #0x7ffffffffffffff8
100002254: 8b0b0989 add x9, x12, x11, lsl #2
100002258: 9100418c add x12, x12, #0x10
10000225c: 6f00e400 movi v0.2d, #0x0
100002260: aa0b03ed mov x13, x11
100002264: 6f00e401 movi v1.2d, #0x0
100002268: ad7f8d82 ldp q2, q3, [x12, #-16]
10000226c: 4ea08440 add v0.4s, v2.4s, v0.4s
100002270: 4ea18461 add v1.4s, v3.4s, v1.4s
100002274: 9100818c add x12, x12, #0x20
100002278: f10021ad subs x13, x13, #0x8
10000227c: 54ffff61 b.ne 100002268 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x50> // b.any
100002280: 4ea08420 add v0.4s, v1.4s, v0.4s
100002284: 4eb1b800 addv s0, v0.4s
100002288: 1e260000 fmov w0, s0
10000228c: eb0b015f cmp x10, x11
100002290: 540000a0 b.eq 1000022a4 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x8c> // b.none
100002294: b840452a ldr w10, [x9], #4
100002298: 0b000140 add w0, w10, w0
10000229c: eb08013f cmp x9, x8
1000022a0: 54ffffa3 b.cc 100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c> // b.lo, b.ul, b.last
1000022a4: d65f03c0 ret
I also rewrote the Manual-SIMD version:
uint32_t sum_ints_simd_2(const std::vector<uint32_t>& nums) {
uint32x4_t simd_total = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
simd_total = vaddq_u32(simd_total, vld1q_u32(v));
}
return vaddvq_u32(simd_total);
}
which still runs 2x slower than the non-manual-SIMD version and results in the following disassembly:
0000000100002464 <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100002464: a9402408 ldp x8, x9, [x0]
100002468: d1003129 sub x9, x9, #0xc
10000246c: 6f00e400 movi v0.2d, #0x0
100002470: eb09011f cmp x8, x9
100002474: 540000c2 b.cs 10000248c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28> // b.hs, b.nlast
100002478: 6f00e400 movi v0.2d, #0x0
10000247c: 3cc10501 ldr q1, [x8], #16
100002480: 4ea08420 add v0.4s, v1.4s, v0.4s
100002484: eb09011f cmp x8, x9
100002488: 54ffffa3 b.cc 10000247c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x18> // b.lo, b.ul, b.last
10000248c: 4eb1b800 addv s0, v0.4s
100002490: 1e260000 fmov w0, s0
100002494: d65f03c0 ret
To reach the same speed as the auto-vectorized version, we can use a uint32x4x2 instead of uint32x4 for our manual-SIMD version:
uint32_t sum_ints_simd_3(const std::vector<uint32_t>& nums) {
uint32x4x2_t simd_total;
simd_total.val[0] = vmovq_n_u32(0);
simd_total.val[1] = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-7; cn +=8) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
}
return vaddvq_u32(simd_total.val[0]) + vaddvq_u32(simd_total.val[1]);
}
And to gain even more speed we can leverage uint32x4x4 (which gets us about ~53 GB/s):
uint32_t sum_ints_simd_4(const std::vector<uint32_t>& nums) {
uint32x4x4_t simd_total;
simd_total.val[0] = vmovq_n_u32(0);
simd_total.val[1] = vmovq_n_u32(0);
simd_total.val[2] = vmovq_n_u32(0);
simd_total.val[3] = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-15; cn +=16) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
const uint32_t v3[4] = { cn[8], cn[9], cn[10], cn[11] };
const uint32_t v4[4] = { cn[12], cn[13], cn[14], cn[15] };
simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
simd_total.val[2] = vaddq_u32(simd_total.val[2], vld1q_u32(v3));
simd_total.val[3] = vaddq_u32(simd_total.val[3], vld1q_u32(v4));
}
return vaddvq_u32(simd_total.val[0])
+ vaddvq_u32(simd_total.val[1])
+ vaddvq_u32(simd_total.val[2])
+ vaddvq_u32(simd_total.val[3]);
}
which gets us the following disassembly:
0000000100005e34 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100005e34: a9402408 ldp x8, x9, [x0]
100005e38: d100f129 sub x9, x9, #0x3c
100005e3c: 6f00e403 movi v3.2d, #0x0
100005e40: 6f00e402 movi v2.2d, #0x0
100005e44: 6f00e401 movi v1.2d, #0x0
100005e48: 6f00e400 movi v0.2d, #0x0
100005e4c: eb09011f cmp x8, x9
100005e50: 540001c2 b.cs 100005e88 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x54> // b.hs, b.nlast
100005e54: 6f00e400 movi v0.2d, #0x0
100005e58: 6f00e401 movi v1.2d, #0x0
100005e5c: 6f00e402 movi v2.2d, #0x0
100005e60: 6f00e403 movi v3.2d, #0x0
100005e64: ad401504 ldp q4, q5, [x8]
100005e68: ad411d06 ldp q6, q7, [x8, #32]
100005e6c: 4ea38483 add v3.4s, v4.4s, v3.4s
100005e70: 4ea284a2 add v2.4s, v5.4s, v2.4s
100005e74: 4ea184c1 add v1.4s, v6.4s, v1.4s
100005e78: 4ea084e0 add v0.4s, v7.4s, v0.4s
100005e7c: 91010108 add x8, x8, #0x40
100005e80: eb09011f cmp x8, x9
100005e84: 54ffff03 b.cc 100005e64 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30> // b.lo, b.ul, b.last
100005e88: 4eb1b863 addv s3, v3.4s
100005e8c: 1e260068 fmov w8, s3
100005e90: 4eb1b842 addv s2, v2.4s
100005e94: 1e260049 fmov w9, s2
100005e98: 0b080128 add w8, w9, w8
100005e9c: 4eb1b821 addv s1, v1.4s
100005ea0: 1e260029 fmov w9, s1
100005ea4: 0b090108 add w8, w8, w9
100005ea8: 4eb1b800 addv s0, v0.4s
100005eac: 1e260009 fmov w9, s0
100005eb0: 0b090100 add w0, w8, w9
100005eb4: d65f03c0 ret
Crazy stuff

Does -march=native help? IDK if there are any SIMD features that Apple clang won't already take advantage on the first generation of AArch64 MacOS CPUs, but clang might just be taking baseline AArch64 in general.
Can you go faster if you use uint32_t sums, so the compiler doesn't have to widen each element before adding? That means each SIMD instruction can only handle half as much data from memory as with same-sized accumulators.
https://godbolt.org/z/7c19913jE shows that Thomas Matthews' unrolling suggestion does actually get clang11 -O3 -march=apple-a13 to unroll the SIMD-vectorized asm loops it makes. That source change is not a win in general, e.g. much worse for x86-64 clang -O3 -march=haswell, but it does help here.
Another possibility is that a single core can't saturate memory bandwidth. But benchmark results published by Anandtech for example seem to rule that out: they found that even a single core can achieve 59GB/s, although that was probably running an optimize memcpy function.
(They say The fact that a single Firestorm core can almost saturate the memory controllers is astounding and something we’ve never seen in a design before. That sounds a bit weird; desktop / laptop Intel CPUs come pretty close, unlike their "server" chips. Maybe not as close as Apple?
M1 has pretty low memory latency compared to modern x86, so that probably helps a single core be able to track the incoming loads to keep the necessary latency x bandwidth product in flight, even with its high memory bandwidth.

Here are some techniques.
Loop Unrolling
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
total += cn[0];
total += cn[1];
total += cn[2];
total += cn[3];
}
Register Prefetch
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
const uint64 n0 = cn[0];
const uint64 n1 = cn[1];
const uint64 n2 = cn[2];
const uint64 n3 = cn[3];
total += n0;
total += n1;
total += n2;
total += n3;
}
You should print the assembly language for each of these at high optimization level and compare them.
Also, your processor may have some specialized instructions that you could. For example, the ARM processor can load multiple registers from memory with one instruction.
Also, look up SIMD instructions or search the internet for "C++ SIMD read memory".
I've argued with compilers (on embedded systems) and found out that the compiler's optimization strategies may be better or equal to instruction specialization or other techniques (timings were performed using Test Points and oscilloscope).
You'll have to remember that your task, on a one core machine, will most likely be swapped out more often that with a system with multiple cores or a specialized (embedded) system.

Consider precalculating as much as you can and using built-in STL functions, this will lead to as much optimal code as possible before trying SIMD or assembly approaches. If it's still too slow, then try the SIMD/assembly versions:
Avoid calling push_back on unreserved std::vectors: this causes the system to allocate more space when the capacity limit is reached. Since you know the size of the array before hand, reserve the space ahead of time: (for non-built-in types, consider emplace_back as well).
Additionally, the STL functions can reduce the boilerplate code down to two function calls.
Also, avoid rand().
const std::size_t GB = 1024 * 1024 * 1024;
std::vector<int> nums(4 * GB);
std::generate(std::begin(nums), std::end(nums), [](){ return rand() % 1024; });
//...
const auto sum = std::accumulate(std::begin(nums), std::end(nums), 0);

Related

Assembly Code only returns two of possible four values in comparison

I am currently writing a Program that requires C++ and Arm Assembly language to be implemented. The C++ code uses functions from the assembly file to perform particular tasks on a STM32F4discovery board touchscreen. One of the particular functions is a function that returns what quadrant the user is touching currently, and we had to write that function in assembly. No matter what I do, this function always returns quadrant 1 or 3. I think I am having an issue with branching properly, as this is one of my weak points, but I do not know for sure. Here is the function in context of the C++ code:
//all of this is above main
extern "C" uint32_t getQuad(uint16_t, uint16_t);
TS_DISCO_F429ZI ts; //touch screen sensing object
TS_StateTypeDef TS_State;
//other code here
ts.GetState(&TS_State);
uint16_t x = TS_State.X;
uint16_t y = TS_State.Y;
uint32_t quad = getQuad(x, y);
And here is the function in the assembly file
AREA Program3_F20_Gibbs, CODE, READONLY
GLOBAL getQuad
; uint32_t getQuad ( uint16_t x, uint16_t y)
getQuad LDR R0, [R0]
LDR R1, [R1]
CMP R0, #120
BLE TwoThree
CMP R1, #160
BLE Four
MOV R0, #1
BX LR
TwoThree CMP R1, #160
BLE Three
MOV R0, #2
BX LR
Four MOV R0, #4
BX LR
Three MOV R0, #3
BX LR
I should probably also note that this quadrant system follows an x/y coordinate system with a bit of difference. Quadrant one has x/y coordinates with x > 120 and y > 160, Quadrant two has x/y coordinates with x < 120 and y > 160, Quadrant three has x/y coordinates with x < 120 and y < 160, and Quadrant four has x/y coordinates with x > 120 and y < 160. If I need to give any more info, please let me know, and thank you for any help in advance!
You can get away without any (conditional) branch:
getQuad
cmp r0, #120
adr r12, %f1
addls r12, r12, #2
cmp r1, #160
addls r12, r12, #1
ldrb r0, [r12]
bx lr
1
dcb 1, 4, 2, 3
You really don't need to write it in assembly:
static inline uint32_t getQuad(uint16_t x, uint16_t y)
{
static const uint8_t array[4] = {1, 4, 2, 3};
uint8_t *pSrc = array;
if (x <= 120) pSrc += 2;
if (y <= 160) pSrc += 1;
return (uint32_t) *pSrc;
}
It's a very short function, hence you better inline it, and put it in the header file.
The pesky function call overhead is gone then.
PS: le (less of equal) is for signed value. You should us ls (lower or same) instead. le works in this case, but if you are dealing with 32bit values, it could cause disastrous problems that are hard to debug.
Your logic seems sound except for the fact there should be some <= type operations in the text:
Quadrant one has x/y coordinates with x > 120 and y > 160, Quadrant two has x/y coordinates with x <= 120 and y > 160, Quadrant three has x/y coordinates with x <= 120 and y <= 160, and Quadrant four has x/y coordinates with x > 120 and y <= 160.
However, unless the calling convention is very strange, you probably don't want to be doing this at the start:
LDR R0, [R0]
LDR R1, [R1]
This is what you do if you're passing the addresses of variables and you want to dereference those addresses to get the values. Therefore you will be using the wrong values to figure out which quadrant you're in.

Efficiently compute absolute values of std::complex<float> vector with AVX2

For some real-time DSP application I need to compute the absolute values of a complex valued vector.
The straightforward implementation would look like that
computeAbsolute (std::complex<float>* complexSourceVec,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; ++i)
realValuedDestinationVec[i] = std::abs (complexSourceVec[i]);
}
I want to replace this implementation with an AVX2 optimized version, based on AVX2 instrincts. What would be the most efficient way to implement it that way?
Note: The source data is handed to me by an API I have no access to, so there is no chance to change the layout of the complex input vector for better efficiency.
Inspired by the answer of Dan M. I first implemented his version with some tweaks:
First changed it to use the wider 256 Bit registers, then marked the temporary re and im arrays with __attribute__((aligned (32))) to be able to use aligned load
void computeAbsolute1 (const std::complex<float>* cplxIn, float* absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
float re[8] __attribute__((aligned (32))) = {cplxIn[i].real(), cplxIn[i + 1].real(), cplxIn[i + 2].real(), cplxIn[i + 3].real(), cplxIn[i + 4].real(), cplxIn[i + 5].real(), cplxIn[i + 6].real(), cplxIn[i + 7].real()};
float im[8] __attribute__((aligned (32))) = {cplxIn[i].imag(), cplxIn[i + 1].imag(), cplxIn[i + 2].imag(), cplxIn[i + 3].imag(), cplxIn[i + 4].imag(), cplxIn[i + 5].imag(), cplxIn[i + 6].imag(), cplxIn[i + 7].imag()};
__m256 x4 = _mm256_load_ps (re);
__m256 y4 = _mm256_load_ps (im);
__m256 b4 = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (x4,x4), _mm256_mul_ps (y4,y4)));
_mm256_storeu_ps (absOut + i, b4);
}
}
However manually shuffling the values this way seemed like a task that could be speeded up somehow. Now this is the solution I came up with, that runs 2 - 3 times faster in a quick test compiled by clang with full optimization:
#include <complex>
#include <immintrin.h>
void computeAbsolute2 (const std::complex<float>* __restrict cplxIn, float* __restrict absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
// load 8 complex values (--> 16 floats overall) into two SIMD registers
__m256 inLo = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i ));
__m256 inHi = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i + 4));
// seperates the real and imaginary part, however values are in a wrong order
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (2, 0, 2, 0));
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (3, 1, 3, 1));
// do the heavy work on the unordered vectors
__m256 abs = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (re, re), _mm256_mul_ps (im, im)));
// reorder values prior to storing
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3,1,2,0));
_mm256_storeu_ps (absOut + i, _mm256_castpd_ps(ordered));
}
}
I think I'll go with that implementation if no one comes up with a faster solution
This compiles efficiently with gcc and clang (on the Godbolt compiler explorer).
It's really hard (if possible) to write "highly optimized AVX2" version of complex abs since the way complex numbers are defined in the standard prevents (specifically due to all inf/nan corner cases) a lot of optimization.
However, if you don't care about the correctness you can just use -ffast-math and some compilers would optimize the code for you. See gcc output: https://godbolt.org/z/QbZlBI
You can also take this output and create your own abs function with inline assembly.
But yes, as was already mentioned, if you really need performance, you probably want to swap std::complex for something else.
I was able to get a decent output for your specific case with all the required shuffles by manually filling small re and im arrays. See: https://godbolt.org/z/sWAAXo
This could be trivially extended for ymm registers.
Anyway, here is the ultimate solution adapted from this SO answer which uses intrinsics in combination with clever compiler optimizations:
#include <complex>
#include <cassert>
#include <immintrin.h>
static inline void cabs_soa4(const float *re, const float *im, float *b) {
__m128 x4 = _mm_loadu_ps(re);
__m128 y4 = _mm_loadu_ps(im);
__m128 b4 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x4,x4), _mm_mul_ps(y4,y4)));
_mm_storeu_ps(b, b4);
}
void computeAbsolute (const std::complex<float>* src,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; i += 4) {
float re[4] = {src[i].real(), src[i + 1].real(), src[i + 2].real(), src[i + 3].real()};
float im[4] = {src[i].imag(), src[i + 1].imag(), src[i + 2].imag(), src[i + 3].imag()};
cabs_soa4(re, im, realValuedDestinationVec);
}
}
which compiles to simple
_Z15computeAbsolutePKSt7complexIfEPfi:
test edx, edx
jle .L5
lea eax, [rdx-1]
shr eax, 2
sal rax, 5
lea rax, [rdi+32+rax]
.L3:
vmovups xmm0, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdi+16]
add rdi, 32
vshufps xmm1, xmm0, xmm2, 136
vmulps xmm1, xmm1, xmm1
vshufps xmm0, xmm0, xmm2, 221
vfmadd132ps xmm0, xmm1, xmm0
vsqrtps xmm0, xmm0
vmovups XMMWORD PTR [rsi], xmm0
cmp rax, rdi
jne .L3
.L5:
ret
https://godbolt.org/z/Yu64Wg

best way to shuffle across AVX lanes?

There are questions with similar titles, but my question relates to one very specific use case not covered elsewhere.
I have 4 __128d registers (x0, x1, x2, x3) and I want to recombine their content in 5 __256d registers (y0, y1, y2, y3, y4) as follows, in preparation of other calculations:
on entry:
x0 contains {a0, a1}
x1 contains {a2, a3}
x2 contains {a4, a5}
x3 contains {a6, a7}
on exit:
y0 contains {a0, a1, a2, a3}
y1 contains {a1, a2, a3, a4}
y2 contains {a2, a3, a4, a5}
y3 contains {a3, a4, a5, a6}
y4 contains {a4, a5, a6, a7}
My implementation here below is quite slow. Is there a better way?
y0 = _mm256_set_m128d(x1, x0);
__m128d lo = _mm_shuffle_pd(x0, x1, 1);
__m128d hi = _mm_shuffle_pd(x1, x2, 1);
y1 = _mm256_set_m128d(hi, lo);
y2 = _mm256_set_m128d(x2, x1);
lo = hi;
hi = _mm_shuffle_pd(x2, x3, 1);
y3 = _mm256_set_m128d(hi, lo);
y4 = _mm256_set_m128d(x3, x2);
With inputs in registers, you can do it in 5 shuffle instructions:
3x vinsertf128 to create y0, y2, and y4 by concatenating 2 xmm registers each.
2x vshufpd (in-lane shuffles) between those results to create y1 and y3.
Notice that the low lanes of y0 and y2 contain a1 and a2, the elements needed for the low lane of y1. And the same shuffle also works for the high lane.
#include <immintrin.h>
void merge(__m128d x0, __m128d x1, __m128d x2, __m128d x3,
__m256d *__restrict y0, __m256d *__restrict y1,
__m256d *__restrict y2, __m256d *__restrict y3, __m256d *__restrict y4)
{
*y0 = _mm256_set_m128d(x1, x0);
*y2 = _mm256_set_m128d(x2, x1);
*y4 = _mm256_set_m128d(x3, x2);
// take the high element from the first vector, low element from the 2nd.
*y1 = _mm256_shuffle_pd(*y0, *y2, 0b0101);
*y3 = _mm256_shuffle_pd(*y2, *y4, 0b0101);
}
Compiles pretty nicely (with gcc and clang -O3 -march=haswell on Godbolt) to:
merge(double __vector(2), double __vector(2), double __vector(2), double __vector(2), double __vector(4)*, double __vector(4)*, double __vector(4)*, double __vector(4)*, double __vector(4)*):
vinsertf128 ymm0, ymm0, xmm1, 0x1
vinsertf128 ymm3, ymm2, xmm3, 0x1
vinsertf128 ymm1, ymm1, xmm2, 0x1
# vmovapd YMMWORD PTR [rdi], ymm0
vshufpd ymm0, ymm0, ymm1, 5
# vmovapd YMMWORD PTR [rdx], ymm1
vshufpd ymm1, ymm1, ymm3, 5
# vmovapd YMMWORD PTR [r8], ymm3
# vmovapd YMMWORD PTR [rsi], ymm0
# vmovapd YMMWORD PTR [rcx], ymm1
# vzeroupper
# ret
I commented out the stores and stuff that would go away on inlining, so we really do just have the 5 shuffle instructions, vs. 9 shuffle instructions for the code in your question. (Also included in the Godbolt compiler explorer link).
This is very good on AMD, where vinsertf128 is super-cheap (because 256-bit registers are implemented as 2x 128-bit halves, so it's just a 128-bit copy without needing a special shuffle port.) 256-bit lane-crossing shuffles are slow on AMD, but in-lane 256-bit shuffles like vshufpd is just 2 uops.
On Intel it's pretty good, but mainstream Intel CPUs with AVX only have 1 per clock shuffle throughput for 256-bit or FP shuffles. (Sandybridge and earlier have more throughput for integer 128-bit shuffles, but AVX2 CPUs dropped the extra shuffle units, and they didn't help anyway for this.)
So Intel CPUs can't exploit the instruction-level parallelism at all, but it's only 5 uops total which is nice. That's the minimum possible, because you need 5 results.
But especially if the surrounding code also bottlenecks on shuffles, it's worth considering a store/reload strategy with just 4 stores and 5 overlapping vector loads. Or maybe 2x vinsertf128 to construct y0 and y4, then 2x 256-bit stores + 3 overlapping reloads. That could let out-of-order exec get started on dependent instructions using just y0 or y4 while the store-forwarding stall resolved for y1..3.
Especially if you don't care much about Intel first-gen Sandybridge where unaligned 256-bit vector loads are less efficient. (Note that you'd want to compile with gcc -mtune=haswell to turn off the -mavx256-split-unaligned-load default / sandybridge tuning, if you're using GCC. Regardless of the compiler, -march=native is a good idea if making binaries to run on the machine where you compile it, to take full advantage of instruction sets and set tuning options.)
But if total uop throughput from the front-end is more where the bottleneck lies, then the shuffle implementation is best.
(See https://agner.org/optimize/ and other performance links in the x86 tag wiki for more about performance tuning. Also What considerations go into predicting latency for operations on modern superscalar processors and how can I calculate them by hand?, but really Agner Fog's guide is a more in-depth guide that explains what throughput vs. latency is actually about.)
I do not even need to save, as data is also already available in contiguous memory.
Then simply loading it with 5 overlapping loads is almost certainly the most efficient thing you could do.
Haswell can do 2 loads per clock from L1d, or less when any cross a cache-line boundary. So if you can align your block by 64, it's perfectly efficient with no cache-line-splits at all. Cache misses are slow, but reloading hot data from L1d cache is very cheap, and modern CPUs with AVX support generally have efficient unaligned-load support.
(Like I said earlier, if using gcc make sure you compile with -march=haswell or -mtune=haswell, not just -mavx, to avoid gcc's -mavx256-split-unaligned-load.)
4 loads + 1 vshufpd (y0, y2) might be a good way to balance load port pressure with ALU pressure, depending on bottlenecks in the surrounding code. Or even 3 loads + 2 shuffles, if the surrounding code is low on shuffle port pressure.
they are in registers from previous calculations which required them to be loaded.
If that previous calculation still has the source data in registers, you could have done 256-bit loads in the first place and just used their 128-bit low halves for the earlier calc. (An XMM register is the low 128 of the corresponding YMM register, and reading them doesn't disturb the upper lanes, so _mm256_castpd256_pd128 compiles to zero asm instructions.)
Do 256-bit loads for y0,y2, and y4, and use their low halves as x0, x1, and x2. (Construct y1 and y3 later with unaligned loads or shuffles).
Only x3 isn't already the low 128 bits of a 256-bit vector you also want.
Ideally a compiler would already notice this optimization when you do a _mm_loadu_pd and a _mm256_loadu_pd from the same address, but probably you need to hand-hold it by doing
__m256d y0 = _mm256_loadu_pd(base);
__m128d x0 = _mm256_castpd256_pd128(y0);
and so on, and either an extract ALU intrinsic (_mm256_extractf128_pd) or a 128-bit load for x3, depending on the surrounding code. If it's only needed once, letting it fold into a memory operand for whatever instruction uses it might be best.
Potential downside: slightly higher latency before the 128-bit calculation can start, or several cycles if the 256-bit loads were cache-line crossing where 128-bit loads weren't. But if your block of data is aligned by 64 bytes, this won't happen.

Fastest way to perform AVX inner product operations with mixed (float, double) input vectors

I need to build a single-precision floating-point inner product routine for mixed single/double-precision floating-point vectors, exploiting the AVX instruction set for SIMD registers with 256 bits.
Problem: one input vector is float (x), while the other is double (yD).
Hence, before to compute the true inner product operations, I need to convert my input yD vector data from double to float.
Using the SSE2 instruction set, I was able to implement a very fast code doing what I needed, and with speed performances very close to the case when both vectors x and y were float:
void vector_operation(const size_t i)
{
__m128 X = _mm_load_ps(x + i);
__m128 Y = _mm_movelh_ps(_mm_cvtpd_ps(_mm_load_pd(yD + i + 0)), _mm_cvtpd_ps(_mm_load_pd(yD + i + 2)));
//inner-products accumulation
res = _mm_add_ps(res, _mm_mul_ps(X, Y));
}
Now, with the hope to further speed-up, I implemented a correpsonding version with AVX instruction set:
inline void vector_operation(const size_t i)
{
__m256 X = _mm256_load_ps(x + i);
__m128 yD1 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 0));
__m128 yD2 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 2));
__m128 yD3 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 4));
__m128 yD4 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 6));
__m128 Ylow = _mm_movelh_ps(yD1, yD2);
__m128 Yhigh = _mm_movelh_ps(yD3, yD4);
//Pack __m128 data inside __m256
__m256 Y = _mm256_permute2f128_ps(_mm256_castps128_ps256(Ylow), _mm256_castps128_ps256(Yhigh), 0x20);
//inner-products accumulation
res = _mm256_add_ps(res, _mm256_mul_ps(X, Y));
}
I also tested other AVX implementations using, for example, casting and insertion operations instead of perfmuting data. Performances were comparably poor compared to the case where both x and y vectors were float.
The problem with the AVX code is that no matter how I implemented it, its performance is by far inferior to the ones achieved by using only float x and y vectors (i.e. no double-float conversion is needed).
The conversion from double to float for the yD vector seems pretty fast, while a lot of time is lost in the line where data is inserted in the _m256 Y register.
Do you know if this is a well-known issue with AVX?
Do you have a solution that could preserve good performances?
Thanks in advance!
I rewrote your function and took better advantage of what AVX has to offer. I also used fused multiply-add at the end; if you can't use FMA, just replace that line with addition and multiplication. I only now see that I wrote an implementation that uses unaligned loads and yours uses aligned loads, but I'm not gonna lose any sleep over it. :)
__m256 foo(float*x, double* yD, const size_t i, __m256 res_prev)
{
__m256 X = _mm256_loadu_ps(x + i);
__m128 yD21 = _mm256_cvtpd_ps(_mm256_loadu_pd(yD + i + 0));
__m128 yD43 = _mm256_cvtpd_ps(_mm256_loadu_pd(yD + i + 4));
__m256 Y = _mm256_set_m128(yD43, yD21);
return _mm256_fmadd_ps(X, Y, res_prev);
}
I did a quick benhmark and compared running times of your and my implementation. I tried two different benchmark approaches with several repetitions and every time my code was around 15% faster. I used MSVC 14.1 compiler and compiled the program with /O2 and /arch:AVX2 flags.
EDIT: this is the disassembly of the function:
vcvtpd2ps xmm3,ymmword ptr [rdx+r8*8+20h]
vcvtpd2ps xmm2,ymmword ptr [rdx+r8*8]
vmovups ymm0,ymmword ptr [rcx+r8*4]
vinsertf128 ymm3,ymm2,xmm3,1
vfmadd213ps ymm0,ymm3,ymmword ptr [r9]
EDIT 2: this is the disassembly of your AVX implementation of the same algorithm:
vcvtpd2ps xmm0,xmmword ptr [rdx+r8*8+30h]
vcvtpd2ps xmm1,xmmword ptr [rdx+r8*8+20h]
vmovlhps xmm3,xmm1,xmm0
vcvtpd2ps xmm0,xmmword ptr [rdx+r8*8+10h]
vcvtpd2ps xmm1,xmmword ptr [rdx+r8*8]
vmovlhps xmm2,xmm1,xmm0
vperm2f128 ymm3,ymm2,ymm3,20h
vmulps ymm0,ymm3,ymmword ptr [rcx+r8*4]
vaddps ymm0,ymm0,ymmword ptr [r9]

Fast weighted mean & variance of 10 bins

I would like to speed up a part of my code but I don't think there is a possible better way to do the following calculation:
float invSum = 1.0f / float(sum);
for (int i = 0; i < numBins; ++i)
{
histVec[i] *= invSum;
}
for (int i = 0; i < numBins; ++i)
{
float midPoint = (float)i*binSize + binOffset;
float f = histVec[i];
fmean += f * midPoint;
}
for (int i = 0; i < numBins; ++i)
{
float midPoint = (float)i*binSize + binOffset;
float f = histVec[i];
float diff = midPoint - fmean;
var += f * hwk::sqr(diff);
}
numBins in the for-loops is typically 10 but this bit of code is called very often (frequency of 80 frames per seconds, called at least 8 times per frame)
I tried to use some SSE methods but it is only slightly speeding up this code. I think I could avoid calculating twice midPoint but I am not sure how. Is there a better way to compute fmean and var?
Here is the SSE code:
// make hist contain a multiple of 4 valid values
for (int i = numBins; i < ((numBins + 3) & ~3); i++)
hist[i] = 0;
// find sum of bins in inHist
__m128i iSum4 = _mm_set1_epi32(0);
for (int i = 0; i < numBins; i += 4)
{
__m128i a = *((__m128i *) &inHist[i]);
iSum4 = _mm_add_epi32(iSum4, a);
}
int iSum = iSum4.m128i_i32[0] + iSum4.m128i_i32[1] + iSum4.m128i_i32[2] + iSum4.m128i_i32[3];
//float stdevB, meanB;
if (iSum == 0.0f)
{
stdev = 0.0;
mean = 0.0;
}
else
{
// Set histVec to normalised values in inHist
__m128 invSum = _mm_set1_ps(1.0f / float(iSum));
for (int i = 0; i < numBins; i += 4)
{
__m128i a = *((__m128i *) &inHist[i]);
__m128 b = _mm_cvtepi32_ps(a);
__m128 c = _mm_mul_ps(b, invSum);
_mm_store_ps(&histVec[i], c);
}
float binSize = 256.0f / (float)numBins;
float halfBinSize = binSize * 0.5f;
float binOffset = halfBinSize;
__m128 binSizeMask = _mm_set1_ps(binSize);
__m128 binOffsetMask = _mm_set1_ps(binOffset);
__m128 fmean4 = _mm_set1_ps(0.0f);
for (int i = 0; i < numBins; i += 4)
{
__m128i idx4 = _mm_set_epi32(i + 3, i + 2, i + 1, i);
__m128 idx_m128 = _mm_cvtepi32_ps(idx4);
__m128 histVec4 = _mm_load_ps(&histVec[i]);
__m128 midPoint4 = _mm_add_ps(_mm_mul_ps(idx_m128, binSizeMask), binOffsetMask);
fmean4 = _mm_add_ps(fmean4, _mm_mul_ps(histVec4, midPoint4));
}
fmean4 = _mm_hadd_ps(fmean4, fmean4); // 01 23 01 23
fmean4 = _mm_hadd_ps(fmean4, fmean4); // 0123 0123 0123 0123
float fmean = fmean4.m128_f32[0];
//fmean4 = _mm_set1_ps(fmean);
__m128 var4 = _mm_set1_ps(0.0f);
for (int i = 0; i < numBins; i+=4)
{
__m128i idx4 = _mm_set_epi32(i + 3, i + 2, i + 1, i);
__m128 idx_m128 = _mm_cvtepi32_ps(idx4);
__m128 histVec4 = _mm_load_ps(&histVec[i]);
__m128 midPoint4 = _mm_add_ps(_mm_mul_ps(idx_m128, binSizeMask), binOffsetMask);
__m128 diff4 = _mm_sub_ps(midPoint4, fmean4);
var4 = _mm_add_ps(var4, _mm_mul_ps(histVec4, _mm_mul_ps(diff4, diff4)));
}
var4 = _mm_hadd_ps(var4, var4); // 01 23 01 23
var4 = _mm_hadd_ps(var4, var4); // 0123 0123 0123 0123
float var = var4.m128_f32[0];
stdev = sqrt(var);
mean = fmean;
}
I might be doing something wrong since I dont have a lot of improvement as I was expecting.
Is there something in the SSE code that might possibly slow down the process?
(editor's note: the SSE part of this question was originally asked as https://stackoverflow.com/questions/31837817/foor-loop-optimisation-sse-comparison, which was closed as a duplicate.)
I only just realized that your data array starts out as an array of int, since you didn't have declarations in your code. I can see in the SSE version that you start with integers, and only store a float version of it later.
Keeping everything integer will let us do the loop-counter-vector with a simple ivec = _mm_add_epi32(ivec, _mm_set1_epi32(4)); Aki Suihkonen's answer has some transformations that should let it optimize a lot better. Especially, the auto-vectorizer should be able to do more even without -ffast-math. In fact, it does quite well. You could do better with intrinsics, esp. saving some vector 32bit multiplies and shortening the dependency chain.
My old answer, based on just trying to optimize your code as written, assuming FP input:
You may be able to combine all 3 loops into one, using the algorithm #Jason linked to. It might not be profitable, though, since it involves a division. For small numbers of bins, probably just loop multiple times.
Start by reading the guides at http://agner.org/optimize/. A couple of the techniques in his Optimising Assembly guide will speed up your SSE attempt (which I edited into this question for you).
combine your loops where possible, so you do more with the data for each time it's loaded / stored.
multiple accumulators to hide the latency of loop-carried dependency chains. (Even FP add takes 3 cycles on recent Intel CPUs.) This won't apply for really short arrays like your case.
instead of int->float conversion on every iteration, use a float loop counter as well as the int loop counter. (add a vector of _mm_set1_ps(4.0f) every iteration.) _mm_set... with variable args is something to avoid in loops, when possible. It takes several instructions (esp. when each arg to setr has to be calculated separately.)
gcc -O3 manages to auto-vectorize the first loop, but not the others. With -O3 -ffast-math, it auto-vectorizes more. -ffast-math allows it to do FP operations in a different order than the code specifies. e.g. adding up the array in 4 elements of a vector, and only combining the 4 accumulators at the end.
Telling gcc that the input pointer is aligned by 16 lets gcc auto-vectorize with a lot less overhead (no scalar loops for unaligned portions).
// return mean
float fpstats(float histVec[], float sum, float binSize, float binOffset, long numBins, float *variance_p)
{
numBins += 3;
numBins &= ~3; // round up to multiple of 4. This is just a quick hack to make the code fast and simple.
histVec = (float*)__builtin_assume_aligned(histVec, 16);
float invSum = 1.0f / float(sum);
float var = 0, fmean = 0;
for (int i = 0; i < numBins; ++i)
{
histVec[i] *= invSum;
float midPoint = (float)i*binSize + binOffset;
float f = histVec[i];
fmean += f * midPoint;
}
for (int i = 0; i < numBins; ++i)
{
float midPoint = (float)i*binSize + binOffset;
float f = histVec[i];
float diff = midPoint - fmean;
// var += f * hwk::sqr(diff);
var += f * (diff * diff);
}
*variance_p = var;
return fmean;
}
gcc generates some weird code for the 2nd loop.
# broadcasting fmean after the 1st loop
subss %xmm0, %xmm2 # fmean, D.2466
shufps $0, %xmm2, %xmm2 # vect_cst_.16
.L5: ## top of 2nd loop
movdqa %xmm3, %xmm5 # vect_vec_iv_.8, vect_vec_iv_.8
cvtdq2ps %xmm3, %xmm3 # vect_vec_iv_.8, vect__32.9
movq %rcx, %rsi # D.2465, D.2467
addq $1, %rcx #, D.2465
mulps %xmm1, %xmm3 # vect_cst_.11, vect__33.10
salq $4, %rsi #, D.2467
paddd %xmm7, %xmm5 # vect_cst_.7, vect_vec_iv_.8
addps %xmm2, %xmm3 # vect_cst_.16, vect_diff_39.15
mulps %xmm3, %xmm3 # vect_diff_39.15, vect_powmult_53.17
mulps (%rdi,%rsi), %xmm3 # MEM[base: histVec_10, index: _107, offset: 0B], vect__41.18
addps %xmm3, %xmm4 # vect__41.18, vect_var_42.19
cmpq %rcx, %rax # D.2465, bnd.26
ja .L8 #, ### <--- This is insane.
haddps %xmm4, %xmm4 # vect_var_42.19, tmp160
haddps %xmm4, %xmm4 # tmp160, vect_var_42.21
.L2:
movss %xmm4, (%rdx) # var, *variance_p_44(D)
ret
.p2align 4,,10
.p2align 3
.L8:
movdqa %xmm5, %xmm3 # vect_vec_iv_.8, vect_vec_iv_.8
jmp .L5 #
So instead of just jumping back to the top every iteration, gcc decides to jump ahead to copy a register, and then unconditionally jmp back to the top of the loop. The uop loop buffer may remove the front-end overhead of this sillyness, but gcc should have structured the loop so it didn't copy xmm5->xmm3 and then xmm3->xmm5 every iteration, because that's silly. It should have the conditional jump just go to the top of the loop.
Also note the technique gcc used to get a float version of the loop counter: start with an integer vector of 1 2 3 4, and add set1_epi32(4). Use that as an input for packed int->float cvtdq2ps. On Intel HW, that instruction runs on the FP-add port, and has 3 cycle latency, same as packed FP add. gcc prob. would have done better to just add a vector of set1_ps(4.0), even though this creates a 3-cycle loop-carried dependency chain, instead of 1 cycle vector int add, with a 3 cycle convert forking off on every iteration.
small iteration count
You say this will often be used on exactly 10 bins? A specialized version for just 10 bins could give a big speedup, by avoiding all the loop overhead and keeping everything in registers.
With that small a problem size, you can have the FP weights just sitting there in memory, instead of re-computing them with integer->float conversion every time.
Also, 10 bins is going to mean a lot of horizontal operations relative to the amount of vertical operations, since you only have 2 and a half vectors worth of data.
If exactly 10 is really common, specialize a version for that. If under-16 is common, specialize a version for that. (They can and should share the const float weights[] = { 0.0f, 1.0f, 2.0f, ...}; array.)
You probably will want to use intrinsics for the specialized small-problem versions, rather than auto-vectorization.
Having zero-padding after the end of the useful data in your array might still be a good idea in your specialized version(s). However, you can load the last 2 floats and clear the upper 64b of a vector register with a movq instruction. (__m128i _mm_cvtsi64_si128 (__int64 a)). Cast this to __m128 and you're good to go.
As peterchen mentioned, these operations are very trivial for current desktop processors. The function is linear, i.e. O(n). What's the typical size of numBins? If it's rather large (say, over 1000000), parallelization will help. This could be simple using a library like OpenMP. If numBins starts approaching MAXINT, you may consider GPGPU as an option (CUDA/OpenCL).
All that considered, you should try profiling your application. Chances are good that, if there is a performance constraint, it's not in this method. Michael Abrash's definition of "high-performance code" has helped me greatly in determining if/when to optimize:
Before we can create high-performance code, we must understand what high performance is. The objective (not always attained) in creating high-performance software is to make the software able to carry out its appointed tasks so rapidly that it responds instantaneously, as far as the user is concerned. In other words, high-performance code should ideally run so fast that any further improvement in the code would be pointless. Notice that the above definition most emphatically does not say anything about making the software as fast as possible.
Reference:
The Graphics Programming Black Book
The overall function to be calculated is
std = sqrt(SUM_i { hist[i]/sum * (midpoint_i - mean_midpoint)^2 })
Using the identity
Var (aX + b) = Var (X) * a^2
one can reduce the complexity of the overall operation considerably
1) midpoint of a bin doesn't need offset b
2) no need to prescale by bin array elements with bin width
and
3) no need to normalize histogram entries with reciprocal of sum
The optimized calculation goes as follows
float calcVariance(int histBin[], float binWidth)
{
int i;
int sum = 0;
int mid = 0;
int var = 0;
for (i = 0; i < 10; i++)
{
sum += histBin[i];
mid += i*histBin[i];
}
float inv_sum = 1.0f / (float)sum;
float mid_sum = mid * inv_sum;
for (i = 0; i < 10; i++)
{
int diff = i * sum - mid; // because mid is prescaled by sum
var += histBin[i] * diff * diff;
}
return sqrt(float(var) / (float)(sum * sum * sum)) * binWidth;
}
Minor changes are required if it's float histBin[];
Also I second padding histBin size to a multiple of 4 for better vectorization.
EDIT
Another way to calculate this with floats in the inner loop:
float inv_sum = 1.0f / (float)sum;
float mid_sum = mid * inv_sum;
float var = 0.0f;
for (i = 0; i < 10; i++)
{
float diff = (float)i - mid_sum;
var += (float)histBin[i] * diff * diff;
}
return sqrt(var * inv_sum) * binWidth;
Perform the scaling on the global results only and keep integers as long as possible.
Group all computation in a single loop, using Σ(X-m)²/N = ΣX²/N - m².
// Accumulate the histogram
int mean= 0, var= 0;
for (int i = 0; i < numBins; ++i)
{
mean+= i * histVec[i];
var+= i * i * histVec[i];
}
// Compute the reduced mean and variance
float fmean= (float(mean) / sum);
float fvar= float(var) / sum - fmean * fmean;
// Rescale
fmean= fmean * binSize + binOffset;
fvar= fvar * binSize * binSize;
The required integer type will depend on the maximum value in the bins. The SSE optimization of the loop can exploit the _mm_madd_epi16 instruction.
If the number of bins is a small as 10, consider fully unrolling the loop. Precompute the i and i² vectors in a table.
In the lucky case that the data fits in 16 bits and the sums in 32 bits, the accumulation is done with something like
static short I[16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0 };
static short I2[16]= { 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 0, 0, 0, 0, 0 };
// First group
__m128i i= _mm_load_si128((__m128i*)&I[0]);
__m128i i2= _mm_load_si128((__m128i*)&I2[0]);
__m128i h= _mm_load_si128((__m128i*)&inHist[0]);
__m128i mean= _mm_madd_epi16(i, h);
__m128i var= _mm_madd_epi16(i2, h);
// Second group
i= _mm_load_si128((__m128i*)&I[8]);
i2= _mm_load_si128((__m128i*)&I2[8]);
h= _mm_load_si128((__m128i*)&inHist[8]);
mean= _mm_add_epi32(mean, _mm_madd_epi16(i, h));
var= _mm_add_epi32(var, _mm_madd_epi16(i2, h));
CAUTION: unchecked