Optimal branchless conditional selection of two SSE2 packed doubles - c++

I'm trying to write a branchless bit select function for packed SSE2 doubles:
#include <iostream>
#include <emmintrin.h>
inline __m128d select(bool expression, const __m128d& x, const __m128d& y)
{
const int conditional_mask = expression ? -1 : 0;
const auto mask = _mm_castsi128_pd(_mm_set_epi64x(conditional_mask, conditional_mask));
return _mm_or_pd(_mm_and_pd(mask, x), _mm_andnot_pd(mask, y));
}
int main()
{
auto r1 = _mm_setr_pd(1, 2);
auto r2 = _mm_setr_pd(5, 6);
auto result = select(true, r1, r2);
auto packed = reinterpret_cast<double*>(&result);
std::cout << "result = " << packed[0] << ", " << packed[1] << std::endl;
std::getchar();
return EXIT_SUCCESS;
}
Is there a simpler approach for SSE2 and SSE4 that would be more optimal on x64?

You've specified that SSE4 is allowed. SSE4.1 has blendvpd, so you can blend with a built-in blend (not tested, but it compiles):
inline __m128d select(bool expression, const __m128d& x, const __m128d& y)
{
const int c_mask = expression ? -1 : 0;
const auto mask = _mm_castsi128_pd(_mm_set_epi64x(c_mask, c_mask));
return _mm_blendv_pd(y, x, mask);
}
I would also not take SSE vectors as arguments by reference: copying them is trivial, so not something to be avoided, and taking them by reference encourages the compiler to bounce them through memory (for non-inlined calls).
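For illustration, here is a minimal by-value sketch of the same blend (assuming SSE4.1; the signature is just an example, not a drop-in for the code above):
#include <emmintrin.h>
#include <smmintrin.h> // SSE4.1: _mm_blendv_pd

// Pass the vectors by value so non-inlined calls can keep them in registers.
inline __m128d select(bool expression, __m128d x, __m128d y)
{
    const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(expression ? -1 : 0));
    return _mm_blendv_pd(y, x, mask); // picks x where the mask sign bits are set
}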

Related

Why does boost::numeric::interval::widen not behave the same as manually applying the same logic

I have the following test code
#include <boost/numeric/interval.hpp>
#include <iomanip>
#include <iostream>
#include <limits>
int main() {
using Interval = boost::numeric::interval<double>;
using dbl = std::numeric_limits< double >;
std::cout << std::setprecision(dbl::max_digits10) << std::endl;
for (int j = 0; j < 100; j++) {
double lower = -j;
double upper = j;
double tol = 0.1;// 1e-12;
Interval i{lower, upper};
double lower_ = lower + tol;
double upper_ = upper - tol;
Interval i_ = widen(i, -tol);
if (lower_ != i_.lower() || upper_ != i_.upper()) {
std::cout
<< " error "
<< lower_ << "," << upper_
<< " "
<< i_.lower() << "," << i_.upper()
<< std::endl;
}
}
}
and the output is
error -1.8999999999999999,1.8999999999999999 -1.9000000000000001,1.9000000000000001
error -2.8999999999999999,2.8999999999999999 -2.9000000000000004,2.9000000000000004
error -3.8999999999999999,3.8999999999999999 -3.9000000000000004,3.9000000000000004
error -16.899999999999999,16.899999999999999 -16.900000000000002,16.900000000000002
error -17.899999999999999,17.899999999999999 -17.900000000000002,17.900000000000002
error -18.899999999999999,18.899999999999999 -18.900000000000002,18.900000000000002
error -19.899999999999999,19.899999999999999 -19.900000000000002,19.900000000000002
error -20.899999999999999,20.899999999999999 -20.900000000000002,20.900000000000002
error -21.899999999999999,21.899999999999999 -21.900000000000002,21.900000000000002
error -22.899999999999999,22.899999999999999 -22.900000000000002,22.900000000000002
error -23.899999999999999,23.899999999999999 -23.900000000000002,23.900000000000002
error -24.899999999999999,24.899999999999999 -24.900000000000002,24.900000000000002
error -25.899999999999999,25.899999999999999 -25.900000000000002,25.900000000000002
error -26.899999999999999,26.899999999999999 -26.900000000000002,26.900000000000002
error -27.899999999999999,27.899999999999999 -27.900000000000002,27.900000000000002
error -28.899999999999999,28.899999999999999 -28.900000000000002,28.900000000000002
error -29.899999999999999,29.899999999999999 -29.900000000000002,29.900000000000002
error -30.899999999999999,30.899999999999999 -30.900000000000002,30.900000000000002
error -31.899999999999999,31.899999999999999 -31.900000000000002,31.900000000000002
error -32.899999999999999,32.899999999999999 -32.900000000000006,32.900000000000006
error -33.899999999999999,33.899999999999999 -33.900000000000006,33.900000000000006
error -34.899999999999999,34.899999999999999 -34.900000000000006,34.900000000000006
error -35.899999999999999,35.899999999999999 -35.900000000000006,35.900000000000006
error -36.899999999999999,36.899999999999999 -36.900000000000006,36.900000000000006
error -37.899999999999999,37.899999999999999 -37.900000000000006,37.900000000000006
error -38.899999999999999,38.899999999999999 -38.900000000000006,38.900000000000006
error -39.899999999999999,39.899999999999999 -39.900000000000006,39.900000000000006
error -40.899999999999999,40.899999999999999 -40.900000000000006,40.900000000000006
error -41.899999999999999,41.899999999999999 -41.900000000000006,41.900000000000006
error -42.899999999999999,42.899999999999999 -42.900000000000006,42.900000000000006
error -43.899999999999999,43.899999999999999 -43.900000000000006,43.900000000000006
error -44.899999999999999,44.899999999999999 -44.900000000000006,44.900000000000006
error -45.899999999999999,45.899999999999999 -45.900000000000006,45.900000000000006
error -46.899999999999999,46.899999999999999 -46.900000000000006,46.900000000000006
error -47.899999999999999,47.899999999999999 -47.900000000000006,47.900000000000006
error -48.899999999999999,48.899999999999999 -48.900000000000006,48.900000000000006
error -49.899999999999999,49.899999999999999 -49.900000000000006,49.900000000000006
error -50.899999999999999,50.899999999999999 -50.900000000000006,50.900000000000006
error -51.899999999999999,51.899999999999999 -51.900000000000006,51.900000000000006
error -52.899999999999999,52.899999999999999 -52.900000000000006,52.900000000000006
error -53.899999999999999,53.899999999999999 -53.900000000000006,53.900000000000006
error -54.899999999999999,54.899999999999999 -54.900000000000006,54.900000000000006
error -55.899999999999999,55.899999999999999 -55.900000000000006,55.900000000000006
error -56.899999999999999,56.899999999999999 -56.900000000000006,56.900000000000006
error -57.899999999999999,57.899999999999999 -57.900000000000006,57.900000000000006
error -58.899999999999999,58.899999999999999 -58.900000000000006,58.900000000000006
error -59.899999999999999,59.899999999999999 -59.900000000000006,59.900000000000006
error -60.899999999999999,60.899999999999999 -60.900000000000006,60.900000000000006
error -61.899999999999999,61.899999999999999 -61.900000000000006,61.900000000000006
error -62.899999999999999,62.899999999999999 -62.900000000000006,62.900000000000006
error -63.899999999999999,63.899999999999999 -63.900000000000006,63.900000000000006
Can somebody please explain why widen is not doing exactly the same thing as manually applying the widening to the limits?
See https://godbolt.org/z/fonMrGj74 for a live demo
Root Cause
The problem is inexact floating-point representation. If we simplify the test program and use a decimal representation, there is no issue:
Live On Coliru
#include <boost/multiprecision/cpp_dec_float.hpp>
#include <boost/numeric/interval.hpp>
#include <boost/numeric/interval/io.hpp>
#include <iomanip>
#include <iostream>
#include <limits>
int main() {
using T = boost::multiprecision::cpp_dec_float_50;
using Interval = boost::numeric::interval<T>;
using LIM = std::numeric_limits<T>;
std::cout << "Precision: " << LIM::max_digits10 << "\n"
<< std::setprecision(LIM::max_digits10);
const T tol("0.1"); // 1e-12;
for (int j = 0; j < 100; j++) {
T lower = -j;
T upper = j;
Interval const manual(lower - tol, upper + tol);
Interval library{lower, upper};
library = widen(library, tol);
using namespace boost::numeric::interval_lib::compare::lexicographic;
if (library != manual)
std::cout << " error " << manual << " vs " << library << "\n";
}
std::cout << "Done\n";
}
Prints
Precision: 80
Done
Interval Policies & Rounding Modes
It looks like the naive manual implementation results in boundaries that are closest to the intended value when that value cannot be represented exactly.
By contrast, widen appears to make sure that if the result cannot be represented exactly, the widening is at least the requested amount, never accidentally slightly less due to representation issues. This means that in these situations
the widened lower bound may be lower than the manually calculated one
the widened upper bound may be higher than the manually calculated one
The behaviour can be adjusted, because it comes from the rounding policy, which supplies the sub_down and add_up primitives.
I would suggest not changing it, though:
if you need precise computations with the float or double types, use the default rounded_math;
rounded_math<T> is already the default.
If you insist on having results identical to the manual method (even if they are inferior):
using T = double; // boost::multiprecision::cpp_dec_float_50;
namespace I = boost::numeric::interval_lib;
using Interval = I::change_rounding<boost::numeric::interval<T>,
I::rounded_arith_exact<T>>::type;
Prints (Live On Coliru):
Precision: 17
Done
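For completeness, here is a hedged, untested sketch of plugging that rounding policy back into the original double-based loop; with rounded_arith_exact the library result should match the naive lower + tol / upper - tol arithmetic exactly:
#include <boost/numeric/interval.hpp>
#include <iomanip>
#include <iostream>
#include <limits>

int main() {
    namespace I = boost::numeric::interval_lib;
    // Exact arithmetic rounding policy: widen() now uses plain +/- like the manual code.
    using Interval = I::change_rounding<boost::numeric::interval<double>,
                                        I::rounded_arith_exact<double>>::type;
    std::cout << std::setprecision(std::numeric_limits<double>::max_digits10);
    const double tol = 0.1;
    for (int j = 0; j < 100; j++) {
        Interval i{double(-j), double(j)};
        Interval i_ = widen(i, -tol);
        if (i_.lower() != -j + tol || i_.upper() != j - tol)
            std::cout << " error at j = " << j << "\n"; // expected: never printed
    }
    std::cout << "Done\n";
}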
Further Caveats
There's a documentation warning:
Warning! Guaranteed interval arithmetic for native floating-point format is not supported on every combination of processor, operating system, and compiler.
There is a list of specific compiler quirks/flags you have to keep in mind.
TL;DR
Boost is complicated, and somewhere along the way it sets the rounding mode of the floating-point unit (FPU), which changes the rounding behavior.
The answer by sehe is spot on, but in case you are interested in a more technical answer, read on.
The lines of interest in your example are
using Interval = boost::numeric::interval<double>;
Interval i{lower, upper};
Interval i_ = widen(i, -tol);
Let's go down the rabbit hole and try to understand how boost implements these lines.
The main class is
// boost/numeric/interval/interval.hpp:l.37
template<class T, class Policies>
class interval;
The template argument Policies gets its default value defined at
// boost/numeric/interval/detail/interval_prototype.hpp:l.27
template<class T>
struct default_policies
{
typedef policies<rounded_math<T>, checking_strict<T> > type;
};
// <truncated>
template<class T, class Policies = typename interval_lib::default_policies<T>::type >
class interval;
The general rounded_math template is defined at
// boost/numeric/interval/rounding.hpp:l.93
template<class T>
struct rounded_math: save_state_nothing<rounded_arith_exact<T> >
{};
but we actually need to look at the explicit template specialization
// boost/numeric/interval/hw_rounding.hpp:l.59
template<>
struct rounded_math<double>
: save_state<rounded_arith_opp<double> >
{};
The save_state template is a derived class from its argument, i.e.
// boost/numeric/interval/rounding.hpp:l.75
template<class Rounding>
struct save_state: Rounding
{
typename Rounding::rounding_mode mode;
save_state() {
this->get_rounding_mode(mode);
this->init();
}
~save_state() { this->set_rounding_mode(mode); }
typedef detail::save_state_unprotected<Rounding> unprotected_rounding;
};
Note here that Rounding::init() will be called (as well as Rounding::set_rounding_mode(mode) at destruction)!
Then let's have a look at rounded_arith_opp:
// boost/numeric/interval/rounded_arith.hpp:l.78
template<class T, class Rounding>
struct rounded_arith_opp: Rounding {
void init() { this->upward(); }
// <truncated>
# define BOOST_UP(EXPR) return this->force_rounding(EXPR)
# define BOOST_UP_NEG(EXPR) return -this->force_rounding(EXPR)
T sub_down(const T& x, const T& y) { BOOST_UP_NEG(y - x); }
T add_up (const T& x, const T& y) { BOOST_UP(x + y); }
// <truncated>
};
Its default Rounding argument is defined at
// boost/numeric/interval/rounding.hpp:l.46
template<class T, class Rounding = rounding_control<T> >
struct rounded_arith_opp;
The rounding control is defined at
// boost/numeric/interval/rounding.hpp:l.20
template<class T>
struct rounding_control
{
// <truncated>
static const T& force_rounding(const T& x) { return x; }
};
It also has explicit template specializations which vary by platform, e.g.
// boost/numeric/interval/detail/c99_rounding_control.hpp:l.28
template<>
struct rounding_control<double>:
detail::c99_rounding_control
{
static double force_rounding(double const &r)
{ volatile double r_ = r; return r_; }
};
The c99_rounding_control class implements the change of rounding modes, i.e.
// boost/numeric/interval/detail/c99sub_rounding_control.hpp:l.23
struct c99_rounding_control
{
static void set_rounding_mode(rounding_mode mode) { fesetround(mode); }
static void get_rounding_mode(rounding_mode &mode) { mode = fegetround(); }
static void upward() { set_rounding_mode(FE_UPWARD); }
// <truncated>
};
Now we understand the interval class and can have a look at the widen function
// boost/numeric/interval/utility.hpp
template<class T, class Policies> inline
interval<T, Policies> widen(const interval<T, Policies>& x, const T& v)
{
if (interval_lib::detail::test_input(x))
return interval<T, Policies>::empty();
typename Policies::rounding rnd;
return interval<T, Policies>(rnd.sub_down(x.lower(), v),
rnd.add_up (x.upper(), v), true);
}
It creates the rnd object, which ends up calling c99_rounding_control::upward; together with rounded_arith_opp::{add_up,sub_down} this produces the observed results.
I will supply a simple example to show how the rounding takes place (godbolt link):
#include <boost/numeric/interval.hpp>
#include <iomanip>
#include <iostream>
#include <limits>
using namespace std;
double force_rounding(const double& r) {
volatile double r_ = r;
return r_;
}
int main() {
cout << setprecision(numeric_limits<double>::max_digits10) << endl;
auto tol = 0.1;
const auto lower = -2;
const auto upper = 3;
cout << "default rounding:\t" << lower + tol << ", " << upper - tol << '\n';
{
using Interval = boost::numeric::interval<double>;
const auto i = widen(Interval{lower, upper}, -tol);
cout << "widen:\t\t\t" << i.lower() << ", " << i.upper() << '\n';
}
{
boost::numeric::interval_lib::detail::c99_rounding_control::upward();
const auto lower_ = -force_rounding(-lower - tol);
const auto upper_ = force_rounding(upper - tol);
cout << "set rounding:\t\t" << lower_ << ", " << upper_ << '\n';
}
return 0;
}
Output:
default rounding: -1.8999999999999999, 2.8999999999999999
widen: -1.9000000000000001, 2.9000000000000004
set rounding: -1.9000000000000001, 2.9000000000000004
See also
boost doc about interval's rounding: boost.org
boost version: 1.79.0 (for line numbers)

Type-pun uint64_t as two uint32_t in C++20

This code to read a uint64_t as two uint32_t is UB due to the strict aliasing rule:
uint64_t v;
uint32_t lower = reinterpret_cast<uint32_t*>(&v)[0];
uint32_t upper = reinterpret_cast<uint32_t*>(&v)[1];
Likewise, this code to write the upper and lower parts of a uint64_t is UB for the same reason:
uint64_t v;
uint32_t* lower = reinterpret_cast<uint32_t*>(&v);
uint32_t* upper = reinterpret_cast<uint32_t*>(&v) + 1;
*lower = 1;
*upper = 1;
How can one write this code in a safe and clean way in modern C++20, potentially using std::bit_cast?
Using std::bit_cast:
Try it online!
#include <bit>
#include <array>
#include <cstdint>
#include <iostream>
int main() {
uint64_t x = 0x12345678'87654321ULL;
// Convert one u64 -> two u32
auto v = std::bit_cast<std::array<uint32_t, 2>>(x);
std::cout << std::hex << v[0] << " " << v[1] << std::endl;
// Convert two u32 -> one u64
auto y = std::bit_cast<uint64_t>(v);
std::cout << std::hex << y << std::endl;
}
Output:
87654321 12345678
1234567887654321
std::bit_cast is available only in C++20. Prior to C++20 you can implement std::bit_cast manually through std::memcpy, with the caveat that such an implementation is not constexpr like the C++20 variant:
#include <cstring>     // std::memcpy
#include <type_traits> // std::is_trivially_constructible_v
template <class To, class From>
inline To bit_cast(From const & src) noexcept {
//return std::bit_cast<To>(src);
static_assert(std::is_trivially_constructible_v<To>,
"Destination type should be trivially constructible");
To dst;
std::memcpy(&dst, &src, sizeof(To));
return dst;
}
For this specific case of integers it would be close to optimal to just do bit shift/OR arithmetic to convert one u64 to two u32 and back again. std::bit_cast is more generic, supporting any trivially constructible type, although the std::bit_cast solution should be just as optimal as the bit arithmetic on modern compilers at high optimization levels.
One extra benefit of the bit arithmetic is that it handles endianness correctly: it is endianness-independent, unlike std::bit_cast.
Try it online!
#include <cstdint>
#include <iostream>
int main() {
uint64_t x = 0x12345678'87654321ULL;
// Convert one u64 -> two u32
uint32_t lo = uint32_t(x), hi = uint32_t(x >> 32);
std::cout << std::hex << lo << " " << hi << std::endl;
// Convert two u32 -> one u64
uint64_t y = (uint64_t(hi) << 32) | lo;
std::cout << std::hex << y << std::endl;
}
Output:
87654321 12345678
1234567887654321
Notice! As @Jarod42 points out, the bit-shifting solution is not equivalent to the memcpy/bit_cast solution; their equivalence depends on endianness. On a little-endian CPU memcpy/bit_cast gives the least significant half (lo) as array element v[0] and the most significant half (hi) as v[1], while on a big-endian CPU the least significant half (lo) goes to v[1] and the most significant half goes to v[0]. The bit-shifting solution, by contrast, is endianness-independent, and on all systems gives the most significant half (hi) as uint32_t(num_64 >> 32) and the least significant half (lo) as uint32_t(num_64).
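If you want the bit_cast result to agree with the shift version on any target, a small sketch (assuming C++20 <bit>) is to normalise the array order on big-endian machines:
#include <array>
#include <bit>
#include <cstdint>
#include <utility>

// Sketch: bit_cast, then swap on big-endian targets so that v[0] is always the
// low half, matching uint32_t(x) / uint32_t(x >> 32).
std::array<uint32_t, 2> split_lo_hi(uint64_t x)
{
    auto v = std::bit_cast<std::array<uint32_t, 2>>(x);
    if constexpr (std::endian::native == std::endian::big)
        std::swap(v[0], v[1]);
    return v;
}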
in a safe and clean way
Do not use reinterpret_cast. Do not depend on unclear code that relies on specific compiler settings and fishy, uncertain behavior. Use exact arithmetic operations with a well-defined result. Classes and operator overloads are all there waiting for you. For example, some global functions:
#include <cstdint>
#include <iostream>
struct UpperUint64Ref {
uint64_t &v;
UpperUint64Ref(uint64_t &v) : v(v) {}
UpperUint64Ref operator=(uint32_t a) {
v &= 0x00000000ffffffffull;
v |= (uint64_t)a << 32;
return *this;
}
operator uint64_t() {
return v;
}
};
struct LowerUint64Ref {
uint64_t &v;
LowerUint64Ref(uint64_t &v) : v(v) {}
/* as above */
};
UpperUint64Ref upper(uint64_t& v) { return v; }
LowerUint64Ref lower(uint64_t& v) { return v; }
int main() {
uint64_t v;
upper(v) = 1;
}
Or interface object:
#include <cstdint>
#include <iostream>
struct Uint64Ref {
uint64_t &v;
Uint64Ref(uint64_t &v) : v(v) {}
struct UpperReference {
uint64_t &v;
UpperReference(uint64_t &v) : v(v) {}
UpperReference operator=(uint32_t a) {
v &= 0x00000000ffffffffull;
v |= (uint64_t)a << 32u;
return *this;
}
};
UpperReference upper() {
return v;
}
struct LowerReference {
uint64_t &v;
LowerReference(uint64_t &v) : v(v) {}
};
LowerReference lower() { return v; }
};
int main() {
uint64_t v;
Uint64Ref r{v};
r.upper() = 1;
}
Using std::memcpy
#include <cstdint>
#include <cstring>
void foo(uint64_t& v, uint32_t low_val, uint32_t high_val) {
std::memcpy(reinterpret_cast<unsigned char*>(&v), &low_val,
sizeof(low_val));
std::memcpy(reinterpret_cast<unsigned char*>(&v) + sizeof(low_val),
&high_val, sizeof(high_val));
}
int main() {
uint64_t v = 0;
foo(v, 1, 2);
}
With -O1, the compiler reduces foo to:
mov DWORD PTR [rdi], esi
mov DWORD PTR [rdi+4], edx
ret
Meaning there are no extra copies made; std::memcpy just serves as a hint to the compiler.
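A read counterpart could be a sketch like the following (the helper name is made up); as with the write case, the memcpy calls should be elided, and which half comes out as "first" depends on endianness:
#include <cstdint>
#include <cstring>

// Hypothetical read helper: extracts the two 32-bit halves in memory order,
// so the meaning of first/second is endianness-dependent.
void read_halves(const uint64_t& v, uint32_t& first, uint32_t& second) {
    std::memcpy(&first, reinterpret_cast<const unsigned char*>(&v), sizeof(first));
    std::memcpy(&second, reinterpret_cast<const unsigned char*>(&v) + sizeof(first),
                sizeof(second));
}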
std::bit_cast alone is not enough, since the results will vary with the endianness of the system.
Fortunately <bit> also contains std::endian.
Keeping in mind that optimizers generally compile-time resolve ifs that are always true or false, we can test endianness and act accordingly.
We only know beforehand how to handle big or little-endian. If it is not one of those, bit_cast results are not decodable.
Another factor that can spoil things is padding. Using bit_cast assumes 0 padding between array elements.
So we can check if there is no padding and the endianness is big or little to see if it is castable.
If it is not castable, we do a bunch of shifts as per the old method.
(this can be slow)
If the endianness is big -- return the results of bit_cast.
If the endianness is little -- reverse the order. This is not the same as the C++23 byteswap, as we swap elements, not bytes.
I arbitrarily decided that big-endian has the correct order with the high bits at x[0].
#include <bit>
#include <array>
#include <cstdint>
#include <climits>
#include <concepts>
template <std::integral F, std::integral T>
requires (sizeof(F) >= sizeof(T))
constexpr auto split(F x) {
enum consts {
FBITS=sizeof(F)*CHAR_BIT,
TBITS=sizeof(T)*CHAR_BIT,
ELEM=sizeof(F)/sizeof(T),
BASE=FBITS-TBITS,
MASK=~0ULL >> BASE
};
using split=std::array<T, ELEM>;
const bool is_big=std::endian::native==std::endian::big;
const bool is_little=std::endian::native==std::endian::little;
const bool can_cast=((is_big || is_little)
&& (sizeof(F) == sizeof(split)));
// All the following `if`s should be eliminated at compile time
// since they are always true or always false
if (!can_cast)
{
split ret;
for (int e = 0; e < ELEM; ++e)
{
ret[e]=(x>>(BASE-e*TBITS)) & MASK;
}
return ret;
}
split tmp=std::bit_cast<split>(x);
if (is_big)
{
return tmp;
}
split ret;
for (int e=0; e < ELEM; ++e)
{
ret[e]=tmp[ELEM-(e+1)];
}
return ret;
}
auto tst(uint64_t x, int y)
{
return split<decltype(x), uint32_t>(x)[y];
}
I believe this should be defined behavior.
EDIT: changed uint64 base to template parameter and minor edit tweaks
Don't bother, because arithmetic is faster anyway:
uint64_t v;
uint32_t lower = v;
uint32_t upper = v >> 32;

C++ operator[] access to elements of SIMD (e.g. AVX) variable

I'm looking for a way to overload operator[] (within a broader SIMD class) to facilitate reading and writing individual elements within a SIMD word (e.g. __m512i). A couple constraints:
Compliant with C++11 (or later)
Compatible with additional intrinsics based code
Not OpenCL/SYCL (which I could, but I can't *sigh*)
Mostly portable across g++, icpc, clang++
Preferably applicable to other SIMD beyond Intel (ARM, IBM, etc...)
(edit) Performance isn't really an issue (not generally used in places where performance matters)
(This rules out things like type punning through pointer casting, and GCC vector types.)
Based heavily on Scott Meyers' "More Effective C++" (Item 30) and other code, I've come up with the following MVC code that seems "right", seems to work, but also seems overcomplicated. (The "proxy" approach is meant to deal with the left-hand/right-hand operator[] usage, and the "memcpy" is meant to deal with the type-punning/C++ standard issue.)
I wonder if someone has a better solution (and can explain it so I learn something ;^))
#include <iostream>
#include <cstring>
#include "immintrin.h"
using T = __m256i; // SIMD type
using Te = unsigned int; // SIMD element type
class SIMD {
class SIMDProxy;
public :
const SIMDProxy operator[](int index) const {
std::cout << "SIMD::operator[] const" << std::endl;
return SIMDProxy(const_cast<SIMD&>(*this), index);
}
SIMDProxy operator[](int index){
std::cout << "SIMD::operator[]" << std::endl;
return SIMDProxy(*this, index);
}
Te get(int index) {
std::cout << "SIMD::get" << std::endl;
alignas(T) Te tmp[8];
std::memcpy(tmp, &value, sizeof(T)); // _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), c.value);
return tmp[index];
}
void set(int index, Te x) {
std::cout << "SIMD::set" << std::endl;
alignas(T) Te tmp[8];
std::memcpy(tmp, &value, sizeof(T)); // _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), c.value);
tmp[index] = x;
std::memcpy(&value, tmp, sizeof(T)); // c.value = _mm256_load_si256(reinterpret_cast<__m256i const *>(tmp));
}
void splat(Te x) {
alignas(T) Te tmp[8];
std::memcpy(tmp, &value, sizeof(T));
for (int i=0; i<8; i++) tmp[i] = x;
std::memcpy(&value, tmp, sizeof(T));
}
void print() {
alignas(T) Te tmp[8];
std::memcpy(tmp, &value, sizeof(T));
for (int i=0; i<8; i++) std::cout << tmp[i] << " ";
std::cout << std::endl;
}
protected :
private :
T value;
class SIMDProxy {
public :
SIMDProxy(SIMD & c_, int index_) : c(c_), index(index_) {};
// lvalue access
SIMDProxy& operator=(const SIMDProxy& rhs) {
std::cout << "SIMDProxy::=SIMDProxy" << std::endl;
c.set(rhs.index, rhs.c.get(rhs.index));
return *this;
}
SIMDProxy& operator=(Te x) {
std::cout << "SIMDProxy::=T" << std::endl;
c.set(index,x);
return *this;
}
// rvalue access
operator Te() const {
std::cout << "SIMDProxy::()" << std::endl;
return c.get(index);
}
private:
SIMD& c; // SIMD this proxy refers to
int index; // index of element we want
};
friend class SIMDProxy; // give SIMDProxy access into SIMD
};
/** a little main to exercise things **/
int
main(int argc, char *argv[])
{
SIMD x, y;
Te a = 3;
x.splat(1);
x.print();
y.splat(2);
y.print();
x[0] = a;
x.print();
y[1] = a;
y.print();
x[1] = y[1];
x.print();
}
Your code is very inefficient. Normally these SIMD types are not present anywhere in memory: they are hardware registers, they don’t have addresses, and you can’t pass them to memcpy(). Compilers pretend very hard that they’re normal variables, which is why your code compiles and probably works, but it’s slow; you’re doing round trips from registers to memory and back all the time.
Here’s how I would do that, assuming AVX2 and integer lanes.
#include <immintrin.h>
#include <array>
#include <cassert>
#include <cstddef>
class SimdVector
{
__m256i val;
alignas( 64 ) static const std::array<int, 8 + 7> s_blendMaskSource;
public:
int operator[]( size_t lane ) const
{
assert( lane < 8 );
// Move lane index into lowest lane of vector register
const __m128i shuff = _mm_cvtsi32_si128( (int)lane );
// Permute the vector so the lane we need is moved to the lowest lane
// _mm256_castsi128_si256 says "the upper 128 bits of the result are undefined",
// and we don't care indeed.
const __m256i tmp = _mm256_permutevar8x32_epi32( val, _mm256_castsi128_si256( shuff ) );
// Return the lowest lane of the result
return _mm_cvtsi128_si32( _mm256_castsi256_si128( tmp ) );
}
void setLane( size_t lane, int value )
{
assert( lane < 8 );
// Load the blending mask
const int* const maskLoadPointer = s_blendMaskSource.data() + 7 - lane;
const __m256i mask = _mm256_loadu_si256( ( const __m256i* )maskLoadPointer );
// Broadcast the source value into all lanes.
// The compiler will do equivalent of _mm_cvtsi32_si128 + _mm256_broadcastd_epi32
const __m256i broadcasted = _mm256_set1_epi32( value );
// Use vector blending instruction to set the desired lane
val = _mm256_blendv_epi8( val, broadcasted, mask );
}
template<size_t lane>
int getLane() const
{
static_assert( lane < 8 );
// That thing is not an instruction;
// compilers emit different ones based on the index
return _mm256_extract_epi32( val, (int)lane );
}
template<size_t lane>
void setLane( int value )
{
static_assert( lane < 8 );
val = _mm256_insert_epi32( val, value, (int)lane );
}
};
// Align by 64 bytes to guarantee it's contained within a cache line
alignas( 64 ) const std::array<int, 8 + 7> SimdVector::s_blendMaskSource
{
0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0
};
For ARM it’s different. If the lane index is known at compile time, see the vgetq_lane_s32 and vsetq_lane_s32 intrinsics.
For setting lanes on ARM you can use the same broadcast + blend trick. Broadcast is vdupq_n_s32. An approximate equivalent of the vector blend is vbslq_s32; it handles every bit independently, but for this use case it’s equally suitable because -1 has all 32 bits set.
For extracting with a runtime lane index, either write a switch or store the complete vector into memory; I’m not sure which of these two is more efficient.
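A rough NEON sketch of that broadcast + blend idea for a runtime lane index (assuming a 4-lane int32x4_t; the helper name and the mask table are only illustrative):
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Hypothetical: set lane `lane` of a 4 x int32 NEON vector at runtime.
// A small mask table plays the role of the blend mask from the AVX2 version.
inline int32x4_t setLaneRuntime(int32x4_t v, size_t lane, int32_t value)
{
    alignas(16) static const int32_t masks[4][4] = {
        { -1, 0, 0, 0 }, { 0, -1, 0, 0 }, { 0, 0, -1, 0 }, { 0, 0, 0, -1 }
    };
    const uint32x4_t mask = vreinterpretq_u32_s32(vld1q_s32(masks[lane]));
    const int32x4_t broadcasted = vdupq_n_s32(value); // value in every lane
    // vbslq_s32 takes bits from `broadcasted` where mask bits are set, else from `v`
    return vbslq_s32(mask, broadcasted, v);
}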
Of the original approaches (memcpy, intrinsic load/store) and the additional suggestions (user-defined union punning, user-defined vector type), it seems like the intrinsic approach may have a small advantage. This is based on some quick examples I attempted to code up in Godbolt (https://godbolt.org/z/5zdbKe).
The "best" for writing to an element looks something like this.
__m256i foo2(__m256i x, unsigned int a, int index)
{
alignas(__m256i) unsigned int tmp[8];
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), x);
tmp[index] = a;
__m256i z = _mm256_load_si256(reinterpret_cast<__m256i const *>(tmp));
return z;
}
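A matching read could be a sketch along the same lines (store to an aligned buffer, then index, without the write-back):
#include <immintrin.h>

// Sketch of the read direction, mirroring SIMD::get from the question.
unsigned int foo2_get(__m256i x, int index)
{
    alignas(__m256i) unsigned int tmp[8];
    _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), x);
    return tmp[index];
}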
If you only care about g++/clang++/icc compatibility, you can just use the __attribute__ which these compilers use internally to define their intrinsic types:
typedef int32_t int32x16_t __attribute__((vector_size(16*sizeof(int32_t)))) __attribute__((aligned(16*sizeof(int32_t))));
When it makes sense (and is possible on the given architecture), variables will be stored in vector registers. Also, the compilers provide a readable/writable operator[] for this typedef (which should get optimized if the index is known at compile time).
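A minimal usage sketch (assuming GCC/Clang vector extensions; the 8 x int32 width here is only chosen to mirror the __m256i example above):
#include <cstdint>

typedef int32_t int32x8_t __attribute__((vector_size(8 * sizeof(int32_t))));

// The compiler-provided operator[] reads and writes individual lanes.
int32_t swap_lane(int32x8_t& v, int i, int32_t x)
{
    const int32_t old = v[i];
    v[i] = x;
    return old;
}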

Casting __fp16 to float fails to link on Clang 9

I need to read a file containing floating point numbers stored in binary16 format and convert them to float. Based on https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html#half-precision-floating-point, I read the data into __fp16* fp16_weights_buf and then simply did
for (int i = 0; i < config_.weights_buf_size; i++) {
buf_weights_[i] = static_cast<T>(fp16_weights_buf[i]);
}
This compiles, but linking fails:
: && /usr/bin/clang++-9 -g -fsanitize=address,undefined -fno-omit-frame-pointer -fno-limit-debug-info CMakeFiles/run_model.dir/src/run_model.cc.o -o run_model libfused_transformer.a ../thirdparty/OpenBLAS/libopenblas.a ../thirdparty/icu/icu4c/linux/prebuilt/lib/libicui18n.a ../thirdparty/icu/icu4c/linux/prebuilt/lib/libicuuc.a ../thirdparty/icu/icu4c/linux/prebuilt/lib/libicudata.a -lpthread /usr/lib/llvm-9/lib/libomp.so -lpthread && :
CMakeFiles/run_model.dir/src/run_model.cc.o: In function `Pipeline':
/mnt/e/MyProgramming/fused-transformer-mobile-1/build/../include/pipeline.h:424: undefined reference to `__gnu_h2f_ieee'
Do I need to pass some additional options for this to work?
As a workaround, I added the code for __gnu_h2f_ieee from https://gist.github.com/whchung/25875271922806e58ac21ad7d707e3cd:
#ifdef __x86_64__
#include <limits.h>
#include <stdint.h>
typedef uint16_t src_t;
typedef uint16_t src_rep_t;
#define SRC_REP_C UINT16_C
static const int srcSigBits = 10;
#define src_rep_t_clz __builtin_clz
typedef float dst_t;
typedef uint32_t dst_rep_t;
#define DST_REP_C UINT32_C
static const int dstSigBits = 23;
// End of specialization parameters. Two helper routines for conversion to and
// from the representation of floating-point data as integer values follow.
static __inline src_rep_t srcToRep(src_t x) {
const union { src_t f; src_rep_t i; } rep = {.f = x};
return rep.i;
}
static __inline dst_t dstFromRep(dst_rep_t x) {
const union { dst_t f; dst_rep_t i; } rep = {.i = x};
return rep.f;
}
// End helper routines. Conversion implementation follows.
static __inline dst_t __extendXfYf2__(src_t a) {
// Various constants whose values follow from the type parameters.
// Any reasonable optimizer will fold and propagate all of these.
const int srcBits = sizeof(src_t)*CHAR_BIT;
const int srcExpBits = srcBits - srcSigBits - 1;
const int srcInfExp = (1 << srcExpBits) - 1;
const int srcExpBias = srcInfExp >> 1;
const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits;
const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits;
const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits);
const src_rep_t srcAbsMask = srcSignMask - 1;
const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1);
const src_rep_t srcNaNCode = srcQNaN - 1;
const int dstBits = sizeof(dst_t)*CHAR_BIT;
const int dstExpBits = dstBits - dstSigBits - 1;
const int dstInfExp = (1 << dstExpBits) - 1;
const int dstExpBias = dstInfExp >> 1;
const dst_rep_t dstMinNormal = DST_REP_C(1) << dstSigBits;
// Break a into a sign and representation of the absolute value
const src_rep_t aRep = srcToRep(a);
const src_rep_t aAbs = aRep & srcAbsMask;
const src_rep_t sign = aRep & srcSignMask;
dst_rep_t absResult;
// If sizeof(src_rep_t) < sizeof(int), the subtraction result is promoted
// to (signed) int. To avoid that, explicitly cast to src_rep_t.
if ((src_rep_t)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {
// a is a normal number.
// Extend to the destination type by shifting the significand and
// exponent into the proper position and rebiasing the exponent.
absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits);
absResult += (dst_rep_t)(dstExpBias - srcExpBias) << dstSigBits;
}
else if (aAbs >= srcInfinity) {
// a is NaN or infinity.
// Conjure the result by beginning with infinity, then setting the qNaN
// bit (if needed) and right-aligning the rest of the trailing NaN
// payload field.
absResult = (dst_rep_t)dstInfExp << dstSigBits;
absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - srcSigBits);
absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - srcSigBits);
}
else if (aAbs) {
// a is denormal.
// renormalize the significand and clear the leading bit, then insert
// the correct adjusted exponent in the destination type.
const int scale = src_rep_t_clz(aAbs) - src_rep_t_clz(srcMinNormal);
absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits + scale);
absResult ^= dstMinNormal;
const int resultExponent = dstExpBias - srcExpBias - scale + 1;
absResult |= (dst_rep_t)resultExponent << dstSigBits;
}
else {
// a is zero.
absResult = 0;
}
// Apply the signbit to (dst_t)abs(a).
const dst_rep_t result = absResult | (dst_rep_t)sign << (dstBits - srcBits);
return dstFromRep(result);
}
// Use a forwarding definition and noinline to implement a poor man's alias,
// as there isn't a good cross-platform way of defining one.
__attribute__((noinline)) float __extendhfsf2(uint16_t a) {
return __extendXfYf2__(a);
}
extern "C" float __gnu_h2f_ieee(uint16_t a) {
return __extendhfsf2(a);
}
#endif
in a separate source file (the #ifdef is there because on ARM this function is already defined).
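A hedged usage sketch of the workaround (assuming the translation unit above is linked in; the buffer names are illustrative, modeled on the loop from the question):
#include <cstdint>
#include <vector>

extern "C" float __gnu_h2f_ieee(uint16_t a); // provided by the workaround above

// Widen a buffer of raw binary16 bit patterns to float.
std::vector<float> widen_fp16(const std::vector<uint16_t>& half_bits)
{
    std::vector<float> out;
    out.reserve(half_bits.size());
    for (uint16_t h : half_bits)
        out.push_back(__gnu_h2f_ieee(h));
    return out;
}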

is cvCeil() faster than standard library?

I see that OpenCV implement cvCeil function:
CV_INLINE int cvCeil( double value )
{
#if defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
I'm curious about the first part of this implementation, i.e. the _mm_set_sd-related calls. Will they be faster than MSVCRT / libstdc++ / libc++? And why?
The simple benchmark below tells me that std::ceil works more than 3 times faster on my machine when SSE4 is enabled, but about 2 times slower when SSE4 is not enabled.
#include <cmath>
#include <chrono>
#include <sstream>
#include <iostream>
#include <opencv2/core/fast_math.hpp>
auto currentTime() { return std::chrono::steady_clock::now(); }
template<typename T, typename P>
std::string toString(std::chrono::duration<T,P> dt)
{
std::ostringstream str;
using namespace std::chrono;
str << duration_cast<microseconds>(dt).count()*1e-3 << " ms";
return str.str();
}
int main()
{
volatile double x=34.234;
volatile double y;
constexpr auto MAX_ITER=100'000'000;
const auto t0=currentTime();
for(int i=0;i<MAX_ITER;++i)
y=std::ceil(x);
const auto t1=currentTime();
for(int i=0;i<MAX_ITER;++i)
y=cvCeil(x);
const auto t2=currentTime();
std::cout << "std::ceil: " << toString(t1-t0) << "\n"
"cvCeil : " << toString(t2-t1) << "\n";
}
I tested with the -O3 option on GCC 8.3.0, glibc 2.27, Ubuntu 18.04.1 x86_64, on an Intel Core i7-3930K at 3.2 GHz.
Output when compiled with -msse4:
std::ceil: 39.357 ms
cvCeil : 143.224 ms
Output when compiled without -msse4:
std::ceil: 274.945 ms
cvCeil : 146.218 ms
It's easy to understand: SSE4.1 introduces the ROUNDSD instruction, which is basically what std::ceil compiles down to. Without it the compiler has to do some comparison/conditional-move tricks, and it also has to make sure that these don't overflow. Thus the cvCeil version, sacrificing well-definedness for value > INT_MAX and for value < INT_MIN, gets a speedup for the values for which it is well-defined. For the others it has undefined behavior (or, with intrinsics, simply gives wrong results).
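For reference, a hedged sketch of what the SSE4.1 fast path amounts to: std::ceil on such targets compiles down to a single ROUNDSD, which can also be spelled directly with intrinsics (an illustration, not OpenCV's code):
#include <smmintrin.h> // SSE4.1

// Scalar ceil via the SSE4.1 rounding intrinsic; this maps to one ROUNDSD
// instruction (_MM_FROUND_NO_EXC suppresses the precision exception).
inline double ceil_sse41(double value)
{
    const __m128d v = _mm_set_sd(value);
    return _mm_cvtsd_f64(_mm_round_sd(v, v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
}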