Why aren't clang++ and g++ de-duplicating these instructions?

Consider the following function:
std::string get_value(const bool b)
{
    if (b) {
        return "Hello";
    }
    else {
        return "World";
    }
}
g++ 11.0.1 20210312 compiles this (as C++17 and with maximum optimization) into:
get_value[abi:cxx11](bool):
lea rdx, [rdi+16]
mov rax, rdi
mov QWORD PTR [rdi], rdx
test sil, sil
je .L2
mov DWORD PTR [rdi+16], 1819043144
mov BYTE PTR [rdx+4], 111
mov QWORD PTR [rax+8], 5
mov BYTE PTR [rax+21], 0
ret
.L2:
mov DWORD PTR [rdi+16], 1819438935
mov BYTE PTR [rdx+4], 100
mov QWORD PTR [rax+8], 5
mov BYTE PTR [rax+21], 0
ret
Why does it not move the two replicated mov instructions up before the jump, or even before the test, reducing the code size by two instructions?
The same thing happens with clang++ and libc++, except it only has one relevant instruction to move up.
(See this also on GodBolt)
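For reference, the duplicated stores are the ones writing the length (5) and the terminating NUL, which are identical on both paths; only the five payload bytes differ. A source-level sketch of the transformation the question has in mind (an illustration, not what either compiler emits) might look like:
#include <cstring>
#include <string>

std::string get_value_hoisted(const bool b)
{
    std::string s(5, '\0');                        // branch-invariant: size = 5, NUL terminator
    std::memcpy(&s[0], b ? "Hello" : "World", 5);  // only the payload depends on b
    return s;
}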

Related

Why is vzeroupper being inserted at the end of this code?

I noticed something strange when I compile this code on godbolt, with MSVC:
#include <intrin.h>
#include <cstdint>
void test(unsigned char*& pSrc) {
    __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pSrc));
    int32_t mask = _mm256_movemask_epi8(data);
    if (!mask) {
        ++pSrc;
    }
    else {
        unsigned long v;
        _BitScanForward(&v, mask);
        pSrc += v;
    }
}
I get this resulting assembly:
pSrc$ = 8
void test(unsigned char * &) PROC ; test, COMDAT
mov rdx, QWORD PTR [rcx]
vmovdqu ymm0, YMMWORD PTR [rdx]
vpmovmskb eax, ymm0
test eax, eax
jne SHORT $LN2#test
mov eax, 1
add rax, rdx
mov QWORD PTR [rcx], rax
vzeroupper ; Why is this being inserted?
ret 0
$LN2#test:
bsf eax, eax
add rax, rdx
mov QWORD PTR [rcx], rax
vzeroupper ; Why is this being inserted?
ret 0
void test(unsigned char * &) ENDP ; test
Why is vzeroupper being inserted at the end of each scope? I heard that it's because of switching between SSE and AVX, but I'm not doing that here. I'm using exclusively AVX code.
I was wondering, does this pose a performance problem?
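For context, the usual explanation is not about mixing SSE and AVX inside test() itself, but about what the unknown caller might do next. A hypothetical caller (not from the question) illustrates the situation the compiler is defending against:
#include <immintrin.h>

// Assume test() is the AVX function from the question.
void test(unsigned char*& pSrc);

// If this translation unit is compiled without /arch:AVX, the SSE intrinsics
// below use legacy (non-VEX) encodings.  Executing them while the upper
// halves of the YMM registers are still "dirty" from test()'s 256-bit loads
// can incur state-transition penalties or false dependencies on Intel CPUs,
// which is why the compiler zeroes the upper state with vzeroupper before
// returning to callers it knows nothing about.
void legacy_caller(unsigned char*& pSrc, double* out) {
    test(pSrc);
    __m128d x = _mm_setzero_pd();   // legacy SSE encoding
    _mm_storeu_pd(out, x);
}
As for the performance question: vzeroupper itself is typically cheap compared with the transition penalties it prevents.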

One writer and multiple readers - 256bit - AVX - atomic [duplicate]

This question already has answers here:
Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?
SSE instructions: which CPUs can do atomic 16B memory operations?
Largest data type which can be fetch-ANDed atomically?
I would like to write 256 bits of data on one core and read it on another. There will be only one writer process, but there can be multiple readers.
I was thinking of implementing it with AVX. The reads and writes should be atomic since each is a single instruction (vmovdqa), and if the data is aligned to a cache line, cache coherency should move it atomically between cores.
I looked at the generated assembly, but I see two writes and two reads. Why isn't there just one? Would this approach to atomic reads/writes work given these assumptions?
#include <immintrin.h>
#include <cstdint>
struct Data {
    int64_t a[4];
};
struct DataHolder {
    void set_data(Data* in) {
        _mm256_store_si256(reinterpret_cast<__m256i *>(&data_), *reinterpret_cast<__m256i *>(in));
    }
    void get_data(Data* out) {
        _mm256_store_si256(reinterpret_cast<__m256i *>(out), *reinterpret_cast<__m256i *>(&data_));
    }
    alignas(64) Data data_;
    char padding[64 - sizeof(Data)];
};
int main() {
    Data a, b;
    DataHolder ab;
    ab.set_data(&a);
    ab.get_data(&b);
}
DataHolder::set_data(Data*):
push rbp
mov rbp, rsp
and rsp, -32
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
mov rax, QWORD PTR [rsp-80]
vmovdqa ymm0, YMMWORD PTR [rax]
mov rax, QWORD PTR [rsp-72]
mov QWORD PTR [rsp-8], rax
vmovdqa YMMWORD PTR [rsp-64], ymm0
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
leave
ret
DataHolder::get_data(Data*):
push rbp
mov rbp, rsp
and rsp, -32
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
mov rax, QWORD PTR [rsp-72]
vmovdqa ymm0, YMMWORD PTR [rax]
mov rax, QWORD PTR [rsp-80]
mov QWORD PTR [rsp-8], rax
vmovdqa YMMWORD PTR [rsp-64], ymm0
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
leave
ret
main:
push rbp
mov rbp, rsp
and rsp, -64
add rsp, -128
lea rdx, [rsp+96]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::set_data(Data*)
lea rdx, [rsp+64]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::get_data(Data*)
mov eax, 0
leave
ret
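As a side note on the listing (not an answer to the atomicity question itself, which the linked duplicates cover): the doubled loads and stores are an artifact of compiling at -O0, where every intrinsic argument goes through a stack temporary. A variant written with explicit load/store intrinsics, compiled with optimization enabled, typically reduces each member function to a single vmovdqa load plus a single vmovdqa store; whether that single 256-bit access is actually guaranteed atomic by the ISA is a separate question.
#include <immintrin.h>
#include <cstdint>

struct Data {
    int64_t a[4];
};

// Hypothetical variant of the question's DataHolder, using explicit load/store
// intrinsics instead of dereferencing a casted pointer.  Unaligned variants
// are used for the caller's Data objects, which are not 32-byte aligned.
struct DataHolder2 {
    void set_data(const Data* in) {
        __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
        _mm256_store_si256(reinterpret_cast<__m256i*>(&data_), v);   // data_ is 64-byte aligned
    }
    void get_data(Data* out) const {
        __m256i v = _mm256_load_si256(reinterpret_cast<const __m256i*>(&data_));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), v);
    }
    alignas(64) Data data_;
};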

Passing an r-value reference to a constructor to reduce copies

I have the following code:
#include <stdio.h>
#include <utility>
class A
{
public: // member functions
    explicit A(int && Val)
    {
        _val = std::move(Val); // \2\
    }
    virtual ~A(){}
private: // member variables
    int _val = 0;
private: // member functions
    A(const A &) = delete;
    A& operator = (const A &) = delete;
    A(A &&) = delete;
    A&& operator = (A &&) = delete;
};
int main()
{
    A a01{3}; // \1\
    return 0;
}
I would like to ask: how many copies are made from \1\ to \2\?
Your code doesn't compile, but after making the changes needed for it to compile, it does nothing and compiles into this x86 assembly, because none of its values are ever used:
main:
xor eax, eax
ret
https://godbolt.org/z/q70EMb
Modifying the code so that it requires the output of the _val member variable (with a print statement) shows that with optimizations it simply moves the value 0x03 into a register and prints it:
.LC0:
.string "%d\n"
main:
sub rsp, 8
mov esi, 3
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
xor eax, eax
add rsp, 8
ret
https://godbolt.org/z/JG73Ll
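The modified source isn't shown above; presumably it looks something like this (the value() accessor is an assumption, added only so main can print _val; the deleted copy/move members are omitted for brevity):
#include <stdio.h>
#include <utility>

class A
{
public:
    explicit A(int && Val) { _val = std::move(Val); }
    int value() const { return _val; }  // hypothetical accessor for the print
private:
    int _val = 0;
};

int main()
{
    A a01{3};
    printf("%d\n", a01.value());  // with optimization this becomes mov esi, 3 / call printf, per the listing above
    return 0;
}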
If you disable optimizations in an attempt to get the compiler to output a more verbose version of the program, you get:
A::A(int&&):
push rbp
mov rbp, rsp
sub rsp, 16
mov QWORD PTR [rbp-8], rdi
mov QWORD PTR [rbp-16], rsi
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], 0
mov rax, QWORD PTR [rbp-16]
mov rdi, rax
call std::remove_reference<int&>::type&& std::move<int&>(int&)
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
nop
leave
ret
.LC0:
.string "%d\n"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 3
lea rdx, [rbp-4]
lea rax, [rbp-8]
mov rsi, rdx
mov rdi, rax
call A::A(int&&)
mov eax, DWORD PTR [rbp-8]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
std::remove_reference<int&>::type&& std::move<int&>(int&):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov rax, QWORD PTR [rbp-8]
pop rbp
ret
https://godbolt.org/z/ZTK40d
The answer to your question depends on how your program is compiled and whether copy elision applies, as well as on whether there is any benefit, for an int, to avoiding a "copy" at all, since an int* and an int likely take up the same amount of memory.
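If you want to actually count copies and moves rather than reason from the assembly, a small instrumented type (not from the question) makes the answer observable; with an int, of course, a "move" is just a 4-byte copy either way:
#include <stdio.h>
#include <utility>

struct Probe {
    Probe() { puts("default ctor"); }
    Probe(const Probe&) { puts("copy ctor"); }
    Probe(Probe&&) noexcept { puts("move ctor"); }
};

class A2 {                                        // hypothetical variant of A holding a Probe
public:
    explicit A2(Probe&& p) : _p(std::move(p)) {}  // move-constructs the member, never copies
private:
    Probe _p;
};

int main()
{
    A2 a{Probe{}};  // prints "default ctor" then "move ctor": zero copies
    return 0;
}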
You are merely assigning a value, not copying. Nevertheless, you can add a static member to your class that is incremented every time this constructor is called:
class A
{
public: // member functions
    inline static int counter = 0; // C++17 inline variable; pre-C++17, declare it here and define it out of class
    explicit A(int && Val)
    {
        _val = std::move(Val); // \2\
        counter++;
    }
    ....

How does the compiler optimize this piece of code

Consider the following loop:
unsigned long x = 0;
for (unsigned long i = 2314543142; i > 0; i--)
    x += i;
std::cout << x << std::endl;
When I compile this normally, it takes roughly 6.5 seconds to execute the loop. But when I compile with -O3 optimization, the loop executes in about 10^-6 seconds. How is this possible? The compiler surely does not know the closed-form expression for x...
You don't really have to know much assembly to see that the compiler determines the value of x at compile time when compiling with optimization on.
I modified your code slightly to be able to use the online tool Compiler Explorer, changing the std::cout << x << std::endl to extern unsigned long foo; and foo = x;. Not really necessary, but it makes the output cleaner.
Compiled with -O2:
test():
movabs rax, 2678554979246887653
mov QWORD PTR foo[rip], rax
ret
Compiled with -O0:
test():
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], 0
mov DWORD PTR [rbp-16], -1980424154
mov DWORD PTR [rbp-12], 0
jmp .L2
.L3:
mov rax, QWORD PTR [rbp-16]
add QWORD PTR [rbp-8], rax
sub QWORD PTR [rbp-16], 1
.L2:
cmp QWORD PTR [rbp-16], 0
setne al
test al, al
jne .L3
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR foo[rip], rax
leave
ret
Also: the first revision of your code, with i >= 0 (always true for an unsigned counter, so the loop never terminates, which makes it undefined behavior), just outputs:
test():
.L2:
jmp .L2
:-)
The compiler determines the value of x after the loop at compile time and uses that constant directly in the output statement.
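The constant in the -O2 output is simply the closed form of the sum; compilers typically derive it from induction-variable / scalar-evolution analysis of the loop rather than by recognizing the formula. A quick check (assuming a 64-bit unsigned long, so the intermediate product does not overflow):
#include <iostream>

int main() {
    unsigned long n = 2314543142UL;
    unsigned long sum = n * (n + 1) / 2;  // Gauss: 1 + 2 + ... + n
    std::cout << sum << std::endl;        // prints 2678554979246887653, matching the movabs constant
}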

Trying to understand ASM code

EDIT
I switched from memcmp to a home-brewed 13-byte compare function, and the homebrew doesn't have the extra instructions. So all I can guess is that the extra assembly is just a flaw in the optimizer.
if (!EQ13(&ti, &m_ti)) { // in 2014, memcmp was not being optimized here
000007FEF91B2CFE mov rdx,qword ptr [rsp]
000007FEF91B2D02 movzx eax,byte ptr [rsp+0Ch]
000007FEF91B2D07 mov ecx,dword ptr [rsp+8]
000007FEF91B2D0B cmp rdx,qword ptr [r10+28h]
000007FEF91B2D0F jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D11 cmp ecx,dword ptr [r10+30h]
000007FEF91B2D15 jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D17 cmp al,byte ptr [r10+34h]
000007FEF91B2D1B je TSccIter::SetTi+0B1h (7FEF91B2D31h)
My homebrew isn't perfect in this case, since it does 3 movs at the start even though it is unlikely to ever need to check past the first one. I need to work on that part.
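EQ13 itself isn't shown here; a plausible sketch of a 13-byte equality helper like the one described (one 8-byte, one 4-byte and one 1-byte comparison, with all three loads issued up front as mentioned above) might be:
#include <cstdint>
#include <cstring>

static inline bool EQ13(const void* a, const void* b) {
    std::uint64_t a8, b8;
    std::uint32_t a4, b4;
    std::memcpy(&a8, a, 8);
    std::memcpy(&b8, b, 8);
    std::memcpy(&a4, static_cast<const char*>(a) + 8, 4);
    std::memcpy(&b4, static_cast<const char*>(b) + 8, 4);
    const unsigned char a1 = static_cast<const unsigned char*>(a)[12];
    const unsigned char b1 = static_cast<const unsigned char*>(b)[12];
    return a8 == b8 && a4 == b4 && a1 == b1;
}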
ORIGINAL QUESTION
Here is asm code from MSVC 2010 showing how it can optimize a small, fixed-size memcmp (in this case, 13 bytes). I've seen this type of optimization a lot in our code, but never with the last 6 lines. Can anyone tell me why the last 6 lines of assembly are there? TransferItem is 13 bytes, so that explains the QWORD, DWORD, then BYTE cmps.
struct TransferItem {
    char m_szCxrMkt1[3];
    char m_szCxrOp1[3];
    char m_chDelimiter;
    char m_szCxrMkt2[3];
    char m_szCxrOp2[3];
};
...
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
Also, what is the point of the xor eax,eax, which we know sets eax to zero, followed by the test of that known-zero value at line 2BB7?
Here is the whole function
// fWildCard means match certain fields to '**' in the db
// szCxrMkt1,2 are required and cannot be null, ' ', or '\0\0'.
// szCxrOp1,2 can be null, ' ', or '\0\0'.
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
    if (m_fSkipSet)
        return *this;
    m_iSid = -1; // resets the iterator to search from the start
    // Pad the struct to 16 bytes so we can clear it with 2 QWORDS
    // We use a temp, ti, to detect if the new transferitem has changed
    class TransferItemPadded : public TransferItem {
        char padding[16 - sizeof(TransferItem)]; // get us to 16 bytes
    } ti;
    U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020; // 8 spaces
    // copy in the params
    CPY2(ti.m_szCxrMkt1, szCxrMkt1);
    if (szCxrOp1 && *szCxrOp1)
        CPY2(ti.m_szCxrOp1, szCxrOp1);
    ti.m_chDelimiter = (fWildCard) ? '*' : ':'; // this controls wild card matching
    CPY2(ti.m_szCxrMkt2, szCxrMkt2);
    if (szCxrOp2 && *szCxrOp2)
        CPY2(ti.m_szCxrOp2, szCxrOp2);
    // see if different
    if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
        memcpy(&m_ti, &ti, sizeof(TransferItem));
        m_fQryChanged = true;
    }
    return *this;
}
typedef unsigned __int64 U8;
#define CPY2(a,b) ((*(WORD*)a) = (*(WORD*)b))
And here's the whole asm
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
2B10 sub rsp,18h
if (m_fSkipSet)
2B14 cmp byte ptr [rcx+0EAh],0
2B1B mov r10,rcx
return *this;
2B1E jne TSccIter::SetTi+0CCh (7FEF9302BDCh)
m_iSid = -1;
class TransferItemPadded : public TransferItem {
char padding[16 - sizeof(TransferItem)];
} ti;
U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020;
2B24 mov rax,2020202020202020h
2B2E mov byte ptr [rcx+36h],0FFh
2B32 mov qword ptr [rsp],rax
2B36 mov qword ptr [rsp+8],rax
CPY2(ti.m_szCxrMkt1, szCxrMkt1);
2B3B movzx eax,word ptr [r8]
2B3F mov word ptr [rsp],ax
if (szCxrOp1 && *szCxrOp1)
2B43 test r9,r9
2B46 je TSccIter::SetTi+47h (7FEF9302B57h)
2B48 cmp byte ptr [r9],0
2B4C je TSccIter::SetTi+47h (7FEF9302B57h)
CPY2(ti.m_szCxrOp1, szCxrOp1);
2B4E movzx eax,word ptr [r9]
2B52 mov word ptr [rsp+3],ax
ti.m_chDelimiter = (fWildCard) ? '*' : ':';
2B57 mov eax,3Ah
2B5C mov ecx,2Ah
2B61 test dl,dl
2B63 cmovne eax,ecx
2B66 mov byte ptr [rsp+6],al
CPY2(ti.m_szCxrMkt2, szCxrMkt2);
2B6A mov rax,qword ptr [szCxrMkt2]
2B6F movzx ecx,word ptr [rax]
if (szCxrOp2 && *szCxrOp2)
2B72 mov rax,qword ptr [szCxrOp2]
2B77 mov word ptr [rsp+7],cx
2B7C test rax,rax
2B7F je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
2B81 cmp byte ptr [rax],0
2B84 je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
CPY2(ti.m_szCxrOp2, szCxrOp2);
2B86 movzx eax,word ptr [rax]
2B89 mov word ptr [rsp+0Ah],ax
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
memcpy(&m_ti, &ti, sizeof(TransferItem));
2BBB mov rax,qword ptr [rsp]
m_fQryChanged = true;
2BBF mov byte ptr [r10+0E9h],1
2BC7 mov qword ptr [r10+28h],rax
2BCB mov eax,dword ptr [rsp+8]
2BCF mov dword ptr [r10+30h],eax
2BD3 movzx eax,byte ptr [rsp+0Ch]
2BD8 mov byte ptr [r10+34h],al
}
return *this;
2BDC mov rax,r10
}
2BB7 can be reached by different code paths: via the taken jumps at 2B99, 2BA2 and 2BAC (which land at 2BB2 and fall through the sbb pair), as well as directly when none of the conditional jumps is taken. The xor eax,eax is only executed on that last path, and it ensures that eax is 0 there - which is apparently not the case on the other paths.
The last 6 lines return the value in eax == 0 for a match, and also set the SF and ZF condition codes.
test eax, eax tests whether eax AND eax == 0. The following je will jump if the result is zero.
And xor eax, eax is an efficient way to encode "eax = 0"; it is more efficient than mov eax, 0.
EDIT: I initially misread the question. It looks like something will happen at "TSccIter::SetTi+0A7h" which should change the value?
Also, the SBB trick to replicate the carry (2BB2-2BB4) is explained here:
http://compgroups.net/comp.lang.asm.x86/trick-with-sbb-instruction/20164
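For what it's worth, here is one reading of what those two sbb instructions compute, expressed in C (an illustration, not authoritative): after a cmp whose operands differ, sbb eax,eax yields 0 or -1 depending on the carry flag, and sbb eax,0FFFFFFFFh then turns that into +1 or -1, i.e. a memcmp-style signed result, without a branch.
#include <cassert>

// values_differ and borrow model the ZF/CF state left by the last cmp.
int memcmp_style_result(bool values_differ, bool borrow) {
    if (!values_differ)
        return 0;                         // fall-through path: xor eax,eax
    int eax = borrow ? -1 : 0;            // sbb eax, eax
    eax = eax - (-1) - (borrow ? 1 : 0);  // sbb eax, 0FFFFFFFFh
    return eax;                           // +1 if above, -1 if below
}

int main() {
    assert(memcmp_style_result(true, false) == 1);
    assert(memcmp_style_result(true, true) == -1);
    assert(memcmp_style_result(false, false) == 0);
}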