Compiler optimization for sum of squared numbers [duplicate] - c++

This question already has answers here:
How does clang generate non-looping code for sum of squares?
(2 answers)
Closed last month.
Here is something that I find interesting:
// Naive sum of squares: 1^2 + 2^2 + ... + n^2, accumulated in a loop.
// The range 1..n+1 is empty when n <= 0, so the result is then 0.
// NOTE(review): n+1 overflows for n == i32::MAX (panics in debug builds).
pub fn sum_of_squares(n: i32) -> i32 {
    let mut sum = 0;
    // 1..n+1 iterates i = 1, 2, ..., n (upper bound is exclusive).
    for i in 1..n+1 {
        sum += i*i;
    }
    sum
}
This is the naive implementation of the sum of squared numbers in Rust. This is the assembly code with rustc 1.65.0 with -O3
lea ecx, [rdi + 1]
xor eax, eax
cmp ecx, 2
jl .LBB0_2
lea eax, [rdi - 1]
lea ecx, [rdi - 2]
imul rcx, rax
lea eax, [rdi - 3]
imul rax, rcx
shr rax
imul eax, eax, 1431655766
shr rcx
lea ecx, [rcx + 4*rcx]
add ecx, eax
lea eax, [rcx + 4*rdi]
add eax, -3
.LBB0_2:
ret
I was expecting it to use the formula for the sum of squared numbers, but it does not. It uses a magical number 1431655766 which I don't understand at all.
Then I wanted to see what clang and gcc do in C++ for the same function
test edi, edi
jle .L8
lea eax, [rdi-1]
cmp eax, 17
jbe .L9
mov edx, edi
movdqa xmm3, XMMWORD PTR .LC0[rip]
xor eax, eax
pxor xmm1, xmm1
movdqa xmm4, XMMWORD PTR .LC1[rip]
shr edx, 2
.L4:
movdqa xmm0, xmm3
add eax, 1
paddd xmm3, xmm4
movdqa xmm2, xmm0
pmuludq xmm2, xmm0
psrlq xmm0, 32
pmuludq xmm0, xmm0
pshufd xmm2, xmm2, 8
pshufd xmm0, xmm0, 8
punpckldq xmm2, xmm0
paddd xmm1, xmm2
cmp eax, edx
jne .L4
movdqa xmm0, xmm1
mov eax, edi
psrldq xmm0, 8
and eax, -4
paddd xmm1, xmm0
add eax, 1
movdqa xmm0, xmm1
psrldq xmm0, 4
paddd xmm1, xmm0
movd edx, xmm1
test dil, 3
je .L1
.L7:
mov ecx, eax
imul ecx, eax
add eax, 1
add edx, ecx
cmp edi, eax
jge .L7
.L1:
mov eax, edx
ret
.L8:
xor edx, edx
mov eax, edx
ret
.L9:
mov eax, 1
xor edx, edx
jmp .L7
.LC0:
.long 1
.long 2
.long 3
.long 4
.LC1:
.long 4
.long 4
.long 4
.long 4
This is gcc 12.2 with -O3. GCC also does not use the sum-of-squares formula. I also don't know why it checks whether the number is greater than 17. And for some reason, gcc performs many more operations compared to clang and rustc.
This is clang 15.0.0 with -O3
test edi, edi
jle .LBB0_1
lea eax, [rdi - 1]
lea ecx, [rdi - 2]
imul rcx, rax
lea eax, [rdi - 3]
imul rax, rcx
shr rax
imul eax, eax, 1431655766
shr rcx
lea ecx, [rcx + 4*rcx]
add ecx, eax
lea eax, [rcx + 4*rdi]
add eax, -3
ret
.LBB0_1:
xor eax, eax
ret
I don't really understand what kind of optimization clang is doing there. But rustc, clang, and gcc don't seem to use n(n+1)(2n+1)/6 directly.
Then I timed their performance. Rust is doing significantly better than gcc and clang. These are the average results of 100 executions, using an 11th gen Intel Core i7-11800H @ 2.30 GHz:
Rust: 0.2 microseconds
Clang: 3 microseconds
gcc: 5 microseconds
Can someone explain the performance difference?
Edit
C++:
// Naive O(n) sum of squares: 1^2 + 2^2 + ... + n^2.
// Returns 0 for n <= 0 (the loop body never runs).
// NOTE(review): sum overflows int for n above ~1290; overflow of signed
// int is undefined behaviour in C++.
int sum_of_squares(int n){
    int sum = 0;
    for(int i = 1; i <= n; i++){
        sum += i*i;
    }
    return sum;
}
EDIT 2
For everyone wondering here is my benchmark code:
use std::time::Instant;
// Naive sum of squares: 1^2 + 2^2 + ... + n^2, accumulated in a loop.
// The range 1..n+1 is empty when n <= 0, so the result is then 0.
// NOTE(review): n+1 overflows for n == i32::MAX (panics in debug builds).
pub fn sum_of_squares(n: i32) -> i32 {
    let mut sum = 0;
    // 1..n+1 iterates i = 1, 2, ..., n (upper bound is exclusive).
    for i in 1..n+1 {
        sum += i*i;
    }
    sum
}
fn main() {
    // NOTE(review): this times a single call. With optimizations on, the
    // compiler may constant-fold sum_of_squares(1000) entirely, so the
    // measured interval is largely timer overhead; a benchmark harness
    // with a black-boxed input would be more reliable.
    let start = Instant::now();
    let result = sum_of_squares(1000);
    let elapsed = start.elapsed();
    // Printing the result keeps the computation observable.
    println!("Result: {}", result);
    println!("Elapsed time: {:?}", elapsed);
}
And in C++:
#include <chrono>
#include <iostream>
// Naive O(n) sum of squares: 1^2 + 2^2 + ... + n^2.
// Returns 0 for n <= 0 (the loop body never runs).
// NOTE(review): sum overflows int for n above ~1290; overflow of signed
// int is undefined behaviour in C++.
int sum_of_squares(int n){
    int sum = 0;
    for(int i = 1; i <= n; i++){
        sum += i*i;
    }
    return sum;
}
int main() {
    // NOTE(review): single-shot timing of a call with a compile-time
    // constant argument; at -O3 the call may be folded to a constant,
    // so the interval mostly measures clock overhead.
    auto start = std::chrono::high_resolution_clock::now();
    int result = sum_of_squares(1000);
    auto end = std::chrono::high_resolution_clock::now();
    // Printing the result keeps the computation observable.
    std::cout << "Result: " << result << std::endl;
    std::cout << "Elapsed time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
              << " microseconds" << std::endl;
    return 0;
}

I was expecting it to use the formula for the sum of squared numbers, but it does not. It uses a magical number 1431655766 which I don't understand at all.
LLVM does transform that loop into a formula but its different from naive square sum formula.
This article explains formula and generated code better than I could.

Clang does the same optimization using -O3 in C++, but GCC does not yet. See on GodBolt. AFAIK, the default Rust compiler uses LLVM internally, like Clang. This is why they produce similar code. GCC uses a naive loop vectorized with SIMD instructions, while Clang uses a closed-form formula like the one you gave in the question.
The optimized assembly code from the C++ code is the following:
sum_of_squares(int): # #sum_of_squares(int)
test edi, edi
jle .LBB0_1
lea eax, [rdi - 1]
lea ecx, [rdi - 2]
imul rcx, rax
lea eax, [rdi - 3]
imul rax, rcx
shr rax
imul eax, eax, 1431655766
shr rcx
lea ecx, [rcx + 4*rcx]
add ecx, eax
lea eax, [rcx + 4*rdi]
add eax, -3
ret
.LBB0_1:
xor eax, eax
ret
This optimization mainly comes from the IndVarSimplify optimization pass. One can see that some variables are encoded on 32 bits while some others are encoded on 33 bits (requiring a 64-bit register on mainstream platforms). The code basically does (note the `test edi, edi / jle` pair checks n <= 0, not just n == 0):
if(edi <= 0)
return 0;
eax = rdi - 1;
ecx = rdi - 2;
rcx *= rax;
eax = rdi - 3;
rax *= rcx;
rax >>= 1;
eax *= 1431655766;
rcx >>= 1;
ecx = rcx + 4*rcx;
ecx += eax;
eax = rcx + 4*rdi;
eax -= 3;
return eax;
This can be further simplified to the following equivalent C++ code:
if(n <= 0)
return 0;
int64_t m = n;
int64_t tmp = ((m - 3) * (m - 1) * (m - 2)) / 2;
tmp = int32_t(int32_t(tmp) * int32_t(1431655766));
return 5 * ((m - 1) * (m - 2) / 2) + tmp + (4*m - 3);
Note that some casts and overflows are ignored for sake of clarity.
The magical number 1431655766 is a scaled reciprocal used to perform the division by 3: indeed, 1431655766 / 2**32 ~= 0.33333333348855376, i.e. approximately 1/3. Clang plays with the 32-bit overflows so as to generate a fast implementation of the formula n(n+1)(2n+1)/6.

Division by a constant c is often implemented by multiplying by a scaled reciprocal — e.g. by 2^32 / c when a 64-bit product is available (here 1431655766 ~= 2^32 / 3). That's where your strange constant comes from.
Now the formula n(n+1)(2n+1) / 6 will overflow for large n, while the sum won’t, so this formula can only be used very, very carefully.

Related

A strange bug of g++9.3.0 -O2 on linux

I've met a strange bug of g++9.3.0 -O2 on linux
The code below is converted from my code of the SJT algorithm.
If I keep the last line init in generate, the time cost is 1200+ms.
if I delete it, the time cost is 600+ms.
This bug appears on ubuntu20.04 with g++9.3.0. I've tested it on win10 and macOS with g++9.3.0, the bug doesn't appear. I've also tested it on linux with g++8 and g++10, the bug doesn't appear, either.
Here is the code. The original question is 69468547.
I want to know what causes this strange "time cost double" behavior?
20211008: I reproduce this bug in another way. Here is the whole code. I execute strange_func (the SJT algorithm) twice in generate; the first one's time cost is 653ms and the second one's is 1322ms. You can reproduce the bug with gcc 9.3.0 on Linux. I've also tried gcc10; there is no bug.
#include <cstdio>
#include <cstring>
#include <chrono>
using namespace std::chrono;
#define MAXN 100
struct Permutation {
    int N;           // number of elements being permuted
    char s[2*MAXN];  // work buffer whose neighbouring chars get swapped
    int r[MAXN];     // per-position counters driving the enumeration
    // Zero both work arrays (called before and after a run).
    inline void init() {
        memset(s, 0, sizeof(s));
        memset(r, 0, sizeof(r));
    }
    // Runs strange_func() once and prints its wall-clock time in ms.
    // NOTE(review): the trailing init() call is the one whose mere
    // presence doubles the measured time with g++ 9.3.0 -O2 on Linux.
    void generate(int n) {
        N = n;
        init();
        auto start = steady_clock::now();
        strange_func();
        auto end = steady_clock::now();
        auto duration = duration_cast<milliseconds>(end - start);
        printf("time cost(ms): %ld\n", duration.count());
        init();
    }
    // Hot loop converted from the author's SJT (Steinhaus-Johnson-Trotter)
    // permutation code (per the question): swaps adjacent chars of s[]
    // while the counters in r[] advance like a mixed-radix odometer.
    void strange_func() {
        int k = N, t = -1;
        while (true) {
            r[N] += 1;
            if (r[N] < N) {
                // Swap s[k] and s[k+t], then move the cursor by t.
                char c = s[k]; s[k] = s[k+t]; s[k+t] = c;
                k += t;
            } else {
                // Carry: reset exhausted counters and bump the next one.
                int i = N;
                while (r[i] == i)
                    r[i] = 0, r[--i] += 1;
                if (i == 0) break;  // every counter rolled over: finished
                t = 0;
            }
        }
    }
} perm;
int main() {
    int n;
    // Read the permutation size from stdin and time one run.
    // NOTE(review): scanf's return value is unchecked; n stays
    // uninitialized if the read fails.
    scanf("%d", &n);
    perm.generate(n);
    return 0;
}
The fact that init() is called after the strange_func() function call change the generated assembly code of the variable swap (between s[k] and s[k+t]) in the loop in strange_func()! The apparent minor assembly change has a huge impact on the performance as the loop is very sensitive to micro-optimizations and the generated code with the init() is clearly less efficient. Such a change is likely due to fragile compiler heuristics (with a clear chaotic behaviour in this specific case) and the fact the strange_func() function call is inlined.
To understand what is going on, let us analyse the assembly generated by the two variants.
Here is the assembly code of the hot loop without (left) and with (right) init():
.L2: | .L2:
add ecx, 1 | add esi, 1
mov DWORD PTR 12[rbx+rdx*4], ecx | mov DWORD PTR 12[r12+rdx*4], esi
cmp r8d, ecx | cmp ecx, esi
jle .L3 | jle .L3
|
.L13: | .L13:
movsx r9, eax | movsx r9, eax
add eax, esi | add eax, edi
add ecx, 1 | add esi, 1
movsx rdi, eax | movzx r11d, BYTE PTR 4[r12+r9]
movzx r11d, BYTE PTR 4[rbx+r9] | movsx r8, eax
mov DWORD PTR 12[rbx+rdx*4], ecx | mov DWORD PTR 12[r12+rdx*4], esi
movzx r14d, BYTE PTR 4[rbx+rdi] | mov BYTE PTR 15[rsp], r11b
| movzx r11d, BYTE PTR 4[r12+r8]
mov BYTE PTR 4[rbx+r9], r14b | mov BYTE PTR 4[r12+r9], r11b
| movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[rbx+rdi], r11b | mov BYTE PTR 4[r12+r8], r9b
cmp r8d, ecx | cmp ecx, esi
jg .L13 | jg .L13
|
.L3: | .L3:
jne .L9 | jne .L9
mov rsi, r10 | mov rdi, r10
mov ecx, r8d | mov esi, ecx
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L6: | .L6:
mov edi, DWORD PTR 200[rsi] | mov r11d, DWORD PTR 200[rdi]
sub ecx, 1 | sub esi, 1
sub rsi, 4 | sub rdi, 4
mov DWORD PTR 208[rsi], 0 | mov DWORD PTR 208[rdi], 0
add edi, 1 | lea r8d, 1[r11]
mov DWORD PTR 204[rsi], edi | mov DWORD PTR 204[rdi], r8d
cmp ecx, edi | cmp esi, r8d
je .L6 | je .L6
test ecx, ecx | test esi, esi
je .L14 | je .L14
|
.L7: | .L7:
mov ecx, DWORD PTR 12[rbx+rdx*4] | mov esi, DWORD PTR 12[r12+rdx*4]
xor esi, esi | xor edi, edi
jmp .L2 | jmp .L2
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L9: | .L9:
mov ecx, r8d | mov esi, ecx
test ecx, ecx | test esi, esi
jne .L7 | jne .L7
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
As we can see, the L13 block contains more instructions with the init() call. The rest of the blocks look similar.
Here is a detailed analysis of the blocks without init():
movsx r9, eax
add eax, esi
add ecx, 1
movsx rdi, eax
movzx r11d, BYTE PTR 4[rbx+r9] ; Perform r11b=s[k]
mov DWORD PTR 12[rbx+rdx*4], ecx ; Perform r[N]+=1 (r[N] was stored in ecx previously)
movzx r14d, BYTE PTR 4[rbx+rdi] ; Perform r14b=s[k+t]
mov BYTE PTR 4[rbx+r9], r14b ; Perform s[k]=r14b
mov BYTE PTR 4[rbx+rdi], r11b ; Perform s[k+t]=r11b
cmp r8d, ecx
jg .L13
Here is a detailed analysis of the blocks with init():
movsx r9, eax
add eax, edi
add esi, 1
movzx r11d, BYTE PTR 4[r12+r9]
movsx r8, eax
mov DWORD PTR 12[r12+rdx*4], esi ; Perform r[N]+=1 (r[N] was stored in ecx previously)
mov BYTE PTR 15[rsp], r11b ; Perform c = s[k] (c is stored in memory)
movzx r11d, BYTE PTR 4[r12+r8]
mov BYTE PTR 4[r12+r9], r11b ; Perform s[k]=s[k+t]
movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[r12+r8], r9b ; Perform s[k+t]=c
cmp ecx, esi
jg .L13
We can see that in the first case, GCC is able to swap s[k] and s[k+t] efficiently, while in the second case GCC stores one value in a temporary location on the stack, which is clearly less efficient. An in-memory swap is less efficient because of the data dependency and L1 cache latency (generally about 3-4 cycles on modern x86 AMD/Intel processors).
Whether this is a bug or just a missing optimization of GCC 9.3.0 is still unclear. However, this is very hard to tell without delving into an old version of the GCC code not actively maintained anymore (since March 12, 2020).
A quick workaround solution to this issue is to tell GCC not to inline the function using __attribute__((noinline)). Alternatively, it should be possible to tune GCC heuristic parameters (using the GCC command line) so that this does not happen. Another solution would be to optimize the loop to compute several permutations at once so that such micro-optimizations do not matter so much.

Understanding what clang is doing in assembly, decrementing for a loop that is incrementing

Consider the following code, in C++:
#include <cstdlib>
// Returns n by counting: increments i from 0 until it reaches n.
std::size_t count(std::size_t n)
{
    std::size_t i = 0;
    while (i < n) {
        // Empty asm statement with a "memory" clobber, intended to keep
        // the compiler from deleting the loop. Note it does not forbid
        // rewriting the loop: clang still turns it into a countdown.
        asm volatile("": : :"memory");
        ++i;
    }
    return i;
}
int main(int argc, char* argv[])
{
    // The loop bound comes from argv[1] when provided, else 1; the
    // counted value becomes the process exit status.
    return count(argc > 1 ? std::atoll(argv[1]) : 1);
}
It is just a loop that is incrementing its value, and returns it at the end. The asm volatile prevents the loop from being optimized away. We compile it under g++ 8.1 and clang++ 5.0 with the arguments -Wall -Wextra -std=c++11 -g -O3.
Now, if we look at what compiler explorer is producing, we have, for g++:
count(unsigned long):
mov rax, rdi
test rdi, rdi
je .L2
xor edx, edx
.L3:
add rdx, 1
cmp rax, rdx
jne .L3
.L2:
ret
main:
mov eax, 1
xor edx, edx
cmp edi, 1
jg .L25
.L21:
add rdx, 1
cmp rdx, rax
jb .L21
mov eax, edx
ret
.L25:
push rcx
mov rdi, QWORD PTR [rsi+8]
mov edx, 10
xor esi, esi
call strtoll
mov rdx, rax
test rax, rax
je .L11
xor edx, edx
.L12:
add rdx, 1
cmp rdx, rax
jb .L12
.L11:
mov eax, edx
pop rdx
ret
and for clang++:
count(unsigned long): # #count(unsigned long)
test rdi, rdi
je .LBB0_1
mov rax, rdi
.LBB0_3: # =>This Inner Loop Header: Depth=1
dec rax
jne .LBB0_3
mov rax, rdi
ret
.LBB0_1:
xor edi, edi
mov rax, rdi
ret
main: # #main
push rbx
cmp edi, 2
jl .LBB1_1
mov rdi, qword ptr [rsi + 8]
xor ebx, ebx
xor esi, esi
mov edx, 10
call strtoll
test rax, rax
jne .LBB1_3
mov eax, ebx
pop rbx
ret
.LBB1_1:
mov eax, 1
.LBB1_3:
mov rcx, rax
.LBB1_4: # =>This Inner Loop Header: Depth=1
dec rcx
jne .LBB1_4
mov rbx, rax
mov eax, ebx
pop rbx
ret
Understanding the code generated by g++, is not that complicated, the loop being:
.L3:
add rdx, 1
cmp rax, rdx
jne .L3
every iteration increments rdx, and compares it to rax that stores the size of the loop.
Now, I have no idea of what clang++ is doing. Apparently it uses dec, which is weird to me, and I don't even understand where the actual loop is. My question is the following: what is clang doing?
(I am looking for comments about the clang assembly code to describe what is done at each step and how it actually works).
The effect of the function is to return n, either by counting up to n and returning the result, or by simply returning the passed-in value of n. The clang code does the latter. The counting loop is here:
mov rax, rdi
.LBB0_3: # =>This Inner Loop Header: Depth=1
dec rax
jne .LBB0_3
mov rax, rdi
ret
It begins by copying the value of n into rax. It decrements the value in rax, and if the result is not 0, it jumps back to .LBB0_3. If the value is 0 it falls through to the next instruction, which copies the original value of n into rax and returns.
There is no i stored, but the code does the loop the prescribed number of times, and returns the value that i would have had, namely, n.

Is ICC unable to optimize away simple and idiomatic constructs?

// True iff value is even, expressed with the remainder operator.
// (Correct for negatives too: -x % 2 == 0 exactly when x is even.)
bool f1(int value) { return value % 2 == 0; }
// True iff value is even, expressed by testing the low bit.
bool f2(int value) { return ! (value & 1); }
Those two functions do the same thing, and I expect from a decent compiler to produce the same assembly for both of them.
GCC 7 with -O2:
f1(int):
mov eax, edi
not eax
and eax, 1
ret
f2(int):
mov eax, edi
not eax
and eax, 1
ret
clang 5 with -O2:
f1(int): # #f1(int)
test dil, 1
sete al
ret
f2(int): # #f2(int)
test dil, 1
sete al
ret
icc 18 with -O2:
f1(int):
and edi, -2147483647 #1.37
jge ..B1.4 # Prob 50% #1.37
sub edi, 1 #1.37
or edi, -2 #1.37
inc edi #1.37
..B1.4: # Preds ..B1.1 ..B1.5
xor eax, eax #1.42
cmp edi, 1 #1.42
setb al #1.42
ret #1.42
f2(int):
not edi #3.40
and edi, 1 #3.40
mov eax, edi #3.40
ret #3.40
(-O3 doesn't help).
So, is icc not a decent compiler? is it right to discriminate the two implementations for some specific corner case I didn't anticipate? Do I need to get my expectations lower about what a decent compiler can do?

Using lambda in default initializer vs using member function

Is there any difference in using "one-time" lambda in default initializer and using plain old member function?
struct A
{
    int i;  // initialized first by the constructor; read by j's initializer
    // Default member initializer computed by an immediately-invoked
    // lambda: something non-trivial that requires multiple statements
    // and depends on earlier data members (here: the sum 0..i-1).
    int j = [&]
    {
        int f = 0;
        for (int k = 0; k < i; ++k) {
            f += k;
        }
        return f;
    }();
    // Members initialize in declaration order, so i is set before the
    // lambda above runs.
    A(int k) : i(k) { ; }
};
Versus:
struct A
{
    int i;  // must precede j: J() reads i
    // Named helper computing the sum 0..i-1; unlike the lambda variant,
    // this introduces the extra symbol J into A's scope.
    int J() const
    {
        int f = 0;
        for (int k = 0; k < i; ++k) {
            f += k;
        }
        return f;
    }
    int j = J();  // default member initializer delegates to the helper
    A(int k) : i(k) { ; }
};
The only I see is the downside of the second approach: here extra symbol J introduced into namespace of class A. Are there another distinctions?
Regarding performance, there is no difference in the gcc 7.1 -O3 compiled code. Both implementations yield the same assembly.
The test code:
// Constructs A(init) and returns the computed member j
// (i.e. 0 + 1 + ... + init-1).
int callerFunc(int init)
{
    A st(init);
    return st.j;
}
gets compiled to:
callerFunc(int):
test edi, edi
jle .L7
lea eax, [rdi-1]
cmp eax, 7
jbe .L8
pxor xmm0, xmm0
mov edx, edi
xor eax, eax
movdqa xmm1, XMMWORD PTR .LC0[rip]
shr edx, 2
movdqa xmm2, XMMWORD PTR .LC1[rip]
.L5:
add eax, 1
paddd xmm0, xmm1
paddd xmm1, xmm2
cmp eax, edx
jb .L5
movdqa xmm1, xmm0
mov edx, edi
and edx, -4
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
cmp edi, edx
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
je .L10
.L3:
lea ecx, [rdx+1]
add eax, edx
cmp edi, ecx
jle .L1
add eax, ecx
lea ecx, [rdx+2]
cmp edi, ecx
jle .L1
add eax, ecx
lea ecx, [rdx+3]
cmp edi, ecx
jle .L1
add eax, ecx
lea ecx, [rdx+4]
cmp edi, ecx
jle .L1
add eax, ecx
lea ecx, [rdx+5]
cmp edi, ecx
jle .L1
add eax, ecx
lea ecx, [rdx+6]
cmp edi, ecx
jle .L1
add eax, ecx
add edx, 7
lea ecx, [rax+rdx]
cmp edi, edx
cmovg eax, ecx
ret
.L7:
xor eax, eax
.L1:
rep ret
.L10:
rep ret
.L8:
xor eax, eax
xor edx, edx
jmp .L3
.LC0:
.long 0
.long 1
.long 2
.long 3
.LC1:
.long 4
.long 4
.long 4
.long 4
A couple of differences come to mind:
The lambda cannot be overloaded. Hence, any inherited class will use the same function.
The lambda allows you to capture local variables by default, which may cause bugs if things are renamed/re-ordered. If you pass them explicitly, this can be mitigated. While re-ordering is dangerous for methods and lambdas alike, default lambda capture is more volatile because the variable you re-order will be passed into the lambda implicitly.

How to toggle a bit n times without using loop in C/C++?

I would like to toggle a bit n times without loop.
like 1(bit) after toggling 3 times will be 0 and so on.
For toggling a bit 1 time I use bit^=1 .
I actually searching a bit manipulation formula to do so.
If I understand correctly, you want to toggle a bit N times.
Now, Toggling a bin N times equals toggling N%2 times so:
b ^= (N%2);
Modulo 2 is the same as N&1 so you can also write:
b ^= (N&1);
generalised solution:
// Toggles bit `bit` of `word` `ntimes` times without a loop: an even
// number of toggles is a no-op, so only the parity ntimes & 1 matters.
int toggle_bit_in_word(int word, int bit, int ntimes)
{
    auto archetype = ntimes & 1;      // 1 iff a net toggle remains
    auto toggler = archetype << bit;  // mask with that parity at `bit`
    return word ^= toggler;           // xor applies the net toggle
}
gcc 5.3 produces this code:
toggle_bit_in_word(int, int, int):
and edx, 1
shlx edx, edx, esi
mov eax, edx
xor eax, edi
ret
for fun, let's write it the naiive way and enjoy the hilarity:
// Same contract as toggle_bit_in_word, written as a literal loop that
// xors the mask ntimes times (net effect is still just the parity).
int toggle_bit_in_word_naiive(int word, int bit, int ntimes)
{
    auto toggler = 1 << bit;
    while (ntimes--)
        word ^= toggler;
    return word;
}
output (5.3):
toggle_bit_in_word_naiive(int, int, int):
mov ecx, 1
mov eax, edi
shlx esi, ecx, esi
lea edi, [rdx-1]
test edx, edx
je .L48
lea ecx, [rdx-8]
shr ecx, 3
add ecx, 1
lea r9d, [0+rcx*8]
cmp edi, 12
jbe .L4
vmovd xmm1, esi
xor r8d, r8d
vpxor xmm0, xmm0, xmm0
vpbroadcastd ymm1, xmm1
.L5:
add r8d, 1
vpxor ymm0, ymm0, ymm1
cmp ecx, r8d
ja .L5
vpxor xmm1, xmm1, xmm1
vperm2i128 ymm2, ymm0, ymm1, 33
vpxor ymm0, ymm0, ymm2
sub edi, r9d
vperm2i128 ymm2, ymm0, ymm1, 33
vpalignr ymm2, ymm2, ymm0, 8
vpxor ymm0, ymm0, ymm2
vperm2i128 ymm1, ymm0, ymm1, 33
vpalignr ymm1, ymm1, ymm0, 4
vpxor ymm0, ymm0, ymm1
vmovd ecx, xmm0
xor eax, ecx
cmp edx, r9d
je .L47
vzeroupper
.L4:
xor eax, esi
test edi, edi
je .L48
xor eax, esi
cmp edi, 1
je .L48
xor eax, esi
cmp edi, 2
je .L48
xor eax, esi
cmp edi, 3
je .L48
xor eax, esi
cmp edi, 4
je .L48
xor eax, esi
cmp edi, 5
je .L48
xor eax, esi
cmp edi, 6
je .L48
xor eax, esi
cmp edi, 7
je .L48
xor eax, esi
cmp edi, 8
je .L48
xor eax, esi
cmp edi, 9
je .L48
xor eax, esi
cmp edi, 10
je .L48
xor eax, esi
xor esi, eax
cmp edi, 11
cmovne eax, esi
ret
.L47:
vzeroupper
.L48:
ret
8-/
of course, when an optimiser has all the information it needs, even naiive code becomes efficient:
int main(int argc, char**argv)
{
    // With bit and ntimes known at compile time, the optimiser reduces
    // the naive loop to a single xor of argc with 8 (see asm below).
    return toggle_bit_in_word_naiive(argc, 3, 3);
}
result:
main:
mov eax, edi
xor eax, 8
ret
This is impossible if you only ever manipulate bits independently.
That's because a particular bit does not know its previous two states.
You need a minimum of 2 bits if you want a cycle with a periodicity of 4.