(Setup: Windows 7 64-bit, MSVC, 3rd Generation Core i7, 64-bit compilation, -O2 enabled)
The code below has three functions: one has an if statement which executes different code depending on whether a condition has been met, and in another I replaced that if statement with some boolean logic. However, the timings are identical. I was expecting the branchless version to be faster, since it cannot suffer a branch misprediction:
#include <iostream>
#include <intrin.h>   // __rdtsc, __rdtscp (MSVC)

unsigned long long iterations = 1000000000;

void test1(){
    volatile int c = 0;
    for(int i = 0; i < iterations; i++){
        bool condition = __rdtsc() % 2 == 0;
        if(condition){
            c = 4;
        }
        else{
            c = 5;
        }
    }
}

void test2(){
    volatile int c = 0;
    for(int i = 0; i < iterations; i++){
        bool condition = __rdtsc() % 2 == 0;
        c = (4 * condition) + (5 * !condition);
    }
}

int main(){
    unsigned long long start = 0;
    unsigned long long finish = 0;
    unsigned long long start2 = 0;
    unsigned long long finish2 = 0;
    unsigned int x = 0;
    unsigned int y = 0;

    start = __rdtscp(&x);
    test1();
    finish = __rdtscp(&y);

    start2 = __rdtscp(&x);
    test2();
    finish2 = __rdtscp(&y);

    std::cout << "1: " << finish - start << std::endl;
    std::cout << "2: " << finish2 - start2 << std::endl;
}
UPDATE asm:
int main(){
push rbp
push rsi
push rdi
push r14
sub rsp,20h
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned long long start2 = 0;
unsigned long long finish2 = 0;
unsigned long long start3 = 0;
unsigned long long finish3 = 0;
unsigned int x = 0;
xor r8d,r8d
mov dword ptr [x],r8d
unsigned int y = 0;
mov dword ptr [y],r8d
start = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov dword ptr [r9],ecx
or rax,rdx
test1();
mov dword ptr [rsp+60h],r8d
mov ecx,r8d
start = __rdtscp(&x);
mov r10,rax
nop word ptr [rax+rax]
test1();
rdtsc
shl rdx,20h
or rax,rdx
xor al,0FFh
and al,1
neg al
sbb eax,eax
inc ecx
add eax,5
mov dword ptr [rsp+60h],eax
movsxd rax,ecx
cmp rax,3E8h
test1();
jb main+40h (013FFE1280h)
finish = __rdtscp(&y);
rdtscp
lea r9,[y]
shl rdx,20h
or rax,rdx
mov dword ptr [r9],ecx
mov rbp,rax
start2 = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov dword ptr [r9],ecx
or rax,rdx
test2();
mov dword ptr [rsp+60h],r8d
mov r9d,r8d
start2 = __rdtscp(&x);
mov r14,rax
nop word ptr [rax+rax]
test2();
rdtsc
shl rdx,20h
inc r9d
or rax,rdx
xor al,0FFh
and al,1
test2();
movzx ecx,al
lea eax,[rcx+rcx*8]
mov dword ptr [rsp+60h],eax
movsxd rax,r9d
cmp rax,3E8h
jb main+0A0h (013FFE12E0h)
finish2 = __rdtscp(&y);
The generated code doesn't contain any internal branches for either function, which is why there is no misprediction penalty.
In the first one it converts the boolean to either zero or -1 (around sbb eax,eax) and adds it to 5, giving 4 or 5. This is a pretty standard optimisation when working with booleans.
In the second one it multiplies by nine (lea eax,[rcx+rcx*8]), i.e. 4*condition + 5*condition = 9*condition, because the code that was actually compiled evidently had 5 * condition rather than 5 * !condition.
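To illustrate the first transformation (a minimal sketch of my own, not from the original post, assuming condition is exactly 0 or 1): condition ? 4 : 5 collapses to 5 - condition, which is exactly what the 0/-1 mask followed by add eax,5 computes.
#include <cassert>

// Sketch only: branchless select of 4 or 5, equivalent to what the compiler
// emits for test1 (sbb produces 0 or -1, then add eax,5 yields 5 or 4).
int select_branchless(bool condition) {
    return 5 - static_cast<int>(condition);  // condition ? 4 : 5, with no branch
}

int main() {
    assert(select_branchless(true) == 4);
    assert(select_branchless(false) == 5);
}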
https://leetcode.com/problems/number-of-provinces/
I was pretty excited when I solved this problem on my very first try, within only 20-30 minutes, but when I submitted my code I ended up in the 8.43 percentile. I looked at how the fastest solutions approached the problem and, lo and behold, the sample top solution is nearly identical to my code, yet it runs 3x faster. I've been comparing the code and can't really point out a substantial enough difference; both should be equally fast. Can anyone explain why? If I'm not mistaken it's O(mn) performance in both cases.
The following is my code. It's pretty self-explanatory, so I'm not sure heavy commenting would do any good.
class Solution {
public:
    int findCircleNum(vector<vector<int>>& isConnected) {
        int components = 0;
        vector<bool> visited (isConnected.size(), false);
        // go through each row
        for (int i = 0; i < isConnected.size(); i++) {
            // explore only unvisited items
            if (!visited[i]) {
                queue<int> q;
                q.push(i);
                components++;
                while (!q.empty()) {
                    int node = q.front();
                    q.pop();
                    visited[node] = true;
                    // push all direct connections onto the queue so we explore them
                    for (int j = 0; j < isConnected[0].size(); j++) {
                        if (isConnected[node][j] == 1 && !visited[j]) {
                            q.push(j);
                        }
                    }
                }
            }
        }
        return components;
    }
};
and the following is a sample top solution that runs 3x faster than my code.
class Solution {
public:
    int findCircleNum(vector<vector<int>>& M) {
        if (M.empty()) {
            return 0;
        }
        int count = 0;
        vector<bool> visited(M.size());
        auto bfs = [&](int student) {
            queue<int> q;
            q.push(student);
            visited[student] = true;
            while (!q.empty()) {
                auto current = q.front();
                cout << "current " << current << endl;
                q.pop();
                for (int i = 0; i < M.size(); i++) {
                    if (M[current][i] == 1 and !visited[i]) {
                        visited[i] = true;
                        q.push(i);
                    }
                }
            }
        };
        for (int r = 0; r < M.size(); r++) {
            if (visited[r] == false) {
                count++;
                bfs(r);
            }
        }
        return count;
    }
};
The difference, as far as I can [see][1], is the placement of visited[i] = true;, which saves a few memory accesses per iteration, whereas the OP's code has to re-fetch the bool.
There might also be a data or control flow dependence between
visited[node] = true;
and
!visited[j]
that is not there in the "Best" code.
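As a sketch of the change being described (my own illustration, not part of the answer), here is the OP's BFS with visited marked at push time, matching the placement used in the "Best" code:
#include <queue>
#include <vector>

// Sketch: mark a node as visited when it is pushed rather than when it is popped,
// so the inner loop never re-examines or re-pushes a node that is already queued.
void bfs_mark_on_push(const std::vector<std::vector<int>>& isConnected,
                      std::vector<bool>& visited, int start) {
    std::queue<int> q;
    q.push(start);
    visited[start] = true;                    // marked at push time
    while (!q.empty()) {
        int node = q.front();
        q.pop();
        for (int j = 0; j < (int)isConnected[node].size(); j++) {
            if (isConnected[node][j] == 1 && !visited[j]) {
                visited[j] = true;            // mark before pushing
                q.push(j);
            }
        }
    }
}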
OP code inner loop
.L118:
mov rax, QWORD PTR [rsi+rbx]
cmp DWORD PTR [rax+rcx*4], 1
jne .L116
mov rax, rbp
mov r8, rcx
sal rax, cl
mov rcx, QWORD PTR [rsp+80]
shr r8, 6
and rax, QWORD PTR [rcx+r8*8]
jne .L116
mov rax, QWORD PTR [rsp+192]
sub rax, 4
cmp rdi, rax
je .L117
"Best" code
.L76:
mov rax, QWORD PTR [rsi+rbx]
cmp DWORD PTR [rax+rcx*4], 1
jne .L74
mov rax, QWORD PTR [r12]
mov rsi, rcx
shr rsi, 6
mov rax, QWORD PTR [rax]
lea rsi, [rax+rsi*8]
mov eax, 1
sal rax, cl
mov rcx, QWORD PTR [rsi]
test rcx, rax
jne .L74
or rax, rcx <------------ visited[i] = true;
mov QWORD PTR [rsi], rax
mov rax, QWORD PTR [rsp+96]
sub rax, 4
cmp r8, rax
je .L75
mov DWORD PTR [r8], edx
add r8, 4
mov QWORD PTR [rsp+80], r8
jmp .L74
[1]: https://godbolt.org/z/obfqf7
Godbolt Link: https://godbolt.org/g/Hv6MAL
typedef int cell;

cell y;
const cell *phys_addr = (const cell*)0x12340;

int main() {
    for (int i = 0; i < 20; i++) {
        for (int j = 0; j < 30; j++) {
            for (int k = 0; k < 50; k++) {
                const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
                const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
                y = subsubarray[k];
            }
        }
    }
}
It feels natural to expect the compiler to optimize the above code to something similar to:
int main() {
    for (int i = 0; i < 20; i++) {
        const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
        for (int j = 0; j < 30; j++) {
            const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
            for (int k = 0; k < 50; k++) {
                y = subsubarray[k];
            }
        }
    }
}
but the assembly generated by gcc 8.2 with -O3 -m32 as flags is:
push ebp
push edi
push esi
push ebx
sub esp, 8
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov ebp, eax
.L4:
xor esi, esi
.L3:
lea edi, [0+esi*4]
xor eax, eax
.L2:
mov edx, DWORD PTR [ebp+0]
mov ecx, DWORD PTR [esp+4]
shr edx, 2
add edx, DWORD PTR [esp]
lea ebx, [ecx+edx*4]
lea edx, [eax+esi]
add eax, 1
mov ecx, DWORD PTR [ebx+edi]
shr ecx, 2
add edx, ecx
mov edx, DWORD PTR [ebx+edx*4]
mov DWORD PTR y, edx
cmp eax, 50
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov eax, DWORD PTR [esp]
add ebp, 4
cmp eax, 20
jne .L4
add esp, 8
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
Why isn't the compiler moving the subarray and subsubarray calculation outside the inner loops?
Random volatile does magic
I randomly added volatile to prevent DCE from getting rid of all the code, and then somehow the loop invariants got hoisted out of the inner loops.
int main() {
    for (int i = 0; i < 20; i++) {
        for (int j = 0; j < 30; j++) {
            for (int k = 0; k < 50; k++) {
                const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
                const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
                volatile cell y = subsubarray[k];
            }
        }
    }
    return 0;
}
This wasn't simply because y became a local variable, since using std::cout << subsubarray[k]; instead also prevented the optimization.
The assembly generated by gcc 8.2 with -O3 -m32 as flags for the aforementioned code is:
main:
push ebp
push edi
xor edi, edi
push esi
push ebx
sub esp, 20
mov ebp, DWORD PTR phys_addr
.L4:
mov eax, DWORD PTR [ebp+0+edi*4]
xor ecx, ecx
shr eax, 2
add eax, edi
lea ebx, [ebp+0+eax*4]
lea esi, [ebx+200]
.L3:
mov edx, DWORD PTR [ebx+ecx*4]
mov DWORD PTR [esp], ecx
shr edx, 2
add edx, ecx
sal edx, 2
lea eax, [ebx+edx]
add edx, esi
.L2:
mov ecx, DWORD PTR [eax]
add eax, 4
mov DWORD PTR [esp+16], ecx
cmp edx, eax
jne .L2
mov ecx, DWORD PTR [esp]
add ecx, 1
cmp ecx, 30
jne .L3
add edi, 1
cmp edi, 20
jne .L4
add esp, 20
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
The loop invariants are hoisted out of the inner loops. What did the random volatile do that allowed GCC to optimize the invariants? The optimization does not happen with clang 6.0.0.
It is not about a random volatile fixing your problem - the problem is deeper.
As you already guessed, the problem indeed relates to "y".
Check this example:
typedef int cell;

const cell *phys_addr = (const cell*)0x12340;

int main() {
    cell y = 1;
    for (int i = 0; i < 20; i++) {
        for (int j = 0; j < 30; j++) {
            for (int k = 0; k < 50; k++) {
                const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
                const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
                y /= subsubarray[k];
            }
        }
    }
    return y;
}
I used the trick with division to avoid more aggressive optimization (GCC can evaluate all the loops and produce y directly as a plain assignment; when using add, subtract or multiply it will also unroll the innermost loop - play with it on Godbolt to see how it looks).
Now the disassembly looks like this:
https://godbolt.org/g/R1EGSb
main:
push ebp
push edi
push esi
push ebx
sub esp, 12
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov eax, 1
.L4:
mov esi, DWORD PTR [esp]
mov edi, DWORD PTR [esp+4]
mov edx, DWORD PTR [edi+esi*4]
mov DWORD PTR [esp+8], edx
shr edx, 2
add edx, esi
xor esi, esi
lea edi, [edi+edx*4]
lea ebp, [edi+200]
.L3:
mov ebx, DWORD PTR [edi+esi*4]
shr ebx, 2
add ebx, esi
sal ebx, 2
lea ecx, [edi+ebx]
add ebx, ebp
.L2:
cdq
idiv DWORD PTR [ecx]
add ecx, 4
cmp ebx, ecx
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov edi, DWORD PTR [esp]
cmp edi, 20
jne .L4
add esp, 12
pop ebx
pop esi
pop edi
pop ebp
ret
phys_addr:
.long 74560
.L2 is the innermost loop, so the code looks as expected - subarray and subsubarray are precomputed earlier.
So you may wonder: why is everything fine when "y" is local, but not when it is global?
To be clear, "y" does not have to be declared in main. It could be made static, like this:
static cell y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;
or placed in a namespace:
namespace wtf{ cell y; }
const cell * __restrict__ phys_addr = (const cell*)0x12340;
and then referred to as wtf::y.
Still good.
It all comes down to aliasing. To see it, let's change y to a pointer first:
typedef int cell;

cell * y;
const cell * phys_addr = (const cell*)0x12340;

int main() {
    cell ylocal;
    y = &ylocal;
    for (int i = 0; i < 20; i++) {
        for (int j = 0; j < 30; j++) {
            for (int k = 0; k < 50; k++) {
                const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
                const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
                *y /= subsubarray[k];
            }
        }
    }
    return *y;
}
No loop optimization again.
The compiler must assume that y and phys_addr may overlap - a write through y may modify some of those memory cells, so the subarray and subsubarray lookups have to be recalculated with the most up-to-date data (const on phys_addr only means that you must not modify memory through that pointer, not that the memory is globally read-only).
But if you "promise" that those addresses do not overlap, the optimization comes back:
typedef int cell;

cell * __restrict__ y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;

int main() {
    cell ylocal;
    y = &ylocal;
    for (int i = 0; i < 20; i++) {
        for (int j = 0; j < 30; j++) {
            for (int k = 0; k < 50; k++) {
                const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
                const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
                *y /= subsubarray[k];
            }
        }
    }
    return *y;
}
TL;DR:
If you are using pointers, the compiler may not be able to prove that the addresses do not alias and will take the safe path. If you are 100% sure they don't alias, use __restrict__ to tell it so.
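A smaller sketch of the same effect (my own example, not from the answer): without __restrict__ the compiler has to assume the store through dst might alias *divisor and reload it on every iteration; with __restrict__ the load can be hoisted out of the loop.
typedef int cell;

// May alias: *divisor has to be reloaded on every iteration, because dst[i]
// could overwrite it.
void divide_plain(cell *dst, const cell *divisor, int n) {
    for (int i = 0; i < n; i++)
        dst[i] /= *divisor;
}

// Promised not to alias: *divisor can be loaded once before the loop.
void divide_restrict(cell * __restrict__ dst, const cell * __restrict__ divisor, int n) {
    for (int i = 0; i < n; i++)
        dst[i] /= *divisor;
}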
To give you some context on the code: this is a dot-product computation of two vectors using pointer arithmetic (looping with pointers as well). I have linked it into my main.cpp, but for some reason, when the function (defined in my ASM file) is called, I get an access violation error. Here are the two files. Thank you for your help!
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <algorithm>
#include <iostream>
using namespace std;

extern "C" int dpp_pointerr(int *v, int *u, int n); // ASM FILE

void main(void) {
    const int N = 10;
    static int A[10];
    static int B[10];

    printf("Array A: ");
    for (int i = 0; i < N; i++) { A[i] = rand() % 10; /*printf("%d ", A[i]);*/ }
    printf("\n\nArray B: ");
    for (int j = 0; j < N; j++) { B[j] = rand() % 10; /*printf("%d ", B[j]);*/ }
    printf("\n");

    int result2 = dpp_pointerr(A, B, N);
    printf("\nResult after POINTER dot product: %d\n", result2);

    __int64 ctr1 = 0, ctr2 = 0, freq = 0;
    int acc = 0, i = 0;

    if (QueryPerformanceCounter((LARGE_INTEGER *)&ctr1) != 0) {
        /****************CODE TO BE TIMED HERE**********************/
        //int result3 = dot_product_index(A, B, N);
        int result2 = dpp_pointerr(A, B, N);
        /**********************************************************/
        QueryPerformanceCounter((LARGE_INTEGER *)&ctr2);

        cout << "Start Value: " << ctr1 << endl;
        cout << "End Value: " << ctr2 << endl;

        QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
        // freq is number of counts per second. It approximates the CPU frequency
        printf("QueryPerformanceCounter minimum resolution: 1/%I64u Seconds.\n", freq);
        printf("ctr2 - ctr1: %f counts.\n", ((ctr2 - ctr1) * 1.0 / 1.0));
        cout << "65536 Increments by 1 computation time: " << ((ctr2 - ctr1) * 1.0 / freq) << " seconds\n";
    }
    else {
        DWORD dwError = GetLastError();
        printf("Error value = %d", dwError);
    }

    cout << endl;
    cout << "Press ENTER to finish";
    system("pause");
}
ASM FILE
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.11.25547.0
TITLE C:\Users\Patrick\source\repos\dot_product_legit\dot_product_legit\dpp_pointerr.cpp
.686P
.XMM
include listing.inc
.model flat, C
PUBLIC dpp_pointerr
_TEXT SEGMENT
_result$ = -32 ; size = 4
_B_beg$ = -20 ; size = 4
_A_beg$ = -8 ; size = 4
_v$ = 8 ; size = 4
_u$ = 12 ; size = 4
_n$ = 16 ; size = 4
?dpp_pointerr@@YAHPAH0H@Z:
dpp_pointerr PROC ; dot_product_pointer, COMDAT
push ebp
mov ebp, esp
sub esp, 228 ; 000000e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-228]
mov ecx, 57 ; 00000039H
mov eax, -858993460 ; ccccccccH
rep stosd
mov DWORD PTR _result$[ebp], 0
; Line 11
; Line 2
push ebp
mov ebp, esp
sub esp, 228 ; 000000e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-228]
mov ecx, 57 ; 00000039H
mov eax, -858993460 ; ccccccccH
rep stosd
; Line 5
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _v$[ebp]
mov DWORD PTR _A_beg$[ebp], ecx
; Line 6
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _u$[ebp]
mov DWORD PTR _B_beg$[ebp], ecx
; Line 8
mov DWORD PTR _result$[ebp], 0
; Line 11
mov eax, DWORD PTR _A_beg$[ebp]
mov ebx, DWORD PTR _B_beg$[ebp]
mov ecx, DWORD PTR _n$[ebp]
mov edi, DWORD PTR _v$[ebp]
lea edi, DWORD PTR [edi+ecx*4]
mov esi, DWORD PTR _u$[ebp]
lea esi, DWORD PTR [esi+ecx*4]
jmp SHORT $LN4@dot_produc
$LN2@dot_produc:
add eax, 4
add ebx, 4
$LN4@dot_produc:
cmp eax, edi
jae SHORT $LN3@dot_produc
cmp ebx, esi
jae SHORT $LN3@dot_produc
; Line 12
imul eax, ebx
add DWORD PTR _result$[ebp], eax
jmp SHORT $LN2@dot_produc
$LN3@dot_produc:
; Line 13
mov eax, DWORD PTR _result$[ebp]
; Line 14
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
dpp_pointerr ENDP ; dot_product_pointer
_TEXT ENDS
END
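For reference, here is a minimal C++ sketch of the routine the listing above appears to have been generated from, reconstructed from the question's description (this is my own guess at the source, not the OP's actual dpp_pointerr.cpp):
// Hypothetical reconstruction: dot product of two int arrays using pointer
// arithmetic, walking both arrays until either end pointer is reached.
extern "C" int dpp_pointerr(int *v, int *u, int n) {
    int result = 0;
    const int *a = v;            // corresponds to _A_beg$ in the listing
    const int *b = u;            // corresponds to _B_beg$ in the listing
    const int *a_end = v + n;
    const int *b_end = u + n;
    while (a < a_end && b < b_end) {
        result += *a * *b;       // multiply the pointed-to values and accumulate
        ++a;
        ++b;
    }
    return result;
}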
I'm trying to check for the even numbers in an array inside a struct, but I don't think I wrote it right. When debugging (e.g. count = 3, v[] = {1,2,4}), as soon as it reaches "cmp eax,[ebp+12]" and "je outt;", it jumps to outt: and that's it.
s is supposed to hold the sum of all even numbers, eax inside int suma(test *) is the index into the array, and edx keeps the running sum before it is moved into s.
What am I doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
struct test {
int v[10];
short count;
};
test a;
int s = 6;
int suma(test *)
{
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
loop:
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
outt:
mov eax, edx;
}
return s;
}
int main()
{
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
}
cout << s;
return 0;
}
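For clarity, this is the logic the inline assembly in suma is trying to implement, written as plain C++ (my own sketch, not part of the question):
// Intended behaviour of suma(): walk v[0..count) and add up the even elements.
int suma_reference(const test *t) {
    int sum = 0;
    for (int i = 0; i < t->count; i++) {
        if ((t->v[i] & 1) == 0)   // even element
            sum += t->v[i];
    }
    return sum;
}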
I was benchmarking arrays, vectors, boost::array on different compilers on Windows and Linux. I ran into the following strange thing.
I have gcc 4.7.2 on Linux 3.7.0.7, with flags:
g++ -O3 -g -Wall -c -fmessage-length=0 -std=c++11 -MMD -MP -MF"main.d" -MT"main.d" -o "main.o" "../main.cpp"
And this code:
const int arrLength = 5;
int a[arrLength];
for (int i = 0; i < arrLength; i++) {
    a[i] = i * 5;
}

srand(time(0)); // randomise at run time so it cannot be precomputed by the compiler
int numbers[10];
for (auto &i : numbers)
    i = rand();

clock_t c;
c = clock();

for (int i = 0; i < 100000000; i++) {
    for (int j = 0; j < arrLength; j++)
        a[j] += numbers[j % 10];
}

// write it out so the compiler doesn't omit the whole operation
// if the values in the array are not being used
for (int x : a)
    cout << x;
cout << endl;

cout << (float) (clock() - c) << endl;
It actually runs in 0 seconds... how can this happen?
My compiler just computes the final result. Here's my annotated source code:
asm volatile("DEBUG_IN");
for (int i = 0; i < 100000000; i++)
{
for (int j = 0; j < arrLength; j++)
{
a[j] += numbers[j % 10];
}
}
asm volatile("DEBUG_OUT");
The invocation is g++ -std=c++11 -O3 -S -masm=intel.
The result:
#APP
# 21 "/tmp/x.cpp" 1
DEBUG_IN
# 0 "" 2
#NO_APP
mov ecx, DWORD PTR [esp+60]
imul edi, DWORD PTR [esp+56], 100000000
mov eax, DWORD PTR [esp+64]
mov edx, DWORD PTR [esp+68]
mov DWORD PTR [esp+36], edi
imul edi, ecx, 99999999
lea ecx, [ecx+5+edi]
mov DWORD PTR [esp+40], ecx
imul ecx, eax, 99999999
lea eax, [eax+10+ecx]
mov DWORD PTR [esp+44], eax
imul eax, edx, 99999999
lea eax, [edx+15+eax]
mov edx, DWORD PTR [esp+72]
mov DWORD PTR [esp+48], eax
imul eax, DWORD PTR [esp+72], 99999999
lea eax, [edx+20+eax]
mov DWORD PTR [esp+52], eax
#APP
# 31 "/tmp/x.cpp" 1
DEBUG_OUT
# 0 "" 2
#NO_APP
As you can see, there are just five simple assignments: the values are loaded from [esp+56] through [esp+72], which hold the five elements of numbers that are actually used, and the results are stored to [esp+36] through [esp+52], the elements of a.
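In source terms, the optimizer effectively reduced the two timed loops to the closed form below (my own sketch of what those assignments compute; j % 10 equals j here because arrLength is 5):
// Sketch of the closed form GCC derived: each a[j] starts at j * 5 and has
// numbers[j] added to it 100000000 times (with the same integer wraparound).
for (int j = 0; j < arrLength; j++)
    a[j] = j * 5 + 100000000 * numbers[j];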