I have a simple function with an inner loop - it scales the input value, looks up an output value in a lookup table, and copies it to the destination. (ftol_ambient is a trick I copied from the web for fast conversion of float to int).
for (i = 0; i < iCount; ++i)
{
iScaled = ftol_ambient(*pSource * PRECISION3);
if (iScaled <= 0)
*pDestination = 0;
else if (iScaled >= PRECISION3)
*pDestination = 255;
else
{
iSRGB = FloatToSRGBTable3[iScaled];
*pDestination = iSRGB;
}
pSource++;
pDestination++;
}
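For reference, the usual web version of this trick looks something like the following (a sketch; the exact ftol_ambient used here isn't shown, so treat this as an assumption):
// Sketch of the common "ambient rounding" float-to-int trick. It converts
// using the FPU's current rounding mode via fistp, instead of the slow
// truncating conversion that a plain C cast requires.
inline int ftol_ambient(double f)
{
    int i;
    __asm {
        fld   f    ; push the value onto the x87 stack
        fistp i    ; convert with the current (ambient) rounding mode
    }
    return i;
}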
Now my lookup table is finite, and floats are infinite, so there's a possibility of off-by-one errors. I created a copy of the function with some code to handle that case. Notice that the only difference is the added 2 lines of code - please ignore the ugly pointer casting.
for (i = 0; i < iCount; ++i)
{
iScaled = ftol_ambient(*pSource * PRECISION3);
if (iScaled <= 0)
*pDestination = 0;
else if (iScaled >= PRECISION3)
*pDestination = 255;
else
{
iSRGB = FloatToSRGBTable3[iScaled];
if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))
++iSRGB;
*pDestination = (unsigned char) iSRGB;
}
pSource++;
pDestination++;
}
Here's the strange part. I'm testing both versions with identical input of 100000 elements, repeated 100 times. On my Athlon 64 1.8 GHz (32 bit mode), the first function takes 0.231 seconds, and the second (longer) function takes 0.185 seconds. Both functions are adjacent in the same source file, so there's no possibility of different compiler settings. I've run the tests many times, reversing the order they're run in, and the timings are roughly the same every time.
I know there's a lot of mystery in modern processors, but how is this possible?
Here for comparison are the relevant assembler outputs from the Microsoft VC++6 compiler.
; 173 : for (i = 0; i < iCount; ++i)
$L4455:
; 174 : {
; 175 : iScaled = ftol_ambient(*pSource * PRECISION3);
fld DWORD PTR [esi]
fmul DWORD PTR __real@4@400b8000000000000000
fstp QWORD PTR $T5011[ebp]
; 170 : int i;
; 171 : int iScaled;
; 172 : unsigned int iSRGB;
fld QWORD PTR $T5011[ebp]
; 173 : for (i = 0; i < iCount; ++i)
fistp DWORD PTR _i$5009[ebp]
; 176 : if (iScaled <= 0)
mov edx, DWORD PTR _i$5009[ebp]
test edx, edx
jg SHORT $L4458
; 177 : *pDestination = 0;
mov BYTE PTR [ecx], 0
; 178 : else if (iScaled >= PRECISION3)
jmp SHORT $L4461
$L4458:
cmp edx, 4096 ; 00001000H
jl SHORT $L4460
; 179 : *pDestination = 255;
mov BYTE PTR [ecx], 255 ; 000000ffH
; 180 : else
jmp SHORT $L4461
$L4460:
; 181 : {
; 182 : iSRGB = FloatToSRGBTable3[iScaled];
; 183 : *pDestination = (unsigned char) iSRGB;
mov dl, BYTE PTR _FloatToSRGBTable3[edx]
mov BYTE PTR [ecx], dl
$L4461:
; 184 : }
; 185 : pSource++;
add esi, 4
; 186 : pDestination++;
inc ecx
dec edi
jne SHORT $L4455
$L4472:
; 199 : {
; 200 : iScaled = ftol_ambient(*pSource * PRECISION3);
fld DWORD PTR [esi]
fmul DWORD PTR __real@4@400b8000000000000000
fstp QWORD PTR $T4865[ebp]
; 195 : int i;
; 196 : int iScaled;
; 197 : unsigned int iSRGB;
fld QWORD PTR $T4865[ebp]
; 198 : for (i = 0; i < iCount; ++i)
fistp DWORD PTR _i$4863[ebp]
; 201 : if (iScaled <= 0)
mov edx, DWORD PTR _i$4863[ebp]
test edx, edx
jg SHORT $L4475
; 202 : *pDestination = 0;
mov BYTE PTR [edi], 0
; 203 : else if (iScaled >= PRECISION3)
jmp SHORT $L4478
$L4475:
cmp edx, 4096 ; 00001000H
jl SHORT $L4477
; 204 : *pDestination = 255;
mov BYTE PTR [edi], 255 ; 000000ffH
; 205 : else
jmp SHORT $L4478
$L4477:
; 206 : {
; 207 : iSRGB = FloatToSRGBTable3[iScaled];
xor ecx, ecx
mov cl, BYTE PTR _FloatToSRGBTable3[edx]
; 208 : if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))
mov edx, DWORD PTR _SRGBCeiling[ecx*4]
cmp edx, DWORD PTR [esi]
jg SHORT $L4481
; 209 : ++iSRGB;
inc ecx
$L4481:
; 210 : *pDestination = (unsigned char) iSRGB;
mov BYTE PTR [edi], cl
$L4478:
; 211 : }
; 212 : pSource++;
add esi, 4
; 213 : pDestination++;
inc edi
dec eax
jne SHORT $L4472
Edit: Trying to test Nils Pipenbrinck's hypothesis, I added a couple of lines before and inside of the loop of the first function:
int one = 1;
int two = 2;
if (one == two)
++iSRGB;
The run time of the first function is now down to 0.152 seconds. Interesting.
Edit 2: Nils pointed out that the comparison would be optimized out of a release build, and indeed it is. The changes in the assembly code are very subtle; I'll post it here to see if it provides any clues. At this point I'm wondering if it's a code alignment issue.
; 175 : for (i = 0; i < iCount; ++i)
$L4457:
; 176 : {
; 177 : iScaled = ftol_ambient(*pSource * PRECISION3);
fld DWORD PTR [edi]
fmul DWORD PTR __real@4@400b8000000000000000
fstp QWORD PTR $T5014[ebp]
; 170 : int i;
; 171 : int iScaled;
; 172 : int one = 1;
fld QWORD PTR $T5014[ebp]
; 173 : int two = 2;
fistp DWORD PTR _i$5012[ebp]
; 178 : if (iScaled <= 0)
mov esi, DWORD PTR _i$5012[ebp]
test esi, esi
jg SHORT $L4460
; 179 : *pDestination = 0;
mov BYTE PTR [edx], 0
; 180 : else if (iScaled >= PRECISION3)
jmp SHORT $L4463
$L4460:
cmp esi, 4096 ; 00001000H
jl SHORT $L4462
; 181 : *pDestination = 255;
mov BYTE PTR [edx], 255 ; 000000ffH
; 182 : else
jmp SHORT $L4463
$L4462:
; 183 : {
; 184 : iSRGB = FloatToSRGBTable3[iScaled];
xor ecx, ecx
mov cl, BYTE PTR _FloatToSRGBTable3[esi]
; 185 : if (one == two)
; 186 : ++iSRGB;
; 187 : *pDestination = (unsigned char) iSRGB;
mov BYTE PTR [edx], cl
$L4463:
; 188 : }
; 189 : pSource++;
add edi, 4
; 190 : pDestination++;
inc edx
dec eax
jne SHORT $L4457
My guess is that in the first case, two different branches end up in the same branch-prediction slot on the CPU. If these two branches mispredict each time, the code slows down.
In the second loop, the added code may be just enough to move one of the branches to a different branch-prediction slot.
To be sure, you can give the Intel VTune analyzer or the AMD CodeAnalyst tool a try. These tools will show you exactly what's going on in your code.
However, keep in mind that it's most probably not worth optimizing this code further. If you tune your code to be faster on your CPU, it may at the same time become slower on a different brand.
EDIT:
If you want to read on the branch-prediction give Agner Fog's excellent web-site a try: http://www.agner.org/optimize/
This pdf explains the branch-prediction slot allocation in detail: http://www.agner.org/optimize/microarchitecture.pdf
My first guess is that the branch is being predicted better in the second case, possibly because the nested if gives whatever algorithm the processor is using more information to guess from. Just out of curiosity, what happens when you remove the line
if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))
?
How are you timing these routines? I wonder if paging or caching is having an effect on the timings? It's possible that calling the first routine loads both into memory, crosses a page boundary or causes the stack to cross into an invalid page (causing a page-in), but only the first routine pays the price.
You may want to run through both functions once before making the calls that take the measurements, to reduce the effects that virtual memory and caching might have.
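For example, a sketch (the harness, timer, and function names here are invented, since the actual timing code wasn't posted):
// Hypothetical harness: run both routines once untimed so page-ins and
// cache fills are paid before measurement begins.
ConvertSimple(pSource, pDestination, iCount);     // warm-up, not timed
ConvertChecked(pSource, pDestination, iCount);    // warm-up, not timed

double t0 = ReadTimer();                          // ReadTimer is a placeholder
for (int rep = 0; rep < 100; ++rep)
    ConvertSimple(pSource, pDestination, iCount); // timed run
double t1 = ReadTimer();                          // t1 - t0 now excludes first-touch costs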
Are you just testing this inner loop, or are you testing your undisclosed outer loop as well? If so, look at these three lines:
if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))
++iSRGB;
*pDestination = (unsigned char) iSRGB;
Now, it looks like *pDestination is the counter for the outer loop. So by sometimes doing an extra increment of the iSRGB value you get to skip some of the iterations in the outer loop, thereby reducing the total amount of work the code needs to do.
I once had a similar situation. I hoisted some code out of a loop to make it faster, but it got slower. Confusing. Turns out, the average number of times through the loop was less than 1.
The lesson (which you don't need, obviously) is that a change doesn't make your code faster unless you measure it actually running faster.
Related
It seems that every time I call this function like so, it actually works right. But I want to avoid calling functions, because a call can sometimes crash if the parameters are wrong, so I want to rely on just modifying memory addresses/values to avoid that issue.
Here is the working hook CALL.
DWORD thisPointer = 0xABCDEF;
typedef int(__thiscall* dwordHookFunction )(DWORD a1, DWORD a2);
dwordHookFunction testCall;
testCall = (dwordHookFunction)0x5E0280; //Address matches the ForceSelectionFunction offset.
testCall(thisPointer, 1);
Here is the function decompiled.
int __thiscall ForceSelectionFunction(int this, int a2)
{
int result; // eax
if ( a2 > 0 && a2 < *(_DWORD *)(this + 4) )
result = *(_DWORD *)this + 0x1E9B8 * a2;
else
result = 0;
return result;
}
How does this function modify the memory behind the this pointer, and does it even modify it? Does it modify anything at all, or just return data? It has to modify something, because otherwise it wouldn't work, I think.
How would I go about applying the changes myself?
*(unsigned int*)(0xABCDEF + 0x1E9B8 * 1) = ???
or is it
*(unsigned int*)(0xABCDEF) = 0x1E9B8 * 1 //???
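Taking the decompilation at face value (a sketch of one reading, which may be wrong): nothing is written at all; the function only computes and returns an address.
// Equivalent of the decompiled body. No memory is modified; it returns
// *(DWORD*)this + 0x1E9B8 * a2, or 0 when a2 is out of range.
DWORD thisPointer = 0xABCDEF;                     // placeholder from above
int a2 = 1;
DWORD result = 0;
if (a2 > 0 && a2 < *(int *)(thisPointer + 4))     // the bound lives at this+4
    result = *(DWORD *)thisPointer + 0x1E9B8 * a2;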
P.S. If anyone wants to look at the ASM code, here it is.
seg000:005E0280 ; =============== S U B R O U T I N E =======================================
seg000:005E0280
seg000:005E0280 ; Attributes: bp-based frame
seg000:005E0280
seg000:005E0280 ; int __thiscall ForceSelectionFunction(_DWORD this, int a2)
seg000:005E0280 ForceSelectionFunction proc near ; CODE XREF: sub_412260+618↑p
seg000:005E0280 ; sub_415240+F6↑p ...
seg000:005E0280
seg000:005E0280 var_4 = dword ptr -4
seg000:005E0280 a2 = dword ptr 8
seg000:005E0280
seg000:005E0280 push ebp
seg000:005E0281 mov ebp, esp
seg000:005E0283 push ecx
seg000:005E0284 mov [ebp+var_4], ecx
seg000:005E0287 cmp [ebp+a2], 0
seg000:005E028B jle short loc_5E0298
seg000:005E028D mov eax, [ebp+var_4]
seg000:005E0290 mov ecx, [ebp+a2]
seg000:005E0293 cmp ecx, [eax+4]
seg000:005E0296 jl short loc_5E029C
seg000:005E0298
seg000:005E0298 loc_5E0298: ; CODE XREF: ForceSelectionFunction+B↑j
seg000:005E0298 xor eax, eax
seg000:005E029A jmp short loc_5E02AA
seg000:005E029C ; ---------------------------------------------------------------------------
seg000:005E029C
seg000:005E029C loc_5E029C: ; CODE XREF: ForceSelectionFunction+16↑j
seg000:005E029C mov eax, [ebp+a2]
seg000:005E029F imul eax, 1E9B8h
seg000:005E02A5 mov edx, [ebp+var_4]
seg000:005E02A8 add eax, [edx]
seg000:005E02AA
seg000:005E02AA loc_5E02AA: ; CODE XREF: ForceSelectionFunction+1A↑j
seg000:005E02AA mov esp, ebp
seg000:005E02AC pop ebp
seg000:005E02AD retn 4
seg000:005E02AD ForceSelectionFunction endp
In an older version of this game it decompiles like this, which is pretty much the same thing, just with some struct work done.
int __thiscall ForceSelectionFunction(_DWORD *this, int a2)
{
int result; // eax
if ( a2 <= 0 || a2 >= this[1] )
result = 0;
else
result = *this + 0x1FB50 * a2;
return result;
}
I'm trying to convert the following C++ code into Assembly (MASM, Irvine32):
const int SIZE = 10;
int numbers[SIZE] = {10,60,20,33,72,89,45,65,72,18};
int limit = 50;
int index = 0;
int sum = 0;
while( index < SIZE )
{
if( numbers[index] <= limit )
{
sum = sum + numbers[index]; // sum += array[index];
}
index++;
}
If anyone could clear up where I'm going wrong -- I'm getting errors at L1: it's just spouting out "+10". I believe it's because I cannot translate sum=sum+numbers[index] into Assembly. If anyone could help me do that it would be fantastic. My attempt at translating it (lines starting from "total: mov esi, offset numbers" to "inc index") is obviously incorrect.
.data
SYZ = 10
numbers DWORD 10, 60, 20, 33, 72, 89, 45, 65, 72, 18
limit DWORD 50
index DWORD 0
sum DWORD 0
.code
main PROC
mov eax, index
mov ebx, SYZ
top: cmp eax, ebx
jae next
jb total
total: mov esi, OFFSET numbers
mov ecx, limit
cmp [esi], ecx
jbe L1
L1: add eax, ebx
inc index
jmp top
next: mov edx, sum
call WriteInt
exit
main ENDP
END main
Your conditional branch that implements the if is wrong. It should look like:
top:
...
cmp [esi], ecx
ja L1 ; conditional jump *over* an ADD instruction
add eax, [esi] ; [esi] is array[index] if you increment ESI properly...
L1: inc index
jmp top
In your C++, you can see that if numbers[index] <= limit then you want to update the sum, otherwise just increment the index and go back to the "top"; aka recheck the stopping condition.
Your original asm code was doing a condition check and then continuing regardless of the result.
cmp [esi], ecx
jbe L1 ; jump or fall-through to L1, condition irrelevant
L1: add eax, ebx
The C++ equivalent of your original asm is:
if( numbers[index] <= limit )
{
}
sum += ebx;
index++;
I'm not sure if this will solve all of your problems, but it will definitely solve one of them.
.data
SYZ = 10
numbers DWORD 10, 60, 20, 33, 72, 89, 45, 65, 72, 18
limit DWORD 50
index DWORD 0
sum DWORD 0
.code
main PROC
mov eax, index
mov ebx, SYZ
mov esi, OFFSET numbers
mov ecx, limit
mov edx, 0
top: cmp eax, ebx
jae next
cmp [esi], ecx
ja L1
add edx, [esi]
L1: inc index
mov eax, index
add esi, 4
jmp top
next: mov sum, edx
mov eax, sum
call WriteInt
exit
main ENDP
END main
this is my first post on Stack.
I usually dev in VB6 but have recently started doing more coding in C++, using the Dev-C++ IDE with the g++ compiler.
I'm having a problem with general program execution speeds.
This old VB6 code runs in 20 seconds.
DefLng A-Z
Private Sub Form_Load()
Dim n(10000, 10) As Long
Dim c(10000, 10) As Long
For d = 1 To 1000000
For dd = 1 To 10000
n(dd, 1) = c(dd, 2) + c(dd, 3)
Next
Next
MsgBox "Done"
End Sub
This C++ code takes 57 seconds...
#include <cstdlib>   // for system() and EXIT_SUCCESS

int main(int argc, char *argv[]) {
    long n[10000][10];
    long c[10000][10];
    for (long d=1;d<1000000;d++){
        for (long dd=1;dd<10000;dd++){
            n[dd][1]=c[dd][2]+c[dd][3];
        }
    }
    system("PAUSE");
    return EXIT_SUCCESS;
}
Most of the coding I do is AI related and very heavy on array usage. I've tried using int rather than long, I've tried different machines, the C++ always runs at least three times slower.
Am I being dumb? Can anyone explain what I'm doing wrong?
Cheers.
Short answer
You need to look into your compiler optimization settings. This resource might help
Takeaway: C++ allows you to use many tricks, some generic and some dependent on your architecture; when used properly, it will be superior to VB in terms of performance.
Long answer
Keep in mind this is highly dependent on your architecture and compiler, and also on the compiler settings. You should configure your compiler to do more aggressive optimization.
Also, you should write optimized code, taking memory access into account and using the CPU cache wisely, etc.
I have done a test for you on an Ubuntu 16.04 virtual machine, using one core of an Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz. The compiler is g++ 5.4.0; using the code below, here are my times depending on the optimization level.
I'm using optimization levels 0, 1, 2, 3, and s, and obtain 36 s (completely unoptimized), 2.3 s, and then... zero.
osboxes@osboxes:~/test$ g++ a.cpp -O0 -o a0
osboxes@osboxes:~/test$ ./a0 start..finished in 36174855 micro seconds
osboxes@osboxes:~/test$ g++ a.cpp -O1 -o a1
osboxes@osboxes:~/test$ ./a1 start..finished in 2352767 micro seconds
osboxes@osboxes:~/test$ g++ a.cpp -O2 -o a2
osboxes@osboxes:~/test$ ./a2 start..finished in 0 micro seconds
osboxes@osboxes:~/test$ g++ a.cpp -O3 -o a3
osboxes@osboxes:~/test$ ./a3 start..finished in 0 micro seconds
osboxes@osboxes:~/test$ g++ a.cpp -Os -o as
osboxes@osboxes:~/test$ ./as start..finished in 0 micro seconds
Note that at the more aggressive optimization levels the compiler eliminates the code completely, because the values in n[] are never used by the program.
To force the compiler to generate code, use the volatile keyword when declaring n.
With volatile added, you'll now get ~12s even with the most aggressive optimization (on my machine):
osboxes@osboxes:~/test$ g++ a.cpp -O3 -o a3
osboxes@osboxes:~/test$ ./a3 start..finished in 12139348 micro seconds
osboxes@osboxes:~/test$ g++ a.cpp -Os -o as
osboxes@osboxes:~/test$ ./as start..finished in 12493927 micro seconds
The code I used for the test (based on your example):
#include <iostream>
#include <sys/time.h>
using namespace std;
typedef unsigned long long u64;
u64 timestamp()
{
struct timeval now;
gettimeofday(&now, NULL);
return now.tv_usec + (u64)now.tv_sec*1000000;
}
int main()
{
cout<<"start"<<endl;
u64 t0 = timestamp();
volatile long n[10000][10];
long c[10000][10];
for(long d=1;d<1000000;d++)
{
for(long dd=1;dd<10000;dd++)
{
n[dd][1]=c[dd][2]+c[dd][3];
}
}
u64 t1 = timestamp();
cout<<"..finished in "<< (t1-t0) << " micro seconds\n";
return 0;
}
Multithreading
I have converted your code to use multithreading; with 2 threads I am able to cut the time in half.
I am exploiting the fact that, as it stands, the results are not used, so the inner loop is not dependent on the outer one; in reality you should find another way to split the work so that the results do not overwrite one another.
#include <iostream>
#include <sys/time.h>
#include <omp.h>
using namespace std;
typedef unsigned long long u64;
u64 timestamp()
{
struct timeval now;
gettimeofday(&now, NULL);
return now.tv_usec + (u64)now.tv_sec*1000000;
}
int main()
{
omp_set_num_threads(2);
#pragma omp parallel
{
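// Empty parallel region: presumably here to spin up the OpenMP thread pool
// before the timed section starts (an assumption about the intent).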
}
cout<<"start"<<endl;
u64 t0 = timestamp();
volatile long n[10000][10];
long c[10000][10];
for(long d=1;d<1000000;d++)
{
#pragma omp parallel for
for(long dd=1;dd<10000;dd++)
{
n[dd][1]=c[dd][2]+c[dd][3];
}
}
u64 t1 = timestamp();
cout<<"..finished in "<< (t1-t0) << " micro seconds\n";
return 0;
}
osboxes@osboxes:~/test$ g++ a.cpp -O3 -fopenmp -o a3
osboxes@osboxes:~/test$ ./a3 start..finished in 6673741 micro seconds
UPDATE: Recent C++ compilers give much better results compared to the VB6 compiler.
The VB6 code shown above does not reflect the real case, where the access index can vary and calculations can be broken out into separate functions.
Further experiments show that VB6 pays a huge optimization penalty when the arrays used are passed as function inputs (by reference).
I tried several C++ compilers and rewrote the benchmark code so that some random behavior tricks the optimizer. Certainly the code could be better; all suggestions are welcome. I used the -O3 option to maximize speed; multithreaded mode was not used.
RESULT:
g++ 8.1 gives the best final result: 5 seconds with const index access (known to the compiler, the case shown in the OP) and 6 seconds with dynamic index access.
VC 2017 takes only 2 seconds for const index access but 35 seconds for the dynamic case.
VB6 S1: original version, n and c are local variables. VB6 S2: the inner loop was moved into a function, with n and c passed in by reference.
Test conditions: Intel Xeon W3690 / Windows 10.
The rewritten code:
VB6:
DefLng A-Z
Private Declare Function GetTickCount Lib "kernel32" () As Long
Public Sub cal_var(ByRef n() As Long, ByRef c() As Long, id As Long)
For dd = 0 To 10000
n(dd, id) = c(dd, id + 1) + c(dd, id + 2)
Next
End Sub
Public Sub cal_const(ByRef n() As Long, ByRef c() As Long)
For dd = 0 To 10000
n(dd, 1) = c(dd, 2) + c(dd, 3)
Next
End Sub
Private Sub Form_Load()
Dim n(10001, 10) As Long
Dim c(10001, 10) As Long
Dim t0 As Long
Dim t1 As Long
Dim t2 As Long
Dim id As Long
Dim ret As Long
t0 = GetTickCount
For d = 1 To 1000000
id = d And 7
Call cal_var(n, c, id) 'For VB S2
'For dd = 0 To 10000
' n(dd, id + 0) = c(dd, id + 1) + c(dd, id + 2)
'Next
Next
t1 = GetTickCount
For d = 1 To 1000000
Call cal_const(n, c) 'For VB S2
'For dd = 0 To 10000
' n(dd, 1) = c(dd, 2) + c(dd, 3)
'Next
Next
t2 = GetTickCount
For d = 0 To 10000
Sum = Sum + n(d, t0 And 7)
Next
MsgBox "Done in " & (t1 - t0) & " and " & (t2 - t1) & " miliseconds"
End Sub
C++ Code:
#include <iostream>
#include <time.h>
#include <string.h>
using namespace std;
#define NUM_ITERATION 1000000
#define ARR_SIZE 10000
typedef long long_arr[ARR_SIZE][10];
void calc_var(long_arr &n, long_arr &c, long id) {
for (long dd = 0; dd < ARR_SIZE; dd++) {
n[dd][id] = c[dd][id + 1] + c[dd][id + 2];
}
}
void calc_const(long_arr &n, long_arr &c) {
for (long dd = 0; dd < ARR_SIZE; dd++) {
n[dd][0] = c[dd][1] + c[dd][2];
}
}
int main()
{
cout << "start ..." << endl;
time_t t0 = time(NULL);
long_arr n;
long_arr c;
memset(n, 0, sizeof(n));
memset(c, t0 & 1, sizeof(c));
for (long d = 1; d < NUM_ITERATION; d++) {
calc_var(n, c, (long)t0 & 7);
}
time_t t1 = time(NULL);
for (long d = 1; d < NUM_ITERATION; d++) {
calc_const(n, c);
}
time_t t2 = time(NULL);
long sum = 0;
for (int i = 0; i < ARR_SIZE; i++) {
sum += n[i][t0 & 7];
}
cout << "by dynamic index: finished in " << (t1 - t0) << " seconds" << endl;
cout << "by const index: finished in " << (t2 - t1) << " seconds" << endl;
cout << "with final result : " << sum << endl;
return 0;
}
The following original answer was based only on tests done in VC2008, the VB6 S1 case.
ORIGINAL ANSWER:
I got the same result as Progger. I use the following code to keep the compiler from optimizing the loops away to nothing:
#include <string.h>   // for memset

int main(int argc, char *argv[]) {
long n[10000][10];
long c[10000][10];
long sum = 0;
memset(n, 0, sizeof(n) );
memset(c, 0, sizeof(c) );
for (long d=1;d<1000000;d++){
for (long dd=1;dd<10000;dd++){
n[dd][1]=c[dd][2]+c[dd][3];
}
}
for (long dd=1;dd<10000;dd++){
sum += n[dd][1];
}
return sum;
}
I used Visual Studio C++ 2008 to compile the code. It turns out that the compiler emits a lot of address recalculation for the local variables, mixed with imul (multiplication) instructions, which can be very costly. For example:
.text:00401065 mov edx, [ebp+var_C3510]
.text:0040106B imul edx, 28h
.text:0040106E mov eax, [ebp+var_C3510]
.text:00401074 imul eax, 28h
.text:00401077 mov ecx, [ebp+edx+var_C3500]
.text:0040107E add ecx, [ebp+eax+var_C34FC]
.text:00401085 mov edx, [ebp+var_C3510]
.text:0040108B imul edx, 28h
.text:0040108E mov [ebp+edx+var_61A7C], ecx
Each access through an index takes one multiplication and one add instruction.
EDIT1: The problem is related to multidimensional array access performance in C++. More information can be found here:
Array Lookups
C++ doesn't provide multidimensional arrays so scientific and engineering applications either write their own or use one of the available array libraries such as Eigen, Armadillo, uBLAS, or Boost.MultiArray. High quality C++ array libraries can provide generally excellent performance but yet simple element lookup speed can still lag that of Fortran. One reason is that Fortran arrays are built in to the language so its compilers can figure out array index strides in loops and avoid computing the memory offset from the indexes for each element. We can't change the C++ compiler so the next best solution is to offer a linear, or flat, 1D indexing operator for multidimensional arrays that can be used to improve performance of hot spot loops.
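As an illustration of that flat, 1D indexing idea (a minimal sketch of my own, not code from the linked article), the row offset can be carried as a running pointer so the inner loop needs no multiply at all:
// Flat 1D layout for a 10000x10 array of long; rows advance by a plain add.
const long ROWS = 10000, COLS = 10;
long *n = new long[ROWS * COLS]();     // zero-initialized
long *c = new long[ROWS * COLS]();
long *nr = n + COLS, *cr = c + COLS;   // start at row 1, as in the loop above
for (long dd = 1; dd < ROWS; dd++) {
    nr[1] = cr[2] + cr[3];             // same as n[dd][1] = c[dd][2] + c[dd][3]
    nr += COLS;                        // next row: one add, no imul
    cr += COLS;
}
delete[] n;
delete[] c;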
The VB version uses the traditional approach (optimized flat 1D memory); the instructions used are only register adds (eax, edx, esi, ...), and they are much faster.
.text:00401AD4 mov eax, [ebp-54h]
.text:00401AD7 mov ecx, [eax+esi*4+13888h]
.text:00401ADE mov edx, [eax+esi*4+1D4CCh]
.text:00401AE5 add ecx, edx
.text:00401AE7 mov edx, [ebp-2Ch]
.text:00401AEA mov eax, edi
That answers the question of speed.
Advice: you should use existing libraries (for example, uBLAS) if possible. More discussion can be found here.
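For instance, with Boost.uBLAS the loop might look like this (a sketch, untested against this benchmark):
#include <boost/numeric/ublas/matrix.hpp>
using boost::numeric::ublas::matrix;

matrix<long> n(10000, 10), c(10000, 10);   // dense, row-major storage
for (long dd = 1; dd < 10000; dd++)
    n(dd, 1) = c(dd, 2) + c(dd, 3);        // element access via operator()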
Here is the machine code of the C++ version:
.text:00401000 ; int __cdecl main(int argc, const char **argv, const char **envp)
.text:00401000 _main proc near ; CODE XREF: ___tmainCRTStartup+F6p
.text:00401000
.text:00401000 var_C3514 = dword ptr -0C3514h
.text:00401000 var_C3510 = dword ptr -0C3510h
.text:00401000 var_C350C = dword ptr -0C350Ch
.text:00401000 var_C3500 = dword ptr -0C3500h
.text:00401000 var_C34FC = dword ptr -0C34FCh
.text:00401000 var_61A84 = dword ptr -61A84h
.text:00401000 var_61A7C = dword ptr -61A7Ch
.text:00401000 argc = dword ptr 8
.text:00401000 argv = dword ptr 0Ch
.text:00401000 envp = dword ptr 10h
.text:00401000
.text:00401000 push ebp
.text:00401001 mov ebp, esp
.text:00401003 mov eax, 0C3514h
.text:00401008 call __alloca_probe
.text:0040100D mov [ebp+var_61A84], 0
.text:00401017 mov [ebp+var_C350C], 1
.text:00401021 jmp short loc_401032
.text:00401023 ; ---------------------------------------------------------------------------
.text:00401023
.text:00401023 loc_401023: ; CODE XREF: _main:loc_401097j
.text:00401023 mov eax, [ebp+var_C350C]
.text:00401029 add eax, 1
.text:0040102C mov [ebp+var_C350C], eax
.text:00401032
.text:00401032 loc_401032: ; CODE XREF: _main+21j
.text:00401032 cmp [ebp+var_C350C], 0F4240h
.text:0040103C jge short loc_401099
.text:0040103E mov [ebp+var_C3510], 1
.text:00401048 jmp short loc_401059
.text:0040104A ; ---------------------------------------------------------------------------
.text:0040104A
.text:0040104A loc_40104A: ; CODE XREF: _main+95j
.text:0040104A mov ecx, [ebp+var_C3510]
.text:00401050 add ecx, 1
.text:00401053 mov [ebp+var_C3510], ecx
.text:00401059
.text:00401059 loc_401059: ; CODE XREF: _main+48j
.text:00401059 cmp [ebp+var_C3510], 2710h
.text:00401063 jge short loc_401097
.text:00401065 mov edx, [ebp+var_C3510]
.text:0040106B imul edx, 28h
.text:0040106E mov eax, [ebp+var_C3510]
.text:00401074 imul eax, 28h
.text:00401077 mov ecx, [ebp+edx+var_C3500]
.text:0040107E add ecx, [ebp+eax+var_C34FC]
.text:00401085 mov edx, [ebp+var_C3510]
.text:0040108B imul edx, 28h
.text:0040108E mov [ebp+edx+var_61A7C], ecx
.text:00401095 jmp short loc_40104A
.text:00401097 ; ---------------------------------------------------------------------------
.text:00401097
.text:00401097 loc_401097: ; CODE XREF: _main+63j
.text:00401097 jmp short loc_401023
.text:00401099 ; ---------------------------------------------------------------------------
.text:00401099
.text:00401099 loc_401099: ; CODE XREF: _main+3Cj
.text:00401099 mov [ebp+var_C3514], 1
.text:004010A3 jmp short loc_4010B4
.text:004010A5 ; ---------------------------------------------------------------------------
.text:004010A5
.text:004010A5 loc_4010A5: ; CODE XREF: _main+DCj
.text:004010A5 mov eax, [ebp+var_C3514]
.text:004010AB add eax, 1
.text:004010AE mov [ebp+var_C3514], eax
.text:004010B4
.text:004010B4 loc_4010B4: ; CODE XREF: _main+A3j
.text:004010B4 cmp [ebp+var_C3514], 2710h
.text:004010BE jge short loc_4010DE
.text:004010C0 mov ecx, [ebp+var_C3514]
.text:004010C6 imul ecx, 28h
.text:004010C9 mov edx, [ebp+var_61A84]
.text:004010CF add edx, [ebp+ecx+var_61A7C]
.text:004010D6 mov [ebp+var_61A84], edx
.text:004010DC jmp short loc_4010A5
.text:004010DE ; ---------------------------------------------------------------------------
.text:004010DE
.text:004010DE loc_4010DE: ; CODE XREF: _main+BEj
.text:004010DE mov eax, [ebp+var_61A84]
.text:004010E4 mov esp, ebp
.text:004010E6 pop ebp
.text:004010E7 retn
.text:004010E7 _main endp
The VB code is longer, so I'll post only the main function here:
.text:00401A81 loc_401A81: ; CODE XREF: .text:00401B18j
.text:00401A81 mov ecx, [ebp-68h]
.text:00401A84 mov eax, 0F4240h
.text:00401A89 cmp ecx, eax
.text:00401A8B jg loc_401B1D
.text:00401A91 mov edx, [ebp-18h]
.text:00401A94 mov edi, 1
.text:00401A99 add edx, 1
.text:00401A9C mov esi, edi
.text:00401A9E jo loc_401CEC
.text:00401AA4 mov [ebp-18h], edx
.text:00401AA7
.text:00401AA7 loc_401AA7: ; CODE XREF: .text:00401B03j
.text:00401AA7 mov eax, 2710h
.text:00401AAC cmp esi, eax
.text:00401AAE jg short loc_401B05
.text:00401AB0 mov ebx, ds:__vbaGenerateBoundsError
.text:00401AB6 cmp esi, 2711h
.text:00401ABC jb short loc_401AC0
.text:00401ABE call ebx ; __vbaGenerateBoundsError
.text:00401AC0 ; ---------------------------------------------------------------------------
.text:00401AC0
.text:00401AC0 loc_401AC0: ; CODE XREF: .text:00401ABCj
.text:00401AC0 cmp esi, 2711h
.text:00401AC6 jb short loc_401AD4
.text:00401AC8 call ebx ; __vbaGenerateBoundsError
.text:00401ACA ; ---------------------------------------------------------------------------
.text:00401ACA cmp esi, 2711h
.text:00401AD0 jb short loc_401AD4
.text:00401AD2 call ebx
.text:00401AD4
.text:00401AD4 loc_401AD4: ; CODE XREF: .text:00401AC6j
.text:00401AD4 ; .text:00401AD0j
.text:00401AD4 mov eax, [ebp-54h]
.text:00401AD7 mov ecx, [eax+esi*4+13888h]
.text:00401ADE mov edx, [eax+esi*4+1D4CCh]
.text:00401AE5 add ecx, edx
.text:00401AE7 mov edx, [ebp-2Ch]
.text:00401AEA mov eax, edi
.text:00401AEC jo loc_401CEC
.text:00401AF2 add eax, esi
.text:00401AF4 mov [edx+esi*4+9C44h], ecx
.text:00401AFB jo loc_401CEC
.text:00401B01 mov esi, eax
.text:00401B03 jmp short loc_401AA7
.text:00401B05 ; ---------------------------------------------------------------------------
.text:00401B05
.text:00401B05 loc_401B05: ; CODE XREF: .text:00401AAEj
.text:00401B05 mov ecx, [ebp-68h]
.text:00401B08 mov eax, 1
.text:00401B0D add eax, ecx
.text:00401B0F jo loc_401CEC
.text:00401B15 mov [ebp-68h], eax
.text:00401B18 jmp loc_401A81
.text:00401B1D ; ---------------------------------------------------------------------------
.text:00401B1D
.text:00401B1D loc_401B1D: ; CODE XREF: .text:00401A8Bj
.text:00401B1D mov edi, 1
.text:00401B22 mov ebx, 2710h
.text:00401B27 mov esi, edi
.text:00401B29
.text:00401B29 loc_401B29: ; CODE XREF: .text:00401B5Fj
.text:00401B29 cmp esi, ebx
.text:00401B2B jg short loc_401B61
.text:00401B2D cmp esi, 2711h
.text:00401B33 jb short loc_401B3B
.text:00401B35 call ds:__vbaGenerateBoundsError
.text:00401B3B ; ---------------------------------------------------------------------------
.text:00401B3B
.text:00401B3B loc_401B3B: ; CODE XREF: .text:00401B33j
.text:00401B3B mov ecx, [ebp-2Ch]
.text:00401B3E mov eax, [ebp-40h]
.text:00401B41 mov edx, [ecx+esi*4+9C44h]
.text:00401B48 add edx, eax
.text:00401B4A mov eax, edi
.text:00401B4C jo loc_401CEC
.text:00401B52 add eax, esi
.text:00401B54 mov [ebp-40h], edx
.text:00401B57 jo loc_401CEC
.text:00401B5D mov esi, eax
.text:00401B5F jmp short loc_401B29
.text:00401B61 ; ---------------------------------------------------------------------------
.text:00401B61
.text:00401B61 loc_401B61: ; CODE XREF: .text:00401B2Bj
.text:00401B61 mov ebx, ds:__vbaStrI4
.text:00401B67 mov ecx, 80020004h
.text:00401B6C mov [ebp-0B4h], ecx
.text:00401B72 mov [ebp-0A4h], ecx
.text:00401B78 mov [ebp-94h], ecx
.text:00401B7E mov ecx, [ebp-40h]
.text:00401B81 mov eax, 0Ah
.text:00401B86 push offset aDone ; "Done : "
The question says it all: Is
for( int k = 5; k--;)
faster than
for( int k = 4; k > -1; --k)
and why?
EDIT:
I generated the assembly for debug and release in MSVC2012. But (it's my first time analyzing assembly code) I can't really make sense of it. I already added the std::cout to prevent the compiler from removing both loops during release optimization.
Can someone help me understand what the assembly means?
Debug:
; 10 : for( int k = 5; k--;){ std::cout << k; }
mov DWORD PTR _k$2[ebp], 5
$LN5@wmain:
mov eax, DWORD PTR _k$2[ebp]
mov DWORD PTR tv65[ebp], eax
mov ecx, DWORD PTR _k$2[ebp]
sub ecx, 1
mov DWORD PTR _k$2[ebp], ecx
cmp DWORD PTR tv65[ebp], 0
je SHORT $LN4@wmain
mov esi, esp
mov eax, DWORD PTR _k$2[ebp]
push eax
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@H@Z
cmp esi, esp
call __RTC_CheckEsp
jmp SHORT $LN5@wmain
$LN4@wmain:
; 11 :
; 12 : for( int k = 4; k > -1; --k){ std::cout << k; }
mov DWORD PTR _k$1[ebp], 4
jmp SHORT $LN3@wmain
$LN2@wmain:
mov eax, DWORD PTR _k$1[ebp]
sub eax, 1
mov DWORD PTR _k$1[ebp], eax
$LN3@wmain:
cmp DWORD PTR _k$1[ebp], -1
jle SHORT $LN6@wmain
mov esi, esp
mov eax, DWORD PTR _k$1[ebp]
push eax
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@H@Z
cmp esi, esp
call __RTC_CheckEsp
jmp SHORT $LN2@wmain
$LN6@wmain:
Release:
; 10 : for( int k = 5; k--;){ std::cout << k; }
mov esi, 5
$LL5@wmain:
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
dec esi
push esi
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@H@Z
test esi, esi
jne SHORT $LL5@wmain
; 11 :
; 12 : for( int k = 4; k > -1; --k){ std::cout << k; }
mov esi, 4
npad 3
$LL3@wmain:
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
push esi
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@H@Z
dec esi
cmp esi, -1
jg SHORT $LL3@wmain
[ UPDATE: the question has been updated, so this is no longer different. ] They do different things... the first one executes the loop body for k values 4 down to 0, while the second one loops from 5 down to 1... if, say, the loop body does work related to the magnitude of the number, then they might differ in performance.
Ignoring that, on most CPUs k-- incidentally sets the flags register (specifically the "zero" flag), so no further explicit comparison is needed before deciding whether to exit. Still, an optimiser should realise that and avoid any unnecessary second comparison even in the second loop.
Generic quip: compilers are allowed to do lots of things, and the Standard certainly doesn't say anything about the relative performance of these two implementations, so ultimately the only way to know (if you have reason to care) is to use the same compiler and command-line options you want for production, then inspect the generated assembly or machine code and/or measure very carefully. The findings could differ when the executable is deployed on different hardware, built with a later version of the compiler, with different flags, with a different compiler, etc.
Be careful, the two loops are not equivalent:
for( int k = 5; k--;) cout << k << endl;
prints 4 3 2 1 0. While
for( int k = 5; k > 0; k--) cout << k << endl;
prints 5 4 3 2 1.
From a performance point of view, you can have confidence in your compiler. Modern compilers know how to optimize this better than we do, in most cases.
It depends on your compiler. Probably not, but as always, one must profile to be certain. Or you could look at the generated assembly code (e.g. gcc -S) and see if it's any different. Make sure to enable optimization before you test, too!
I have been doing some testing to see how much of a difference additional bounds checking makes in loops. This is prompted by thinking about the cost of the implicit bounds checking inserted by languages such as C# and Java when you access arrays.
Update: I have tried the same executable program out on several additional computers, which throws a lot more light onto what is happening. I've listed the original computer first, and second my modern laptop. On my modern laptop, adding additional checks in the loop adds only between 1 and 4% to the time taken, compared to between 3 and 30% for the original hardware.
Processor x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz
Ratio 2 checks : 1 check = 1.0310
Ratio 3 checks : 1 check = 1.2769
Processor Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s)
Ratio 2 checks : 1 check = 1.0090
Ratio 3 checks : 1 check = 1.0393
Processor Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Cores(s)
Ratio 2 checks : 1 check = 1.0035
Ratio 3 checks : 1 check = 1.0639
Processor Intel(R) Core(TM)2 Duo CPU T9300 @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s)
Ratio 2 checks : 1 check = 1.1195
Ratio 3 checks : 1 check = 1.3597
Processor x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz
Ratio 2 checks : 1 check = 1.0776
Ratio 3 checks : 1 check = 1.1451
In the test program below, the first function checks just one bound, the second function checks two, and the third checks three (in the calling code, n1=n2=n3). I found that the ratio of two checks to one was about 1.03, and the ratio of three checks to one was about 1.3. I was surprised that adding one more check made such a difference to performance. I got an interesting answer concerning the low cost of bounds checking on modern processors to my original question, which may throw some light on the differences observed here.
Note that it's important to compile the program without whole program optimization turned on; otherwise the compiler can simply remove the additional bounds checking.
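For example (an assumption about the build, since the exact project settings aren't shown), compiling the two translation units separately without the /GL whole-program-optimization switch keeps the SumProduct calls opaque to the optimizer:
cl /O2 /EHsc /c dotprod.cpp
cl /O2 /EHsc main.cpp dotprod.obj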
// dotprod.cpp
#include "dotprod.h"
double SumProduct(const double* v1, const double* v2, int n)
{
double sum=0;
for(int i=0;
i<n;
++i)
sum += v1[i]*v2[i];
return sum;
}
double SumProduct(const double* v1, const double* v2, int n1, int n2)
{
double sum=0;
for(int i=0;
i<n1 && i <n2;
++i)
sum += v1[i]*v2[i];
return sum;
}
double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3)
{
double sum=0;
for(int i=0;
i<n1 && i <n2 && i <n3;
++i)
sum += v1[i]*v2[i];
return sum;
}
This code was originally built using Visual Studio 2010, Release, Win32 (I've added the 'C' tag because the reasoning behind the difference in speed is not likely to be C++ specific, and may not be Windows specific). Can anyone explain it?
Rest of the code below, for information. This has some C++ specific stuff in it.
Header file
// dotprod.h
double SumProduct(const double*, const double*, int n);
double SumProduct(const double*, const double*, int n1, int n2);
double SumProduct(const double*, const double*, int n1, int n2, int n3);
Test harness
// main.cpp
#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>
#include <windows.h>
#include "../dotprod/dotprod.h" // separate lib
typedef __int64 timecount_t;
inline timecount_t GetTimeCount()
{
LARGE_INTEGER li;
if (!QueryPerformanceCounter(&li)) {
exit(1);
}
return li.QuadPart;
}
int main()
{
typedef std::vector<double> dvec;
const int N = 100 * 1000;
// Initialize
dvec v1(N);
dvec v2(N);
dvec dp1(N);
dvec dp2(N);
dvec dp3(N);
for(int i=0; i<N; ++i) {
v1[i] = i;
v2[i] = log(static_cast<double>(i+1));
}
const timecount_t t0 = GetTimeCount();
// Check cost with one bound
for(int n=0; n<N; ++n) {
dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n);
}
const timecount_t t1 = GetTimeCount();
// Check cost with two bounds
for(int n=0; n<N; ++n) {
dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n);
}
const timecount_t t2 = GetTimeCount();
// Check cost with three bounds
for(int n=0; n<N; ++n) {
dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n);
}
const timecount_t t3 = GetTimeCount();
// Check results
const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0);
const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0);
const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0);
printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3);
// Output timings
const timecount_t elapsed1 = t1-t0;
const timecount_t elapsed2 = t2-t1;
const timecount_t elapsed3 = t3-t2;
printf("Elapsed: %.0f, %.0f, %.0f\n",
static_cast<double>(elapsed1),
static_cast<double>(elapsed2),
static_cast<double>(elapsed3));
const double ratio2to1 = elapsed2 / static_cast<double>(elapsed1);
const double ratio3to1 = elapsed3 / static_cast<double>(elapsed1);
printf("Ratio 2:1=%.2f\n", ratio2to1);
printf("Ratio 3:1=%.2f\n", ratio3to1);
return 0;
}
In order to produce assembly, I took the advice in this answer (case 2, turning off whole program optimization), producing the following asm file.
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
PUBLIC __real@0000000000000000
PUBLIC ?SumProduct@@YANPBN0HHH@Z ; SumProduct
EXTRN __fltused:DWORD
; COMDAT __real@0000000000000000
; File c:\dev\testspeed\dotprod\dotprod.cpp
CONST SEGMENT
__real@0000000000000000 DQ 00000000000000000r ; 0
; Function compile flags: /Ogtp
CONST ENDS
; COMDAT ?SumProduct@@YANPBN0HHH@Z
_TEXT SEGMENT
tv491 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
_n3$ = 24 ; size = 4
?SumProduct@@YANPBN0HHH@Z PROC ; SumProduct, COMDAT
; 25 : {
push ebp
mov ebp, esp
push ecx
; 26 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 27 : for(int i=0;
xor ecx, ecx
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, 4
jl $LC8@SumProduct
; 26 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea esi, DWORD PTR [edi+24]
; 30 : sum += v1[i]*v2[i];
sub edi, ebx
lea edx, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv491[ebp], edi
$LN15@SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
mov ebx, DWORD PTR _n2$[ebp]
cmp ecx, ebx
jge $LN9@SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge $LN9@SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax-8]
lea edi, DWORD PTR [edx-1]
fmul QWORD PTR [esi-24]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9@SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9@SumProduct
; 30 : sum += v1[i]*v2[i];
mov edi, DWORD PTR tv491[ebp]
fld QWORD PTR [edi+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp edx, ebx
jge SHORT $LN9@SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edx, DWORD PTR _n3$[ebp]
jge SHORT $LN9@SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+8]
lea edi, DWORD PTR [edx+1]
fmul QWORD PTR [esi-8]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9@SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9@SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [esi]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add esi, 32 ; 00000020H
faddp ST(1), ST(0)
add edx, 4
cmp ecx, ebx
jl SHORT $LN15@SumProduct
mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp ecx, edi
jge SHORT $LN9@SumProduct
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3@SumProduct:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9@SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge SHORT $LN9@SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3@SumProduct
$LN9@SumProduct:
; 31 : return sum;
; 32 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct@@YANPBN0HHH@Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct@@YANPBN0HH@Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct@@YANPBN0HH@Z
_TEXT SEGMENT
tv448 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
?SumProduct@@YANPBN0HH@Z PROC ; SumProduct, COMDAT
; 15 : {
push ebp
mov ebp, esp
push ecx
; 16 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 17 : for(int i=0;
xor ecx, ecx
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp edi, 4
jl SHORT $LC8@SumProduct@2
; 16 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea edx, DWORD PTR [edi+24]
; 20 : sum += v1[i]*v2[i];
sub edi, ebx
lea esi, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv448[ebp], edi
$LN19@SumProduct@2:
mov edi, DWORD PTR _n2$[ebp]
cmp ecx, edi
jge SHORT $LN9@SumProduct@2
fld QWORD PTR [eax-8]
lea ebx, DWORD PTR [esi-1]
fmul QWORD PTR [edx-24]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9@SumProduct@2
mov ebx, DWORD PTR tv448[ebp]
fld QWORD PTR [ebx+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp esi, edi
jge SHORT $LN9@SumProduct@2
fld QWORD PTR [eax+8]
lea ebx, DWORD PTR [esi+1]
fmul QWORD PTR [edx-8]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9@SumProduct@2
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [edx]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add edx, 32 ; 00000020H
faddp ST(1), ST(0)
add esi, 4
cmp ecx, ebx
jl SHORT $LN19@SumProduct@2
mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct@2:
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp ecx, edi
jge SHORT $LN9@SumProduct@2
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3@SumProduct@2:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9@SumProduct@2
; 20 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3@SumProduct@2
$LN9@SumProduct@2:
; 21 : return sum;
; 22 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct@@YANPBN0HH@Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct@@YANPBN0H@Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct@@YANPBN0H@Z
_TEXT SEGMENT
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
?SumProduct@@YANPBN0H@Z PROC ; SumProduct, COMDAT
; _n$ = eax
; 5 : {
push ebp
mov ebp, esp
mov edx, DWORD PTR _v2$[ebp]
; 6 : double sum=0;
fldz
push ebx
push esi
mov esi, eax
; 7 : for(int i=0;
xor ebx, ebx
push edi
mov edi, DWORD PTR _v1$[ebp]
; 8 : i<n;
; 9 : ++i)
cmp esi, 4
jl SHORT $LC9@SumProduct@3
; 6 : double sum=0;
lea eax, DWORD PTR [edx+8]
lea ecx, DWORD PTR [edi+24]
; 10 : sum += v1[i]*v2[i];
sub edi, edx
lea edx, DWORD PTR [esi-4]
shr edx, 2
inc edx
lea ebx, DWORD PTR [edx*4]
$LN10@SumProduct@3:
fld QWORD PTR [eax-8]
add eax, 32 ; 00000020H
fmul QWORD PTR [ecx-24]
add ecx, 32 ; 00000020H
dec edx
faddp ST(1), ST(0)
fld QWORD PTR [edi+eax-32]
fmul QWORD PTR [eax-32]
faddp ST(1), ST(0)
fld QWORD PTR [eax-24]
fmul QWORD PTR [ecx-40]
faddp ST(1), ST(0)
fld QWORD PTR [eax-16]
fmul QWORD PTR [ecx-32]
faddp ST(1), ST(0)
jne SHORT $LN10@SumProduct@3
; 6 : double sum=0;
mov edx, DWORD PTR _v2$[ebp]
mov edi, DWORD PTR _v1$[ebp]
$LC9@SumProduct@3:
; 8 : i<n;
; 9 : ++i)
cmp ebx, esi
jge SHORT $LN8@SumProduct@3
sub edi, edx
lea eax, DWORD PTR [edx+ebx*8]
sub esi, ebx
$LC3@SumProduct@3:
; 10 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edi]
add eax, 8
dec esi
fmul QWORD PTR [eax-8]
faddp ST(1), ST(0)
jne SHORT $LC3@SumProduct@3
$LN8@SumProduct@3:
; 11 : return sum;
; 12 : }
pop edi
pop esi
pop ebx
pop ebp
ret 0
?SumProduct@@YANPBN0H@Z ENDP ; SumProduct
_TEXT ENDS
END
One big difference between CPUs is pipeline optimization.
The CPU can execute several instructions in parallel until it reaches a conditional branch. From that point, instead of waiting until all the in-flight instructions are executed, the CPU can speculatively continue down one branch in parallel until the condition is ready to be evaluated. If the assumption was correct, we have a gain; otherwise the CPU discards the speculative work and goes down the other branch.
So the tricky part for a CPU is to find the best assumptions and to execute as many instructions in parallel as possible.
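A classic way to see this effect (a sketch of my own, not code from this thread) is to time the same branchy loop over random data, then again after sorting it; the instructions are identical, but the branch becomes predictable:
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <ctime>

int main()
{
    const int N = 1 << 20;
    static int data[N];
    for (int i = 0; i < N; ++i)
        data[i] = std::rand() % 256;
    // std::sort(data, data + N);   // uncomment and re-run to compare

    long long sum = 0;
    std::clock_t t0 = std::clock();
    for (int pass = 0; pass < 100; ++pass)
        for (int i = 0; i < N; ++i)
            if (data[i] >= 128)     // the branch the predictor must guess
                sum += data[i];
    std::printf("%.2f s (sum=%lld)\n",
                (double)(std::clock() - t0) / CLOCKS_PER_SEC, sum);
    return 0;
}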