Need assistance with loop to iteratively add to sum variable in Assembly - c++

I'm trying to convert the following C++ code into Assembly (MASM, Irvine32):
const int SIZE = 10;
int numbers[SIZE] = {10,60,20,33,72,89,45,65,72,18};
int limit = 50;
int index = 0;
int sum = 0;
while( index < SIZE )
{
if( numbers[index] <= limit )
{
sum = sum + numbers[index]; // sum += array[index];
}
index++;
}
If anyone could clear up where I'm going wrong -- I'm getting errors at L1: it's just spouting out "+10". I believe it's because I cannot translate sum=sum+numbers[index] into Assembly. If anyone could help me do that it would be fantastic. My attempt at translating it (lines starting from "total: mov esi, offset numbers" to "inc index") is obviously incorrect.
.data
SYZ = 10
numbers DWORD 10, 60, 20, 33, 72, 89, 45, 65, 72, 18
limit DWORD 50
index DWORD 0
sum DWORD 0
.code
main PROC
mov eax, index
mov ebx, SYZ
top: cmp eax, ebx
jae next
jb total
total: mov esi, OFFSET numbers
mov ecx, limit
cmp [esi], ecx
jbe L1
L1: add eax, ebx
inc index
jmp top
next: mov edx, sum
call WriteInt
exit
main ENDP
END main

Your conditional branch that implements the if is wrong. It should look like:
top:
...
cmp [esi], ecx
ja L1 ; conditional jump *over* an ADD instruction
add eax, [esi] ; [esi] is array[index] if you increment ESI properly...
L1: inc index
jmp top
In your C++, you can see that if numbers[index] <= limit then you want to update the sum, otherwise just increment the index and go back to the "top"; aka recheck the stopping condition.
Your original asm code was doing a condition check and then continuing regardless of the result.
cmp [esi], ecx
jbe L1 ; jump or fall-through to L1, condition irrelevant
L1: add eax, ebx
The C++ equivalent of your original asm is:
if( numbers[index] <= limit )
{
}
sum += ebx;
index++;
I'm not sure if this will solve all of your problems, but it will definitely solve one of them.

.data
SYZ = 10
numbers DWORD 10, 60, 20, 33, 72, 89, 45, 65, 72, 18
limit DWORD 50
index DWORD 0
sum DWORD 0
.code
main PROC
mov eax, index
mov ebx, SYZ
mov esi, OFFSET numbers
mov ecx, limit
mov edx, 0
top: cmp eax, ebx
jae next
cmp [esi], ecx
ja L1
add edx, [esi]
L1: inc index
mov eax, index
add esi, 4
jmp top
next: mov sum, edx
mov eax, sum
call WriteInt

Related

How do I convert C++ function to assembly(x86_64)?

This is my .CPP file
#include <iostream>
using namespace std;
extern "C" void KeysAsm(int arr[], int n, int thetha, int rho);
// Keep this and call it from assembler
extern "C"
void crim(int *xp, int *yp) {
int temp = *xp;
*xp = *yp;
*yp = temp+2;
}
// Translate this into Intel assembler
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
// Function to print an array
void printArray(int arr[], int size){
int i;
for (i = 0; i < size; i++)
cout << arr[i] << "\n";
cout << endl;
}
int main() {
int gamma1[]{
9,
270,
88,
-12,
456,
80,
45,
123,
427,
999
};
int gamma2[]{
900,
312,
542,
234,
234,
1,
566,
123,
427,
111
};
printf("Array:\n");
printArray(gamma1, 10);
KeysAsm(gamma1, 10, 5, 6);
printf("Array Result Asm:\n");
printArray(gamma1, 10);
KeysCpp(gamma2, 10, 5, 6);
printf("Array Result Cpp:\n");
printArray(gamma2, 10);
}
What I want to do is, convert the KeysCpp function into assembly language and call it from this very .CPP file. I want to keep the crim function as it is in .CPP, while only converting the KeysCpp.
Here is my .ASM file
PUBLIC KeysAsm
includelib kernel32.lib
_DATA SEGMENT
EXTERN crim:PROC
_DATA ENDS
_TEXT SEGMENT
KeysAsm PROC
push rbp
mov rbp, rsp
sub rsp, 40
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-32], edx
mov DWORD PTR [rbp-36], ecx
mov DWORD PTR [rbp-4], 0
jmp L3
L3:
mov eax, DWORD PTR [rbp-28]
sub eax, 1
cmp DWORD PTR [rbp-4], eax
jl L7
L4:
mov eax, DWORD PTR [rbp-28]
sub eax, DWORD PTR [rbp-4]
sub eax, 1
cmp DWORD PTR [rbp-8], eax
jl L6
L5:
add DWORD PTR [rbp-8], 1
L6:
mov eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov edx, DWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
cmp edx, eax
jle L5
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov rsi, rdx
mov rdi, rax
call crim
L7:
mov DWORD PTR [rbp-8], 0
jmp L4
KeysAsm ENDP
_TEXT ENDS
END
I am using Visual Studio 2017 to run this project.
I am getting next error when I run this code.
Unhandled exception at 0x00007FF74B0E429C in MatrixMultiplication.exe: Stack cookie instrumentation code detected a stack-based buffer overrun. occurred
Your asm looks like it's expecting the x86-64 System V calling convention, with args in RDI, ESI, EDX, ECX. But you said you're compiling with Visual Studio, so the compiler-generated code will use the Windows x64 calling convention: RCX, EDX, R8D, R9D.
And when you call crim, it can use shadow space (32 bytes above its return address, which you didn't reserve space for).
It looks like you got this asm from un-optimized compiler output, probably from https://godbolt.org/z/ea4MPh81r using GCC for Linux, without using -mabi=ms to override the default -mabi=sysv when compiling for non-Windows targets. And then you modified it to make the loop infinite, with a jmp at the bottom instead of a ret? Maybe a different GCC version than 12.2 since the label numbers and code don't match exactly.
(The signs of being un-optimized compiler output are all the reloads from [rbp-whatever], and redoing sign-extension before using an int to index an array with cdqe. A human would know the int must be non-negative. And being GCC specifically, the numbered label like .L1: etc. where you just removed the ., and of heavily using RAX for as much as possible in a debug build. And choices like lea rdx, [0+rax*4] to copy-and-shift, and the exact syntax it used to print that instruction in Intel syntax match GCC.)
To compile a single function for Windows x64, isolate it and give the compiler only prototypes for anything it calls
extern "C" void crim(int *xp, int *yp); // prototype only
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
Then on Godbolt, use gcc -O3 -mabi=ms, or use MSVC which always targets Windows. https://godbolt.org/z/Mj5Gb54b5 shows both GCC and MSVC with optimization enabled.
KeysCpp(int*, int, int, int): ; demangled name
cmp edx, 1
jle .L11 ; "shrink wrap" optimization: early-out on n<=1 before saving regs
push r15 ; save some call-preserved regs
push r14
lea r14, [rcx+4] ; arr + 1
push r13
mov r13, rcx
Unfortunately GCC fails to hoist the thetha / rho * 2 - 4 loop-invariant, instead redoing idiv every time through the loop. Seems like an obvious optimization since those are local vars whose address hasn't been taken at all, and it keeps thetha (typo for theta?) and rho in registers. So MSVC is much more efficient here. Clang also misses this optimization.

Replacing even array elements with zeros in assembler

There is the following problem:
I have an array, it is necessary to give its size and then in this array for each element that has an even value, assign the value zeros, and return the modified array.
There is my C++ code:
#include <iostream>
int main()
{
const int size = 10;
int arr[size] = { 1,2,3,4,5,6,7,8,9,10 };
for (int i = 0; i < size; i++)
{
if (arr[i] % 2 == 0)
arr[i] = 0;
}
for (int i = 0; i < size; i++)
{
std::cout << arr[i] << ' ';
}
system("pause");
return 0;
}
There is my nasm code:
%include "io64.inc"
section .text
global CMAIN
CMAIN:
mov DWORD size$[rbp], 10 ; size of array
; elements of array
mov DWORD arr$[rbp], 1
mov DWORD arr$[rbp+4], 2
mov DWORD arr$[rbp+8], 3
mov DWORD arr$[rbp+12], 4
mov DWORD arr$[rbp+16], 5
mov DWORD arr$[rbp+20], 6
mov DWORD arr$[rbp+24], 7
mov DWORD arr$[rbp+28], 8
mov DWORD arr$[rbp+32], 9
mov DWORD arr$[rbp+36], 10
mov DWORD i$4[rbp], 0
jmp SHORT $LN4#main
$LN2#main:
mov eax, DWORD i$4[rbp]
inc eax
mov DWORD i$4[rbp], eax
$LN4#main:
cmp DWORD i$4[rbp], 10
jge SHORT $LN3#main
movsxd rax, DWORD i$4[rbp]
mov eax, DWORD arr$[rbp+rax*4]
cdq
and eax, 1
xor eax, edx
sub eax, edx
test eax, eax
jne SHORT $LN8#main
movsxd rax, DWORD i$4[rbp]
mov DWORD arr$[rbp+rax*4], 0
$LN8#main:
jmp SHORT $LN2#main
$LN3#main:
mov DWORD i$5[rbp], 0
jmp SHORT $LN7#main
$LN5#main:
mov eax, DWORD i$5[rbp]
inc eax
mov DWORD i$5[rbp], eax
$LN7#main:
cmp DWORD i$5[rbp], 10
jge SHORT $LN6#main
$LN6#main:
mov edi, eax
lea rcx, QWORD [rbp-32]
mov eax, edi
lea rsp, QWORD [rbp+360]
pop rdi
pop rbp
ret
My question is: will this code work and how can it be optimized?
I just get errors like this:
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:1: fatal: unable to open include file `io64.inc'
gcc.exe: error: C:\Users\79268\AppData\Local\Temp\SASM\program.o: No such file or directory
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:6: error: comma, colon, decorator or end of line expected after operand
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:9: error: comma, colon, decorator or end of line expected after operand
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:10: error: comma, colon, decorator or end of line expected after operand
So, I install NASM and IDE SASM correctly...
I try to rewrite code and compile it in SASM:
section .data
arr dd 1, 2, 3, 4, 5, 6, 7, 8, 9,10
section .text
global CMAIN
CMAIN:
call calc
push arr
calc:
lea rsi, [arr]
mov rcx, [10]
mov rdi, rsi
xor rbx, rbx
##for:
lodsq
test al, 1
cmovz rax, rbx
stosq
loop ##for
ret
And I get the same error:
c:/program files (x86)/sasm/mingw64/bin/../lib/gcc/x86_64-w64
mingw32/4.8.1/../../../../x86_64-w64
mingw32/lib/../lib/libmingw32.a(lib64_libmingw32_a-crt0_c.o):
crt0_c.c:(.text.startup+0x25): undefined reference to `WinMain'
Now I get the same error for updated code:
BITS 64
section .text
global _start
_start:
push arr
lea rsi, [arr]
mov rcx, [10]
mov rdi, rsi
xor rbx, rbx
##for:
lodsq
test al, 1
cmovz rax, rbx
stosq
loop ##for
ret
section .data
arr dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$nasm -f elf *.asm; ld -m elf_i386 -s -o demo *.o
$demo
/usr/bin/timeout: the monitored command dumped core
sh: line 1: 21184 Segmentation fault /usr/bin/timeout 10s demo
Any idea for fix and compile this programm? SASM is nit wirking on my machine, and a try to use this online NASM compilier: ASM Online compilier

nasm function called from C ends up segfaulting

I have to find min max values in array using only one conditional jump directive.
After compiling and linking the two files below I get a Segmentation Fault (core dumped), but I don't understand why that is.
Question: What is causing the segmentation fault?
main.cpp
#include <cstdio>
#include <time.h>
using namespace std;
extern "C" void minmax(int n, int * tab, int * max, int * min);
int main(){
const int rozmiar = 100000;
const int liczba_powtorzen = 10000;
int tab[rozmiar] = {1, 3, 3, -65, 3, 123, 4, 32, 342, 22, 11, 32, 44, 12, 324, 43};
tab[rozmiar-1] = -1000;
int min, max;
min = 99999;
max = -99999;
clock_t start, stop;
start = clock();
for(int i=0; i<liczba_powtorzen; i++){
minmax(rozmiar, tab, &max, &min);
}
printf("min = %d max = %d\n", min, max);
stop = clock();
printf("\n time = %f ( %d cykli)", (stop - start)*1.0/CLOCKS_PER_SEC, (stop - start));
return 0;
}
minmax.asm
global minmax ; required for linker and NASM
section .text ; start of the "CODE segment"
minmax:
push ebp
mov ebp, esp ; set up the EBP
push ecx ; save used registers
push esi
mov ecx, [ebp+8] ; array length n
mov esi, [ebp+12] ; array address
mov eax, [ebp+16] ;max
mov edi, [ebp+20] ; min
lp: add eax, [esi] ; fetch an array element
cmp eax, [esi]
jl max ; max<[esi] ->update max
cmp edi, [esi]
jg min ; min>[esi] ->update min
add esi, 4 ; move to another element
loop lp ; loop over all elements
max:
mov eax, esi
ret
min:
mov edi, esi
ret
pop esi ; restore used registers
pop ecx
pop ebp
ret ; return to caller
Long story, short:
You need to restore the stack before using ret.
Your asm implementation is faulty on many levels, but the reason for your segmentation fault is poor understanding of how ret works.
Invalid use of ret
ret does not bring you back to the last jump, it reads the value that is at the top of the stack, and returns to that address.
After you jump to either min: or max:, you call ret, where you should be jumping back to your loop.
This means that it will try to return back to the address at the top of the stack, which certainly isn't a valid address; you modified it upon entering the function.
push ebp
mov ebp, esp ; set up the EBP
push ecx ; save used registers
push esi ; note, this is where `ret` will try to go
I do not know what you're exactly trying to do, but the assembler function is written poor.
Try this:
push ebp
mov ebp, esp ; set up the EBP
push ecx ; save used registers
push esi
mov ecx, [ebp+8] ; array length n
mov esi, [ebp+12] ; array address
mov eax, 0x80000000
mov edi,[ebp+16]
mov [edi], eax
mov eax, 0x7fffffff
mov edi,[ebp+20]
mov [edi], eax
lp:
mov edi,[ebp+16]
lodsd
cmp [edi], eax
jg short _min_test
mov [edi], eax
_min_test:
mov edi,[ebp+20]
cmp [edi], eax
jl short _loop
mov [edi], eax
_loop:
loop lp
pop esi ; restore used registers
pop ecx
pop ebp
ret ; return to caller

How do I call a different subroutine using parameters passed into the current one (for use in recursion)?

I have two functions that take integers x and y read from input.
product returns x * y
power returns x ^ y, however it uses recursion and product to compute this. so x would be "base" and y is "exponent".
They called from C++:
int a, b, x, y;
a = product(x, y);
b = power(x, y);
and here is the asm. I got the product to work, however am having trouble with power because I am not sure of the syntax/method/convention to call product from it (and call itself for the recursion). EDIT: Recursion must be used.
global product
global power
section .text
product:
push ebp
mov ebp, esp
sub esp, 4
push edi
push esi
xor eax, eax
mov edi, [ebp+8]
mov esi, [ebp+12]
mov [ebp-4], edi
product_loop:
add [ebp-4], edi
mov eax, [ebp-4]
sub esi, 1
cmp esi, 1
jne product_loop
product_done:
pop esi
pop edi
mov esp, ebp
pop ebp
ret
power:
push ebp
mov ebp, esp
sub esp, 4
push edi
push esi
push ebx
xor eax, eax
mov edi, [ebp+8]
mov esi, [ebp+12]
;;;
check:
cmp esi, 1 ; if exp < 1
jl power_stop
recursion: ; else (PLEASE HELP!!!!!!!!)
; eax = call product (base, (power(base, exp-1))
power_stop:
mov eax, 1 ; return 1
power_done:
push ebx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
EDIT: My solution!
power:
; Standard prologue
push ebp ; Save the old base pointer
mov ebp, esp ; Set new value of the base pointer
sub esp, 4 ; make room for 1 local variable result
push ebx ; this is exp-1
xor eax, eax ; Place zero in EAX. We will keep a running sum
mov eax, [ebp+12] ; exp
mov ebx, [ebp+8] ; base
cmp eax, 1 ; n >= 1
jge L1 ; if not, go do a recursive call
mov eax, 1 ; otherwise return 1
jmp L2
L1:
dec eax ; exp-1
push eax ; push argument 2: exp-1
push ebx ; push argument 1: base
call power ; do the call, result goes in eax: power(base, exp-1)
add esp, 8 ; get rid of arguments
push eax ; push argument 2: power(base, exponent-1)
push ebx ; push argument 1: base
call product ; product(base, power(base, exponent-1))
L2:
; Standard epilogue
pop ebx ; restore register
mov esp, ebp ; deallocate local variables
pop ebp ; Restore the callers base pointer.
ret ; Return to the caller.
You are using CDECL calling convention, so you have to first push the arguments in the stack in backward direction, then call the function and then clean the stack after the return.
push arg_last
push arg_first
call MyFunction
add esp, 8 ; the argument_count*argument_size
But here are some notes on your code:
Your function product does not return any value. Use mov eax, [ebp-4] immediately after product_done label.
Multiplication is much easy to be made by the instruction mul or imul. Using addition is the slowest possible way.
Computing the power by recursion is not the best idea. Use the following algorithm:
Y = 1;
if N=0 exit.
if N is odd -> Y = Y*x; N=N-1
if N is even -> Y = Y*Y; N=N/2
goto 2
Use SHR instruction in order to divide N by 2. Use test instrction in order to check odd/even number.
This way, you simply don't need to call product from power function.
If you're not sure how to write the assembly, you can generally write it in C++ and assemble it for clues - something like:
int power(int n, int exp)
{
return exp == 0 ? 1 :
exp == 1 ? n :
product(n, power(n, exp - 1));
}
Then you should just be able to use gcc -S or whatever your compiler's equivalent switch for assembly output is, or disassemble the machine code if you prefer.
For example, the function above, thrown in with int product(int x, int y) { return x * y; } and int main() { return product(3, 4); }, compiled with Microsoft's compiler ala cl /Fa power.cc:
; Listing generated by Microsoft (R) Optimizing Compiler Version 15.00.30729.01
TITLE C:\home\anthony\user\dev\power.cc
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?product##YAHHH#Z ; product
; Function compile flags: /Odtp
_TEXT SEGMENT
_x$ = 8 ; size = 4
_y$ = 12 ; size = 4
?product##YAHHH#Z PROC ; product
; File c:\home\anthony\user\dev\power.cc
; Line 1
push ebp
mov ebp, esp
mov eax, DWORD PTR _x$[ebp]
imul eax, DWORD PTR _y$[ebp]
pop ebp
ret 0
?product##YAHHH#Z ENDP ; product
_TEXT ENDS
PUBLIC ?power##YAHHH#Z ; power
; Function compile flags: /Odtp
_TEXT SEGMENT
tv73 = -8 ; size = 4
tv74 = -4 ; size = 4
_n$ = 8 ; size = 4
_exp$ = 12 ; size = 4
?power##YAHHH#Z PROC ; power
; Line 4
push ebp
mov ebp, esp
sub esp, 8
; Line 7
cmp DWORD PTR _exp$[ebp], 0
jne SHORT $LN5#power
mov DWORD PTR tv74[ebp], 1
jmp SHORT $LN6#power
$LN5#power:
cmp DWORD PTR _exp$[ebp], 1
jne SHORT $LN3#power
mov eax, DWORD PTR _n$[ebp]
mov DWORD PTR tv73[ebp], eax
jmp SHORT $LN4#power
$LN3#power:
mov ecx, DWORD PTR _exp$[ebp]
sub ecx, 1
push ecx
mov edx, DWORD PTR _n$[ebp]
push edx
call ?power##YAHHH#Z ; power
add esp, 8
push eax
mov eax, DWORD PTR _n$[ebp]
push eax
call ?product##YAHHH#Z ; product
add esp, 8
mov DWORD PTR tv73[ebp], eax
$LN4#power:
mov ecx, DWORD PTR tv73[ebp]
mov DWORD PTR tv74[ebp], ecx
$LN6#power:
mov eax, DWORD PTR tv74[ebp]
; Line 8
mov esp, ebp
pop ebp
ret 0
?power##YAHHH#Z ENDP ; power
_TEXT ENDS
PUBLIC _main
; Function compile flags: /Odtp
_TEXT SEGMENT
_main PROC
; Line 11
push ebp
mov ebp, esp
; Line 12
push 4
push 3
call ?power##YAHHH#Z ; power
add esp, 8
; Line 13
pop ebp
ret 0
_main ENDP
_TEXT ENDS
END
To walk you through this:
?power##YAHHH#Z PROC ; power
; Line 4
push ebp
mov ebp, esp
sub esp, 8
The above is the entry code for the power function - just adjusting the stack pointer to jump over the function arguments, which it will access below as _exp$[ebp] (that's exp) and _n$[ebp] (i.e. n).
; Line 7
cmp DWORD PTR _exp$[ebp], 0
jne SHORT $LN5#power
mov DWORD PTR tv74[ebp], 1
jmp SHORT $LN6#power
Basically, if exp is not equal to 0 we'll continue at label $LN5#power below, but if it is 0 then load 1 into the return value location on the stack at tv74[ebp] and jump to the function return instructions at $LN6#power.
$LN5#power:
cmp DWORD PTR _exp$[ebp], 1
jne SHORT $LN3#power
mov eax, DWORD PTR _n$[ebp]
mov DWORD PTR tv73[ebp], eax
jmp SHORT $LN4#power
Similar to the above - if exp is 1 then put n into eax and therefrom into the return value stack memory, then jump to the return instructions.
Now it starts to get interesting...
$LN3#power:
mov ecx, DWORD PTR _exp$[ebp]
sub ecx, 1
push ecx
Subtract 1 from exp and push in onto the stack...
mov edx, DWORD PTR _n$[ebp]
push edx
Also push n onto the stack...
call ?power##YAHHH#Z ; power
Recursively call the power function, which will use the two values pushes above.
add esp, 8
A stack adjustment after the function above returns.
push eax
Put the result of the recursive call - which the power return instructions leave in the eax register - onto the stack...
mov eax, DWORD PTR _n$[ebp]
push eax
Also push n onto the stack...
call ?product##YAHHH#Z ; product
Call the product function to multiple the value returned by the call to power above by n.
add esp, 8
mov DWORD PTR tv73[ebp], eax
Copy the result of product into a temporary address on the stack....
$LN4#power:
mov ecx, DWORD PTR tv73[ebp]
mov DWORD PTR tv74[ebp], ecx
Pick up the value from that tv73 temporary location and copy it into tv74...
$LN6#power:
mov eax, DWORD PTR tv74[ebp]
Finally, move the the product() result from tv74 into the eax register for convenient and fast access after the product call returns.
; Line 8
mov esp, ebp
pop ebp
ret 0
Clean up the stack and return.

Why does adding extra check in loop make big difference on some machines, and small difference on others?

I have been doing some testing to see how much of a difference additional bounds checking makes in loops. This is prompted by thinking about the cost of implicit bounds checking inserted by languages such as C#, Java etc, when you access arrays.
Update: I have tried the same executable program out on several additional computers, which throws a lot more light onto what is happening. I've listed the original computer first, and second my modern laptop. On my modern laptop, adding additional checks in the loop adds only between 1 and 4% to the time taken, compared to between 3 and 30% for the original hardware.
Processor x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz
Ratio 2 checks : 1 check = 1.0310
Ratio 3 checks : 1 check = 1.2769
Processor Intel(R) Core(TM) i7-3610QM CPU # 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s)
Ratio 2 checks : 1 check = 1.0090
Ratio 3 checks : 1 check = 1.0393
Processor Intel(R) Core(TM) i5-2500 CPU # 3.30GHz, 4 Cores(s)
Ratio 2 checks : 1 check = 1.0035
Ratio 3 checks : 1 check = 1.0639
Processor Intel(R) Core(TM)2 Duo CPU T9300 # 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s)
Ratio 2 checks : 1 check = 1.1195
Ratio 3 checks : 1 check = 1.3597
Processor x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz
Ratio 2 checks : 1 check = 1.0776
Ratio 3 checks : 1 check = 1.1451
In the test program, below, the first function checks just one bound, the second function checks two, and the third checks three (in the calling code, n1=n2=n3). I found that the ratio two checks:one was about 1.03, and the ratio three checks:one was about 1.3. I was surprised by that adding one more check made such a difference to performance. I got an interesting answer concerning the low cost of bounds checking on modern processors to my original question, which may throw some light on the differences observed here.
Note that it's important to compile the program without whole program optimization turned on; otherwise the compiler can simply remove the additional bounds checking.
// dotprod.cpp
#include "dotprod.h"
double SumProduct(const double* v1, const double* v2, int n)
{
double sum=0;
for(int i=0;
i<n;
++i)
sum += v1[i]*v2[i];
return sum;
}
double SumProduct(const double* v1, const double* v2, int n1, int n2)
{
double sum=0;
for(int i=0;
i<n1 && i <n2;
++i)
sum += v1[i]*v2[i];
return sum;
}
double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3)
{
double sum=0;
for(int i=0;
i<n1 && i <n2 && i <n3;
++i)
sum += v1[i]*v2[i];
return sum;
}
This code was originally built using Visual Studio 2010, Release, Win32 (I've added the 'C' tag because the reasoning behind the difference in speed is not likely to be C++ specific, and may not be Windows specific). Can anyone explain it?
Rest of the code below, for information. This has some C++ specific stuff in it.
Header file
// dotprod.h
double SumProduct(const double*, const double*, int n);
double SumProduct(const double*, const double*, int n1, int n2);
double SumProduct(const double*, const double*, int n1, int n2, int n3);
Test harness
// main.cpp
#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>
#include <windows.h>
#include "../dotprod/dotprod.h" // separate lib
typedef __int64 timecount_t;
inline timecount_t GetTimeCount()
{
LARGE_INTEGER li;
if (!QueryPerformanceCounter(&li)) {
exit(1);
}
return li.QuadPart;
}
int main()
{
typedef std::vector<double> dvec;
const int N = 100 * 1000;
// Initialize
dvec v1(N);
dvec v2(N);
dvec dp1(N);
dvec dp2(N);
dvec dp3(N);
for(int i=0; i<N; ++i) {
v1[i] = i;
v2[i] = log(static_cast<double>(i+1));
}
const timecount_t t0 = GetTimeCount();
// Check cost with one bound
for(int n=0; n<N; ++n) {
dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n);
}
const timecount_t t1 = GetTimeCount();
// Check cost with two bounds
for(int n=0; n<N; ++n) {
dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n);
}
const timecount_t t2 = GetTimeCount();
// Check cost with three bounds
for(int n=0; n<N; ++n) {
dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n);
}
const timecount_t t3 = GetTimeCount();
// Check results
const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0);
const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0);
const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0);
printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3);
// Output timings
const timecount_t elapsed1 = t1-t0;
const timecount_t elapsed2 = t2-t1;
const timecount_t elapsed3 = t3-t2;
printf("Elapsed: %.0f, %.0f, %.0f\n",
static_cast<double>(elapsed1),
static_cast<double>(elapsed2),
static_cast<double>(elapsed3));
const double ratio2to1 = elapsed2 / static_cast<double>(elapsed1);
const double ratio3to1 = elapsed3 / static_cast<double>(elapsed1);
printf("Ratio 2:1=%.2f\n", ratio2to1);
printf("Ratio 3:1=%.2f\n", ratio3to1);
return 0;
}
In order to produce assembly, I took the advice in this answer (case 2, turning off whole program optimization), producing the following asm file.
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
PUBLIC __real#0000000000000000
PUBLIC ?SumProduct##YANPBN0HHH#Z ; SumProduct
EXTRN __fltused:DWORD
; COMDAT __real#0000000000000000
; File c:\dev\testspeed\dotprod\dotprod.cpp
CONST SEGMENT
__real#0000000000000000 DQ 00000000000000000r ; 0
; Function compile flags: /Ogtp
CONST ENDS
; COMDAT ?SumProduct##YANPBN0HHH#Z
_TEXT SEGMENT
tv491 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
_n3$ = 24 ; size = 4
?SumProduct##YANPBN0HHH#Z PROC ; SumProduct, COMDAT
; 25 : {
push ebp
mov ebp, esp
push ecx
; 26 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 27 : for(int i=0;
xor ecx, ecx
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, 4
jl $LC8#SumProduct
; 26 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea esi, DWORD PTR [edi+24]
; 30 : sum += v1[i]*v2[i];
sub edi, ebx
lea edx, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv491[ebp], edi
$LN15#SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
mov ebx, DWORD PTR _n2$[ebp]
cmp ecx, ebx
jge $LN9#SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax-8]
lea edi, DWORD PTR [edx-1]
fmul QWORD PTR [esi-24]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
mov edi, DWORD PTR tv491[ebp]
fld QWORD PTR [edi+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp edx, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edx, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+8]
lea edi, DWORD PTR [edx+1]
fmul QWORD PTR [esi-8]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [esi]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add esi, 32 ; 00000020H
faddp ST(1), ST(0)
add edx, 4
cmp ecx, ebx
jl SHORT $LN15#SumProduct
mov ebx, DWORD PTR _v2$[ebp]
$LC8#SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp ecx, edi
jge SHORT $LN9#SumProduct
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3#SumProduct:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9#SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3#SumProduct
$LN9#SumProduct:
; 31 : return sum;
; 32 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct##YANPBN0HHH#Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct##YANPBN0HH#Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct##YANPBN0HH#Z
_TEXT SEGMENT
tv448 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
?SumProduct##YANPBN0HH#Z PROC ; SumProduct, COMDAT
; 15 : {
push ebp
mov ebp, esp
push ecx
; 16 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 17 : for(int i=0;
xor ecx, ecx
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp edi, 4
jl SHORT $LC8#SumProduct#2
; 16 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea edx, DWORD PTR [edi+24]
; 20 : sum += v1[i]*v2[i];
sub edi, ebx
lea esi, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv448[ebp], edi
$LN19#SumProduct#2:
mov edi, DWORD PTR _n2$[ebp]
cmp ecx, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax-8]
lea ebx, DWORD PTR [esi-1]
fmul QWORD PTR [edx-24]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9#SumProduct#2
mov ebx, DWORD PTR tv448[ebp]
fld QWORD PTR [ebx+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp esi, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax+8]
lea ebx, DWORD PTR [esi+1]
fmul QWORD PTR [edx-8]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [edx]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add edx, 32 ; 00000020H
faddp ST(1), ST(0)
add esi, 4
cmp ecx, ebx
jl SHORT $LN19#SumProduct#2
mov ebx, DWORD PTR _v2$[ebp]
$LC8#SumProduct#2:
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp ecx, edi
jge SHORT $LN9#SumProduct#2
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3#SumProduct#2:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9#SumProduct#2
; 20 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3#SumProduct#2
$LN9#SumProduct#2:
; 21 : return sum;
; 22 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct##YANPBN0HH#Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct##YANPBN0H#Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct##YANPBN0H#Z
_TEXT SEGMENT
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
?SumProduct##YANPBN0H#Z PROC ; SumProduct, COMDAT
; _n$ = eax
; 5 : {
push ebp
mov ebp, esp
mov edx, DWORD PTR _v2$[ebp]
; 6 : double sum=0;
fldz
push ebx
push esi
mov esi, eax
; 7 : for(int i=0;
xor ebx, ebx
push edi
mov edi, DWORD PTR _v1$[ebp]
; 8 : i<n;
; 9 : ++i)
cmp esi, 4
jl SHORT $LC9#SumProduct#3
; 6 : double sum=0;
lea eax, DWORD PTR [edx+8]
lea ecx, DWORD PTR [edi+24]
; 10 : sum += v1[i]*v2[i];
sub edi, edx
lea edx, DWORD PTR [esi-4]
shr edx, 2
inc edx
lea ebx, DWORD PTR [edx*4]
$LN10#SumProduct#3:
fld QWORD PTR [eax-8]
add eax, 32 ; 00000020H
fmul QWORD PTR [ecx-24]
add ecx, 32 ; 00000020H
dec edx
faddp ST(1), ST(0)
fld QWORD PTR [edi+eax-32]
fmul QWORD PTR [eax-32]
faddp ST(1), ST(0)
fld QWORD PTR [eax-24]
fmul QWORD PTR [ecx-40]
faddp ST(1), ST(0)
fld QWORD PTR [eax-16]
fmul QWORD PTR [ecx-32]
faddp ST(1), ST(0)
jne SHORT $LN10#SumProduct#3
; 6 : double sum=0;
mov edx, DWORD PTR _v2$[ebp]
mov edi, DWORD PTR _v1$[ebp]
$LC9#SumProduct#3:
; 8 : i<n;
; 9 : ++i)
cmp ebx, esi
jge SHORT $LN8#SumProduct#3
sub edi, edx
lea eax, DWORD PTR [edx+ebx*8]
sub esi, ebx
$LC3#SumProduct#3:
; 10 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edi]
add eax, 8
dec esi
fmul QWORD PTR [eax-8]
faddp ST(1), ST(0)
jne SHORT $LC3#SumProduct#3
$LN8#SumProduct#3:
; 11 : return sum;
; 12 : }
pop edi
pop esi
pop ebx
pop ebp
ret 0
?SumProduct##YANPBN0H#Z ENDP ; SumProduct
_TEXT ENDS
END
One big difference between CPUs is the pipeline optimization
The CPU can execute in parallel several instructions until reaches a conditional branch. From this point instead of waiting until all the instructions are executed, the CPU can continue with a branch in parallel until the condition is available and ready to be evaluated. If the assumption was correct, then we have a gain. Otherwise the CPU will go with the other branch.
So the tricky part for a CPU is to find the best assumptions and to execute as many instructions in parallel as possible.