Pass 64-bit int as output to 32-bit inline asm - c++

#include <stdarg.h>
#include <stdint.h>
uint64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
uint64_t r = test_func(10);
return 0;
}
converts to:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
and esp, -8
sub esp, 24
mov DWORD PTR [esp], 10
call test_func(int)
mov DWORD PTR [esp+16], eax
mov DWORD PTR [esp+20], edx
mov eax, 0
leave
ret
You can see it uses 2 registers to store that 64-bit integer. However, in the C/C++ code, it is only ONE variable.
I tried to replicate this in inline-assembly but I had to do:
#include <stdarg.h>
#include <stdint.h>
int64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
int32_t rlow = 0, rhigh = 0;
asm(
"push $10\n"
"\tcall %P2"
: "=a"(rlow), "=d"(rhigh)
: "i"(&test_func) : "memory");
return 0;
}
And the output is:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-8], 0
mov DWORD PTR [ebp-4], 0
push $10
call test_func(int)
mov DWORD PTR [ebp-8], eax
mov DWORD PTR [ebp-4], edx
mov eax, 0
leave
ret
Now you can see I had to manually place the lower and higher order bits into two separate integers. Then I perform shifting to make it into one 64-bit integer.
Is there a way to automatically get it to place into a single 64-bit integer without me having to supply it two 32-bit integers and then shifting the bits?

You want the "A" constraint, which binds a 64-bit value to the eax/edx register pair. Something like:
uint64_t r;
asm("push $10\n"
"\tcall %P1"
: "=A"(r) : "i"(&test_func) : "memory");
should do the trick.

Related

Why is the C++ function parameter stored 20 bytes off of the rbp in x86-64 when the method body only has one 4 byte variable?

Consider the following program, compiled using x86-64 GCC 12.2 with flags --std=c++17 -O0:
int square(int num, int num2) {
int foo = 37;
return num * num;
}
int main () {
return square(10, 5);
}
The resulting assembly using godbolt is:
square(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov DWORD PTR [rbp-4], 37
mov eax, DWORD PTR [rbp-20]
imul eax, eax
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 5
mov edi, 10
call square(int, int)
nop
pop rbp
ret
I read about shadow spaces and it appears that in x64 there must be at minimum 32 bytes allocated: "32 bytes above the return address which the called function owns" ...
With that said, how is the offset -20 determined for the parameter num? If there's 32 bytes from rbp, wouldn't that be -24?
I noticed even if you add more local variables, it'll remain -20 until it gets pushed over to -36, but I cannot understand why. Thanks!

A temporary array is assigned but not a temporary primary value

I am amazed that this C++ code is compiled:
int main()
{
(int[10]){}[0]=15;
return 0;
}
The equivalent assembly is
main:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-48], 0
mov QWORD PTR [rbp-40], 0
mov QWORD PTR [rbp-32], 0
mov QWORD PTR [rbp-24], 0
mov QWORD PTR [rbp-16], 0
mov DWORD PTR [rbp-48], 15
mov eax, 0
pop rbp
ret
According to this code, an array is defined without having any name and then assigned.
Interestingly, when there is no array, the code does not compile:
int main()
{
(int){}=15; /* <Compilation failed> */
return 0;
}
1- Why is the first expression (maybe you call it assigning to an xvalue) legal in C++ for a temporary array but not the second one for a basic primary type? Why the language is designed this way?
2- What is the application of such a temporary array?

GCC generated assembly

Why printf function causes the change of prologue?
C code_1:
#include <cstdio>
int main(){
int a = 11;
printf("%d", a);
}
GCC -m32 generated one:
.LC0:
.string "%d"
main:
lea ecx, [esp+4] // What's purpose of this three
and esp, -16 // lines?
push DWORD PTR [ecx-4] //
push ebp
mov ebp, esp
push ecx
sub esp, 20 // why sub 20?
mov DWORD PTR [ebp-12], 11
sub esp, 8
push DWORD PTR [ebp-12]
push OFFSET FLAT:.LC0
call printf
add esp, 16
mov eax, 0
mov ecx, DWORD PTR [ebp-4]
leave
lea esp, [ecx-4]
ret
C code_2:
#include <cstdio>
int main(){
int a = 11;
}
GCC -m32:
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-4], 11
mov eax, 0
leave
ret
What is the purpose of first three lines added in first code?
Please, explain first assembly code, if you can.
EDIT:
64-bit mode:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 11
mov eax, DWORD PTR [rbp-4]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
The insight is that the compiler keep the stack aligned at function calls.
The alignment is 16 byte.
lea ecx, [esp+4] ;Save original ESP to ECX (ESP+4 actually)
and esp, -16 ;Align stack on 16 bytes (Lower esp)
push DWORD PTR [ecx-4] ;Push main return address (Stack at 16B + 4)
;My guess is to aid debugging tools that expect the RA
;to be at [ebp+04h]
push ebp
mov ebp, esp ;Prolog (Stack at 16B+8)
push ecx ;Save ECX (Original stack pointer) (Stack at 16B+12)
sub esp, 20 ;Reserve 20 bytes (Stack at 16B+0, ALIGNED AGAIN)
;4 for alignment + 1x16 for a variable (variable space is
;allocated in multiple of 16)
mov DWORD PTR [ebp-12], 11 ;a = 11
sub esp, 8 ;Stack at 16B+8 for later alignment
push DWORD PTR [ebp-12] ;a
push OFFSET FLAT:.LC0 ;"%d" (Stack at 16B)
call printf
add esp, 16 ;Remove args+pad from the stack (Stack at 16B)
mov eax, 0 ;Return 0
mov ecx, DWORD PTR [ebp-4] ;Restore ECX without the need to add to esp
leave ;Restore EBP
lea esp, [ecx-4] ;Restore original ESP
ret
I don't know why the compiler saves esp+4 in ecx instead of esp (esp+4 is the address of the first parameter of main).

How do I call a different subroutine using parameters passed into the current one (for use in recursion)?

I have two functions that take integers x and y read from input.
product returns x * y
power returns x ^ y, however it uses recursion and product to compute this. so x would be "base" and y is "exponent".
They called from C++:
int a, b, x, y;
a = product(x, y);
b = power(x, y);
and here is the asm. I got the product to work, however am having trouble with power because I am not sure of the syntax/method/convention to call product from it (and call itself for the recursion). EDIT: Recursion must be used.
global product
global power
section .text
product:
push ebp
mov ebp, esp
sub esp, 4
push edi
push esi
xor eax, eax
mov edi, [ebp+8]
mov esi, [ebp+12]
mov [ebp-4], edi
product_loop:
add [ebp-4], edi
mov eax, [ebp-4]
sub esi, 1
cmp esi, 1
jne product_loop
product_done:
pop esi
pop edi
mov esp, ebp
pop ebp
ret
power:
push ebp
mov ebp, esp
sub esp, 4
push edi
push esi
push ebx
xor eax, eax
mov edi, [ebp+8]
mov esi, [ebp+12]
;;;
check:
cmp esi, 1 ; if exp < 1
jl power_stop
recursion: ; else (PLEASE HELP!!!!!!!!)
; eax = call product (base, (power(base, exp-1))
power_stop:
mov eax, 1 ; return 1
power_done:
push ebx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
EDIT: My solution!
power:
; Standard prologue
push ebp ; Save the old base pointer
mov ebp, esp ; Set new value of the base pointer
sub esp, 4 ; make room for 1 local variable result
push ebx ; this is exp-1
xor eax, eax ; Place zero in EAX. We will keep a running sum
mov eax, [ebp+12] ; exp
mov ebx, [ebp+8] ; base
cmp eax, 1 ; n >= 1
jge L1 ; if not, go do a recursive call
mov eax, 1 ; otherwise return 1
jmp L2
L1:
dec eax ; exp-1
push eax ; push argument 2: exp-1
push ebx ; push argument 1: base
call power ; do the call, result goes in eax: power(base, exp-1)
add esp, 8 ; get rid of arguments
push eax ; push argument 2: power(base, exponent-1)
push ebx ; push argument 1: base
call product ; product(base, power(base, exponent-1))
L2:
; Standard epilogue
pop ebx ; restore register
mov esp, ebp ; deallocate local variables
pop ebp ; Restore the callers base pointer.
ret ; Return to the caller.
You are using CDECL calling convention, so you have to first push the arguments in the stack in backward direction, then call the function and then clean the stack after the return.
push arg_last
push arg_first
call MyFunction
add esp, 8 ; the argument_count*argument_size
But here are some notes on your code:
Your function product does not return any value. Use mov eax, [ebp-4] immediately after product_done label.
Multiplication is much easy to be made by the instruction mul or imul. Using addition is the slowest possible way.
Computing the power by recursion is not the best idea. Use the following algorithm:
Y = 1;
if N=0 exit.
if N is odd -> Y = Y*x; N=N-1
if N is even -> Y = Y*Y; N=N/2
goto 2
Use SHR instruction in order to divide N by 2. Use test instrction in order to check odd/even number.
This way, you simply don't need to call product from power function.
If you're not sure how to write the assembly, you can generally write it in C++ and assemble it for clues - something like:
int power(int n, int exp)
{
return exp == 0 ? 1 :
exp == 1 ? n :
product(n, power(n, exp - 1));
}
Then you should just be able to use gcc -S or whatever your compiler's equivalent switch for assembly output is, or disassemble the machine code if you prefer.
For example, the function above, thrown in with int product(int x, int y) { return x * y; } and int main() { return product(3, 4); }, compiled with Microsoft's compiler ala cl /Fa power.cc:
; Listing generated by Microsoft (R) Optimizing Compiler Version 15.00.30729.01
TITLE C:\home\anthony\user\dev\power.cc
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?product##YAHHH#Z ; product
; Function compile flags: /Odtp
_TEXT SEGMENT
_x$ = 8 ; size = 4
_y$ = 12 ; size = 4
?product##YAHHH#Z PROC ; product
; File c:\home\anthony\user\dev\power.cc
; Line 1
push ebp
mov ebp, esp
mov eax, DWORD PTR _x$[ebp]
imul eax, DWORD PTR _y$[ebp]
pop ebp
ret 0
?product##YAHHH#Z ENDP ; product
_TEXT ENDS
PUBLIC ?power##YAHHH#Z ; power
; Function compile flags: /Odtp
_TEXT SEGMENT
tv73 = -8 ; size = 4
tv74 = -4 ; size = 4
_n$ = 8 ; size = 4
_exp$ = 12 ; size = 4
?power##YAHHH#Z PROC ; power
; Line 4
push ebp
mov ebp, esp
sub esp, 8
; Line 7
cmp DWORD PTR _exp$[ebp], 0
jne SHORT $LN5#power
mov DWORD PTR tv74[ebp], 1
jmp SHORT $LN6#power
$LN5#power:
cmp DWORD PTR _exp$[ebp], 1
jne SHORT $LN3#power
mov eax, DWORD PTR _n$[ebp]
mov DWORD PTR tv73[ebp], eax
jmp SHORT $LN4#power
$LN3#power:
mov ecx, DWORD PTR _exp$[ebp]
sub ecx, 1
push ecx
mov edx, DWORD PTR _n$[ebp]
push edx
call ?power##YAHHH#Z ; power
add esp, 8
push eax
mov eax, DWORD PTR _n$[ebp]
push eax
call ?product##YAHHH#Z ; product
add esp, 8
mov DWORD PTR tv73[ebp], eax
$LN4#power:
mov ecx, DWORD PTR tv73[ebp]
mov DWORD PTR tv74[ebp], ecx
$LN6#power:
mov eax, DWORD PTR tv74[ebp]
; Line 8
mov esp, ebp
pop ebp
ret 0
?power##YAHHH#Z ENDP ; power
_TEXT ENDS
PUBLIC _main
; Function compile flags: /Odtp
_TEXT SEGMENT
_main PROC
; Line 11
push ebp
mov ebp, esp
; Line 12
push 4
push 3
call ?power##YAHHH#Z ; power
add esp, 8
; Line 13
pop ebp
ret 0
_main ENDP
_TEXT ENDS
END
To walk you through this:
?power##YAHHH#Z PROC ; power
; Line 4
push ebp
mov ebp, esp
sub esp, 8
The above is the entry code for the power function - just adjusting the stack pointer to jump over the function arguments, which it will access below as _exp$[ebp] (that's exp) and _n$[ebp] (i.e. n).
; Line 7
cmp DWORD PTR _exp$[ebp], 0
jne SHORT $LN5#power
mov DWORD PTR tv74[ebp], 1
jmp SHORT $LN6#power
Basically, if exp is not equal to 0 we'll continue at label $LN5#power below, but if it is 0 then load 1 into the return value location on the stack at tv74[ebp] and jump to the function return instructions at $LN6#power.
$LN5#power:
cmp DWORD PTR _exp$[ebp], 1
jne SHORT $LN3#power
mov eax, DWORD PTR _n$[ebp]
mov DWORD PTR tv73[ebp], eax
jmp SHORT $LN4#power
Similar to the above - if exp is 1 then put n into eax and therefrom into the return value stack memory, then jump to the return instructions.
Now it starts to get interesting...
$LN3#power:
mov ecx, DWORD PTR _exp$[ebp]
sub ecx, 1
push ecx
Subtract 1 from exp and push in onto the stack...
mov edx, DWORD PTR _n$[ebp]
push edx
Also push n onto the stack...
call ?power##YAHHH#Z ; power
Recursively call the power function, which will use the two values pushes above.
add esp, 8
A stack adjustment after the function above returns.
push eax
Put the result of the recursive call - which the power return instructions leave in the eax register - onto the stack...
mov eax, DWORD PTR _n$[ebp]
push eax
Also push n onto the stack...
call ?product##YAHHH#Z ; product
Call the product function to multiple the value returned by the call to power above by n.
add esp, 8
mov DWORD PTR tv73[ebp], eax
Copy the result of product into a temporary address on the stack....
$LN4#power:
mov ecx, DWORD PTR tv73[ebp]
mov DWORD PTR tv74[ebp], ecx
Pick up the value from that tv73 temporary location and copy it into tv74...
$LN6#power:
mov eax, DWORD PTR tv74[ebp]
Finally, move the the product() result from tv74 into the eax register for convenient and fast access after the product call returns.
; Line 8
mov esp, ebp
pop ebp
ret 0
Clean up the stack and return.

Why is __fastcall assebmler code larger than __stdcall one in MS C++?

I have disassembled two different variations of Swap function (simple value-swap between two pointers).
1). __fastcall http://pastebin.com/ux5LMktz
2). __stdcall (function without explicit calling convention modifier will have a __stdcall by default, because of MS C++ compiler for Windows) http://pastebin.com/eGR6VUjX
As I know, __fastcall is implemented differently, depending on the compiler, but basically it puts the first two arguments (left to right) into ECX and EDX register. And there could be stack use, but if the arguments are too long.
But as for the link at 1-st option, you can see, that value is pushed into the ECX registry, and there is no real difference between two variations of swap function.
And __fastcall variant does use:
00AA261F pop ecx
00AA2620 mov dword ptr [ebp-14h],edx
00AA2623 mov dword ptr [ebp-8],ecx
Which are not used in __stdcall version.
So it doesn't look like more optimized (as __fasctcall must be , by its definition).
I'm a newbie in ASM language and calling convention, so I ask you for a piece of advice. Maybe __fastcall is faster exactly in my sample, but I don't see it, do I?
Thanks!
Try turning on optimization, then comparing the results. Your fastcall version has many redundant operations because it's not optimized.
Here's output of VS 2010 with /Ox.
fastcall:
; _firstValue$ = ecx
; _secondValue$ = edx
?CallMe1##YIXPAH0#Z PROC ; CallMe1
mov eax, DWORD PTR [ecx]
push esi
mov esi, DWORD PTR [edx]
cmp eax, esi
je SHORT $LN1#CallMe1
mov DWORD PTR [ecx], esi
mov DWORD PTR [edx], eax
$LN1#CallMe1:
pop esi
ret 0
?CallMe1##YIXPAH0#Z ENDP ; CallMe1
stdcall:
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe2##YGXPAH0#Z PROC ; CallMe2
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1#CallMe2
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1#CallMe2:
pop esi
ret 8
?CallMe2##YGXPAH0#Z ENDP ; CallMe2
cdecl (what you mistakenly call stdcall in your example):
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe3##YAXPAH0#Z PROC ; CallMe3
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1#CallMe3
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1#CallMe3:
pop esi
ret 0
?CallMe3##YAXPAH0#Z ENDP ; CallMe3