Calling method using inline assembler in gcc - c++

so as I said, I'm trying to call a method using inline asm using gcc. So, I searched how x86 works, and what are the calling convention, then I tried some easy call witch worked perfectly. Then I tried to embed v8, which was my original goal, but it didn't work so well...
Here's my code :
v8::Handle<v8::Value> V8Method::staticInternalMethodCaller(const v8::Arguments& args, int argsize, void* object, void* method)
{
int i = 0;
char* native_args;
// Move the ESP to the end of the array (argsize is the array size in byte)
asm("subl %1, %%esp;"
"movl %%esp, %0;"
: "=r"(native_args)
: "r"(argsize));
// This for loop only converts V8 type to native type,
// and puts them in the array:
for (; i < args.Length(); ++i)
{
if (args[i]->IsInt32())
{
*(int*)(native_args) = args[i]->Int32Value();
native_args += sizeof(int);
}
else if (args[i]->IsNumber())
{
*(float*)(native_args) = (float)(args[i]->NumberValue());
native_args += sizeof(float);
}
}
// Then call the method:
asm("call *%1;" : : "c"(object), "r"(method));
return v8::Null();
}
And here is the generated assembly :
__ZN3srl8V8Method26staticInternalMethodCallerERKN2v89ArgumentsEiPvS5_:
LFB1178:
.cfi_startproc
.cfi_personality 0,___gxx_personality_v0
.cfi_lsda 0,LLSDA1178
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %ebx
subl $68, %esp
.cfi_offset 3, -12
movl $0, -12(%ebp)
movl 12(%ebp), %eax
/APP
# 64 "method.cpp" 1
subl %eax, %esp; movl %esp, %ebx; addl $4, %esp
# 0 "" 2
/NO_APP
movl %ebx, -16(%ebp)
jmp L74
L77:
movl -12(%ebp), %eax
movl %eax, (%esp)
movl 8(%ebp), %ecx
LEHB25:
call __ZNK2v89ArgumentsixEi
LEHE25:
subl $4, %esp
movl %eax, -36(%ebp)
leal -36(%ebp), %eax
movl %eax, %ecx
call __ZNK2v86HandleINS_5ValueEEptEv
movl %eax, %ecx
LEHB26:
call __ZNK2v85Value7IsInt32Ev
LEHE26:
testb %al, %al
je L75
movl -12(%ebp), %eax
movl %eax, (%esp)
movl 8(%ebp), %ecx
LEHB27:
call __ZNK2v89ArgumentsixEi
LEHE27:
subl $4, %esp
movl %eax, -32(%ebp)
leal -32(%ebp), %eax
movl %eax, %ecx
call __ZNK2v86HandleINS_5ValueEEptEv
movl %eax, %ecx
LEHB28:
call __ZNK2v85Value10Int32ValueEv
LEHE28:
movl %eax, %edx
movl -16(%ebp), %eax
movl %edx, (%eax)
movl -16(%ebp), %eax
movl (%eax), %ebx
movl $LC4, 4(%esp)
movl $__ZSt4cout, (%esp)
LEHB29:
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl -16(%ebp), %edx
movl %edx, (%esp)
movl %eax, %ecx
call __ZNSolsEPKv
subl $4, %esp
movl $LC5, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %ebx, (%esp)
movl %eax, %ecx
call __ZNSolsEi
subl $4, %esp
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, (%esp)
movl %eax, %ecx
call __ZNSolsEPFRSoS_E
subl $4, %esp
addl $4, -16(%ebp)
jmp L76
L75:
movl -12(%ebp), %eax
movl %eax, (%esp)
movl 8(%ebp), %ecx
call __ZNK2v89ArgumentsixEi
LEHE29:
subl $4, %esp
movl %eax, -28(%ebp)
leal -28(%ebp), %eax
movl %eax, %ecx
call __ZNK2v86HandleINS_5ValueEEptEv
movl %eax, %ecx
LEHB30:
call __ZNK2v85Value8IsNumberEv
LEHE30:
testb %al, %al
je L76
movl -12(%ebp), %eax
movl %eax, (%esp)
movl 8(%ebp), %ecx
LEHB31:
call __ZNK2v89ArgumentsixEi
LEHE31:
subl $4, %esp
movl %eax, -24(%ebp)
leal -24(%ebp), %eax
movl %eax, %ecx
call __ZNK2v86HandleINS_5ValueEEptEv
movl %eax, %ecx
LEHB32:
call __ZNK2v85Value11NumberValueEv
LEHE32:
fstps -44(%ebp)
flds -44(%ebp)
movl -16(%ebp), %eax
fstps (%eax)
movl -16(%ebp), %eax
movl (%eax), %ebx
movl $LC4, 4(%esp)
movl $__ZSt4cout, (%esp)
LEHB33:
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl -16(%ebp), %edx
movl %edx, (%esp)
movl %eax, %ecx
call __ZNSolsEPKv
subl $4, %esp
movl $LC5, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %ebx, (%esp)
movl %eax, %ecx
call __ZNSolsEf
subl $4, %esp
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, (%esp)
movl %eax, %ecx
call __ZNSolsEPFRSoS_E
subl $4, %esp
addl $4, -16(%ebp)
L76:
incl -12(%ebp)
L74:
movl 8(%ebp), %ecx
call __ZNK2v89Arguments6LengthEv
cmpl -12(%ebp), %eax
setg %al
testb %al, %al
jne L77
movl 16(%ebp), %eax
movl 20(%ebp), %edx
movl %eax, %ecx
/APP
# 69 "method.cpp" 1
call *%edx;
# 0 "" 2
/NO_APP
call __ZN2v84NullEv
leal -20(%ebp), %edx
movl %eax, (%esp)
movl %edx, %ecx
call __ZN2v86HandleINS_5ValueEEC1INS_9PrimitiveEEENS0_IT_EE
subl $4, %esp
movl -20(%ebp), %eax
jmp L87
L83:
movl %eax, (%esp)
call __Unwind_Resume
L84:
movl %eax, (%esp)
call __Unwind_Resume
L85:
movl %eax, (%esp)
call __Unwind_Resume
L86:
movl %eax, (%esp)
call __Unwind_Resume
LEHE33:
L87:
movl -4(%ebp), %ebx
leave
.cfi_restore 5
.cfi_restore 3
.cfi_def_cfa 4, 4
ret
.cfi_endproc
So, this static method is a callback (I do some signature checking before) witch is supposed to call the specific method providing valid C++ native args. In order to speed up a little bit and avoid copies of args, I'm trying to load all param in an local array, and then modify the ESP to make this array an argument.
The method call works well, but I don't get correct arguments... I've done lots of research about function call, calling convention, and lots of test (which were all successful), but I don't understand what is going on... Is there something I missed ?
Basically, the callee is supposed to get its arguments at the top of the esp, in my case, the array... (I precise that the array is valid)
I use GCC.

There are many problems with what you are attempting.
You cannot modify %esp using inline assembly, because the compiler
is probably using %esp to reference its local variables and arguments. This may work if the compiler uses %ebp instead, but there is no guarantee.
You never undo the %esp modification before returning.
In your inline assembly, you need to declare that %esp is side-effected.
You probably need to pass object as a silent first argument. method is an instance method, not a static method?
all of this depends on what calling convention you're using: cdecl, stdcall, etc.

I'd recommend not trying to do this yourself, there are a lot of annoying little details that have to be gotten exactly right. I'd suggest instead using the FFCALL library, specifically the avcall set of methods, to do this.
I imagine that something like this would do what you want:
v8::Handle<v8::Value> V8Method::staticInternalMethodCaller(const v8::Arguments& args, int argsize, void* object, void* method)
{
// Set up the argument list with the function pointer, return type, and
// pointer to value storing the return value (assuming int, change if
// necessary)
int return_value;
av_alist alist;
av_start_int(alist, method, &return_value);
for(int i = args.Length() - 1; i >= 0; i--)
{
// Push the arguments onto the argument list
if (args[i]->IsInt32())
{
av_int(alist, args[i]->Int32Value());
}
else if (args[i]->IsNumber())
{
av_double(alist, (float)(args[i]->NumberValue());
}
}
av_call(alist); // Call the function
return v8::Null();
}

Related

C++ GCC Optimization Speed slows down when local variable is copied to global variable

I have a question regarding GCC's optimization flags and how they work.
I have a very long piece of code that utilizes all local arrays and variables. At the end of the code, I copy the contents of the local array to a global array. Here is an extremely stripped down example of my code:
uint8_t globalArray[16]={0};
void func()
{
unsigned char localArray[16]={0};
for (int r=0; r<1000000; r++)
{
**manipulate localArray with a lot of calculations**
}
memcpy(&globalArray,localArray,16);
}
Here's the approximate speed of the code in three different scenarios:
Without "-O3" optimization: 3.203s
With "-O3" optimization: 1.457s
With "-O3" optimization and without the final memcpy(&globalArray,localArray,16); statement: 0.015s
Without copying the local array into the global array, the code runs almost 100 times faster. I know that the global array is stored in the memory and the local array is stored in registers. My question is:
Why does just copying 16 elements of a local array to a global array cause 100 times slower execution? I have searched this forum and online and I cannot find a definite answer to this particular scenario of mine.
Is there any way that I can extract the contents of the local variable without the speed loss?
Thank you in advance to anyone that can help me with this problem.
Without the memcpy, your compiler will likely see that localArray is never read from, so it doesn't need to do any of the calculations in the loop body.
Take this code as an example:
uint8_t globalArray[16]={0};
void func()
{
unsigned char localArray[16]={0};
for (int r=0; r<1000000; r++)
{
localArray[r%16] = r;
}
memcpy(&globalArray,localArray,16);
}
Clang 3.7.1 with -O3 outputs this assembly:
func(): # #func()
# BB#0:
xorps %xmm0, %xmm0
movaps %xmm0, -24(%rsp)
#DEBUG_VALUE: r <- 0
xorl %eax, %eax
.LBB0_1: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: r <- 0
movl %eax, %ecx
sarl $31, %ecx
shrl $28, %ecx
leal (%rcx,%rax), %ecx
andl $-16, %ecx
movl %eax, %edx
subl %ecx, %edx
movslq %edx, %rcx
movb %al, -24(%rsp,%rcx)
leal 1(%rax), %ecx
#DEBUG_VALUE: r <- ECX
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 1(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 1(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 2(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 2(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 2(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 3(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 3(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 3(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 4(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 4(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 4(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
addl $5, %eax
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
# BB#2:
movaps -24(%rsp), %xmm0
movaps %xmm0, globalArray(%rip)
retq
For the same code without the memcpy, it outputs this:
func(): # #func()
# BB#0:
#DEBUG_VALUE: r <- 0
retq
Even if you know nothing about assembly, it's clear to see that the latter just does nothing.

How to print the memory address and the value of the arguments of a C/C++ function that is being debugged with LLDB?

I'm getting ready for my OS test. One of the tools we use is the debugger(LLDB) and my goal is to inspect the arguments of a C function or a C++ method.
For instance: How may I see the memory address and the value of the arguments passed to _SMenuItemCommandID?? - I have tried different things, but died in the attempt.
HITestBox`_SMenuItemCommandID(MenuData*, unsigned short, unsigned long):
0x9a7bfc35: pushl %ebp
0x9a7bfc36: movl %esp, %ebp
0x9a7bfc38: pushl %esi
0x9a7bfc39: subl $52, %esp
0x9a7bfc3c: movl 8(%ebp), %esi
0x9a7bfc3f: movl 88(%esi), %eax
0x9a7bfc42: movl %eax, -16(%ebp)
0x9a7bfc45: movzwl 12(%ebp), %ecx
0x9a7bfc49: movw %cx, -12(%ebp)
0x9a7bfc4d: movl $0, -8(%ebp)
0x9a7bfc54: leal -8(%ebp), %edx
0x9a7bfc57: movl %edx, 28(%esp)
0x9a7bfc5b: movl %ecx, 4(%esp)
0x9a7bfc5f: movl %eax, (%esp)
0x9a7bfc62: movl $0, 24(%esp)
0x9a7bfc6a: movl $4, 20(%esp)
0x9a7bfc72: movl $0, 16(%esp)
0x9a7bfc7a: movl $1835232612, 12(%esp)
0x9a7bfc82: movl $12, 8(%esp)
0x9a7bfc8a: calll 0x9a5f7c9b ; elementGetDataAtIndex
0x9a7bfc8f: movl 16(%ebp), %eax
0x9a7bfc92: cmpl %eax, -8(%ebp)
0x9a7bfc95: je 0x9a7bfcae ; _SMenuItemCommandID(MenuData*, unsigned short, unsigned long) + 121
0x9a7bfc97: movl %eax, 4(%esp)
0x9a7bfc9b: leal -16(%ebp), %eax
0x9a7bfc9e: movl %eax, (%esp)
0x9a7bfca1: calll 0x9a7e2914 ; mID::SetCommandID(unsigned long)
0x9a7bfca6: movl %esi, (%esp)
0x9a7bfca9: calll 0x9a5f7c65 ; invalidate(MenuData*)
0x9a7bfcae: xorl %eax, %eax
0x9a7bfcb0: addl $52, %esp
0x9a7bfcb3: popl %esi
0x9a7bfcb4: popl %ebp
0x9a7bfcb5: ret
Edit:
Say I'm debugging an app which I don't have the source code, but I have the symbols exported.
Say, at some moment this code gets executed:
MenuData *myData = (MenuData *)0x28ff44;;
SMenuItemCommandID(myData, 3, 4);
What do I need to do (with LLDB) to get:
arg0 = 0x28ff44
arg1 =3
arg2 =4
The disassembly that you posted is x86. The arguments are on the stack. If you break before the function prolog, the arguments are relative to the stack pointer, %esp (which is accessed as $esp in lldb):
# The return address:
x/w $esp
# The first argument:
x/w $esp+4
# The second argument:
x/w $esp+8
If you break after the prolog (0x9a7bfc3c in your example), which is where symbolic breakpoints are usually placed, the arguments are found relative to the frame pointer (%ebp a.k.a. $ebp):
# The saved frame pointer of the previous frame:
x/w $ebp
# The return address:
x/w $ebp+4
# The first argument:
x/w $ebp+8
# The second argument:
x/w $ebp+12
For other architectures, the arguments will be stored differently, often in the registers. Also, the above assumes the "cdecl" calling convention. There are others. Have you been told which architecture(s) and calling convention(s) you're expected to be familiar with?

C++ Tail recursion using 64-bit variables

I have written a simple Fibonacci function as an exercise in C++ (using Visual Studio) to test Tail Recursion and to see how it works.
this is the code:
int fib_tail(int n, int res, int next) {
if (n == 0) {
return res;
}
return fib_tail(n - 1, next, res + next);
}
int main()
{
fib_tail(10,0,1); //Tail Recursion works
}
when I compiled using Release mode I saw the optimized assembly using the JMP instruction in spite of a call. So my conclusion was: tail recursion works. See image below:
I wanted to do some performance tests by increasing the input variable n in my Fibonacci function. I then opted to change the variable type, used in the function, from int to unsigned long long. Then I passed a big number like: 10e+08
This is now the new function:
typedef unsigned long long ULONG64;
ULONG64 fib_tail(ULONG64 n, ULONG64 res, ULONG64 next) {
if (n == 0) {
return res;
}
return fib_tail(n - 1, next, res + next);
}
int main()
{
fib_tail(10e+9,0,1); //Tail recursion does not work
}
When I ran the code above I got a stack overflow exception, which made me think that tail recursion was not working. I looked at the assembly and in fact I found this:
As you see now there is a call instruction whereas I was expecting only a simple JMP. I don't understand the reason why using a 8 bytes variable disables tail recursion. Why the compiler doesn't perform an optimization in such case?
This is one of those questions that you'd have to ask the guys that do compiler optimisation for MS - there is really no technical reason why ANY return type should prevent tail-recursion from being a jump as such - there may be OTHER reasons such as "the code is too complex to understand" or some such.
clang 3.7 as of a couple of weeks back clearly figures it out:
_Z8fib_tailyyy: # #_Z8fib_tailyyy
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
pushl %eax
movl 36(%esp), %ecx
movl 32(%esp), %esi
movl 28(%esp), %edi
movl 24(%esp), %ebx
movl %ebx, %eax
orl %edi, %eax
je .LBB0_1
movl 44(%esp), %ebp
movl 40(%esp), %eax
movl %eax, (%esp) # 4-byte Spill
.LBB0_3: # %if.end
movl %ebp, %edx
movl (%esp), %eax # 4-byte Reload
addl $-1, %ebx
adcl $-1, %edi
addl %eax, %esi
adcl %edx, %ecx
movl %ebx, %ebp
orl %edi, %ebp
movl %esi, (%esp) # 4-byte Spill
movl %ecx, %ebp
movl %eax, %esi
movl %edx, %ecx
jne .LBB0_3
jmp .LBB0_4
.LBB0_1:
movl %esi, %eax
movl %ecx, %edx
.LBB0_4: # %return
addl $4, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
retl
main: # #main
subl $28, %esp
movl $0, 20(%esp)
movl $1, 16(%esp)
movl $0, 12(%esp)
movl $0, 8(%esp)
movl $2, 4(%esp)
movl $1410065408, (%esp) # imm = 0x540BE400
calll _Z8fib_tailyyy
movl %edx, f+4
movl %eax, f
xorl %eax, %eax
addl $28, %esp
retl
Same applies to gcc 4.9.2 if you give it -O2 (but not in -O1 which was all clang needed)
(And of course also in 64-bit mode)

assembly x86 'decompiling' [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question appears to be off-topic because it lacks sufficient information to diagnose the problem. Describe your problem in more detail or include a minimal example in the question itself.
Closed 8 years ago.
Improve this question
I'm having trouble understanding this assembly x86 code (AT&T notation). I need to be able to understand it (write C++ function that is compiled to that code) and solve similar exercises on the exam.
Can you explain to me which part does what and what is the convention?
f:
pushl %ebp ; 1
movl %esp, %ebp; 2
pushl %ebx ; 3
subl $36, %esp; 4
movl 8(%ebp), %edx ; 5
movl 12(%ebp), %eax ; 6
movl (%eax), %eax ; 7
movl %edx, 8(%esp) ; 8
leal 16(%ebp), %edx ; 9
movl %edx, 4(%esp) ; 10
movl %eax, (%esp) ; 11
call f; 12
movl %eax, -12(%ebp) ; 13
movl 16(%ebp), %edx ; 14
movl 12(%ebp), %eax ; 15
movl %edx, (%eax) ; 16
movl 12(%ebp), %eax ; 17
movl (%eax), %edx ; 18
movl -12(%ebp), %eax ; 19
movl %edx, 8(%esp) ; 20
leal 8(%ebp), %edx ; 21
movl %edx, 4(%esp) ; 22
movl %eax, (%esp) ; 23
call f; 24
movl %eax, %ebx; 25
movl 16(%ebp), %edx ; 26
movl -12(%ebp), %eax ; 27
movl %edx, 8(%esp) ; 28
movl 12(%ebp), %edx ; 29
movl %edx, 4(%esp) ; 30
movl %eax, (%esp) ; 31
call f; 32
movl %eax, %edx; 33
movl 16(%ebp), %eax ; 34
movl %edx, 8(%esp) ; 35
leal 8(%ebp), %edx ; 36
movl %edx, 4(%esp) ; 37
movl %eax, (%esp) ; 38
call f; 39
movl %ebx, 8(%esp) ; 40
leal -12(%ebp), %edx ; 41
movl %edx, 4(%esp) ; 42
movl %eax, (%esp) ; 43
call f; 44
addl $36, %esp; 45
popl %ebx ; 46
popl %ebp ; 47
ret; 48
There are no jumps, but a few of 'call f', does it mean that there is an infinite loop?
Below is a little bit to help you get going.
Step 1. Divide the code up into logical chunks. Key things to look for to identify logical chunks are the stack prologue and epilogue code, function calls, branch statements and addresses identified by the branch statements.
Step 2. Make notes about what each chunk is doing.
For example ...
f:
pushl %ebp
movl %esp, %ebp ; Create the stack frame
pushl %ebx ; and save non-volatile register EBX
subl $36, %esp ; Carve space for 9 32-bit words on the stack
; Notes: 8(%ebp) is the address for the 1st parameter
; 12(%ebp) is the address for the 2nd parameter
; 16(%ebp) is the address for the 3rd parameter
;
; Anything addresses as -#(%ebp) will be a stack variable
; local to this function.
;
; Anything addressed as #(%esp) will be used to pass parameters
; to the sub-function. The advantage of doing it this way is that
; parameters passed to the sub-function do not have to be popped
; after every call to a sub-function.
movl 8(%ebp), %edx ; EDX = 1st parameter
movl 12(%ebp), %eax ; EAX = 2nd parameter
movl (%eax), %eax ; The 2nd parameter is a pointer!
movl %edx, 8(%esp) ; Pass EDX as 3rd parameter to sub-function
leal 16(%ebp), %edx ; EDX = address of 3rd parameter to this function
movl %edx, 4(%esp) ; Passing it as 2nd parameter to sub-function
movl %eax, (%esp) ; Pass EAX as 3rd parameter to sub-function
call f ; Call sub-function
movl %eax, -12(%ebp) ; Save return value to local stack variable
; More Notes:
; I am guessing that this bit of decompiled code was an object file.
; Experience has shown me that when the address sub-functions used by
; CALL are all the same (and match the address of the calling function)
; this is often due to decompiling an object file as opposed to an
; executable. If however, the sub-function address truly is '0xf', then
; this will be a recursive routine that will blow the stack as there is
; no exit condition.
movl 16(%ebp), %edx ; EDX: 3rd parameter passed to function
; likely modified by previous CALL
movl 12(%ebp), %eax ; EAX: 2nd parameter passed to function
movl %edx, (%eax) ; Save EDX to the location pointed to by the 2nd parameter
movl 12(%ebp), %eax ; EAX: 2nd parameter passed to function (recall it's a ptr)
movl (%eax), %edx ; ... and so on ...
movl -12(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %edx)
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %eax, %ebx
movl 16(%ebp), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %eax, %edx
movl 16(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %ebx, 8(%esp)
leal -12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
addl $36, %esp ; Reclaim that carved stack space
popl %ebx ; Restore the non-volatile register EBX
popl %ebp ; Restore to the caller's stack frame
ret ; Return
I am leaving the rest for you. I hope this helps you along.
This function f is a recursive function without termination of the recursion. Something like
void f(int a, int b, int c)
{
f(a,b,c);
//....
}
Stop evaluating the disassembly, since it isn't worth to get such bad code in any high level language.
I came to the solution:
int f (int i, int* j, int k) {
int n = f(*j, &k, i);
*j = k;
f( f(n, &i, *j), &n, f(k, &i, f(n, j, k)) );
return 0;
}
when compiling my code
g++ -m32 -S a.cpp
I get the following assembly code:
_Z1fiPii:
.LFB971:
.cfi_startproc
.cfi_personality 0,__gxx_personality_v0
.cfi_lsda 0,.LLSDA971
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %ebx
subl $36, %esp
.cfi_offset 3, -12
movl 8(%ebp), %edx
movl 12(%ebp), %eax
movl (%eax), %eax
movl %edx, 8(%esp)
leal 16(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
.LEHB0:
call _Z1fiPii
movl %eax, -12(%ebp)
movl 16(%ebp), %edx
movl 12(%ebp), %eax
movl %edx, (%eax)
movl 16(%ebp), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
movl 16(%ebp), %edx
movl %eax, 8(%esp)
leal 8(%ebp), %eax
movl %eax, 4(%esp)
movl %edx, (%esp)
call _Z1fiPii
movl %eax, %ebx
movl 12(%ebp), %eax
movl (%eax), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %ecx
movl %ecx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
movl %ebx, 8(%esp)
leal -12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
.LEHE0:
movl $0, %eax
jmp .L5
.L4:
movl %eax, (%esp)
.LEHB1:
call _Unwind_Resume
.LEHE1:
.L5:
addl $36, %esp
popl %ebx
.cfi_restore 3
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
Is this one equivalent to the one pasted before?

Int to Float to Int conversion precision loss

Recently, I wrote a small program and compiled it using mingw32(on Windows8) of 2 different versions. Surprisingly, I got two different reusults. I tried disassmbling it but found nothing special. Could anyone help me? Thank you.
the exe files:
https://www.dropbox.com/s/69sq1ttjgwv1qm3/asm.7z
results: 720720(gcc version 4.5.2), 720719(gcc version 4.7.0)
compiler flags: -lstdc++ -static
Code snipped as following:
#include <iostream>
#include <cmath>
using namespace std;
int main()
{
int a = 55440, b = 13;
a *= pow(b, 1);
cout << a << endl;
return 0;
}
Assembly output(4.5.2):
http://pastebin.com/EJAkVAaH
Assembly output(4.7.0):
http://pastebin.com/kzbbFGs6
I've been able to reproduce the problem with a single version of the compiler.
Mine is MinGW g++ 4.6.2.
When I compile the program as g++ -g -O2 bugflt.cpp -o bugflt.exe, I get 720720.
This is the disassembly of main():
_main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
call ___main
movl $720720, 4(%esp)
movl $__ZSt4cout, (%esp)
call __ZNSolsEi
movl %eax, (%esp)
call __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
xorl %eax, %eax
leave
ret
As you can see, the value is calculated at compile time.
When I compile it as g++ -g -O2 -fno-inline bugflt.cpp -o bugflt.exe, I get 720719.
This is the disassembly of main():
_main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
call ___main
movl $1, 4(%esp)
movl $13, (%esp)
call __ZSt3powIiiEN9__gnu_cxx11__promote_2INS0_11__enable_ifIXaasrSt15__is_arithmeticIT_E7__valuesrS3_IT0_E7__valueES4_E6__typeES6_E6__typeES4_S6_
fmuls LC1
fnstcw 30(%esp)
movw 30(%esp), %ax
movb $12, %ah
movw %ax, 28(%esp)
fldcw 28(%esp)
fistpl 4(%esp)
fldcw 30(%esp)
movl $__ZSt4cout, (%esp)
call __ZNSolsEi
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, 4(%esp)
movl %eax, (%esp)
call __ZNSolsEPFRSoS_E
xorl %eax, %eax
leave
ret
...
LC1:
.long 1196986368 // 55440.0 exactly
If I replace the call to exp() with loading 13.0 like this:
_main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
call ___main
movl $1, 4(%esp)
movl $13, (%esp)
// call __ZSt3powIiiEN9__gnu_cxx11__promote_2INS0_11__enable_ifIXaasrSt15__is_arithmeticIT_E7__valuesrS3_IT0_E7__valueES4_E6__typeES6_E6__typeES4_S6_
fildl (%esp)
fmuls LC1
fnstcw 30(%esp)
movw 30(%esp), %ax
movb $12, %ah
movw %ax, 28(%esp)
fldcw 28(%esp)
fistpl 4(%esp)
fldcw 30(%esp)
movl $__ZSt4cout, (%esp)
call __ZNSolsEi
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, 4(%esp)
movl %eax, (%esp)
call __ZNSolsEPFRSoS_E
xorl %eax, %eax
leave
ret
I get 720720.
If I set the same rounding and precision control fields of the x87 FPU control word for the duration of exp() as for the fistpl 4(%esp) instruction like this:
_main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
call ___main
movl $1, 4(%esp)
movl $13, (%esp)
fnstcw 30(%esp)
movw 30(%esp), %ax
movb $12, %ah
movw %ax, 28(%esp)
fldcw 28(%esp)
call __ZSt3powIiiEN9__gnu_cxx11__promote_2INS0_11__enable_ifIXaasrSt15__is_arithmeticIT_E7__valuesrS3_IT0_E7__valueES4_E6__typeES6_E6__typeES4_S6_
fldcw 30(%esp)
fmuls LC1
fnstcw 30(%esp)
movw 30(%esp), %ax
movb $12, %ah
movw %ax, 28(%esp)
fldcw 28(%esp)
fistpl 4(%esp)
fldcw 30(%esp)
movl $__ZSt4cout, (%esp)
call __ZNSolsEi
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, 4(%esp)
movl %eax, (%esp)
call __ZNSolsEPFRSoS_E
xorl %eax, %eax
leave
ret
I get 720720 as well.
From this I can only conclude that exp() isn't calculating 131 precisely as 13.0.
It may be worth looking at the source code of that __gnu_cxx::__promote_2<__gnu_cxx::__enable_if<(std::__is_arithmetic<int>::__value)&&(std::__is_arithmetic<int>::__value), int>::__type, int>::__type std::pow<int, int>(int, int) to see exactly how it manages to screw up exponentiation with integers (see, unlike C's exp() it takes two ints instead of two doubles).
But I wouldn't blame exp() for that. C++11 defines float pow(float, float) and long double pow(long double, long double) in addition to C's double pow(double, double). But there's no double pow(int, int) in the standard.
The fact that the compiler provides a version for integer arguments does not make any additional guarantee about the precision of the result. If exp() calculates ab as
ab = 2b * log2(a)
or as
ab = eb * ln(a)
for floating-point values, there definitely can be rounding errors in the process.
If the "integer" version of exp() does something similar and incurs a similar loss of precision due to rounding errors, it still does its job right. And it does it even if the loss of precision is due to some silly bug and not because of the normal rounding errors.
However surprising this behavior may seem, it's correct. Or so I believe until proven wrong.