Able to change value of const in C, but not in C++ - c++

Consider the following code
#include <stdio.h>
#include <string.h>
/* Demonstration from the question: stores to an object declared const
 * through a pointer obtained by round-tripping its address via long.
 * Modifying a const object this way is undefined behavior. */
main()
{
const int a = 2;
long p = (long)&a;      /* address of a converted to an integer */
int *c = (int *)p;      /* converted back, now as a pointer to non-const int */
*c =3;                  /* write through c into the object declared const */
printf("%d", a);        /* prints whatever value the compiler uses for a */
}
This code can change the value of a in C but not in C++. I understand that C++ is applying an optimization and replacing instances of a with 2. So was this a bug fix in C++, or was the bug fixed by chance due to the optimization?

It's undefined behavior to modify a const value, whether directly or indirectly. This may compile in C and may even run without problems on your machine, but it's still undefined behavior.
The difference between C and C++ on this is: with const int a = 2, C++ treats a as a constant expression, for instance, you can use a as array dimension:
int n[a]; //fine in C++
But in C, a is not a constant expression, with the same code:
int n[a]; //VLA in C99
Here n is not a fixed-sized array, but a variable length array.

This is not a C vs C++ issue. By modifying a const value (as well as by double-casting a pointer via a long), you enter the realm of undefined behaviour in both languages. Therefore the difference is simply a matter of how the undefined behaviour chooses to manifest itself.

You are casting away the constness out of &a and modifying the pointed value, which is undefined behavior both in C and in C++ (the trip through long just adds some more gratuitous UB). In C++ your compiler happens to optimize more aggressively the constant, but the point of the situation is unchanged.

Your code invokes undefined behavior in C++ since you're accessing memory you shouldn't:
#include <stdio.h>
#include <string.h>

/*
 * Shows two independent problems in the question's program:
 *  1. the address of a is round-tripped through long, which is narrower
 *     than a pointer on some 64-bit ABIs, so the address can be truncated;
 *  2. the object declared const is then written through the recovered
 *     pointer, which is undefined behavior regardless of pointer width.
 */
int main(void)
{
const int a = 2;
/* sizeof yields size_t, so %zu is the matching conversion specifier */
printf("%zu != %zu !!", sizeof(long), sizeof(void*)); // e.g. 4 != 8 on 64-bit Windows (LLP64); both are 8 on LP64 systems such as x86-64 Linux
long p = (long)&a;      /* may drop the upper address bits when long is narrower than a pointer */
int *c = (int *)p;
*c =3;                  /* write to an object declared const: undefined behavior */
printf("%d", a);
return 0;
}
And even if it works on a 32-bit system, modifying const memory by casting away the constness is undefined behavior in both languages.

The following is the assembly code generated by g++. The compiler statically uses "$2" instead of "a", whereas gcc doesn't perform any such static optimization. I guess there shouldn't be any undefined behaviour.
.Ltext0:
.section .rodata
.LC0:
0000 256400 .string "%d"
.text
.globl main
main:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA0
0000 55 pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0001 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0004 4883EC20 subq $32, %rsp
.LBB2:
0008 C745EC02 movl $2, -20(%rbp)
000000
000f 488D45EC leaq -20(%rbp), %rax
0013 488945F0 movq %rax, -16(%rbp)
0017 488B45F0 movq -16(%rbp), %rax
001b 488945F8 movq %rax, -8(%rbp)
001f 488B45F8 movq -8(%rbp), %rax
0023 C7000300 movl $3, (%rax)
0000
0029 488B45F8 movq -8(%rbp), %rax
002d 8B00 movl (%rax), %eax
002f 89C6 movl %eax, %esi
0031 BF000000 movl $.LC0, %edi
00
0036 B8000000 movl $0, %eax
00
.LEHB0:
003b E8000000 call printf
00
0040 BE020000 movl $2, %esi
00
0045 BF000000 movl $.LC0, %edi
00
004a B8000000 movl $0, %eax
00
004f E8000000 call printf
00
.LEHE0:
0054 B8000000 movl $0, %eax
00
0059 EB08 jmp .L5
.L4:
005b 4889C7 movq %rax, %rdi
.LEHB1:
005e E8000000 call _Unwind_Resume
00
.LEHE1:
.L5:
.LBE2:
0063 C9 leave
.cfi_def_cfa 7, 8
0064 C3 ret
.cfi_endproc
.LFE0:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA0:
0000 FF .byte 0xff
0001 FF .byte 0xff
0002 01 .byte 0x1
0003 08 .uleb128 .LLSDACSE0-.LLSDACSB0
.LLSDACSB0:
0004 3B .uleb128 .LEHB0-.LFB0
0005 19 .uleb128 .LEHE0-.LEHB0
0006 5B .uleb128 .L4-.LFB0
0007 00 .uleb128 0
0008 5E .uleb128 .LEHB1-.LFB0
0009 05 .uleb128 .LEHE1-.LEHB1
000a 00 .uleb128 0
000b 00 .uleb128 0
.LLSDACSE0:
.text
.Letext0:

Related

Problems reverse engineering cpp / asm code (just for the sake of learning)

I am trying some cpp binary disassembling. I wrote this utterly simple code:
#include <iostream>
// Runs the loop body 1000000 times; each pass increments i2 and writes a
// newline followed by the new value of i2 to std::cout.
int main() {
int i=0; int i2=0; // this outer i is never read: the loop below declares its own i, which shadows it
for(int i=0; i<1000000; i++) {i2++; std::cout << "\n" << i2;}
return 0;
}
I then compiled it with g++ using something like:
g++ .cpp -o .cpp.bin
I then ran a:
objdump -d .cpp.bin
Here's what I extracted:
;1lim.cpp.bin: file format elf64-x86-64
;Disassembly of section .init:
_init:
endbr64
sub $0x8,%rsp
mov 0x2fd9(%rip),%rax
test %rax,%rax
je 1016 <_init+0x16>
call *%rax
add $0x8,%rsp
ret
;Disassembly of section .plt:
.plt:
push 0x2f7a(%rip)
bnd jmp *0x2f7b(%rip)
nopl (%rax)
endbr64
push $0x0
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x1
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x2
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x3
bnd jmp 1020 <_init+0x20>
nop
;Disassembly of section .plt.got:
__cxa_finalize#plt:
endbr64
bnd jmp *0x2f55(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .plt.sec:
__cxa_atexit#plt:
endbr64
bnd jmp *0x2f25(%rip)
nopl 0x0(%rax,%rax,1)
_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt:
endbr64
bnd jmp *0x2f1d(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSt8ios_base4InitC1Ev#plt:
endbr64
bnd jmp *0x2f15(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSolsEi#plt:
endbr64
bnd jmp *0x2f0d(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .text:
_start:
endbr64
xor %ebp,%ebp
mov %rdx,%r9
pop %rsi
mov %rsp,%rdx
and $0xfffffffffffffff0,%rsp
push %rax
push %rsp
xor %r8d,%r8d
xor %ecx,%ecx
lea 0xca(%rip),%rdi
call *0x2ef3(%rip)
hlt
cs nopw 0x0(%rax,%rax,1)
deregister_tm_clones:
lea 0x2f19(%rip),%rdi
lea 0x2f12(%rip),%rax
cmp %rdi,%rax
je 1118 <deregister_tm_clones+0x28>
mov 0x2ed6(%rip),%rax
test %rax,%rax
je 1118 <deregister_tm_clones+0x28>
jmp *%rax
nopl 0x0(%rax)
ret
nopl 0x0(%rax)
register_tm_clones:
lea 0x2ee9(%rip),%rdi
lea 0x2ee2(%rip),%rsi
sub %rdi,%rsi
mov %rsi,%rax
shr $0x3f,%rsi
sar $0x3,%rax
add %rax,%rsi
sar %rsi
je 1158 <register_tm_clones+0x38>
mov 0x2ea5(%rip),%rax
test %rax,%rax
je 1158 <register_tm_clones+0x38>
jmp *%rax
nopw 0x0(%rax,%rax,1)
ret
nopl 0x0(%rax)
__do_global_dtors_aux:
endbr64
cmpb $0x0,0x2fe5(%rip)
jne 1198 <__do_global_dtors_aux+0x38>
push %rbp
cmpq $0x0,0x2e5a(%rip)
mov %rsp,%rbp
je 1187 <__do_global_dtors_aux+0x27>
mov 0x2e86(%rip),%rdi
call 1070 <__cxa_finalize#plt>
call 10f0 <deregister_tm_clones>
movb $0x1,0x2fbd(%rip)
pop %rbp
ret
nopl (%rax)
ret
nopl 0x0(%rax)
frame_dummy:
endbr64
jmp 1120 <register_tm_clones>
main:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
movl $0x0,-0x4(%rbp)
movl $0x0,-0xc(%rbp)
movl $0x0,-0x8(%rbp)
jmp 11fd <main+0x54>
addl $0x1,-0xc(%rbp)
lea 0xe2d(%rip),%rax
mov %rax,%rsi
lea 0x2e5f(%rip),%rax
mov %rax,%rdi
call 1090 <_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt>
mov %rax,%rdx
mov -0xc(%rbp),%eax
mov %eax,%esi
mov %rdx,%rdi
call 10b0 <_ZNSolsEi#plt>
addl $0x1,-0x8(%rbp)
cmpl $0xf423f,-0x8(%rbp)
jle 11cc <main+0x23>
mov $0x0,%eax
leave
ret
_Z41__static_initialization_and_destruction_0ii:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
mov %edi,-0x4(%rbp)
mov %esi,-0x8(%rbp)
cmpl $0x1,-0x4(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
cmpl $0xffff,-0x8(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
lea 0x2f1c(%rip),%rax
mov %rax,%rdi
call 10a0 <_ZNSt8ios_base4InitC1Ev#plt>
lea 0x2dc4(%rip),%rax
mov %rax,%rdx
lea 0x2f03(%rip),%rax
mov %rax,%rsi
mov 0x2da0(%rip),%rax
mov %rax,%rdi
call 1080 <__cxa_atexit#plt>
nop
leave
ret
_GLOBAL__sub_I_main:
endbr64
push %rbp
mov %rsp,%rbp
mov $0xffff,%esi
mov $0x1,%edi
call 120d <_Z41__static_initialization_and_destruction_0ii>
pop %rbp
ret
;Disassembly of section .fini:
_fini:
endbr64
sub $0x8,%rsp
add $0x8,%rsp
ret
I am now trying to interpret it using the following:
nasm -f elf64 .asm
How can I fix the assembly code so that I can assemble it with NASM? (It's already a slightly modified version of what I got from objdump.)
The answer that I found to this question was the following: instead of running the preceding commands, use the following:
gcc -S .cpp
this will produce the following code:
.file "1lim.cpp"
.text
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata
.LC0:
.string "\n"
.text
.globl main
.type main, #function
main:
.LFB1731:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
movl $0, -12(%rbp)
movl $0, -8(%rbp)
jmp .L2
.L3:
addl $1, -12(%rbp)
leaq .LC0(%rip), %rax
movq %rax, %rsi
leaq _ZSt4cout(%rip), %rax
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
movq %rax, %rdx
movl -12(%rbp), %eax
movl %eax, %esi
movq %rdx, %rdi
call _ZNSolsEi#PLT
addl $1, -8(%rbp)
.L2:
cmpl $999999, -8(%rbp)
jle .L3
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1731:
.size main, .-main
.type _Z41__static_initialization_and_destruction_0ii, #function
_Z41__static_initialization_and_destruction_0ii:
.LFB2229:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
cmpl $1, -4(%rbp)
jne .L7
cmpl $65535, -8(%rbp)
jne .L7
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rdi
call _ZNSt8ios_base4InitC1Ev#PLT
leaq __dso_handle(%rip), %rax
movq %rax, %rdx
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rsi
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rax
movq %rax, %rdi
call __cxa_atexit#PLT
.L7:
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2229:
.size _Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2230:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $65535, %esi
movl $1, %edi
call _Z41__static_initialization_and_destruction_0ii
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2230:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.hidden __dso_handle
.ident "GCC: (Ubuntu 11.2.0-19ubuntu1) 11.2.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:
to then compile it it's just about doing the following:
g++ .s -o .s.bin
to then run it it's just a matter of
./.s.bin
now my question is:
how can i do the same with executables or binaries?
Apparently, the answer lies in using a tool called objconv, which is meant for converting C++ binaries into assembly code in a correct way. Apparently, it can be installed on any operating system using the Anaconda package and environment manager.
cheers

How do I read the Symboltable generated by objdump --disassemble, i.e. what does each column mean?

This is an excerpt of what I get but I don't understand it. I recognise the instructions but for example what is in the second column?
Disassembly of section .text._ZNSt10_Head_baseILm0EOeLb0EE7_M_headERS1_:
_ZNSt10_Head_baseILm0EOeLb0EE7_M_headERS1_:
0: 55 pushq %rbp
1: 48 89 e5 movq %rsp, %rbp
4: 48 89 7d f8 movq %rdi, -8(%rbp)
8: 48 8b 45 f8 movq -8(%rbp), %rax
c: 48 8b 00 movq (%rax), %rax
f: 5d popq %rbp
10: c3 retq

GCC generates undesired assembly code

I'm working on vectorizing loops, and GCC is giving me a hard time.
When I look at the assembly code it generates, I see a lot of strange lines that I would like to get rid of.
For example, with vectorization, I've learnt that you can avoid a lot of extra assembly lines by giving additional information to GCC about array alignment.
http://locklessinc.com/articles/vectorize/
Here is my experiment.
#define SIZE 1024
/* Stores the bitwise OR of the first elements of a and b into the first
 * element of comp. */
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    const uint16_t lhs = a[0];
    const uint16_t rhs = b[0];
    comp[0] = (uint16_t)(lhs | rhs);
}
Generates simple assembly:
.globl _ZN8Test_LUT7performEv
23 _ZN8Test_LUT7performEv:
24 .LFB664:
25 .cfi_startproc
26 0020 488B4710 movq 16(%rdi), %rax
27 0024 488B4F08 movq 8(%rdi), %rcx
28 0028 488B5720 movq 32(%rdi), %rdx
29 002c 0FB700 movzwl (%rax), %eax
30 002f 660B01 orw (%rcx), %ax
31 0032 668902 movw %ax, (%rdx)
32 0035 C3 ret
33 .cfi_endproc
But, even if I was expecting a few extra lines, I am very surprised by what I got after adding a loop :
#define SIZE 1024
/* Element-wise bitwise OR of the first SIZE entries of a and b, written to
 * comp in ascending order. */
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    const uint16_t *const stop = a + SIZE;
    while (a != stop) {
        *comp++ = (uint16_t)(*a++ | *b++);
    }
}
Generates this assembly with a lot more lines:
233 _Z10itwillworkPKtS0_Pt:
234 .LFB663:
235 .cfi_startproc
236 0250 488D4210 leaq 16(%rdx), %rax
237 0254 488D4E10 leaq 16(%rsi), %rcx
238 0258 4839F0 cmpq %rsi, %rax
239 025b 410F96C0 setbe %r8b
240 025f 4839CA cmpq %rcx, %rdx
241 0262 0F93C1 setnb %cl
242 0265 4108C8 orb %cl, %r8b
243 0268 743E je .L55
244 026a 4839F8 cmpq %rdi, %rax
245 026d 488D4710 leaq 16(%rdi), %rax
246 0271 0F96C1 setbe %cl
247 0274 4839C2 cmpq %rax, %rdx
248 0277 0F93C0 setnb %al
249 027a 08C1 orb %al, %cl
250 027c 742A je .L55
251 027e 31C0 xorl %eax, %eax
252 .p2align 4,,10
253 .p2align 3
254 .L57:
255 0280 F30F6F0C movdqu (%rsi,%rax), %xmm1
255 06
256 0285 F30F6F04 movdqu (%rdi,%rax), %xmm0
256 07
257 028a 660FEBC1 por %xmm1, %xmm0
258 028e F30F7F04 movdqu %xmm0, (%rdx,%rax)
258 02
259 0293 4883C010 addq $16, %rax
260 0297 483D0008 cmpq $2048, %rax
260 0000
261 029d 75E1 jne .L57
262 029f F3C3 rep ret
263 .p2align 4,,10
264 02a1 0F1F8000 .p2align 3
264 000000
265 .L55:
266 02a8 31C0 xorl %eax, %eax
267 02aa 660F1F44 .p2align 4,,10
267 0000
268 .p2align 3
269 .L58:
270 02b0 0FB70C06 movzwl (%rsi,%rax), %ecx
271 02b4 660B0C07 orw (%rdi,%rax), %cx
272 02b8 66890C02 movw %cx, (%rdx,%rax)
273 02bc 4883C002 addq $2, %rax
274 02c0 483D0008 cmpq $2048, %rax
274 0000
275 02c6 75E8 jne .L58
276 02c8 F3C3 rep ret
277 .cfi_endproc
Both were compiled with gcc 4.8.4 in release mode, -O2 -ftree-vectorize -msse2.
Can somebody help me get rid of those lines? Or, if it's impossible, can you tell me why they are there ?
Update :
I've tried the tricks there http://locklessinc.com/articles/vectorize/, but I get another issue:
#define SIZE 1024
/* Writes a[k] | b[k] to comp[k] for every k from 0 up to SIZE - 1. */
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    for (int k = 0; k != SIZE; ++k) {
        comp[k] = (uint16_t)(a[k] | b[k]);
    }
}
A few assembly lines are generated for this function, I get it.
But when I call this function from somewhere else :
itwillwork(a,b,c);
There is no call instruction : the long list of instructions of "itwillwork" (the same as above) are used directly.
Am I missing something ? (the "extra lines" are the problem, not the inline call)
You are getting "weird" code because GCC cannot assume that your pointers do not overlap (nor can it assume anything about their alignment), so you can see that it first performs a runtime overlap test on the pointers to determine whether it can take the fast path and do 128 bits at a time (using unaligned loads and stores), or the slow path and do 16 bits at a time.
Additionally, the reason you are finding the code repeated is because the compiler is applying an inlining optimisation. You could disable this with the __attribute((noinline)) spec but if performance is your goal, let the compiler inline it.
If you specify the __restrict keyword then the compiler will only generate the fast-path code: https://goo.gl/g3jUfQ
However, this does not mean the compiler is going to magically take care of alignment for you so take care of what you pass to the function.

Do references occupy space on stack

I read that in the following case:
int i = 12;
int &a = i;
a will not occupy a space on the stack as it is an alias of i;
My question is: suppose it's a parameter, like this:
// Example signature only: a is taken by reference, b by value; the body is empty.
void funct(foo& a , int b)
{
}
When the function is called, will a occupy space on the stack?
A reference is more or less like a pointer at this level and the following
#include <stdio.h>
#include <stdlib.h>
// Minimal aggregate with a single int member, used as the by-reference argument below.
struct foo{
int val;
};
// Returns a.val; b is accepted but not used.
int funct(foo& a, int b)
{
return a.val;
}
// Creates a local foo, sets val to 92 and calls funct; the call's result is discarded.
int main(void) {
foo obj;
obj.val = 92;
funct(obj, 22); // 22 is passed by value, obj is passed by reference
return EXIT_SUCCESS;
}
gets translated to:
.Ltext0:
.globl _Z5functR3fooi // funct()
_Z5functR3fooi:
.LFB2:
.cfi_startproc
0000 55 pushq %rbp // some stack bookkeeping
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0001 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0004 48897DF8 movq %rdi, -8(%rbp) <-- move the address on the stack frame
0008 8975F4 movl %esi, -12(%rbp) <-- move the value on the stack frame
000b 488B45F8 movq -8(%rbp), %rax <-- get the address from the stack frame
000f 8B00 movl (%rax), %eax <-- use it
0011 5D popq %rbp
.cfi_def_cfa 7, 8
0012 C3 ret
.cfi_endproc
.LFE2:
.globl main
main:
.LFB3:
.cfi_startproc // Main
0013 55 pushq %rbp // Stack bookkeeping
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0014 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0017 4883EC10 subq $16, %rsp
.LBB2:
001b C745F05C movl $92, -16(%rbp) <-- save 92 (the entire POD struct) on the stack frame
000000
0022 488D45F0 leaq -16(%rbp), %rax <-- get the pointer to the stack frame where the obj is
0026 BE160000 movl $22, %esi <-- save the value in a register
00
002b 4889C7 movq %rax, %rdi <-- address of the stack frame to the object
002e E8000000 call _Z5functR3fooi // funct() call
00
0033 B8000000 movl $0, %eax
00
.LBE2:
0038 C9 leave
.cfi_def_cfa 7, 8
0039 C3 ret
.cfi_endproc
.LFE3:
.Letext0:
Of course keep in mind that this is just an implementation (gcc's one to be precise) without any optimization. It depends on the compiler how this really works.
(asm generated by http://assembly.ynh.io/)
According to the standard, whether or not a reference requires storage is unspecified. So it depends on the implementation.
A reference is often implemented as an address in assembly. However, a reference is neither a pointer to an object nor a copy of the object. A reference is the object.

Efficient data structure(s) for kind of scripting application?

In C++ I want to write an application that works similar to a scripting language:
Out of some input during "setup time" it will define on a big global array where each variable will be located and on a different array the sequence of the functions ("LogicElement") to call (including their parameters like the variables to use).
One implementation might look like:
// Abstract interface for one calculation step.
class LogicElement_Generic
{
public:
    // Virtual so that an element created with new can be deleted through a
    // LogicElement_Generic pointer without undefined behavior.
    virtual ~LogicElement_Generic() {}
    // Performs this element's calculation.
    virtual void calc() const = 0;
};
// Multiplies two ints and stores the product in a third. All three are bound
// by reference at construction, so they must outlive the element.
class LogicElement_Mul : public LogicElement_Generic
{
    int &to;
    const int &from1;
    const int &from2;
public:
    // _to receives the product; _from1 and _from2 are the factors.
    LogicElement_Mul( int &_to, const int &_from1, const int &_from2 ) : to(_to), from1(_from1), from2(_from2)
    {}
    // Reads the current values of both factors and writes their product to the target.
    // const refers to the element itself; the referenced int is still modified.
    void calc() const
    {
        to = from1 * from2;
    }
};
char globalVariableBuffer[1000]; // a simple binary buffer
LogicElement_Generic *le[10];
// Demo driver: treats the start of globalVariableBuffer as three consecutive
// ints (target, factor 1, factor 2), wires them into one LogicElement_Mul,
// runs it once and returns the stored product.
int main( void )
{
// just a demo, this would be setup from e.g. an input file:
int *to = (int*)globalVariableBuffer;                         // byte offset 0
int *from1 = (int*)(globalVariableBuffer + sizeof(int));      // byte offset sizeof(int)
int *from2 = (int*)(globalVariableBuffer + 2*sizeof(int));    // byte offset 2*sizeof(int)
*from1 = 2;
*from2 = 3;
le[0] = new LogicElement_Mul( *to, *from1, *from2 ); // NOTE(review): this allocation is never deleted in the demo
// doing all calculations:
// finally it would be a loop iterating over all calculation functions,
// over and over again - the area in the code where all the resources
// would be burned...
le[0]->calc();
return *to; // the product written by calc()
}
Although that works as intended, looking at the created assembly:
78 .section .text._ZNK16LogicElement_Mul4calcEv,"axG",#progbits,_ZNK16LogicElement_Mul4calcEv,comdat
79 .align 2
80 .weak _ZNK16LogicElement_Mul4calcEv
82 _ZNK16LogicElement_Mul4calcEv:
83 .LFB6:
17:.../src/test.cpp **** void calc() const
84 .loc 1 17 0
85 .cfi_startproc
86 0000 55 pushq %rbp
87 .LCFI6:
88 .cfi_def_cfa_offset 16
89 .cfi_offset 6, -16
90 0001 4889E5 movq %rsp, %rbp
91 .LCFI7:
92 .cfi_def_cfa_register 6
93 0004 48897DF8 movq %rdi, -8(%rbp)
18:.../src/test.cpp **** {
19:.../src/test.cpp **** to = from1 * from2;
94 .loc 1 19 0
95 0008 488B45F8 movq -8(%rbp), %rax
96 000c 488B4008 movq 8(%rax), %rax
97 0010 488B55F8 movq -8(%rbp), %rdx
98 0014 488B5210 movq 16(%rdx), %rdx
99 0018 8B0A movl (%rdx), %ecx
100 001a 488B55F8 movq -8(%rbp), %rdx
101 001e 488B5218 movq 24(%rdx), %rdx
102 0022 8B12 movl (%rdx), %edx
103 0024 0FAFD1 imull %ecx, %edx
104 0027 8910 movl %edx, (%rax)
20:.../src/test.cpp **** }
105 .loc 1 20 0
106 0029 5D popq %rbp
107 .LCFI8:
108 .cfi_def_cfa 7, 8
109 002a C3 ret
110 .cfi_endproc
Looking at the assembly lines 95 .. 104 you can see that for each variable three indirections are used.
As this part of the code (the calc() methods) would finally be called very rapidly I want to use the least CPU cycles and memory bandwidth as possible (by general C/C++).
I also want (not shown in the code above) to have two variable buffers with exactly the same layout, so that I can do double buffering in a multithreaded approach and limit the necessary locks (exact implementation details would be too much detail for this question).
So the big questions are:
How can I change the architecture to reduce the amount of memory indirections in the calc()?
(I'd expect only two: one to get the offset address in the variable array and an additional to get the variable itself - but my experiments changing the code above to use offsets made things far worse!)
Is there a better way to set up the classes and thus the array of the LogicElements so that calling the calculation methods will use the least amount of resources?
Thanks to the hint by @Ed S. I changed away from references (where I hoped the compiler could optimize better).
But an even more important step I did was to compare the assembly that was generated after activating optimizations (just a simple -O2 did do).
(I didn't do that at the beginning as I wanted to have a clearer picture on the generated "pure" machine code and not one where an intelligent compiler fixes a stupid programmer - but it seems the compiler is too "stupid" then...)
So the current result is quite good now for the variable array:
// Abstract interface for one calculation step operating on a caller-supplied buffer.
class LogicElement_Generic
{
public:
    // Virtual so that an element created with new can be deleted through a
    // LogicElement_Generic pointer without undefined behavior.
    virtual ~LogicElement_Generic() {}
    // Performs this element's calculation on the buffer starting at base.
    virtual void calc(void * const base) const = 0;
};
// Multiplies two ints found at fixed byte offsets in a buffer and stores the
// product at a third byte offset. Only offsets are stored, so the same element
// can be applied to any buffer with the same layout.
class LogicElement_Mul : public LogicElement_Generic
{
    int const to;
    int const from1;
    int const from2;
    // Returns the int located byteOffset bytes past base. The arithmetic is done
    // on char*, because arithmetic on void* is a compiler extension, not ISO C++.
    // NOTE(review): assumes base + byteOffset is suitably aligned for int - confirm at the call sites.
    static int &intAt(void *base, int byteOffset)
    {
        return *reinterpret_cast<int *>(static_cast<char *>(base) + byteOffset);
    }
public:
    // _to, _from1 and _from2 are byte offsets of the target and the two factors.
    LogicElement_Mul( int const _to, int const _from1, int const _from2 ) : to(_to), from1(_from1), from2(_from2)
    {}
    // Writes buffer[from1] * buffer[from2] to buffer[to], all addressed in bytes from base.
    void calc(void * const base) const
    {
        intAt(base, to) = intAt(base, from1) * intAt(base, from2);
    }
};
char globalVariableBuffer[1000]; // a simple binary buffer
LogicElement_Generic *le[10];
// Demo driver for the offset-based variant: the three variables are described by
// byte offsets into globalVariableBuffer instead of by pointers or references.
int main( void )
{
int to = 0;                  // byte offset of the result
int from1 = sizeof(int);     // byte offset of the first factor
int from2 = 2*sizeof(int);   // byte offset of the second factor
*((int*)(globalVariableBuffer+from1)) = 2;
*((int*)(globalVariableBuffer+from2)) = 3;
le[0] = new LogicElement_Mul( to, from1, from2 ); // NOTE(review): this allocation is never deleted in the demo
le[0]->calc(globalVariableBuffer); // the buffer is supplied per call rather than at construction
return *((int*)(globalVariableBuffer+to)); // the product written by calc()
}
with the relevant part of the assembly:
17:.../src/test.cpp **** void calc(void * const base) const
12 .loc 1 17 0
13 .cfi_startproc
14 .LVL0:
18:.../src/test.cpp **** {
19:.../src/test.cpp **** *((int*)(base+to)) = *((int*)(base+from1)) * *((int*)(base+from2));
15 .loc 1 19 0
16 0000 4863470C movslq 12(%rdi), %rax
17 0004 48634F10 movslq 16(%rdi), %rcx
18 0008 48635708 movslq 8(%rdi), %rdx
19 000c 8B0406 movl (%rsi,%rax), %eax
20 000f 0FAF040E imull (%rsi,%rcx), %eax
21 0013 890416 movl %eax, (%rsi,%rdx)
20:.../src/test.cpp **** }
22 .loc 1 20 0
23 0016 C3 ret
24 .cfi_endproc
So I reckon the first question is answered! :)
The second is still open.
(Even more now as the pointer arithmetic might be valid C++ - but very ugly...)