I wrote the following code in C++. I want to cast away the const on a variable and change it; this is the code:
#include <iostream>
using namespace std;
int main()
{
int const a = 5;
int* ptr = (int*)&a;
*ptr = 10;
cout<<"a is : "<< a << endl;
system("pause");
}
This code compiles, and I expected the program to print 10 on the screen,
but the result on the screen is 5.
When I run the debugger, the memory at &a has been changed to 10 as I expected.
Any idea why?
First of all, this is undefined behavior. Don't do it. Second, the compiler optimized away actually looking at the memory at &a when you print out a, because you told the compiler a would never change (you declared it const). So it actually turned into...
cout << "a is : "<<5 << endl;
You are invoking undefined behavior with the code in question: trying to change a variable declared as const by casting away the constness is not allowed (unless the const variable is really a reference to an object which isn't itself const).
One plausible, and highly likely, explanation for your result is that the compiler knows that the value of a shouldn't change, therefore it can pretty much replace all occurrences of a with 5; i.e. the lookup is optimized out.
Why read the address of a to look up its value when it's declared as always being 5?
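As an aside, here is a minimal sketch (my own example, not code from the question) of the legal case mentioned above: casting away constness and writing through the result is fine when the object actually referred to is not itself const.
#include <iostream>

// Writing through the cast is well-defined only because the object that the
// const reference refers to is itself a non-const int.
void set_to_42(const int& r)
{
    const_cast<int&>(r) = 42;
}

int main()
{
    int x = 5;            // the underlying object is NOT const
    set_to_42(x);         // well-defined; x becomes 42
    std::cout << x << '\n';

    const int y = 5;      // the underlying object IS const
    // set_to_42(y);      // would compile, but the write would be undefined behavior
}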
Let's take a look at what instructions a compiler might turn the snippet into
foo.cpp
void func (int)
{
/* ... */
}
int
main (int argc, char *argv[])
{
const int a = 10;
int * p = &const_cast<int&> (a);
*p = 20;
func (a);
}
assembly instructions of main as given by g++ -S foo.cpp
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movl %edi, -20(%rbp)
movq %rsi, -32(%rbp)
movl $10, -12(%rbp)
leaq -12(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax # store the address of `a` in %rax
movl $20, (%rax) # store 20 at the location pointed to by %rax (ie. &a)
movl $10, %edi # put 10 in register %edi (placeholder for first argument to function)
# # notice how the value is not read from `a`
# # but is a constant
call _Z4funci # call `func`
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
As seen above, the value 20 is indeed written to the location whose address is held in %rax, i.e. to &a (movl $20, (%rax)), but the argument to our call to void func (int) is the constant 10 (movl $10, %edi).
As said earlier, the compiler assumes that the value of a doesn't change, and instead of reading the memory location every time a is used it replaces it with the constant value 10.
I'm trying to reproduce the example code in order to understand the as-if rule of C++ better. According to cppreference:
int& preinc(int& n) { return ++n; }
int add(int n, int m) { return n+m; }
// volatile input to prevent constant folding
volatile int input = 7;
// volatile output to make the result a visible side-effect
volatile int result;
int main()
{
int n = input;
// using built-in operators would invoke undefined behavior
// int m = ++n + ++n;
// but using functions makes sure the code executes as-if
// the functions were not overlapped
int m = add(preinc(n), preinc(n));
result = m;
}
I use g++ -s main.cpp to get the assembler output from the source; the main() function of the output file main.s is shown below:
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $24, %rsp
.cfi_offset 3, -24
movq %fs:40, %rax
movq %rax, -24(%rbp)
xorl %eax, %eax
movl input(%rip), %eax
movl %eax, -32(%rbp)
leaq -32(%rbp), %rax
movq %rax, %rdi
call _Z6preincRi
movl (%rax), %ebx
leaq -32(%rbp), %rax
movq %rax, %rdi
call _Z6preincRi
movl (%rax), %eax
movl %ebx, %esi
movl %eax, %edi
call _Z3addii
movl %eax, -28(%rbp)
movl -28(%rbp), %eax
movl %eax, result(%rip)
movl $0, %eax
movq -24(%rbp), %rdx
xorq %fs:40, %rdx
je .L7
call __stack_chk_fail
According to the output file, I think the g++ compiler just compiles the source code statement by statement without optimization, even if I add the -O3 compile option.
The output is supposed to be like this:
# full code of the main() function as produced by the GCC compiler
# x86 (Intel) platform:
movl input(%rip), %eax # eax = input
leal 3(%rax,%rax), %eax # eax = 3 + eax + eax
movl %eax, result(%rip) # result = eax
xorl %eax, %eax # eax = 0 (the return value of main())
ret
I want to know how to get the assembler output code shown above.
Something went wrong when I tested the example code. Here is the answer I put together, combining some of my thoughts with the comments from others above.
Compilers will not optimize the code unless the "-O3" or "-O2" compilation option is added, just like #Balázs Kovacsics and #molbdnilo said in the comments. Using the command g++ -S main.cpp gives the assembler output statement by statement, like what is shown in the question.
Once the "-O3" or "-O2" compilation option is added, it means the programmer allows the compiler to do any code transformations that do not change the observable behavior of the program. So the main() function of the output file main.s, produced with g++ -S -O3 main.cpp, is shown below:
main:
.LFB2:
.cfi_startproc
movl input(%rip), %eax
leal 3(%rax,%rax), %eax
movl %eax, result(%rip)
xorl %eax, %eax
ret
.cfi_endproc
Be careful that the compiler option must be written in upper case (-S).
Here is the Compiler Explorer website that #JulianH gave, which is really convenient for looking at assembler output across different platforms and compilers.
I think getting the assembler output helps me understand the as-if rule better. I hope what I wrote helps someone who is also confused by the abstract description on cppreference.
I would like to know what my compiler does with the following code
void design_grid::design_valid()
{
auto valid_idx = [this](int row_num, int col_num) {
if ((row_num < 0) || (col_num < 0))
{
return false;
}
if ((row_num >= this->num_rows) || (col_num >= this->num_rows))
{
return false;
}
return true;
};
/* some code that calls lambda function valid_idx() */
}
If I repeatedly call the class member function above (design_grid::design_valid), then what exactly happens when my program encounters the creation of valid_idx every time? Does the compiler inline the code where it is called later, at compile time, so that it does not actually do anything where the creation of valid_idx is encountered?
UPDATE
A segment of the assembly code is below. If this is a little too much to read, I will post another batch of code later which is coloured, to illustrate which parts are which (I don't have a nice way to colour code segments at the present moment). Also note that I have updated the definitions of my member function and the lambda function above to reflect what they are really named in my code (and thus, in the assembly language).
In any case, it appears that the lambda is defined separately from the main function. The lambda function is represented by the _ZZN11design_grid12design_validEvENKUliiE_clEii function directly below. Directly below this function, in turn, the outer function (design_grid::design_valid), represented by _ZN11design_grid12design_validEv, starts. Later in _ZN11design_grid12design_validEv, a call is made to _ZZN11design_grid12design_validEvENKUliiE_clEii. The line where the call is made looks like
call _ZZN11design_grid12design_validEvENKUliiE_clEii #
Correct me if I'm wrong, but this means that the compiler defined the lambda as a normal function outside the design_valid function, and then calls it as a normal function when it should? That is, it does not create a new object every time it encounters the statement which declares the lambda function? The only trace I could see of the lambda function in that particular location is the line commented # tmp85, valid_idx.__this in the second function, right after the base and stack pointers are readjusted at the start of the function, but this is just a simple movq operation.
.type _ZZN11design_grid12design_validEvENKUliiE_clEii, #function
_ZZN11design_grid12design_validEvENKUliiE_clEii:
.LFB4029:
.cfi_startproc
pushq %rbp #
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp #,
.cfi_def_cfa_register 6
movq %rdi, -8(%rbp) # __closure, __closure
movl %esi, -12(%rbp) # row_num, row_num
movl %edx, -16(%rbp) # col_num, col_num
cmpl $0, -12(%rbp) #, row_num
js .L107 #,
cmpl $0, -16(%rbp) #, col_num
jns .L108 #,
.L107:
movl $0, %eax #, D.81546
jmp .L109 #
.L108:
movq -8(%rbp), %rax # __closure, tmp65
movq (%rax), %rax # __closure_4(D)->__this, D.81547
movl 68(%rax), %eax # _5->D.69795.num_rows, D.81548
cmpl -12(%rbp), %eax # row_num, D.81548
jle .L110 #,
movq -8(%rbp), %rax # __closure, tmp66
movq (%rax), %rax # __closure_4(D)->__this, D.81547
movl 68(%rax), %eax # _7->D.69795.num_rows, D.81548
cmpl -16(%rbp), %eax # col_num, D.81548
jg .L111 #,
.L110:
movl $0, %eax #, D.81546
jmp .L109 #
.L111:
movl $1, %eax #, D.81546
.L109:
popq %rbp #
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4029:
.size _ZZN11design_grid12design_validEvENKUliiE_clEii,.-_ZZN11design_grid12design_validEvENKUliiE_clEii
.align 2
.globl _ZN11design_grid12design_validEv
.type _ZN11design_grid12design_validEv, #function
_ZN11design_grid12design_validEv:
.LFB4028:
.cfi_startproc
pushq %rbp #
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp #,
.cfi_def_cfa_register 6
pushq %rbx #
subq $72, %rsp #,
.cfi_offset 3, -24
movq %rdi, -72(%rbp) # this, this
movq -72(%rbp), %rax # this, tmp85
movq %rax, -32(%rbp) # tmp85, valid_idx.__this
movl $0, -52(%rbp) #, active_count
movl $0, -48(%rbp) #, row_num
jmp .L113 #
.L128:
movl $0, -44(%rbp) #, col_num
jmp .L114 #
.L127:
movl -44(%rbp), %eax # col_num, tmp86
movslq %eax, %rbx # tmp86, D.81551
Closures (unnamed function objects) are to lambdas as objects are to classes. This means that a closure lambda_func is created repeatedly from the lambda:
[this]() {
/* some code here */
}
Just as an object could be created repeatedly from a class. Of course the compiler may optimize some steps away.
As for this part of the question:
Does the compiler inline the code where it is called later, at compile
time, so that it does not actually do anything where the creation of
lambda_func is encountered?
See:
are lambda functions in c++ inline? and
Why can lambdas be better optimized by the compiler than plain functions?.
Here is a sample program to test what might happen:
#include <iostream>
#include <random>
#include <algorithm>
class ClassA {
public:
void repeatedly_called();
private:
std::random_device rd{};
std::mt19937 mt{rd()};
std::uniform_int_distribution<> ud{0,10};
};
void ClassA::repeatedly_called()
{
auto lambda_func = [this]() {
/* some code here */
return ud(mt);
};
/* some code that calls lambda_func() */
std::cout << lambda_func()*lambda_func() << '\n';
};
int main()
{
ClassA class_a{};
for(size_t i{0}; i < 100; ++i) {
class_a.repeatedly_called();
}
return 0;
}
It was tested here.
We can see that in this particular case the function repeatedly_called does not make a call to the lambda (which is generating the random numbers), as it has been inlined.
In the question's update it appears that the lambda instructions were not inlined. Theoretically the closure is created, and normally that would mean some storage is set aside for it, but the compiler may optimize and remove some steps.
With only the capture of this the lambda is similar to a member function.
Basically what happens is that the compiler creates an unnamed class with a function call operator, storing the captured variables in the class as member variables. Then the compiler uses this unnamed class to create an object, which is your lambda_func variable.
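To make that concrete, here is a rough, hand-written stand-in for the lambda in the sample program above (the class name LambdaFunc and the friend declaration are my own scaffolding; a real closure type is unnamed and, being defined inside the member function, gets its access context automatically):
#include <iostream>
#include <random>

class ClassA {
public:
    void repeatedly_called();

private:
    // needed only so this hand-written stand-in compiles from outside the class
    friend class LambdaFunc;

    std::random_device rd{};
    std::mt19937 mt{rd()};
    std::uniform_int_distribution<> ud{0, 10};
};

// Stand-in for the compiler-generated, unnamed closure type of
//   auto lambda_func = [this]() { return ud(mt); };
class LambdaFunc {
public:
    explicit LambdaFunc(ClassA* captured_this) : captured_this_(captured_this) {}

    // the lambda body becomes the function call operator
    int operator()() const { return captured_this_->ud(captured_this_->mt); }

private:
    ClassA* captured_this_;  // the captured `this`, stored as a data member
};

void ClassA::repeatedly_called()
{
    LambdaFunc lambda_func{this};  // "creating the closure object" on each call
    std::cout << lambda_func() * lambda_func() << '\n';
}

int main()
{
    ClassA a{};
    a.repeatedly_called();
}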
During my little performance-issues investigation, I noticed an interesting stack allocation feature. Here is the template for measuring time:
#include <chrono>
#include <iostream>
using namespace std;
using namespace std::chrono;
int x; //for simple optimization suppression
void foo();
int main()
{
const size_t n = 10000000; //ten millions
auto start = high_resolution_clock::now();
for (size_t i = 0; i < n; i++)
{
foo();
}
auto finish = high_resolution_clock::now();
cout << duration_cast<milliseconds>(finish - start).count() << endl;
}
Now it's all about the foo() implementation; each implementation will allocate 500000 ints in total:
Allocated in one chunk:
void foo()
{
const int size = 500000;
int a1[size];
x = a1[size - 1];
}
Result: 7.3 seconds;
Allocated in two chunks:
void foo()
{
const int size = 250000;
int a1[size];
int a2[size];
x = a1[size - 1] + a2[size - 1];
}
Result: 3.5 seconds;
Allocated in four chunks:
void foo()
{
const int size = 125000;
int a1[size];
int a2[size];
int a3[size];
int a4[size];
x = a1[size - 1] + a2[size - 1] +
a3[size - 1] + a4[size - 1];
}
Result: 1.8 seconds.
And so on... I split it into 16 chunks and got a time of 0.38 seconds.
Please explain to me why and how this happens.
I used MSVC 2013 (v120), Release build.
UPD:
My machine is an x64 platform, but I compiled it for the Win32 platform.
When I compile it for the x64 platform, it takes about 40 ms in all cases.
Why does the platform choice affect the result so much?
Looking at disassembly from VS2015 Update 3, in the 2 and 4 array versions of foo, the compiler optimizes out the unused arrays so that it only reserves stack space for 1 array in each function. Since the later functions have smaller arrays this takes less time. The assignment to x reads the same memory location for both/all 4 arrays. (Since the arrays are uninitialized, reading from them is undefined behavior.) Without optimizing the code there are 2 or 4 distinct arrays that are read from.
The long time taken for these functions is due to stack probes performed by __chkstk as part of stack overflow detection (necessary when the compiler needs more than 1 page of space to hold all the local variables).
You should look at the resulting assembler code, to see what your compiler really does with the code. For gcc/clang/icc you can use Matt Godbolt's Compiler Explorer.
clang optimizes everything out because of UB, and the result is (foo - first version, foo2 - second version):
foo: # #foo
retq
foo2: # #foo2
retq
icc treats both versions very similarly:
foo:
pushq %rbp #4.1
movq %rsp, %rbp #4.1
subq $2000000, %rsp #4.1
movl -4(%rbp), %eax #8.9
movl %eax, x(%rip) #8.5
leave #10.1
ret #10.1
foo2:
pushq %rbp #13.1
movq %rsp, %rbp #13.1
subq $2000000, %rsp #13.1
movl -1000004(%rbp), %eax #18.9
addl -4(%rbp), %eax #18.24
movl %eax, x(%rip) #18.5
leave #19.1
ret
and gcc creates different assembler code for different versions. Version 6.1 produces code which shows behavior similar to your experiments:
foo:
pushq %rbp
movq %rsp, %rbp
subq $2000016, %rsp
movl 1999996(%rsp), %eax
movl %eax, x(%rip)
leave
ret
foo2:
pushq %rbp
movl $1000016, %edx #only the first array is allocated
movq %rsp, %rbp
subq %rdx, %rsp
leaq 3(%rsp), %rax
subq %rdx, %rsp
shrq $2, %rax
movl 999996(,%rax,4), %eax
addl 999996(%rsp), %eax
movl %eax, x(%rip)
leave
ret
Thus the only way to understand the difference is to look at the assembler code produced by your compiler, everything else is just guessing.
Very much like this question, except that instead of vector<int> I have vector<struct myType>.
If I want to reset (or for that matter, set to some value) myType.myVar for every element in the vector, what's the most efficient method?
Right now I'm iterating through:
for(int i=0; i<myVec.size(); i++) myVec.at(i).myVar = 0;
But since vectors are guaranteed to be stored contiguously, there's surely a better way?
Resetting will need to traverse every element of the vector, so it will be at least O(n); your current algorithm is O(n).
In this particular case you can use operator[] instead of at (that might throw an exception). But I doubt that's the bottleneck of your application.
On this note you should probably use std::fill. Note that since the vector holds myType rather than int, you fill with a value of the element type, which resets the whole element rather than just one member:
std::fill(myVec.begin(), myVec.end(), myType{});
But unless you want to go byte level and set a chunk of memory to 0, which will not only cause you headaches but will also make you lose portability in most cases, there's nothing to improve here.
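If the goal is to reset only the one member rather than the whole element, a plain range-for keeps that intent clear and still optimizes to a tight loop, as the assembly later in this thread shows. A small self-contained sketch (the myType definition here is made up, since the OP's struct isn't shown):
#include <vector>

// Hypothetical myType for illustration; the OP's real struct differs.
struct myType {
    int myVar;
    double other;
};

int main()
{
    std::vector<myType> myVec(100, myType{7, 1.5});

    // reset only myVar, leaving the other members of each element untouched
    for (auto& elem : myVec)
        elem.myVar = 0;
}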
Instead of the below code
for(int i=0; i<myVec.size(); i++) myVec.at(i).myVar = 0;
do it as follows:
size_t sz = myVec.size();
for(size_t i=0; i<sz; ++i) myVec[i].myVar = 0;
The "at" method internally checks whether the index is out of range. But as your loop condition already takes care of that (i < sz), you can avoid the extra check. Otherwise this is the fastest way to do it.
EDIT
In addition to that, we can store the size() of the vector prior to executing the for loop. This ensures there is no further call to the size() method inside the for loop.
One of the fastest ways would be to perform loop unwinding and break the speed limit posed by conventional for loops that cause great cache spills. In your case, as it's a run-time thing, there's no way to apply template metaprogramming, so a variation on good old Duff's device would do the trick:
#include <iostream>
#include <vector>
using namespace std;
struct myStruct {
int a;
double b;
};
int main()
{
std::vector<myStruct> mV(20);
double val(20); // the new value you want to reset to
int n = (mV.size()+7) / 8; // 8 is the size of the block unroll
auto to = mV.begin(); //
switch(mV.size()%8)
{
case 0: do { (*to++).b = val;
case 7: (*to++).b = val;
case 6: (*to++).b = val;
case 5: (*to++).b = val;
case 4: (*to++).b = val;
case 3: (*to++).b = val;
case 2: (*to++).b = val;
case 1: (*to++).b = val;
} while (--n>0);
}
// just printing to verify that the value is set
for (auto i : mV) std::cout << i.b << std::endl;
return 0;
}
Here I choose to perform an 8-block unwind, to reset the value (let's say) b of a myStruct structure. The block size can be tweaked and loops are effectively unrolled. Remember this is the underlying technique in memcpy and one of the optimizations (loop unrolling in general) a compiler will attempt (actually they're quite good at this so we might as well let them do their job).
In addition to what's been said before, you should consider that if you turn on optimizations, the compiler will likely perform loop-unrolling which will make the loop itself faster.
Also, pre-increment ++i can take a few instructions less than post-increment i++; see the sketch below.
Explanation here
Beware of spending a lot of time thinking about optimization details which the compiler will just take care of for you.
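To make the increment point above concrete, here is a minimal sketch (my own toy Iter type, not code from the question) of why ++it can be cheaper than it++ for iterator-like types: the post-increment form must return the previous value, which forces a copy. For plain ints an optimizing compiler removes the difference entirely.
struct Iter {
    int* p;

    Iter& operator++()        // pre-increment: advance, return *this
    {
        ++p;
        return *this;
    }

    Iter operator++(int)      // post-increment: copy old value, advance, return the copy
    {
        Iter old = *this;
        ++p;
        return old;
    }
};

int main()
{
    int data[3] = {1, 2, 3};
    Iter it{data};
    ++it;            // no temporary
    it++;            // creates and returns a temporary Iter
    return *it.p;    // 3
}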
Here are four implementations of what I understand the OP to be, along with the code generated using gcc 4.8 with --std=c++11 -O3 -S
Declarations:
#include <algorithm>
#include <vector>
struct T {
int irrelevant;
int relevant;
double trailing;
};
Explicit loop implementations, roughly from answers and comments provided to OP. Both produced identical machine code, aside from labels.
void clear_relevant(std::vector<T>* vecp) {
    for(unsigned i=0; i<vecp->size(); i++) {
        vecp->at(i).relevant = 0;
    }
}

void clear_relevant2(std::vector<T>* vecp) {
    std::vector<T>& vec = *vecp;
    auto s = vec.size();
    for (unsigned i = 0; i < s; ++i) {
        vec[i].relevant = 0;
    }
}
Generated code (the same for both, apart from labels):
.cfi_startproc
movq (%rdi), %rsi
movq 8(%rdi), %rcx
xorl %edx, %edx
xorl %eax, %eax
subq %rsi, %rcx
sarq $4, %rcx
testq %rcx, %rcx
je .L1
.p2align 4,,10
.p2align 3
.L5:
salq $4, %rdx
addl $1, %eax
movl $0, 4(%rsi,%rdx)
movl %eax, %edx
cmpq %rcx, %rdx
jb .L5
.L1:
rep ret
.cfi_endproc
Two other versions, one using std::for_each and the other one using the range for syntax. Here there is a subtle difference in the code for the two versions (other than the labels):
void clear_relevant3(std::vector<T>* vecp) {
    for (auto& p : *vecp) p.relevant = 0;
}
Generated code:
.cfi_startproc
movq 8(%rdi), %rdx
movq (%rdi), %rax
cmpq %rax, %rdx
je .L17
.p2align 4,,10
.p2align 3
.L21:
movl $0, 4(%rax)
addq $16, %rax
cmpq %rax, %rdx
jne .L21
.L17:
rep ret
.cfi_endproc

void clear_relevant4(std::vector<T>* vecp) {
    std::for_each(vecp->begin(), vecp->end(),
        [](T& o){o.relevant=0;});
}
Generated code:
.cfi_startproc
movq 8(%rdi), %rdx
movq (%rdi), %rax
cmpq %rdx, %rax
je .L12
.p2align 4,,10
.p2align 3
.L16:
movl $0, 4(%rax)
addq $16, %rax
cmpq %rax, %rdx
jne .L16
.L12:
rep ret
.cfi_endproc
I have a virtual function in a hotspot code that needs to return a structure as a result. I have these two options:
virtual Vec4 generateVec() const = 0; // return value
virtual void generateVec(Vec4& output) const = 0; // output parameter
My question is, is there generally any difference in the performance of these functions? I'd assume the second one is faster, because it does not involve copying data on the stack. However, the first one is often much more convenient to use. If the first one is still slightly slower, would this be measurable at all? Am I too obsessed :)
Let me stress that this function will be called millions of times every second, but also that the size of the structure Vec4 is small - 16 bytes.
As has been said, try them out - but you will quite possibly find that Vec4 generateVec() is actually faster. Return value optimization will elide the copy operation, whereas void generateVec(Vec4& output) may cause an unnecessary initialisation of the output parameter.
Is there any way you can avoid making the function virtual? If you're calling it millions of times a sec that extra level of indirection is worth looking at.
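To make the RVO point concrete, here is a tiny sketch with a made-up Vec4 and a plain free function (the OP's real Vec4 and virtual member function aren't shown), illustrating that the by-value return constructs the result directly in the caller's object:
#include <iostream>

// Hypothetical Vec4 just for illustration.
struct Vec4 { float x, y, z, w; };

Vec4 generateVec()
{
    // The returned prvalue is constructed directly in the caller's object;
    // since C++17 this elision is guaranteed, so returning by value copies nothing.
    return Vec4{1.0f, 2.0f, 3.0f, 4.0f};
}

int main()
{
    Vec4 v = generateVec();   // no copy, and no separate initialization of v beforehand
    std::cout << v.x + v.y + v.z + v.w << '\n';
}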
Code called millions of times per second implies you really do need to optimize for speed.
Depending on how complex the body of the derived generateVec's is, the difference between the two may be unnoticeable or could be massive.
Best bet is to try them both and profile to see if you need to worry about optimizing this particular aspect of the code.
Feeling a bit bored, so I came up with this:
#include <iostream>
#include <ctime>
#include <cstdlib>
using namespace std;
struct A {
int n[4];
A() {
n[0] = n[1] = n[2] = n[3] = rand();
}
};
A f1() {
return A();
}
void f2( A & a ) {
a = A();
}
const unsigned long BIG = 100000000;
int main() {
unsigned int sum = 0;
A a;
clock_t t = clock();
for ( unsigned int i = 0; i < BIG; i++ ) {
a = f1();
sum += a.n[0];
}
cout << clock() - t << endl;
t = clock();
for ( unsigned int i = 0; i < BIG; i++ ) {
f2( a );
sum += a.n[0];
}
cout << clock() - t << endl;
return sum & 1;
}
Results with -O2 optimisation are that there is no significant difference.
Chances are that the first solution is faster.
A very nice article :
http://cpp-next.com/archive/2009/08/want-speed-pass-by-value/
Just out of curiosity, I wrote two similar functions (using an 8-byte data type) to check their assembly code.
long long int ret_val()
{
long long int tmp(1);
return tmp;
}
// ret_val() assembly
.globl _Z7ret_valv
.type _Z7ret_valv, #function
_Z7ret_valv:
.LFB0:
.cfi_startproc
.cfi_personality 0x0,__gxx_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
subl $16, %esp
movl $1, -8(%ebp)
movl $0, -4(%ebp)
movl -8(%ebp), %eax
movl -4(%ebp), %edx
leave
ret
.cfi_endproc
Surprisingly, the output-parameter (pass-by-reference) method below required a few more instructions:
void output_val(long long int& value)
{
long long int tmp(2);
value = tmp;
}
// output_val() assembly
.globl _Z10output_valRx
.type _Z10output_valRx, #function
_Z10output_valRx:
.LFB1:
.cfi_startproc
.cfi_personality 0x0,__gxx_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
subl $16, %esp
movl $2, -8(%ebp)
movl $0, -4(%ebp)
movl 8(%ebp), %ecx
movl -8(%ebp), %eax
movl -4(%ebp), %edx
movl %eax, (%ecx)
movl %edx, 4(%ecx)
leave
ret
.cfi_endproc
These functions were called in a test code as:
long long val = ret_val();
long long val2;
output_val(val2);
Compiled by gcc.