Efficient data structure(s) for kind of scripting application? - c++

In C++ I want to write an application that works similar to a scripting language:
From some input read during "setup time", it will decide where each variable is located in one big global array and, in a second array, the sequence of functions ("LogicElement") to call (including their parameters, such as which variables to use).
One implementation might look like:
// Abstract interface for one step of the calculation sequence.
// Concrete elements derive from this and are held through base-class pointers.
class LogicElement_Generic
{
public:
    // Virtual so that an element created with new can be destroyed safely
    // through a LogicElement_Generic pointer. Does not change object layout:
    // the class already carries a vtable pointer because of calc().
    virtual ~LogicElement_Generic() {}

    // Performs this element's calculation; implemented by each concrete element.
    virtual void calc() const = 0;
};
// Logic element that multiplies two variables and stores the product in a third.
// The three variables are referenced, not copied; the references are bound once
// at construction.
class LogicElement_Mul : public LogicElement_Generic
{
    int       &to;      // destination of the product
    const int &from1;   // first factor
    const int &from2;   // second factor

public:
    // Binds the destination and the two factors this element operates on.
    LogicElement_Mul( int &dst, const int &lhs, const int &rhs )
        : to( dst ), from1( lhs ), from2( rhs )
    {
    }

    // Writes from1 * from2 into to. The method is const because only the
    // referenced variable is modified, not the element itself.
    void calc() const
    {
        to = from1 * from2;
    }
};
// Raw byte storage; main() below reinterprets its first bytes as int variables.
// NOTE(review): a plain char array carries no int alignment guarantee - confirm
// the casts to int* are acceptable on the target platform.
char globalVariableBuffer[1000]; // a simple binary buffer
// Table of logic elements to evaluate; the demo in main() fills slot 0 only.
LogicElement_Generic *le[10];
// Demo driver: places three int variables at the start of the global buffer,
// wires one multiplication element to them, runs it once and returns the result.
int main( void )
{
    // Demo wiring only - a real setup would be derived from e.g. an input file.
    int * const slots = (int*)globalVariableBuffer;
    int &to    = slots[0];
    int &from1 = slots[1];
    int &from2 = slots[2];

    from1 = 2;
    from2 = 3;

    le[0] = new LogicElement_Mul( to, from1, from2 );

    // Evaluation phase. In the real application this would be a loop over all
    // calculation elements, repeated over and over - the hot spot where the
    // resources are spent.
    le[0]->calc();

    return to;
}
Although that works as intended, looking at the created assembly:
78 .section .text._ZNK16LogicElement_Mul4calcEv,"axG",#progbits,_ZNK16LogicElement_Mul4calcEv,comdat
79 .align 2
80 .weak _ZNK16LogicElement_Mul4calcEv
82 _ZNK16LogicElement_Mul4calcEv:
83 .LFB6:
17:.../src/test.cpp **** void calc() const
84 .loc 1 17 0
85 .cfi_startproc
86 0000 55 pushq %rbp
87 .LCFI6:
88 .cfi_def_cfa_offset 16
89 .cfi_offset 6, -16
90 0001 4889E5 movq %rsp, %rbp
91 .LCFI7:
92 .cfi_def_cfa_register 6
93 0004 48897DF8 movq %rdi, -8(%rbp)
18:.../src/test.cpp **** {
19:.../src/test.cpp **** to = from1 * from2;
94 .loc 1 19 0
95 0008 488B45F8 movq -8(%rbp), %rax
96 000c 488B4008 movq 8(%rax), %rax
97 0010 488B55F8 movq -8(%rbp), %rdx
98 0014 488B5210 movq 16(%rdx), %rdx
99 0018 8B0A movl (%rdx), %ecx
100 001a 488B55F8 movq -8(%rbp), %rdx
101 001e 488B5218 movq 24(%rdx), %rdx
102 0022 8B12 movl (%rdx), %edx
103 0024 0FAFD1 imull %ecx, %edx
104 0027 8910 movl %edx, (%rax)
20:.../src/test.cpp **** }
105 .loc 1 20 0
106 0029 5D popq %rbp
107 .LCFI8:
108 .cfi_def_cfa 7, 8
109 002a C3 ret
110 .cfi_endproc
Looking at the assembly lines 95 .. 104 you can see that for each variable three indirections are used.
As this part of the code (the calc() methods) will ultimately be called very frequently, I want it to use as few CPU cycles and as little memory bandwidth as possible (as far as general C/C++ allows).
I also want (not shown in the code above) to have two variable buffers with exactly the same layout, so that I can do double buffering in a multithreaded approach and limit the necessary locks (the exact implementation details would be too much for this question).
So the big questions are:
How can I change the architecture to reduce the amount of memory indirections in the calc()?
(I'd expect only two: one to get the offset address in the variable array and an additional to get the variable itself - but my experiments changing the code above to use offsets made things far worse!)
Is there a better way to set up the classes and thus the array of the LogicElements so that calling the calculation methods will use the least amount of resources?

Thanks to the hint by @Ed S. I changed away from references (where I had hoped the compiler could optimize better).
But an even more important step was to compare the assembly generated after activating optimizations (a simple -O2 was enough).
(I didn't do that at the beginning as I wanted to have a clearer picture on the generated "pure" machine code and not one where an intelligent compiler fixes a stupid programmer - but it seems the compiler is too "stupid" then...)
So the current result is quite good now for the variable array:
// Abstract interface for one step of the calculation sequence.
// Elements store only offsets; the variable buffer is supplied on every call.
class LogicElement_Generic
{
public:
    // Virtual so that an element created with new can be destroyed safely
    // through a LogicElement_Generic pointer. Does not change object layout:
    // the class already carries a vtable pointer because of calc().
    virtual ~LogicElement_Generic() {}

    // Performs this element's calculation on the variable buffer starting at base.
    virtual void calc(void * const base) const = 0;
};
// Logic element that multiplies two int variables and stores the product in a
// third. The variables are identified by byte offsets into a buffer that is
// passed to calc(), so the same element works on any buffer with that layout.
class LogicElement_Mul : public LogicElement_Generic
{
    int const to;      // byte offset of the destination variable
    int const from1;   // byte offset of the first factor
    int const from2;   // byte offset of the second factor

    // Resolves a byte offset inside the buffer to the int stored there.
    // The addition is done on char* because arithmetic on void* is a GNU
    // extension and is not valid ISO C++.
    static int *slot( void * const base, int const offset )
    {
        return reinterpret_cast<int*>( static_cast<char*>( base ) + offset );
    }

public:
    // Stores the byte offsets of the destination and the two factors.
    LogicElement_Mul( int const _to, int const _from1, int const _from2 ) : to(_to), from1(_from1), from2(_from2)
    {}

    // Reads the two factors from base and writes their product back into base.
    void calc(void * const base) const
    {
        *slot( base, to ) = *slot( base, from1 ) * *slot( base, from2 );
    }
};
// Raw byte storage; main() below addresses int variables in it by byte offset.
// NOTE(review): a plain char array carries no int alignment guarantee - confirm
// the casts to int* are acceptable on the target platform.
char globalVariableBuffer[1000]; // a simple binary buffer
// Table of logic elements to evaluate; the demo in main() fills slot 0 only.
LogicElement_Generic *le[10];
// Demo driver: picks byte offsets for three int variables, seeds the two
// inputs, runs one multiplication element on the global buffer and returns
// the value stored at the destination offset.
int main( void )
{
    // Byte offsets of the variables inside globalVariableBuffer.
    int const to    = 0;
    int const from1 = sizeof(int);
    int const from2 = 2*sizeof(int);

    // Seed the two inputs directly in the buffer.
    int * const in1 = (int*)(globalVariableBuffer+from1);
    int * const in2 = (int*)(globalVariableBuffer+from2);
    *in1 = 2;
    *in2 = 3;

    le[0] = new LogicElement_Mul( to, from1, from2 );
    le[0]->calc(globalVariableBuffer);

    int const * const result = (int*)(globalVariableBuffer+to);
    return *result;
}
with the relevant part of the assembly:
17:.../src/test.cpp **** void calc(void * const base) const
12 .loc 1 17 0
13 .cfi_startproc
14 .LVL0:
18:.../src/test.cpp **** {
19:.../src/test.cpp **** *((int*)(base+to)) = *((int*)(base+from1)) * *((int*)(base+from2));
15 .loc 1 19 0
16 0000 4863470C movslq 12(%rdi), %rax
17 0004 48634F10 movslq 16(%rdi), %rcx
18 0008 48635708 movslq 8(%rdi), %rdx
19 000c 8B0406 movl (%rsi,%rax), %eax
20 000f 0FAF040E imull (%rsi,%rcx), %eax
21 0013 890416 movl %eax, (%rsi,%rdx)
20:.../src/test.cpp **** }
22 .loc 1 20 0
23 0016 C3 ret
24 .cfi_endproc
So I reckon the first question is answered! :)
The second is still open.
(Even more now as the pointer arithmetic might be valid C++ - but very ugly...)

Related

Am I correctly reasoning about cache performance?

I'm trying to solidify my understanding of data contention and have come up with the following minimal test program. It runs a thread that does some data crunching, and spinlocks on an atomic bool until the thread is done.
#include <iostream>
#include <atomic>
#include <thread>
#include <array>
// Worker object driven by a flag: another thread calls start(), the thread
// executing run() notices the flag, processes dataArr and clears the flag again.
class SideProcessor
{
public:
    // Starts with no processing requested and a zeroed data array.
    // processingRequested must be initialized explicitly: before C++20 a
    // default-constructed std::atomic holds an indeterminate value, and run()
    // may read the flag before anyone has called start().
    SideProcessor()
        : processingRequested(false)
        , dataArr{}
    {
    }

    // Requests one round of processing.
    void start() {
        processingRequested = true;
    }

    // True while a requested round has not been finished yet.
    bool isStarted() const {
        return processingRequested;
    }

    // True when no round is pending; this is also the state before start().
    bool isDone() const {
        return !processingRequested;
    }

    // Clears the request flag.
    void finish() {
        processingRequested = false;
    }

    // Thread body: waits for a request, processes it, flags completion, and
    // repeats until *exitRequested becomes true.
    void run(std::atomic<bool>* exitRequested) {
        while (!(*exitRequested)) {
            // Spin on the flag.
            while (!isStarted() && !(*exitRequested)) {
            }

            if (*exitRequested) {
                // If we were asked to spin down, break out of the loop.
                break;
            }

            // Process
            processData();

            // Flag that we're done.
            finish();
        }
    }

private:
    std::atomic<bool> processingRequested;

#ifdef PAD_ALIGNMENT
    // Spacer between the flag and the data array, enabled for the experiment.
    std::array<bool, 64> padding;
#endif

    std::array<int, 100> dataArr;

    // Adds 1 to every element of dataArr, 10000000 times over.
    void processData() {
        // Add 1 to every element a bunch of times.
        std::cout << "Processing data." << std::endl;

        for (unsigned int i = 0; i < 10000000; ++i) {
            for (unsigned int j = 0; j < 100; ++j) {
                dataArr[j] += 1;
            }
        }

        std::cout << "Done processing." << std::endl;
    }
};
int main() {
std::atomic<bool> exitRequested;
exitRequested = false;
SideProcessor sideProcessor;
std::thread processThreadObj = std::thread(&SideProcessor::run,
&sideProcessor, &exitRequested);
// Spinlock while the thread is doing work.
std::cout << "Starting processing." << std::endl;
sideProcessor.start();
while (!(sideProcessor.isDone())) {
}
// Spin the thread down.
std::cout << "Waiting for thread to spin down." << std::endl;
exitRequested = true;
processThreadObj.join();
std::cout << "Done." << std::endl;
return 0;
}
When I build with -O3 and don't define PAD_ALIGNMENT, I get the following results from Linux perf:
142,511,066 cache-references (29.15%)
142,364 cache-misses # 0.100 % of all cache refs (39.33%)
33,580,965 L1-dcache-load-misses # 3.40% of all L1-dcache hits (39.90%)
988,605,337 L1-dcache-loads (40.46%)
279,446,259 L1-dcache-store (40.71%)
227,713 L1-icache-load-misses (40.71%)
10,040,733 LLC-loads (40.71%)
5,834 LLC-load-misses # 0.06% of all LL-cache hits (40.32%)
40,070,067 LLC-stores (19.39%)
94 LLC-store-misses (19.22%)
0.708704757 seconds time elapsed
When I build with PAD_ALIGNMENT, I get the following results:
450,713 cache-references (27.83%)
124,281 cache-misses # 27.574 % of all cache refs (39.29%)
104,857 L1-dcache-load-misses # 0.01% of all L1-dcache hits (42.16%)
714,361,767 L1-dcache-loads (45.02%)
281,140,925 L1-dcache-store (45.83%)
90,839 L1-icache-load-misses (43.52%)
11,225 LLC-loads (40.66%)
3,685 LLC-load-misses # 32.83% of all LL-cache hits (37.80%)
1,798 LLC-stores (17.18%)
76 LLC-store-misses (17.18%)
0.140602005 seconds time elapsed
I have 2 questions:
Would I be correct in saying that the huge amount of increased cache references come from missing L1 and having to go to LLC to get the cache line that the other core invalidated? (please correct my terminology if it isn't accurate)
I think I understand the increased LLC-loads, but why are there so many more LLC-stores when the data is on the same cache line?
(Edit) Additional information as requested:
g++ version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
CPU model: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
Kernel version: 4.15.0-106-generic
processData()'s loop when compiling with g++ -std=c++14 -pthread -g -O3 -c -fverbose-asm -Wa,-adhln:
Without PAD_ALIGNMENT:
57:main.cpp **** void processData() {
58:main.cpp **** // Add 1 to every element a bunch of times.
59:main.cpp **** std::cout << "Processing data." << std::endl;
60:main.cpp **** for (unsigned int i = 0; i < 10000000; ++i) {
61:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
62:main.cpp **** dataArr[j] += 1;
409 .loc 4 62 0
410 0109 83450401 addl $1, 4(%rbp) #, MEM[(value_type &)this_8(D) + 4]
411 .LVL32:
412 010d 4183FD01 cmpl $1, %r13d #, prolog_loop_niters.140
413 0111 0F841901 je .L29 #,
413 0000
414 0117 83450801 addl $1, 8(%rbp) #, MEM[(value_type &)this_8(D) + 4]
415 .LVL33:
416 011b 4183FD02 cmpl $2, %r13d #, prolog_loop_niters.140
417 011f 0F842801 je .L30 #,
417 0000
418 0125 83450C01 addl $1, 12(%rbp) #, MEM[(value_type &)this_8(D) + 4]
419 .LVL34:
420 0129 BF610000 movl $97, %edi #, ivtmp_12
420 00
421 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
422 .loc 4 61 0
423 012e 41BA0300 movl $3, %r10d #, j
423 0000
424 .LVL35:
425 .L18:
426 0134 31C0 xorl %eax, %eax # ivtmp.156
427 0136 31D2 xorl %edx, %edx # ivtmp.153
428 0138 0F1F8400 .p2align 4,,10
428 00000000
429 .p2align 3
430 .L20:
431 0140 83C201 addl $1, %edx #, ivtmp.153
432 # main.cpp:62: dataArr[j] += 1;
433 .loc 4 62 0
434 0143 66410F6F movdqa (%r14,%rax), %xmm0 # MEM[base: vectp_this.148_118, index: ivtmp.156_48, offset: 0], vect__2
434 0406
435 0149 660FFEC1 paddd %xmm1, %xmm0 # tmp231, vect__25.150
436 014d 410F2904 movaps %xmm0, (%r14,%rax) # vect__25.150, MEM[base: vectp_this.148_118, index: ivtmp.156_48, offse
436 06
437 0152 4883C010 addq $16, %rax #, ivtmp.156
438 0156 4439FA cmpl %r15d, %edx # bnd.143, ivtmp.153
439 0159 72E5 jb .L20 #,
440 015b 4429E7 subl %r12d, %edi # niters_vector_mult_vf.144, ivtmp_12
441 015e 4439E3 cmpl %r12d, %ebx # niters_vector_mult_vf.144, niters.142
442 0161 438D0422 leal (%r10,%r12), %eax #, tmp.145
443 0165 89FA movl %edi, %edx # ivtmp_12, tmp.146
444 0167 7421 je .L21 #,
445 .LVL36:
446 0169 89C7 movl %eax, %edi # tmp.145, tmp.145
447 016b 8344BD04 addl $1, 4(%rbp,%rdi,4) #, MEM[(value_type &)_129 + 4]
447 01
448 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
449 .loc 4 61 0
450 0170 83FA01 cmpl $1, %edx #, tmp.146
451 0173 8D7801 leal 1(%rax), %edi #,
452 .LVL37:
453 0176 7412 je .L21 #,
454 # main.cpp:62: dataArr[j] += 1;
455 .loc 4 62 0
456 0178 8344BD04 addl $1, 4(%rbp,%rdi,4) #, MEM[(value_type &)_122 + 4]
456 01
457 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
458 .loc 4 61 0
459 017d 83C002 addl $2, %eax #,
460 .LVL38:
461 0180 83FA02 cmpl $2, %edx #, tmp.146
462 0183 7405 je .L21 #,
463 # main.cpp:62: dataArr[j] += 1;
464 .loc 4 62 0
465 0185 83448504 addl $1, 4(%rbp,%rax,4) #, MEM[(value_type &)_160 + 4]
465 01
466 .LVL39:
467 .L21:
468 .LBE930:
469 # main.cpp:60: for (unsigned int i = 0; i < 10000000; ++i) {
60:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
470 .loc 4 60 0
471 018a 83EE01 subl $1, %esi #, ivtmp_3
472 018d 0F856DFF jne .L22 #,
472 FFFF
With PAD_ALIGNMENT:
57:main.cpp **** void processData() {
58:main.cpp **** // Add 1 to every element a bunch of times.
59:main.cpp **** std::cout << "Processing data." << std::endl;
60:main.cpp **** for (unsigned int i = 0; i < 10000000; ++i) {
61:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
62:main.cpp **** dataArr[j] += 1;
410 .loc 4 62 0
411 0109 83454401 addl $1, 68(%rbp) #, MEM[(value_type &)this_8(D) + 68]
412 .LVL32:
413 010d 4183FD01 cmpl $1, %r13d #, prolog_loop_niters.140
414 0111 0F841901 je .L29 #,
414 0000
415 0117 83454801 addl $1, 72(%rbp) #, MEM[(value_type &)this_8(D) + 68]
416 .LVL33:
417 011b 4183FD02 cmpl $2, %r13d #, prolog_loop_niters.140
418 011f 0F842801 je .L30 #,
418 0000
419 0125 83454C01 addl $1, 76(%rbp) #, MEM[(value_type &)this_8(D) + 68]
420 .LVL34:
421 0129 BF610000 movl $97, %edi #, ivtmp_12
421 00
422 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
423 .loc 4 61 0
424 012e 41BA0300 movl $3, %r10d #, j
424 0000
425 .LVL35:
426 .L18:
427 0134 31C0 xorl %eax, %eax # ivtmp.156
428 0136 31D2 xorl %edx, %edx # ivtmp.153
429 0138 0F1F8400 .p2align 4,,10
429 00000000
430 .p2align 3
431 .L20:
432 0140 83C201 addl $1, %edx #, ivtmp.153
433 # main.cpp:62: dataArr[j] += 1;
434 .loc 4 62 0
435 0143 66410F6F movdqa (%r14,%rax), %xmm0 # MEM[base: vectp_this.148_118, index: ivtmp.156_48, offset: 0], vect__2
435 0406
436 0149 660FFEC1 paddd %xmm1, %xmm0 # tmp231, vect__25.150
437 014d 410F2904 movaps %xmm0, (%r14,%rax) # vect__25.150, MEM[base: vectp_this.148_118, index: ivtmp.156_48, offse
437 06
438 0152 4883C010 addq $16, %rax #, ivtmp.156
439 0156 4439FA cmpl %r15d, %edx # bnd.143, ivtmp.153
440 0159 72E5 jb .L20 #,
441 015b 4429E7 subl %r12d, %edi # niters_vector_mult_vf.144, ivtmp_12
442 015e 4439E3 cmpl %r12d, %ebx # niters_vector_mult_vf.144, niters.142
443 0161 438D0422 leal (%r10,%r12), %eax #, tmp.145
444 0165 89FA movl %edi, %edx # ivtmp_12, tmp.146
445 0167 7421 je .L21 #,
446 .LVL36:
447 0169 89C7 movl %eax, %edi # tmp.145, tmp.145
448 016b 8344BD44 addl $1, 68(%rbp,%rdi,4) #, MEM[(value_type &)_129 + 68]
448 01
449 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
450 .loc 4 61 0
451 0170 83FA01 cmpl $1, %edx #, tmp.146
452 0173 8D7801 leal 1(%rax), %edi #,
453 .LVL37:
454 0176 7412 je .L21 #,
455 # main.cpp:62: dataArr[j] += 1;
456 .loc 4 62 0
457 0178 8344BD44 addl $1, 68(%rbp,%rdi,4) #, MEM[(value_type &)_122 + 68]
457 01
458 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
459 .loc 4 61 0
460 017d 83C002 addl $2, %eax #,
461 .LVL38:
462 0180 83FA02 cmpl $2, %edx #, tmp.146
463 0183 7405 je .L21 #,
464 # main.cpp:62: dataArr[j] += 1;
465 .loc 4 62 0
466 0185 83448544 addl $1, 68(%rbp,%rax,4) #, MEM[(value_type &)_160 + 68]
466 01
467 .LVL39:
468 .L21:
469 .LBE930:
470 # main.cpp:60: for (unsigned int i = 0; i < 10000000; ++i) {
60:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
471 .loc 4 60 0
472 018a 83EE01 subl $1, %esi #, ivtmp_3
473 018d 0F856DFF jne .L22 #,
473 FFFF
An instance of type SideProcessor has the following fields:
std::atomic<bool> processingRequested;
#ifdef PAD_ALIGNMENT
std::array<bool, 64> padding;
#endif
std::array<int, 100> dataArr;
The size of processingRequested is probably one byte. Without PAD_ALIGNMENT, the compiler will probably arrange the fields such that the first few elements of dataArr are in the same 64-byte cache line as processingRequested. However, with PAD_ALIGNMENT, there will be a 64-byte gap between the two fields, so the first element of the array and processingRequested will be in different cache lines.
Considering the loop in processData in isolation, one would expect all 100 elements of dataArr to easily fit in the L1D cache, so the vast majority of accesses should hit in the L1D. However, the main thread reads processingRequested in while (!(sideProcessor.isDone())) { } concurrently with the processing thread executing the loop in processData. Without PAD_ALIGNMENT, the main thread wants to read from the same cache line that the processing thread wants to both read and write. This results in a false sharing situation where the shared cache line repeatedly bounces between the private caches of the two cores on which the threads are running.
With false sharing between two cores in the same LLC sharing domain, there will be a negligible number of misses by the LLC (it can backstop the requests so they don't go to DRAM), but there will be a lot of read and RFO requests from the two cores. That's why the LLC miss event counts are small.
It appears to me that the compiler has unrolled the loop in processData four times and vectorized it using 16-byte SSE instructions. This would explain why the number of stores is close to a quarter of a billion. Without PAD_ALIGNMENT, the number of loads is about a billion, about a quarter of which is from the processing thread and most of the rest are from the main thread. The number of loads executed in while (!(sideProcessor.isDone())) { } depends on the time it takes to complete the execution of processData. So it makes sense that the number of loads is much smaller in the case of no false sharing (with PAD_ALIGNMENT).
In the case without PAD_ALIGNMENT, most of the L1-dcache-load-misses and LLC-loads events are from requests generated by the main thread while most of the LLC-stores events are from requests generated by the processing thread. All of these requests are to the line containing processingRequested. It makes sense that LLC-stores is much larger than LLC-loads because the main thread accesses the line more rapidly than the processing thread, so it's more likely that the RFOs miss in the private caches of the core on which the processing thread is running. I think also most of the L1-dcache-load-misses events represent loads from the main thread to the shared cache line. It looks like only a third of these loads miss in the private L2 cache, which suggests that the line is being prefetched into the L2. This can be verified by disabling the L2 prefetchers and checking whether L1-dcache-load-misses is about equal to LLC-loads.

If Else construct

if (a == 1)
//do something
else if (a == 2)
//do something
else if (a == 3)
//do something
else if (a == 4)
//do something
else if (a == 5)
//do something
else if (a == 6)
//do something
else if (a == 7)
//do something
else if (a == 8)
//do something
Now imagine, we know that a will mostly be 7 and we execute this block of code several times in a program. Will moving the (a == 7 ) check to top improve any time performance? That is:
if (a == 7)
//do something
else if (a == 1)
//do something
else if (a == 2)
//do something
else if (a == 3)
//do something
and so on. Does it improve anything or it's just wishful thinking?
You can use switch case for improving the performance of program
switch (a)
{
case 1:
break;
case 2:
break;
case 3:
break;
case 4:
break;
case 5:
break;
case 6:
break;
case 7:
break;
}
Since the if conditions are checked in the order specified, yes. Whether it is measurable (and, hence, should you care) will depend on how many times that portion of the code is called.
Imagine you go to a hotel and are given a room with number 7.
You have to go across the hall checking every room until you find the room with number 7.
Will the time taken depend on how many rooms you checked before you got to the one you have been allotted?
Yes..
But know this, in your scenario the time difference will be very very minute to be noticed.
For scenarios where there are many values to be checked, putting the one that occurs most often at the beginning does improve performance. In fact, this approach is used by some network protocols for comparing protocol numbers.
There is some penalty to be paid when the compiler cannot turn the construct into a jump table. I would expect the switch/case implementation to be compiled to a jump table in assembly and the if/else chain not to be, in which case switch/case has an edge over if/else. Again, I guess this depends on the architecture and the compiler.
In case of switch/case compiler will be able to generate the asm jump table only based on the constants (eg. consecutive values) that we provide.
The test i ran on my machine gave the assembly for if/else as this (not jump table) ,
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $7, -4(%rbp)
cmpl $1, -4(%rbp)
jne .L2
movl $97, %edi
call putchar
jmp .L3
.L2:
**cmpl $2, -4(%rbp)
jne .L4**
movl $97, %edi
call putchar
jmp .L3
.L4:
**cmpl $3, -4(%rbp)
jne .L5**
movl $97, %edi
call putchar
jmp .L3
.L5:
**cmpl $4, -4(%rbp)
jne .L6**
movl $97, %edi
call putchar
jmp .L3
.L6:
**cmpl $5, -4(%rbp)
jne .L7**
movl $97, %edi
call putchar
jmp .L3
.L7:
**cmpl $6, -4(%rbp)
jne .L8**
movl $97, %edi
call putchar
jmp .L3
.L8:
cmpl $7, -4(%rbp)
jne .L9
movl $97, %edi
call putchar
jmp .L3
.L9:
cmpl $8, -4(%rbp)
jne .L3
movl $97, %edi
call putchar
.L3:
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
But for switch/case (jump table),
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $7, -4(%rbp)
cmpl $7, -4(%rbp)
ja .L2
movl -4(%rbp), %eax
movq .L4(,%rax,8), %rax
jmp *%rax
.section .rodata
.align 8
.align 4
.L4:
.quad .L2
.quad .L3
.quad .L5
.quad .L6
.quad .L7
.quad .L8
.quad .L9
.quad .L10
.text
.L3:
movl $97, %edi
call putchar
jmp .L2
.L5:
movl $97, %edi
call putchar
jmp .L2
.L6:
movl $97, %edi
call putchar
jmp .L2
.L7:
movl $97, %edi
call putchar
jmp .L2
.L8:
movl $97, %edi
call putchar
jmp .L2
.L9:
movl $97, %edi
call putchar
jmp .L2
.L10:
movl $97, %edi
call putchar
nop
.L2:
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
From the tests i feel that switch/case is better as it doesn't have to go through the earlier entries to find a match.
I would suggest to try gcc -S option to generate the assembly to check the asm to see.
TL;DR version
For so few values, any differences in speed will be immeasurably small, and you'd be better off sticking with the more straightforward, easier-to-understand version. It isn't until you need to start searching through tables containing thousands to millions of entries that you'll want something smarter than a linear ordered search.
James Michener Version
Another possibility not yet mentioned is to do a partitioned search, like so:
if ( a > 4 )
{
if ( a > 6 )
{
if ( a == 7 ) // do stuff
else // a == 8, do stuff
}
else
{
if ( a == 5 ) // do stuff
else // a == 6, do stuff
}
}
else
{
if ( a > 2 )
{
if ( a == 3 ) // do stuff
else // a == 4, do stuff
}
else
{
if ( a == 1 ) // do stuff
else // a == 2, do stuff
}
}
No more than three tests are performed for any value of a. Of course, no less than three tests are performed for any value of a, either. On average, it should give better performance than the naive 1-8 search when the majority of inputs are 7, but...
As with all things performance-related, the rule is measure, don't guess. Code up different versions, profile them, analyze the results. For testing against so few values, it's going to be hard to get reliable numbers; you'll need to execute each method thousands of times for a given value just to get a useful non-zero time measurement (it also means that any difference between the methods will be ridiculously small).
Stuff like this can also be affected by compiler optimization settings. You'll want to build at different optimization levels and re-run your tests.
Just for giggles, I coded up my own version measuring several different approaches:
naive - the straightforward test from 1 to 8 in order;
sevenfirst - check for 7 first, then 1 - 6 and 8;
eightfirst - check from 8 to 1 in reverse order;
partitioned - use the partitioned search above;
switcher - use a switch statement instead of if-else;
I used the following test harness:
/*
 * Timing harness: for each of 20 test rounds and each of the 5 dispatch
 * functions, draws 1000 values from generate(), calls the function 10000
 * times per value, and prints the elapsed time in microseconds summed over
 * all 1000 batches. Finishes with the per-function average over the rounds.
 */
int main( void )
{
  size_t counter[9] = {0};
  struct timeval start, end;
  unsigned long total_usec;   /* microseconds accumulated for one function in one round */
  void (*funcs[])(int, size_t *) = { naive, sevenfirst, eightfirst, partitioned, switcher };

  srand(time(NULL));

  printf("%15s %15s %15s %15s %15s %15s\n", "test #", "naive", "sevenfirst", "eightfirst", "partitioned", "switcher" );
  printf("%15s %15s %15s %15s %15s %15s\n", "------", "-----", "----------", "----------", "-----------", "--------" );

  unsigned long times[5] = {0};

  for ( size_t t = 0; t < 20; t++ )
  {
    printf( "%15zu ", t );
    for ( size_t f = 0; f < 5; f ++ )
    {
      total_usec = 0;
      for ( size_t i = 0; i < 1000; i++ )
      {
        int a = generate();

        gettimeofday( &start, NULL );
        for ( size_t j = 0; j < 10000; j++ )
          (*funcs[f])( a, counter );
        gettimeofday( &end, NULL );

        /*
         * Accumulate inside the batch loop so every batch contributes, not
         * just the last one. tv_sec is included so a batch that straddles a
         * second boundary does not yield a negative (wrapping) difference.
         */
        long elapsed = (long)(end.tv_sec - start.tv_sec) * 1000000L
                     + (long)(end.tv_usec - start.tv_usec);
        total_usec += (unsigned long) elapsed;
      }
      printf( "%15lu ", total_usec );
      times[f] += total_usec;

      /* Reset the hit counters so the next function starts from zero. */
      memset( counter, 0, sizeof counter );
    }
    putchar('\n');
  }

  putchar ('\n');
  printf( "%15s ", "average:" );
  for ( size_t i = 0; i < 5; i++ )
    printf( "%15f ", (double) times[i] / 20 );
  putchar ('\n' );

  return 0;
}
The generate function produces random numbers from 1 through 8, weighted so that 7 appears half the time. I run each method 10000 times per generated value to get measurable times, for 1000 generated values.
I didn't want the performance difference between the various control structures to get swamped by the // do stuff code, so each case just increments a counter, such as
if ( a == 1 )
counter[1]++;
This also gave me a way to verify that my number generator was working properly.
I run through the whole sequence 20 times and average the results. Even so, the numbers can vary a bit from run to run, so don't trust them too deeply. If nothing else, they show that changes at this level don't result in huge improvements. For example:
test # naive sevenfirst eightfirst partitioned switcher
------ ----- ---------- ---------- ----------- --------
0 121 100 118 119 111
1 110 100 131 120 115
2 110 100 125 121 111
3 115 125 117 105 110
4 120 116 125 110 115
5 129 100 110 106 116
6 115 176 105 106 115
7 111 100 111 106 110
8 139 100 106 111 116
9 125 100 136 106 111
10 106 100 105 106 111
11 126 112 135 105 115
12 116 120 135 110 115
13 120 105 106 111 115
14 120 105 105 106 110
15 100 131 106 118 115
16 106 113 116 111 110
17 106 105 105 118 111
18 121 113 103 106 115
19 130 101 105 105 116
average: 117.300000 111.100000 115.250000 110.300000 113.150000
Numbers are in microseconds. The code was built using gcc 4.1.2 with no optimization running on a SLES 10 system (see note 1 below).
So, running each method 10000 times for 1000 values, averaged over 20 runs, gives a total variation of about 7 μsec. That's really not worth getting exercised over. For something that's only searching among 8 distinct values and isn't going to run more than "several times", you're not going to see any measurable improvement in performance regardless of the method used. Stick with the method that's the easiest to read and understand.
Now, for searching a table containing several hundreds to thousands to millions of entries, you definitely want to use something smarter than a linear search.
1. It should go without saying, but the results above are only valid for this code built with this specific compiler and running on this specific system.
There should be a slight difference, but this would depend on the platform and the nature of the comparison - different compilers may optimise something like this differently, different architectures will have different effects as well, and it also depends on what the comparison actually is (if it is a more complex comparison than a simple primitive type comparison, for example)
It is probably good practice to test the specific case you are actually going to use, IF this is something that is likely to actually be a performance bottleneck.
Alternatively, a switch statement, if usable, should have the same performance for any value independent of order because it is implemented using an offset in memory as opposed to successive comparisons.
Probably not, you still have the same number of conditions and still either of them might evaluate to true, even if you check for a == 7 first, all other conditions are potentially true therefore will be evaluated.
The block of code that is executed if a == 7 might be executed quicker when the program runs - but essentially your code is still the same, with the same number of statements.

GCC generates undesired assembly code

I'm working on vectorizing loops, and GCC is giving me a hard time.
When I look at the assembly code it generates, I see a lot of strange lines that I would like to get rid of.
For example, with vectorization, I've learnt that you can avoid a lot of extra assembly lines by giving additional information to GCC about array alignment.
http://locklessinc.com/articles/vectorize/
Here is my experiment.
#define SIZE 1024
// ORs the first element of a with the first element of b and stores the
// result in the first element of comp. No other elements are touched.
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    const uint16_t merged = a[0] | b[0];
    comp[0] = merged;
}
Generates simple assembly:
.globl _ZN8Test_LUT7performEv
23 _ZN8Test_LUT7performEv:
24 .LFB664:
25 .cfi_startproc
26 0020 488B4710 movq 16(%rdi), %rax
27 0024 488B4F08 movq 8(%rdi), %rcx
28 0028 488B5720 movq 32(%rdi), %rdx
29 002c 0FB700 movzwl (%rax), %eax
30 002f 660B01 orw (%rcx), %ax
31 0032 668902 movw %ax, (%rdx)
32 0035 C3 ret
33 .cfi_endproc
But, even if I was expecting a few extra lines, I am very surprised by what I got after adding a loop :
#define SIZE 1024
// Element-wise OR: for each of the first SIZE indices, stores a[i] | b[i]
// into comp[i], in ascending index order.
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    int idx = 0;
    while (idx < SIZE) {
        comp[idx] = a[idx] | b[idx];
        ++idx;
    }
}
Generates this assembly with a lot more lines:
233 _Z10itwillworkPKtS0_Pt:
234 .LFB663:
235 .cfi_startproc
236 0250 488D4210 leaq 16(%rdx), %rax
237 0254 488D4E10 leaq 16(%rsi), %rcx
238 0258 4839F0 cmpq %rsi, %rax
239 025b 410F96C0 setbe %r8b
240 025f 4839CA cmpq %rcx, %rdx
241 0262 0F93C1 setnb %cl
242 0265 4108C8 orb %cl, %r8b
243 0268 743E je .L55
244 026a 4839F8 cmpq %rdi, %rax
245 026d 488D4710 leaq 16(%rdi), %rax
246 0271 0F96C1 setbe %cl
247 0274 4839C2 cmpq %rax, %rdx
248 0277 0F93C0 setnb %al
249 027a 08C1 orb %al, %cl
250 027c 742A je .L55
251 027e 31C0 xorl %eax, %eax
252 .p2align 4,,10
253 .p2align 3
254 .L57:
255 0280 F30F6F0C movdqu (%rsi,%rax), %xmm1
255 06
256 0285 F30F6F04 movdqu (%rdi,%rax), %xmm0
256 07
257 028a 660FEBC1 por %xmm1, %xmm0
258 028e F30F7F04 movdqu %xmm0, (%rdx,%rax)
258 02
259 0293 4883C010 addq $16, %rax
260 0297 483D0008 cmpq $2048, %rax
260 0000
261 029d 75E1 jne .L57
262 029f F3C3 rep ret
263 .p2align 4,,10
264 02a1 0F1F8000 .p2align 3
264 000000
265 .L55:
266 02a8 31C0 xorl %eax, %eax
267 02aa 660F1F44 .p2align 4,,10
267 0000
268 .p2align 3
269 .L58:
270 02b0 0FB70C06 movzwl (%rsi,%rax), %ecx
271 02b4 660B0C07 orw (%rdi,%rax), %cx
272 02b8 66890C02 movw %cx, (%rdx,%rax)
273 02bc 4883C002 addq $2, %rax
274 02c0 483D0008 cmpq $2048, %rax
274 0000
275 02c6 75E8 jne .L58
276 02c8 F3C3 rep ret
277 .cfi_endproc
Both were compiled with gcc 4.8.4 in release mode, -O2 -ftree-vectorize -msse2.
Can somebody help me get rid of those lines? Or, if it's impossible, can you tell me why they are there ?
Update :
I've tried the tricks there http://locklessinc.com/articles/vectorize/, but I get another issue:
#define SIZE 1024
// Element-wise OR over the first SIZE entries: walks the three arrays in
// lockstep from the front and stores *a | *b into *comp at each step.
void itwillwork (const uint16_t * a, const uint16_t * b, uint16_t * comp) {
    for (const uint16_t * const stop = a + SIZE; a != stop; ++a, ++b, ++comp)
        *comp = *a | *b;
}
A few assembly lines are generated for this function, I get it.
But when I call this function from somewhere else :
itwillwork(a,b,c);
There is no call instruction : the long list of instructions of "itwillwork" (the same as above) are used directly.
Am I missing something ? (the "extra lines" are the problem, not the inline call)
You are getting "weird" code because GCC cannot assume that the output array comp does not overlap a or b, so you can see that it first performs a runtime overlap test on the pointers to determine whether it can take the fast path and do 128 bits at a time, or must take the slow path and do 16 bits at a time.
Additionally, the reason you are finding the code repeated is because the compiler is applying an inlining optimisation. You could disable this with the __attribute((noinline)) spec but if performance is your goal, let the compiler inline it.
If you specify the __restrict keyword then the compiler will only generate the fast-path code: https://goo.gl/g3jUfQ
However, this does not mean the compiler is going to magically take care of alignment for you so take care of what you pass to the function.

Do references occupy space on stack

I read that in the following case:
int i = 12;
int &a = i;
a will not occupy a space on the stack as it is an alias of i;
My question is: suppose it's a parameter, as in
void funct(foo& a , int b)
{
}
When the function is created, will a occupy space on the stack?
A reference is more or less like a pointer at this level and the following
#include <stdio.h>
#include <stdlib.h>
// Minimal aggregate holding a single int; it is the type passed by reference to funct below.
struct foo{
int val;
};
// Returns the val member of the foo object bound to the reference parameter a.
// b is received by value and is not used in the body.
int funct(foo& a, int b)
{
return a.val;
}
// Creates a local foo, sets its val to 92, and calls funct with it;
// the value returned by funct is discarded.
int main(void) {
foo obj;
obj.val = 92;
funct(obj, 22); // 22 is passed by value, obj is passed by reference
return EXIT_SUCCESS;
}
gets translated to:
.Ltext0:
.globl _Z5functR3fooi // funct()
_Z5functR3fooi:
.LFB2:
.cfi_startproc
0000 55 pushq %rbp // some stack bookkeeping
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0001 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0004 48897DF8 movq %rdi, -8(%rbp) <-- move the address on the stack frame
0008 8975F4 movl %esi, -12(%rbp) <-- move the value on the stack frame
000b 488B45F8 movq -8(%rbp), %rax <-- get the address from the stack frame
000f 8B00 movl (%rax), %eax <-- use it
0011 5D popq %rbp
.cfi_def_cfa 7, 8
0012 C3 ret
.cfi_endproc
.LFE2:
.globl main
main:
.LFB3:
.cfi_startproc // Main
0013 55 pushq %rbp // Stack bookkeeping
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0014 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0017 4883EC10 subq $16, %rsp
.LBB2:
001b C745F05C movl $92, -16(%rbp) <-- save 92 (the entire POD struct) on the stack frame
000000
0022 488D45F0 leaq -16(%rbp), %rax <-- get the pointer to the stack frame where the obj is
0026 BE160000 movl $22, %esi <-- save the value in a register
00
002b 4889C7 movq %rax, %rdi <-- address of the stack frame to the object
002e E8000000 call _Z5functR3fooi // funct() call
00
0033 B8000000 movl $0, %eax
00
.LBE2:
0038 C9 leave
.cfi_def_cfa 7, 8
0039 C3 ret
.cfi_endproc
.LFE3:
.Letext0:
Of course keep in mind that this is just an implementation (gcc's one to be precise) without any optimization. It depends on the compiler how this really works.
(asm generated by http://assembly.ynh.io/)
According to the standard, whether or not a reference requires storage is unspecified. So it depends on the implementation.
A reference is often implemented as an address in assembly. However, a reference is neither a pointer to an object nor a copy of the object. A reference is the object.

Able to change value of const in C, but not in C++

Consider the following code
#include <stdio.h>
#include <string.h>
// Demonstration from the question: writes to a const int through a pointer
// obtained by round-tripping its address via a long.
main()
{
const int a = 2;
long p = (long)&a; // address of a stored as an integer; the const qualifier is lost here
int *c = (int *)p; // converted back to a plain (non-const) int pointer
*c =3; // write through that pointer to the const object (undefined behaviour, see the answers below)
printf("%d", a);
}
This code can change the value of a in C but not in C++. I understand that C++ is applying an optimization and replacing instances of a with 2. So was this a bug fix in C++, or was the bug fixed by chance due to optimization?
It's undefined behavior to modify a const value, whether directly or indirectly. This may compile in C and may even run without problems on your machine, but it's still undefined behavior.
The difference between C and C++ on this is: with const int a = 2, C++ treats a as a constant expression, for instance, you can use a as array dimension:
int n[a]; //fine in C++
But in C, a is not a constant expression, with the same code:
int n[a]; //VLA in C99
Here n is not a fixed-sized array, but a variable length array.
This is not a C vs C++ issue. By modifying a const value (as well as by double-casting a pointer via a long), you enter the realm of undefined behaviour in both languages. Therefore the difference is simply a matter of how the undefined behaviour chooses to manifest itself.
You are casting away the constness out of &a and modifying the pointed value, which is undefined behavior both in C and in C++ (the trip through long just adds some more gratuitous UB). In C++ your compiler happens to optimize more aggressively the constant, but the point of the situation is unchanged.
Your code has undefined behavior in C++, since you're accessing memory you shouldn't:
#include <stdio.h>
#include <string.h>
// Same experiment as in the question, with an added printout of the two sizes
// involved in the pointer -> long -> pointer round trip.
int main(void)
{
const int a = 2;
// sizeof yields size_t, which does not match %x; cast so the arguments agree with the format.
printf("%x != %x !!", (unsigned)sizeof(long), (unsigned)sizeof(void*)); // e.g. 4 != 8 on a 64-bit target where long is 32 bits
long p = (long)&a; // the address may be truncated when long is narrower than a pointer
int *c = (int *)p;
*c =3; // still undefined behaviour: a is a const object
printf("%d", a);
return 0;
}
And even if it works on a 32-bit system, modifying const memory by casting away the constness is undefined behavior in both languages.
The following is the assembly code generated by g++. The compiler statically uses "$2" instead of "a", but in the case of gcc it doesn't perform any such static substitution. I guess there shouldn't be any undefined behaviour.
.Ltext0:
.section .rodata
.LC0:
0000 256400 .string "%d"
.text
.globl main
main:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA0
0000 55 pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0001 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0004 4883EC20 subq $32, %rsp
.LBB2:
0008 C745EC02 movl $2, -20(%rbp)
000000
000f 488D45EC leaq -20(%rbp), %rax
0013 488945F0 movq %rax, -16(%rbp)
0017 488B45F0 movq -16(%rbp), %rax
001b 488945F8 movq %rax, -8(%rbp)
001f 488B45F8 movq -8(%rbp), %rax
0023 C7000300 movl $3, (%rax)
0000
0029 488B45F8 movq -8(%rbp), %rax
002d 8B00 movl (%rax), %eax
002f 89C6 movl %eax, %esi
0031 BF000000 movl $.LC0, %edi
00
0036 B8000000 movl $0, %eax
00
.LEHB0:
003b E8000000 call printf
00
0040 BE020000 movl $2, %esi
00
0045 BF000000 movl $.LC0, %edi
00
004a B8000000 movl $0, %eax
00
004f E8000000 call printf
00
.LEHE0:
0054 B8000000 movl $0, %eax
00
0059 EB08 jmp .L5
.L4:
005b 4889C7 movq %rax, %rdi
.LEHB1:
005e E8000000 call _Unwind_Resume
00
.LEHE1:
.L5:
.LBE2:
0063 C9 leave
.cfi_def_cfa 7, 8
0064 C3 ret
.cfi_endproc
.LFE0:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA0:
0000 FF .byte 0xff
0001 FF .byte 0xff
0002 01 .byte 0x1
0003 08 .uleb128 .LLSDACSE0-.LLSDACSB0
.LLSDACSB0:
0004 3B .uleb128 .LEHB0-.LFB0
0005 19 .uleb128 .LEHE0-.LEHB0
0006 5B .uleb128 .L4-.LFB0
0007 00 .uleb128 0
0008 5E .uleb128 .LEHB1-.LFB0
0009 05 .uleb128 .LEHE1-.LEHB1
000a 00 .uleb128 0
000b 00 .uleb128 0
.LLSDACSE0:
.text
.Letext0: