if (a == 1)
//do something
else if (a == 2)
//do something
else if (a == 3)
//do something
else if (a == 4)
//do something
else if (a == 5)
//do something
else if (a == 6)
//do something
else if (a == 7)
//do something
else if (a == 8)
//do something
Now imagine we know that a will mostly be 7 and that we execute this block of code several times in a program. Will moving the (a == 7) check to the top improve performance? That is:
if (a == 7)
//do something
else if (a == 1)
//do something
else if (a == 2)
//do something
else if (a == 3)
//do something
and so on. Does it improve anything, or is it just wishful thinking?
You can use a switch statement to improve the performance of the program:
switch (a)
{
case 1:
// do something
break;
case 2:
// do something
break;
case 3:
// do something
break;
case 4:
// do something
break;
case 5:
// do something
break;
case 6:
// do something
break;
case 7:
// do something
break;
}
Since the if conditions are checked in the order specified, yes. Whether it is measurable (and, hence, whether you should care) will depend on how many times that portion of the code is called.
Imagine you go to a hotel and are given a room with number 7.
You have to go across the hall checking every room until you find the room with number 7.
Will the time taken depend on how many rooms you checked before you got to the one you have been allotted?
Yes.
But know this: in your scenario, the time difference will be far too small to notice.
For scenarios where there are many values to check, putting the one that occurs most often at the beginning does improve performance. In fact, this technique is used by some network protocols for comparing protocol numbers.
There is a penalty to be paid if the compiler cannot turn the construct into a jump table. I would expect the switch/case implementation to be compiled as a jump table in assembly; if the if-else chain is not, then switch/case has an edge over if-else. Again, I guess this depends on the architecture and the compiler.
In the case of switch/case, the compiler can generate the asm jump table only from suitable constants (e.g. consecutive case values).
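A test program along the following lines (reconstructed here from the assembly listings below, which set a to 7 and call putchar with 97, i.e. 'a', in every branch) exercises the if/else form; the switch variant simply replaces the chain with a switch on a:
#include <stdio.h>

int main(void)
{
    int a = 7;
    if (a == 1) putchar('a');
    else if (a == 2) putchar('a');
    else if (a == 3) putchar('a');
    else if (a == 4) putchar('a');
    else if (a == 5) putchar('a');
    else if (a == 6) putchar('a');
    else if (a == 7) putchar('a');
    else if (a == 8) putchar('a');
    return 0;
}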
The test I ran on my machine gave the following assembly for if/else (not a jump table):
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $7, -4(%rbp)
cmpl $1, -4(%rbp)
jne .L2
movl $97, %edi
call putchar
jmp .L3
.L2:
cmpl $2, -4(%rbp)
jne .L4
movl $97, %edi
call putchar
jmp .L3
.L4:
cmpl $3, -4(%rbp)
jne .L5
movl $97, %edi
call putchar
jmp .L3
.L5:
cmpl $4, -4(%rbp)
jne .L6
movl $97, %edi
call putchar
jmp .L3
.L6:
cmpl $5, -4(%rbp)
jne .L7
movl $97, %edi
call putchar
jmp .L3
.L7:
cmpl $6, -4(%rbp)
jne .L8
movl $97, %edi
call putchar
jmp .L3
.L8:
cmpl $7, -4(%rbp)
jne .L9
movl $97, %edi
call putchar
jmp .L3
.L9:
cmpl $8, -4(%rbp)
jne .L3
movl $97, %edi
call putchar
.L3:
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
But for switch/case (jump table),
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $7, -4(%rbp)
cmpl $7, -4(%rbp)
ja .L2
movl -4(%rbp), %eax
movq .L4(,%rax,8), %rax
jmp *%rax
.section .rodata
.align 8
.align 4
.L4:
.quad .L2
.quad .L3
.quad .L5
.quad .L6
.quad .L7
.quad .L8
.quad .L9
.quad .L10
.text
.L3:
movl $97, %edi
call putchar
jmp .L2
.L5:
movl $97, %edi
call putchar
jmp .L2
.L6:
movl $97, %edi
call putchar
jmp .L2
.L7:
movl $97, %edi
call putchar
jmp .L2
.L8:
movl $97, %edi
call putchar
jmp .L2
.L9:
movl $97, %edi
call putchar
jmp .L2
.L10:
movl $97, %edi
call putchar
nop
.L2:
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
From these tests, I feel that switch/case is better, as it doesn't have to step through the earlier entries to find a match.
I would suggest using the gcc -S option to generate the assembly and inspecting it yourself.
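For example (the file name here is illustrative):
gcc -S -o test.s test.c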
TL;DR version
For so few values, any differences in speed will be immeasurably small, and you'd be better off sticking with the more straightforward, easier-to-understand version. It isn't until you need to start searching through tables containing thousands to millions of entries that you'll want something smarter than a linear ordered search.
James Michener Version
Another possibility not yet mentioned is to do a partitioned search, like so:
if ( a > 4 )
{
if ( a > 6 )
{
if ( a == 7 ) // do stuff
else // a == 8, do stuff
}
else
{
if ( a == 5 ) // do stuff
else // a == 6, do stuff
}
}
else
{
if ( a > 2 )
{
if ( a == 3 ) // do stuff
else // a == 4, do stuff
}
else
{
if ( a == 1 ) // do stuff
else // a == 2, do stuff
}
}
No more than three tests are performed for any value of a. Of course, no fewer than three tests are performed for any value of a, either. On average, it should give better performance than the naive 1-8 search when the majority of inputs are 7, but...
As with all things performance-related, the rule is measure, don't guess. Code up different versions, profile them, analyze the results. For testing against so few values, it's going to be hard to get reliable numbers; you'll need to execute each method thousands of times for a given value just to get a useful non-zero time measurement (it also means that any difference between the methods will be ridiculously small).
Stuff like this can also be affected by compiler optimization settings. You'll want to build at different optimization levels and re-run your tests.
Just for giggles, I coded up my own version measuring several different approaches:
naive - the straightforward test from 1 to 8 in order;
sevenfirst - check for 7 first, then 1 - 6 and 8;
eightfirst - check from 8 to 1 in reverse order;
partitioned - use the partitioned search above;
switcher - use a switch statement instead of if-else;
I used the following test harness:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>

/* the tested functions and the weighted generator, defined elsewhere */
void naive(int, size_t *);
void sevenfirst(int, size_t *);
void eightfirst(int, size_t *);
void partitioned(int, size_t *);
void switcher(int, size_t *);
int generate(void);

int main( void )
{
size_t counter[9] = {0};
struct timeval start, end;
unsigned long total_nsec;
void (*funcs[])(int, size_t *) = { naive, sevenfirst, eightfirst, partitioned, switcher };
srand(time(NULL));
printf("%15s %15s %15s %15s %15s %15s\n", "test #", "naive", "sevenfirst", "eightfirst", "partitioned", "switcher" );
printf("%15s %15s %15s %15s %15s %15s\n", "------", "-----", "----------", "----------", "-----------", "--------" );
unsigned long times[5] = {0};
for ( size_t t = 0; t < 20; t++ )
{
printf( "%15zu ", t );
for ( size_t f = 0; f < 5; f ++ )
{
total_nsec = 0;
for ( size_t i = 0; i < 1000; i++ )
{
int a = generate();
gettimeofday( &start, NULL );
for ( size_t j = 0; j < 10000; j++ )
(*funcs[f])( a, counter );
gettimeofday( &end, NULL );
}
total_nsec += end.tv_usec - start.tv_usec;
printf( "%15lu ", total_nsec );
times[f] += total_nsec;
memset( counter, 0, sizeof counter );
}
putchar('\n');
}
putchar ('\n');
printf( "%15s ", "average:" );
for ( size_t i = 0; i < 5; i++ )
printf( "%15f ", (double) times[i] / 20 );
putchar ('\n' );
return 0;
}
The generate function produces random numbers from 1 through 8, weighted so that 7 appears half the time. I run each method 10000 times per generated value to get measurable times, for 1000 generated values.
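The exact body of generate isn't shown here; a sketch consistent with that description (the weighting scheme below is an assumption) might be:
/* Return 1 through 8, with 7 appearing about half the time. */
int generate( void )
{
    if ( rand() % 2 )
        return 7;
    int r = rand() % 7 + 1;  /* uniform over 1..7 */
    return r == 7 ? 8 : r;   /* remap the duplicate 7 to 8 */
}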
I didn't want the performance difference between the various control structures to get swamped by the // do stuff code, so each case just increments a counter, such as
if ( a == 1 )
counter[1]++;
This also gave me a way to verify that my number generator was working properly.
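For reference, a minimal sketch of what one of the tested functions might have looked like (the exact bodies aren't shown here; the naive variant is just the ordered chain):
void naive( int a, size_t *counter )
{
    if ( a == 1 ) counter[1]++;
    else if ( a == 2 ) counter[2]++;
    else if ( a == 3 ) counter[3]++;
    else if ( a == 4 ) counter[4]++;
    else if ( a == 5 ) counter[5]++;
    else if ( a == 6 ) counter[6]++;
    else if ( a == 7 ) counter[7]++;
    else if ( a == 8 ) counter[8]++;
}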
I run through the whole sequence 20 times and average the results. Even so, the numbers can vary a bit from run to run, so don't trust them too deeply. If nothing else, they show that changes at this level don't result in huge improvements. For example:
test # naive sevenfirst eightfirst partitioned switcher
------ ----- ---------- ---------- ----------- --------
0 121 100 118 119 111
1 110 100 131 120 115
2 110 100 125 121 111
3 115 125 117 105 110
4 120 116 125 110 115
5 129 100 110 106 116
6 115 176 105 106 115
7 111 100 111 106 110
8 139 100 106 111 116
9 125 100 136 106 111
10 106 100 105 106 111
11 126 112 135 105 115
12 116 120 135 110 115
13 120 105 106 111 115
14 120 105 105 106 110
15 100 131 106 118 115
16 106 113 116 111 110
17 106 105 105 118 111
18 121 113 103 106 115
19 130 101 105 105 116
average: 117.300000 111.100000 115.250000 110.300000 113.150000
Numbers are in microseconds. The code was built using gcc 4.1.2 with no optimization, running on a SLES 10 system [1].
So, running each method 10000 times for 1000 values, averaged over 20 runs, gives a total variation of about 7 μsec. That's really not worth getting exercised over. For something that's only searching among 8 distinct values and isn't going to run more than "several times", you're not going to see any measurable improvement in performance regardless of the method used. Stick with the method that's the easiest to read and understand.
Now, for searching a table containing several hundreds to thousands to millions of entries, you definitely want to use something smarter than a linear search.
[1] It should go without saying, but the results above are only valid for this code, built with this specific compiler, and running on this specific system.
There should be a slight difference, but it will depend on the platform and the nature of the comparison: different compilers may optimise something like this differently, different architectures will have different effects as well, and it also depends on what the comparison actually is (if it is a more complex comparison than a simple primitive-type comparison, for example).
It is probably good practice to test the specific case you are actually going to use, if this is likely to be a real performance bottleneck.
Alternatively, a switch statement, if usable, should have the same performance for any value independent of order because it is implemented using an offset in memory as opposed to successive comparisons.
Probably not: you still have the same number of conditions, and any of them might evaluate to true. Even if you check for a == 7 first, the other conditions can still end up being evaluated on other inputs.
The block of code that is executed when a == 7 might be reached more quickly when the program runs, but essentially your code is still the same, with the same number of statements.
Related
I add debug info to the first and last instructions of basic blocks via an LLVM pass, and I can then find the info I added in the assembly code. But the counts of the first-instruction and last-instruction markers are different.
I want to know whether the result is correct and, if it is, why the counts are different and how to correctly get the boundaries of a basic block in the assembly code.
The code of my pass:
bool runOnFunction(Function &F) override {
unsigned start = 2333;  // marker line number for the first instruction
unsigned end = 23333;   // marker line number for the terminator
MDNode *N = F.getMetadata("dbg");
for (BasicBlock &B : F) {
errs() << "Hello: ";
Instruction *I = B.getFirstNonPHI();
DebugLoc loc = DebugLoc::get(start, 0, N);
if (loc && I!=NULL) {
I->setDebugLoc(loc);
} else {
errs() << "start error";
}
I = B.getTerminator();
loc = DebugLoc::get(end, 1, N);
if (loc && I!= NULL) {
I->setDebugLoc(loc);
} else {
errs() << "end error";
}
errs() << "\n";
}
return true;
}
};
}
It runs without any errors. Some of the results:
# %bb.2: # %for.body
# in Loop: Header=BB0_1 Depth=1
.loc 1 2333 0 # myls.c:2333:0
movl -4(%ebp), %eax
.Ltmp4:
.loc 1 71 14 # myls.c:71:14
movl %eax, -8(%ebp)
.Ltmp5:
.LBB0_3: # %for.cond1
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
.loc 1 2333 0 # myls.c:2333:0
movl -8(%ebp), %eax
.Ltmp6:
.loc 1 71 18 # myls.c:71:18
cmpl n, %eax
.Ltmp7:
.loc 1 23333 1 # myls.c:23333:1
jge .LBB0_13
......
.LBB2_10: # %sw.epilog
.loc 1 23333 1 # myls.c:23333:1
jmp .LBB2_11
.LBB2_11: # %while.cond
# =>This Inner Loop Header: Depth=1
.loc 1 162 13 # myls.c:162:13
cmpl $0, -164(%ebp)
.loc 1 23333 1 # myls.c:23333:1
jl .LBB2_20
# %bb.12: # %while.body
# in Loop: Header=BB2_11 Depth=1
.Ltmp75:
.loc 1 164 14 # myls.c:164:14
movl -136(%ebp), %eax
.loc 1 164 28 is_stmt 0 # myls.c:164:28
movl -164(%ebp), %ecx
# kill: def $cl killed $ecx
.loc 1 164 25 # myls.c:164:25
movl $1, %edx
shll %cl, %edx
.loc 1 164 22 # myls.c:164:22
andl %edx, %eax
cmpl $0, %eax
.Ltmp76:
.loc 1 23333 1 is_stmt 1 # myls.c:23333:1
je .LBB2_18
I find that the 2333 and 23333 markers don't match up, and their counts in the assembly code differ between architectures. I use opt to run my pass and llc to generate the assembly code.
I appreciate any help.
I think the problem is with optimization. After your pass, the IR goes through further rounds of optimization that change the basic blocks and eliminate some instructions.
You probably have a pass registration like the following, from the LLVM documentation:
static llvm::RegisterStandardPasses Y(
llvm::PassManagerBuilder::EP_EarlyAsPossible,
[](const llvm::PassManagerBuilder &Builder,
llvm::legacy::PassManagerBase &PM) { PM.add(new Hello()); });
Try using EP_OptimizerLast rather than EP_EarlyAsPossible; that will run your pass as the last optimizer.
The other option is to use EP_EnabledOnOptLevel0 and run opt with -O0.
You can look up the available extension-point flags in the LLVM documentation.
It might also be helpful to use -emit-llvm to generate the LLVM IR as a *.ll file; that makes it much more visible what your pass has done to the IR.
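For example (assuming a Clang toolchain; the file name matches the listings above):
clang -S -emit-llvm myls.c -o myls.ll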
I have to display all odd integers from 1 to 20:
My teacher says that this loop is wrong because it has no logic:
int i;
for (i=1;i<=20;i=i+2)
{
cout<<endl<<i;
}
According to him, this is right:
int i;
for (i=1;i<=20;i++)
{
if (i%2!=0)
{
cout<<endl<<i;
}
}
Why does he say that the first loop is wrong even though they both give the same result?
TL;DR Your version is clearly better in every possible way :). Your teacher's version loops 20 times, while your version loops only 10 times, so it does half the work (though if the compiler optimizes both loops away, there is no difference).
Note: you don't even need the = in the comparison, because i will never equal 20, only be less than it.
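That is, the same loop with the tightened bound:
int i;
for (i = 1; i < 20; i += 2)
{
    cout << endl << i;
}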
If you want, you can see the difference in the generated (non-optimized) assembly:
Your version
main:
; Initialization omitted
.LCFI1:
subq $16, %rsp
movl $1, -4(%rbp) ; Initialize 'i' to 1
jmp .L2 ; Jump to compare statements
.L3:
movl -4(%rbp), %eax ; Copy 'i' to a register (for function call)
; Omitted call to std::cout to output 'i' and '\n'
addl $2, -4(%rbp) ; Add 2 to 'i'
.L2:
cmpl $20, -4(%rbp) ; Compare 'i' to 20
jle .L3 ; Jump only if 'i' <= 20
movl $0, %eax ; Reset (cleanup)
leave ; Leave
His version
main:
.LCFI1:
subq $16, %rsp
movl $1, -4(%rbp) ; Initialize 'i' to 1
jmp .L2 ; Jump to compare statements
.L4:
movl -4(%rbp), %eax ; Copy 'i' to a register
andl $1, %eax ; AND 'i' with 1 (same as 'i' % 2)
testl %eax, %eax ; Test whether the result is zero
je .L3 ; Skip the output if it is zero ('i' is even)
; Omitted call to std::cout to output 'i' and '\n'
.L3:
addl $1, -4(%rbp) ; Add 1 to 'i'
.L2:
cmpl $20, -4(%rbp) ; Compare 'i' to 20
jle .L4 ; Jump only if 'i' <= 20
movl $0, %eax ; Reset (cleanup)
leave ; Leave
As you can see, your version has far fewer instructions than his version, so yours runs faster in the non-optimized code.
Here are the benchmarks for the non-optimized version
Version | Time
You | 0s
His | 0.015625s
And here they are for the optimized version (compiled using -O3):
Version | Time
You | 0s
His | 0s
In the end, there is no difference, because the compiler is quite good at optimizing such loops.
Disclaimer
The above assembly was generated by g++ 4.8.4 on Ubuntu 14, using the following command: g++ -S -fno-asynchronous-unwind-tables -fno-dwarf2-cfi-asm foo.cpp
The first code example does not determine whether a number is odd. It uses pre-existing knowledge to print all odd numbers in the 1-20 range: you already know that 1, 3, 5, ... are odd, and you just print that information. The second example goes over all numbers 1-20 and calculates/determines whether each number in that range is odd - meaning that if it's not divisible by 2 (hence the % 2 operation), it's an odd number. That's what your teacher is saying.
I need to optimize a program as much as possible. Now I came across this issue: I have a one-dimensional array which represents a texture in the form of pixel data. I now need to manipulate that data. The array is indexed via the following expression:
(y * width) + x
to address it with x,y coordinates. Now the question is, which way is the most optimized for computing this? I have considered the following two possibilities:
Inline:
inline int Coords(int x, int y) { return (y * width) + x; }
Macro:
#define COORDS(X,Y) ((Y)*width)+(X)
Which one is best practice to use here, or is there a way to get an even more optimized variant that I don't know of?
I wrote a little test program to see what the difference would be between the two approaches.
Here it is:
#include <cstdint>
#include <algorithm>
#include <iterator>
#include <iostream>
using namespace std;
static constexpr int width = 100;
inline int Coords(int x, int y) { return (y * width) + x; }
#define COORDS(X,Y) ((Y)*width)+(X)
void fill1(uint8_t* bytes, int height)
{
for (int x = 0 ; x < width ; ++x) {
for (int y = 0 ; y < height ; ++y) {
bytes[Coords(x,y)] = 0;
}
}
}
void fill2(uint8_t* bytes, int height)
{
for (int x = 0 ; x < width ; ++x) {
for (int y = 0 ; y < height ; ++y) {
bytes[COORDS(x,y)] = 0;
}
}
}
auto main() -> int
{
uint8_t buf1[100 * 100];
uint8_t buf2[100 * 100];
fill1(buf1, 100);
fill2(buf2, 100);
// these are here to prevent the compiler from optimising away all the above code.
copy(begin(buf1), end(buf1), ostream_iterator<char>(cout));
copy(begin(buf2), end(buf2), ostream_iterator<char>(cout));
return 0;
}
I compiled it like this:
c++ -S -o intent.s -std=c++1y -O3 intent.cpp
and then looked at the source code to see what the compiler would do.
As expected, the compiler completely ignores all attempts by the programmer to optimise, and instead looks solely at the expressed intent, side effects and possibilities of aliases. Then it emits exactly the same code for both functions (which are of course inlined).
Relevant parts of the assembly:
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp16:
.cfi_def_cfa_offset 16
Ltmp17:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp18:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $20024, %rsp ## imm = 0x4E38
Ltmp19:
.cfi_offset %rbx, -56
Ltmp20:
.cfi_offset %r12, -48
Ltmp21:
.cfi_offset %r13, -40
Ltmp22:
.cfi_offset %r14, -32
Ltmp23:
.cfi_offset %r15, -24
movq ___stack_chk_guard@GOTPCREL(%rip), %r15
movq (%r15), %r15
movq %r15, -48(%rbp)
xorl %eax, %eax
xorl %ecx, %ecx
.align 4, 0x90
LBB2_1: ## %.lr.ph.us.i
## =>This Loop Header: Depth=1
## Child Loop BB2_2 Depth 2
leaq -10048(%rbp,%rcx), %rdx
movl $400, %esi ## imm = 0x190
.align 4, 0x90
LBB2_2: ## Parent Loop BB2_1 Depth=1
## => This Inner Loop Header: Depth=2
movb $0, -400(%rdx,%rsi)
movb $0, -300(%rdx,%rsi)
movb $0, -200(%rdx,%rsi)
movb $0, -100(%rdx,%rsi)
movb $0, (%rdx,%rsi)
addq $500, %rsi ## imm = 0x1F4
cmpq $10400, %rsi ## imm = 0x28A0
jne LBB2_2
## BB#3: ## in Loop: Header=BB2_1 Depth=1
incq %rcx
cmpq $100, %rcx
jne LBB2_1
## BB#4:
xorl %r13d, %r13d
.align 4, 0x90
LBB2_5: ## %.lr.ph.us.i10
## =>This Loop Header: Depth=1
## Child Loop BB2_6 Depth 2
leaq -20048(%rbp,%rax), %rcx
movl $400, %edx ## imm = 0x190
.align 4, 0x90
LBB2_6: ## Parent Loop BB2_5 Depth=1
## => This Inner Loop Header: Depth=2
movb $0, -400(%rcx,%rdx)
movb $0, -300(%rcx,%rdx)
movb $0, -200(%rcx,%rdx)
movb $0, -100(%rcx,%rdx)
movb $0, (%rcx,%rdx)
addq $500, %rdx ## imm = 0x1F4
cmpq $10400, %rdx ## imm = 0x28A0
jne LBB2_6
## BB#7: ## in Loop: Header=BB2_5 Depth=1
incq %rax
cmpq $100, %rax
jne LBB2_5
## BB#8:
movq __ZNSt3__14coutE@GOTPCREL(%rip), %r14
leaq -20049(%rbp), %r12
xorl %ebx, %ebx
.align 4, 0x90
LBB2_9: ## %_ZNSt3__116ostream_iteratorIccNS_11char_traitsIcEEEaSERKc.exit.us.i.i13
## =>This Inner Loop Header: Depth=1
movb -10048(%rbp,%r13), %al
movb %al, -20049(%rbp)
movl $1, %edx
movq %r14, %rdi
movq %r12, %rsi
callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
incq %r13
cmpq $10000, %r13 ## imm = 0x2710
jne LBB2_9
## BB#10:
movq __ZNSt3__14coutE@GOTPCREL(%rip), %r14
leaq -20049(%rbp), %r12
.align 4, 0x90
LBB2_11: ## %_ZNSt3__116ostream_iteratorIccNS_11char_traitsIcEEEaSERKc.exit.us.i.i
## =>This Inner Loop Header: Depth=1
movb -20048(%rbp,%rbx), %al
movb %al, -20049(%rbp)
movl $1, %edx
movq %r14, %rdi
movq %r12, %rsi
callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
incq %rbx
cmpq $10000, %rbx ## imm = 0x2710
jne LBB2_11
## BB#12: ## %_ZNSt3__14copyIPhNS_16ostream_iteratorIccNS_11char_traitsIcEEEEEET0_T_S7_S6_.exit
cmpq -48(%rbp), %r15
jne LBB2_14
## BB#13: ## %_ZNSt3__14copyIPhNS_16ostream_iteratorIccNS_11char_traitsIcEEEEEET0_T_S7_S6_.exit
xorl %eax, %eax
addq $20024, %rsp ## imm = 0x4E38
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
Note that without the two calls to copy(..., ostream_iterator...) the compiler surmised that the total effect of the program was nothing and refused to emit any code at all, other than to return 0 from main()
Moral of the story: stop trying to do the compiler's job. Get on with yours.
Your job is to express intent as elegantly as you can. That's all.
Inline function, for two reasons:
it's less prone to bugs,
it lets the compiler decide whether to inline or not, so you don't have to waste time worrying about such trivial things.
First job: fix the bugs in the macro.
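For reference, the usual fix is to parenthesize the whole expansion; without the outer parentheses, an expression like 2 * COORDS(x, y) expands to 2 * ((y)*width) + (x), which is not what was intended:
// Fixed: the entire expansion is wrapped, so it behaves as a single
// expression wherever it is used.
#define COORDS(X,Y) (((Y)*width)+(X))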
If you're that concerned, implement both ways using a compiler directive and profile the results.
Change inline int Coords(int x, int y) to inline int Coords(const int x, const int y) so that, if the macro version does turn out to be quicker, the inline build will raise errors if the macro is ever refactored to modify its arguments.
My hunch is that the function will be no slower than the macro in a well-optimised build. And a code base without macros is easier to maintain.
If you do end up settling for the macro, then I'd be inclined to pass width as a macro argument too for the sake of program stability.
I am surprised that no one has mentioned one major difference between a function and a macro: any compiler can inline the function, but not many (if any) can create a function out of a macro, even if doing so would benefit performance.
I would offer a diverging answer, in that this question seems to be looking at the wrong solutions. It's comparing two things that even the most basic optimizer from the 90s (maybe even the 80s) should be able to optimize to the same degree (a trivial one-liner function versus a macro).
If you want to improve performance here, you have to compare between solutions that aren't so trivial for the compiler to optimize.
For example, let's say you access the texture in a sequential way. Then you don't need to access a pixel through (y*w) + x, you can simply iterate over it sequentially:
for (int j=0; j < num_pixels; ++j)
// do something with pixels[j]
In practice I've seen performance benefits with these kinds of loops over the y/x double loop, even with the most modern compilers.
Let's say you aren't accessing things perfectly sequentially but can still access adjacent horizontal pixels within a scanline. You might get a performance boost in that case by doing:
// Given a particular y value:
Pixel* scanline = pixels + y*w;
for (int x=0; x < w; ++x)
// do something with scanline[x]
If you aren't doing either of these things and need completely random access to an image, maybe you can figure out a way to make your memory access pattern more uniform (accessing more horizontal pixels that would likely be in the same L1 cache line prior to eviction).
Sometimes it can even be worth the cost to transpose the image if that results in the bulk of your subsequent memory access being horizontal within a scanline and not across scanlines (due to spatial locality). It might seem crazy that the cost of transposing an image (basically rotating it 90 degrees and swapping rows with columns) will more than make up for the reduced cost of accessing it afterwards, but accessing memory in an efficient, cache-friendly pattern is a huge deal, and especially in image processing (like the difference between hundreds of millions of pixels per second vs. just millions of pixels per second).
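As a rough sketch of that transpose idea (assuming a simple Pixel type; production code would typically add cache blocking on top of this):
struct Pixel { unsigned char r, g, b, a; };

// Naive transpose: row y, column x of src becomes row x, column y of
// dst, so accesses that were column-strided become sequential afterwards.
void transpose(const Pixel* src, Pixel* dst, int w, int h)
{
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
            dst[x * h + y] = src[y * w + x];
}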
If you can't do any of this and still need random access, and you're facing profiler hotspots here, then it might help to split your texture image into smaller tiles (which means rendering more textured quads/triangles and possibly doing extra work to ensure seamless results at the boundaries of each tile, but the gain can outweigh the extra geometry overhead if your bottleneck is in processing the texture). That increases locality of reference and the probability that you'll hit the faster but smaller cache memory prior to eviction, by actually reducing the size of the texture input you are processing in a totally random-access kind of way.
Any of these techniques can provide a boost in performance -- trying to optimize a one-liner function by using a macro instead is very unlikely to help anything except just make the code harder to maintain. In the best case scenario a macro might improve performance in a completely unoptimized debug build, but that kind of defeats the whole purpose of a debug build which is intended to be easy to debug, and macros are notoriously difficult to debug.
I am trying to improve my parser's speed. switch-case is sometimes useful, but I see it's still slow. I'm not sure whether C++ supports this feature (address checkpoints, with an additional parameter), but if it does, that would be great!
A simple example:
enum Transport { MOTORBIKE = 1, CAR = 2, /* ... */ SHIP = 10 };
Transport foo = /* unknown */;
switch(foo)
{
case MOTORBIKE : /*do something*/ break;
case CAR : /*do something*/ break;
//////////////////////////
case SHIP : /*do something*/ break;
}
If the variable foo is SHIP, the program may have to check the value up to ten times! It's still slow.
If C++ supported checkpoints:
Transport foo = //unknown
__checkpoint smart_switch;
goto (smart_switch + foo); //instant call!!!
smart_switch + MOTORBIKE : /*do something*/ goto __end;
smart_switch + CAR : /*do something*/ goto __end;
smart_switch + [...] : /*do something*/ goto __end;
////////////////////////////////////////////////////////////
smart_switch + SHIP : /*do something*/ goto __end;
__end : return 0;
It doesn't generate any jump table and then check each value. Maybe it doesn't work well with a default case. The only thing is that smart_switch + MOTORBIKE through smart_switch + SHIP may have different addresses, so if C++ evaluated them as real addresses the process would fail; when compiling, the compiler just has to convert them to real addresses.
Does C++ support this feature? And does it greatly improve speed & performance?
What you are talking about is called a jump table. The jump table is usually an array of relative addresses where the program execution control can be transferred. Here is an example of how you can implement one:
#include <ctime>
#include <cstdlib>
#include <cstdio>
int main()
{
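// NOTE: &&label ("labels as values") and the computed 'goto *ptr' below
// are GCC/Clang extensions, not part of standard C++.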
static constexpr void* jump_table[] =
{
&&print_0, &&print_1, &&print_2,
&&print_3, &&print_4, &&print_5
};
std::srand(std::time(nullptr));
int v = std::rand() % 6; // restrict to the table's range, 0..5
if (v < 0 || v > 5)
goto out;
goto *jump_table[v];
print_0:
std::printf("zero\n");
goto out;
print_1:
std::printf("one\n");
goto out;
print_2:
std::printf("two\n");
goto out;
print_3:
std::printf("three\n");
goto out;
print_4:
std::printf("four\n");
goto out;
print_5:
std::printf("five\n");
goto out;
out:
return EXIT_SUCCESS;
}
However, I seriously doubt two things. The first doubt is that using a jump table will make your program faster. An indirect jump is relatively expensive and is badly predicted by the hardware. Chances are that if you only have a few values, you are better off simply comparing each of them using an "if-then-else" chain. For a lot of sparse values (e.g. 1, 100, 250, 500, etc.), you are better off doing a binary search rather than blowing up the size of your table. In either case, this is just the tip of a huge iceberg when it comes to switch statements. So unless you know all of the details and know where the compiler did the wrong thing for your particular case, don't even bother trying to change switch to something else; you will never outsmart the compiler and will only make your program slower.
The second doubt is whether switching is actually the bottleneck of your parser. Most likely it is not. So, to save yourself a lot of valuable time, try profiling your code first to know for sure which part of your program is the slowest. Usually it goes in steps like this:
Profile and find the bottleneck.
Figure out why this is a bottleneck and come up with a reasonable idea of how to improve the code for speed.
Try to improve the code.
Go to step #1.
And there is no exit from this loop. Optimization is something you can spend your entire life doing. At some point, you will have to assume the program is fast enough and there are no bottlenecks :)
Also, I have written a more comprehensive analysis, with in-depth (more or less) details on how switch statements are implemented by compilers and on when to (and when not to) try to outsmart them. Please find the article here.
Yes, C/C++ does support this feature, and it is in... the standard switch. I have no idea where you got the idea that switch will check each value, but you are wrong. Yes, I have heard that some compilers can generate better code for pretty big switches (many variants, probably several hundred), but I do not think that is your case.
For example, the following code, compiled by gcc without any optimization:
enum E { One, Two, Three, Four, Five };
void func( E e )
{
int res;
switch( e ) {
case One : res = 10; break;
case Two : res = 20; break;
case Three : res = 30; break;
case Four : res = 40; break;
case Five : res = 50; break;
}
}
generates the following:
_Z4func1E:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -20(%rbp)
movl -20(%rbp), %eax
cmpl $4, %eax
ja .L1
movl %eax, %eax
movq .L8(,%rax,8), %rax
jmp *%rax
.section .rodata
.align 8
.align 4
.L8:
.quad .L3
.quad .L4
.quad .L5
.quad .L6
.quad .L7
.text
.L3:
movl $10, -4(%rbp)
jmp .L1
.L4:
movl $20, -4(%rbp)
jmp .L1
.L5:
movl $30, -4(%rbp)
jmp .L1
.L6:
movl $40, -4(%rbp)
jmp .L1
.L7:
movl $50, -4(%rbp)
nop
.L1:
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
As you can see, it simply jumps to the particular position without checking each value.
You can just build an array of function pointers and index into it with the enum value.
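A minimal sketch of that suggestion (names are illustrative, and the enum is 0-based here so its values can index the table directly):
#include <cstdio>

enum Transport { MOTORBIKE, CAR, SHIP, TRANSPORT_COUNT };

void handleMotorbike() { std::puts("motorbike"); }
void handleCar()       { std::puts("car"); }
void handleShip()      { std::puts("ship"); }

void (*const handlers[TRANSPORT_COUNT])() = {
    handleMotorbike, handleCar, handleShip
};

int main()
{
    Transport foo = SHIP;
    handlers[foo](); // one indirect call, no chain of comparisons
    return 0;
}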
In C++ I want to write an application that works similarly to a scripting language:
From some input during "setup time", it will define, in a big global array, where each variable is located and, in a different array, the sequence of functions ("LogicElement") to call (including their parameters, such as the variables to use).
One implementation might look like:
class LogicElement_Generic
{
public:
virtual void calc() const = 0;
};
class LogicElement_Mul : public LogicElement_Generic
{
int &to;
const int &from1;
const int &from2;
public:
LogicElement_Mul( int &_to, const int &_from1, const int &_from2 ) : to(_to), from1(_from1), from2(_from2)
{}
void calc() const
{
to = from1 * from2;
}
};
char globalVariableBuffer[1000]; // a simple binary buffer
LogicElement_Generic *le[10];
int main( void )
{
// just a demo, this would be setup from e.g. an input file:
int *to = (int*)globalVariableBuffer;
int *from1 = (int*)(globalVariableBuffer + sizeof(int));
int *from2 = (int*)(globalVariableBuffer + 2*sizeof(int));
*from1 = 2;
*from2 = 3;
le[0] = new LogicElement_Mul( *to, *from1, *from2 );
// doing all calculations:
// finally it would be a loop iterating over all calculation functions,
// over and over again - the area in the code where all the resources
// would be burned...
le[0]->calc();
return *to;
}
Although that works as intended, look at the generated assembly:
78 .section .text._ZNK16LogicElement_Mul4calcEv,"axG",#progbits,_ZNK16LogicElement_Mul4calcEv,comdat
79 .align 2
80 .weak _ZNK16LogicElement_Mul4calcEv
82 _ZNK16LogicElement_Mul4calcEv:
83 .LFB6:
17:.../src/test.cpp **** void calc() const
84 .loc 1 17 0
85 .cfi_startproc
86 0000 55 pushq %rbp
87 .LCFI6:
88 .cfi_def_cfa_offset 16
89 .cfi_offset 6, -16
90 0001 4889E5 movq %rsp, %rbp
91 .LCFI7:
92 .cfi_def_cfa_register 6
93 0004 48897DF8 movq %rdi, -8(%rbp)
18:.../src/test.cpp **** {
19:.../src/test.cpp **** to = from1 * from2;
94 .loc 1 19 0
95 0008 488B45F8 movq -8(%rbp), %rax
96 000c 488B4008 movq 8(%rax), %rax
97 0010 488B55F8 movq -8(%rbp), %rdx
98 0014 488B5210 movq 16(%rdx), %rdx
99 0018 8B0A movl (%rdx), %ecx
100 001a 488B55F8 movq -8(%rbp), %rdx
101 001e 488B5218 movq 24(%rdx), %rdx
102 0022 8B12 movl (%rdx), %edx
103 0024 0FAFD1 imull %ecx, %edx
104 0027 8910 movl %edx, (%rax)
20:.../src/test.cpp **** }
105 .loc 1 20 0
106 0029 5D popq %rbp
107 .LCFI8:
108 .cfi_def_cfa 7, 8
109 002a C3 ret
110 .cfi_endproc
Looking at the assembly lines 95 .. 104 you can see that for each variable three indirections are used.
As this part of the code (the calc() methods) will ultimately be called very rapidly, I want to use as few CPU cycles and as little memory bandwidth as possible (in portable C/C++).
I also want to achieve (not shown in the code above) having two variable buffers with exactly the same layout, to be able to do double buffering in a multithreaded approach and limit the necessary locks (the exact implementation details would be too much for this question).
So the big questions are:
How can I change the architecture to reduce the number of memory indirections in calc()?
(I'd expect only two: one to get the offset address into the variable array and an additional one to get the variable itself - but my experiments changing the code above to use offsets made things far worse!)
Is there a better way to set up the classes and thus the array of the LogicElements so that calling the calculation methods will use the least amount of resources?
Thanks to the hint by @Ed S., I changed away from references (where I had hoped the compiler could optimize better).
But an even more important step was to compare the assembly generated after activating optimizations (a simple -O2 did the trick).
(I didn't do that at the beginning because I wanted a clearer picture of the generated "pure" machine code, not one where an intelligent compiler fixes a stupid programmer - but it seems the compiler was too "stupid" for that...)
So the current result is quite good now for the variable array:
class LogicElement_Generic
{
public:
virtual void calc(void * const base) const = 0;
};
class LogicElement_Mul : public LogicElement_Generic
{
int const to;
int const from1;
int const from2;
public:
LogicElement_Mul( int const _to, int const _from1, int const _from2 ) : to(_to), from1(_from1), from2(_from2)
{}
void calc(void * const base) const
{
*((int*)(base+to)) = *((int*)(base+from1)) * *((int*)(base+from2));
}
};
char globalVariableBuffer[1000]; // a simple binary buffer
LogicElement_Generic *le[10];
int main( void )
{
int to = 0;
int from1 = sizeof(int);
int from2 = 2*sizeof(int);
*((int*)(globalVariableBuffer+from1)) = 2;
*((int*)(globalVariableBuffer+from2)) = 3;
le[0] = new LogicElement_Mul( to, from1, from2 );
le[0]->calc(globalVariableBuffer);
return *((int*)(globalVariableBuffer+to));
}
with the relevant part of the assembly:
17:.../src/test.cpp **** void calc(void * const base) const
12 .loc 1 17 0
13 .cfi_startproc
14 .LVL0:
18:.../src/test.cpp **** {
19:.../src/test.cpp **** *((int*)(base+to)) = *((int*)(base+from1)) * *((int*)(base+from2));
15 .loc 1 19 0
16 0000 4863470C movslq 12(%rdi), %rax
17 0004 48634F10 movslq 16(%rdi), %rcx
18 0008 48635708 movslq 8(%rdi), %rdx
19 000c 8B0406 movl (%rsi,%rax), %eax
20 000f 0FAF040E imull (%rsi,%rcx), %eax
21 0013 890416 movl %eax, (%rsi,%rdx)
20:.../src/test.cpp **** }
22 .loc 1 20 0
23 0016 C3 ret
24 .cfi_endproc
So I reckon the first question is answered! :)
The second is still open.
(Even more so now, as the pointer arithmetic on a void* is not standard C++ - it relies on a compiler extension - and it is very ugly...)