I have a simple piece of code, that addresses this (poorly stated, out of place) question :
template<typename It>
bool isAlpha(It first, It last)
{
return (first != last && *first != '\0') ?
(isalpha(static_cast<int>(*first)) && isAlpha(++first, last)) : true;
}
I'm trying to figure out how can I go about implementing it in a tail recursive fashion, and although there are great sources like this answer, I can't wrap my mind around it.
Can anyone help ?
EDIT
I'm placing the disassembly code below; The compiler is gcc 4.9.0 compiling with -std=c++11 -O2 -Wall -pedantic the assembly output is
bool isAlpha<char const*>(char const*, char const*):
cmpq %rdi, %rsi
je .L5
movzbl (%rdi), %edx
movl $1, %eax
testb %dl, %dl
je .L12
pushq %rbp
pushq %rbx
leaq 1(%rdi), %rbx
movq %rsi, %rbp
subq $8, %rsp
.L3:
movsbl %dl, %edi
call isalpha
testl %eax, %eax
jne .L14
xorl %eax, %eax
.L2:
addq $8, %rsp
popq %rbx
popq %rbp
.L12:
rep ret
.L14:
cmpq %rbp, %rbx
je .L7
addq $1, %rbx
movzbl -1(%rbx), %edx
testb %dl, %dl
jne .L3
.L7:
movl $1, %eax
jmp .L2
.L5:
movl $1, %eax
ret
To clarify cdhowie's point, the function can be rewritten as follows (unless I made a mistake):
bool isAlpha(It first, It last)
{
if (first == last)
return true;
if (*first == '\0')
return true;
if (!isalpha(static_cast<int>(*first))
return false;
return isAlpha(++first, last);
}
Which would indeed allow for trivial tail call elimination.
This is normally a job for the compiler, though.
Related
I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write:
#include <cmath>
#define NN 512
typedef float T;
typedef T __attribute__((aligned(NN))) AT;
inline T s(const T x)
{
return sinf(x);
}
void func(AT* __restrict x, AT* __restrict y, int length)
{
if (length & NN-1) __builtin_unreachable();
for (int i = 0; i < length; i++)
{
y[i] = s(x[i]);
}
}
compile with gcc 6.2 and -O3 -march=native -ffast-math and get
func(float*, float*, int):
testl %edx, %edx
jle .L10
leaq 8(%rsp), %r10
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
pushq %r14
xorl %r14d, %r14d
pushq %r13
leal -8(%rdx), %r13d
pushq %r12
shrl $3, %r13d
movq %rsi, %r12
pushq %r10
addl $1, %r13d
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
.L4:
vmovaps (%rbx), %ymm0
addl $1, %r14d
addq $32, %r12
addq $32, %rbx
call _ZGVcN8v_sinf // YAY! Vectorized trig!
vmovaps %ymm0, -32(%r12)
cmpl %r13d, %r14d
jb .L4
vzeroupper
addq $8, %rsp
popq %rbx
popq %r10
popq %r12
popq %r13
popq %r14
popq %rbp
leaq -8(%r10), %rsp
.L10:
ret
But when I add a cosine to the function, there is no vectorization:
#include <cmath>
#define NN 512
typedef float T;
typedef T __attribute__((aligned(NN))) AT;
inline T f(const T x)
{
return cosf(x)+sinf(x);
}
void func(AT* __restrict x, AT* __restrict y, int length)
{
if (length & NN-1) __builtin_unreachable();
for (int i = 0; i < length; i++)
{
y[i] = f(x[i]);
}
}
which gives:
func(float*, float*, int):
testl %edx, %edx
jle .L10
pushq %r12
leal -1(%rdx), %eax
pushq %rbp
leaq 4(%rdi,%rax,4), %r12
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $16, %rsp
.L4:
vmovss (%rbx), %xmm0
leaq 8(%rsp), %rsi
addq $4, %rbx
addq $4, %rbp
leaq 12(%rsp), %rdi
call sincosf // No vectorization
vmovss 12(%rsp), %xmm0
vaddss 8(%rsp), %xmm0, %xmm0
vmovss %xmm0, -4(%rbp)
cmpq %rbx, %r12
jne .L4
addq $16, %rsp
popq %rbx
popq %rbp
popq %r12
.L10:
ret
I see two good alternatives. Either call a vectorized version of sincosf or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about complex float, which there is none.
Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example?
(As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)
So this question is just out of curiosity.
I have some tiny program:
#include <some_header>
void print(){ printf("abc"); } // don't care about main, I'm not gonna run it
Then I compiled it to assembly, with once some_header=>iostream and another time some_header=>cstdio with gcc.godbolt.org (6.1 for x86_64) with -O3 -pedantic -std=c++14. Look at this:
.LC0:
.string "abc"
print(): (iostream) or (both included)
movl $.LC0, %edi
xorl %eax, %eax
jmp printf
subq $8, %rsp
movl std::__ioinit, %edi
call std::ios_base::Init::Init()
movl $__dso_handle, %edx
movl std::__ioinit, %esi
movl std::ios_base::Init::~Init(), %edi
addq $8, %rsp
jmp __cxa_atexit
print(): (cstdio)
movl $.LC0, %edi
xorl %eax, %eax
jmp printf
There's a significant difference between them, and they're identical for the first three lines, so why do iostream need such amount of code to clean up or what are those lines just doing? OR just to say that godbolt is unreliable upon this task?
Also it seems that standard doesn't guarantee that printf is accessible from iostream, should this be relied upon?
Your print function compiles to a pretty much the same assembly code in both cases.
The additional lines you see are to initialise and de-initialise iostream library. You may see that clearly if you remove the optimisation flag -O3.
Here is a complete listing with iostream included and optimisation switched off.
std::piecewise_construct:
.zero 1
.LC0:
.string "abc"
print():
pushq %rbp
movq %rsp, %rbp
movl $.LC0, %edi
movl $0, %eax
call printf
nop
popq %rbp
ret
__static_initialization_and_destruction_0(int, int):
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
cmpl $1, -4(%rbp)
jne .L4
cmpl $65535, -8(%rbp)
jne .L4
movl std::__ioinit, %edi
call std::ios_base::Init::Init()
movl $__dso_handle, %edx
movl std::__ioinit, %esi
movl std::ios_base::Init::~Init(), %edi
call __cxa_atexit
.L4:
nop
leave
ret
pushq %rbp
movq %rsp, %rbp
movl $65535, %esi
movl $1, %edi
call __static_initialization_and_destruction_0(int, int)
popq %rbp
ret
Which of the below code will be more optimized for efficiency First function or Second function in C/C++ gcc compiler ?
// First Function
if ( A && B && C ) {
UpdateData();
} else if ( A && B ){
ResetData();
}
//Second Function
if ( A && B) {
if (C) {
UpdateData();
} else {
ResetData();
}
}
Do we get any performance improvement in Second Function ?
If First Function is used, can the compiler optimize it to Second Method on its own ?
A large portion of this question will depend on what A, B and C really are (and the compiler will optimise it, as shown below). Simple types, definitely not worth worrying about. If they are some kind of "big number math" objects, or some complicated data type that needs 1000 instructions for each "is this true or not", then there will be a big difference if the compiler decides to make different code.
As always when it comes to performance: Measure in your own code, use profiling to detect where the code spends MOST of the time, and then measure with changes to that code. Repeat until it runs fast enough [whatever that is] and/or your manager tells you to stop fiddling with the code. Typically, however, unless it's REALLY a high traffic area of the code, it will make little difference to re-arrange the conditions in an if-statement, it is the overall algorithm that makes most impact in the general case.
If we assume A, B and C are simple types, such as int, we can write some code to investigate:
extern int A, B, C;
extern void UpdateData();
extern void ResetData();
void func1()
{
if ( A && B && C ) {
UpdateData();
} else if ( A && B ){
ResetData();
}
}
void func2()
{
if ( A && B) {
if (C) {
UpdateData();
} else {
ResetData();
}
}
}
gcc 4.8.2 given this, with -O1 produces this code:
_Z5func1v:
cmpl $0, A(%rip)
je .L6
cmpl $0, B(%rip)
je .L6
subq $8, %rsp
cmpl $0, C(%rip)
je .L3
call _Z10UpdateDatav
jmp .L1
.L3:
call _Z9ResetDatav
.L1:
addq $8, %rsp
.L6:
rep ret
_Z5func2v:
.LFB1:
cmpl $0, A(%rip)
je .L12
cmpl $0, B(%rip)
je .L12
subq $8, %rsp
cmpl $0, C(%rip)
je .L9
call _Z10UpdateDatav
jmp .L7
.L9:
call _Z9ResetDatav
.L7:
addq $8, %rsp
.L12:
rep ret
In other words: No difference at all
Using clang++ 3.7 (as of about 3 weeks ago) with -O1 gives this:
_Z5func1v: # #_Z5func1v
cmpl $0, A(%rip)
setne %cl
cmpl $0, B(%rip)
setne %al
andb %cl, %al
movzbl %al, %ecx
cmpl $1, %ecx
jne .LBB0_2
movl C(%rip), %ecx
testl %ecx, %ecx
je .LBB0_2
jmp _Z10UpdateDatav # TAILCALL
.LBB0_2: # %if.else
testb %al, %al
je .LBB0_3
jmp _Z9ResetDatav # TAILCALL
.LBB0_3: # %if.end8
retq
_Z5func2v: # #_Z5func2v
cmpl $0, A(%rip)
je .LBB1_4
movl B(%rip), %eax
testl %eax, %eax
je .LBB1_4
cmpl $0, C(%rip)
je .LBB1_3
jmp _Z10UpdateDatav # TAILCALL
.LBB1_4: # %if.end4
retq
.LBB1_3: # %if.else
jmp _Z9ResetDatav # TAILCALL
.Ltmp1:
The chaining of and in the func1 of clang MAY be of benefit, but it's probably such a small difference that you should concentrate on what makes more sense from a logical perspective of the code.
In summary: Not worth it
Higher optimisation in g++ makes it do the same tailcall optimisation that clang does, otherwise no difference.
However, if we make A, B and C into external functions, which the compiler can't "understand", then we get a difference:
_Z5func1v: # #_Z5func1v
pushq %rax
.Ltmp0:
.cfi_def_cfa_offset 16
callq _Z1Av
testl %eax, %eax
je .LBB0_3
callq _Z1Bv
testl %eax, %eax
je .LBB0_3
callq _Z1Cv
testl %eax, %eax
je .LBB0_3
popq %rax
jmp _Z10UpdateDatav # TAILCALL
.LBB0_3: # %if.else
callq _Z1Av
testl %eax, %eax
je .LBB0_5
callq _Z1Bv
testl %eax, %eax
je .LBB0_5
popq %rax
jmp _Z9ResetDatav # TAILCALL
.LBB0_5: # %if.end12
popq %rax
retq
_Z5func2v: # #_Z5func2v
pushq %rax
.Ltmp2:
.cfi_def_cfa_offset 16
callq _Z1Av
testl %eax, %eax
je .LBB1_4
callq _Z1Bv
testl %eax, %eax
je .LBB1_4
callq _Z1Cv
testl %eax, %eax
je .LBB1_3
popq %rax
jmp _Z10UpdateDatav # TAILCALL
.LBB1_4: # %if.end6
popq %rax
retq
.LBB1_3: # %if.else
popq %rax
jmp _Z9ResetDatav # TAILCALL
Here we DO see the difference between func1 and func2, where func1 will call A and B twice - since the compiler can't assume that calling those functions ONCE will do the same thing as calling twice. [Consider that the functions A and B may be reading data from a file, calling rand, or whatever, the result of NOT calling that function may be that the program behaves differently.
(In this case I only posted clang code, but g++ produces code that has the same outcome, but slightly different ordering of the different lumps of code)
So the question is which of these implementation has better performance and readability.
Imagine you have to write a code that each step is dependent of the success of the previous one, something like:
bool function()
{
bool isOk = false;
if( A.Func1() )
{
B.Func1();
if( C.Func2() )
{
if( D.Func3() )
{
...
isOk = true;
}
}
}
return isOk;
}
Let's say there are up to 6 nested IFs, since I don't want the padding to grow too much to the right, and I don't want to nest the function calls because there are several parameters involved, the first approach would be using the inverse logic:
bool function()
{
if( ! A.Func1() ) return false:
B.Func1();
if( ! C.Func2() ) return false;
if( ! D.Func3() ) return false;
...
return true;
}
But what about avoiding so many returns, like this:
bool function()
{
bool isOk = false;
do
{
if( ! A.Func1() ) break:
B.Func1();
if( ! C.Func2() ) break;
if( ! D.Func3() ) break;
...
isOk = true;
break;
}while(false);
return isOk;
}
Compilers will break down your code to simple instructions, using branch instructions to form loops, if/else etc, and it's unlikely your code will be any different at all once the compiler has gone over it.
Write the code that you think makes most sense for the solution you require.
If I were to "vote" for one of the three variants, I'd say my code is mostly variant 2. However, I don't follow it religiously. If it makes more sense (from a "how you think about it" perspective) to write in variant 1, then I will do that.
I don't think I've ever written, or even seen code written like variant 3 - I'm sure it happens, but if your goal is to have a single return, then I'd say variant 1 is the clearer and more obvious choice. Variant 3 is really just a "goto by another name" (see my most rewarded answer [and that's after I had something like 80 down-votes for suggesting goto as a solution]). I personally don't see variant 3 as any better than the other two, and unless the function is short enough to see do and the while on the same page, you also don't actually know that it won't loop without scrolling around - which is really not a good thing.
If you then, after profiling the code, discover a particular function is taking more time than you think is "right", study the assembly code.
Just to illustrate this, I will take your code and compile all three examples with g++ and clang++, and show the resulting code. It will probably take a few minutes because I have to actually make it compileable first.
Your source, after some massaging to make it compile as a singe source file:
class X
{
public:
bool Func1();
bool Func2();
bool Func3();
};
X A, B, C, D;
bool function()
{
bool isOk = false;
if( A.Func1() )
{
B.Func1();
if( C.Func2() )
{
if( D.Func3() )
{
isOk = true;
}
}
}
return isOk;
}
bool function2()
{
if( ! A.Func1() ) return false;
B.Func1();
if( ! C.Func2() ) return false;
if( ! D.Func3() ) return false;
return true;
}
bool function3()
{
bool isOk = false;
do
{
if( ! A.Func1() ) break;
B.Func1();
if( ! C.Func2() ) break;
if( ! D.Func3() ) break;
isOk = true;
}while(false);
return isOk;
}
Code generated by clang 3.5 (compiled from sources a few days ago):
_Z8functionv: # #_Z8functionv
pushq %rax
movl $A, %edi
callq _ZN1X5Func1Ev
testb %al, %al
je .LBB0_2
movl $B, %edi
callq _ZN1X5Func1Ev
movl $C, %edi
callq _ZN1X5Func2Ev
testb %al, %al
je .LBB0_2
movl $D, %edi
popq %rax
jmp _ZN1X5Func3Ev # TAILCALL
xorl %eax, %eax
popq %rdx
retq
_Z9function2v: # #_Z9function2v
pushq %rax
movl $A, %edi
callq _ZN1X5Func1Ev
testb %al, %al
je .LBB1_1
movl $B, %edi
callq _ZN1X5Func1Ev
movl $C, %edi
callq _ZN1X5Func2Ev
testb %al, %al
je .LBB1_3
movl $D, %edi
callq _ZN1X5Func3Ev
# kill: AL<def> AL<kill> EAX<def>
jmp .LBB1_5
.LBB1_1:
xorl %eax, %eax
jmp .LBB1_5
.LBB1_3:
xorl %eax, %eax
.LBB1_5:
# kill: AL<def> AL<kill> EAX<kill>
popq %rdx
retq
_Z9function3v: # #_Z9function3v
pushq %rax
.Ltmp4:
.cfi_def_cfa_offset 16
movl $A, %edi
callq _ZN1X5Func1Ev
testb %al, %al
je .LBB2_2
movl $B, %edi
callq _ZN1X5Func1Ev
movl $C, %edi
callq _ZN1X5Func2Ev
testb %al, %al
je .LBB2_2
movl $D, %edi
popq %rax
jmp _ZN1X5Func3Ev # TAILCALL
.LBB2_2:
xorl %eax, %eax
popq %rdx
retq
In the clang++ code, the second function is very marginally worse due to an extra jump that one would have hoped the compiler could sort out being the same as one of the others. But I doubt any realistic code where func1 and func2 and func3 actually does anything meaningful will show any measurable difference.
And g++ 4.8.2:
_Z8functionv:
subq $8, %rsp
movl $A, %edi
call _ZN1X5Func1Ev
testb %al, %al
jne .L10
.L3:
xorl %eax, %eax
addq $8, %rsp
ret
.L10:
movl $B, %edi
call _ZN1X5Func1Ev
movl $C, %edi
call _ZN1X5Func2Ev
testb %al, %al
je .L3
movl $D, %edi
addq $8, %rsp
jmp _ZN1X5Func3Ev
_Z9function2v:
subq $8, %rsp
movl $A, %edi
call _ZN1X5Func1Ev
testb %al, %al
jne .L19
.L13:
xorl %eax, %eax
addq $8, %rsp
ret
.L19:
movl $B, %edi
call _ZN1X5Func1Ev
movl $C, %edi
call _ZN1X5Func2Ev
testb %al, %al
je .L13
movl $D, %edi
addq $8, %rsp
jmp _ZN1X5Func3Ev
_Z9function3v:
.LFB2:
subq $8, %rsp
movl $A, %edi
call _ZN1X5Func1Ev
testb %al, %al
jne .L28
.L22:
xorl %eax, %eax
addq $8, %rsp
ret
.L28:
movl $B, %edi
call _ZN1X5Func1Ev
movl $C, %edi
call _ZN1X5Func2Ev
testb %al, %al
je .L22
movl $D, %edi
addq $8, %rsp
jmp _ZN1X5Func3Ev
I challenge you to spot the difference aside from the label names between the different functions.
I think performance (and most likely even binary code) will be the same with any modern compiler.
Readability is somewhat a matter of conventions and habits.
I personally would prefer the first form, and probably you would need a new function to group some of the conditions (I think some of them can be grouped together in some meaningful way). The third form looks most cryptic to me.
As C++ has RAII and automatic cleanups, I tend to prefer the bail-out-with-return-as-soon-as-possible solution (your second one), because the code gets much cleaner IMHO. Obviously, it's a matter of opinion, taste and YMMV...
Inside a large loop, I currently have a statement similar to
if (ptr == NULL || ptr->calculate() > 5)
{do something}
where ptr is an object pointer set before the loop and never changed.
I would like to avoid comparing ptr to NULL in every iteration of the loop. (The current final program does that, right?) A simple solution would be to write the loop code once for (ptr == NULL) and once for (ptr != NULL). But this would increase the amount of code making it more difficult to maintain, plus it looks silly if the same large loop appears twice with only one or two lines changed.
What can I do? Use dynamically-valued constants maybe and hope the compiler is smart? How?
Many thanks!
EDIT by Luther Blissett. The OP wants to know if there is a better way to remove the pointer check here:
loop {
A;
if (ptr==0 || ptr->calculate()>5) B;
C;
}
than duplicating the loop as shown here:
if (ptr==0)
loop {
A;
B;
C;
}
else loop {
A;
if (ptr->calculate()>5) B;
C;
}
I just wanted to inform you, that apparently GCC can do this requested hoisting in the optimizer. Here's a model loop (in C):
struct C
{
int (*calculate)();
};
void sideeffect1();
void sideeffect2();
void sideeffect3();
void foo(struct C *ptr)
{
int i;
for (i=0;i<1000;i++)
{
sideeffect1();
if (ptr == 0 || ptr->calculate()>5) sideeffect2();
sideeffect3();
}
}
Compiling this with gcc 4.5 and -O3 gives:
.globl foo
.type foo, #function
foo:
.LFB0:
pushq %rbp
.LCFI0:
movq %rdi, %rbp
pushq %rbx
.LCFI1:
subq $8, %rsp
.LCFI2:
testq %rdi, %rdi # ptr==0? -> .L2, see below
je .L2
movl $1000, %ebx
.p2align 4,,10
.p2align 3
.L4:
xorl %eax, %eax
call sideeffect1 # sideeffect1
xorl %eax, %eax
call *0(%rbp) # call p->calculate, no check for ptr==0
cmpl $5, %eax
jle .L3
xorl %eax, %eax
call sideeffect2 # ok, call sideeffect2
.L3:
xorl %eax, %eax
call sideeffect3
subl $1, %ebx
jne .L4
addq $8, %rsp
.LCFI3:
xorl %eax, %eax
popq %rbx
.LCFI4:
popq %rbp
.LCFI5:
ret
.L2: # here's the loop with ptr==0
.LCFI6:
movl $1000, %ebx
.p2align 4,,10
.p2align 3
.L6:
xorl %eax, %eax
call sideeffect1 # does not try to call ptr->calculate() anymore
xorl %eax, %eax
call sideeffect2
xorl %eax, %eax
call sideeffect3
subl $1, %ebx
jne .L6
addq $8, %rsp
.LCFI7:
xorl %eax, %eax
popq %rbx
.LCFI8:
popq %rbp
.LCFI9:
ret
And so does clang 2.7 (-O3):
foo:
.Leh_func_begin1:
pushq %rbp
.Llabel1:
movq %rsp, %rbp
.Llabel2:
pushq %r14
pushq %rbx
.Llabel3:
testq %rdi, %rdi # ptr==NULL -> .LBB1_5
je .LBB1_5
movq %rdi, %rbx
movl $1000, %r14d
.align 16, 0x90
.LBB1_2:
xorb %al, %al # here's the loop with the ptr->calculate check()
callq sideeffect1
xorb %al, %al
callq *(%rbx)
cmpl $6, %eax
jl .LBB1_4
xorb %al, %al
callq sideeffect2
.LBB1_4:
xorb %al, %al
callq sideeffect3
decl %r14d
jne .LBB1_2
jmp .LBB1_7
.LBB1_5:
movl $1000, %r14d
.align 16, 0x90
.LBB1_6:
xorb %al, %al # and here's the loop for the ptr==NULL case
callq sideeffect1
xorb %al, %al
callq sideeffect2
xorb %al, %al
callq sideeffect3
decl %r14d
jne .LBB1_6
.LBB1_7:
popq %rbx
popq %r14
popq %rbp
ret
In C++, although completely overkill you can put the loop in a function and use a template. This will generate twice the body of the function, but eliminate the extra check which will be optimized out. While I certainly don't recommend it, here is the code:
template<bool ptr_is_null>
void loop() {
for(int i = x; i != y; ++i) {
/**/
if(ptr_is_null || ptr->calculate() > 5) {
/**/
}
/**/
}
}
You call it with:
if (ptr==NULL) loop<true>(); else loop<false>();
You are better off without this "optimization", the compiler will probably do the RightThing(TM) for you.
Why do you want to avoid comparing to NULL?
Creating a variant for each of the NULL and non-NULL cases just gives you almost twice as much code to write, test and more importantly maintain.
A 'large loop' smells like an opportunity to refactor the loop into separate functions, in order to make the code easier to maintain. Then you can easily have two variants of the loop, one for ptr == null and one for ptr != null, calling different functions, with just a rough similarity in the overall structure of the loop.
Since
ptr is an object pointer set before the loop and never changed
can't you just check if it is null before the loop and not check again... since you don't change it.
If it is not valid for your pointer to be NULL, you could use a reference instead.
If it is valid for your pointer to be NULL, but if so then you skip all processing, then you could either wrap your code with one check at the beginning, or return early from your function:
if (ptr != NULL)
{
// your function
}
or
if (ptr == NULL) { return; }
If it is valid for your pointer to be NULL, but only some processing is skipped, then keep it like it is.
if (ptr == NULL || ptr->calculate() > 5)
{do something}
I would simply think in terms of what is done if the condition is true.
If "do something" is really the exact same stuff for (ptr == NULL) or (ptr->calculate() > 5), then I hardly see a reason to split up anything.
If "do something" contains particular cases for either condition, then I would consider to refactor into separate loops to get rid of extra special case checking. Depends on the special cases involved.
Eliminating code duplication is good up to a point. You should not care too much about optimizing until your program does what it should do and until performance becomes a problem.
[...] Premature optimization is the root of all evil
http://en.wikipedia.org/wiki/Program_optimization