I wrote a very simple implementation of something resembling Assembly/machine code.
It is even capable of recursion, as in this example:
9 6
IFEQ R0,0
RET 1
ENDIF
MOV R1,R0
SUB R1,1
CALL R1
MOV R2,R9
MUL R2,R0
RET R2
Output: 720 (factorial of 6)
Description:
9 = number of program lines
6 = program input; it will be stored in register R0 at class construction
CALL = calls the program again with the passed value (recursion)
RET = returns from the program with the specified value; sets register R9 to the output value
R0 to R9 -> general-purpose registers
R0 - program input value
R9 - program output value
-edit: Program commands:
MOV, ADD, SUB, MUL, DIV, MOD, IFEQ, IFNEQ, IFG, IFGE, IFL, IFLE, ENDIF, CALL, RET
However, the program can enter an infinite loop/recursion, e.g.:
2 0
CALL 10
RET 1 //will never be reached
How do I verify whether MY program will enter into an infinite loop/recursion?
Here's my implementation; I don't know whether it's necessary, but just in case you need it. (It's the whole code... hope you don't mind.)
#include <iostream>
#include <map>
#include <string> //std::getline
#include <sstream>
#include <vector>
#include <cstdlib> //std::atoi
namespace util
{
template<typename I>I readcin(I& input) {
std::cin >> input;
std::cin.clear(); std::cin.ignore();
return input;
}
template<typename I, typename...O> I readcin(I& input, O&... others) {
readcin(input);
return readcin(others...);
}
}
//operations
enum OP
{
MOV, ADD, SUB, MUL, DIV, MOD,
IFG, IFL,
IFEQ, IFGE, IFLE,
IFNEQ,
CALL,
RET,
ENDIF,
};
std::map<std::string, OP> OPTABLE
{
{"MOV", MOV}, {"ADD", ADD}, {"SUB", SUB}, {"MUL", MUL}, {"DIV", DIV}, {"MOD", MOD},
{"RET", RET},
{"IFG", IFG}, {"IFL", IFL},
{"IFNEQ", IFNEQ}, {"IFEQ", IFEQ}, {"IFGE", IFGE}, {"IFLE", IFLE},
{"CALL", CALL},
{"ENDIF", ENDIF}
};
//registry index
enum RI
{
R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, RI_MAX
};
std::map<std::string, RI> RITABLE =
{
{"R0", R0}, {"R1", R1}, {"R2", R2}, {"R3", R3}, {"R4", R4}, {"R5", R5},
{"R6", R6}, {"R7", R7}, {"R8", R8}, {"R9", R9}
};
struct Instruction
{
OP op;
RI r1;
int r2value;
Instruction() = delete;
Instruction(OP operation, RI firstRegister, int _2ndRegValue = -1)
{
op = operation;
r1 = firstRegister;
r2value = _2ndRegValue;
}
};
class Assembly
{
private:
int REG[RI::RI_MAX] {0};
int GetRegistryValue(RI ri) const { return REG[ri]; }
void SetRegistryValue(RI ri, int val) { REG[ri] = val; }
enum CMP_FLAG{ CMP_FAIL, CMP_OK };
CMP_FLAG flag { CMP_OK };
CMP_FLAG GetFlag() const { return flag; }
void SetFlag(bool setFlag) { flag = static_cast<CMP_FLAG>(setFlag); }
std::vector<std::string> programLines;
OP ExtractOP(const std::string& line);
RI ExtractRI(const std::string& line, OP op);
int Extract2ndRIval(const std::string& line, OP op);
public:
void addCommand(const std::string& line) { programLines.push_back(line); }
void Execute();
Assembly() = delete;
Assembly(int inputValue) { REG[R0] = inputValue; }
int ReturnValue() const { return REG[R9]; }
private:
//recursion only
Assembly(int inputValue, const std::vector<std::string>& progLines) {
REG[R0] = inputValue;
programLines = progLines;
this->Execute();
}
};
OP Assembly::ExtractOP(const std::string& line)
{
std::istringstream issline{ line };
std::string operation;
issline >> operation;
return OPTABLE[operation];
}
RI Assembly::ExtractRI(const std::string& line, OP op)
{
auto space = line.find(' ');
if(op <= IFNEQ){
auto comma = line.find(',');
return RITABLE[std::string(line.begin() + space + 1, line.begin() + comma)];
}
return RI_MAX;
}
int Assembly::Extract2ndRIval(const std::string& line, OP op)
{
if(op == ENDIF) {
return -1;
}
std::size_t spaceOrComma;
if(op == CALL || op == RET) {
spaceOrComma = line.find(' ');
} else {
spaceOrComma = line.find(',');
}
std::string opval = std::string(line.begin() + spaceOrComma + 1, line.end());
auto it = RITABLE.find(opval);
if(it != RITABLE.end()){
return this->REG[it->second];
}
auto num = std::atoi(opval.c_str());
return num;
}
void Assembly::Execute()
{
for(const std::string& line : programLines)
{
OP op = ExtractOP(line);
RI r1 = ExtractRI(line, op);
int r2value = Extract2ndRIval(line, op);
Instruction command ( op, r1, r2value );
if(GetFlag() == CMP_FAIL)
{
if(command.op == ENDIF){
SetFlag(CMP_OK);
}
continue;
}
switch(command.op)
{
case MOV: { SetRegistryValue(command.r1, command.r2value); } break;
case ADD: { SetRegistryValue(command.r1, REG[command.r1] + command.r2value); } break;
case SUB: { SetRegistryValue(command.r1, REG[command.r1] - command.r2value); } break;
case MUL: { SetRegistryValue(command.r1, REG[command.r1] * command.r2value); } break;
case DIV: { SetRegistryValue(command.r1, REG[command.r1] / command.r2value); } break;
case MOD: { SetRegistryValue(command.r1, REG[command.r1] % command.r2value); } break;
case IFEQ: { SetFlag(GetRegistryValue(command.r1) == command.r2value); } break;
case IFNEQ: { SetFlag(GetRegistryValue(command.r1) != command.r2value); } break;
case IFG: { SetFlag(GetRegistryValue(command.r1) > command.r2value); } break;
case IFL: { SetFlag(GetRegistryValue(command.r1) < command.r2value); } break;
case IFGE: { SetFlag(GetRegistryValue(command.r1) >= command.r2value); } break;
case IFLE: { SetFlag(GetRegistryValue(command.r1) <= command.r2value); } break;
case RET:
{
SetRegistryValue(R9, command.r2value);
return;
}break;
//oh boy!
case CALL:
{
// std::cout << "value to call:" << command.r2value << std::endl;
Assembly recursion(command.r2value, this->programLines);
SetRegistryValue(R9, recursion.ReturnValue());
}break;
}
}
}
int main()
{
while(true)
{
int pl, input;
util::readcin(pl, input);
if(pl == 0){
break;
}
Assembly Asm(input);
for(auto i=0; i<pl; ++i)
{
std::string line;
std::getline(std::cin, line);
Asm.addCommand(line);
}
Asm.Execute();
std::cout << Asm.ReturnValue() << std::endl;
}
return 0;
}
In the general case, the only way to check whether a program is stuck in an infinite loop is to check whether it has entered the same state as a previous state. If it has entered exactly the same state previously, then it must continue executing in a loop, returning to the same state over and over following the same sequence of steps. In real programs this is essentially impossible because of the huge number of possible states the program can be in, but your assembly language only allows a much more limited number of possible states.
Since your CALL instruction works just like invoking the program from the start, and this is the only form of looping, checking whether the code enters the same state twice is simple. A CALL instruction with a certain argument has the exact same effect as invoking the program with that argument as its input. If a CALL instruction has the same argument as a CALL that has not yet returned (or as the program's original input value), then the program must continue executing in a loop, endlessly returning to the same state through the same sequence of steps.
In other words, the only state that needs to be checked is the R0 value at the start of the program. Since this value is stored in an int, it can only have 2^32 possible values on any common C++ implementation, so it's reasonable and easy to brute-force check whether a given program with a given input gets stuck in an infinite loop.
In fact, it's possible to use memoization of the return values to brute-force check all possible inputs in O(N) time and O(N) space, where N is the number of possible inputs. There are various ways you could do this, but the way I would do it is to create three vectors, all with a number of elements equal to the number of possible inputs. The first vector is a bool (bit) vector that records whether or not a given input has been memoized yet; the second is also a bool vector and records whether a given input is already in use on the call stack; the third is an int vector that records the result and doubles as a linked list of input values representing the call stack, to save space. (In the code below these vectors are called is_memoized, input_pending and memoized_value respectively.)
I'd take your interpreter loop and rewrite it to be non-recursive, something like this pseudo-code:
input = reg[R0]
if is_memoized[input]:
reg[R9] = memoized_value[input]
return
input_pending[input] = true
memoized_value[input] = input // mark the top of the stack
while true:
for command in program:
...
if command.op == CALL:
argument = command.r2value
if input_pending[argument]:
// Since this input value has already been used as an input value
// somewhere on the call stack, the program is about to enter
// a state identical to a previous one and so is stuck in
// an infinite loop.
return false // program doesn't halt
if is_memoized[argument]:
REG[R9] = memoized_value[argument]
else:
memoized_value[argument] = input // stack the input value
input = argument
REG = [input, 0, 0, 0, 0, 0, 0, 0, 0, 0]
input_pending[input] = true
break // reevaluate the program from the beginning.
if command.op == RET:
argument = command.r2value
stacked_input = memoized_value[input]
input_pending[input] = false
if stacked_input == input: // at the top of the stack
REG[R9] = argument
return true // program halts for this input
is_memoized[input] = true
memoized_value[input] = argument
input = stacked_input
REG = [input, 0, 0, 0, 0, 0, 0, 0, 0, 0]
break // reevaluate the program from the beginning
You'd then call this interpreter loop for each possible input, something like this:
for i in all_possible_inputs:
if not program.execute(input = i): // the function written above
print "program doesn't halt for all inputs"
return
print "program halts for all inputs"
A recursive version should be faster since it doesn't have to reevaluate the program on each unmemoized CALL instruction in the program, but it would require hundreds of gigabytes of stack space in the worst case. This non-recursive version only requires 17 GB of memory. Either way it's still O(N) space and time, you're just making one constant factor smaller and another bigger.
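For a sense of scale, the three tables from the pseudo-code above could be declared roughly like this (a sketch only: it assumes a 64-bit build with that much RAM available, and the names follow the pseudo-code rather than anything in the original program):
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    const std::size_t N = std::size_t(1) << 32;   // number of possible int inputs
    std::vector<bool> is_memoized(N);             // packed bits: 512 MiB
    std::vector<bool> input_pending(N);           // packed bits: 512 MiB
    std::vector<std::int32_t> memoized_value(N);  // 4 bytes per entry: 16 GiB
    // Total is roughly 17 GiB, which is where the figure above comes from.
}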
To get this to execute in a reasonable amount of time you'd also probably want to parse the code once and execute some byte code representation instead.
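As a sketch of what that pre-parsed representation might look like, reusing the OP's OP and RI enums (the struct name and fields here are illustrative, not part of the original code):
// Unlike the original Instruction, this records whether the second operand is a
// register or an immediate, so it can be resolved when the instruction executes
// rather than when the line is parsed.
struct ByteCodeOp {
    OP   op;
    RI   r1;
    bool r2_is_register;  // true if the operand names a register (e.g. "R3")
    int  r2;              // register index if r2_is_register, otherwise the literal value
};
// The program would be parsed once into a std::vector<ByteCodeOp>, and Execute()
// (including every simulated CALL) would loop over that vector instead of
// re-parsing each std::string on every pass.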
I take it you're looking for outside-the-box thinking.
Think of the halting problem this way: Turing proved that programs cannot be controlled. But why? Because the language has instructions that control execution. This means that feasibly regulating and predicting execution in programs requires removing all control semantics from the language.
Even my collaborative process architecture doesn't accomplish that. It just forbids such constructs because of the mess they make; it is still composed in a language which contains them. For example, you can use IF to break, return or continue, but not for other operations. Function calls are illegal. I created such restrictions to achieve controllability. However, not even a collaborative organization removes control structures from the language to prevent their misuse.
My architecture is online via my profile with a factorial example in my W article.
If the program steps into the same configuration twice then it will loop forever.
This is also true for Turing Machines, it is just that the (infinite) input is part of the machine's configuration.
This is also the intuition behind the pumping lemmas.
What constitutes a configuration in your model?
Since you have no memory and no IO, a configuration is given by the content of the registers and the line number of the current instruction (i.e. the instruction pointer).
When does a configuration change?
After every instruction, sure. But in the case of a non-branching instruction, the configurations before and after it are surely different, because even if the instruction is a NOP the line number changed.
In the case of a branch, the line number might be one that we've seen before (for a backwards branch), so that's where the machine could step into the same configuration.
The only jumping instruction of interest, to me, seems to be call. The IF-like ones will always produce different configurations because they are not expressive enough to produce iteration (jump back and repeat).
How does call change a configuration? It sets the line number to 1 and all the registers (except r0) to zero.
So the condition for a call to produce the same configuration reduces to having the same input.
If you check, in the call implementation, whether the operand value has already been used as the input of a call that has not yet returned (i.e. is still on the simulated call stack), then you can tell that the program will loop forever.
If a register has size n, then the possible states are O(2^n), which is generally a lot.
You must be prepared to give up after a (possibly customizable) threshold. Or, in your case where your registers are int, most C++ implementations have a 32-bit int, and modern machines can handle a 512 MiB bitmap of 2^32 bits. (std::vector<bool> implements a packed bitmap, for example; index it with unsigned to avoid negative ints.) A hash table is another alternative (std::unordered_set<int>). But if you used a wider type for your registers, the state would be too big to practically record every possible one and you would need some limit. A limit is kind of built in to your implementation anyway, as you will overflow the C++ call stack (C++ recursion depth) before seeing anywhere near 2^32 repeats of the machine being simulated.
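As a rough sketch of the bitmap idea (the names are mine, not the OP's), the bookkeeping around a simulated call could look something like this:
#include <cstddef>
#include <iostream>
#include <vector>

// One bit per possible int input, marking inputs currently on the simulated call
// stack; index with unsigned to avoid negative subscripts. Needs a 64-bit build.
std::vector<bool> input_on_stack(std::size_t(1) << 32);   // 512 MiB

// Call on entry to a simulated CALL; returns false if this input is already active,
// meaning the machine has re-entered an earlier configuration and will recurse forever.
bool push_input(int value) {
    std::size_t idx = static_cast<unsigned>(value);
    if (input_on_stack[idx]) return false;
    input_on_stack[idx] = true;
    return true;
}

// Call when the simulated CALL returns.
void pop_input(int value) { input_on_stack[static_cast<unsigned>(value)] = false; }

int main() {
    std::cout << push_input(6) << push_input(5) << push_input(6) << '\n';   // prints 110
}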
If the registers are unbounded in their value, this reduces to the Halting Problem and is thus undecidable in the general case. (But as #Brendan says, you can still look for early repeats of the state; many programs will terminate or infinitely repeat in a simple way.)
If you change the call implementation to not zero out the other registers, you must fall back to checking the whole configuration at the call site (and not just the operand).
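A minimal sketch of that full-configuration check (the container and function names are mine; in the OP's code the register file is the REG array):
#include <algorithm>
#include <array>
#include <set>

// Each entry is a snapshot of R0..R9 taken at a call site.
std::set<std::array<int, 10>> seen_configurations;

// Returns true if this exact register state was already seen at a call,
// i.e. the machine has stepped into the same configuration twice.
bool configuration_repeats(const int (&REG)[10]) {
    std::array<int, 10> snapshot;
    std::copy(std::begin(REG), std::end(REG), snapshot.begin());
    return !seen_configurations.insert(snapshot).second;
}

int main() {
    int REG[10] = {6, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    bool first  = configuration_repeats(REG);   // false: first time this state is seen
    bool second = configuration_repeats(REG);   // true: identical configuration -> loop
    return (first == false && second == true) ? 0 : 1;
}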
To check the termination of the program on every input you must proceed non-deterministically and symbolically.
There are two problems: the branches and the input value.
It is a famous theorem that an NDTM can be simulated by a TM in an exponential number of steps w.r.t. the steps of the NDTM.
The only problematic instructions are the IF ones because they create non-determinism.
You can take several approaches:
Split the simulation into two branches: one that executes the IF and one that does not.
Rewrite the code to be simulated to produce an exponential (in the number of branches) number of branch-free variants of the code. You can generate them lazily.
Keep a tree of configurations, each branch in the program generating two children in the current node in the tree.
They are all equivalent.
The input value is not known, so it's hard to tell if a call ends up in the same configuration.
A possible approach is to record all the changes to the input register, for example you could end up with a description like "sub(add(add(R0, 1), 4), 5);".
This description should be easy to manipulate, as it's easy to see that in the example above R0 didn't change because you get "sub(add(R0, 5), 5);" and then "R0;".
This works by relying on the laws of arithmetic; you must take care of inverse operations, identity elements (1 * a = a) and overflow.
Basically, you need to simplify the expression.
You can then check if the input has changed at a given point in the simulated time.
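A toy sketch of that bookkeeping for the ADD/SUB case (the SymValue type is illustrative; MUL, DIV, MOD and overflow would need a richer expression representation, as noted above):
#include <iostream>

// The value is tracked as "R0 plus a constant offset" (or a plain constant once
// it no longer depends on the input).
struct SymValue {
    bool depends_on_r0;
    long long offset;
};

SymValue add_const(SymValue v, long long k) { v.offset += k; return v; }
SymValue sub_const(SymValue v, long long k) { v.offset -= k; return v; }

int main() {
    SymValue v{true, 0};                                   // starts out as plain R0
    v = sub_const(add_const(add_const(v, 1), 4), 5);       // sub(add(add(R0, 1), 4), 5)
    std::cout << (v.depends_on_r0 && v.offset == 0) << '\n';   // 1: it simplifies back to R0
}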
How do I verify whether a program will enter into an infinite loop/recursion?
In practice, the halting problem is trivial to solve. It's only impossible in theory.
The reason people think that the halting problem is impossible to solve is that the question is constructed as a false dilemma ( https://en.wikipedia.org/wiki/False_dilemma ). Specifically, the question asks to determine whether a program will always halt or will never halt; but there's a third possibility: sometimes halting (and sometimes not halting). For an example of this, imagine a program that asks the user whether they want to halt forever or exit immediately (and correctly does what the user requested). Note that all sane applications are like this - they're not supposed to exit until/unless the user tells them to.
More correctly, in practice there are 4 possibilities:
runs until something external causes it to halt (power turned off, hardware failure, kill -9, ...)
sometimes halts itself
always halts itself
indeterminate (unable to determine which of the 3 other cases it is)
Of course with these 4 possibilities, you can say you've created a "halting problem solver" just by classifying every program as "indeterminate", but it won't be a good solution. This gives us a kind of rating system - extremely good "halting problem solvers" rarely classify programs as "indeterminate", and extremely bad "halting problem solvers" frequently classify programs as "indeterminate".
So, how do you create a good "halting problem solver"? This involves 2 parts - generating control flow graphs ( https://en.wikipedia.org/wiki/Control-flow_graph ) for each function and a call graph ( https://en.wikipedia.org/wiki/Call_graph ) for the whole program - and value tracking.
Control Flow Graphs and Call Graph
It's not that hard to generate a control flow graph for each function/procedure/routine, just by examining the control flow instructions (call, return, jump, conditional branch); and not that hard to generate a call graph while you're doing this (just by checking whether a node is already in the call graph when you see a call instruction, and adding it if it's not there yet).
While doing this, you want to mark control flow changes (in the control flow graph for each function) as "conditional" or "not conditional", and you want to mark functions (in the call graph for the whole program) as "conditional" or "not conditional".
By analyzing the resulting graphs you can classify trivial programs as "runs until something external causes it to halt" or "always halts itself" (e.g. this is enough to classify OP's original code as "runs until something external causes it to halt"); but the majority of programs will still be "indeterminate".
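As a very small sketch of the data involved (the names and fields here are illustrative, not a prescription):
#include <string>
#include <vector>

// One basic block in a function's control flow graph.
struct CfgNode {
    int first_instruction;        // index of the block's first instruction
    std::vector<int> successors;  // blocks control can flow to from here
    bool ends_in_conditional;     // the "conditional" / "not conditional" mark
};

// One function in the whole-program call graph.
struct CallGraphNode {
    std::string name;
    std::vector<std::string> callees;  // added the first time a call to them is seen
    bool called_conditionally;
};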
Value Tracking
Value tracking is (trying) to keep track of the possible values that could be in any register/variable/memory location at any point in time. For example, if a program reads 2 unsigned bytes from disk into 2 separate variables, you know both variables will have a value from 0 to 255. If those variables are multiplied you know the result will be a value from 0*0 to 255*255; if those variables are added you know the result will be a value from 0+0 to 255+255; etc. Of course the variable's type gives absolute maximum possible ranges, even for assembly (where there are no types) - e.g. (without knowing whether it's signed or unsigned) you know that a 32-bit read from memory will return a value from -2147483648 to 4294967295.
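A minimal sketch of that range propagation, using the byte example above (the Range type and helpers are mine):
#include <algorithm>
#include <iostream>

// A conservative value range for a variable.
struct Range { long long lo, hi; };

Range add(Range a, Range b) { return { a.lo + b.lo, a.hi + b.hi }; }

Range mul(Range a, Range b) {
    // the result is bounded by the four corner products
    long long c[] = { a.lo * b.lo, a.lo * b.hi, a.hi * b.lo, a.hi * b.hi };
    return { *std::min_element(std::begin(c), std::end(c)),
             *std::max_element(std::begin(c), std::end(c)) };
}

int main() {
    Range x{0, 255}, y{0, 255};     // two unsigned bytes read from disk
    Range sum = add(x, y);          // 0+0 .. 255+255
    Range product = mul(x, y);      // 0*0 .. 255*255
    std::cout << sum.lo << ".." << sum.hi << "  "
              << product.lo << ".." << product.hi << '\n';   // 0..510  0..65025
}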
The point of value tracking is to annotate conditional branches in the control flow graph for each function; so that you can use those annotations to help classify a program as something other than "indeterminate".
This is where things get tricky - improving the quality of a "practical halting problem solver" requires increasing the sophistication/complexity of the value tracking. I don't know if it's possible to write a perfect "halting problem solver" (that never returns "indeterminate") but I do know that it's possible to write a "halting problem solver" that is sufficient for almost all practical purposes (that returns "indeterminate" so rarely that nobody cares).
Related
So, I am new to online competitive programming and I came across some code where I am using an if-else statement inside a for loop. I want to increase the speed of the loop, and after doing some research I came across the break and continue statements.
So my question is: does using continue really increase the speed of the loop or not?
CODE :
int even_sum = 0;
for(int i=0;i<200;i++){
if(i%4 == 0){
even_sum +=i;
continue;
}else{
//do other stuff when sum of multiple of 4 is not calculated
}
}
In the specific code in the question, the code has identical meaning with and without the continue: in either case, after execution leaves even_sum += i;, it flows to the closing } of the for statement. Any compiler of even modest quality should treat the two options identically.
The intended purpose of continue is not to speed up code by requesting a jump the compiler is going to make anyway but to skip code that is undesired in the current loop iteration—it acts as if the remaining code had been enclosed in an else clause but may be more visually appealing and less disruptive to human perception of the code.
It is conceivable a very rudimentary compiler, or even a decent compiler but with optimization disabled, might generate a jump instruction for the continue and also a jump instruction for the “then” clause of the if statement to jump over the else clause. The latter would never be executed and would have no direct effect on program execution time, but it would increase the size of the program and thus could have indirect effects. This possibility is of negligible concern in typical modern environments, where you are unlikely to encounter such a rudimentary compiler.
No, there's no speed advantage when using continue here. Both of your codes are identical and even without optimizations they produce the same machine code.
However, sometimes continue can make your code a lot more efficient, if you have structured your loop in a specific way, e.g.
This:
int even_sum = 0;
for (int i = 0; i < 200; i++) {
if (i % 4 == 0) {
even_sum += i;
continue;
}
if (huge_computation_but_always_false_when_multiple_of_4(i)) {
// do stuff
}
}
is a lot more efficient, than:
int even_sum = 0;
for (int i = 0; i < 200; i++) {
if (i % 4 == 0) {
even_sum += i;
}
if (huge_computation_but_always_false_when_multiple_of_4(i)) {
// do stuff
}
}
because the former doesn't have to execute the huge_computation_but_always_false_when_multiple_of_4() function every time.
So even though both of these codes would always produce the same result (given that huge_computation_but_always_false_when_multiple_of_4() has no side effects), the first one, which uses continue, would be a lot faster.
I have an if statement that is currently never executed; however, if I print something to the screen inside it, the program takes over ten times longer to run than if only a variable is declared there. Doing a bit of research online, this seems to be some kind of branch prediction issue. Is there anything I can do to improve the program speed?
Basically both myTest and myTest_new return the same thing, except one is a macro and one is a function. I am just monitoring the time it takes for bitTest to execute: it executes in 3 seconds with just a declaration in the if statement, but takes over a minute when Serial.print is in the if statement, even though neither is executed.
void bitTest()
{
int count = 0;
Serial1.println("New Test");
int lastint = 0;
Serial1.println("int");
for (int index = -2147483647; index <= 2147483647; index+=1000) {
if (index <= 0 && lastint > 0) {
break;
}
lastint = index;
for (int num = 0; num <= 31; num++) {
++1000;
int vcr1 = myTest(index, num);
int vcr2 = myTest_new(index, num);
if (vcr1 != vcr2) {
Serial1.println("Test"); // leave this println() and it takes 300 seconds for the test to run
//int x = 0;
}
} // if (index)
} // for (index)
Serial1.print("count = ");
Serial1.println(count);
return;
}
It is much less likely to be caused by branch prediction (branch prediction shouldn't be influenced by what you do inside your code) than by the fact that
{
int x = 0;
}
simply does nothing, because the scope of x ends at }, so that the compiler simply ditches the whole if clause, including the check. Note that this is only possible because the expression that if checks has no side effects, and neither does the block that would get executed.
By the way, the code you showed would usually directly be "compiled away", because the compiler, at compile time, can determine whether the if clause could ever be executed, unless you explicitly tell the compiler to omit such safe optimizations. Hence, I kind of doubt your "10 times as slow" measurement. Either the code you're showing isn't the actual example on which you demonstrate this, or you should turn on compiler optimization prior to doing performance comparisons.
The reason why your program takes forever is that it's buggy:
for (int index = -2147483647; index <= 2147483647; index+=1000) {
Simply put: at a very large index close to the maximum integer value, a wrap-around (signed overflow) will occur. There's no "correct" way for your program to terminate; hence you invented your strange lastint > 0 check.
Now, fix up the loop (I mean, you're really just using every 1000th element, so why not simply loop a step counter from 0 to 2*2147483, as sketched below?).
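For example (a sketch, not a drop-in replacement for the code above), iterating a step counter keeps the loop variable well away from overflow:
// Roughly 4.3 million steps of 1000, over the same range as the original loop.
for (long long step = 0; step <= 2 * 2147483LL; ++step) {
    int index = static_cast<int>(step * 1000 - 2147483647LL);
    // ... run the myTest(index, num) vs myTest_new(index, num) comparison here ...
}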
++1000;
is illegal in C++, because you can't increment a numeric literal. This is very much WTF.
All in all, your program is a mess. Re-write it, and debug a clean, well-defined version of it.
Editor's clarification: When this was originally posted, there were two issues:
Test performance drops by a factor of three if seemingly inconsequential statement added
Time taken to complete the test appears to vary randomly
The second issue has been solved: the randomness only occurs when running under the debugger.
The remainder of this question should be understood as being about the first bullet point above, and in the context of running in VC++ 2010 Express's Release Mode with optimizations "Maximize Speed" and "favor fast code".
There are still some Comments in the comment section talking about the second point but they can now be disregarded.
I have a simulation where, if I add a simple if statement into the while loop that runs the actual simulation, the performance drops by about a factor of three (and I run a lot of calculations in the while loop: n-body gravity for the solar system, among other things) even though the if statement is almost never executed:
if (time - cb_last_orbital_update > 5000000)
{
cb_last_orbital_update = time;
}
with time and cb_last_orbital_update being both of type double and defined in the beginning of the main function, where this if statement is too. Usually there are computations I want to run there too, but it makes no difference if I delete them. The if statement as it is above has the same effect on the performance.
The variable time is the simulation time, it increases in 0.001 steps in the beginning so it takes a really long time until the if statement is executed for the first time (I also included printing a message to see if it is being executed, but it is not, or at least only when it's supposed to). Regardless, the performance drops by a factor of 3 even in the first minutes of the simulation when it hasn't been executed once yet. If I comment out the line
cb_last_orbital_update = time;
then it runs faster again, so it's not the check for
time - cb_last_orbital_update > 5000000
either, it's definitely the simple act of writing current simulation time into this variable.
Also, if I write the current time into another variable instead of cb_last_orbital_update, the performance does not drop. So this might be an issue with assigning a new value to a variable that is used to check if the "if" should be executed? These are all shots in the dark though.
Disclaimer: I am pretty new to programming, and sorry for all that text.
I am using Visual C++ 2010 Express, deactivating the stdafx.h precompiled header function didn't make a difference either.
EDIT: Basic structure of the program. Note that nowhere besides at the end of the while loop (time += time_interval;) is time changed. Also, cb_last_orbital_update has only 3 occurrences: Declaration / initialization, plus the two times in the if statement that is causing the problem.
int main(void)
{
...
double time = 0;
double time_interval = 0.001;
double cb_last_orbital_update = 0;
F_Rocket_Preset(time, time_interval, ...);
while(conditions)
{
Rocket[active].Stage[Rocket[active].r_stage].F_Update_Stage_Performance(time, time_interval, ...);
Rocket[active].F_Calculate_Aerodynamic_Variables(time);
Rocket[active].F_Calculate_Gravitational_Forces(cb_mu, cb_pos_d, time);
Rocket[active].F_Update_Rotation(time, time_interval, ...);
Rocket[active].F_Update_Position_Velocity(time_interval, time, ...);
Rocket[active].F_Calculate_Orbital_Elements(cb_mu);
F_Update_Celestial_Bodies(time, time_interval, ...);
if (time - cb_last_orbital_update > 5000000.0)
{
cb_last_orbital_update = time;
}
Rocket[active].F_Check_Apoapsis(time, time_interval);
Rocket[active].F_Status_Check(time, ...);
Rocket[active].F_Update_Mass (time_interval, time);
Rocket[active].F_Staging_Check (time, time_interval);
time += time_interval;
if (time > 3.1536E8)
{
std::cout << "\n\nBreak main loop! Sim Time: " << time << std::endl;
break;
}
}
...
}
EDIT 2:
Here is the difference in the assembly code. On the left is the fast code with the line
cb_last_orbital_update = time;
commented out; on the right is the slow code with the line.
EDIT 4:
So, I found a workaround that seems to work just fine so far:
int cb_orbit_update_counter = 1; // before while loop
if(time - cb_orbit_update_counter * 5E6 > 0)
{
cb_orbit_update_counter++;
}
EDIT 5:
While that workaround does work, it only works in combination with using __declspec(noinline). I just removed those from the function declarations again to see if that changes anything, and it does.
EDIT 6: Sorry, this is getting confusing. I tracked down the culprit for the lower performance when removing __declspec(noinline) to this function, which is executed inside the if:
__declspec(noinline) std::string F_Get_Body_Name(int r_body)
{
switch (r_body)
{
case 0:
{
return ("the Sun");
}
case 1:
{
return ("Mercury");
}
case 2:
{
return ("Venus");
}
case 3:
{
return ("Earth");
}
case 4:
{
return ("Mars");
}
case 5:
{
return ("Jupiter");
}
case 6:
{
return ("Saturn");
}
case 7:
{
return ("Uranus");
}
case 8:
{
return ("Neptune");
}
case 9:
{
return ("Pluto");
}
case 10:
{
return ("Ceres");
}
case 11:
{
return ("the Moon");
}
default:
{
return ("unnamed body");
}
}
}
The if also now does more than just increase the counter:
if(time - cb_orbit_update_counter * 1E7 > 0)
{
F_Update_Orbital_Elements_Of_Celestial_Bodies(args);
std::cout << F_Get_Body_Name(3) << " SMA: " << cb_sma[3] << "\tPos Earth: " << cb_pos_d[3][0] << " / " << cb_pos_d[3][1] << " / " << cb_pos_d[3][2] <<
"\tAlt: " << sqrt(pow(cb_pos_d[3][0] - cb_pos_d[0][0],2) + pow(cb_pos_d[3][1] - cb_pos_d[0][1],2) + pow(cb_pos_d[3][2] - cb_pos_d[0][2],2)) << std::endl;
std::cout << "Time: " << time << "\tcb_o_h[3]: " << cb_o_h[3] << std::endl;
cb_orbit_update_counter++;
}
If I remove __declspec(noinline) from the function F_Get_Body_Name alone, the code gets slower. Similarly, if I remove the call to this function or add __declspec(noinline) again, the code runs faster. All other functions still have __declspec(noinline).
EDIT 7:
So I changed the switch function to
const std::string cb_names[] = {"the Sun","Mercury","Venus","Earth","Mars","Jupiter","Saturn","Uranus","Neptune","Pluto","Ceres","the Moon","unnamed body"}; // global definition
const int cb_number = 12; // global definition
std::string F_Get_Body_Name(int r_body)
{
if (r_body >= 0 && r_body < cb_number)
{
return (cb_names[r_body]);
}
else
{
return (cb_names[cb_number]);
}
}
and also made another part of the code slimmer. The program now runs fast without any __declspec(noinline). So, as ElderBug suggested, was it an issue with the CPU instruction cache / the code getting too big?
I'd put my money on Intel's branch predictor. http://en.wikipedia.org/wiki/Branch_predictor
The processor assumes (time - cb_last_orbital_update > 5000000) to be false most of the time and loads up the execution pipeline accordingly.
Once the condition (time - cb_last_orbital_update > 5000000) comes true, the misprediction delay hits you. You may lose 10 to 20 cycles.
if (time - cb_last_orbital_update > 5000000)
{
cb_last_orbital_update = time;
}
Something is happening that you don't expect.
One candidate is some uninitialised variables hanging around somewhere, which have different values depending on the exact code that you are running. For example, you might have uninitialised memory that is sometimes a denormalised floating point number, and sometimes it's not.
I think it should be clear that your code doesn't do what you expect it to do. So try debugging your code, compile with all warnings enabled, make sure you use the same compiler options (optimised vs. non-optimised can easily be a factor 10). Check that you get the same results.
Especially when you say "it runs faster again (this doesn't always work though, but i can't see a pattern). Also worked with changing 5000000 to 5E6 once. It only runs fast once though, recompiling causes the performance to drop again without changing anything. One time it ran slower only after recompiling twice." it looks quite likely that you are using different compiler options.
I will try another guess. This is hypothetical, and would be mostly due to the compiler.
My guess is that you use a lot of floating point calculations, and the introduction and use of double values in your main makes the compiler run out of XMM registers (the floating point SSE registers). This forces the compiler to use memory instead of registers, and induces a lot of swapping between memory and registers, thus greatly reducing performance. This would be happening mainly because of the computation functions being inlined, since function calls preserve registers.
The solution would be to add __declspec(noinline) to ALL your computation functions declarations.
I suggest using the Microsoft Profile Guided Optimizer -- if the compiler is making the wrong assumption for this particular branch it will help, and it will in all likelihood improve speed for the rest of the code as well.
Workaround, try 2:
The code is now looking like this:
int cb_orbit_update_counter = 1; // before while loop
if(time - cb_orbit_update_counter * 5E6 > 0)
{
cb_orbit_update_counter++;
}
So far it runs fast, plus the code is being executed when it should, as far as I can tell. Again, it's only a workaround, but if this proves to work all around then I'm satisfied.
After some more testing, seems good.
My guess is that this is because the variable cb_last_orbital_update is otherwise read-only, so when you assign to it inside the if, it destroys some optimizations that the compiler has for read-only variables (e.g. perhaps it's now stored in memory instead of a register).
Something to try (although this might still not work) is to make a third variable that is initialized via cb_last_orbital_update and time depending on whether the condition is true, and use that one instead. Presumably, the compiler would now treat that variable as a constant, but I'm not sure.
I'm writing code that takes a number from a user and prints it back in letters as a string. I want to know which is better performance-wise: to have if statements, like
if (n < 100) {
// code for 2-digit numbers
} else if (n < 1000) {
// code for 3-digit numbers
} // etc..
or to put the number in a string and get its length, then work on it as a string.
The code is written in C++.
Of course if-else will be faster.
To compare two numbers you just compare them bitwise (there are different ways to do it but it's a very fast operation).
To get the length of the string you will need to make the string, put the data into it and compute the length somehow (there can be different ways of doing it too, the simplest being counting all the symbols). Of course it takes much more time.
On a simple example though you will not notice any difference. It often amazes me that people get concerned with such things (no offense). It will not make any difference for you if the code will execute in 0.003 seconds instead of 0.001 seconds really... You should make such low-level optimizations only after you know that this exact place is a bottleneck of your application, and when you are sure that you can increase the performance by a decent amount.
Until you measure and this really is a bottleneck, don't worry about performance.
That said, the following should be even faster (for readability, let's assume you use a type that ranges between 0 and 99999999):
if (n < 10000) {
// code for less or equal to 4 digits
if (n < 100)
{
//code for less or equal to 2 digits
if (n < 10)
return 1;
else
return 2;
}
else
{
//code for over 2 digits, but under or equal to 4
if (n>=1000)
return 4;
else
return 3;
}
} else {
//similar
} // etc..
Basically, it's a variation of binary search. Worst case, this will take O(log(n)) as opposed to O(n) - n being the maximum number of digits.
The string variant will be slower:
std::stringstream ss; // allocation, initialization ...
ss << 4711; // parsing, setting internal flags, ...
std::string str = ss.str(); // allocations, array copies ...
// cleaning up (compiler does it for you) ...
str.~string();
ss.~stringstream(); // destruction ...
The ... indicate there's more stuff happening.
A compact (good for cache) loop (good for branch prediction) might be what you want:
int num_digits (int value, int base=10) {
int num = 0;
while (value) {
value /= base;
++num;
}
return num;
}
int num_zeros (int value, int base=10) {
return num_digits(value, base) - 1;
}
Depending on circumstances, because it is cache and prediction friendly, this may be faster than solutions based on relational operators.
The templated variant enables the compiler to do some micro optimizations for your division:
template <int base=10>
int num_digits (int value) {
int num = 0;
while (value) {
value /= base;
++num;
}
return num;
}
The answers are good, but think a bit about relative times.
Even by the slowest method you can think of, the program can do it in some tiny fraction of a second, like maybe 100 microseconds.
Balance that against the fastest user you can imagine, who could type in the number in maybe 500 milliseconds, and who could read the output in another 500 milliseconds, before doing whatever comes next.
OK, the machine does essentially nothing for 1000 milliseconds, and in the middle it has to crunch like crazy for 100 microseconds because, after all, we don't want the user to think the program is slow ;-)
I believe (from some research reading) that counting down in for-loops is actually more efficient and faster at runtime. My full software code is C++.
I currently have this:
for (i=0; i<domain; ++i) {
my 'i' is an unsigned register int,
and 'domain' is an unsigned int
in the for-loop i is used for going through an array, e.g.
array[i] = do stuff
converting this to count down messes up the expected/correct output of my routine.
I can imagine the answer being quite trivial, but I can't get my head round it.
UPDATE: 'do stuff' does not depend on previous or later iteration. The calculations within the for-loop are independant for that iteration of i. (I hope that makes sense).
UPDATE: To achieve a runtime speedup with my for-loop, do I count down, and if so remove the unsigned part when declaring my int, or what other method?
Please help.
There is only one correct method of looping backwards using an unsigned counter:
for( i = n; i-- > 0; )
{
// Use i as normal here
}
There's a trick here: for the last loop iteration you will have i = 1 at the top of the loop, i-- > 0 passes because 1 > 0, then i = 0 in the loop body. On the next iteration i-- > 0 fails because i == 0, so it doesn't matter that the postfix decrement rolled the counter over.
Very non-obvious, I know.
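For example, walking an array backwards with an unsigned index using this idiom:
#include <cstddef>
#include <iostream>

int main() {
    int array[] = {1, 2, 3, 4, 5};
    std::size_t n = sizeof(array) / sizeof(array[0]);

    // i takes the values n-1, n-2, ..., 0 inside the body, then the loop stops.
    for (std::size_t i = n; i-- > 0; )
        std::cout << array[i] << ' ';   // prints 5 4 3 2 1
    std::cout << '\n';
}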
I'm guessing your backward for loop looks like this:
for (i = domain - 1; i >= 0; --i) {
In that case, because i is unsigned, it will always be greater than or equal to zero. When you decrement an unsigned variable that is equal to zero, it will wrap around to a very large number. The solution is either to make i signed, or change the condition in the for loop like this:
for (i = domain - 1; i >= 0 && i < domain; --i) {
Or count from domain to 1 rather than from domain - 1 to 0:
for (i = domain; i >= 1; --i) {
array[i - 1] = ...; // notice you have to subtract 1 from i inside the loop now
}
This is not an answer to your problem, because you don't seem to have a problem.
This kind of optimization is completely irrelevant and should be left to the compiler (if done at all).
Have you profiled your program to check that your for-loop is a bottleneck? If not, then you do not need to spend time worrying about this. Even more so, having "i" as a "register" int, as you write, makes no real sense from a performance standpoint.
Even without knowing your problem domain, I can guarantee you that both the reverse-looping technique and the "register" int counter will have negligible impact on your program's performance. Remember, "Premature optimization is the root of all evil".
That said, better spent optimization time would be on thinking about the overall program structure, data structures and algorithms used, resource utilization, etc.
Checking to see if a number is zero can be quicker or more efficient than a comparison. But this is the sort of micro-optimization you really shouldn't worry about - a few clock cycles will be greatly dwarfed by just about any other perf issue.
On x86:
dec eax
jnz Foo
Instead of:
inc eax
cmp eax, 15
jl Foo
It has nothing to do with counting up or down. What can be faster is counting toward zero. Michael's answer shows why — x86 gives you a comparison with zero as an implicit side effect of many instructions, so after you adjust your counter, you just branch based on the result instead of doing an explicit comparison. (Maybe other architectures do that, too; I don't know.)
Borland's Pascal compilers are notorious for performing that optimization. The compiler transforms this code:
for i := x to y do
foo(i);
into an internal representation more akin to this:
tmp := Succ(y - x);
i := x;
while tmp > 0 do begin
foo(i);
Inc(i);
Dec(tmp);
end;
(I say notorious not because the optimization affects the outcome of the loop, but because the debugger displays the counter variable incorrectly. When the programmer inspects i, the debugger may display the value of tmp instead, causing no end of confusion and panic for programmers who think their loops are running backward.)
The idea is that even with the extra Inc or Dec instruction, it's still a net win, in terms of running time, over doing an explicit comparison. Whether you can actually notice that difference is up for debate.
But note that the conversion is something the compiler would do automatically, based on whether it deemed the transformation worthwhile. The compiler is usually better at optimizing code than you are, so don't spend too much effort competing with it.
Anyway, you asked about C++, not Pascal. C++ "for" loops aren't quite as easy to apply that optimization to as Pascal "for" loops are because the bounds of Pascal's loops are always fully calculated before the loop runs, whereas C++ loops sometimes depend on the stopping condition and the loop contents. C++ compilers need to do some amount of static analysis to determine whether any given loop could fit the requirements for the kind of transformation Pascal loops qualify for unconditionally. If the C++ compiler does the analysis, then it could do a similar transformation.
There's nothing stopping you from writing your loops that way on your own:
for (unsigned i = 0, tmp = domain; tmp > 0; ++i, --tmp)
array[i] = do stuff
Doing that might make your code run faster. Like I said before, though, you probably won't notice. The bigger cost you pay by manually arranging your loops like that is that your code no longer follows established idioms. Your loop is a perfectly ordinary "for" loop, but it no longer looks like one — it has two variables, they're counting in opposite directions, and one of them isn't even used in the loop body — so anyone reading your code (including you, a week, a month, or a year from now when you've forgotten the "optimization" you were hoping to achieve) will need to spend extra effort proving to himself or herself that the loop is indeed an ordinary loop in disguise.
(Did you notice that my code above used unsigned variables with no danger of wrapping around at zero? Using two separate variables allows that.)
Three things to take away from all this:
Let the optimizer do its job; on the whole it's better at it than you are.
Make ordinary code look ordinary so that the special code doesn't have to compete to get attention from people reviewing, debugging, or maintaining it.
Don't do anything fancy in the name of performance until testing and profiling show it to be necessary.
If you have a decent compiler, it will optimize "counting up" just as effectively as "counting down". Just try a few benchmarks and you'll see.
So you "read" that couting down is more efficient? I find this very difficult to believe unless you show me some profiler results and the code. I can buy it under some circumstances, but in the general case, no. Seems to me like this is a classic case of premature optimization.
Your comment about "register int i" is also very telling. Nowadays, the compiler always knows better than you how to allocate registers. Don't bother using using the register keyword unless you have profiled your code.
When you're looping through data structures of any sort, cache misses have a far bigger impact than the direction you're going. Concern yourself with the bigger picture of memory layout and algorithm structure instead of trivial micro-optimisations.
You may try the following, which the compiler will optimize very efficiently:
#define for_range(_type, _param, _A1, _B1) \
for (_type _param = _A1, _finish = _B1,\
_step = static_cast<_type>(2*(((int)_finish)>(int)_param)-1),\
_stop = static_cast<_type>(((int)_finish)+(int)_step); _param != _stop; \
_param = static_cast<_type>(((int)_param)+(int)_step))
Now you can use it:
for_range (unsigned, i, 10,0)
{
cout << "backwards i: " << i << endl;
}
for_range (char, c, 'z','a')
{
cout << c << endl;
}
enum Count { zero, one, two, three };
for_range (Count, c, three, zero)
{
cout << "backwards: " << c << endl;
}
You may iterate in any direction:
for_range (Count, c, zero, three)
{
cout << "forward: " << c << endl;
}
The loop
for_range (unsigned,i,b,a)
{
// body of the loop
}
will produce the following code:
mov esi,b
L1:
; body of the loop
dec esi
cmp esi,a-1
jne L1
Hard to say with the information given, but... reverse your array and count down?
Jeremy Ruten rightly pointed out that using an unsigned loop counter is dangerous. It's also unnecessary, as far as I can tell.
Others have also pointed out the dangers of premature optimization. They're absolutely right.
With that said, here is a style I used when programming embedded systems many years ago, when every byte and every cycle did count for something. These forms were useful for me on the particular CPUs and compilers that I was using, but your mileage may vary.
// Start out pointing to the last elem in array
pointer_to_array_elem_type p = array + (domain - 1);
for (int i = domain; --i >= 0 ; ) {
*p-- = (... whatever ...)
}
This form takes advantage of the condition flag that is set on some processors after arithmetical operations -- on some architectures, the decrement and testing for the branch condition can be combined into a single instruction. Note that using predecrement (--i) is the key here -- using postdecrement (i--) would not have worked as well.
Alternatively,
// Start out pointing *beyond* the last elem in array
pointer_to_array_elem_type p = array + domain;
for ( ; p > array ; ) {
*(--p) = (... whatever ...)
}
This second form takes advantage of pointer (address) arithmetic. I rarely see the form (pointer - int) these days (for good reason), but the language guarantees that when you subtract an int from a pointer, the pointer is decremented by (int * sizeof (*pointer)).
I'll emphasize again that whether these forms are a win for you depends on the CPU and compiler that you're using. They served me well on Motorola 6809 and 68000 architectures.
In some later ARM cores, decrement-and-compare takes only a single instruction. This makes decrementing loops more efficient than incrementing ones.
I don't know why there isn't an increment-and-compare instruction as well.
I'm surprised that this post was voted -1 when it raises a real issue.
Everyone here is focusing on performance. There is actually a logical reason to iterate towards zero that can result in cleaner code.
Iterating starting from the last element is convenient when you delete invalid elements by swapping with the end of the array. For bad elements not adjacent to the end, we can swap them into the end position, decrease the end bound of the array, and keep iterating. If you were to iterate toward the end, then swapping with the end could result in swapping bad for bad. By iterating from the end to 0, we know that the element at the end of the array has already been proven valid for this iteration.
For further explanation...
If:
You delete bad elements by swapping with one end of the array and changing the array bounds to exclude the bad elements.
Then obviously:
You would swap with a good element i.e. one that has already been tested in this iteration.
So this implies:
If we iterate away from the variable bound then elements between the variable bound and the current iteration pointer have been proven good. Whether the iteration pointer gets ++ or -- doesn't matter. What matters is that we're iterating away from the variable bound so we know that the elements adjacent to it are good.
So finally:
Iterating towards 0 allows us to use only one variable to represent the array bounds. Whether this matters is a personal decision between you and your compiler.
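A minimal sketch of the pattern described above (the names and the "invalid means negative" rule are just for illustration):
#include <iostream>
#include <utility>
#include <vector>

int main() {
    std::vector<int> v = {3, -1, 7, -4, 9, -2};
    std::size_t end = v.size();             // one past the last element still considered valid

    for (std::size_t i = end; i-- > 0; ) {  // iterate towards 0
        if (v[i] < 0) {                     // "invalid" element
            std::swap(v[i], v[end - 1]);    // end-1 is either i itself or an already-checked element
            --end;                          // shrink the bound to exclude the bad element
        }
    }
    v.resize(end);                          // v == {3, 9, 7}; order is not preserved

    for (int x : v) std::cout << x << ' ';
    std::cout << '\n';
}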
What matters much more than whether you're increasing or decreasing your counter is whether or not you're going up memory or down memory. Most caches are optimized for going up memory, not down memory. Since memory access time is the bottleneck that most programs today face, this means that changing your program so that you go up memory can result in a performance boost even if this requires comparing your counter to a non-zero value. In some of my programs, I saw a significant improvement in performance by changing my code to go up memory instead of down it.
Skeptical? Here's the output that I got:
sum up = 705046256
sum down = 705046256
Ave. Up Memory = 4839 mus
Ave. Down Memory = 5552 mus
sum up = inf
sum down = inf
Ave. Up Memory = 18638 mus
Ave. Down Memory = 19053 mus
from running this program:
#include <chrono>
#include <cstddef>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
template<class Iterator, typename T>
void FillWithRandomNumbers(Iterator start, Iterator one_past_end, T a, T b) {
std::random_device rnd_device;
std::mt19937 generator(rnd_device());
std::uniform_int_distribution<T> dist(a, b);
for (auto it = start; it != one_past_end; it++)
*it = dist(generator);
return ;
}
template<class Iterator>
void FillWithRandomNumbers(Iterator start, Iterator one_past_end, double a, double b) {
std::random_device rnd_device;
std::mt19937_64 generator(rnd_device());
std::uniform_real_distribution<double> dist(a, b);
for (auto it = start; it != one_past_end; it++)
*it = dist(generator);
return ;
}
template<class RAI, class T>
inline void sum_abs_up(RAI first, RAI one_past_last, T &total) {
T sum = 0;
auto it = first;
do {
sum += *it;
it++;
} while (it != one_past_last);
total += sum;
}
template<class RAI, class T>
inline void sum_abs_down(RAI first, RAI one_past_last, T &total) {
T sum = 0;
auto it = one_past_last;
do {
it--;
sum += *it;
} while (it != first);
total += sum;
}
template<class T> std::chrono::nanoseconds TimeDown(
std::vector<T> &vec, const std::vector<T> &vec_original,
std::size_t num_repititions, T &running_sum) {
std::chrono::nanoseconds total{0};
for (std::size_t i = 0; i < num_repititions; i++) {
auto start_time = std::chrono::high_resolution_clock::now();
sum_abs_down(vec.begin(), vec.end(), running_sum);
total += std::chrono::high_resolution_clock::now() - start_time;
vec = vec_original;
}
return total;
}
template<class T> std::chrono::nanoseconds TimeUp(
std::vector<T> &vec, const std::vector<T> &vec_original,
std::size_t num_repititions, T &running_sum) {
std::chrono::nanoseconds total{0};
for (std::size_t i = 0; i < num_repititions; i++) {
auto start_time = std::chrono::high_resolution_clock::now();
sum_abs_up(vec.begin(), vec.end(), running_sum);
total += std::chrono::high_resolution_clock::now() - start_time;
vec = vec_original;
}
return total;
}
int main() {
std::size_t num_repititions = 1 << 10;
{
typedef int ValueType;
auto lower = std::numeric_limits<ValueType>::min();
auto upper = std::numeric_limits<ValueType>::max();
std::vector<ValueType> vec(1 << 24);
FillWithRandomNumbers(vec.begin(), vec.end(), lower, upper);
const auto vec_original = vec;
ValueType sum_up = 0, sum_down = 0;
auto time_up = TimeUp(vec, vec_original, num_repititions, sum_up).count();
auto time_down = TimeDown(vec, vec_original, num_repititions, sum_down).count();
std::cout << "sum up = " << sum_up << '\n';
std::cout << "sum down = " << sum_down << '\n';
std::cout << "Ave. Up Memory = " << time_up/(num_repititions * 1000) << " mus\n";
std::cout << "Ave. Down Memory = "<< time_down/(num_repititions * 1000) << " mus"
<< std::endl;
}
{
typedef double ValueType;
auto lower = std::numeric_limits<ValueType>::min();
auto upper = std::numeric_limits<ValueType>::max();
std::vector<ValueType> vec(1 << 24);
FillWithRandomNumbers(vec.begin(), vec.end(), lower, upper);
const auto vec_original = vec;
ValueType sum_up = 0, sum_down = 0;
auto time_up = TimeUp(vec, vec_original, num_repititions, sum_up).count();
auto time_down = TimeDown(vec, vec_original, num_repititions, sum_down).count();
std::cout << "sum up = " << sum_up << '\n';
std::cout << "sum down = " << sum_down << '\n';
std::cout << "Ave. Up Memory = " << time_up/(num_repititions * 1000) << " mus\n";
std::cout << "Ave. Down Memory = "<< time_down/(num_repititions * 1000) << " mus"
<< std::endl;
}
return 0;
}
Both sum_abs_up and sum_abs_down do the same thing and are timed the same way, with the only difference being that sum_abs_up goes up memory while sum_abs_down goes down memory. I even pass vec by reference so that both functions access the same memory locations. Nevertheless, sum_abs_up is consistently faster than sum_abs_down. Give it a run yourself (I compiled it with g++ -O3).
FYI vec_original is there for experimentation, to make it easy for me to change sum_abs_up and sum_abs_down in a way that makes them alter vec while not allowing these changes to affect future timings.
It's important to note how tight the loop that I'm timing is. If a loop's body is large then it likely won't matter whether its iterator goes up or down memory since the time it takes to execute the loop's body will likely completely dominate. Also, it's important to mention that with some rare loops, going down memory is sometimes faster than going up it. But even with such loops it's rarely ever the case that going up was always slower than going down (unlike loops that go up memory, which are very often always faster than the equivalent down-memory loops; a small handful of times they were even 40+% faster).
The point is, as a rule of thumb, if you have the option, if the loop's body is small, and if there's little difference between having your loop go up memory instead of down it, then you should go up memory.