I'm trying to write a Miller-Rabin test. I found a few implementations, such as:
https://www.sanfoundry.com/cpp-program-implement-miller-rabin-primality-test/
https://www.geeksforgeeks.org/primality-test-set-3-miller-rabin/
Of course all of these codes work for 252097800623 (which is a prime number), but this is because they parse it into an int. When I changed all the ints to long long in these codes, they now return NO. I also wrote my own code based on another article, and it worked when I was testing it with small numbers like 11, 101, 17 and even 1000000007, but it crashed on greater numbers like 252097800623. I want to write a program that works for all integers from 1 to 10^18.
EDIT
Here is the modified code from the 1st link:
/*
* C++ Program to Implement Miller-Rabin Primality Test
*/
#include <iostream>
#include <cstring>
#include <cstdlib>
using namespace std;
/*
* calculates (a * b) % c taking into account that a * b might overflow
*/
long long mulmod(long long a, long long b, long long mod)
{
long long x = 0,y = a % mod;
while (b > 0)
{
if (b % 2 == 1)
{
x = (x + y) % mod;
}
y = (y * 2) % mod;
b /= 2;
}
return x % mod;
}
/*
* modular exponentiation
*/
long long modulo(long long base, long long exponent, long long mod)
{
long long x = 1;
long long y = base;
while (exponent > 0)
{
if (exponent % 2 == 1)
x = (x * y) % mod;
y = (y * y) % mod;
exponent = exponent / 2;
}
return x % mod;
}
/*
* Miller-Rabin primality test, iteration signifies the accuracy
*/
bool Miller(long long p,long long iteration)
{
if (p < 2)
{
return false;
}
if (p != 2 && p % 2==0)
{
return false;
}
long long s = p - 1;
while (s % 2 == 0)
{
s /= 2;
}
for (long long i = 0; i < iteration; i++)
{
long long a = rand() % (p - 1) + 1, temp = s;
long long mod = modulo(a, temp, p);
while (temp != p - 1 && mod != 1 && mod != p - 1)
{
mod = mulmod(mod, mod, p);
temp *= 2;
}
if (mod != p - 1 && temp % 2 == 0)
{
return false;
}
}
return true;
}
//Main
int main()
{
long long iteration = 5;
long long num;
cout<<"Enter long longeger to test primality: ";
cin>>num;
if (Miller(num, iteration))
cout<<num<<" is prime"<<endl;
else
cout<<num<<" is not prime"<<endl;
return 0;
}
The code in the first link, which you replicated in your question after replacing the (bad) macro ll with long long (although this produces exactly the same preprocessed code) and all int with long long, is already broken for large values; see the compiler explorer link here. I forced the compiler to evaluate the Miller function for 252097800623 at compile time, replacing the call to rand() with the one fixed random number 123456.
As you can see, the compiler is telling me that it cannot do so, because there are integer overflows in the program. In particular:
<source>:133:17: error: static_assert expression is not an integral constant expression
static_assert(Miller(num, iteration));
^~~~~~~~~~~~~~~~~~~~~~
<source>:62:12: note: value 232307310937188460801 is outside the range of representable values of type 'long long'
y = (y * y) % mod;
^
<source>:104:14: note: in call to 'modulo(123457, 63024450155, 252097800623)'
ll mod = modulo(a, temp, p);
^
<source>:133:17: note: in call to 'Miller(252097800623, 5)'
static_assert(Miller(num, iteration));
So long long is simply too small to handle inputs that large with this algorithm.
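One common way around this for inputs up to about 10^18 (for illustration only; this sketch assumes a compiler such as GCC or Clang that provides the non-standard unsigned __int128 type) is to do the multiplications in a 128-bit intermediate:

#include <cstdint>

// (a * b) % mod without intermediate overflow, via a 128-bit product.
uint64_t mulmod128(uint64_t a, uint64_t b, uint64_t mod)
{
    return (unsigned __int128)a * b % mod;
}

// Modular exponentiation built on the overflow-safe multiply.
uint64_t powmod(uint64_t base, uint64_t exponent, uint64_t mod)
{
    uint64_t result = 1;
    base %= mod;
    while (exponent > 0)
    {
        if (exponent & 1)
            result = mulmod128(result, base, mod);
        base = mulmod128(base, base, mod);
        exponent >>= 1;
    }
    return result;
}

Alternatively, the question's existing mulmod() (multiplication by repeated addition) can simply be reused inside modulo() for the x * y and y * y products; that stays within long long at the cost of a slower inner loop.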
For input/output in C++ I have only ever used scanf/printf and cin/cout. Recently I came across this code, which does its I/O in an unusual fashion.
Also note that this I/O method makes the code run extremely fast: it uses almost the same algorithm as most of the other solutions, yet it executes in much less time. Why is this I/O so fast, and how does it work in general?
edit: code
#include <bits/stdtr1c++.h>
#define MAXN 200010
#define MAXQ 200010
#define MAXV 1000010
#define clr(ar) memset(ar, 0, sizeof(ar))
#define read() freopen("lol.txt", "r", stdin)
using namespace std;
const int block_size = 633;
long long res, out[MAXQ]; int n, q, ar[MAXN], val[MAXN], freq[MAXV];
namespace fastio{
int ptr, ye;
char temp[25], str[8333667], out[8333669];
void init(){
ptr = 0, ye = 0;
fread(str, 1, 8333667, stdin);
}
inline int number(){
int i, j, val = 0;
while (str[ptr] < 45 || str[ptr] > 57) ptr++;
while (str[ptr] > 47 && str[ptr] < 58) val = (val * 10) + (str[ptr++] - 48);
return val;
}
inline void convert(long long x){
int i, d = 0;
for (; ;){
temp[++d] = (x % 10) + 48;
x /= 10;
if (!x) break;
}
for (i = d; i; i--) out[ye++] = temp[i];
out[ye++] = 10;
}
inline void print(){
fwrite(out, 1, ye, stdout);
} }
struct query{
int l, r, d, i;
inline query() {}
inline query(int a, int b, int c){
i = c;
l = a, r = b, d = l / block_size;
}
inline bool operator < (const query& other) const{
if (d != other.d) return (d < other.d);
return ((d & 1) ? (r < other.r) : (r > other.r));
} } Q[MAXQ];
void compress(int n, int* in, int* out){
unordered_map <int, int> mp;
for (int i = 0; i < n; i++) out[i] = mp.emplace(in[i], mp.size()).first->second; }
inline void insert(int i){
res += (long long)val[i] * (1 + 2 * freq[ar[i]]++); }
inline void erase(int i){
res -= (long long)val[i] * (1 + 2 * --freq[ar[i]]); }
inline void run(){
sort(Q, Q + q);
int i, l, r, a = 0, b = 0;
for (res = 0, i = 0; i < q; i++){
l = Q[i].l, r = Q[i].r;
while (a > l) insert(--a);
while (b <= r) insert(b++);
while (a < l) erase(a++);
while (b > (r + 1)) erase(--b);
out[Q[i].i] = res;
}
for (i = 0; i < q; i++) fastio::convert(out[i]); }
int main(){
fastio::init();
int n, i, j, k, a, b;
n = fastio::number();
q = fastio::number();
for (i = 0; i < n; i++) val[i] = fastio::number();
compress(n, val, ar);
for (i = 0; i < q; i++){
a = fastio::number();
b = fastio::number();
Q[i] = query(a - 1, b - 1, i);
}
run();
fastio::print();
return 0; }
This solution, http://codeforces.com/contest/86/submission/22526466 (624 ms, 32 MB of RAM used), uses a single fread and manual parsing of the numbers from memory (so it uses more memory); many other solutions are slower and use scanf (http://codeforces.com/contest/86/submission/27561563, 1620 ms, 9 MB) or C++ iostream cin (http://codeforces.com/contest/86/submission/27558562, 3118 ms, 15 MB). Not all of the difference between the solutions comes from input-output and parsing (the solution methods differ too), but some of it does.
fread(str, 1, 8333667, stdin);
This code uses a single fread library call to read up to 8 MB, which is the whole file. The file may contain up to 2 (n, t) + 200000 (a_i) + 2*200000 (l, r) numbers of 6-7 digits, with or without line breaks, separated by one (?) space, so around 8 chars max per number (6 or 7 for the number itself, as 1000000 is allowed too, plus 1 space or \n); the maximum input file size is roughly 0.6 M * 8 bytes =~ 5 MB.
inline int number(){
int i, j, val = 0;
while (str[ptr] < 45 || str[ptr] > 57) ptr++;
while (str[ptr] > 47 && str[ptr] < 58) val = (val * 10) + (str[ptr++] - 48);
return val;
}
The code then parses decimal int numbers manually. According to the ASCII table (http://www.asciitable.com/), codes 48...57 are the decimal digits '0'...'9' (second while loop), so we can just subtract 48 from the character code to get the digit; the partially read val is multiplied by 10 and the current digit is added. The chr < 45 || chr > 57 condition in the first while loop skips non-digit characters from the input, but it is not quite correct: the loop also stops at codes 45, 46, 47 = '-', '.', '/', which the digit loop then cannot consume, so after one of these characters no further number will ever be read.
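For illustration, a small sketch of my own (using the same str/ptr variables from the fastio namespace) of a stricter variant that stops only at actual digit codes, so a stray '-', '.' or '/' cannot wedge the parser:

inline int number()
{
    // Skip anything that is not '0'..'9' (codes 48..57); stop at end of buffer.
    while (str[ptr] && (str[ptr] < '0' || str[ptr] > '9')) ptr++;
    int val = 0;
    while (str[ptr] >= '0' && str[ptr] <= '9')
        val = val * 10 + (str[ptr++] - '0');
    return val;
}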
n = fastio::number();
q = fastio::number();
for (i = 0; i < n; i++) val[i] = fastio::number();
for (i = 0; i < q; i++){
a = fastio::number();
b = fastio::number();
The actual reading uses this fastio::number() method; other solutions call scanf or the iostream operator >> in a loop:
for (int i = 0; i < N; i++) {
scanf("%d", &(arr[i]));
add(arr[i]);
}
or
for (int i = 1; i <= n; ++i)
cin >> a[i];
Both methods are more universal, but they make library calls, which read some characters from an internal buffer (something like 4 KB) or issue an OS syscall to refill it, and every function does many checks and has error reporting: for every number in the input, scanf reparses the same format string given as its first argument and does all the logic described in POSIX (http://pubs.opengroup.org/onlinepubs/7908799/xsh/fscanf.html) plus all the error checking. C++ iostream has no format string, but it is still more universal: https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/istream.tcc#L156 'operator>>(int& __n)'.
So standard library functions have more logic inside, more calls and more branching; they are also more universal and much safer, and they should be used in real-world programming. This "sport programming" contest still allows users to solve the task with standard library functions, which are fast enough if you can come up with the algorithm. The authors of a task are required to write several solutions with standard I/O functions to check that the time limit of the task is correct and the task can be solved. (The TopCoder system is better with I/O: you don't implement I/O yourself, the data is already passed into your function in language structs/collections.)
Sometimes tasks in sport programming have tight memory limits: the input file is several times bigger than the allowed memory usage, so the programmer can't read the whole file into memory. For example: read a single very long number of 20 million digits from the input file and add 1 to it, with a memory limit of 2 MB. You can't keep the full input number in memory, and reading it in chunks in the backward direction is very hard to do correctly; so you have to forget the standard method of addition (columnar addition) and instead build a finite-state machine whose state counts sequences of 9s, as sketched below.
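A rough sketch of that streaming idea (my own illustration under the stated constraints, not a reference solution): hold back only the most recent non-9 digit plus a count of the 9s after it, and resolve the carry at end of input.

#include <cstdio>

// Add 1 to a decimal number of arbitrary length read from stdin,
// using O(1) state: one held digit and a counter of trailing 9s.
int main()
{
    int held = -1;        // most recent non-9 digit (-1 = none seen yet)
    long long nines = 0;  // number of consecutive 9s seen after 'held'
    int c;
    while ((c = getchar()) >= '0' && c <= '9')
    {
        if (c == '9') { nines++; continue; }
        // A non-9 digit arrived: everything held so far can no longer be
        // affected by the final +1, so it is safe to output it now.
        if (held >= 0) putchar('0' + held);
        while (nines > 0) { putchar('9'); nines--; }
        held = c - '0';
    }
    // End of input: the +1 lands on 'held'; the trailing 9s become 0s.
    if (held >= 0) putchar('0' + held + 1);
    else putchar('1');    // the whole number consisted of 9s
    while (nines > 0) { putchar('0'); nines--; }
    putchar('\n');
    return 0;
}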
I have made a recursive function in C++ which deals with very large integers.
long long int findfirst(int level)
{
if(level==1)
return 1;
else if(level%2==0)
return (2*findfirst(--level));
else
return (2*findfirst(--level)-1);
}
When the input variable (level) is high, it reaches the limit of long long int and gives me wrong output.
I want to print (output % mod), where mod is 10^9 + 7 (^ is power).
int main()
{
long long int first = findfirst(143)%1000000007;
cout << first;
}
It prints -194114669.
Normally, online judge problems don't require the use of large integers (normally meaning almost always); if your solution needs large integers, it is probably not the best way to solve the problem.
Some notes about modular arithmetic:
if a1 ≡ b1 (mod n) and a2 ≡ b2 (mod n), then:
a1 + a2 ≡ b1 + b2 (mod n)
a1 - a2 ≡ b1 - b2 (mod n)
a1 * a2 ≡ b1 * b2 (mod n)
This means that the modulo can be pushed into sub-expressions: (a + b * c) mod n can be calculated as (((b mod n) * (c mod n)) mod n + (a mod n)) mod n. I know there are a lot of parentheses and sub-expressions, but that is to avoid integer overflow as much as we can.
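For instance (a quick illustration with arbitrary constants), reducing after every step keeps all intermediates well inside long long:

#include <iostream>

int main()
{
    const long long n = 1000000007;
    long long a = 123456789012345LL % n;
    long long b = 987654321098765LL % n;
    long long c = 555555555555555LL % n;
    // (a + b * c) mod n: every operand is < n, so b * c < n^2 still fits in long long
    long long result = ((b * c) % n + a) % n;
    std::cout << result << '\n';
    return 0;
}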
As far as I understand your program, you don't need recursion at all:
#include <iostream>
using namespace std;
const long long int mod_value = 1000000007;
long long int findfirst(int level) {
long long int res = 1;
for (int lev = 1; lev <= level; lev++) {
if (lev % 2 == 0)
res = (2*res) % mod_value;
else
res = (2*res - 1) % mod_value;
}
return res;
}
int main() {
for (int i = 1; i < 143; i++) {
cout << findfirst(i) << endl;
}
return 0;
}
If you need to use recursion, modify your solution to:
long long int findfirst(int level) {
if (level == 1)
return 1;
else if (level % 2 == 0)
return (2 * findfirst(--level)) % mod_value;
else
return (2 * findfirst(--level) - 1) % mod_value;
}
Where mod_value is the same as before.
Please make a good study of modular arithmetic and apply it in the following online challenge (the reward of discovering the solution yourself is too great to let it go). Most online challenges have a mathematical background.
If the problem is (as you say) that it overflows long long int, then use an arbitrary-precision integer library. Examples are here.
As mentioned in the title, I'm looking for something that can give me more performance than atoi. Presently, the fastest way I know is
atoi(mystring.c_str())
Finally, I would prefer a solution that doesn't rely on Boost. Does anybody have good performance tricks for doing this?
Additional Information: int will not exceed 2 billion, it is always positive, the string has no decimal places in it.
I experimented with solutions using lookup tables, but found them fraught with issues, and actually not very fast. The fastest solution turned out to be the least imaginative:
int fast_atoi( const char * str )
{
int val = 0;
while( *str ) {
val = val*10 + (*str++ - '0');
}
return val;
}
Running a benchmark with a million randomly generated strings:
fast_atoi : 0.0097 seconds
atoi : 0.0414 seconds
To be fair, I also tested this function by forcing the compiler not to inline it. The results were still good:
fast_atoi : 0.0104 seconds
atoi : 0.0426 seconds
Provided your data conforms to the requirements of the fast_atoi function, that is pretty reasonable performance. The requirements are:
Input string contains only numeric characters, or is empty
Input string represents a number from 0 up to INT_MAX
atoi can be improved upon significantly, given certain assumptions. This was demonstrated powerfully in a presentation by Andrei Alexandrescu at the C++ and Beyond 2012 conference. His replacement used loop unrolling and ALU parallelism to achieve an orders-of-magnitude performance improvement. I don't have his materials, but this link uses a similar technique: http://tombarta.wordpress.com/2008/04/23/specializing-atoi/
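For illustration only, a rough sketch of the unrolling idea (this is not Alexandrescu's code; it assumes a digits-only string): consuming two characters per iteration roughly halves the serial multiply-add chain on the accumulator.

// Sketch: convert a digits-only string two characters at a time.
int atoi_unrolled2(const char* p)
{
    int val = 0;
    while (p[0] && p[1]) {
        val = val * 100 + (p[0] - '0') * 10 + (p[1] - '0');
        p += 2;
    }
    if (p[0])                       // odd digit count: one character left over
        val = val * 10 + (p[0] - '0');
    return val;
}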
This page compares the conversion speed of different string->int functions using different compilers. The naive function, which does no error checking, is roughly twice as fast as atoi(), according to the results presented.
// Taken from http://tinodidriksen.com/uploads/code/cpp/speed-string-to-int.cpp
int naive(const char *p) {
int x = 0;
bool neg = false;
if (*p == '-') {
neg = true;
++p;
}
while (*p >= '0' && *p <= '9') {
x = (x*10) + (*p - '0');
++p;
}
if (neg) {
x = -x;
}
return x;
}
it is always positive
Remove the negative checks in the above code for a micro optimization.
If you can guarantee the string will not have anything but numeric characters, you can micro optimize further by changing the loop
while (*p >= '0' && *p <= '9') {
to
while (*p != '\0' ) {
Which leaves you with
unsigned int naive(const char *p) {
unsigned int x = 0;
while (*p != '\0') {
x = (x*10) + (*p - '0');
++p;
}
return x;
}
Quite a few of the code examples here are quite complex and do unnecessary work, meaning the code could be slimmer and faster.
Conversion loops are often written to do three different things with each character:
bail out if it is the end-of-string character
bail out if it is not a digit
convert it from its code point to the actual digit value
First observation: there is no need to check for the end-of-string character separately, since it is not a digit. Hence the check for 'digitness' covers the EOS condition implicitly.
Second observation: double conditions for range testing as in (c >= '0' && c <= '9') can be converted to a single test condition by using an unsigned type and anchoring the range at zero; that way there can be no unwanted values below the beginning of the range, all unwanted values are mapped to the range above the upper limit: (uint8_t(c - '0') <= 9)
It just so happens that c - '0' needs to be computed here anyway...
Hence the inner conversion loop can be slimmed to
uint64_t n = digit_value(*p);
unsigned d;
while ((d = digit_value(*++p)) <= 9)
{
n = n * 10 + d;
}
The code here is called with the precondition that p be pointing at a digit, which is why the first digit is extracted without further ado (which also avoids a superfluous MUL).
That precondition is less outlandish than might appear at first, since p pointing at a digit is the reason why this code is called by the parser in the first place. In my code the whole shebang looks like this (assertions and other production-quality noise elided):
unsigned digit_value (char c)
{
return unsigned(c - '0');
}
bool is_digit (char c)
{
return digit_value(c) <= 9;
}
uint64_t extract_uint64 (char const **read_ptr)
{
char const *p = *read_ptr;
uint64_t n = digit_value(*p);
unsigned d;
while ((d = digit_value(*++p)) <= 9)
{
n = n * 10 + d;
}
*read_ptr = p;
return n;
}
The first call to digit_value() is often elided by the compiler, if the code gets inlined and the calling code has already computed that value by calling is_digit().
n * 10 happens to be faster than manual shifting (e.g. n = (n << 3) + (n << 1) + d), at least on my machine with gcc 4.8.1 and VC++ 2013. My guess is that both compilers use LEA with index scaling for adding up to three values in one go and scaling one of them by 2, 4, or 8.
In any case that's exactly how it should be: we write nice clean code in separate functions and express the desired logic (n * 10, x % CHAR_BIT, whatever) and the compiler converts it to shifting, masking, LEAing and so on, inlines everything into the big bad parser loop and takes care of all the required messiness under the hood to make things fast. We don't even have to stick inline in front of everything anymore. If anything then we have to do the opposite, by using __declspec(noinline) judiciously when compilers get over-eager.
I'm using the above code in a program that reads billions of numbers from text files and pipes; it converts 115 million uints per second if the length is 9..10 digits, and 60 million/s for length 19..20 digits (gcc 4.8.1). That's more than ten times as fast as strtoull() (and just barely enough for my purposes, but I digress...). That's the timing for converting text blobs containing 10 million numbers each (100..200 MB), meaning that memory timings make these numbers appear a bit worse than they would be in a synthetic benchmark running from cache.
Paddy's implementation of fast_atoi is faster than atoi - without a shadow of a doubt - but it works only for unsigned integers.
Below, I put a modified version of Paddy's fast_atoi that likewise allows only unsigned integers, but speeds conversion up even more by replacing the costly * operation with shifts and additions:
unsigned int fast_atou(const char *str)
{
unsigned int val = 0;
while(*str) {
val = (val << 1) + (val << 3) + *(str++) - 48;
}
return val;
}
Here I put the complete version of fast_atoi() that I'm using sometimes, which converts signed integers as well:
int fast_atoi(const char *buff)
{
int c = 0, sign = 0, x = 0;
const char *p = buff;
for(c = *(p++); (c < 48 || c > 57); c = *(p++)) {if (c == 45) {sign = 1; c = *(p++); break;}}; // eat whitespaces and check sign
for(; c > 47 && c < 58; c = *(p++)) x = (x << 1) + (x << 3) + c - 48;
return sign ? -x : x;
}
Here's the entirety of the atoi function in gcc:
long atoi(const char *str)
{
long num = 0;
int neg = 0;
while (isspace(*str)) str++;
if (*str == '-')
{
neg=1;
str++;
}
while (isdigit(*str))
{
num = 10*num + (*str - '0');
str++;
}
if (neg)
num = -num;
return num;
}
The whitespace and negative check are superfluous in your case, but also only use nanoseconds.
isdigit is almost certainly inlined, so that's not costing you any time.
I really don't see room for improvement here.
A faster conversion function, for positive integers only, without error checking.
Multiplication is slower than a sum and a shift, so replace the multiply with shifts:
int fast_atoi( const char * str )
{
int val = 0;
while( *str ) {
val = (val << 3) + (val << 1) + (*str++ - '0');
}
return val;
}
I did a quick benchmark of the different functions given here + some extras, and I converted them to int64_t by default. Compiler = MSVC.
Here are the results (left = normal time, right = time with overhead deduction):
atoi : 153283912 ns => 1.000x : 106745800 ns => 1.000x
atoll : 174446125 ns => 0.879x : 127908013 ns => 0.835x
std::stoll : 358193237 ns => 0.428x : 311655125 ns => 0.343x
std::stoull : 354171912 ns => 0.433x : 307633800 ns => 0.347x
-----------------------------------------------------------------
fast_null : 46538112 ns => 3.294x : 0 ns => infx (overhead estimation)
fast_atou : 92299625 ns => 1.661x : 45761513 ns => 2.333x (#soerium)
FastAtoiBitShift: 93275637 ns => 1.643x : 46737525 ns => 2.284x (#hamSh)
FastAtoiMul10 : 93260987 ns => 1.644x : 46722875 ns => 2.285x (#hamSh but with *10)
FastAtoiCompare : 86691962 ns => 1.768x : 40153850 ns => 2.658x (#DarthGizka)
FastAtoiCompareu: 86960900 ns => 1.763x : 40422788 ns => 2.641x (#DarthGizka + uint)
-----------------------------------------------------------------
FastAtoi32 : 92779375 ns => 1.652x : 46241263 ns => 2.308x (handle the - sign)
FastAtoi32u : 86577312 ns => 1.770x : 40039200 ns => 2.666x (no sign)
FastAtoi32uu : 87298600 ns => 1.756x : 40760488 ns => 2.619x (no sign + uint)
FastAtoi64 : 93693575 ns => 1.636x : 47155463 ns => 2.264x
FastAtoi64u : 86846912 ns => 1.765x : 40308800 ns => 2.648x
FastAtoi64uu : 86890537 ns => 1.764x : 40352425 ns => 2.645x
FastAtoiDouble : 90126762 ns => 1.701x : 43588650 ns => 2.449x (only handle int)
FastAtoiFloat : 92062775 ns => 1.665x : 45524663 ns => 2.345x (same)
DarthGizka's code is the fastest and has the advantage of stopping when the char is not a digit.
Also, the bitshifting "optimization" is a tiny bit slower than just doing * 10.
The benchmark runs each algorithm with 10 million iterations on a pseudo-random string, to limit the benefit of branch prediction as much as possible, and then re-runs everything 15 more times. For each algorithm, the 4 slowest and 4 fastest times are discarded, and the result given is the average of the 8 median times. This provides a lot of stability. Also, I run fast_null in order to estimate the overhead of the benchmark (loop + string changes + function call), and this value is then deducted in the second set of numbers.
Here is the code for the functions:
int64_t fast_null(const char* str) { return (str[0] - '0') + (str[1] - '0'); }
int64_t fast_atou(const char* str)
{
int64_t val = 0;
while (*str) val = (val << 1) + (val << 3) + *(str++) - 48;
return val;
}
int64_t FastAtoiBitShift(const char* str)
{
int64_t val = 0;
while (*str) val = (val << 3) + (val << 1) + (*str++ - '0');
return val;
}
int64_t FastAtoiMul10(const char* str)
{
int64_t val = 0;
while (*str) val = val * 10 + (*str++ - '0');
return val;
}
int64_t FastAtoiCompare(const char* str)
{
int64_t val = 0;
uint8_t x;
while ((x = uint8_t(*str++ - '0')) <= 9) val = val * 10 + x;
return val;
}
uint64_t FastAtoiCompareu(const char* str)
{
uint64_t val = 0;
uint8_t x;
while ((x = uint8_t(*str++ - '0')) <= 9) val = val * 10 + x;
return val;
}
int32_t FastAtoi32(const char* str)
{
int32_t val = 0;
int sign = 0;
if (*str == '-')
{
sign = 1;
++str;
}
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
return sign ? -val : val;
}
int32_t FastAtoi32u(const char* str)
{
int32_t val = 0;
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
return val;
}
uint32_t FastAtoi32uu(const char* str)
{
uint32_t val = 0;
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10u + digit;
return val;
}
int64_t FastAtoi64(const char* str)
{
int64_t val = 0;
int sign = 0;
if (*str == '-')
{
sign = 1;
++str;
}
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
return sign ? -val : val;
}
int64_t FastAtoi64u(const char* str)
{
int64_t val = 0;
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
return val;
}
uint64_t FastAtoi64uu(const char* str)
{
uint64_t val = 0;
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10u + digit;
return val;
}
float FastAtoiFloat(const char* str)
{
float val = 0;
uint8_t x;
while ((x = uint8_t(*str++ - '0')) <= 9) val = val * 10.0f + x;
return val;
}
double FastAtoiDouble(const char* str)
{
double val = 0;
uint8_t x;
while ((x = uint8_t(*str++ - '0')) <= 9) val = val * 10.0 + x;
return val;
}
And the benchmark code I used, just in case...
void Benchmark()
{
std::map<std::string, std::vector<int64_t>> funcTimes;
std::map<std::string, std::vector<int64_t>> funcTotals;
std::map<std::string, int64_t> funcFinals;
#define BENCH_ATOI(func) \
do \
{ \
auto start = NowNs(); \
int64_t z = 0; \
char string[] = "000001987"; \
for (int i = 1e7; i >= 0; --i) \
{ \
string[0] = '0' + (i + 0) % 10; \
string[1] = '0' + (i + 1) % 10; \
string[2] = '0' + (i + 3) % 10; \
string[3] = '0' + (i + 5) % 10; \
string[4] = '0' + (i + 9) % 10; \
z += func(string); \
} \
auto elapsed = NowNs() - start; \
funcTimes[#func].push_back(elapsed); \
funcTotals[#func].push_back(z); \
} \
while (0)
for (int i = 0; i < 16; ++i)
{
BENCH_ATOI(atoi);
BENCH_ATOI(atoll);
BENCH_ATOI(std::stoll);
BENCH_ATOI(std::stoull);
//
BENCH_ATOI(fast_null);
BENCH_ATOI(fast_atou);
BENCH_ATOI(FastAtoiBitShift);
BENCH_ATOI(FastAtoiMul10);
BENCH_ATOI(FastAtoiCompare);
BENCH_ATOI(FastAtoiCompareu);
//
BENCH_ATOI(FastAtoi32);
BENCH_ATOI(FastAtoi32u);
BENCH_ATOI(FastAtoi32uu);
BENCH_ATOI(FastAtoi64);
BENCH_ATOI(FastAtoi64u);
BENCH_ATOI(FastAtoi64uu);
BENCH_ATOI(FastAtoiFloat);
BENCH_ATOI(FastAtoiDouble);
}
for (auto& [func, times] : funcTimes)
{
std::sort(times.begin(), times.end(), [](const auto& a, const auto& b) { return a < b; });
fmt::print("{:<16}: {}\n", func, funcTotals[func][0]);
int64_t total = 0;
for (int i = 4; i <= 11; ++i) total += times[i];
total /= 8;
funcFinals[func] = total;
}
const auto base = funcFinals["atoi"];
const auto overhead = funcFinals["fast_null"];
for (const auto& [func, final] : funcFinals)
fmt::print("{:<16}: {:>9} ns => {:.3f}x : {:>9} ns => {:.3f}x\n", func, final, base * 1.0 / final, final - overhead, (base - overhead) * 1.0 / (final - overhead));
}
Why not use a stringstream? I'm not sure of its particular overhead, but you could define:
int myInt;
string myString = "1561";
stringstream ss(myString);
ss >> myInt;
Of course, you'd need to
#include <sstream>
The only definitive answer comes from checking with your compiler and your real data.
Something I'd try (even though it uses memory accesses, so it may be slow depending on caching) is
int value = t1[s[n-1]];
if (n > 1) value += t10[s[n-2]]; else return value;
if (n > 2) value += t100[s[n-3]]; else return value;
if (n > 3) value += t1000[s[n-4]]; else return value;
... continuing for how many digits you need to handle ...
If t1, t10 and so on are statically allocated and constant, the compiler shouldn't fear any aliasing, and the machine code generated should be quite decent.
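For example, a sketch of how those tables might be set up (my own illustration, limited here to four digits; n is the known length of the digits-only string s):

// Per-position lookup tables: t1[c] is the value of character c in the ones
// place, t10[c] in the tens place, and so on; non-digit entries stay zero.
static int t1[256], t10[256], t100[256], t1000[256];

void init_tables()
{
    for (int d = 0; d <= 9; ++d) {
        t1['0' + d]    = d;
        t10['0' + d]   = d * 10;
        t100['0' + d]  = d * 100;
        t1000['0' + d] = d * 1000;
    }
}

int lookup_atoi(const char* s, int n)   // n = number of characters in s
{
    int value = t1[(unsigned char)s[n - 1]];
    if (n > 1) value += t10[(unsigned char)s[n - 2]]; else return value;
    if (n > 2) value += t100[(unsigned char)s[n - 3]]; else return value;
    if (n > 3) value += t1000[(unsigned char)s[n - 4]]; else return value;
    return value;
}

In a real version the tables would be made const (generated once, e.g. as constexpr data) so the compiler can indeed assume they never alias the input.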
Here is mine. Atoi is the fastest I could come up with. I compiled with msvc 2010, so with a different compiler it might be possible to combine both templates; in msvc 2010, when I combined the templates, it made the case where you provide a cb argument slower.
Atoi handles nearly all the special atoi cases, and is as fast or faster than this:
int val = 0;
while( *str )
val = val*10 + (*str++ - '0');
Here is the code:
#define EQ1(a,a1) (BYTE(a) == BYTE(a1))
#define EQ2(a,a1,a2) (EQ1(a,a1) || EQ1(a,a2))
#define EQ3(a,a1,a2,a3) (EQ2(a,a1,a2) || EQ1(a,a3))
// Atoi is 4x faster than atoi. There is also an overload that takes a cb argument.
template <typename T>
T Atoi(LPCSTR sz) {
T n = 0;
bool fNeg = false; // for unsigned T, this is removed by optimizer
const BYTE* p = (const BYTE*)sz;
BYTE ch;
// test for most exceptions in the leading chars. Most of the time
// this test is skipped. Note we skip over leading zeros to avoid the
// useless math in the second loop. We expect leading 0 to be the most
// likely case, so we test it first, however the cpu might reorder that.
for ( ; (ch=*p-'1') >= 9 ; ++p) { // unsigned trick for range compare
// ignore leading 0's, spaces, and '+'
if (EQ3(ch, '0'-'1', ' '-'1', '+'-'1'))
continue;
// for unsigned T this is removed by optimizer
if (!((T)-1 > 0) && ch==BYTE('-'-'1')) {
fNeg = !fNeg;
continue;
}
// atoi ignores these. Remove this code for a small perf increase.
if (BYTE(*p-9) > 4) // \t, \n, 11, 12, \r. unsigned trick for range compare
break;
}
// deal with rest of digits, stop loop on non digit.
for ( ; (ch=*p-'0') <= 9 ; ++p) // unsigned trick for range compare
n = n*10 + ch;
// for unsigned T, (fNeg) test is removed by optimizer
return (fNeg) ? -n : n;
}
// you could go with a single template that took a cb argument, but I could not
// get the optimizer to create good code when both the cb and !cb case were combined.
// above code contains the comments.
template <typename T>
T Atoi(LPCSTR sz, BYTE cb) {
T n = 0;
bool fNeg = false;
const BYTE* p = (const BYTE*)sz;
const BYTE* p1 = p + cb;
BYTE ch;
for ( ; p<p1 && (ch=*p-'1') >= 9 ; ++p) {
if (EQ3(ch,BYTE('0'-'1'),BYTE(' '-'1'),BYTE('+'-'1')))
continue;
if (!((T)-1 > 0) && ch == BYTE('-'-'1')) {
fNeg = !fNeg;
continue;
}
if (BYTE(*p-9) > 4) // \t, \n, 11, 12, \r
break;
}
for ( ; p<p1 && (ch=*p-'0') <= 9 ; ++p)
n = n*10 + ch;
return (fNeg) ? -n : n;
}
What's the best way to write
int NumDigits(int n);
in C++ which would return the number of digits in the decimal representation of the input. For example 11 -> 2, 999 -> 3, -1 -> 2, etc.
Straightforward and simple, and independent of sizeof(int):
int NumDigits(int n) {
int digits = 0;
if (n <= 0) {
n = -n;
++digits;
}
while (n) {
n /= 10;
++digits;
}
return digits;
}
//Works for positive integers only
int DecimalLength(int n) {
return floor(log10f(n) + 1);
}
The fastest way is probably a binary search...
//assuming n is positive
if (n < 10000)
if (n < 100)
if (n < 10)
return 1;
else
return 2;
else
if (n < 1000)
return 3;
else
return 4;
else
//etc up to 1000000000
In this case it's about 3 comparisons regardless of input, which I suspect is much faster than a division loop or using doubles.
One way (which may not be the most efficient) is to convert it to a string and take the length of that string. Like:
int getDigits(int n)
{
std::ostringstream stream;
stream<<n;
return stream.str().length();
}
To extend Arteluis' answer, you could use templates to generate the comparisons:
template<int BASE, int EXP>
struct Power
{
enum {RESULT = BASE * Power<BASE, EXP - 1>::RESULT};
};
template<int BASE>
struct Power<BASE, 0>
{
enum {RESULT = 1};
};
template<int LOW = 0, int HIGH = 8>
struct NumDigits
{
enum {MID = (LOW + HIGH + 1) / 2};
inline static int calculate (int i)
{
if (i < Power<10, MID>::RESULT)
return NumDigits<LOW, MID - 1>::calculate (i);
else
return NumDigits<MID, HIGH>::calculate (i);
}
};
template<int LOW>
struct NumDigits<LOW, LOW>
{
inline static int calculate (int i)
{
return LOW + 1;
}
};
int main (int argc, char* argv[])
{
// Example call.
std::cout << NumDigits<>::calculate (1234567) << std::endl;
return 0;
}
numdigits = snprintf(NULL, 0, "%d", num);
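Wrapped up as a function for context (std::snprintf with a null buffer and size 0 returns the length that would have been printed; note the '-' sign counts as one character for negative inputs):

#include <cstdio>

int NumDigits(int num)
{
    // Returns the number of characters %d would print, including any '-' sign.
    return std::snprintf(nullptr, 0, "%d", num);
}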
int NumDigits(int n)
{
int digits = 0;
if (n < 0) {
++digits;
do {
++digits;
n /= 10;
} while (n < 0);
}
else {
do {
++digits;
n /= 10;
} while (n > 0);
}
return digits;
}
Edit: Corrected edge case behavior for -2^31 (etc.)
Some very over-complicated solutions have been proposed, including the accepted one.
Consider:
#include <cmath>
#include <cstdlib>
int NumDigits( int num )
{
int digits = (int)log10( (double)abs(num) ) + 1 ;
return num >= 0 ? digits : digits + 1 ;
}
Note that it works only for INT_MIN + 1 ... INT_MAX, because abs(INT_MIN) == INT_MAX + 1 == INT_MIN (due to wrap-around), which in turn is invalid input to log10(). It is possible to add code for that one case; see the sketch below.
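For completeness, one way to add that special case (the name NumDigitsChecked is just for illustration); it also guards against 0, which the log10() approach does not handle either:

#include <cmath>
#include <cstdlib>
#include <climits>

int NumDigitsChecked( int num )
{
    if (num == INT_MIN)   // abs(INT_MIN) would overflow
        return 11 ;       // "-2147483648": 10 digits plus the sign (32-bit int)
    if (num == 0)
        return 1 ;        // log10(0) is undefined
    int digits = (int)log10( (double)abs(num) ) + 1 ;
    return num >= 0 ? digits : digits + 1 ;
}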
Here's a simpler version of Alink's answer.
int NumDigits(int32_t n)
{
if (n < 0) {
if (n == std::numeric_limits<int32_t>::min())
return 11;
return NumDigits(-n) + 1;
}
static int32_t MaxTable[9] = { 10,100,1000,10000,100000,1000000,10000000,100000000,1000000000 };
return 1 + (std::upper_bound(MaxTable, MaxTable+9, n) - MaxTable);
}
Another implementation using an STL binary search on a lookup table, which seems not bad (not too long and still faster than the division methods). It also seems easy and efficient to adapt for a type much bigger than int: it will be faster than O(digits) methods and needs only multiplication (no division or log function for this hypothetical type). There is a requirement of a MAXVALUE, though, unless you fill the table dynamically.
[edit: move the struct into the function]
int NumDigits9(int n) {
struct power10{
vector<int> data;
power10() {
for (int i = 10; ; i *= 10) { data.push_back(i); if (i > INT_MAX / 10) break; } // entries 10 ... 1000000000
}
};
static const power10 p10;
return 1 + upper_bound(p10.data.begin(), p10.data.end(), n) - p10.data.begin();
}
Since the goal is to be fast, this is an improvement on Andrei Alexandrescu's improvement. His version was already faster than the naive way (dividing by 10 at every digit). The version below is faster, at least on x86-64 and ARM, for most sizes.
Benchmarks for this version vs Alexandrescu's version are in my PR on facebook/folly.
inline uint32_t digits10(uint64_t v)
{
std::uint32_t result = 0;
for (;;)
{
result += 1
+ (std::uint32_t)(v>=10)
+ (std::uint32_t)(v>=100)
+ (std::uint32_t)(v>=1000)
+ (std::uint32_t)(v>=10000)
+ (std::uint32_t)(v>=100000);
if (v < 1000000) return result;
v /= 1000000U;
}
}
My version of the loop (works with 0, negative and positive values):
int numDigits(int n)
{
int digits = n<0; //count "minus"
do { digits++; } while (n/=10);
return digits;
}
If you're using a version of C++ which includes the C99 maths functions (C++0x and some earlier compilers)
static const double log10_2 = 3.32192809;
int count_digits ( int n )
{
if ( n == 0 ) return 1;
if ( n < 0 ) return ilogb ( -(double)n ) / log10_2 + 2;
return ilogb ( n ) / log10_2 + 1;
}
Whether ilogb is faster than a loop will depend on the architecture, but it's useful enough for this kind of problem to have been added to the standard.
An optimization of the previous division methods. (BTW, they all test if n != 0, but most of the time n >= 10 seems enough and spares one division, which was the more expensive part.)
I simply use multiplication and it seems to make it much faster (almost 4x here), at least on the 1..100000000 range. I am a bit surprised by such a difference, so maybe this triggered some special compiler optimization, or I missed something.
The initial change was simple, but unfortunately I had to take care of a new overflow problem. It makes the code less nice, but on my test case the 10^6 trick more than compensates for the cost of the added check. Obviously it depends on the input distribution, and you can also tweak this 10^6 value.
PS: Of course, this kind of optimization is just for fun :)
int NumDigits(int n) {
int digits = 1;
// reduce n to avoid overflow at the s*=10 step.
// n/=10 was enough but we reuse this to optimize big numbers
if (n >= 1000000) {
n /= 1000000;
digits += 6; // because 1000000 = 10^6
}
int s = 10;
while (s <= n) {
s *= 10;
++digits;
}
return digits;
}