I have already computed the truncated mean of a vector via the function truncated_mean(std::vector& v, double trimming fraction). This function takes as inputs the vector v and the fraction that we want to remove to calculate the mean (e.g. 10% so we remove the highest and lowest 10% values and then we compute the mean), I created it using the Standard Library.
For example, v = [0,1,2....,9], then truncated_mean(v, 0.10) = 4.5.
Now, I want to reuse the same function but instead of having v as input, I want to have 2 forward iterators, v.begin() and v.end(). I am provided with the template of typename forward that I should use to check if its value_type (accessed via std::iterator_traits) meets a certain criteria. My understanding of the problem is that first I need to check if the inputs belong to a vector and from there I should access the vector in itself to compute the truncated mean.
How can I adapt my function to take as input the beginning and end of the vector rather than the vector itself?
Assuming sequence passed in is sorted you could simply use std::distance to figure out the length and skip the appropriate number of elements at the start and the end:
Edit: Extended code to use std::accumulate for random access iterators; Use concepts instead of distinguishing iterator types vis additional parameter, if you're allowed to use C++20 features.
template<typename RandomAccessIterator>
double truncated_mean_impl(RandomAccessIterator begin, RandomAccessIterator end, double trimming_fraction, std::random_access_iterator_tag)
{
if (trimming_fraction < 0)
{
throw std::range_error("trimming_fraction must not be negative");
}
if(trimming_fraction >= 0.5)
{
return std::numeric_limits<double>::quiet_NaN(); // no elements left after trimming
}
auto const count = std::distance(begin, end);
auto const skippedElementCountFront = static_cast<decltype(count)>(count * trimming_fraction);
auto const summandCount = count - 2 * skippedElementCountFront;
return std::accumulate<RandomAccessIterator, double>(begin + skippedElementCountFront, end - skippedElementCountFront, 0) / summandCount;
}
template<typename ForwardIterator>
double truncated_mean_impl(ForwardIterator begin, ForwardIterator end, double trimming_fraction, std::forward_iterator_tag)
{
if (trimming_fraction < 0)
{
throw std::range_error("trimming_fraction must not be negative");
}
if(trimming_fraction >= 0.5)
{
return std::numeric_limits<double>::quiet_NaN(); // no elements left after trimming
}
auto const count = std::distance(begin, end);
auto const skippedElementCountFront = static_cast<decltype(count)>(count * trimming_fraction);
// skip elements in the front
for (auto i = skippedElementCountFront; i != 0; --i, ++begin) {}
auto const summandCount = count - 2 * skippedElementCountFront;
double sum = 0;
for (auto i = summandCount; i != 0; --i, ++begin)
{
sum += *begin;
}
return sum / summandCount;
}
template<typename ForwardIterator>
double truncated_mean(ForwardIterator begin, ForwardIterator end, double trimming_fraction)
{
return truncated_mean_impl<ForwardIterator>(begin, end, trimming_fraction, typename std::iterator_traits<ForwardIterator>::iterator_category());
}
int main()
{
std::vector<int> const values { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::cout << truncated_mean(values.cbegin(), values.cend(), 0.1) << '\n';
}
If the input sequence is not sorted and you cannot or don't want to sort the input copying the elements to a new vector and applying your original algorithm to this vector would probably be best.
I'm a programming student, and for a project I'm working on, on of the things I have to do is compute the median value of a vector of int values and must be done by passing it through functions. Also the vector is initially generated randomly using the C++ random generator mt19937 which i have already written down in my code.I'm to do this using the sort function and vector member functions such as .begin(), .end(), and .size().
I'm supposed to make sure I find the median value of the vector and then output it
And I'm Stuck, below I have included my attempt. So where am I going wrong? I would appreciate if you would be willing to give me some pointers or resources to get going in the right direction.
Code:
#include<iostream>
#include<vector>
#include<cstdlib>
#include<ctime>
#include<random>
#include<vector>
#include<cstdlib>
#include<ctime>
#include<random>
using namespace std;
double find_median(vector<double>);
double find_median(vector<double> len)
{
{
int i;
double temp;
int n=len.size();
int mid;
double median;
bool swap;
do
{
swap = false;
for (i = 0; i< len.size()-1; i++)
{
if (len[i] > len[i + 1])
{
temp = len[i];
len[i] = len[i + 1];
len[i + 1] = temp;
swap = true;
}
}
}
while (swap);
for (i=0; i<len.size(); i++)
{
if (len[i]>len[i+1])
{
temp=len[i];
len[i]=len[i+1];
len[i+1]=temp;
}
mid=len.size()/2;
if (mid%2==0)
{
median= len[i]+len[i+1];
}
else
{
median= (len[i]+0.5);
}
}
return median;
}
}
int main()
{
int n,i;
cout<<"Input the vector size: "<<endl;
cin>>n;
vector <double> foo(n);
mt19937 rand_generator;
rand_generator.seed(time(0));
uniform_real_distribution<double> rand_distribution(0,0.8);
cout<<"original vector: "<<" ";
for (i=0; i<n; i++)
{
double rand_num=rand_distribution(rand_generator);
foo[i]=rand_num;
cout<<foo[i]<<" ";
}
double median;
median=find_median(foo);
cout<<endl;
cout<<"The median of the vector is: "<<" ";
cout<<median<<endl;
}
The median is given by
const auto median_it = len.begin() + len.size() / 2;
std::nth_element(len.begin(), median_it , len.end());
auto median = *median_it;
For even numbers (size of vector) you need to be a bit more precise. E.g., you can use
assert(!len.empty());
if (len.size() % 2 == 0) {
const auto median_it1 = len.begin() + len.size() / 2 - 1;
const auto median_it2 = len.begin() + len.size() / 2;
std::nth_element(len.begin(), median_it1 , len.end());
const auto e1 = *median_it1;
std::nth_element(len.begin(), median_it2 , len.end());
const auto e2 = *median_it2;
return (e1 + e2) / 2;
} else {
const auto median_it = len.begin() + len.size() / 2;
std::nth_element(len.begin(), median_it , len.end());
return *median_it;
}
There are of course many different ways how we can get element e1. We could also use max or whatever we want. But this line is important because nth_element only places the nth element correctly, the remaining elements are ordered before or after this element, depending on whether they are larger or smaller. This range is unsorted.
This code is guaranteed to have linear complexity on average, i.e., O(N), therefore it is asymptotically better than sort, which is O(N log N).
Regarding your code:
for (i=0; i<len.size(); i++){
if (len[i]>len[i+1])
This will not work, as you access len[len.size()] in the last iteration which does not exist.
std::sort(len.begin(), len.end());
double median = len[len.size() / 2];
will do it. You might need to take the average of the middle two elements if size() is even, depending on your requirements:
0.5 * (len[len.size() / 2 - 1] + len[len.size() / 2]);
Instead of trying to do everything at once, you should start with simple test cases and work upwards:
#include<vector>
double find_median(std::vector<double> len);
// Return the number of failures - shell interprets 0 as 'success',
// which suits us perfectly.
int main()
{
return find_median({0, 1, 1, 2}) != 1;
}
This already fails with your code (even after fixing i to be an unsigned type), so you could start debugging (even 'dry' debugging, where you trace the code through on paper; that's probably enough here).
I do note that with a smaller test case, such as {0, 1, 2}, I get a crash rather than merely failing the test, so there's something that really needs to be fixed.
Let's replace the implementation with one based on overseas's answer:
#include <algorithm>
#include <limits>
#include <vector>
double find_median(std::vector<double> len)
{
if (len.size() < 1)
return std::numeric_limits<double>::signaling_NaN();
const auto alpha = len.begin();
const auto omega = len.end();
// Find the two middle positions (they will be the same if size is odd)
const auto i1 = alpha + (len.size()-1) / 2;
const auto i2 = alpha + len.size() / 2;
// Partial sort to place the correct elements at those indexes (it's okay to modify the vector,
// as we've been given a copy; otherwise, we could use std::partial_sort_copy to populate a
// temporary vector).
std::nth_element(alpha, i1, omega);
std::nth_element(i1, i2, omega);
return 0.5 * (*i1 + *i2);
}
Now, our test passes. We can write a helper method to allow us to create more tests:
#include <iostream>
bool test_median(const std::vector<double>& v, double expected)
{
auto actual = find_median(v);
if (abs(expected - actual) > 0.01) {
std::cerr << actual << " - expected " << expected << std::endl;
return true;
} else {
std::cout << actual << std::endl;
return false;
}
}
int main()
{
return test_median({0, 1, 1, 2}, 1)
+ test_median({5}, 5)
+ test_median({5, 5, 5, 0, 0, 0, 1, 2}, 1.5);
}
Once you have the simple test cases working, you can manage more complex ones. Only then is it time to create a large array of random values to see how well it scales:
#include <ctime>
#include <functional>
#include <random>
int main(int argc, char **argv)
{
std::vector<double> foo;
const int n = argc > 1 ? std::stoi(argv[1]) : 10;
foo.reserve(n);
std::mt19937 rand_generator(std::time(0));
std::uniform_real_distribution<double> rand_distribution(0,0.8);
std::generate_n(std::back_inserter(foo), n, std::bind(rand_distribution, rand_generator));
std::cout << "Vector:";
for (auto v: foo)
std::cout << ' ' << v;
std::cout << "\nMedian = " << find_median(foo) << std::endl;
}
(I've taken the number of elements as a command-line argument; that's more convenient in my build than reading it from cin). Notice that instead of allocating n doubles in the vector, we simply reserve capacity for them, but don't create any until needed.
For fun and kicks, we can now make find_median() generic. I'll leave that as an exercise; I suggest you start with:
typename<class Iterator>
auto find_median(Iterator alpha, Iterator omega)
{
using value_type = typename Iterator::value_type;
if (alpha == omega)
return std::numeric_limits<value_type>::signaling_NaN();
}
I have a std::vector<PLY> that holds a number of structs:
struct PLY {
int x;
int y;
int greyscale;
}
Some of the PLY's could be duplicates in terms of their position x and y but not necessarily in terms of their greyscale value. What is the best way to find those (position-) duplicates and replace them with a single PLY instace which has a greyscale value that represents the average greyscale of all duplicates?
E.g: PLY a{1,1,188} is a duplicate of PLY b{1,1,255}. Same (x,y) position possibly different greyscale.
Based on your description of Ply you need these operators:
auto operator==(const Ply& a, const Ply& b)
{
return a.x == b.x && a.y == b.y;
}
auto operator<(const Ply& a, const Ply& b)
{
// whenever you can be lazy!
return std::make_pair(a.x, a.y) < std::make_pair(b.x, b.y);
}
Very important: if the definition "Two Ply are identical if their x and y are identical" is not general valid, then defining comparator operators that ignore greyscale is a bad ideea. In that case you should define separate function objects or non-operator functions and pass them around to function.
There is a nice rule of thumb that a function should not have more than a loop. So instead of a nested 2 for loops, we define this helper function which computes the average of consecutive duplicates and also returns the end of the consecutive duplicates range:
// prereq: [begin, end) has at least one element
// i.e. begin != end
template <class It>
auto compute_average_duplicates(It begin, It end) -> std::pair<int, It>
// (sadly not C++17) concepts:
//requires requires(It i) { {*i} -> Ply; }
{
auto it = begin + 1;
int sum = begin->greyscale;
for (; it != end && *begin == *it; ++it) {
sum += it->greyscale;
}
// you might need rounding instead of truncation:
return std::make_pair(sum / std::distance(begin, it), it);
}
With this we can have our algorithm:
auto foo()
{
std::vector<Ply> v = {{1, 5, 10}, {2, 4, 6}, {1, 5, 2}};
std::sort(std::begin(v), std::end(v));
for (auto i = std::begin(v); i != std::end(v); ++i) {
decltype(i) j;
int average;
std::tie(average, j) = compute_average_duplicates(i, std::end(v));
// C++17 (coming soon in a compiler near you):
// auto [average, j] = compute_average_duplicates(i, std::end(v));
if (i + 1 == j)
continue;
i->greyscale = average;
v.erase(i + 1, j);
// std::vector::erase Invalidates iterators and references
// at or after the point of the erase
// which means i remains valid, and `++i` (from the for) is correct
}
}
You can apply lexicographical sorting first. During sorting you should take care of overflowing greyscale. With current approach you will have some roundoff error, but it will be small as i first sum and only then average.
In the second part you need to remove duplicates from the array. I used additional array of indices to copy every element not more than once. If you have some forbidden value for x, y or greyscale you can use it and thus get along without additional array.
struct PLY {
int x;
int y;
int greyscale;
};
int main()
{
struct comp
{
bool operator()(const PLY &a, const PLY &b) { return a.x != b.x ? a.x < b.x : a.y < b.y; }
};
vector<PLY> v{ {1,1,1}, {1,2,2}, {1,1,2}, {1,3,5}, {1,2,7} };
sort(begin(v), end(v), comp());
vector<bool> ind(v.size(), true);
int s = 0;
for (int i = 1; i < v.size(); ++i)
{
if (v[i].x == v[i - 1].x &&v[i].y == v[i - 1].y)
{
v[s].greyscale += v[i].greyscale;
ind[i] = false;
}
else
{
int d = i - s;
if (d != 1)
{
v[s].greyscale /= d;
}
s = i;
}
}
s = 0;
for (int i = 0; i < v.size(); ++i)
{
if (ind[i])
{
if (s != i)
{
v[s] = v[i];
}
++s;
}
}
v.resize(s);
}
So you need to check, is PLY a1 { 1,1,1 }; duplicates PLY a2 {2,2,1};
So simple method is to override operator == to check a1.x == a2.x and a1.y == a2.y. After you can write own function removeDuplicates(std::vector<PLU>& mPLY); which will use iterators of this vector, compare and remove. But better to use std::list if you want to remove from middle of array too frequently.
say I have a std::vector with N elements. I would like to copy every n-th element of it to a new vector, or average up to that element then copy it (downsample the original vector). So I want to do this
std::vector<double> vec(N);
long n = 4;
std::vector<double> ds(N/n);
for(long i = 0; i < ds.size(); i+=n)
{
ds[i] = vec[i*n];
}
or
for(long i = 0; i < ds.size(); i+=n)
{
double tmp = 0;
for(long j = 0; j < n; j++)
{
tmp += vec[i*n+j];
}
ds[i] = tmp/static_cast<double>(n);
}
Is there a way to do this using the standard algorithms of C++? Like using std::copy with binary functions? I have billions of elements that I want to treat this way, and I want this to be as fast as possible.
PS: I would prefer not to use external libraries such as boost.
For readability, the loop would be a good idea, as pointed out by Vlad in the comments. But if you really want to do someting like this, you could try:
int cnt=0,n=3;
vector<int> u(v.size()/3);
copy_if (v.begin(), v.end(), u.begin(),
[&cnt,&n] (int i)->bool {return ++cnt %n ==0; } );
If you want to average, it's getting worse as you'd have to similar tricks combining transform() with copy_if().
Edit:
If you're looking for performance, you'd better stick to the loop, as stressed in the comments by davidhigh: it will avoid the overhead of the call to the lambda function for each element.
If you're looking for an algorithm because you're doing this very often, you'd better write your own generic one.
You could write your own generic algorithms inspired from the design principles used in <algorithm>.
For the copy of every n elements:
template<class in_it, class out_it>
out_it copy_every_n( in_it b, in_it e, out_it r, size_t n) {
for (size_t i=distance(b,e)/n; i--; advance (b,n))
*r++ = *b;
return r;
}
Example of use:
vector<int> v {1,2,3,4,5,6,7,8,9,10};
vector<int> z(v.size()/3);
copy_every_n(v.begin(), v.end(), z.begin(), 3);
For averaging the elements n by n, you can use:
template<class in_it, class out_it>
out_it average_every_n( in_it b, in_it e, out_it r, size_t n) {
typename out_it::value_type tmp=0;
for (size_t cnt=0; b!=e; b++) {
tmp+=*b;
if (++cnt==n) {
cnt=0;
*r++=tmp/n;
tmp=0;
}
}
return r;
}
Example of use:
vector<int> w(v.size()/3);
average_every_n(v.begin(), v.end(), w.begin(), 3);
The advantage over your inital loops, is that this will work not only on vectors, but on any container providing the begin() and end() iterator. And it avoids overheads that I pointed out in my other answer.
If to use only standard library features and algorithms and if it is not allowed to use loops then the code can look the following way. Take into account that the code is based on the C++ 2014. If you need a code that will be compiled by a compiler that supports only C++ 2011 then you have to make some minor changes.
#include <iostream>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
int main()
{
const size_t N = 4;
std::vector<double> src = { 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9 };
size_t n = src.size() / N;
std::vector<double> dst( n );
std::copy_if( src.begin(), std::next( src.begin(), n * N ), dst.begin(),
[i = 0] ( auto ) mutable { return ( i = ( i + 1 ) % N ) == 0; } );
for ( double x : dst ) std::cout << x << ' ';
std::cout << std::endl;
dst.assign( n, 0.0 );
std::accumulate( src.begin(), std::next( src.begin(), n * N ), dst.begin(),
[i = 0] ( auto acc, auto x ) mutable
{
*acc += x;
if ( ( i = ( i + 1 ) % N ) == 0 ) *acc++ /= N;
return acc;
} );
for ( double x : dst ) std::cout << x << ' ';
std::cout << std::endl;
}
The program output is
4.4 8.8
2.75 7.15
This compound expression in the if condition
if ( ( i = ( i + 1 ) % N ) == 0 ) *acc++ /= N;
you can replace with more simpler one
if ( ++i % N == 0 ) *acc++ /= N;
You may have explicitly stated that you prefer not to use Boost, but any non-Boost solution would really be implementing exactly this sort of thing anyway, so I'll show you how I would do it in Boost. Ultimately, I think you're better off writing a simple loop.
Downsampling uses strided
boost::copy(
input | strided(2),
std::back_inserter(output));
Downsampling average additionally uses transformed, though this solution is non-generic and specifically relies upon vector being contiguous:
boost::copy(
input | strided(2) | transformed([](auto& x){
return std::accumulate(&x, &x + 2, 0) / 2.;
}),
std::back_inserter(output));
Of course that has issues if the input isn't an exact multiple of the stride length, so it'd probably be better to do something like:
auto downsample_avg = [](auto& input, int n){
return input | strided(n) | transformed([&,n](auto& x){
auto begin = &x;
auto end = begin + std::min<size_t>(n, &input.back() - begin + 1);
return std::accumulate(begin, end, 0.0) / (end - begin);
});
};
boost::copy(
downsample_avg(input, 2),
std::back_inserter(output));
how about this implemention?
#include <iterator>
template<typename InputIt, typename OutputIt>
OutputIt DownSample(InputIt first, InputIt last, OutputIt d_first,
typename std::iterator_traits<InputIt>::difference_type n) {
while (first < last) {
*(d_first++) = *first;
std::advance(first, n);
}
return d_first;
}
I want to sort a large array of integers (say 1 millon elements) lexicographically.
Example:
input [] = { 100, 21 , 22 , 99 , 1 , 927 }
sorted[] = { 1 , 100, 21 , 22 , 927, 99 }
I have done it using the simplest possible method:
convert all numbers to strings (very costly because it will take huge memory)
use std:sort with strcmp as comparison function
convert back the strings to integers
Is there a better method than this?
Use std::sort() with a suitable comparison function. This cuts down on the memory requirements.
The comparison function can use n % 10, n / 10 % 10, n / 100 % 10 etc. to access the individual digits (for positive integers; negative integers work a bit differently).
To provide any custom sort ordering, you can provide a comparator to std::sort. In this case, it's going to be somewhat complex, using logarithms to inspect individual digits of your number in base 10.
Here's an example — comments inline describe what's going on.
#include <iostream>
#include <algorithm>
#include <cmath>
#include <cassert>
int main() {
int input[] { 100, 21, 22, 99, 1, 927, -50, -24, -160 };
/**
* Sorts the array lexicographically.
*
* The trick is that we have to compare digits left-to-right
* (considering typical Latin decimal notation) and that each of
* two numbers to compare may have a different number of digits.
*
* This is very efficient in storage space, but inefficient in
* execution time; an approach that pre-visits each element and
* stores a translated representation will at least double your
* storage requirements (possibly a problem with large inputs)
* but require only a single translation of each element.
*/
std::sort(
std::begin(input),
std::end(input),
[](int lhs, int rhs) -> bool {
// Returns true if lhs < rhs
// Returns false otherwise
const auto BASE = 10;
const bool LHS_FIRST = true;
const bool RHS_FIRST = false;
const bool EQUAL = false;
// There's no point in doing anything at all
// if both inputs are the same; strict-weak
// ordering requires that we return `false`
// in this case.
if (lhs == rhs) {
return EQUAL;
}
// Compensate for sign
if (lhs < 0 && rhs < 0) {
// When both are negative, sign on its own yields
// no clear ordering between the two arguments.
//
// Remove the sign and continue as for positive
// numbers.
lhs *= -1;
rhs *= -1;
}
else if (lhs < 0) {
// When the LHS is negative but the RHS is not,
// consider the LHS "first" always as we wish to
// prioritise the leading '-'.
return LHS_FIRST;
}
else if (rhs < 0) {
// When the RHS is negative but the LHS is not,
// consider the RHS "first" always as we wish to
// prioritise the leading '-'.
return RHS_FIRST;
}
// Counting the number of digits in both the LHS and RHS
// arguments is *almost* trivial.
const auto lhs_digits = (
lhs == 0
? 1
: std::ceil(std::log(lhs+1)/std::log(BASE))
);
const auto rhs_digits = (
rhs == 0
? 1
: std::ceil(std::log(rhs+1)/std::log(BASE))
);
// Now we loop through the positions, left-to-right,
// calculating the digit at these positions for each
// input, and comparing them numerically. The
// lexicographic nature of the sorting comes from the
// fact that we are doing this per-digit comparison
// rather than considering the input value as a whole.
const auto max_pos = std::max(lhs_digits, rhs_digits);
for (auto pos = 0; pos < max_pos; pos++) {
if (lhs_digits - pos == 0) {
// Ran out of digits on the LHS;
// prioritise the shorter input
return LHS_FIRST;
}
else if (rhs_digits - pos == 0) {
// Ran out of digits on the RHS;
// prioritise the shorter input
return RHS_FIRST;
}
else {
const auto lhs_x = (lhs / static_cast<decltype(BASE)>(std::pow(BASE, lhs_digits - 1 - pos))) % BASE;
const auto rhs_x = (rhs / static_cast<decltype(BASE)>(std::pow(BASE, rhs_digits - 1 - pos))) % BASE;
if (lhs_x < rhs_x)
return LHS_FIRST;
else if (rhs_x < lhs_x)
return RHS_FIRST;
}
}
// If we reached the end and everything still
// matches up, then something probably went wrong
// as I'd have expected to catch this in the tests
// for equality.
assert("Unknown case encountered");
}
);
std::cout << '{';
for (auto x : input)
std::cout << x << ", ";
std::cout << '}';
// Output: {-160, -24, -50, 1, 100, 21, 22, 927, 99, }
}
Demo
There are quicker ways to calculate the number of digits in a number, but the above will get you started.
Here's another algorithm which does some of the computation before sorting. It seems to be quite fast, despite the additional copying (see comparisons).
Note:
it only supports positive integers
in only supports integers <= std::numeric_limits<int>::max()/10
N.B. you can optimize count_digits and my_pow10; for example, see Three Optimization Tips for C++ from Andrei Alexandrescu and Any way faster than pow() to compute an integer power of 10 in C++?
Helpers:
#include <random>
#include <vector>
#include <utility>
#include <cmath>
#include <iostream>
#include <algorithm>
#include <limits>
#include <iterator>
// non-optimized version
int count_digits(int p) // returns `0` for `p == 0`
{
int res = 0;
for(; p != 0; ++res)
{
p /= 10;
}
return res;
}
// non-optimized version
int my_pow10(unsigned exp)
{
int res = 1;
for(; exp != 0; --exp)
{
res *= 10;
}
return res;
}
Algorithm (note - not in-place):
// helper to provide integers with the same number of digits
template<class T, class U>
std::pair<T, T> lexicographic_pair_helper(T const p, U const maxDigits)
{
auto const digits = count_digits(p);
// append zeros so that `l` has `maxDigits` digits
auto const l = static_cast<T>( p * my_pow10(maxDigits-digits) );
return {l, p};
}
template<class RaIt>
using pair_vec
= std::vector<std::pair<typename std::iterator_traits<RaIt>::value_type,
typename std::iterator_traits<RaIt>::value_type>>;
template<class RaIt>
pair_vec<RaIt> lexicographic_sort(RaIt p_beg, RaIt p_end)
{
if(p_beg == p_end) return {};
auto max = *std::max_element(p_beg, p_end);
auto maxDigits = count_digits(max);
pair_vec<RaIt> result;
result.reserve( std::distance(p_beg, p_end) );
for(auto i = p_beg; i != p_end; ++i)
result.push_back( lexicographic_pair_helper(*i, maxDigits) );
using value_type = typename pair_vec<RaIt>::value_type;
std::sort(begin(result), end(result),
[](value_type const& l, value_type const& r)
{
if(l.first < r.first) return true;
if(l.first > r.first) return false;
return l.second < r.second; }
);
return result;
}
Usage example:
int main()
{
std::vector<int> input = { 100, 21 , 22 , 99 , 1 , 927 };
// generate some numbers
/*{
constexpr int number_of_elements = 1E6;
std::random_device rd;
std::mt19937 gen( rd() );
std::uniform_int_distribution<>
dist(0, std::numeric_limits<int>::max()/10);
for(int i = 0; i < number_of_elements; ++i)
input.push_back( dist(gen) );
}*/
std::cout << "unsorted: ";
for(auto const& e : input) std::cout << e << ", ";
std::cout << "\n\n";
auto sorted = lexicographic_sort(begin(input), end(input));
std::cout << "sorted: ";
for(auto const& e : sorted) std::cout << e.second << ", ";
std::cout << "\n\n";
}
Here's a community wiki to compare the solutions. I took nim's code and made it easily extensible. Feel free to add your solutions and outputs.
Sample runs an old slow computer (3 GB RAM, Core2Duo U9400) with g++4.9 # -O3 -march=native:
number of elements: 1e+03
size of integer type: 4
reference solution: Lightness Races in Orbit
solution "dyp":
duration: 0 ms and 301 microseconds
comparison to reference solution: exact match
solution "Nim":
duration: 2 ms and 160 microseconds
comparison to reference solution: exact match
solution "nyarlathotep":
duration: 8 ms and 126 microseconds
comparison to reference solution: exact match
solution "notbad":
duration: 1 ms and 102 microseconds
comparison to reference solution: exact match
solution "Eric Postpischil":
duration: 2 ms and 550 microseconds
comparison to reference solution: exact match
solution "Lightness Races in Orbit":
duration: 17 ms and 469 microseconds
comparison to reference solution: exact match
solution "pts":
duration: 1 ms and 92 microseconds
comparison to reference solution: exact match
==========================================================
number of elements: 1e+04
size of integer type: 4
reference solution: Lightness Races in Orbit
solution "nyarlathotep":
duration: 109 ms and 712 microseconds
comparison to reference solution: exact match
solution "Lightness Races in Orbit":
duration: 272 ms and 819 microseconds
comparison to reference solution: exact match
solution "dyp":
duration: 1 ms and 748 microseconds
comparison to reference solution: exact match
solution "notbad":
duration: 16 ms and 115 microseconds
comparison to reference solution: exact match
solution "pts":
duration: 15 ms and 10 microseconds
comparison to reference solution: exact match
solution "Eric Postpischil":
duration: 33 ms and 301 microseconds
comparison to reference solution: exact match
solution "Nim":
duration: 17 ms and 83 microseconds
comparison to reference solution: exact match
==========================================================
number of elements: 1e+05
size of integer type: 4
reference solution: Lightness Races in Orbit
solution "Nim":
duration: 217 ms and 4 microseconds
comparison to reference solution: exact match
solution "pts":
duration: 199 ms and 505 microseconds
comparison to reference solution: exact match
solution "dyp":
duration: 20 ms and 330 microseconds
comparison to reference solution: exact match
solution "Eric Postpischil":
duration: 415 ms and 477 microseconds
comparison to reference solution: exact match
solution "Lightness Races in Orbit":
duration: 3955 ms and 58 microseconds
comparison to reference solution: exact match
solution "notbad":
duration: 215 ms and 259 microseconds
comparison to reference solution: exact match
solution "nyarlathotep":
duration: 1341 ms and 46 microseconds
comparison to reference solution: mismatch found
==========================================================
number of elements: 1e+06
size of integer type: 4
reference solution: Lightness Races in Orbit
solution "Lightness Races in Orbit":
duration: 52861 ms and 314 microseconds
comparison to reference solution: exact match
solution "Eric Postpischil":
duration: 4757 ms and 608 microseconds
comparison to reference solution: exact match
solution "nyarlathotep":
duration: 15654 ms and 195 microseconds
comparison to reference solution: mismatch found
solution "dyp":
duration: 233 ms and 779 microseconds
comparison to reference solution: exact match
solution "pts":
duration: 2181 ms and 634 microseconds
comparison to reference solution: exact match
solution "Nim":
duration: 2539 ms and 9 microseconds
comparison to reference solution: exact match
solution "notbad":
duration: 2675 ms and 362 microseconds
comparison to reference solution: exact match
==========================================================
number of elements: 1e+07
size of integer type: 4
reference solution: Lightness Races in Orbit
solution "notbad":
duration: 33425 ms and 423 microseconds
comparison to reference solution: exact match
solution "pts":
duration: 26000 ms and 398 microseconds
comparison to reference solution: exact match
solution "Eric Postpischil":
duration: 56206 ms and 359 microseconds
comparison to reference solution: exact match
solution "Lightness Races in Orbit":
duration: 658540 ms and 342 microseconds
comparison to reference solution: exact match
solution "nyarlathotep":
duration: 187064 ms and 518 microseconds
comparison to reference solution: mismatch found
solution "Nim":
duration: 30519 ms and 227 microseconds
comparison to reference solution: exact match
solution "dyp":
duration: 2624 ms and 644 microseconds
comparison to reference solution: exact match
The algorithms have to be structs with function-call operator templates that support the interface:
template<class RaIt> operator()(RaIt begin, RaIt end);
A copy of the input data is provided as a parameter, the algorithm is expected to provide the result in the same range (e.g. in-place sort).
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>
#include <utility>
#include <cmath>
#include <cassert>
#include <chrono>
#include <cstring>
#include <climits>
#include <functional>
#include <cstdlib>
#include <iomanip>
using duration_t = decltype( std::chrono::high_resolution_clock::now()
- std::chrono::high_resolution_clock::now());
template<class T>
struct result_t
{
std::vector<T> numbers;
duration_t duration;
char const* name;
};
template<class RaIt, class F>
result_t<typename std::iterator_traits<RaIt>::value_type>
apply_algorithm(RaIt p_beg, RaIt p_end, F f, char const* name)
{
using value_type = typename std::iterator_traits<RaIt>::value_type;
std::vector<value_type> inplace(p_beg, p_end);
auto start = std::chrono::high_resolution_clock::now();
f(begin(inplace), end(inplace));
auto end = std::chrono::high_resolution_clock::now();
auto duration = end - start;
return {std::move(inplace), duration, name};
}
// non-optimized version
int count_digits(int p) // returns `0` for `p == 0`
{
int res = 0;
for(; p != 0; ++res)
{
p /= 10;
}
return res;
}
// non-optimized version
int my_pow10(unsigned exp)
{
int res = 1;
for(; exp != 0; --exp)
{
res *= 10;
}
return res;
}
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// paste algorithms here
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
int main(int argc, char** argv)
{
using integer_t = int;
constexpr integer_t dist_min = 0;
constexpr integer_t dist_max = std::numeric_limits<integer_t>::max()/10;
constexpr std::size_t default_number_of_elements = 1E6;
const std::size_t number_of_elements = argc>1 ? std::atoll(argv[1]) :
default_number_of_elements;
std::cout << "number of elements: ";
std::cout << std::scientific << std::setprecision(0);
std::cout << (double)number_of_elements << "\n";
std::cout << /*std::defaultfloat <<*/ std::setprecision(6);
std::cout.unsetf(std::ios_base::floatfield);
std::cout << "size of integer type: " << sizeof(integer_t) << "\n\n";
std::vector<integer_t> input;
{
input.reserve(number_of_elements);
std::random_device rd;
std::mt19937 gen( rd() );
std::uniform_int_distribution<> dist(dist_min, dist_max);
for(std::size_t i = 0; i < number_of_elements; ++i)
input.push_back( dist(gen) );
}
auto b = begin(input);
auto e = end(input);
using res_t = result_t<integer_t>;
std::vector< std::function<res_t()> > algorithms;
#define MAKE_BINDER(B, E, ALGO, NAME) \
std::bind( &apply_algorithm<decltype(B),decltype(ALGO)>, \
B,E,ALGO,NAME )
constexpr auto lightness_name = "Lightness Races in Orbit";
algorithms.push_back( MAKE_BINDER(b, e, lightness(), lightness_name) );
algorithms.push_back( MAKE_BINDER(b, e, dyp(), "dyp") );
algorithms.push_back( MAKE_BINDER(b, e, nim(), "Nim") );
algorithms.push_back( MAKE_BINDER(b, e, pts(), "pts") );
algorithms.push_back( MAKE_BINDER(b, e, epost(), "Eric Postpischil") );
algorithms.push_back( MAKE_BINDER(b, e, nyar(), "nyarlathotep") );
algorithms.push_back( MAKE_BINDER(b, e, notbad(), "notbad") );
{
std::srand( std::random_device()() );
std::random_shuffle(begin(algorithms), end(algorithms));
}
std::vector< result_t<integer_t> > res;
for(auto& algo : algorithms)
res.push_back( algo() );
auto reference_solution
= *std::find_if(begin(res), end(res),
[](result_t<integer_t> const& p)
{ return 0 == std::strcmp(lightness_name, p.name); });
std::cout << "reference solution: "<<reference_solution.name<<"\n\n";
for(auto const& e : res)
{
std::cout << "solution \""<<e.name<<"\":\n";
auto ms =
std::chrono::duration_cast<std::chrono::microseconds>(e.duration);
std::cout << "\tduration: "<<ms.count()/1000<<" ms and "
<<ms.count()%1000<<" microseconds\n";
std::cout << "\tcomparison to reference solution: ";
if(e.numbers.size() != reference_solution.numbers.size())
{
std::cout << "ouput count mismatch\n";
break;
}
auto mismatch = std::mismatch(begin(e.numbers), end(e.numbers),
begin(reference_solution.numbers)).first;
if(end(e.numbers) == mismatch)
{
std::cout << "exact match\n";
}else
{
std::cout << "mismatch found\n";
}
}
}
Current algorithms; note I replaced the digit counters and pow-of-10 with the global function, so we all benefit if someone optimizes.
struct lightness
{
template<class RaIt> void operator()(RaIt b, RaIt e)
{
using T = typename std::iterator_traits<RaIt>::value_type;
/**
* Sorts the array lexicographically.
*
* The trick is that we have to compare digits left-to-right
* (considering typical Latin decimal notation) and that each of
* two numbers to compare may have a different number of digits.
*
* This is very efficient in storage space, but inefficient in
* execution time; an approach that pre-visits each element and
* stores a translated representation will at least double your
* storage requirements (possibly a problem with large inputs)
* but require only a single translation of each element.
*/
std::sort(
b,
e,
[](T lhs, T rhs) -> bool {
// Returns true if lhs < rhs
// Returns false otherwise
const auto BASE = 10;
const bool LHS_FIRST = true;
const bool RHS_FIRST = false;
const bool EQUAL = false;
// There's no point in doing anything at all
// if both inputs are the same; strict-weak
// ordering requires that we return `false`
// in this case.
if (lhs == rhs) {
return EQUAL;
}
// Compensate for sign
if (lhs < 0 && rhs < 0) {
// When both are negative, sign on its own yields
// no clear ordering between the two arguments.
//
// Remove the sign and continue as for positive
// numbers.
lhs *= -1;
rhs *= -1;
}
else if (lhs < 0) {
// When the LHS is negative but the RHS is not,
// consider the LHS "first" always as we wish to
// prioritise the leading '-'.
return LHS_FIRST;
}
else if (rhs < 0) {
// When the RHS is negative but the LHS is not,
// consider the RHS "first" always as we wish to
// prioritise the leading '-'.
return RHS_FIRST;
}
// Counting the number of digits in both the LHS and RHS
// arguments is *almost* trivial.
const auto lhs_digits = (
lhs == 0
? 1
: std::ceil(std::log(lhs+1)/std::log(BASE))
);
const auto rhs_digits = (
rhs == 0
? 1
: std::ceil(std::log(rhs+1)/std::log(BASE))
);
// Now we loop through the positions, left-to-right,
// calculating the digit at these positions for each
// input, and comparing them numerically. The
// lexicographic nature of the sorting comes from the
// fact that we are doing this per-digit comparison
// rather than considering the input value as a whole.
const auto max_pos = std::max(lhs_digits, rhs_digits);
for (auto pos = 0; pos < max_pos; pos++) {
if (lhs_digits - pos == 0) {
// Ran out of digits on the LHS;
// prioritise the shorter input
return LHS_FIRST;
}
else if (rhs_digits - pos == 0) {
// Ran out of digits on the RHS;
// prioritise the shorter input
return RHS_FIRST;
}
else {
const auto lhs_x = (lhs / static_cast<decltype(BASE)>(std::pow(BASE, lhs_digits - 1 - pos))) % BASE;
const auto rhs_x = (rhs / static_cast<decltype(BASE)>(std::pow(BASE, rhs_digits - 1 - pos))) % BASE;
if (lhs_x < rhs_x)
return LHS_FIRST;
else if (rhs_x < lhs_x)
return RHS_FIRST;
}
}
// If we reached the end and everything still
// matches up, then something probably went wrong
// as I'd have expected to catch this in the tests
// for equality.
assert("Unknown case encountered");
// dyp: suppress warning and throw
throw "up";
}
);
}
};
namespace ndyp
{
// helper to provide integers with the same number of digits
template<class T, class U>
std::pair<T, T> lexicographic_pair_helper(T const p, U const maxDigits)
{
auto const digits = count_digits(p);
// append zeros so that `l` has `maxDigits` digits
auto const l = static_cast<T>( p * my_pow10(maxDigits-digits) );
return {l, p};
}
template<class RaIt>
using pair_vec
= std::vector<std::pair<typename std::iterator_traits<RaIt>::value_type,
typename std::iterator_traits<RaIt>::value_type>>;
template<class RaIt>
pair_vec<RaIt> lexicographic_sort(RaIt p_beg, RaIt p_end)
{
if(p_beg == p_end) return pair_vec<RaIt>{};
auto max = *std::max_element(p_beg, p_end);
auto maxDigits = count_digits(max);
pair_vec<RaIt> result;
result.reserve( std::distance(p_beg, p_end) );
for(auto i = p_beg; i != p_end; ++i)
result.push_back( lexicographic_pair_helper(*i, maxDigits) );
using value_type = typename pair_vec<RaIt>::value_type;
std::sort(begin(result), end(result),
[](value_type const& l, value_type const& r)
{
if(l.first < r.first) return true;
if(l.first > r.first) return false;
return l.second < r.second; }
);
return result;
}
}
struct dyp
{
template<class RaIt> void operator()(RaIt b, RaIt e)
{
auto pairvec = ndyp::lexicographic_sort(b, e);
std::transform(begin(pairvec), end(pairvec), b,
[](typename decltype(pairvec)::value_type const& e) { return e.second; });
}
};
namespace nnim
{
bool comp(int l, int r)
{
int lv[10] = {}; // probably possible to get this from numeric_limits
int rv[10] = {};
int lc = 10; // ditto
int rc = 10;
while (l || r)
{
if (l)
{
auto t = l / 10;
lv[--lc] = l - (t * 10);
l = t;
}
if (r)
{
auto t = r / 10;
rv[--rc] = r - (t * 10);
r = t;
}
}
while (lc < 10 && rc < 10)
{
if (lv[lc] == rv[rc])
{
lc++;
rc++;
}
else
return lv[lc] < rv[rc];
}
return lc > rc;
}
}
struct nim
{
template<class RaIt> void operator()(RaIt b, RaIt e)
{
std::sort(b, e, nnim::comp);
}
};
struct pts
{
template<class T> static bool lex_less(T a, T b) {
unsigned la = 1, lb = 1;
for (T t = a; t > 9; t /= 10) ++la;
for (T t = b; t > 9; t /= 10) ++lb;
const bool ll = la < lb;
while (la > lb) { b *= 10; ++lb; }
while (lb > la) { a *= 10; ++la; }
return a == b ? ll : a < b;
}
template<class RaIt> void operator()(RaIt b, RaIt e)
{
std::sort(b, e, lex_less<typename std::iterator_traits<RaIt>::value_type>);
}
};
struct epost
{
static bool compare(int x, int y)
{
static const double limit = .5 * (log(INT_MAX) - log(INT_MAX-1));
double lx = log10(x);
double ly = log10(y);
double fx = lx - floor(lx); // Get the mantissa of lx.
double fy = ly - floor(ly); // Get the mantissa of ly.
return fabs(fx - fy) < limit ? lx < ly : fx < fy;
}
template<class RaIt> void operator()(RaIt b, RaIt e)
{
std::sort(b, e, compare);
}
};
struct nyar
{
static bool lexiSmaller(int i1, int i2)
{
int digits1 = count_digits(i1);
int digits2 = count_digits(i2);
double val1 = i1/pow(10.0, digits1-1);
double val2 = i2/pow(10.0, digits2-1);
while (digits1 > 0 && digits2 > 0 && (int)val1 == (int)val2)
{
digits1--;
digits2--;
val1 = (val1 - (int)val1)*10;
val2 = (val2 - (int)val2)*10;
}
if (digits1 > 0 && digits2 > 0)
{
return (int)val1 < (int)val2;
}
return (digits2 > 0);
}
template<class RaIt> void operator()(RaIt b, RaIt e)
{
std::sort(b, e, lexiSmaller);
}
};
struct notbad
{
static int up_10pow(int n) {
int ans = 1;
while (ans < n) ans *= 10;
return ans;
}
static bool compare(int v1, int v2) {
int ceil1 = up_10pow(v1), ceil2 = up_10pow(v2);
while ( ceil1 != 0 && ceil2 != 0) {
if (v1 / ceil1 < v2 / ceil2) return true;
else if (v1 / ceil1 > v2 / ceil2) return false;
ceil1 /= 10;
ceil2 /= 10;
}
if (v1 < v2) return true;
return false;
}
template<class RaIt> void operator()(RaIt b, RaIt e)
{
std::sort(b, e, compare);
}
};
I believe the following works as a sort comparison function for positive integers provided the integer type used is substantially narrower than the double type (e.g., 32-bit int and 64-bit double) and the log10 routine used returns exactly correct results for exact powers of 10 (which a good implementation does):
static const double limit = .5 * (log(INT_MAX) - log(INT_MAX-1));
double lx = log10(x);
double ly = log10(y);
double fx = lx - floor(lx); // Get the mantissa of lx.
double fy = ly - floor(ly); // Get the mantissa of ly.
return fabs(fx - fy) < limit ? lx < ly : fx < fy;
It works by comparing the mantissas of the logarithms. The mantissas are the fractional parts of the logarithm, and they indicate the value of the significant digits of a number without the magnitude (e.g., the logarithms of 31, 3.1, and 310 have exactly the same mantissa).
The purpose of fabs(fx - fy) < limit is to allow for errors in taking the logarithm, which occur both because implementations of log10 are imperfect and because the floating-point format forces some error. (The integer portions of the logarithms of 31 and 310 use different numbers of bits, so there are different numbers of bits left for the significand, so they end up being rounded to slightly different values.) As long as the integer type is substantially narrower than the double type, the calculated limit will be much larger than the error in log10. Thus, the test fabs(fx - fy) < limit essentially tells us whether two calculated mantissas would be equal if calculated exactly.
If the mantissas differ, they indicate the lexicographic order, so we return fx < fy. If they are equal, then the integer portion of the logarithm tells us the order, so we return lx < ly.
It is simple to test whether log10 returns correct results for every power of ten, since there are so few of them. If it does not, adjustments can be made easily: Insert if (1-fx < limit) fx = 0; if (1-fu < limit) fy = 0;. This allows for when log10 returns something like 4.99999… when it should have returned 5.
This method has the advantage of not using loops or division (which is time-consuming on many processors).
The task sounds like a natural fit for an MSD variant of Radix Sort with padding ( http://en.wikipedia.org/wiki/Radix_sort ).
Depends on how much code you want to throw at it. The simple code as the others show is O(log n) complexity, while a fully optimized radix sort would be O(kn).
A compact solution if all your numbers are nonnegative and they are small enough so that multiplying them by 10 doesn't cause an overflow:
template<class T> bool lex_less(T a, T b) {
unsigned la = 1, lb = 1;
for (T t = a; t > 9; t /= 10) ++la;
for (T t = b; t > 9; t /= 10) ++lb;
const bool ll = la < lb;
while (la > lb) { b *= 10; ++lb; }
while (lb > la) { a *= 10; ++la; }
return a == b ? ll : a < b;
}
Run it like this:
#include <iostream>
#include <algorithm>
int main(int, char **) {
unsigned short input[] = { 100, 21 , 22 , 99 , 1 , 927 };
unsigned input_size = sizeof(input) / sizeof(input[0]);
std::sort(input, input + input_size, lex_less<unsigned short>);
for (unsigned i = 0; i < input_size; ++i) {
std::cout << ' ' << input[i];
}
std::cout << std::endl;
return 0;
}
You could try using the % operator to give you access to each individual digit eg 121 % 100 will give you the first digit and check that way but you'll have to find a way to get around the fact they have different sizes.
So find the maximum value in array. I don't know if theres a function for this in built you could try.
int Max (int* pdata,int size)
{
int temp_max =0 ;
for (int i =0 ; i < size ; i++)
{
if (*(pdata+i) > temp_max)
{
temp_max = *(pdata+i);
}
}
return temp_max;
}
This function will return the number of digits in the number
int Digit_checker(int n)
{
int num_digits = 1;
while (true)
{
if ((n % 10) == n)
return num_digits;
num_digits++;
n = n/10;
}
return num_digits;
}
Let number of digits in max equal n.
Once you have this open a for loop in the format of
for (int i = 1; i < n ; i++)
then you can go through your and use "data[i] % (10^(n-i))" to get access to the first digit then
sort that and then on the next iteration you'll get access to the second digit. I Don't know how you'll sort them though.
It wont work for negative numbers and you'll have to get around data[i] % (10^(n-i)) returning itself for numbers with less digits than max
Overload the < operator to compare two integers lexicographically. For each integer, find the smallest 10^k, which is not less than the given integer. Than compare the digits one by one.
class CmpIntLex {
int up_10pow(int n) {
int ans = 1;
while (ans < n) ans *= 10;
return ans;
}
public:
bool operator ()(int v1, int v2) {
int ceil1 = up_10pow(v1), ceil2 = up_10pow(v2);
while ( ceil1 != 0 && ceil2 != 0) {
if (v1 / ceil1 < v2 / ceil2) return true;
else if (v1 / ceil1 > v2 / ceil2) return false;
ceil1 /= 10;
ceil2 /= 10;
}
if (v1 < v2) return true;
return false;
}
int main() {
vector<int> vi = {12,45,12134,85};
sort(vi.begin(), vi.end(), CmpIntLex());
}
While some other answers here (Lightness's, notbad's) are already showing quite good code, I believe I can add one solution which might be more performant (since it requires neither division nor power in each loop; but it requires floating point arithmetic, which again might make it slow, and possibly inaccurate for large numbers):
#include <algorithm>
#include <iostream>
#include <assert.h>
// method taken from http://stackoverflow.com/a/1489873/671366
template <class T>
int numDigits(T number)
{
int digits = 0;
if (number < 0) digits = 1; // remove this line if '-' counts as a digit
while (number) {
number /= 10;
digits++;
}
return digits;
}
bool lexiSmaller(int i1, int i2)
{
int digits1 = numDigits(i1);
int digits2 = numDigits(i2);
double val1 = i1/pow(10.0, digits1-1);
double val2 = i2/pow(10.0, digits2-1);
while (digits1 > 0 && digits2 > 0 && (int)val1 == (int)val2)
{
digits1--;
digits2--;
val1 = (val1 - (int)val1)*10;
val2 = (val2 - (int)val2)*10;
}
if (digits1 > 0 && digits2 > 0)
{
return (int)val1 < (int)val2;
}
return (digits2 > 0);
}
int main(int argc, char* argv[])
{
// just testing whether the comparison function works as expected:
assert (lexiSmaller(1, 100));
assert (!lexiSmaller(100, 1));
assert (lexiSmaller(100, 22));
assert (!lexiSmaller(22, 100));
assert (lexiSmaller(927, 99));
assert (!lexiSmaller(99, 927));
assert (lexiSmaller(1, 927));
assert (!lexiSmaller(927, 1));
assert (lexiSmaller(21, 22));
assert (!lexiSmaller(22, 21));
assert (lexiSmaller(22, 99));
assert (!lexiSmaller(99, 22));
// use the comparison function for the actual sorting:
int input[] = { 100 , 21 , 22 , 99 , 1 ,927 };
std::sort(&input[0], &input[5], lexiSmaller);
std::cout << "sorted: ";
for (int i=0; i<6; ++i)
{
std::cout << input[i];
if (i<5)
{
std::cout << ", ";
}
}
std::cout << std::endl;
return 0;
}
Though I have to admit I haven't tested the performance yet.
Here is the dumb solution that doesn't use any floating point tricks. It's pretty much the same as the string comparison, but doesn't use a string per say, doesn't also handle negative numbers, to do that add a section at the top...
bool comp(int l, int r)
{
int lv[10] = {}; // probably possible to get this from numeric_limits
int rv[10] = {};
int lc = 10; // ditto
int rc = 10;
while (l || r)
{
if (l)
{
auto t = l / 10;
lv[--lc] = l - (t * 10);
l = t;
}
if (r)
{
auto t = r / 10;
rv[--rc] = r - (t * 10);
r = t;
}
}
while (lc < 10 && rc < 10)
{
if (lv[lc] == rv[rc])
{
lc++;
rc++;
}
else
return lv[lc] < rv[rc];
}
return lc > rc;
}
It's fast, and I'm sure it's possible to make it faster still, but it works and it's dumb enough to understand...
EDIT: I ate to dump lots of code, but here is a comparison of all the solutions so far..
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>
#include <utility>
#include <cmath>
#include <cassert>
#include <chrono>
std::pair<int, int> lexicographic_pair_helper(int p, int maxDigits)
{
int digits = std::log10(p);
int l = p*std::pow(10, maxDigits-digits);
return {l, p};
}
bool l_comp(int l, int r)
{
int lv[10] = {}; // probably possible to get this from numeric_limits
int rv[10] = {};
int lc = 10; // ditto
int rc = 10;
while (l || r)
{
if (l)
{
auto t = l / 10;
lv[--lc] = l - (t * 10);
l = t;
}
if (r)
{
auto t = r / 10;
rv[--rc] = r - (t * 10);
r = t;
}
}
while (lc < 10 && rc < 10)
{
if (lv[lc] == rv[rc])
{
lc++;
rc++;
}
else
return lv[lc] < rv[rc];
}
return lc > rc;
}
int up_10pow(int n) {
int ans = 1;
while (ans < n) ans *= 10;
return ans;
}
bool l_comp2(int v1, int v2) {
int n1 = up_10pow(v1), n2 = up_10pow(v2);
while ( v1 != 0 && v2 != 0) {
if (v1 / n1 < v2 / n2) return true;
else if (v1 / n1 > v2 / n2) return false;
v1 /= 10;
v2 /= 10;
n1 /= 10;
n2 /= 10;
}
if (v1 == 0 && v2 != 0) return true;
return false;
}
int main()
{
std::vector<int> numbers;
{
constexpr int number_of_elements = 1E6;
std::random_device rd;
std::mt19937 gen( rd() );
std::uniform_int_distribution<> dist;
for(int i = 0; i < number_of_elements; ++i) numbers.push_back( dist(gen) );
}
std::vector<int> lo(numbers);
std::vector<int> dyp(numbers);
std::vector<int> nim(numbers);
std::vector<int> nb(numbers);
std::cout << "starting..." << std::endl;
{
auto start = std::chrono::high_resolution_clock::now();
/**
* Sorts the array lexicographically.
*
* The trick is that we have to compare digits left-to-right
* (considering typical Latin decimal notation) and that each of
* two numbers to compare may have a different number of digits.
*
* This probably isn't very efficient, so I wouldn't do it on
* "millions" of numbers. But, it works...
*/
std::sort(
std::begin(lo),
std::end(lo),
[](int lhs, int rhs) -> bool {
// Returns true if lhs < rhs
// Returns false otherwise
const auto BASE = 10;
const bool LHS_FIRST = true;
const bool RHS_FIRST = false;
const bool EQUAL = false;
// There's no point in doing anything at all
// if both inputs are the same; strict-weak
// ordering requires that we return `false`
// in this case.
if (lhs == rhs) {
return EQUAL;
}
// Compensate for sign
if (lhs < 0 && rhs < 0) {
// When both are negative, sign on its own yields
// no clear ordering between the two arguments.
//
// Remove the sign and continue as for positive
// numbers.
lhs *= -1;
rhs *= -1;
}
else if (lhs < 0) {
// When the LHS is negative but the RHS is not,
// consider the LHS "first" always as we wish to
// prioritise the leading '-'.
return LHS_FIRST;
}
else if (rhs < 0) {
// When the RHS is negative but the LHS is not,
// consider the RHS "first" always as we wish to
// prioritise the leading '-'.
return RHS_FIRST;
}
// Counting the number of digits in both the LHS and RHS
// arguments is *almost* trivial.
const auto lhs_digits = (
lhs == 0
? 1
: std::ceil(std::log(lhs+1)/std::log(BASE))
);
const auto rhs_digits = (
rhs == 0
? 1
: std::ceil(std::log(rhs+1)/std::log(BASE))
);
// Now we loop through the positions, left-to-right,
// calculating the digit at these positions for each
// input, and comparing them numerically. The
// lexicographic nature of the sorting comes from the
// fact that we are doing this per-digit comparison
// rather than considering the input value as a whole.
const auto max_pos = std::max(lhs_digits, rhs_digits);
for (auto pos = 0; pos < max_pos; pos++) {
if (lhs_digits - pos == 0) {
// Ran out of digits on the LHS;
// prioritise the shorter input
return LHS_FIRST;
}
else if (rhs_digits - pos == 0) {
// Ran out of digits on the RHS;
// prioritise the shorter input
return RHS_FIRST;
}
else {
const auto lhs_x = (lhs / static_cast<decltype(BASE)>(std::pow(BASE, lhs_digits - 1 - pos))) % BASE;
const auto rhs_x = (rhs / static_cast<decltype(BASE)>(std::pow(BASE, rhs_digits - 1 - pos))) % BASE;
if (lhs_x < rhs_x)
return LHS_FIRST;
else if (rhs_x < lhs_x)
return RHS_FIRST;
}
}
// If we reached the end and everything still
// matches up, then something probably went wrong
// as I'd have expected to catch this in the tests
// for equality.
assert("Unknown case encountered");
}
);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
std::cout << "Lightness: " << elapsed.count() << '\n';
}
{
auto start = std::chrono::high_resolution_clock::now();
auto max = *std::max_element(begin(dyp), end(dyp));
int maxDigits = std::log10(max);
std::vector<std::pair<int,int>> temp;
temp.reserve(dyp.size());
for(auto const& e : dyp) temp.push_back( lexicographic_pair_helper(e, maxDigits) );
std::sort(begin(temp), end(temp), [](std::pair<int, int> const& l, std::pair<int, int> const& r)
{ if(l.first < r.first) return true; if(l.first > r.first) return false; return l.second < r.second; });
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
std::cout << "Dyp: " << elapsed.count() << '\n';
}
{
auto start = std::chrono::high_resolution_clock::now();
std::sort (nim.begin(), nim.end(), l_comp);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
std::cout << "Nim: " << elapsed.count() << '\n';
}
// {
// auto start = std::chrono::high_resolution_clock::now();
// std::sort (nb.begin(), nb.end(), l_comp2);
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = end - start;
// std::cout << "notbad: " << elapsed.count() << '\n';
// }
std::cout << (nim == lo) << std::endl;
std::cout << (nim == dyp) << std::endl;
std::cout << (lo == dyp) << std::endl;
// std::cout << (lo == nb) << std::endl;
}
Based on #Oswald's answer, below is some code that does the same.
#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;
bool compare(string a, string b){
// Check each digit
int i = 0, j = 0;
while(i < a.size() && j < b.size()){
// If different digits
if(a[i] - '0' != b[j] - '0')
return (a[i] - '0' < b[j] - '0');
i++, j++;
}
// Different sizes
return (a.size() < b.size());
}
int main(){
vector<string> array = {"1","2","3","4","5","6","7","8","9","10","11","12"};
sort(array.begin(), array.end(), compare);
for(auto value : array)
cout << value << " ";
return 0;
}
Input: 1 2 3 4 5 6 7 8 9 10 11 12
Output: 1 10 11 12 2 3 4 5 6 7 8 9