The typical Towers of Hanoi Problem Solver would be the following:
void hanoi(int diskNumber , int start, int temp, int finish)
{
if(diskNumber == 1)
{
cout<< " Move Disk " << diskNumber<<" from " << start <<" to "<< finish<<endl;
}
else
{
hanoi(diskNumber-1,start,temp,finish);
cout<<"Move Disk from " << start <<" to "<<finish<<endl;
hanoi(diskNumber - 1,temp,start,finish);
}
}
But what I want to do is calculating the time that the algorithm runs. thus:
int main
{
//Hanoi:
cout<<"Hanoi Tower Problem:"<<endl;
//3 Disks:
clock_t htimer3 = clock();
hanoi(3, 1,2,3);
cout<<"CPU Time for n = 3 is: "
<<clock() - htimer3/CLOCKS_PER_SEC<<endl;
//6 Disks:
clock_t htimer6 = clock();
hanoi(6, 1,2,3);
cout<<"CPU Time for n = 6 is: "
<<clock() - htimer6/CLOCKS_PER_SEC<<endl;
//9 Disks:
clock_t htimer9 = clock();
hanoi(9, 1,2,3);
cout<<"CPU Time for n = 9 is: "
<<clock() - htimer9/CLOCKS_PER_SEC<<endl;
//12 Disks:
clock_t htimer12 = clock();
hanoi(12, 1,2,3);
cout<<"CPU Time for n = 12 is: "
<<clock() - htimer12/CLOCKS_PER_SEC<<endl;
//15 Disks:
clock_t htimer15 = clock();
hanoi(15, 1,2,3);
cout<<"CPU Time for n = 15 is: "
<<clock() - htimer15/CLOCKS_PER_SEC<<endl;
//End of Hanoi Tower Problem
return 0;
}
the problem here is that for example if I set the diskNumber = 15, the code's gonna run for 32767 times which will fill out the terminal window and I'll lose generated lines that comes before it (I have to calculate some other algorithms like bubble sort quick sort etc. I'm gonna use the numbers to draw a chart later to represent their Big O, i.e: Big O of Towers of Hanoi Algorithm is 2^n) .
To solve this problem I modified the code:
void hanoi(int diskSize, int start, int finish, int temp)
{
if(diskSize == 1)
{
return;
}
else
{
hanoi(diskSize - 1, start, temp, finish);
hanoi(diskSize - 1, temp, finish, start);
}
}
My main question: does the modified code, take the same running time as if it was the original algorithm? if not, what should do? any suggestions?
Yes, the time complexity of your modified code is same as the previous one since cout takes constant time. So your running time will not be affected much (granularity would be in order of nanoseconds) considering the stream you're writing to.
I would recomment redirecting the output to a file.
For example:
./executable > FileName
Related
This question already has answers here:
Measuring execution time of a function in C++
(14 answers)
Closed 3 years ago.
int Binary_search()
{
string word;
cout << "Enter The Word You Want To Find : ";
cin >> word;
int start = 0, end = data.size() - 1;
int mid, i = 0, counter = 0;
while (start <= end)
{
mid = (end + start) / 2;
if (data[i] == word)
return i;
else if (data[i] > word)
end = mid - 1;
else
start = mid + 1;
counter++;
i++;
}
return -1;
}
if i wanna to know how much time this code would take to find a word in data which is a vector of type string and it loaded with words.
The std::chrono::high_resolution_clock is the most accurate and hence it is used to measure execution time.
//Get the timepoint before the function is called
auto start = high_resolution_clock::now();
// Binary search ()
// Get ending timepoint..ie after you function is called
auto stop = high_resolution_clock::now();
// Get duration. Substart timepoints to
// get durarion. To cast it to proper unit
// use duration cast method
auto duration = duration_cast<microseconds>(stop - start);
You can use std::chrono::high_resolution_clock for that.
// get start time
auto start = std::chrono::high_resolution_clock::now();
// do some work
Binary_search();
// get end time
auto end = std::chrono::high_resolution_clock::now();
// calculate difference as double value in seconds
std::chrono::duration<double> diff = end-start;
// print the measured value
std::cout << "Time taken : " << diff.count() << " seconds\n";
I'm trying to find the number of prime numbers below 400 million but even with just 40 million my code is taking 8 secs to run. what am i doing wrong?
what can i do to make it faster?
#include<iostream>
#include<math.h>
#include<vector>
using namespace std;
int main()
{
vector<bool> k;
vector<long long int> c;
for (int i=2;i<40000000;i++)
{
k.push_back(true);
c.push_back(i);
}
for ( int i=0;i<sqrt(40000000)+1;i++)
{
if (k[i]==true)
{
for (int j=i+c[i];j<40000000;j=j+c[i])
{
k[j]=false;
}
}
}
vector <long long int> arr;
for ( int i=0;i<40000000-2;i++)
{
if (k[i]==true)
{
arr.push_back(c[i]);
}
}
cout << arr.size() << endl ;
return 0;
}
I profiled your code as well as a simple tweak, below. The tweak is more than twice as fast:
auto start = std::chrono::high_resolution_clock::now();
//original version
vector<bool> k;
vector<long long int> c;
for (int i=2;i<40000000;i++)
{
k.push_back(true);
c.push_back(i);
}
for ( int i=0;i<sqrt(40000000)+1;i++)
{
if (k[i]==true)
{
for (int j=i+c[i];j<40000000;j=j+c[i])
{
k[j]=false;
}
}
}
vector <long long int> arr;
for ( int i=0;i<40000000-2;i++)
{
if (k[i]==true)
{
arr.push_back(c[i]);
}
}
cout << arr.size() << endl ;
auto end1 = std::chrono::high_resolution_clock::now();
std::cout << "Elapsed = " <<
std::chrono::duration_cast<std::chrono::milliseconds>(end1 - start).count() <<
std::endl;
}
{
auto begin = std::chrono::high_resolution_clock::now();
//new version
const long limit{40000000};
vector<bool> k(limit-1,true);
//k[0] is the number 0
k[0]=false; k[1]=false;
auto sq = sqrt(limit) + 1;
//start at the number 2
for ( int i=2;i<sq;i++)
{
if (k[i]==true)
{
for (int j=i+i;j<limit;j+=i)
{
k[j]=false;
}
}
}
vector <long long int> arr;
for ( int i=0;i<limit-2;i++)
{
if (k[i]==true)
{
arr.push_back(i);
}
}
cout << arr.size() << endl ;
auto stop = std::chrono::high_resolution_clock::now();
std::cout << "Elapsed = " <<
std::chrono::duration_cast<std::chrono::milliseconds>(stop - begin).count() <<
std::endl;
}
Here is the output (elapsed in milliseconds), in Debug mode:
2433654
Elapsed = 5787
2433654
Elapsed = 2432
Both have same results, second is much faster.
Here is another version using some nice C++ features (requiring less code), and it is about 11% faster than the second version above:
auto begin = std::chrono::high_resolution_clock::now();
const long limit{40000000};
vector<int> k(limit-1,0);
//fill with sequence of integers
std::iota(k.begin(),k.end(),0);
//k[0] is the number 0
//integers reset to 0 are not prime
k[0]=0; k[1]=0;
auto sq = sqrt(limit) + 1;
//start at the number 2
for (int i=2;i<sq;i++)
{
if (k[i])
{
for (int j=i+i;j<limit;j+=i)
{
k[j]=0;
}
}
}
auto results = std::remove(k.begin(),k.end(),0);
cout << results - k.begin() << endl ;
auto stop = std::chrono::high_resolution_clock::now();
std::cout << "Elapsed = " <<
std::chrono::duration_cast<std::chrono::milliseconds>(stop - begin).count() <<
std::endl;
}
Note that in your original version, you push_back in three different places, while this use of modern idioms never uses push_back at all when operating on the vectors.
In this example, the vector is of ints so that you have the actual list of prime numbers when you are finished.
Output:
2433654
Elapsed = 2160
These above are all Debug mode numbers.
In Release mode, the best is a combination of the second and third techniques above, using the numeric with a vector of bools, if you don't care what the actual prime numbers are in the end:
2433654
Elapsed = 1098
2433654
Elapsed bool remove= 410
2433654
Elapsed = 779
Note that your original code only takes about 1 second on my 5 year-old laptop in Release mode, so you are probably running in Debug mode.
I got it down from taking 10 seconds to run to just half a second on my computer by changing two things. First, I'm guessing you didn't compile it with optimization enabled. That brought it from 10 seconds down to 1 second for me. Second, the vector c is unnecessary. Everywhere you have c[i] in your code you can replace it with i+2. This will make it run twice as fast.
Remove vector c, you don't need it.
Create vector k with known size at start. Repeatedly appending elements to a vector by invoking push_back() is a really bad idea from a performance point of view, as it can cause repeated memory reallocations and copies.
http://primesieve.org/segmented_sieve.html - segmented version for inspiration.
You can skip processing multiples of 2 and 3. link from code review
It looks that you've got some issue in compiler optimization flag settings. Maybe you didn't change configuration from debug to release. What is your release speedup vs debug one?
I am having problems with my timing functions here. I have a program that is timing how long a binary search is taking to find a given number in a list of sorted elements in an array.
So i am getting strange results and I'm not sure why.
For example this last run i did, the program said that it took 0 microseconds to find the value not in the array of size 100,000 elements, but just before it the program searched an array of 95,000 elements which also found the value was not in the array yet it took 4080005 microseconds.
Here is my function code.
Thanks for any help!
int binarySearch(int array[], int numElems, int value)
{
auto start =chrono::steady_clock::now();
cout << "Searching..."<< endl;
//variables
int first = 0,
last = numElems - 1,
middle,
position = -1;
bool found = false;
//Checks values for match
while (!found && first <= last)
{
//divides elements
middle = (first + last) / 2;
if (array[middle] == value)
{
found = true;
position = middle;
}
else if (array[middle] > value)
last = middle - 1;
else
first = middle + 1;
}
auto end = chrono::steady_clock::now();
auto elasped = std::chrono::duration_cast<std::chrono::microseconds>(end-start);
cout << "Time Taken: " << elasped.count() << " microseconds." << endl;
return position;
}
Running your code with a worst case search I consistently get between 25 and 86 microseconds on my machine. Moving the cout outside the clocked section of code, I get a consistent 0 microseconds.
Maybe your stdout buffer was hung for 4 seconds. Sending text to the terminal is an extraordinarily slow process. The binary search is fast; O(log(n)), which for 100,000 is 6 comparisons, worst case. 0 microseconds makes a lot of sense. I bet it was your terminal buffers being wonky.
Now for kicks, I switched to the high_resolution_clock.
$ ./a.out
Searching...
Time Taken: 619 nanoseconds.
Position: 99999
Source:
int binarySearch(int array[], int numElems, int value)
{
cout << "Searching..."<< endl;
auto start =chrono::high_resolution_clock::now();
//variables
int first = 0,
last = numElems - 1,
middle,
position = -1;
bool found = false;
//Checks values for match
while (!found && first <= last)
{
//divides elements
middle = (first + last) / 2;
if (array[middle] == value)
{
found = true;
position = middle;
}
else if (array[middle] > value)
last = middle - 1;
else
first = middle + 1;
}
auto end = chrono::high_resolution_clock::now();
auto elasped = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start);
cout << "Time Taken: " << elasped.count() << " nanoseconds." << endl;
return position;
}
I am working on a proof of concept test program for a game where certain actions are threaded and information is output to the command window for each thread. So far I have gotten the basic threading process to work but it seems that the couting in my called function is not being written for each thread and instead each thread is overwriting the others output.
The desired or expected output is that each thread will output the information couted within the mCycle function of mLaser. Essentially this is meant to be a timer of sorts for each object counting down the time until that object has completed its task. There should be an output for each thread, so if five threads are running there should be five counters counting down independently.
The current output is such that each thread is outputting its own information with in the same space which then overwrites what another thread is attempting to output.
Here is an example of the current output of the program:
Time until cycle Time until cycle 74 is complete: 36 is complete:
92 seconds 2 seconds ress any key to continue . . .
You can see the aberrations where numbers and other text are in places they should not be if you examine how the information is couted from mCycle.
What should be displayed is more long these lines:
Time until cycle 1 is complete:
92 seconds
Time until cycle 2 is complete:
112 seconds
Time until cycle 3 is complete:
34 seconds
Cycle 4 has completed!
I am not sure if this is due to some kind of thread locking going on due to how my code is structured or just an oversight in my coding for the output. If I could get a fresh pair of eyes to look over the code and point anything out that could be the fault I would appreciate it.
Here is my code, it should be compilable in any MSVS 2013 install (no custom libraries used)
#include <iostream>
#include <Windows.h>
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <future>
using namespace std;
class mLaser
{
public:
mLaser(int clen, float mamt)
{
mlCLen = clen;
mlMAmt = mamt;
}
int getCLen()
{
return mlCLen;
}
float getMAmt()
{
return mlMAmt;
}
void mCycle(int i1, int mCLength)
{
bool bMCycle = true;
int mCTime_left = mCLength * 1000;
int mCTime_start = GetTickCount(); //Get cycle start time
int mCTime_old = ((mCTime_start + 500) / 1000);
cout << "Time until cycle " << i1 << " is complete: " << endl;
while (bMCycle)
{
cout << ((mCTime_left + 500) / 1000) << " seconds";
bool bNChange = true;
while (bNChange)
{
//cout << ".";
int mCTime_new = GetTickCount();
if (mCTime_old != ((mCTime_new + 500) / 1000))
{
//cout << mCTime_old << " " << ((mCTime_new+500)/1000) << endl;
mCTime_old = ((mCTime_new + 500) / 1000);
mCTime_left -= 1000;
bNChange = false;
}
}
cout << " \r" << flush;
if (mCTime_left == 0)
{
bMCycle = false;
}
}
cout << "Mining Cycle " << i1 << " finished" << endl;
system("Pause");
return true;
}
private:
int mlCLen;
float mlMAmt;
};
string sMCycle(mLaser ml, int i1, thread& thread);
int main()
{
vector<mLaser> mlasers;
vector<thread> mthreads;
future<string> futr;
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<> laser(1, 3);
uniform_int_distribution<> cLRand(30, 90);
uniform_real_distribution<float> mARand(34.0f, 154.3f);
int lasers;
int cycle_time;
float mining_amount;
lasers = laser(gen);
for (int i = 0; i < lasers-1; i++)
{
mlasers.push_back(mLaser(cLRand(gen), mARand(gen)));
mthreads.push_back(thread());
}
for (int i = 0; i < mlasers.size(); i++)
{
futr = async(launch::async, [mlasers, i, &mthreads]{return sMCycle(mlasers.at(i), i + 1, mthreads.at(i)); });
//mthreads.at(i) = thread(bind(&mLaser::mCycle, ref(mlasers.at(i)), mlasers.at(i).getCLen(), mlasers.at(i).getMAmt()));
}
for (int i = 0; i < mthreads.size(); i++)
{
//mthreads.at(i).join();
}
//string temp = futr.get();
//float out = strtof(temp.c_str(),NULL);
//cout << out << endl;
system("Pause");
return 0;
}
string sMCycle(mLaser ml, int i1, thread& t1)
{
t1 = thread(bind(&mLaser::mCycle, ref(ml), ml.getCLen(), ml.getMAmt()));
//t1.join();
return "122.0";
}
Although writing from multiple threads concurrently to std::cout has to be data race free, there is no guarantee that concurrent writes won't be interleaved. I'm not sure if one write operation of one thread can be interleaved with one write operation from another thread but they can certainly be interleaved between write operations (I think individual outputs from different threads can be interleaved).
What the standard has to say about concurrent access to the standard stream objects (i.e. std::cout, std::cin, etc.) is in 27.4.1 [iostream.objects.overview] paragraph 4:
Concurrent access to a synchronized (27.5.3.4) standard iostream object’s formatted and unformatted input (27.7.2.1) and output (27.7.3.1) functions or a standard C stream by multiple threads shall not result in a data race (1.10). [ Note: Users must still synchronize concurrent use of these objects and streams by multiple threads if they wish to avoid interleaved characters. —end note ]
If you want to have output appear in some sort of unit, you will need to synchronize access to std::cout, e.g., by using a mutex.
While Dietmar's answer is sufficient I decided to go a different, much more simple, route. Since I am creating instances of a class and I am accessing those instances in the threads, I chose to update those class' data during the threading and then called the updated data once the thread have finished executing.
This way I do not have to deal with annoying problems like data races nor grabbing output from async in a vector of shared_future. Here is my revised code in case anyone else would like to implement something similar:
#include <iostream>
#include <Windows.h>
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <future>
using namespace std; //Tacky, but good enough fo a poc D:
class mLaser
{
public:
mLaser(int clen, float mamt, int time_left)
{
mlCLen = clen;
mlMAmt = mamt;
mCTime_left = time_left;
bIsCompleted = false;
}
int getCLen()
{
return mlCLen;
}
float getMAmt()
{
return mlMAmt;
}
void setMCOld(int old)
{
mCTime_old = old;
}
void mCycle()
{
if (!bIsCompleted)
{
int mCTime_new = GetTickCount(); //Get current tick count for comparison to mCOld_time
if (mCTime_old != ((mCTime_new + 500) / 1000)) //Do calculations to see if time has passed since mCTime_old was set
{
//If it has then update mCTime_old and remove one second from mCTime_left.
mCTime_old = ((mCTime_new + 500) / 1000);
mCTime_left -= 1000;
}
cur_time = mCTime_left;
}
else
{
mCTime_left = 0;
}
}
int getCTime()
{
return cur_time;
}
int getCTLeft()
{
return mCTime_left;
}
void mCComp()
{
bIsCompleted = true;
}
bool getCompleted()
{
return bIsCompleted;
}
private:
int mlCLen; //Time of a complete mining cycle
float mlMAmt; //Amoung of ore produced by one mining cycle (not used yet)
int cur_time; //The current time remaining in the current mining cycle; will be removing this as it is just a copy of mCTime_left that I was going to use for another possiblity to make this code work
int mCTime_left; //The current time remaining in the current mining cycle
int mCTime_old; //The last time that mCycle was called
bool bIsCompleted; //Flag to check if a mining cycle has already been accounted for as completed
};
void sMCycle(mLaser& ml, int i1, thread& _thread); //Start a mining cycle thread
//Some global defines
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<> laser(1, 10); //A random range for the number of mlaser entities to use
uniform_int_distribution<> cLRand(30, 90); //A random time range in seconds of mining cycle lengths
uniform_real_distribution<float> mARand(34.0f, 154.3f); //A random float range of the amount of ore produced by one mining cycle (not used yet)
int main()
{
//Init some variables for later use
vector<mLaser> mlasers; //Vector to hold mlaser objects
vector<thread> mthreads; //Vector to hold threads
vector<shared_future<int>> futr; //Vector to hold shared_futures (not used yet, might not be used if I can get the code working like this)
int lasers; //Number of lasers to create
int cycle_time; //Mining cycle time
int active_miners = 0; //Number of active mining cycle threads (one for each laser)
float mining_amount; //Amount of ore produced by one mining cycle (not used yet)
lasers = laser(gen); //Get a random number
active_miners = lasers; //Set this to that random number for the while loop later on
//Create the mlaser objects and push them into the mlasers vector
for (int i = 0; i < lasers; i++)
{
int clength = cLRand(gen);
mlasers.push_back(mLaser(clength, mARand(gen), (clength * 1000)));
//Also push thread obects into mthreads for each laser object
mthreads.push_back(thread());
}
//Setup data for mining cycles
for (int i = 0; i < mlasers.size(); i++)
{
int mCTime_start = GetTickCount(); //Get cycle start time
mlasers.at(i).setMCOld(((mCTime_start + 500) / 1000));
}
//Print initial display for mining cycles
for (int i = 0; i < mlasers.size(); i++)
{
cout << "Mining Laser " << i + 1 << " cycle will complete in " << (mlasers.at(i).getCTLeft() + 500) / 1000 << " seconds..." << endl;
}
while (active_miners > 0)
{
for (int i = 0; i < mlasers.size(); i++)
{
//futr.push_back(async(launch::async, [mlasers, i, &mthreads]{return sMCycle(mlasers.at(i), i + 1, mthreads.at(i)); }));
async(launch::async, [&mlasers, i, &mthreads]{return sMCycle(mlasers.at(i), i + 1, mthreads.at(i)); }); //Launch a thread for the current mlaser object
//mthreads.at(i) = thread(bind(&mLaser::mCycle, ref(mlasers.at(i)), mlasers.at(i).getCLen(), mlasers.at(i).getMAmt()));
}
//Output information from loops
//cout << " \r" << flush; //Return cursor to start of line and flush the buffer for the next info
system("CLS");
for (int i = 0; i < mlasers.size(); i++)
{
if (mlasers.at(i).getCTLeft() != 0) //If mining cycle is not completed
{
cout << "Mining Laser " << i + 1 << " cycle will complete in " << (mlasers.at(i).getCTLeft() + 500) / 1000 << " seconds..." << endl;
}
else if (mlasers.at(i).getCTLeft() == 0) //If it is completed
{
if (!mlasers.at(i).getCompleted())
{
mlasers.at(i).mCComp();
active_miners -= 1;
}
cout << "Mining Laser " << i + 1 << " has completed its mining cycle!" << endl;
}
}
}
/*for (int i = 0; i < mthreads.size(); i++)
{
mthreads.at(i).join();
}*/
//string temp = futr.get();
//float out = strtof(temp.c_str(),NULL);
//cout << out << endl;
system("Pause");
return 0;
}
void sMCycle(mLaser& ml, int i1,thread& _thread)
{
//Start thread
_thread = thread(bind(&mLaser::mCycle, ref(ml)));
//Join the thread
_thread.join();
}
Using VexCL in C++ I am trying to count all values in a vector above a certain minimum and I would like to perform this count on the device. The default Reductors only provide methods for MIN, MAX and SUM and the examples do not show very clear how to perform such a operation. This code is slow as it is probably executed on the host instead of the device:
int amount = 0;
int minimum = 5;
for (vex::vector<int>::iterator i = vector.begin(); i != vector.end(); ++i)
{
if (*i >= minimum)
{
amount++;
}
}
The vector I am using will consists of a large amount of values, say millions and mostly zero's. Besides the amount of values that are above the minimum, I also would like to retrieve a list of vector-ID's which contains these values. Is this possible?
If you only needed to count elements above the minimum, this would be as simple as
vex::Reductor<int, vex::SUM> sum(ctx);
int amount = sum( vec >= minimum );
The vec >= minimum expression results in a sequence of ones and zeros, and sum then counts ones.
Now, since you also need to get the positions of the elements above the minimum, it gets a bit more complicated:
#include <iostream>
#include <vexcl/vexcl.hpp>
int main() {
vex::Context ctx(vex::Filter::Env && vex::Filter::Count(1));
// Input vector
vex::vector<int> vec(ctx, {1, 3, 5, 2, 6, 8, 0, 2, 4, 7});
int n = vec.size();
int minimum = 5;
// Put result of (vec >= minimum) into key, and element indices into pos:
vex::vector<int> key(ctx, n);
vex::vector<int> pos(ctx, n);
key = (vec >= minimum);
pos = vex::element_index();
// Get number of interesting elements in vec.
vex::Reductor<int, vex::SUM> sum(ctx);
int amount = sum(key);
// Sort pos by key in descending order.
vex::sort_by_key(key, pos, vex::greater<int>());
// First 'amount' of elements in pos now hold indices of interesting
// elements. Lets use slicer to extract them:
vex::vector<int> indices(ctx, amount);
vex::slicer<1> slice(vex::extents[n]);
indices = slice[vex::range(0, amount)](pos);
std::cout << "indices: " << indices << std::endl;
}
This gives the following output:
indices: {
0: 2 4 5 9
}
#ddemidov
Thanks for your help, it is working. However, it is much slower than my original code which copies the device vector to the host and sorts using Boost. Below is the sample code with some timings:
#include <iostream>
#include <cstdio>
#include <vexcl/vexcl.hpp>
#include <vector>
#include <boost/range/algorithm.hpp>
int main()
{
clock_t start, end;
// initialize vector with random numbers
std::vector<int> hostVector(1000000);
for (int i = 0; i < hostVector.size(); ++i)
{
hostVector[i] = rand() % 20 + 1;
}
// copy to device
vex::Context cpu(vex::Filter::Type(CL_DEVICE_TYPE_CPU) && vex::Filter::Any);
vex::Context gpu(vex::Filter::Type(CL_DEVICE_TYPE_GPU) && vex::Filter::Any);
vex::vector<int> vectorCPU(cpu, 1000000);
vex::vector<int> vectorGPU(gpu, 1000000);
copy(hostVector, vectorCPU);
copy(hostVector, vectorGPU);
// sort results on CPU
start = clock();
boost::sort(hostVector);
end = clock();
cout << "C++: " << (end - start) / (CLOCKS_PER_SEC / 1000) << " ms" << endl;
// sort results on OpenCL
start = clock();
vex::sort(vectorCPU, vex::greater<int>());
end = clock();
cout << "vexcl CPU: " << (end - start) / (CLOCKS_PER_SEC / 1000) << " ms" << endl;
start = clock();
vex::sort(vectorGPU, vex::greater<int>());
end = clock();
cout << "vexcl GPU: " << (end - start) / (CLOCKS_PER_SEC / 1000) << " ms" << endl;
return 0;
}
which results in:
C++: 17 ms
vexcl CPU: 737 ms
vexcl GPU: 1670 ms
using an i7 3770 CPU and a (slow) HD4650 graphics card. As I'v read OpenCL should be able to perform fast sortings on large vertices. Do you have any advice how to perform a fast sort using OpenCL and vexcl?