I am implementing a c++ function to get Nth prime number using some predefined indices for time optimization purpose.
my code is :
// file prime.cpp
#include <iostream>
#include <time.h>
using namespace std;
/*
#define primeAt10000 104743
#define primeAt20000 224743
#define primeAt30000 350381
#define primeAt40000 479951
#define primeAt50000 611977
*/
int prime(int n){
int pos = 1,i = 1,temp;
if(n==0)
return 2;
/*
else if(n>50000){
i = primeAt50000;
pos = 50001;
}else if(n>40000){
i = primeAt40000;
pos = 40001;
}else if(n>30000){
i = primeAt30000;
pos = 30001;
}else if(n>20000){
i = primeAt20000;
pos = 20001;
}else if(n>10000){
i = primeAt10000;
pos = 10001;
}*/
while( i+=2 ){
temp = i/2+1;
for(int j = 3 ; j <= temp ; j+=2)
if(i%j == 0)
goto con;
if(pos++ >= n)
return i;
con :;
}
}
int main(int argc, char const *argv[]){
int index;
cin >> index;
clock_t start = clock();
cout << prime(index)<<endl;
cout << (clock()-start)/CLOCKS_PER_SEC<<"sec"<< endl;
return 0;
}
compiled with:
g++ prime.cpp -o prime.exe
I ran this code three times for inputs 9999, 19999 and 29999
1st run : 1sec 6sec 14sec
2nd run : 1sec 7sec 15sec
3rd run : 1sec 7sec 16sec
After enabling commented code again I ran three times with same inputes
1st run : 1sec 5sec 8sec
2nd run : 1sec 5sec 8sec
3rd run : 1sec 5sec 8sec
My question is :
Why this difference in taken time for each execution after second compilation while the loops are running everytime for ~1,25,000 times?
and
Why for input 19999 (~104743 looping times) it is much closer then the first 3 runs after first compilation (~224743 looping times)?
Difference in time for each 9999 interval is different because when we going towards larger numbers to check either it is prime or not it takes more time then smaller ones.
In other words directly We can say that the run-time of for-loop in prime() is increased because of larger value of variable temp.
when we checking for i = 101, the value of temp become 51 and for-loop will run approx 25 times.
while when we check for i = 10001, the value of temp become 5001 and for-loop will run for approx 2500 times.
this difference in run-time of for loop will increase your overall time complexity.
After some discussion with #JonathanLeffler I have further optimized this function to achieve fastest output for larger input values like for index 9999, 19689 and so on...
Now the complexity of my prime function is (N^2)/12 unlike before [it was (N^2)/8].
My new code is :
#include <iostream>
#include <time.h>
using namespace std;
#define primeAt10000 104743-7
#define primeAt20000 224743-7
#define primeAt30000 350381-7
#define primeAt40000 479951-7
#define primeAt50000 611977-7
bool checkPrime(int x){
int temp = x/2+1;
for(int j = 3 ; j <= temp ; j+=2)
if(x%j == 0)
return false;
return true;
}
int prime(int n){
int pos = 2,i = 0;
if(n==0)
return 2;
else if(n==1)
return 3;
else if(n>50000){
i = primeAt50000;
pos = 50000;
}else if(n>40000){
i = primeAt40000;
pos = 40000;
}else if(n>30000){
i = primeAt30000;
pos = 30000;
}else if(n>20000){
i = primeAt20000;
pos = 20000;
}else if(n>10000){
i = primeAt10000;
pos = 10000;
}
while( i+=6 ){
if(checkPrime(i-1))
if(pos++>=n)
return i-1;
if(checkPrime(i+1))
if(pos++>=n)
return i+1;
}
return 0;
}
int main()
{
int index;
cin >> index;
clock_t start = clock();
cout << prime(index)<<endl;
cout << (clock()-start)/(float)CLOCKS_PER_SEC<<"sec";
return 0;
}
Compiled with(as the advice of #NathanOliver && #JonathanLeffler) :
g++ -O3 -Wall -Werror -Wextra prime.cpp -o prime.exe
Now prime.exe taking 1.34, 4.83 and 7.20sec respectivly to inputs 9999, 19999 and 29999.
Related
I know this question is a duplicate one, but I couldn't find any other topic similar to my code.
The problem statement is as followed:
There is a CSV file with 16,000 lines. A serial version of the program is extracting those rows with a price (SalePrice is a column head in the CSV) higher than a specific value (threshold) given to the program with command-line arguments and calculating their Mean and Standard Derivation which will be used for further computations.
This larger CSV file is broken into 4 CSV files for the parallel version. Each thread is assigned to one CSV file and should do the same calculations (Calculating Mean and STD of rows with price higher than a specific value named threshold in my code).
Since the data is large enough, I don't think this is because of the multithreading overhead.
I would be thankful if someone could please help me find out what part is slowing down my parallel version?
#include <iostream>
#include <fstream>
#include <vector>
#include <math.h>
#include <iomanip>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace std;
#define COMMA ','
#define EMPTY_STR ""
#define FILENAME "dataset.csv"
#define CLASSIFIER "GrLivArea"
#define SALE_PRICE "SalePrice"
const int MAX_THREAD_NUMBERS = 20;
int NUMBER_OF_THREADS;
int threshold;
int expensive_cnt[MAX_THREAD_NUMBERS];
vector<string> lines;
string head;
double _std;
long sum[MAX_THREAD_NUMBERS];
long ps[MAX_THREAD_NUMBERS];
long sumsq[MAX_THREAD_NUMBERS];
double mean;
int total_items;
int total_expensive_cnt;
struct Item
{
int x;
bool category;
};
vector<Item> items[MAX_THREAD_NUMBERS];
int getColNum(const string& head, const string& key)
{
int cnt = 0;
string cur = EMPTY_STR;
for (int i = 0 ; i < head.size() ; i++)
{
if (head[i] == COMMA)
{
if (cur == key)
return cnt;
cnt++;
cur = EMPTY_STR;
}
else
cur += head[i];
}
if (cur == key)
return cnt;
return -1;
}
vector<int> separateByComma(string s)
{
vector<int> res;
string cur = EMPTY_STR;
for (int i = 0 ; i < s.size() ; i++)
if (s[i] == COMMA)
{
res.push_back(stoi(cur));
cur = EMPTY_STR;
}
else
cur += s[i];
res.push_back(stoi(cur));
return res;
}
void* calcSums(void* tid)
{
long thread_id = (long)tid;
string filename = "dataset_" + to_string(thread_id) + ".csv";
ifstream fin(filename);
string head;
fin >> head;
int classifierColNum = getColNum(head, CLASSIFIER);
if (classifierColNum == -1)
{
printf("NO GrLivArea FOUND IN HEAD OF CSV\n");
exit(-1);
}
int priceColNum = getColNum(head, SALE_PRICE);
if (priceColNum == -1)
{
printf("NO SalePrice FOUND IN HEAD OF CSV\n");
exit(-1);
}
string line;
while (fin >> line)
{
vector<int> cur = separateByComma(line);
bool category = (cur[priceColNum] >= threshold);
Item item{cur[classifierColNum], category};
if (category)
{
sum[thread_id] += item.x;
sumsq[thread_id] += (item.x * item.x);
expensive_cnt[thread_id]++;
}
items[thread_id].push_back(item);
}
fin.close();
pthread_exit(NULL);
}
void calcMeanSTD()
{
string line;
for (int i = 0 ; ; i++)
{
struct stat buffer;
string name = "dataset_" + to_string(i) + ".csv";
if (!(stat (name.c_str(), &buffer) == 0))
break;
NUMBER_OF_THREADS++;
}
pthread_t threads[NUMBER_OF_THREADS];
int return_code;
for (long tid = 0 ; tid < NUMBER_OF_THREADS ; tid++)
{
return_code = pthread_create(&threads[tid], NULL, calcSums, (void*)tid);
if (return_code)
{
printf("ERROR; return code from pthread_create() is %d\n", return_code);
exit(-1);
}
}
for (long tid = 0 ; tid < NUMBER_OF_THREADS ; tid++)
{
return_code = pthread_join(threads[tid], NULL);
if (return_code)
{
printf("ERROR; return code from pthread_join() is %d\n", return_code);
exit(-1);
}
}
double total_sum = 0;
double total_sum_sq = 0;
total_expensive_cnt = 0;
total_items = 0;
for (int i = 0 ; i < NUMBER_OF_THREADS ; i++)
{
total_sum += sum[i];
total_sum_sq += sumsq[i];
total_expensive_cnt += expensive_cnt[i];
total_items += items[i].size();
}
mean = total_sum / total_expensive_cnt;
_std = sqrt((total_sum_sq - ((total_sum * total_sum) / (total_expensive_cnt))) / (total_expensive_cnt));
}
int main(int argc, char *argv[])
{
threshold = atoi(argv[1]);
calcMeanSTD();
cout << mean << " " << _std << endl;
return 0;
}
Please let me know if any part is not understandable.
Here are some run-time values:
Read CSV (Serial): 0.043268s Calculations (Serial): 0.000151s
The exact time calculation isn't much easy in the multithreaded version here since the calculations and file reading are done in the same while loop which is not separable here. There also many thread switches. Anyway, their sum is about: 0.14587s
As it can be seen, the amount of time needed to read from files is almost 300 times as doing the math calculations.
Thanks to the answers in the comment, I found out what is happening:
I tried increasing the number of rows in my CSV files to see if the parallelization is working.
The run-time values for a CSV file with 1000000 rows are:
Parallel: real 0m0.558s user 0m2.173s sys 0m0.020s
Serial: real 0m1.834s user 0m1.818s sys 0m0.016s
Since I am using 4 threads, I expect 1.834 divided by 0.558 to be near to 4 which actually is 3.28 and is fair enough.
This run-time values for smaller CSV files aren't showing these results which seems to be because of the simple math computations in my code.
The bottleneck of this code is the section where I am reading from CSV files. This section seems to be serial since it is reading from a disk.
There is also a problem of False Sharing in this code
which causes cache contention due to updates of different memory locations by different threads when these locations share the same cache line mapping. There are many solutions to this problem, for example, I can introduce padding into these arrays to make sure that elements accessed by multiple threads do not share cache lines. Or, more simply, work with thread-local variables instead of arrays, and, in the end, update the array elements only once.
Hello I am trying to write a C++ multithreaded program using POSIX thread library to find the number of prime numbers between 1 and 10,000,000 (10 million) and find out how many microseconds it takes...
Creating my threads and running them works completely fine, however I feel as if there is an error found in my Prime function when determining if a number is prime or not...
I keep receiving 78496 as my output, however I desire 664579. Below is my code. Any hints or pointers would be greatly appreciated.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
#include <iostream>
#include <sys/time.h> //measure the execution time of the computations
using namespace std;
//The number of thread to be generated
#define NUMBER_OF_THREADS 4
void * Prime(void* index);
long numbers[4] = {250000, 500000, 750000, 1000000};
long start_numbers[4] = {1, 250001, 500001, 750001};
int thread_numbers[4] = {0, 1, 2, 3};
int main(){
pthread_t tid[NUMBER_OF_THREADS];
int tn;
long sum = 0;
timeval start_time, end_time;
double start_time_microseconds, end_time_microseconds;
gettimeofday(&start_time, NULL);
start_time_microseconds = start_time.tv_sec * 1000000 + start_time.tv_usec;
for(tn = 0; tn < NUMBER_OF_THREADS; tn++){
if (pthread_create(&tid[tn], NULL, Prime, (void *) &thread_numbers[tn]) == -1 ) {
perror("thread fail");
exit(-1);
}
}
long value[4];
for(int i = 0; i < NUMBER_OF_THREADS; i++){
if(pthread_join(tid[i],(void **) &value[i]) == 0){
sum = sum + value[i]; //add four sums together
}else{
perror("Thread join failed");
exit(-1);
}
}
//get the end time in microseconds
gettimeofday(&end_time, NULL);
end_time_microseconds = end_time.tv_sec * 1000000 + end_time.tv_usec;
//calculate the time passed
double time_passed = end_time_microseconds - start_time_microseconds;
cout << "Sum is: " << sum << endl;
cout << "Running time is: " << time_passed << " microseconds" << endl;
exit(0);
}
//Prime function
void* Prime(void* index){
int temp_index;
temp_index = *((int*)index);
long sum_t = 0;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
for (int j=2; j*j <= i; j++)
{
if (i % j == 0)
{
break;
}
else if (j+1 > sqrt(i)) {
sum_t++;
}
}
}
cout << "Thread " << temp_index << " terminates" << endl;
pthread_exit( (void*) sum_t);
}```
This is because, you used 10^6 instead of 10^7.
Also, added some corner cases for numbers 1, 2 and 3:
//Prime function
void* Prime(void* index){
int temp_index;
temp_index = *((int*)index);
long sum_t = 0;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
// Corner cases
if(i<=1)continue;
if (i <= 3){
sum_t++;
continue;
}
for (int j=2; j*j <= i; j++)
{
if ((i % j == 0) || (i %( j+2))==0 )
{
break;
}
else if (j+1 > sqrt(i)) {
sum_t++;
}
}
}
cout << "Thread " << temp_index << " terminates" << endl;
pthread_exit( (void*) sum_t);
}
I tested your code with correct number and got the correct number of primes as output:
Thread 0 terminates
Thread 1 terminates
Thread 2 terminates
Thread 3 terminates
Sum is: 664579
Running time is: 4.69242e+07 microseconds
Thanks to #chux - Reinstate Monica for pointing this out
Along with taking 10^7 as the numbers divided in thread instead of setting the limit as 10^6 ,a number of other small scale errors are there and a number of optimizations could be made -
First of all start numbers could be from 2 itself
long start_numbers[4] = {2, 2500001, 5000001, 7500001};
sum_t++ in your code may not work on edge cases. It is better to follow the following algorithm for calculating Prime function
bool flag = false;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
flag = false;
for (long j=2; j*j <= i; j++){
if (i % j == 0 )
{
flag = true;
break;
}
}
if(!flag)
sum_t++;
}
After these 2 operations i am getting the result as
Thread 0 terminates
Thread 1 terminates
Thread 2 terminates
Thread 3 terminates
Sum is: 664579
Running time is: 6.62618e+06 microseconds
edit:
( Note : in this case j is taken as long datatype but it could work as well with int in this 'example' since the tested compiler takes int as 32 bits long)
I wrote the following dp code for finding the prime factors of a number.
#include <bits/stdc++.h>
#define max 1000001
using namespace std;
vector <int> prime;
vector<bool> isprime(max,true);
vector<bool> visited(max,false);
vector<int> data(max,-1);
void dp(int n,int last)
{
if(n >= max || visited[n])
return;
visited[n] = true;
for(int i = last;i<prime.size();i++)
{
if(n*prime[i] >= max || data[n*prime[i]] != -1)
return;
data[n*prime[i]] = prime[i];
dp(n*prime[i],i);
}
}
int main()
{
isprime[1] = false;
data[1] = 1;
for(int i = 4;i<max;i += 2)
isprime[i] = false;
for(int i = 3; i*i< max;i += 2)
{
for(int j = i*i; j < max;j += i)
isprime[j] = false;
}
prime.push_back(2);
data[2] = 2;
for(int i =3;i<max;i += 2)
if(isprime[i])
{
prime.push_back(i);
data[i] = i;
}
for(int i = 0;i<prime.size();i++)
{
dp(prime[i],i);
}
cout<<"...1\n";
for(int i = 2;i<=8000;i++)
{
cout<<i<<" :- ";
int temp = i;
while(temp!= 1)
{
cout<<data[temp]<<" ";
temp = temp/data[temp];
}
cout<<endl;
}
return 0;
}
Here, last is the last index of prime number n.
But I am getting segmentation fault for this, when I change max to 10001, it runs perfectly. I'm not getting why is this happening since the data-structures used are 1-d vectors which can hold values up to 10^6 easily.
I checked your program out using GDB. The segfault is taking place at this line:
if(n*prime[i] >= max || data[n*prime[i]] != -1)
In your first ever call to DP in your for loop, where you call dp(2,0), the recursive calls eventually generate this call: dp(92692,2585).
92692 * 2585 = 239608820
This number is larger than a 32 bit integer can hold, so the r-value generated by the integer multiplication of those two numbers overflows and becomes negative. nprime[i] becomes negative, so your first condition of the above loop fails, and the second is checked. data[n * prime[i]] is accessed, and since n*prime[i] is negative, your program accesses invalid memory and segfaults. To fix this, simply change n to a long long in your parameter list and you should be fine.
void dp(long long n, int last)
I'm trying to understand possible optimization methods for the bubble sort algorithm. I know there are better sorting methods, but I'm just curious.
To test the efficiency I'm using std::chrono. The program sorts a 10000 number long int array 30 times and prints the average sorting time. The numbers are picked randomly(up to 10000) in every iteration. Here is the code, with no optimization:
#include <iostream>
#include <ctime>
#include <chrono>
using namespace std;
int main() {
//bubble sort
srand(time(NULL));
chrono::time_point<chrono::steady_clock> start, end;
const int n = 10000;
int i,j, last, tests = 30,arr[n];
long long total = 0;
bool out;
while (tests-->0) {
for (i = 0; i < n; i++) {
arr[i] = rand() % 1000;
}
j = n;
start = chrono::high_resolution_clock::now();
while(1){
out = 0;
for (i = 0; i < j - 1; i++) {
if (arr[i + 1] < arr[i]) {
swap(arr[i + 1], arr[i]);
out = 1;
}
}
if (!out) {
break;
}
//j--;
}
end = chrono::high_resolution_clock::now();
total += chrono::duration_cast<chrono::nanoseconds>(end - start).count();
cout << "Remaining :"<<tests << endl;
}
cout << "Average :" << total / static_cast<double>(30)/1000000000<<" seconds"; // tests(30) + nanosec -> sec
cin.sync();
cin.ignore();
return 0;
}
I get 0.17 seconds average sorting time.
If I uncomment line 47(j--;) to avoid comparing numbers already sorted I get 0.12 sorting time which is understandable.
If I remember the last position where a swap took place, I know that after that index, elements are sorted, and can thus sort up to that position in further iterations. It's better explained in the second part of this post: https://stackoverflow.com/a/16196115/1967496.
This is the code that implements the new possible optimization:
#include <iostream>
#include <ctime>
#include <chrono>
using namespace std;
int main() {
//bubble sort
srand(time(NULL));
chrono::time_point<chrono::steady_clock> start, end;
const int n = 10000;
int i,j, last, tests = 30,arr[n];
long long total = 0;
bool out;
while (tests-->0) {
for (i = 0; i < n; i++) {
arr[i] = rand() % 1000;
}
j = n;
start = chrono::high_resolution_clock::now();
while(1){
out = 0;
for (i = 0; i < j - 1; i++) {
if (arr[i + 1] < arr[i]) {
swap(arr[i + 1], arr[i]);
out = 1;
last = i;
}
}
if (!out) {
break;
}
j = last + 1;
}
end = chrono::high_resolution_clock::now();
total += chrono::duration_cast<chrono::nanoseconds>(end - start).count();
cout << "Remaining :"<<tests << endl;
}
cout << "Average :" << total / static_cast<double>(30)/1000000000<<" seconds"; // tests(30) + nanosec -> sec
cin.sync();
cin.ignore();
return 0;
}
Note lines 40 and 48. And here comes the problem: The average time is now again around 0.17 seconds.
Is there a problem in my code, or am I missing something ?
Update:
I did sorting with 10 times more numbers and get now following results:
No optimization: 19.3 seconds
First optimization(j--): 14.5 seconds
Second (supposed) optimization(j=last+1): 17.4 seconds;
From my understanding, the second method should be in any case better than the first, but the numbers tell something else.
Well... The problem is that there might not be the right or wrong answer to this question.
First of all, when you're comparing only 10000 elements, you cannot really call it an effeciency test. Try comparing much higher number of elements - maybe 500000 (although you will probably need to alocate an array dynamicaly for that).
Second of all, it might be the compiler. Compilers often try to optimize things so that the program execution will run smoother and faster.
The problem from uva OJ
my solution with recursion
#include <cstdio>
using namespace std;
#define garbaze 0
//number of ways changes can be made
int coins[] = {garbaze,50,25,10,5,1}; //order does not matter//as in the //count_ways... function we are returning
//0 if which_coin_now is <= 0 so it
//does n't matter what we have in the index 0 [garbaze] .. but we must put //something there to implement the
//code using the pseudo code or recursive relation
typedef unsigned long long ull; //simple typedef
ull dp[7490][6]; //2d table
//recursive approach
ull count_ways_of_changes(int money_now,int which_coin_now)
{
if(money_now == 0)
return 1;
if(money_now < 0 || which_coin_now <=0 )
return 0;
if(dp[money_now][which_coin_now] == -1)
dp[money_now][which_coin_now] = count_ways_of_changes(money_now,which_coin_now-1) //excluding current coin
+ count_ways_of_changes(money_now - coins[which_coin_now],which_coin_now) ; //including current coin
return dp[money_now][which_coin_now] ;
}
int main()
{
for(int loop = 0; loop< 7490 ;loop++)
for(int sec_loop = 0;sec_loop<6;sec_loop++)
dp[loop][sec_loop] = -1; //table initialization
int N = 0;
while(scanf("%d",&N)==1)
{
printf("%llu\n",count_ways_of_changes(N,5)); //llu for unsigned long long
}
return 0;
}
This one got accepted (and took 0.024 s)
And my iterative approach :
#include <cstdio>
//#include <iostream>
//using namespace std;
typedef unsigned long long ull;
ull dp[7490][6];
#define garbaze 0
int value_coins[] = {garbaze,5,1,10,25,50} ;
inline ull count_ways_change(int money,int num_of_coins)
{
for(int sum_money_now = 0; sum_money_now <= money ;sum_money_now++)
for(int recent_coin_index = 0 ; recent_coin_index <= num_of_coins ; recent_coin_index++)
//common mistakes : starting the second index at num_of_coins and decrementing till 0 ...see we are pre calculating
//we have to start bottom to up....if we start at dp[0][5] .....to dp[1][5] but to know that i need to know
//dp[1][4] and dp[..][5] before hand ..but we have not calculated dp[1][4] yet...in this case i don't go to infinite
//loop or anything as the loop is well defined but i get stupid garbaze answer
{
if(sum_money_now == 0)
dp[sum_money_now][recent_coin_index] = 1;
else if(recent_coin_index == 0)
dp[sum_money_now][recent_coin_index] = 0;
else if(sum_money_now < value_coins[recent_coin_index] && recent_coin_index != 0)
dp[sum_money_now][recent_coin_index] = dp[sum_money_now][recent_coin_index-1] ;
else
dp[sum_money_now][recent_coin_index] = dp[sum_money_now][recent_coin_index-1] + dp[sum_money_now - value_coins[recent_coin_index] ][recent_coin_index] ;
// cout<<dp[sum_money_now][recent_coin_index]<<endl;
}
return dp[money][num_of_coins] ;
}
int main()
{/*
for(int loop = 0; loop< 7490 ;loop++)
for(int sec_loop = 0;sec_loop<6;sec_loop++)
dp[loop][sec_loop] = -1; //table initialization
*/ //In the iterative version do not need to initialize the table as we are working bottom - up
int N = 0;
while(scanf("%d",&N)==1)
{
printf("%llu\n",count_ways_change(N,5)); //llu for unsigned long long
}
return 0;
}
But i got time limit exceeded for this one.It gives correct output but i don't see a reason why this one has to be so slow?
The difference is your recursive solution remember partial solutions from previous tasks (because the DP table is global and does not get removed between different inputs), while the iterative doesn't - for each new input, it recalculates the DP matrix from scratch.
It can be solved by remembering which portion of the DP table was already calculated and avoid recalculating it, rather than recalculate it from scratch for every query.