Today I ran into a problem:
I have to read data from a file that contains a lot of test cases; it looks like
N
N lines follow
...
...
So I used while(scanf("%d", &n) && n != -1), but it took me more than 5 s to read all the data. However, when I changed it to while(scanf("%d", &n) && n > -1), it took only 800 ms to read all the data. So I suppose there is a difference in speed between the comparison operators in C++; can anyone give me the order?
PS: my compiler is GCC 5.1.0
OK, let me show more details of this problem.
The problem is here: http://acm.hdu.edu.cn/showproblem.php?pid=1171
Code with 'not equal' is here: https://github.com/kimixuchen/codesnap/blob/master/greater
Code with 'greater' is here: https://github.com/kimixuchen/codesnap/blob/master/not_equal
The question is about comparison, not about reading files or badly formulated conditions, so let's test comparison only. Update: tested with the /O2 optimization option.
#include <ctime>
#include <cstdio>    // freopen
#include <cstdlib>
#include <iostream>

int main()
{
    const int testCases = 10000000;
    const int iterations = 100;
    srand(time(NULL));
    int *A = new int[testCases];
    bool *B = new bool[testCases];
    freopen("output.txt", "w", stdout);
    for (int i = 0; i < testCases; i++)
    {
        A[i] = rand() % 100;
    }
    clock_t begin = clock();
    for (int j = 0; j < iterations; j++)
        for (int i = 0; i < testCases; i++)
        {
            B[i] = A[i] != -1;
        }
    clock_t end = clock();
    double elapsed_secs = end - begin;   // elapsed clock ticks
    std::cout << "Elapsed time using != - " << elapsed_secs << std::endl;

    // Getting new random numbers for a clean test
    for (int i = 0; i < testCases; i++)
    {
        A[i] = rand() % 100;
    }
    begin = clock();
    for (int j = 0; j < iterations; j++)
        for (int i = 0; i < testCases; i++)
        {
            B[i] = A[i] > -1;
        }
    end = clock();
    elapsed_secs = end - begin;          // elapsed clock ticks
    std::cout << "Elapsed time using > - " << elapsed_secs << std::endl;
    return 0;
}
Results for 5 tests (in ticks):
'!=': 1005 994 1015 1009 1019
'>': 1006 1004 1004 1005 1035
Conclusion - there is no significant difference in a program optimized for speed.
I am trying to optimize a piece of code without resorting to parallelizing / SSE.
The current critical code runs in about 20 ms on my PC with O2. That seems like quite a bit, even for ~17 million iterations.
The particular piece that is too slow is as follows:
for (int d = 0; d < numDims; d++)
{
    for (int i = 0; i < numNodes; i++)
    {
        bins[d][(int) (floodVals[d][i] * binSteps)]++;
    }
}
Update: Changing to iterators reduced the run-time to 17ms.
for (int d = 0; d < numDims; d++)
{
    std::vector<float>::iterator floodIt;
    for (floodIt = floodVals[d].begin(); floodIt < floodVals[d].end(); floodIt++)
    {
        bins[d][(int) (*floodIt * binSteps)]++;
    }
}
The full dummy code is here:
#include <vector>
#include <random>
#include <iostream>
#include <chrono>

int main()
{
    // Initialize random normalized input [0, 1)
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dist(0, 0.99999);

    // Initialize dimensions
    const int numDims = 130;
    const int numNodes = 130000;
    const int binSteps = 30;

    // Make dummy data
    std::vector<std::vector<float>> floodVals(numDims, std::vector<float>(numNodes));
    for (int d = 0; d < numDims; d++)
    {
        for (int i = 0; i < numNodes; i++)
        {
            floodVals[d][i] = dist(gen);
        }
    }

    // Initialize binning
    std::vector<std::vector<int>> bins(numDims, std::vector<int>(binSteps, 0));

    // Time critical section of code
    auto start = std::chrono::high_resolution_clock::now();
    for (int d = 0; d < numDims; d++)
    {
        for (int i = 0; i < numNodes; i++)
        {
            bins[d][(int) (floodVals[d][i] * binSteps)]++;
        }
    }
    auto finish = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> elapsed = finish - start;
    std::cout << "Elapsed: " << elapsed.count() * 1000 << " ms" << std::endl;
    return 0;
}
Try eliminating the indexing on d in the inner loop, since d is constant there anyway. This was roughly 2x faster for me.
for (int d = 0; d < numDims; d++)
{
    int* const bins_d = &bins[d][0];
    float* const floodVals_d = &floodVals[d][0];
    for (int i = 0; i < numNodes; i++)
    {
        bins_d[(int) (floodVals_d[i] * binSteps)]++;
    }
}
I am basically writing code to count how many pair sums are even (among all pairs from 1 to 100000). I wrote the code both with pthreads and without pthreads, but the version with pthreads is taking more time than the serial one. Here is my serial code:
#include <bits/stdc++.h>
using namespace std;

int main()
{
    long long sum = 0, count = 0, n = 100000;
    auto start = chrono::high_resolution_clock::now();
    for (int i = 1; i <= n; i++)
        for (int j = i - 1; j >= 0; j--)
        {
            sum = i + j;
            if (sum % 2 == 0)
                count++;
        }
    cout << "count is " << count << endl;
    auto end = chrono::high_resolution_clock::now();
    double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
    time_taken *= 1e-9;
    cout << "Time taken by program is : " << fixed << setprecision(9) << time_taken << " secs" << endl;
    return 0;
}
And here is my parallel code:
#include <bits/stdc++.h>
#include <pthread.h>
using namespace std;

#define MAX_THREAD 3

long long cnt[5] = {0};
long long n = 100000;
int work_per_thread;
int start[] = {1, 60001, 83001, 100001};

void *count_array(void *arg)
{
    int t = *((int *)arg);
    long long sum = 0;
    for (int i = start[t]; i < start[t + 1]; i++)
        for (int j = i - 1; j >= 0; j--)
        {
            sum = i + j;
            if (sum % 2 == 0)
                cnt[t]++;
        }
    cout << "thread" << t << " finished work " << cnt[t] << endl;
    return NULL;
}

int main()
{
    pthread_t threads[MAX_THREAD];
    int arr[] = {0, 1, 2};
    long long total_count = 0;
    work_per_thread = n / MAX_THREAD;
    auto start = chrono::high_resolution_clock::now();
    for (int i = 0; i < MAX_THREAD; i++)
        pthread_create(&threads[i], NULL, count_array, &arr[i]);
    for (int i = 0; i < MAX_THREAD; i++)
        pthread_join(threads[i], NULL);
    for (int i = 0; i < MAX_THREAD; i++)
        total_count += cnt[i];
    cout << "count is " << total_count << endl;
    auto end = chrono::high_resolution_clock::now();
    double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
    time_taken *= 1e-9;
    cout << "Time taken by program is : " << fixed << setprecision(9) << time_taken << " secs" << endl;
    return 0;
}
In the parallel code I am creating three threads: the 1st thread does its computation for 1 to 60000, the 2nd thread for 60001 to 83000, and so on. I have chosen these numbers so that each thread gets to do approximately the same number of computations. The parallel execution takes 10.3 secs whereas the serial one takes 7.7 secs. I have 6 cores and 2 threads per core. I also used the htop command to check whether the required number of threads are running, and that seems to be working fine. I don't understand where the problem is.
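(A rough sanity check on the split: a thread covering the range [a, b) does about (b^2 - a^2)/2 inner iterations, so the three ranges above come to roughly 1.8, 1.6 and 1.6 billion iterations each, out of 100000^2/2 = 5 billion in total.)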
All the threads in the parallel version compete for cnt[]: each thread updates its own cnt[t] in the inner loop, but the entries sit next to each other on the same cache line, so the constant writes cause false sharing.
Use a local counter inside the loop and copy the result into cnt[t] after the loop finishes.
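Something along these lines (a sketch of the same count_array with only the counter handling changed):

void *count_array(void *arg)
{
    int t = *((int *)arg);
    long long sum = 0;
    long long local_count = 0;            // private to this thread; no shared cache line is touched in the loop
    for (int i = start[t]; i < start[t + 1]; i++)
        for (int j = i - 1; j >= 0; j--)
        {
            sum = i + j;
            if (sum % 2 == 0)
                local_count++;
        }
    cnt[t] = local_count;                 // single write to the shared array, after the loop
    cout << "thread" << t << " finished work " << cnt[t] << endl;
    return NULL;
}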
I have several matrices that I want to multiply in C++ while allowing vectorization. However, the following code results in a large execution time, ~858146125 ns. How do I modify the code so that the matrix multiplication is vectorized and reaches around 100 ns of execution time?
I am using the -O3 flag.
const int ROWS = 1000;
const int COLS = 1000;
const int ROWS1 = 1000;
const int COLS1 = 1000;
const int l = 1000;
double random_matrix[ROWS][COLS];
double random_matrix1[ROWS1][COLS1];
double mult[l][l];
int i;
int j;

/* generate number: */
for (i = 0; i < ROWS; i++) {
    for (j = 0; j < COLS; j++)
        random_matrix[i][j] = i + j;
}
for (i = 0; i < ROWS1; i++) {
    for (j = 0; j < COLS1; j++)
        random_matrix1[i][j] = i + j;
}

auto start = std::chrono::steady_clock::now();
for (size_t row = 0; row < ROWS; ++row) {
    for (size_t tmp = 0; tmp < COLS1; ++tmp) {
        mult[row][tmp] = random_matrix[row][0] * random_matrix1[0][tmp];
        for (size_t col = 1; col < COLS; ++col) {
            mult[row][tmp] += random_matrix[row][col] * random_matrix1[col][tmp];
        }
    }
}
auto end = std::chrono::steady_clock::now();

std::cout << "Elapsed time in nanoseconds : "
          << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()
          << " ns" << std::endl;
std::cout << "\n";

for (i = 0; i < ROWS; i++)
{
    for (j = 0; j < COLS1; j++)
        std::cout << mult[i][j] << std::endl; // display table
    std::cout << "\n";
}
I'm afraid you'll never get to 100 ns total execution time with these matrix sizes, with vectorization or without. Matrix multiplication of two matrices 1000 x 1000 elements takes on the order of 1000 ^ 3 = 1,000,000,000 multiply-adds. That is one billion operations.
Secondly, if performance matters so much to you, you should NOT write your own code for these low-level mathematical primitives. There are optimized C++ libraries that will perform these operations for you, such as Eigen or BLAS (Intel MKL is a package that implements BLAS).
By using one of these packages, you not only get much better performance, but also avoid the potential pitfalls or bugs that you would likely have otherwise.
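For illustration, here is a minimal sketch using Eigen (assuming Eigen's headers are available; the matrices are filled with the same i + j pattern as in the question). Built with -O3, Eigen dispatches its own vectorized, cache-blocked product kernel:

#include <Eigen/Dense>
#include <chrono>
#include <iostream>

int main()
{
    const int N = 1000;
    Eigen::MatrixXd a(N, N), b(N, N);
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            a(i, j) = b(i, j) = i + j;

    auto start = std::chrono::steady_clock::now();
    Eigen::MatrixXd c = a * b;           // blocked, SIMD matrix product inside Eigen
    auto end = std::chrono::steady_clock::now();

    std::cout << "Elapsed time in milliseconds : "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
              << " ms, c(0,0) = " << c(0, 0) << std::endl;  // use the result so the product is not optimized away
    return 0;
}

Even so, a 1000 x 1000 double product typically lands in the tens of milliseconds on a desktop CPU, which is consistent with the operation count above and still nowhere near 100 ns.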
I want to measure the execution time of a program in ns in C++. For that purpose I am using the chrono library.
int main() {
    const int ROWS = 200;
    const int COLS = 200;
    double input[ROWS][COLS];
    int i, j;

    auto start = std::chrono::steady_clock::now();
    for (i = 0; i < ROWS; i++) {
        for (j = 0; j < COLS; j++)
            input[i][j] = i + j;
    }
    auto end = std::chrono::steady_clock::now();

    auto res = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
    std::cout << "Elapsed time in nanoseconds : "
              << res
              << " ns" << std::endl;
    return 0;
}
I measured the time and it executed in 90 ns. However, when I add printing afterwards, the time changes.
int main() {
    const int ROWS = 200;
    const int COLS = 200;
    double input[ROWS][COLS];
    int i, j;

    auto start = std::chrono::steady_clock::now();
    for (i = 0; i < ROWS; i++) {
        for (j = 0; j < COLS; j++)
            input[i][j] = i + j;
    }
    auto end = std::chrono::steady_clock::now();

    auto res = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
    std::cout << "Elapsed time in nanoseconds : "
              << res
              << " ns" << std::endl;

    for (i = 0; i < ROWS; i++) {
        for (j = 0; j < COLS; j++)
            std::cout << input[i][j];
    }
    return 0;
}
The time changes to 89700 ns. What could be the problem? I only want to measure the execution time of the for loop.
I have a short piece of code which does the following:
Accessing an array of structs (vec1) sequentially
Accessing an int array (hashT) randomly, according to the value read from the struct array
Modifying the values of the struct array using the value read from the int array.
I ran the program on 8 physical CPUs on the same NUMA node. But with a hashT of size 5M (i.e. there are 5M ints in it) I only got a 7.2x speedup. I changed the size of hashT and saw that when it's small I get a 7.9x speedup, but as it grows larger, at some point (when there are 3M ints in the array) the speedup starts to drop. I have 200 GB of RAM, so memory should not be a problem. I also measured the memory bandwidth and I'm nowhere near half of the maximum available bandwidth. So I thought it might be due to cache contention. I have a 20 MB L3 cache which is shared by all threads. But I measured the L2 and L3 cache misses and found that they are almost the same no matter how many threads I use (it is true that increasing the size of hashT causes more cache misses, though). So I don't see any reason why the speedup drops. Can anybody give me an idea?
#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>
#include <stdio.h>
#include <cstdlib>   // rand, atoi
using namespace std;

typedef std::chrono::milliseconds ms;

struct Element{
    int a, b, c;
    Element() : a(0), b(1), c(2) {}
    Element(int aa, int bb, int cc) : a(aa), b(bb), c(cc) {}
};

int main(int argc, char *argv[]){
    int HASHT_SIZE = atoi(argv[1]);
    int RID_SIZE = 5000000;
    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    vector<vector<Element> > vec1(40);
    for(size_t i = 0; i < vec1.size(); i++){
        vec1[i].reserve(RID_SIZE);
        for(size_t j = 0; j < vec1[i].capacity(); j++){
            vec1[i].push_back(Element(rand() % HASHT_SIZE, 0, 0));
        }
    }

    // initialize hashT
    vector<int> hashT;
    hashT.reserve(HASHT_SIZE);
    for(int i = 0; i < HASHT_SIZE; i++){
        hashT.push_back(rand() % 2);
    }

    start = std::chrono::high_resolution_clock::now();
    // test program
    #pragma omp parallel for schedule(dynamic)
    for(int j = 0; j < 40; j++)
    {
        for(size_t i = 0; i < RID_SIZE; i++){
            int nid = hashT[vec1[j][i].a];
            for(int k = 0; k < 10; k++){
                vec1[j][i].b += nid / (k+1);
                vec1[j][i].c += nid / (k+1);
            }
        }
    }
    end = std::chrono::high_resolution_clock::now();

    for(int i = 0; i < 40; i++){
        for(int j = 0; j < 10; j++){
            cout << vec1[i][j].a << vec1[i][j].b << vec1[i][j].c;
        }
    }
    cout << endl;
    cout << "Time used: " << std::chrono::duration_cast<ms>(end - start).count() << endl;
    return 0;
}