CPU TIME OF THREAD - fortran

How can I measure the CPU time used by each individual thread? CPU_TIME does not work in this case because, if the process is multithreaded, the CPU time it reports is the sum over all threads.
Pseudocode example:
PROGRAM MAIN
implicit none
REAL Times_thread1_Started,Times_thread2_Started,....
REAL Times_thread1_finiched
!$OMP PARALLEL
!$OMP DO !for each thread do :
call CPU_TIME_thread1(Times_thread1_Started)
call CPU_TIME_thread2(Times_thread2_Started)
..........
..........
!$OMP END DO
......................
......................
processing multithread
............
............
!$OMP PARALLEL
!$OMP DO !for each thread do :
call CPU_TIME_thread1(Times_thread1_finiched)
write(*,*) 'Thread1 times:',Times_thread1_finiched-Times_thread1_Started
call CPU_TIMEE_thread2(Times_thread2)
write(*,*) 'Thread1 times:',Times_thread1_finiched-Times_thread1_Started
..........
..........
!$OMP END DO
!$OMP END PARALLEL
END

in c++:
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <omp.h>
//---------------------------------------------------------
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
extern void UseTiming1();
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
/* Per-thread timing results: Thread[i] is the value recorded for thread
 * index i. The struct has no constructor or destructor of its own, so the
 * array it points at is allocated and released by whoever fills it in. */
struct Thread_time
{
    double *Thread;
};
// Measures user-mode CPU time per OpenMP thread between StartTiming() and
// StopTiming().
//
// NOTE(review): the class owns two raw arrays but declares no copy
// constructor or assignment operator, so copying a Timing object would make
// both copies delete the same arrays.
class Timing {
public:
    Timing();
    ~Timing();

    // Records the current per-thread user time as the starting point.
    void StartTiming();
    // Replaces the stored starting values with the elapsed tick counts.
    void StopTiming();

    // Converts the stored tick counts to seconds and returns them.
    // The values come from FILETIME, which counts in 100 ns units, hence the
    // division by 10,000,000. The returned struct points at a buffer owned by
    // this object; it is overwritten by the next call and freed with it.
    Thread_time GetUserSeconds() const {
        int idx = 0;
        while (idx < nthreads) {
            const double ticks = double(m_userTime[idx]);
            time.Thread[idx] = ticks / 10000000.0;
            ++idx;
        }
        return time;
    }

private:
    // Returns a newly allocated array with one tick count per thread.
    __int64* GetUserTime() const;

    __int64* m_userTime;   // start ticks, or elapsed ticks after StopTiming()
    Thread_time time;      // scratch buffer handed out by GetUserSeconds()
    int nthreads;          // number of entries in both arrays
};
// Sizes the per-thread arrays to the team size of a default parallel region.
//
// Only one thread stores the team size: in the previous form every thread of
// the team wrote the shared member at the same time, which is a data race
// even though all of them wrote the same value. Both arrays are now
// zero-initialised so that GetUserSeconds() called before StartTiming()
// reads defined values.
Timing::Timing(){
    #pragma omp parallel
    {
        #pragma omp single
        nthreads = omp_get_num_threads();
    }
    printf("numbzer thread = %d\n", nthreads);
    m_userTime = new __int64[nthreads]();
    time.Thread = new double[nthreads]();
}
// Frees both per-thread arrays owned by the object. The two releases are
// independent of each other.
Timing::~Timing()
{
    delete[] time.Thread;
    delete[] m_userTime;
}
// Takes a snapshot of the user-mode CPU time of every OpenMP thread.
//
// Returns a new[]-allocated array of nthreads entries, in FILETIME ticks
// (100 ns units). The caller owns the array and must delete[] it.
//
// Each thread stores its own time at the index given by
// omp_get_thread_num(), instead of relying on loop iteration i happening to
// be scheduled on thread i. Entries stay 0 when GetThreadTimes() fails or
// when the runtime provides fewer threads than requested.
__int64* Timing::GetUserTime() const {
    __int64 *snapshot = new __int64[nthreads]();

    #pragma omp parallel num_threads(nthreads)
    {
        const int tid = omp_get_thread_num();
        FILETIME creationTime;
        FILETIME exitTime;
        FILETIME kernelTime;
        FILETIME userTime;

        if (tid < nthreads &&
            GetThreadTimes(GetCurrentThread(),
                           &creationTime, &exitTime,
                           &kernelTime, &userTime)) {
            // Combine the two 32-bit halves into one 64-bit tick count.
            ULARGE_INTEGER ticks;
            ticks.LowPart = userTime.dwLowDateTime;
            ticks.HighPart = userTime.dwHighDateTime;
            snapshot[tid] = (__int64) ticks.QuadPart;
        }
    }
    return snapshot;
}
void Timing::StartTiming() {
m_userTime = GetUserTime();
}
void Timing::StopTiming() {
//for (int i = 0; i < Number_Thread; i++)
__int64 *curUserTime;
curUserTime = GetUserTime();
for (int i = 0; i < nthreads; i++){
m_userTime[i] = curUserTime[i] - m_userTime[i];
}
}
//---------------------------------------------------------
// CPU-bound dummy workload used to have something to time.
//
// For each of one million rounds it fills a 999-character string and counts
// how many of the letters 'a'..'z' occur in it, then prints the total.
// The generated bytes are ((round + pos) % 254) + 1, i.e. never 0, so the
// only terminator is the one written explicitly at the end.
// The loop runs serially; no OpenMP directive is active here.
void Calc()
{
    enum { kRounds = 1000000, kTextLen = 999 };

    unsigned hits = 0;
    char text[kTextLen + 1];

    for (int round = 0; round < kRounds; ++round) {
        for (int pos = 0; pos < kTextLen; ++pos) {
            text[pos] = (char)(((round + pos) % 254) + 1);
        }
        text[kTextLen] = 0;

        for (char letter = 'a'; letter <= 'z'; ++letter) {
            if (strchr(text, letter)) {
                ++hits;
            }
        }
    }
    printf("sum = %u\n", hits);
}
void UseTiming1()
{
Timing t;
t.StartTiming();
Calc();
t.StopTiming();
for (int i = 0; i < 2; i++)
printf("Thread %d Timing: %.3G seconds.\n", i,t.GetUserSeconds().Thread[i]);
}

Related

OpenMP lock seems not to work when doing summation

I am new to OpenMP and multi-threading. I need to do some summation work, and I know that when writing to a shared variable I need to use a lock such as omp_lock_t. But when I do so, the result is still wrong.
The code is:
#include <omp.h>
#include <cstdio>
// One unit of simulated work: increments the integer that `data` points at
// while holding this object's own OpenMP lock.
//
// NOTE(review): the lock belongs to the simu object, not to the integer.
// Two simu objects whose `data` point at the same integer therefore lock two
// different locks and do not exclude each other.
struct simu
{
    omp_lock_t lock;   // initialised in the constructor, destroyed in the destructor
    int *data;         // target of the increment; set by the caller before calcluate()

    simu() : data(nullptr)
    {
        omp_init_lock(&lock);
    }

    ~simu()
    {
        omp_destroy_lock(&lock);
    }

    // Adds 1 to *data between acquiring and releasing `lock`.
    void calcluate()
    {
        omp_set_lock(&lock);
        *data += 1;
        omp_unset_lock(&lock);
    }
};
int main()
{
printf("thread_num = %d\n", omp_get_num_procs());
const int size = 2000;
int a = 1;
int b = 2;
simu s[size];
simu *ps[size];
for (int i = 0; i < size; ++i)
{
s[i].data = (0 == i % 2) ? &a : &b;
ps[i] = &s[i];
}
for (int k = 0; k < size; ++k)
{
ps[k]->calcluate();
}
printf("a = %d, b = %d\n", a, b);
a = 1;
b = 2;
#pragma omp parallel for default(shared) num_threads(4)
for (int k = 0; k < size; ++k)
{
ps[k]->calcluate();
}
printf("a = %d, b = %d\n", a, b);
return 0;
}
And the result is
thread_num = 8
a = 1001, b = 1002
a = 676, b = 679
I run this code on Win10. Can anyone explain why the result is wrong?
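A lock protects the actual data item from simultaneous writes. Your lock lives in the object that points at the item, so two objects pointing at the same item each take their own lock and never exclude one another, which makes the lock pointless. You need to let your data pointer refer to an object that contains the lock alongside the value.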

Declaring array as a shared variable in pragma parallel directive and stabilizing the code

I have been trying to parallelize computing the sum value of series using certain number of terms to the processors using block allocation.
In this program, I am generating arithmetic series and want to pass array as a shared variable in the pragma and trying to restructure the pragma parallel directive.
I am new to OpenMP in C. Kindly help me understand how to pass the array as a shared variable and how to make the code work. The code is attached below.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
/*
 * Attempt at summing an arithmetic series with a block decomposition across
 * OpenMP threads. Command line: argv[1] = thread count, argv[2] = first
 * term, argv[3] = common difference, argv[4] = number of terms.
 *
 * NOTE(review): this version does not compile; the notes below mark each
 * problem where it occurs.
 */
int main (int argc, char *argv[])
{
int rank, comm_sz;
int number, i, first, difference, global_sum1, global_sum, nprocs, step, local_sum1, local_n;
int* a;
int BLOCK_LOW, BLOCK_HIGH;
double t0, t1;
/* argv[1..4] are read without checking argc first. */
comm_sz = atoi(argv[1]);
first = atoi(argv[2]);
difference = atoi(argv[3]);
number = atoi(argv[4]);
omp_set_num_threads (comm_sz);
/* Called outside any parallel region, so this is the id of the initial
 * thread (0), not a per-thread rank. */
rank = omp_get_thread_num();
/* `n` is not declared anywhere in this function; the term count is held in
 * `number`. */
a = (int*) malloc (n*sizeof(int));
printf("comm_sz=%d, first=%d, difference=%d, number of terms=%d\n",comm_sz, first, difference, number);
for(i=1; i <= number; i++){
a[i-1] = first + (i-1)*difference;
/* Prints a[i], the element after the one just written; it has not been
 * assigned yet and for i == number it lies past the last term. */
printf("a[%d]=%d\n",i-1,a[i]);
}
for(i=0; i < number; i++){
printf("a[%d]=%d\n",i,a[i]);}
t0 = omp_get_wtime();
/* omp_set_num_threads is a runtime library function, not a clause that can
 * follow `#pragma omp parallel`. */
#pragma omp parallel omp_set_num_threads(comm_sz, number, comm_sz, first, difference, global_sum1)
{
/* `rank` was computed once before the region, so every thread would derive
 * the same block bounds from it. */
BLOCK_LOW = (rank * number)/comm_sz;
BLOCK_HIGH = ((rank+1) * number)/comm_sz;
/* There is no OpenMP `while` construct. */
#pragma omp parallel while private(i, local_sum1)
//int local_sum1 = 0;
i=BLOCK_LOW;
while( i < BLOCK_HIGH )
{
printf("%d, %d\n",BLOCK_LOW,BLOCK_HIGH);
/* local_sum1 is accumulated without ever being initialised. */
local_sum1 = local_sum1 + a[i];
i++;
}
//global_sum1 = global_sum1 + local_sum1;
/* Again not a valid directive, and `sum1` is not a declared variable. */
#pragma omp while reduction(+:sum1)
i=0;
/* A for statement needs three clauses; `for (i < comm_sz)` is not valid C.
 * global_sum1 is also never initialised. */
for (i < comm_sz) {
global_sum1 = global_sum1 + local_sum1;
i++;
}
}
/* `n` and `sum` are both undeclared here. */
step = 2*first + (n-1)*difference;
sum = 0.5*n*step;
/* global_sum is printed but never assigned anywhere above. */
printf("sum is %d\n", global_sum );
t1 = omp_get_wtime();
/* global_sum1 is an int but is printed with a floating-point conversion. */
printf("Estimate of pi: %7.5f\n", global_sum1);
printf("Time: %7.2f\n", t1-t0);
}
There are several mistakes in your code. I've tried to infer what you would like to do. So, I have rewritten your code according to my understanding.
Here is my suggestion:
/*
 * Builds an arithmetic series and sums it with an OpenMP reduction, then
 * prints the closed-form value for comparison.
 *
 * Command line: argv[1] = thread count, argv[2] = first term,
 *               argv[3] = common difference, argv[4] = number of terms.
 *
 * Returns 0 on success, EXIT_FAILURE on bad arguments or allocation failure.
 */
int main (int argc, char *argv[])
{
    /* All four arguments are required; reading argv[] beyond argc is invalid. */
    if (argc < 5) {
        fprintf(stderr, "usage: %s <threads> <first> <difference> <number_of_terms>\n",
                argc > 0 ? argv[0] : "series");
        return EXIT_FAILURE;
    }

    const int comm_sz = atoi(argv[1]);
    const int first = atoi(argv[2]);
    const int difference = atoi(argv[3]);
    const int number = atoi(argv[4]);
    if (comm_sz < 1 || number < 1) {
        fprintf(stderr, "thread count and number of terms must both be positive\n");
        return EXIT_FAILURE;
    }
    omp_set_num_threads(comm_sz);

    int *a = malloc((size_t)number * sizeof *a);
    if (a == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    printf("comm_sz=%d, first=%d, difference=%d, number of terms=%d\n",comm_sz, first, difference, number);
    for (int i = 0; i < number; i++) {
        a[i] = first + i * difference;
        printf("a[%d]=%d\n", i, a[i]);
    }

    /* The array is shared by default; the loop index is private because it
     * is the loop variable, and each thread's partial sum is combined by
     * the reduction clause. */
    const double t0 = omp_get_wtime();
    int global_sum = 0;
    #pragma omp parallel for reduction(+:global_sum)
    for (int i = 0; i < number; i++) {
        global_sum += a[i];
    }
    const double t1 = omp_get_wtime();

    /* Closed form n/2 * (2*first + (n-1)*difference) as a cross-check. */
    const int step = 2 * first + (number - 1) * difference;
    const double sum = 0.5 * number * step;

    printf("sum is %d\n", global_sum);
    /* This value is the closed-form series sum, so it is labelled as such. */
    printf("Closed-form sum: %7.5f\n", sum);
    printf("Time: %7.2f\n", t1 - t0);

    free(a);
    return 0;
}

Implementation of Euler Totient function in OpenMP

I am new to OpenMP and I am trying to implement Euler Totient function in OpenMP.
This is my first attempt to code in OpenMP. My goal is to implement it in parallel (fastest, if possible) and then in sequential. After that I calculate the speedup and efficiency.
Below is my code:
/*
============================================================================
Name : 55390_ass3.cpp
Author : Kamil Kamili
Version :
Copyright : Your copyright notice
Description : Euler’s Totient function φ(n) in OpenMP
============================================================================
*/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
// Returns Euler's totient phi(number): the count of integers in [1, number]
// that are coprime to number. Values below 2 are returned unchanged.
//
// Uses the product formula phi(n) = n * prod(1 - 1/p) over the distinct
// prime factors p of n. The result must be reduced once for EVERY prime
// factor found; the earlier version only stripped the factor from `number`
// and skipped the reduction, so e.g. phi(4) came out as 4 instead of 2.
int eulerTotient(int number)
{
    int result = number;
    // p <= number / p is p * p <= number without the risk of int overflow.
    for (int p = 2; p <= number / p; ++p)
    {
        if (number % p != 0)
        {
            continue;
        }
        while (number % p == 0)
        {
            number /= p;
        }
        result -= result / p;
    }
    // Whatever is left above 1 is a single prime factor larger than the
    // square root of the original value.
    if (number > 1)
    {
        result -= result / number;
    }
    return result;
}
// Prints phi(i) for i in [2, 10000) once with an OpenMP parallel loop and
// once serially, timing both with omp_get_wtime(), and is meant to report
// the two times and their ratio.
//
// NOTE(review): the three final printf calls are wrong as written; see the
// notes next to them.
int main (int argc, char *argv[])
{
// Both values are unused. omp_get_num_threads() is called outside a
// parallel region here, where the team consists of one thread.
int nTheads = omp_get_num_threads();
int nMaxThreads = omp_get_max_threads();
printf("Parallel Execution: \n");
double startTimeForParallel = omp_get_wtime();
// The timed region includes the printf calls made by every iteration.
#pragma omp parallel for
for(int i = 2; i < 10000; i++)
{
printf("φ(%d) = %d\n", i, eulerTotient(i));
}
double parallelTime = omp_get_wtime() - startTimeForParallel;
printf("Sequential Execution: \n");
double startTimeForSequential = omp_get_wtime();
for(int i = 2; i < 10000; i++)
{
printf("φ(%d) = %d\n", i, eulerTotient(i));
}
double sequentialTime = omp_get_wtime() - startTimeForSequential;
double speedUp = sequentialTime/parallelTime;
// The format string has no conversion specifier, so parallelTime is passed
// but never printed.
printf("Parallel Execution Time: ",parallelTime);
// `<<` between a string literal and a double is stream syntax, not a printf
// argument list; these two lines do not compile.
printf("Sequential Execution Time: " << sequentialTime);
printf("SpeedUp: "<<speedUp);
return 0;
}
But it does not calculate 'Parallel Time'. Any idea where I am going wrong?
So I edited my code a bit here and there after doing some research. I think I am not using the function omp_get_wtime() correctly.
My edited code:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
double omp_get_wtime(void);
/*
 * Returns Euler's totient phi(number): the count of integers in [1, number]
 * that are coprime to number. Values below 2 are returned unchanged.
 *
 * Uses phi(n) = n * prod(1 - 1/p) over the distinct prime factors p of n.
 * The result has to be reduced once for every prime factor found; the
 * earlier version only divided the factor out of `number` and skipped the
 * reduction, so e.g. phi(4) came out as 4 instead of 2.
 */
int eulerTotient(int number)
{
    int result = number;
    /* p <= number / p is p * p <= number without the risk of int overflow. */
    for (int p = 2; p <= number / p; ++p)
    {
        if (number % p != 0)
        {
            continue;
        }
        while (number % p == 0)
        {
            number /= p;
        }
        result -= result / p;
    }
    /* Whatever is left above 1 is one prime factor larger than the square
     * root of the original value. */
    if (number > 1)
    {
        result -= result / number;
    }
    return result;
}
/*
 * Second attempt at timing the parallel and the serial totient loop with
 * omp_get_wtime() and printing both times plus the speed-up.
 *
 * NOTE(review): both elapsed times are computed incorrectly; see the notes
 * at the two assignments.
 */
int main (int argc, char *argv[])
{
/* Both values are unused; omp_get_num_threads() is called outside a
 * parallel region here. */
int nTheads = omp_get_num_threads();
int nMaxThreads = omp_get_max_threads();
/* The format contains %f but no argument is supplied for it. */
printf("Parallel Execution: %f\n");
double parallelTime, endTimeForParallel;
double startTimeForParallel = omp_get_wtime();
#pragma omp parallel for
for(int i = 2; i < 10000; i++)
{
printf("O(%d) = %d\n", i, eulerTotient(i));
}
endTimeForParallel = omp_get_wtime();
/* Chained assignment, not a subtraction: both variables receive the end
 * timestamp, so parallelTime holds an absolute clock reading. */
parallelTime = startTimeForParallel = endTimeForParallel;
printf("Sequential Execution: \n");
double sequentialTime,endTimeForSequential,startTimeForSequential = omp_get_wtime();
for(int i = 2; i < 10000; i++)
{
printf("O(%d) = %d\n", i, eulerTotient(i));
}
endTimeForSequential = omp_get_wtime();
/* Operands are reversed (start minus end), which yields a negative
 * duration. */
sequentialTime = startTimeForSequential - endTimeForSequential;
printf("Parallel Execution Time: %f seconds\n", parallelTime);
printf("Sequential Execution Time: %f seconds\n", sequentialTime);
double speedUp = sequentialTime/parallelTime;
printf("SpeedUp: %f\n\n",speedUp);
return 0;
}
The output is simply bizarre. Anyways, here it is:
Parallel Execution Time: 1477476938.765000 seconds
Sequential Execution Time: -0.099000 seconds
SpeedUp: -0.000000
So a very noob mistake: Fixed it
As stated by @Gilles, I fixed my code. Here it is:
/*
============================================================================
Name : 55390_ass3.c
Author : Kamil Kamili
Version :
Copyright : Your copyright notice
Description : Euler Totient's function in C
============================================================================
*/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
double omp_get_wtime(void);
/*
 * Returns Euler's totient phi(number): the count of integers in [1, number]
 * that are coprime to number. Values below 2 are returned unchanged.
 *
 * Uses phi(n) = n * prod(1 - 1/p) over the distinct prime factors p of n.
 * The result has to be reduced once for every prime factor found; the
 * earlier version only divided the factor out of `number` and skipped the
 * reduction, so e.g. phi(4) came out as 4 instead of 2.
 */
int eulerTotient(int number)
{
    int result = number;
    /* p <= number / p is p * p <= number without the risk of int overflow. */
    for (int p = 2; p <= number / p; ++p)
    {
        if (number % p != 0)
        {
            continue;
        }
        while (number % p == 0)
        {
            number /= p;
        }
        result -= result / p;
    }
    /* Whatever is left above 1 is one prime factor larger than the square
     * root of the original value. */
    if (number > 1)
    {
        result -= result / number;
    }
    return result;
}
/*
 * Prints phi(i) for i in [2, 10000) once with an OpenMP parallel loop and
 * once serially, then reports both wall-clock times and their ratio.
 *
 * Both timed regions include the printf call of every iteration, so the
 * reported speed-up reflects output cost as well as computation.
 */
int main (int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    /* This heading previously contained "%f" without a matching argument,
     * which is undefined behaviour; it has no value to print. */
    printf("Parallel Execution: \n");
    const double startTimeForParallel = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 2; i < 10000; i++)
    {
        printf("O(%d) = %d\n", i, eulerTotient(i));
    }
    const double parallelTime = omp_get_wtime() - startTimeForParallel;

    printf("Sequential Execution: \n");
    const double startTimeForSequential = omp_get_wtime();
    for (int i = 2; i < 10000; i++)
    {
        printf("O(%d) = %d\n", i, eulerTotient(i));
    }
    const double sequentialTime = omp_get_wtime() - startTimeForSequential;

    printf("Parallel Execution Time: %f seconds\n", parallelTime);
    printf("Sequential Execution Time: %f seconds\n", sequentialTime);
    const double speedUp = sequentialTime / parallelTime;
    printf("SpeedUp: %f\n\n", speedUp);
    return 0;
}
And the output:
Parallel Execution Time: 0.014000 seconds
Sequential Execution Time: 0.095000 seconds
SpeedUp: 6.785746
I still think something is wrong, unless someone tells me otherwise.

OpenMP function calls in parallel

I'm looking for a way to call a function in parallel.
For example, if I have 4 threads, I want to each of them to call the same function with their own thread id as an argument.
Because of the argument, no thread will work on the same data.
// Every thread of the team runs the whole loop and calls parDF only for the
// value of p that equals its own thread number. Each thread therefore makes
// at most one call, parDF(omp_get_thread_num()), and only if its thread
// number is below numberOfThreads.
#pragma omp parallel
{
for(int p = 0; p < numberOfThreads; p++)
{
if(p == omp_get_thread_num())
parDF(p);
}
}
Thread 0 should run parDF(0)
Thread 1 should run parDF(1)
Thread 2 should run parDF(2)
Thread 3 should run parDF(3)
All this should be done at the same time...
This (obviously) doesn't work, but what is the right way to do parallel function calls?
EDIT: The actual code (This might be too much information... But it was asked for...)
From the function that calls parDF():
// Splits the node queue and then tries to run parDF once per thread.
// NOTE(review): several things here do not do what the surrounding
// description asks for; see the notes below.
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel
{
// Written by every thread of the team; numberOfThreads is not declared
// inside the region.
numberOfThreads = omp_get_num_threads();
//split nodeQueue
// `master` restricts the block to the master thread but has no implied
// barrier, so the other threads can continue before the split is done.
#pragma omp master
{
splitNodeQueue(numberOfThreads);
}
int tid = omp_get_thread_num();
//printf("Hello World from thread = %d\n", tid);
// This opens a second, nested parallel region inside the first one.
// private(tid) gives each inner thread a new, uninitialised tid, so the
// value assigned above is not the one passed to parDF.
#pragma omp parallel for private(tid)
for(int i = 0; i < numberOfThreads; ++i)
{
parDF(tid, originalQueueSize, DFlevel);
}
}
The parDF function:
bool Tree::parDF(int id, int originalQueueSize, int DFlevel)
{
double possibilities[20];
double sequence[3];
double workingSequence[3];
int nodesToExpand = originalQueueSize/omp_get_num_threads();
int tenthsTicks = nodesToExpand/10;
int numPossibilities = 0;
int percentage = 0;
list<double>::iterator i;
list<TreeNode*>::iterator n;
cout << "My ID is: "<< omp_get_thread_num() << endl;
while(parNodeQueue[id].size() > 0 and parNodeQueue[id].back()->depth == DFlevel)
{
if(parNodeQueue[id].size()%tenthsTicks == 0)
{
cout << endl;
cout << percentage*10 << "% done..." << endl;
if(percentage == 10)
{
percentage = 0;
}
percentage++;
}
//countStartPoints++;
depthFirstQueue.push_back(parNodeQueue[id].back());
numPossibilities = 0;
for(i = parNodeQueue[id].back()->content.sortedPoints.begin(); i != parNodeQueue[id].back()->content.sortedPoints.end(); i++)
{
for(int j = 0; j < deltas; j++)
{
if(parNodeQueue[id].back()->content.doesPointExist((*i) + delta[j]))
{
for(int k = 0; k <= numPossibilities; k++)
{
if(fabs((*i) + delta[j] - possibilities[k]) < 0.01)
{
goto pointAlreadyAdded;
}
}
possibilities[numPossibilities] = ((*i) + delta[j]);
numPossibilities++;
pointAlreadyAdded:
continue;
}
}
}
// Out of the list of possible points. All combinations of 3 are added, building small subtrees in from the node.
// If a subtree succesfully breaks the lower bound, true is returned.
for(int i = 0; i < numPossibilities; i++)
{
for(int j = 0; j < numPossibilities; j++)
{
for(int k = 0; k < numPossibilities; k++)
{
if( k != j and j != i and i != k)
{
sequence[0] = possibilities[i];
sequence[1] = possibilities[j];
sequence[2] = possibilities[k];
//countSeq++;
if(addSequence(sequence, id))
{
//successes++;
workingSequence[0] = sequence[0];
workingSequence[1] = sequence[1];
workingSequence[2] = sequence[2];
parNodeQueue[id].back()->workingSequence[0] = sequence[0];
parNodeQueue[id].back()->workingSequence[1] = sequence[1];
parNodeQueue[id].back()->workingSequence[2] = sequence[2];
parNodeQueue[id].back()->live = false;
succesfulNodes.push_back(parNodeQueue[id].back());
goto nextNode;
}
else
{
destroySubtree(parNodeQueue[id].back());
}
}
}
}
}
nextNode:
parNodeQueue[id].pop_back();
}
Is this what you are after?
Live On Coliru
#include <omp.h>
#include <cstdio>
// Minimal OpenMP demo: every thread of the team prints its own id, and
// thread 0 additionally prints the team size.
int main()
{
    int nthreads;

    #pragma omp parallel
    {
        // Declared inside the region, so each thread has its own copy.
        const int tid = ::omp_get_thread_num();
        printf("Hello World from thread = %d\n", tid);

        // Only thread 0 reports the team size.
        if (0 == tid) {
            nthreads = ::omp_get_num_threads();
            printf("Number of threads = %d\n", nthreads);
        }
    } // implicit barrier: the team joins here
}
Output:
Hello World from thread = 0
Number of threads = 8
Hello World from thread = 4
Hello World from thread = 3
Hello World from thread = 5
Hello World from thread = 2
Hello World from thread = 1
Hello World from thread = 6
Hello World from thread = 7
You should be doing something like this :
// Each thread of the team gets its own copy of tid (declared outside this
// fragment), stores its thread number in it and passes that to parDF.
#pragma omp parallel private(tid)
{
tid = omp_get_thread_num();
parDF(tid);
}
I think its quite straight forward.
There are two ways to achieve what you want:
Exactly the way you are describing it: each thread starts the function with its own thread id:
// One call per thread of the team: threadId is declared inside the region,
// so every thread has its own variable holding its own thread number.
#pragma omp parallel
{
int threadId = omp_get_thread_num();
parDF(threadId);
}
The parallel block starts as many threads as the system reports that it supports, and each of them executes the block. Since they differ in threadId, they will process different data. To force a different number of threads you can add a num_threads(100) clause (or whatever count you need) to the pragma.
The correct way to do what you want is to use a parallel for block.
// The iterations 0 .. numThreads-1 are divided among the threads of the
// team; each iteration calls parDF with the loop index, independent of which
// thread happens to execute it.
#pragma omp parallel for
for (int i=0; i < numThreads; ++i) {
parDF(i);
}
This way each iteration of the loop (each value of i) gets assigned to a thread, which executes it. As many iterations will be run in parallel as there are available threads.
Method 1. is not very general, and is inefficient because you have to have as many threads as you want function calls. Method 2. is the canonical (right) way to get your problem solved.

measured runtime from c++ "time.h" is double than real

I am running this pthread-c++ program (gauss elimination) on my laptop to measure its runtime.
The program runs for about 10 seconds of real time, but my output shows about 20 seconds. What is wrong with this program?
I used
g++ -pthread main.c
./a.out 32 2048
to run
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <new>
typedef float Type;
void mat_rand (Type**, int, int);
Type** mat_aloc (int, int);
void mat_free (Type**);
void mat_print (Type**, int, int);
void* eliminate(void*);
unsigned int n, max_threads, active_threads, thread_length;
Type** A;
int current_row;
/* Row range handed to one worker thread. eliminate() processes rows
 * start..end with both bounds included. */
typedef struct args
{
    int start;
    int end;
} argument;
void *print_message_function( void *ptr );
// Gaussian elimination on a random n x (n+1) matrix using up to max_threads
// pthreads per pivot row, followed by a report of the measured time.
// Command line: argv[1] = thread count (max 32), argv[2] = matrix
// dimension (max 4096).
int main(int argc, char *argv[])
{
if (argc < 3)
{
printf ("Error!. Please Enter The Matrix Dimension and No. of Threads!\n");
return 0;
} else
{
n = atoi(argv[2]);
max_threads = atoi(argv[1]);
if (n > 4096)
{
printf ("The maximum allowed size is 4096!\n");
return 0;
}
if (max_threads > 32)
{
printf ("The maximum allowed Threads Count is 32!\n");
return 0;
}
}
// NOTE(review): only upper bounds are checked. A thread count of 0 makes
// active_threads 0 below and the division for thread_length divides by zero.
A = mat_aloc(n , n+1);
mat_rand (A, n, n+1);
//mat_print (A, n, n+1);
std::clock_t start;
double exe_time;
// clock() counts processor time used by the process, not elapsed time.
start = std::clock();
pthread_attr_t attr;
pthread_attr_init(&attr);
argument* thread_args = new argument[max_threads];
pthread_t* thread = new pthread_t[max_threads];
// One pass per pivot row i: the rows below it are split among the workers.
for (int i=0; i<n-1; i++)
{
// Published through a global; the workers read it as the pivot row.
current_row = i;
// Never start more threads than there are rows left below the pivot.
if (max_threads >= n-i)
active_threads = n-i-1;
else
active_threads = max_threads;
thread_length = (n-i-1)/active_threads;
for (int j=0; j<active_threads-1; j++)
{
thread_args[j].start = i+1+j*thread_length;
// NOTE(review): this `end` equals the next worker's `start`, and
// eliminate() loops with `<=`, so the boundary row is handed to two
// threads at once.
thread_args[j].end = i+1+(j+1)*thread_length;
pthread_create( &thread[j], &attr, eliminate, (void*) &thread_args[j]);
}
// The last worker takes everything up to and including row n-1, which
// also absorbs the remainder of the integer division.
thread_args[active_threads-1].start = i+1+(active_threads-1)*thread_length;
thread_args[active_threads-1].end = n-1;
pthread_create(&thread[active_threads-1], &attr, eliminate, (void*) &thread_args[active_threads-1]);
// All workers must finish before current_row changes for the next pivot.
for (int j=0; j<active_threads; j++)
{
pthread_join(thread[j], NULL);
}
}
// Processor time is summed over all threads of the process, so with several
// threads running concurrently this exceeds the elapsed wall-clock time.
exe_time = (clock() - start) / (double) CLOCKS_PER_SEC;
printf("Execution time for Matrix of size %i: %f\n", n, exe_time);
//mat_print (A, n, n+1);
// NOTE(review): thread_args, thread, attr and the matrix A are not released
// before returning.
return 0;
}
void* eliminate(void* arg)
{
Type k, row_constant;
argument* info = (argument*) arg;
row_constant = A[current_row][current_row];
for (int i=info->start; i<=info->end; i++)
{
k = A[i][current_row] / row_constant;
A[i][current_row] = 0;
for (int j=current_row+1; j<n+1; j++)
{
A[i][j] -= k*A[current_row][j];
}
}
}
// matrix random values
// Fills a row x column matrix with pseudo-random values taken from rand()
// and scaled into the range [1, 257].
void mat_rand (Type** matrix, int row, int column)
{
    for (int r = 0; r < row; ++r)
    {
        Type *line = matrix[r];
        for (int c = 0; c < column; ++c)
        {
            // rand() / RAND_MAX lies in [0, 1].
            const float fraction = (float)rand() / (float)RAND_MAX;
            line[c] = (float)(1) + fraction * 256;
        }
    }
}
// allocates a 2d matrix
// Allocates a row x column matrix as one contiguous block of elements plus
// an array of row pointers into that block.
//
// Returns the row-pointer array, or 0 when either allocation fails. Release
// the result with mat_free().
//
// The allocations use the nothrow form of new: plain new reports failure by
// throwing, which made the original NULL checks unreachable. The second
// check now tests the row-pointer array (it used to re-test the element
// block), and the element block is released when that second allocation
// fails.
Type** mat_aloc (int row, int column)
{
    Type *cells = new (std::nothrow) Type [row*column];
    if (cells == NULL)
    {
        return 0;
    }

    Type **rows = new (std::nothrow) Type* [row];
    if (rows == NULL)
    {
        delete [] cells;
        return 0;
    }

    // Row r starts r * column elements into the contiguous block.
    for (int r = 0; r < row; r++)
    {
        rows[r] = cells + r*column;
    }
    return rows;
}
// free memory of matrix
// Releases a matrix created by mat_aloc(): first the contiguous element
// block that the first row pointer refers to, then the row-pointer array.
//
// A null matrix is accepted and ignored. mat_aloc() returns 0 on failure,
// and the unguarded dereference of the first row pointer would otherwise
// crash on that value.
void mat_free (Type** matrix)
{
    if (matrix == NULL)
    {
        return;
    }
    delete[] matrix[0];
    delete[] matrix;
}
// print matrix
// Writes the matrix to standard output, one line per row with the elements
// separated by two tabs, followed by a dotted separator line.
void mat_print (Type** matrix, int row, int column)
{
    for (int r = 0; r < row; ++r)
    {
        const Type *line = matrix[r];
        int c = 0;
        while (c < column)
        {
            std::cout<< line[c] << "\t\t";
            ++c;
        }
        printf("\n");
    }
    printf(".................\n");
}
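clock() reports the CPU time used by the whole process, summed over all of its threads. If you have 2 CPUs and run a thread on each one for 10 seconds, clock() will report 20 seconds. To measure elapsed real time, use a wall-clock timer instead.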