How to parallelize matrix sorting for loop? - c++

I am trying to parallelize a for(){...} loop with OpenMP that takes the N "lines" of an N*M "table" and sorts each line in ascending order.
I added the #pragma omp parallel and #pragma omp for schedule directives, but I don't see any change, as if they do nothing at all.
Here is the full program:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
double GetTime() {
    struct timeval clock;
    gettimeofday(&clock, NULL);
    double rez = (double)clock.tv_sec + (double)clock.tv_usec/1000000;
    return rez;
}
void genMatrix(int *A, int N, int M) {
    // Generate matrix
    for (int i=0; i<N; i++) {
        for (int j=0; j<M; j++) A[i*M+j] = (int)((double)rand()/RAND_MAX*99) + 1;
    }
}
int main() {
    srand(time(NULL));
    int N = 4800;
    int M = 6000;
    int *A = new int[N*M];
    int t, n;
    genMatrix(A, N, M);
    double t_start = GetTime();
    #pragma omp parallel
    {
        #pragma omp for schedule
        for (int k=0; k<N; k++) {
            for (int i=0; i<M-1; i++) {
                for (int j=0; j<M-1; j++) {
                    if (A[k*M+j] > A[k*M+j+1]) {
                        t = A[k*M+j];
                        A[k*M+j] = A[k*M+j+1];
                        A[k*M+j+1] = t;
                    }
                }
            }
        }
    }
    double t_load = GetTime();
    // Print matrix
    // for (int i=0; i<N; i++) {
    //     for (int j=0; j<M; j++) {
    //         printf("%3d", A[i*M+j]);
    //     }
    //     printf("\n");
    // }
    printf("Load time: %.2f\n", t_load - t_start);
    system("pause");
}
What is wrong, and how should I add parallelization with OpenMP in this case?
Also, I don't know why, but when printing the matrix A for larger sizes (like int N = 480; int M = 600;), some values are not sorted.
Is it a printing problem?

There are three distinct things, sine qua non, for going omp parallel:
A) the algorithm has to be correct
B) the algorithm has to use resources efficiently
C) the algorithm has to spend less on add-on overhead costs than it receives from going omp
Fixing A) first: your swap temporary t is shared by all threads, which creates a data race among the threads sorting different rows; that is exactly why some of your values come out unsorted, and it is not a printing problem. Every such variable has to be private to each thread (see the private(...) clause in the code below).
After some slight experimentation on B) and C), one soon realises that the costs demonstrated under B) and C) for the rand() processing are way higher than any benefit from whatever naive or smarter matrix-coverage mapping onto resources: rand() is a singular engine, and any kind of concurrency has to re-propagate a new state of that single source of randomness across all of its concurrent uses, which costs way more than it could deliver in concurrently operated matrix coverage (plus the naive, cache-line-unaware traversal of the matrix does not help either).
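A minimal sketch of the A)-fix (an illustration only, not the tuned version benchmarked below): the swap temporary moves inside the loop body, so each thread owns a private copy, and the schedule clause gets an actual policy:

#pragma omp parallel for schedule( static )
for ( int k = 0; k < N; k++ ) {                  // threads split whole rows
    for ( int i = 0; i < M-1; i++ ) {
        for ( int j = 0; j < M-1; j++ ) {
            if ( A[k*M+j] > A[k*M+j+1] ) {
                int t = A[k*M+j];                // loop-local: private per thread, no race
                A[k*M+j]   = A[k*M+j+1];
                A[k*M+j+1] = t;
            }
        }
    }
}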
The best results ( without optimising the myOMP_SCHEDULE_CHUNKS ):
/*
-O3 private( ..., i, j ) omp single
MATRIX.RAND time: 3 191 [us] 3 446 [us] 3 444 [us] 3 384 [us] 3 173 [us]
MATRIX.SORT time: 96 270 [us] 98 401 [us] 98 423 [us] 95 911 [us] 101 019 [us] #( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
The global view:
/* COMPILE:: -fopenmp
 *
 * MAY SHELL:: $ export OMP_NUM_THREADS=3
 *             $ export OMP_DISPLAY_ENV=1
 * https://stackoverflow.com/questions/47495916/how-to-parallelize-matrix-sorting-for-loop
 */
#include <omp.h>
#define myOMP_SCHEDULE_CHUNKS 5 // OMP schedule( static, chunk ) ~ better cache-line depletion
#define myOMP_THREADS 4
/*
$ ./OMP_matrix_SORT
MATRIX.RAND time: 187 744 [us] 234 729 [us] 174 535 [us] 254 273 [us] 122 983 [us]
MATRIX.SORT time: 1 911 310 [us] 1 898 494 [us] 2 026 455 [us] 1 978 631 [us] 1 911 231 [us] #( 3 ) = OMP_THREADS
MATRIX.RAND time: 6 166 [us] 6 977 [us] 6 722 [us]
MATRIX.SORT time: 2 448 608 [us] 2 264 572 [us] 2 355 366 [us] #( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 918 [us] 17 551 [us] 7 194 [us]
MATRIX.SORT time: 1 774 883 [us] 1 809 002 [us] 1 786 494 [us] #( 1 ) = OMP_THREADS
MATRIX.RAND time: 7 321 [us] 7 337 [us] 6 698 [us]
MATRIX.SORT time: 2 152 945 [us] 1 900 149 [us] 1 883 638 [us] #( 1 ) = OMP_THREADS
MATRIX.RAND time: 54 198 [us] 67 290 [us] 52 123 [us]
MATRIX.SORT time: 759 248 [us] 769 580 [us] 760 759 [us] 812 875 [us] #( 3 ) = OMP_THREADS
MATRIX.RAND time: 7 054 [us] 6 414 [us] 6 435 [us] 6 426 [us]
MATRIX.SORT time: 687 021 [us] 760 917 [us] 674 496 [us] 705 629 [us] #( 3 ) = OMP_THREADS
-O3
MATRIX.RAND time: 5 890 [us] 6 147 [us] 6 081 [us] 5 796 [us] 6 143 [us]
MATRIX.SORT time: 148 381 [us] 152 664 [us] 184 922 [us] 155 236 [us] 169 442 [us] #( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
-O3 private( ..., i, j )
MATRIX.RAND time: 6 410 [us] 6 111 [us] 6 903 [us] 5 831 [us] 6 224 [us]
MATRIX.SORT time: 129 787 [us] 129 836 [us] 195 299 [us] 136 111 [us] 161 117 [us] #( 4 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 349 [us] 6 532 [us] 6 104 [us] 6 213 [us]
MATRIX.SORT time: 151 202 [us] 152 542 [us] 160 403 [us] 180 375 [us] #( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 745 [us] 5 834 [us] 5 791 [us] 7 164 [us] 6 535 [us]
MATRIX.SORT time: 214 590 [us] 214 563 [us] 209 610 [us] 205 940 [us] 230 787 [us] #( 2 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
long GetTime() {                                 // double GetTime()
    struct timeval clock;
    gettimeofday( &clock, NULL );
    return (long)clock.tv_sec * 1000000          // in [us] ( go (long) instead of float )
         + (long)clock.tv_usec;
    /* double rez = (double)clock.tv_sec
     *            + (double)clock.tv_usec / 1000000;
     *         // + (double)clock.tv_usec * 0.000001; // NEVER DIV
     * return rez;
     */
}
void genMatrix( int *A, int N, int M ) {         // Generate matrix
    register int i, iM, j;
    #pragma omp parallel
    for ( i = 0; i < N; i++ ) {
        iM = i * M;
        /* for ( register int i = 0; i < N; i++ ) {
               register int iM = i * M;
         */
        // #pragma omp parallel                                               //  234 729 [us]
        //     for ( register int j = 0; j < M; j++ )
        // #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) //  122 983 [us] #( 3 ) = OMP_THREADS ~~~ v/s 6 698 [us] #( 1 ) = OMP_THREADS
        //                                                                    //  v/s 5 796 [us] # NON-OMP
        #pragma omp single                                                    //  ~~ 3 191 [us]
        for ( int j = 0; j < M; j++ )
            A[iM +j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
            // A[i*M+j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
    }
}
int main() {
    srand( time( NULL ) );
    int N = 480;                                 //  4800; ~ 100x faster outputs
    int M = 600;                                 //  6000;
    int Mb1 = M - 1;
    int *A = new int[N*M];
    omp_set_num_threads( myOMP_THREADS );

    long long int t_start = GetTime();
    genMatrix( A, N, M );
    long long int t_load = GetTime();
    printf( "MATRIX.RAND time: %lld [us]\n", t_load - t_start );

    register int thisB,
                 this1,
                 next1,
                 t, i, j;

    t_start = GetTime();                         // double t_start = GetTime();
    // for ( register int k = 0; k < N; k++ ) {
    // #pragma omp parallel
    // #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS )                                      // schedule( type, chunk ):
    // #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t )    // schedule( type, chunk ):
    #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t, i, j ) // schedule( type, chunk ):
    for ( int k = 0; k < N; k++ ) {
        thisB = k*M;
        if ( omp_get_num_threads() != myOMP_THREADS ) {
            printf( "INF: myOMP_THREADS ( == %d ) do not match the number of executed ones ( == %d ) ", myOMP_THREADS, omp_get_num_threads() );
        }
        //--------------------------------------------------// -------------SORT ROW-k
        // for ( register int i = 0; i < Mb1; i++ ) {       // < M-1; i++ ) {
        //     for ( register int j = 0; j < Mb1; j++ ) {   // < M-1; j++ ) {
        for ( i = 0; i < Mb1; i++ ) {
            for ( j = 0; j < Mb1; j++ ) {
                this1 = thisB + j,
                next1 = this1 + 1;
                if ( A[this1] > A[next1] ) {                // A[k*M+j ] > A[k*M+j+1] ) {
                    t        = A[this1];                    // t         = A[k*M+j ];
                    A[this1] = A[next1];                    // A[k*M+j ] = A[k*M+j+1];
                    A[next1] = t;                           // A[k*M+j+1] = t;
                }
            }
        }
        //--------------------------------------------------// -------------SORT ROW-k
    }
    t_load = GetTime();                          // double t_load = GetTime();

    /* Print matrix
    for ( int i = 0; i < N; i++ ) {
        for ( int j = 0; j < M; j++ ) {
            printf( "%3d", A[i*M+j] );
        }
        printf("\n");
    }
    */
    printf( "MATRIX.SORT time: %lld [us] #( %d ) = OMP_THREADS in [ %d ] OMP_SCHEDULE_CHUNKS\n",
            t_load - t_start,
            myOMP_THREADS,
            myOMP_SCHEDULE_CHUNKS
            );
    // system( "pause" );
}
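If one nevertheless wanted the generation step itself to go parallel, a sketch (my addition, not benchmarked above; the base seed 12345 is arbitrary) using POSIX rand_r() gives each thread its own PRNG state instead of serialising on rand()'s single hidden one:

#include <omp.h>
#include <stdlib.h>                              // rand_r()

void genMatrixParallel( int *A, int N, int M ) {
    #pragma omp parallel
    {
        unsigned int seed = 12345u + omp_get_thread_num();  // per-thread PRNG state
        #pragma omp for schedule( static )
        for ( int i = 0; i < N; i++ )
            for ( int j = 0; j < M; j++ )
                A[i*M + j] = (int)( (double)rand_r( &seed ) / RAND_MAX * 99 ) + 1;
    }
}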


Can anyone please tell why this code is producing the wrong output?

Problem statement:
Given two arrays A and B. Given Q queries each having a positive integer i denoting an index of the array A. For each query, your task is to find all the elements less than or equal to Ai in the array B.
My code doesn't seem to work for all the test cases.
Input
20000 // array size
24527 11330 19029 903 1178 1687 3954 11549 15705 23325 14294 23378 28891 27002 26716 13346 12153 14766 7641 17062 4928 2979 11867 833 27474 25860 28590 27 13961 12627 10021 4560 12638 10294 9878 6249 31497 28178 15015 16523 5610 8923 20040 10478 18216 21291 26497 31761 6552 32657 24942 21036 2016 11819 1928 28519 4572 14967 30245 12873 16704 22374 25667 18035 24959 30642 14977 28558 28396 4210 7022 130 287 27116 16646 21224 13467 29354 21370 21187 22446 18640 7472 29290 24216 28076 16395 6857 25327 22415 20460 27593 12865 21979 30329 24845 12284 31582 1053 11999 3723 734 4687 27498 9154 25077 6936 22569 23676 32288 10703 24479 4994 14354 2344 6985 20399 16718 4717 30161 11602 28660 522 15748 30420 1243 30031 15110 12443 6113 30066 8260 7213 7807 13267 25515 30361 16545 23428 23448 30227 28596 7177 11791 19166 29696 20828 26799 10095 25656 27957 21733 5071 15183 1415 23649 4161 142 11342 4550 19237 13796 29832 12710 28188 125 18561 12205 18029 16277 30036 9244 19623 1423 4015 1164.................
The correct output is:
13068
6148
8639
8615
334
2586
19661
4011
5428
14464
4751
9483
15197
18490
13607
16230
3140
1360
14787
6183
7031
4198
8859
16369
8455
5355
1458
12519
6988
17495
2201
2561
15966
7950
15677
19498
18528
4413
1642
2574
9223
15598
2364
9465
3935
894
19076
272
12675
6602
1441
18835
2249
14304
8879
12463
9356
17889
5993
13893
11928
11219
19976
1812
7033
7116
8025
7354
7723
8421
2014
14545
5213
5532
And my code's output is:
6939
This is my code:
#include <iostream>
#include <algorithm>
using namespace std;

// returns the last index in arr[start..end] whose value is <= x
int findindex (int arr[], int start, int end, int x)
{
    while (start <= end)
    {
        int mid = (start + end) / 2;
        if (arr[mid] <= x) // it's smaller, so more candidates can exist in the right search space
            start = mid + 1;
        else
            end = mid - 1;
    }
    return end;
}

int main() {
    int t; cin >> t;
    while (t--)
    {
        int n, i, q;
        cin >> n;
        int arr1[n], arr2[n];
        for (i = 0; i < n; i++) // arr1
            cin >> arr1[i];
        for (i = 0; i < n; i++) // arr2
            cin >> arr2[i];
        cin >> q;               // no of queries
        sort(arr2, arr2 + n);
        while (q--)
        {
            int x;
            cin >> x;
            int index = findindex(arr2, 0, n-1, x);
            cout << index + 1 << "\n";
        }
    }
    return 0;
}
When you are calling the findindex() function, pass arr1[x] instead of x: each query gives an index into array A, and you need to search B for the value stored at that index, not for the index itself.
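A sketch of the corrected query loop (assuming, as in the original code, that the queries are 0-based indices into A):

while (q--)
{
    int x;
    cin >> x;                                        // x is an index into arr1
    int index = findindex(arr2, 0, n-1, arr1[x]);    // search for the value, not the index
    cout << index + 1 << "\n";                       // count of elements in B <= arr1[x]
}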

high performance calculations and saving of the threads identificators

I wrote a grid-stride loop for high performance calculations with a large N, for example long long N = 1 << 36, or even more. From the total grid I need only some indexes, which have to satisfy a defined condition.
__global__ void Indexes(int *array, int N) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    while (index < N)
    {
        if (condition)
        { .... // do something to save index in array
        }
        index += blockDim.x * gridDim.x;
    }
}
Of course, it is possible to use Thrust, which allows having both host and device arrays. But in this case the calculation will obviously be extremely inefficient, because one first needs to create a lot of unneeded elements and then delete them.
What is the most effective way to save the indexes directly into an array on the device, to pass to the CPU?
If your output is relatively dense (i.e. a lot of indices and relatively few zeros), then the stream compaction approach suggested in comments is a good solution. There are a lot of ready-to-go stream compaction implementations which you can probably adapt to your purposes.
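For instance, a minimal sketch using thrust::copy_if over a counting iterator (the predicate below is only a stand-in for your real condition, and the function name is mine):

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>

struct Pred {                                        // stand-in for the real condition
    __host__ __device__ bool operator()(int v) const { return v % 10000 == 0; }
};

// Keep every index i for which input[i] satisfies Pred.
thrust::device_vector<int> compactIndices(const thrust::device_vector<int>& input)
{
    thrust::device_vector<int> out(input.size());    // dense case: full-size scratch
    thrust::counting_iterator<int> first(0), last((int)input.size());
    auto end = thrust::copy_if(first, last, input.begin(), out.begin(), Pred());
    out.resize(end - out.begin());
    return out;
}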
If your output is sparse, so you need to save relatively few indices for a lot of inputs, then stream compaction isn't such a great solution because it will waste a lot of GPU memory. In that case (and if you can roughly estimate an upper bound on the number of output indices), something like this:
template <typename T>
struct Array
{
    T* p;
    int Nmax;
    int* next;

    Array() = default;

    __host__ __device__
    Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};

    __device__
    int append(T& val)
    {
        int pos = atomicAdd(next, 1);
        if (pos > Nmax) {
            atomicExch(next, Nmax);
            return -1;
        } else {
            p[pos] = val;
            return pos;
        }
    };
};
is probably more appropriate. Here, the idea is to use an atomically incremented position in the output array to keep track of where a thread should store its index. The code will signal if you fill the index array, and there will be information from which you can work out a restart strategy to stop the current kernel and then start from the last known index which you were able to store.
A complete example:
$ cat append.cu
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>

namespace AppendArray
{
    template <typename T>
    struct Array
    {
        T* p;
        int Nmax;
        int* next;

        Array() = default;

        __host__ __device__
        Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};

        __device__
        int append(T& val)
        {
            int pos = atomicAdd(next, 1);
            if (pos > Nmax) {
                atomicExch(next, Nmax);
                return -1;
            } else {
                p[pos] = val;
                return pos;
            }
        };
    };
}

__global__
void kernelfind(int* input, int N, AppendArray::Array<int> indices)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (; idx < N; idx += gridDim.x * blockDim.x) {
        if (input[idx] % 10000 == 0) {
            if (indices.append(idx) < 0) return;
        }
    }
}

int main()
{
    const int Ninputs = 1 << 20;
    thrust::device_vector<int> inputs(Ninputs);
    thrust::counting_iterator<int> vals(1);
    thrust::copy(vals, vals + Ninputs, inputs.begin());
    int* d_input = thrust::raw_pointer_cast(inputs.data());

    int Nindices = Ninputs >> 12;
    thrust::device_vector<int> indices(Nindices);
    int* d_indices = thrust::raw_pointer_cast(indices.data());

    int* pos; cudaMallocManaged(&pos, sizeof(int)); *pos = 0;
    AppendArray::Array<int> index(d_indices, Nindices-1, pos);

    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, kernelfind, 0, 0);
    kernelfind<<<gridsize, blocksize>>>(d_input, Ninputs, index);
    cudaDeviceSynchronize();

    for (int i = 0; i < *pos; ++i) {
        int idx = indices[i];
        std::cout << i << " " << idx << " " << inputs[idx] << std::endl;
    }
    return 0;
}
$ nvcc -std=c++11 -arch=sm_52 -o append append.cu
$ ./append
0 9999 10000
1 19999 20000
2 29999 30000
3 39999 40000
4 49999 50000
5 69999 70000
6 79999 80000
7 59999 60000
8 89999 90000
9 109999 110000
10 99999 100000
11 119999 120000
12 139999 140000
13 129999 130000
14 149999 150000
15 159999 160000
16 169999 170000
17 189999 190000
18 179999 180000
19 199999 200000
20 209999 210000
21 219999 220000
22 239999 240000
23 249999 250000
24 229999 230000
25 279999 280000
26 269999 270000
27 259999 260000
28 319999 320000
29 329999 330000
30 289999 290000
31 299999 300000
32 339999 340000
33 349999 350000
34 309999 310000
35 359999 360000
36 379999 380000
37 399999 400000
38 409999 410000
39 369999 370000
40 429999 430000
41 419999 420000
42 389999 390000
43 439999 440000
44 459999 460000
45 489999 490000
46 479999 480000
47 449999 450000
48 509999 510000
49 539999 540000
50 469999 470000
51 499999 500000
52 569999 570000
53 549999 550000
54 519999 520000
55 589999 590000
56 529999 530000
57 559999 560000
58 619999 620000
59 579999 580000
60 629999 630000
61 669999 670000
62 599999 600000
63 609999 610000
64 699999 700000
65 639999 640000
66 649999 650000
67 719999 720000
68 659999 660000
69 679999 680000
70 749999 750000
71 709999 710000
72 689999 690000
73 729999 730000
74 779999 780000
75 799999 800000
76 809999 810000
77 739999 740000
78 849999 850000
79 759999 760000
80 829999 830000
81 789999 790000
82 769999 770000
83 859999 860000
84 889999 890000
85 879999 880000
86 819999 820000
87 929999 930000
88 869999 870000
89 839999 840000
90 909999 910000
91 939999 940000
92 969999 970000
93 899999 900000
94 979999 980000
95 959999 960000
96 949999 950000
97 1019999 1020000
98 1009999 1010000
99 989999 990000
100 1029999 1030000
101 919999 920000
102 1039999 1040000
103 999999 1000000

Periodic Latency Spikes in Writing to Shared Memory on Linux

I have the following code:
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cerrno>
#include <iostream>
#include <fcntl.h>     // O_RDWR, O_CREAT
#include <unistd.h>    // ftruncate, usleep
#include <sys/mman.h>  // shm_open, mmap
#include <sys/stat.h>  // fchmod

#pragma pack(4)
struct RECORD_HEADER {
    uint64_t msgType;
    uint64_t rdtsc;
};
struct BODY {
    char content[488];
};
#pragma pack()

class SerializedRDTSC {
public:
    typedef unsigned long long timeunit_t;

    static timeunit_t start(void) {
        unsigned cycles_high, cycles_low;
        __asm__ __volatile__ ( "CPUID\n\t"
                               "RDTSC\n\t"
                               "mov %%edx, %0\n\t"
                               "mov %%eax, %1\n\t" : "=r" (cycles_high), "=r" (cycles_low) ::
                               "%rax", "%rbx", "%rcx", "%rdx");
        return ((unsigned long long)cycles_low) | (((unsigned long long)cycles_high) << 32);
    }

    static timeunit_t end(void) {
        unsigned cycles_high, cycles_low;
        __asm__ __volatile__( "RDTSCP\n\t"
                              "mov %%edx, %0\n\t"
                              "mov %%eax, %1\n\t"
                              "CPUID\n\t" : "=r" (cycles_high), "=r" (cycles_low) :: "%rax",
                              "%rbx", "%rcx", "%rdx");
        return ((unsigned long long)cycles_low) | (((unsigned long long)cycles_high) << 32);
    }
};

char* createSHM() noexcept {
    const auto sharedMemHandle = shm_open("testing", O_RDWR | O_CREAT, 0666);
    if (-1 == sharedMemHandle) {
        std::cout << "failed to open named shared memory: " << std::endl;
        return nullptr;
    }
    constexpr int32_t size = (1 << 26);
    ftruncate(sharedMemHandle, size);
    char* ptr = (char*) mmap(nullptr, size, PROT_READ | PROT_WRITE,
                             MAP_SHARED, sharedMemHandle, 0);
    if (MAP_FAILED == ptr) {
        std::cout << errno << std::endl;
        return nullptr;
    }
    const auto rc = fchmod(sharedMemHandle, 0666);
    if (rc == -1) {
        fprintf(stderr,
                "Can't change permissions to 0666 on shared mem segment: %m\n");
        fflush(stderr);
    }
    return ptr;
}

int main() {
    BODY update;
    srand(time(nullptr));
    char* ptr = createSHM();
    constexpr uint64_t n = 700;
    constexpr uint64_t n2 = 10;
    uint64_t m_data[n * n2];
    memset(m_data, 0, sizeof(m_data));
    uint64_t r = 0;
    for (uint64_t i = 0; i < n; i++) {
        for (uint64_t k = 0; k < n2; k++) {
            // populate the header
            const auto msgType = rand();
            const auto rdtsc = rand();
            // populate the struct randomly
            uint32_t* tmp = reinterpret_cast<uint32_t*>(&update);
            for (uint32_t j = 0; j < sizeof(BODY) / sizeof(uint32_t); j++) {
                const uint32_t v = rand() % 32767;
                tmp[j] = v;
            }
            // write the struct
            const auto s = SerializedRDTSC::start();
            memcpy(ptr, (char*)&msgType, sizeof(uint64_t));
            ptr += sizeof(uint64_t);
            memcpy(ptr, (char*)&rdtsc, sizeof(uint64_t));
            ptr += sizeof(uint64_t);
            memcpy(ptr, &update, sizeof(BODY));
            ptr += sizeof(BODY);
            const auto e = SerializedRDTSC::end();
            m_data[r++] = e - s;
        }
        usleep(249998);
    }
    for (uint32_t i = 0; i < r; i++) {
        std::cout << i << "," << m_data[i] << std::endl;
    }
}
And for some reason, there are periodic latency spikes according to the output:
0 9408
1 210
2 162
3 176
4 172
5 164
6 172
7 8338
8 174
9 186
10 612
11 380
12 380
13 374
14 358
15 13610
16 190
17 186
18 164
19 168
20 246
21 196
22 170
23 5066
24 176
25 176
26 168
27 174
28 166
29 440
30 232
31 214
32 5128
33 180
34 178
35 172
36 174
37 184
38 170
39 162
40 5964
41 182
42 174
43 164
44 180
45 180
46 162
47 172
I already isolated the core and double-checked with htop to make sure no other processes were using it.
My machine has an i7 CPU (nothing fancy).
I then tried with a Xeon CPU. The pattern is about the same: every 7-11 writes, there was a spike.
With the i7 CPU, I compiled with GCC 7.2 with C++17 and ran it on CentOS 7.3.
With the Xeon CPU, I compiled with GCC 4.6 with C++0x and ran it on CentOS 6.5.
My questions are:
1. Why were there periodic latency spikes? (I checked with strace, and I don't see any weird system calls involved.)
2. Any suggestions on how to investigate/understand the spikes? More for my learning.
Thanks in advance!
P.S. Yes, some people object to using rdtsc to measure latency because temperature affects the TSC. Though I don't see any better option, as I don't have PTP, and clock_gettime() sometimes has latency spikes too. If you have any suggestion, it is more than welcome :)
A memory page is 4K bytes. Every time you start writing on a new page, that page needs to be mapped into the process address space. Since the data you're writing each iteration is 8 + 8 + 488 = 504 bytes, you'll get a spike every 8th or 9th time through the loop (4096 / 504 is roughly 8.1).
Since the CPU can speculatively prefetch data from memory, the page fault for the 2nd page (which should occur on the 8th iteration) occurs one iteration earlier than expected, when the hardware prefetcher tries to access the page.
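If you want the timed writes themselves to stay fault-free, one possible mitigation (my suggestion, reusing size and sharedMemHandle from your createSHM()) is to pre-fault the mapping before the measurement loop:

// Variant 1: populate the page tables already at mmap() time:
char* ptr = (char*) mmap(nullptr, size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE, sharedMemHandle, 0);

// Variant 2: touch each 4K page once up front, so the faults happen
// here rather than inside the measured region:
for (int32_t off = 0; off < size; off += 4096)
    ptr[off] = 0;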

Why isn't deleting of objects in std::map linear time? Or: What is the fastest way to delete objects in std::map?

Edit: Never do profiling with DEBUG builds... Once I switched to RELEASE, there was no noticeable delay and the times progressed linearly. Creating and deleting 1000000 objects then happens within 370 msecs total.
--
I have a map containing pointers to objects: std::map<int, DummyObject *> myMap
When I delete the objects in the map, I noticed that it seems to take longer the larger the map is, seemingly exponentially so. I was expecting linear behaviour.
While a dry iteration through the map (without actually doing anything) behaves more or less linearly, deleting the objects (without modifying the map!) totally whacks out. Does anyone have an explanation for this behaviour? And even more importantly: what can I do to speed things up? Because unfortunately I really have a case with 60000+ objects, and cleaning up my map is taking too long...
Here is the code:
class DummyObject
{
public:
    DummyObject() {}
    ~DummyObject() {}
};

void testSpeed( int iMapSize )
{
    const unsigned int uiStart = GetTickCount();

    typedef std::map<int, DummyObject *> T_MyMap;
    T_MyMap myMap;

    // Fill the map...
    for( int i = 0; i < iMapSize; i++ ) {
        myMap[i] = new DummyObject();
    }
    const unsigned int uiFilled = GetTickCount();

    // Dry iteration through the map...
    for( T_MyMap::const_iterator it = myMap.begin(); it != myMap.end(); it++ ) {
        int i = 0; // do something
    }
    const unsigned int uiIterate = GetTickCount();

    // Delete the objects (the map itself is not modified)...
    for( T_MyMap::const_iterator it = myMap.begin(); it != myMap.end(); it++ ) {
        delete it->second;
    }
    const unsigned int uiDeleted = GetTickCount();

    myMap.clear();
    const unsigned int uiCleared = GetTickCount();

    DebugText( arg( "MapSize: %1 --- ",    iMapSize )
             + arg( "filled: %1 msecs; ",  uiFilled  - uiStart )
             + arg( "iterate: %1 msecs; ", uiIterate - uiFilled )
             + arg( "deleted: %1 msecs; ", uiDeleted - uiIterate )
             + arg( "cleared: %1 msecs; ", uiCleared - uiDeleted )
             + arg( "total: %1 msecs; ",   uiCleared - uiStart ) );
}
When I call this...
testSpeed( 100 );
testSpeed( 1000 );
testSpeed( 10000 );
testSpeed( 100000 );
testSpeed( 1000000 );
... this is the output I get:
MapSize: 100 --- filled: 0 msecs; iterate: 0 msecs; deleted: 16 msecs; cleared: 0 msecs; total: 16 msecs;
MapSize: 1000 --- filled: 31 msecs; iterate: 0 msecs; deleted: 15 msecs; cleared: 0 msecs; total: 46 msecs;
MapSize: 10000 --- filled: 203 msecs; iterate: 16 msecs; deleted: 47 msecs; cleared: 15 msecs; total: 281 msecs;
MapSize: 100000 --- filled: 2527 msecs; iterate: 250 msecs; deleted: 1435 msecs; cleared: 234 msecs; total: 4446 msecs;
MapSize: 1000000 --- filled: 34055 msecs; iterate: 2652 msecs; deleted: 172241 msecs; cleared: 1981 msecs; total: 210929 msecs;
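As an aside, a sketch (my addition, assuming C++14 for std::make_unique) of how the manual delete loop can be avoided entirely by letting the map own the objects:

#include <map>
#include <memory>

typedef std::map<int, std::unique_ptr<DummyObject> > T_MyOwningMap;

void testOwningMap( int iMapSize )               // hypothetical counterpart of testSpeed()
{
    T_MyOwningMap myMap;
    for ( int i = 0; i < iMapSize; i++ )
        myMap[i] = std::make_unique<DummyObject>();
    myMap.clear();  // destroys every DummyObject in one pass; no manual delete loop
}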

How to extract values from a "table-like" text file with awk

Hi all,
I have two input files like this:
file1:
#W #S #this line doesn't exist
110 170 Bias
110 200 Bias
110 215 Bias
110 320 Bias
125 170 Bias
125 200 Bias
125 215 Bias
125 320 Bias
135 170 Bias
135 200 Bias
135 215 Bias
135 320 Bias
140 170 Bias
140 200 Bias
140 215 Bias
140 320 Bias
file2:
FUNCTION BIAS ( W, S )
Bias = 0
IF AND ( W >= 0, W < 120 ) THEN
   IF ( S >= 0 ) THEN Bias = -1
   IF ( S >= 180 ) THEN Bias = -2
   IF ( S >= 190 ) THEN Bias = -3
   IF ( S >= 200 ) THEN Bias = -4
   IF ( S >= 210 ) THEN Bias = -5
   IF ( S >= 220 ) THEN Bias = -6
   IF ( S >= 240 ) THEN Bias = -7
ENDIF
IF AND ( W >= 120, W < 130 ) THEN
   IF ( S >= 0 ) THEN Bias = -11
   IF ( S >= 180 ) THEN Bias = -12
   IF ( S >= 190 ) THEN Bias = -13
   IF ( S >= 200 ) THEN Bias = -14
   IF ( S >= 210 ) THEN Bias = -15
   IF ( S >= 220 ) THEN Bias = -16
   IF ( S >= 240 ) THEN Bias = -17
ENDIF
IF AND ( W >= 130, W < 140 ) THEN
   IF ( S >= 0 ) THEN Bias = 1
   IF ( S >= 180 ) THEN Bias = 2
   IF ( S >= 190 ) THEN Bias = 3
   IF ( S >= 200 ) THEN Bias = 4
   IF ( S >= 210 ) THEN Bias = 5
   IF ( S >= 220 ) THEN Bias = 6
   IF ( S >= 240 ) THEN Bias = 7
ENDIF
IF ( W >= 140 ) THEN
   IF ( S >= 0 ) THEN Bias = 11
   IF ( S >= 180 ) THEN Bias = 12
   IF ( S >= 190 ) THEN Bias = 13
   IF ( S >= 200 ) THEN Bias = 14
   IF ( S >= 210 ) THEN Bias = 15
   IF ( S >= 220 ) THEN Bias = 16
   IF ( S >= 240 ) THEN Bias = 17
ENDIF
RETURN (Bias)
What I want to do is find the corresponding value of the math function BIAS(W,S) for each input (W,S) pair from file1.
For example: for W/S = 135/195, "W" satisfies
IF AND ( W >= 130, W < 140 )
so we will go to check "S"
IF ( S >= 0 ) THEN Bias = 1
IF ( S >= 180 ) THEN Bias = 2
IF ( S >= 190 ) THEN Bias = 3
IF ( S >= 200 ) THEN Bias = 4
IF ( S >= 210 ) THEN Bias = 5
IF ( S >= 220 ) THEN Bias = 6
IF ( S >= 240 ) THEN Bias = 7
then finally we find that S=195 falls between 190 and 200, so the value of BIAS(W,S) is 3.
What I want for the output is like this:
110 170 Bias -1
110 200 Bias -4
110 215 Bias -5
110 320 Bias -7
125 170 Bias -11
125 200 Bias -14
125 215 Bias -15
125 320 Bias -17
135 170 Bias 1
135 200 Bias 4
135 215 Bias 5
135 320 Bias 7
140 170 Bias 11
140 200 Bias 14
140 215 Bias 15
140 320 Bias 17
It's very easy to check by eye, but as you can see, file2 is basically a text file instead of a regular 2D-array numerical file. How can I extract the corresponding values? Any hint?
I just translated your logic into awk:
script.awk:
{
    w = $1;
    s = $2;
    if (w >= 0 && w < 120) {
        if ( s >= 0 )   { bias = -1 }
        if ( s >= 180 ) { bias = -2 }
        if ( s >= 190 ) { bias = -3 }
        if ( s >= 200 ) { bias = -4 }
        if ( s >= 210 ) { bias = -5 }
        if ( s >= 220 ) { bias = -6 }
        if ( s >= 240 ) { bias = -7 }
    }
    if (w >= 120 && w < 130) {
        if ( s >= 0 )   { bias = -11 }
        if ( s >= 180 ) { bias = -12 }
        if ( s >= 190 ) { bias = -13 }
        if ( s >= 200 ) { bias = -14 }
        if ( s >= 210 ) { bias = -15 }
        if ( s >= 220 ) { bias = -16 }
        if ( s >= 240 ) { bias = -17 }
    }
    if (w >= 130 && w < 140) {
        if ( s >= 0 )   { bias = 1 }
        if ( s >= 180 ) { bias = 2 }
        if ( s >= 190 ) { bias = 3 }
        if ( s >= 200 ) { bias = 4 }
        if ( s >= 210 ) { bias = 5 }
        if ( s >= 220 ) { bias = 6 }
        if ( s >= 240 ) { bias = 7 }
    }
    if (w >= 140) {
        if ( s >= 0 )   { bias = 11 }
        if ( s >= 180 ) { bias = 12 }
        if ( s >= 190 ) { bias = 13 }
        if ( s >= 200 ) { bias = 14 }
        if ( s >= 210 ) { bias = 15 }
        if ( s >= 220 ) { bias = 16 }
        if ( s >= 240 ) { bias = 17 }
    }
    print $0 " " bias;
}
Execution:
awk -f script.awk file1
110 170 Bias -1
110 200 Bias -4
110 215 Bias -5
110 320 Bias -7
125 170 Bias -11
125 200 Bias -14
125 215 Bias -15
125 320 Bias -17
135 170 Bias 1
135 200 Bias 4
135 215 Bias 5
135 320 Bias 7
140 170 Bias 11
140 200 Bias 14
140 215 Bias 15
140 320 Bias 17
Run the tst.awk script below on "file2" to convert the script from whatever language that is into awk, and save its output to a new file named "getbias.awk". Then run:
awk -f getbias.awk -f '<your script>' file1
where <your script> is your awk script that parses file1 and calls the generated getbias() function below to get the bias value for each line.
$ cat tst.awk
{
    sub(/BIAS/,"getbias")
    sub(/ENDIF/,"}")
    sub(/ THEN/,"")
    $0 = tolower($0)
}
/^function/ { sub(/\)/,",\tbias )"); $0 = $0 " {" }
/^return/   { $0 = $0 ORS "}" }
/^if/       { sub(/ and/,""); sub(/,/," \\&\\&"); $0 = $0 " {" }
{ print }
$ awk -f tst.awk file2
function getbias ( w, s , bias ) {
bias = 0
if ( w >= 0 && w < 120 ) {
   if ( s >= 0 ) bias = -1
   if ( s >= 180 ) bias = -2
   if ( s >= 190 ) bias = -3
   if ( s >= 200 ) bias = -4
   if ( s >= 210 ) bias = -5
   if ( s >= 220 ) bias = -6
   if ( s >= 240 ) bias = -7
}
if ( w >= 120 && w < 130 ) {
   if ( s >= 0 ) bias = -11
   if ( s >= 180 ) bias = -12
   if ( s >= 190 ) bias = -13
   if ( s >= 200 ) bias = -14
   if ( s >= 210 ) bias = -15
   if ( s >= 220 ) bias = -16
   if ( s >= 240 ) bias = -17
}
if ( w >= 130 && w < 140 ) {
   if ( s >= 0 ) bias = 1
   if ( s >= 180 ) bias = 2
   if ( s >= 190 ) bias = 3
   if ( s >= 200 ) bias = 4
   if ( s >= 210 ) bias = 5
   if ( s >= 220 ) bias = 6
   if ( s >= 240 ) bias = 7
}
if ( w >= 140 ) {
   if ( s >= 0 ) bias = 11
   if ( s >= 180 ) bias = 12
   if ( s >= 190 ) bias = 13
   if ( s >= 200 ) bias = 14
   if ( s >= 210 ) bias = 15
   if ( s >= 220 ) bias = 16
   if ( s >= 240 ) bias = 17
}
return (bias)
}