When allocating a lot of memory on 4 distinct NVIDIA V100 GPUs, I observe the following behavior with regard to parallelization via OpenMP:
Using the #pragma omp parallel for directive, and therefore making the cudaMalloc calls on each GPU in parallel, results in the same performance as doing it completely serially. I tested this and validated the same effect on two HPC systems: an IBM Power AC922 and an AWS EC2 p3dn.24xlarge. (The numbers below are from the Power machine.)
./test 4000000000
# serial
GPU 0: 0.472018550
GPU 1: 0.325776811
GPU 2: 0.334342752
GPU 3: 0.337432169
total: 1.469773541
# parallel
GPU 0: 1.199741600
GPU 2: 1.200597044
GPU 3: 1.200619017
GPU 1: 1.482700315
total: 1.493352924
How can I make the parallelization faster?
Here is my code:
#include <chrono>
#include <iomanip>
#include <iostream>

int main(int argc, char* argv[]) {
    size_t num_elements = std::stoull(argv[1]);

    auto t0s = std::chrono::high_resolution_clock::now();
    #pragma omp parallel for
    for (int i = 0; i < 4; ++i)
    {
        auto t0is = std::chrono::high_resolution_clock::now();
        cudaSetDevice(i);
        int* ptr;
        cudaMalloc((void**)&ptr, sizeof(int) * num_elements);
        auto t1is = std::chrono::high_resolution_clock::now();
        std::cout << "GPU " << i << ": " << std::fixed << std::setprecision(9)
                  << std::chrono::duration<double>(t1is - t0is).count() << std::endl;
    }
    auto t1s = std::chrono::high_resolution_clock::now();
    std::cout << "total: " << std::fixed << std::setprecision(9)
              << std::chrono::duration<double>(t1s - t0s).count() << std::endl;
    return 0;
}
You can compile the microbenchmark with:
nvcc -std=c++11 -Xcompiler -fopenmp -O3 test.cu -o test
I also tried std::thread instead of OpenMP with the same results.
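For reference, the std::thread variant would look roughly like this (a sketch reconstructed from the OpenMP loop body above, not the exact code that was tested; timing omitted for brevity):

#include <thread>
#include <vector>

// Sketch: one thread per GPU, each running the same body as the OpenMP loop.
void alloc_on_device(int device, size_t num_elements) {
    cudaSetDevice(device);
    int* ptr;
    cudaMalloc((void**)&ptr, sizeof(int) * num_elements);
}

// ... in main():
std::vector<std::thread> threads;
for (int i = 0; i < 4; ++i)
    threads.emplace_back(alloc_on_device, i, num_elements);
for (auto& t : threads)
    t.join();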
I'm trying to test the speedup of OpenMP on an array-sum program.
The elements are generated with a random generator to avoid compiler optimization, and the array is large enough to show a performance difference.
The program is built with g++ -fopenmp -g -O0 -o main main.cpp (-g -O0 are used to prevent optimization).
However, the OpenMP parallel for code is significantly slower than the sequential code.
Test result:
Your thread count is: 12
Filling arrays
filling time:66718888
Now running omp code
2thread omp time:11154095
result: 4294903886
Now running omp code
4thread omp time:10832414
result: 4294903886
Now running omp code
6thread omp time:11165054
result: 4294903886
Now running sequential code
sequential time: 3525371
result: 4294903886
#include <iostream>
#include <stdio.h>
#include <omp.h>
#include <ctime>
#include <random>

using namespace std;

long long llsum(char *vec, size_t size, int threadCount) {
    long long result = 0;
    size_t i;
    #pragma omp parallel for num_threads(threadCount) reduction(+: result) schedule(guided)
    for (i = 0; i < size; ++i) {
        result += vec[i];
    }
    return result;
}

int main(int argc, char **argv) {
    int threadCount = 12;
    omp_set_num_threads(threadCount);
    cout << "Your thread count is: " << threadCount << endl;

    const size_t TEST_SIZE = 8000000000;
    char *testArray = new char[TEST_SIZE];

    std::mt19937 rng;
    rng.seed(std::random_device()());
    std::uniform_int_distribution<std::mt19937::result_type> dist6(0, 4);

    cout << "Filling arrays\n";
    auto fillingStartTime = clock();
    for (int i = 0; i < TEST_SIZE; ++i) {  // note: int overflows for TEST_SIZE > INT_MAX; fixed below
        testArray[i] = dist6(rng);
    }
    auto fillingEndTime = clock();
    auto fillingTime = fillingEndTime - fillingStartTime;
    cout << "filling time:" << fillingTime << endl;

    // test omp time
    for (int i = 1; i <= 3; ++i) {
        cout << "Now running omp code\n";
        auto ompStartTime = clock();
        auto ompResult = llsum(testArray, TEST_SIZE, i * 2);
        auto ompEndTime = clock();
        auto ompTime = ompEndTime - ompStartTime;
        cout << i * 2 << "thread omp time:" << ompTime << endl << "result: " << ompResult << endl;
    }

    // test sequential addition time
    cout << "Now running sequential code\n";
    auto seqStartTime = clock();
    long long expectedResult = 0;
    for (int i = 0; i < TEST_SIZE; ++i) {  // same int-overflow issue as above
        expectedResult += testArray[i];
    }
    auto seqEndTime = clock();
    auto seqTime = seqEndTime - seqStartTime;
    cout << "sequential time: " << seqTime << endl << "result: " << expectedResult << endl;

    delete[] testArray;
    return 0;
}
As pointed out by @High Performance Mark, I should use omp_get_wtime() instead of clock(): clock() measures 'active processor time' (CPU time summed over all threads), not 'elapsed wall-clock time'.
See
OpenMP time and clock() give two different results
https://en.cppreference.com/w/c/chrono/clock
After switching to omp_get_wtime() and changing the int i loop counters to size_t i (a sketch of the fix follows the output below), the result is more meaningful:
Your thread count is: 12
Filling arrays
filling time:267.038
Now running omp code
2thread omp time:26.1421
result: 15999820788
Now running omp code
4thread omp time:7.16911
result: 15999820788
Now running omp code
6thread omp time:5.66505
result: 15999820788
Now running sequential code
sequential time: 30.4056
result: 15999820788
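A minimal sketch of the two fixes described above (my reconstruction, not the exact updated code):

// Wall-clock timing instead of per-process CPU time:
double start = omp_get_wtime();
long long result = llsum(testArray, TEST_SIZE, 6);
double elapsed = omp_get_wtime() - start;  // seconds, not clock ticks

// Loop counters must be size_t (or long long): with int i, incrementing
// past INT_MAX is undefined behavior, and TEST_SIZE = 8000000000 > INT_MAX.
for (size_t i = 0; i < TEST_SIZE; ++i) {
    testArray[i] = dist6(rng);
}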
I'm trying to parallelize some old code using the execution policies from C++17. My sample code is below:
#include <cstdlib>
#include <chrono>
#include <iostream>
#include <algorithm>
#include <execution>
#include <vector>

using Clock = std::chrono::high_resolution_clock;
using Duration = std::chrono::duration<double>;

constexpr auto NUM = 100'000'000U;

double func()
{
    return rand();
}

int main()
{
    std::vector<double> v(NUM);

    // ------ feature testing
    std::cout << "__cpp_lib_execution : " << __cpp_lib_execution << std::endl;
    std::cout << "__cpp_lib_parallel_algorithm: " << __cpp_lib_parallel_algorithm << std::endl;

    // ------ fill the vector with random numbers sequentially
    auto const startTime1 = Clock::now();
    std::generate(std::execution::seq, v.begin(), v.end(), func);
    Duration const elapsed1 = Clock::now() - startTime1;
    std::cout << "std::execution::seq: " << elapsed1.count() << " sec." << std::endl;

    // ------ fill the vector with random numbers in parallel
    auto const startTime2 = Clock::now();
    std::generate(std::execution::par, v.begin(), v.end(), func);
    Duration const elapsed2 = Clock::now() - startTime2;
    std::cout << "std::execution::par: " << elapsed2.count() << " sec." << std::endl;
}
The program output on my Linux desktop:
__cpp_lib_execution : 201902
__cpp_lib_parallel_algorithm: 201603
std::execution::seq: 0.971162 sec.
std::execution::par: 25.0349 sec.
Why does the parallel version perform 25 times worse than the sequential one?
Compiler: g++ (Ubuntu 10.3.0-1ubuntu1~20.04) 10.3.0
The thread-safety of rand is implementation-defined, which means one of two things:
Your code has a data race in the parallel case, or
rand is effectively serialized behind a highly contended lock, which dramatically increases the overhead in the parallel case and gives incredibly poor performance.
Based on your results, I'm guessing #2 applies, but it could be both.
Either way, the answer is: rand is a terrible test case for parallelism.
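If the goal is just a cheap generator to benchmark the parallel algorithms, a race-free sketch is to give each thread its own engine via thread_local (the name and seeding scheme here are illustrative choices, not a recommendation):

#include <cstdlib>   // RAND_MAX
#include <random>

// One independent generator per thread: no shared state, so no data race
// and no lock to contend on.
double thread_safe_func()
{
    thread_local std::mt19937 gen{std::random_device{}()};
    std::uniform_real_distribution<double> dist(0.0, RAND_MAX);
    return dist(gen);
}

// usage: std::generate(std::execution::par, v.begin(), v.end(), thread_safe_func);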
I'm trying to emplace_back into a vector in a loop with OpenMP. I took my inspiration from this post: C++ OpenMP Parallel For Loop - Alternatives to std::vector. So I wrote a test code:
// Example program
#include <iostream>
#include <string>
#include <vector>
#include <random>
#include <chrono>
#include <omp.h>

int main()
{
    std::cout << "Numbers of thread available : " << omp_get_max_threads() << std::endl;

    std::random_device dev;
    std::mt19937 gen(dev());
    std::uniform_int_distribution<unsigned> distrib(1, 5);

    {
        std::chrono::time_point<std::chrono::system_clock> start, end;
        start = std::chrono::system_clock::now();

        std::vector<std::pair<uint32_t, uint32_t> > result;
        #pragma omp declare reduction (merge : std::vector<std::pair<uint32_t, uint32_t> > : omp_out.insert(omp_out.end(), std::make_move_iterator(omp_in.begin()), std::make_move_iterator(omp_in.end())))
        #pragma omp parallel for reduction(merge: result)
        for(int i=0; i<100000000; ++i)
        {
            if(distrib(gen) == 1)
            {
                result.emplace_back(std::make_pair(distrib(gen), distrib(gen)));
            }
        }

        end = std::chrono::system_clock::now();
        auto elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        std::cout << "With openmp " << " : " << elapsed_seconds << "ms\n";
    }
    {
        std::chrono::time_point<std::chrono::system_clock> start, end;
        start = std::chrono::system_clock::now();

        std::vector<std::pair<uint32_t, uint32_t> > result;
        for(int i=0; i<100000000; ++i)
        {
            if(distrib(gen) == 1)
            {
                result.emplace_back(std::make_pair(distrib(gen), distrib(gen)));
            }
        }

        end = std::chrono::system_clock::now();
        auto elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        std::cout << "Without openmp " << " : " << elapsed_seconds << "ms\n";
    }
}
I compile this code with
g++ -o main -std=c++17 -fopenmp main.cpp
and the output is :
Numbers of thread available : 12
With openmp : 3982ms
Without openmp : 3887ms
Obviously, I don't get any speedup with my OpenMP implementation. Why?
The current code has undefined behavior: the shared std::mt19937 generator gen is read and modified concurrently by every thread, so the parallelized loop contains mostly-implicit dependencies. As a result, an OpenMP implementation is free to generate a fast but completely "wrong" program or a slow "correct" one.
To get a correct implementation and a not-too-bad speedup using OpenMP, one solution is to replicate the generator/distribution in each worker (by moving the variable declarations into a #pragma omp parallel section) and to add a critical section (using #pragma omp critical) around the (sequential) emplace_back.
Due to possible false sharing and lock contention, that naive-critical-section version may scale poorly. It is probably better to fill thread-private arrays and merge the sub-arrays into one big shared array at the end, as sketched below (note, however, that even this is not ideal, since the computation will likely be limited by the speed of the shared memory).
Please note that the result can differ from the sequential implementation when a specific seed needs to be used (here it is not a problem, since the seed is extracted from random_device).
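A minimal sketch of that thread-private-array approach (per-thread generator, per-thread buffer, one merge per thread at the end; the seeding is an illustrative choice):

#include <cstdint>
#include <random>
#include <utility>
#include <vector>
#include <omp.h>

int main()
{
    std::vector<std::pair<uint32_t, uint32_t>> result;

    #pragma omp parallel
    {
        // Thread-private state: nothing in the hot loop is shared.
        std::mt19937 gen(std::random_device{}() + omp_get_thread_num());
        std::uniform_int_distribution<unsigned> distrib(1, 5);
        std::vector<std::pair<uint32_t, uint32_t>> local;

        #pragma omp for nowait
        for (int i = 0; i < 100000000; ++i)
            if (distrib(gen) == 1)
                local.emplace_back(distrib(gen), distrib(gen));

        // One serialized merge per thread, cheap next to the loop above.
        #pragma omp critical
        result.insert(result.end(),
                      std::make_move_iterator(local.begin()),
                      std::make_move_iterator(local.end()));
    }
    return 0;
}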
I'm having a bit of a frustrating problem with OpenMP. When I run the following code it only seems to be running on one thread.
omp_set_num_threads(8);
#pragma omp parallel for schedule(dynamic)
for(size_t i = 0; i < jobs.size(); i++) //jobs is a vector
{
    std::cout << omp_get_thread_num() << "\t" << omp_get_num_threads() << "\t" << omp_in_parallel() << std::endl;
    jobs[i].run();
}
This prints...
0 1 1
for every line.
Using top, I can see that OpenMP is spawning as many threads as the process is taskset to, but they are mostly idle while it runs. The program is both compiled and linked with the -fopenmp flag with gcc, on Red Hat 6. I also tried adding the num_threads(8) clause to the pragma, which made no difference. The program is linked against another library that also uses OpenMP, so maybe that is the issue. Does anyone know what might cause this behavior? In all my past OpenMP experience it has just worked.
Can you print your jobs.size()?
I made a quick test and it does work:
#include <stdio.h>
#include <omp.h>
#include <iostream>

int main()
{
    omp_set_num_threads(2);
    #pragma omp parallel for ordered schedule(dynamic)
    for(size_t i = 0; i < 4; i++)
    {
        #pragma omp ordered
        std::cout << i << "\t" << omp_get_thread_num() << "\t" << omp_get_num_threads() << "\t" << omp_in_parallel() << std::endl;
    }
    return 0;
}
I got:
icpc -qopenmp test.cpp && ./a.out
0 0 2 1
1 1 2 1
2 0 2 1
3 1 2 1
I'm attempting to write a parallel vector fill, using the following code:
#include <iostream>
#include <thread>
#include <vector>
#include <chrono>
#include <algorithm>
using namespace std;
using namespace std::chrono;
void fill_part(vector<double> & v, int ii, int num_threads)
{
    fill(v.begin() + ii*v.size()/num_threads, v.begin() + (ii+1)*v.size()/num_threads, 0);
}

int main()
{
    vector<double> v(200*1000*1000);

    high_resolution_clock::time_point t = high_resolution_clock::now();
    fill(v.begin(), v.end(), 0);
    duration<double> d = high_resolution_clock::now() - t;
    cout << "Filling the vector took " << duration_cast<milliseconds>(d).count()
         << " ms in serial.\n";

    unsigned num_threads = thread::hardware_concurrency() ? thread::hardware_concurrency() : 1;
    cout << "Num threads: " << num_threads << '\n';

    vector<thread> threads;
    t = high_resolution_clock::now();
    for(int ii = 0; ii < num_threads; ++ii)
    {
        threads.emplace_back(fill_part, std::ref(v), ii, num_threads);
    }
    for(auto & t : threads)
    {
        if(t.joinable()) t.join();
    }
    d = high_resolution_clock::now() - t;
    cout << "Filling the vector took " << duration_cast<milliseconds>(d).count()
         << " ms in parallel.\n";
}
I tried this code on four different architectures (all Intel CPUs, but no matter).
The first had 4 CPUs and the parallelization gave no speedup; the second had 4 and was 4 times as fast; the third had 4 and was twice as fast; the last had 2 and gave no speedup.
My hypothesis is that the differences come down to whether a single core can saturate the RAM bus, but is this correct? How can I predict which architectures will benefit from this parallelization?
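For scale (my arithmetic, not from the original post): the vector holds 200,000,000 doubles, i.e. 200e6 × 8 B = 1.6 GB of writes. A serial fill finishing in, say, 100 ms is therefore writing at 16 GB/s, which is already on the order of a single memory channel's bandwidth on many desktop parts. If one core can sustain that rate, additional threads mostly queue behind the same bus, which would match the machines that showed no speedup.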
Bonus question: The void fill_part function is awkward, so I wanted to do it with a lambda:
threads.emplace_back([&]{fill(v.begin() + ii*v.size()/num_threads, v.begin() + (ii+1)*v.size()/num_threads, 0); });
This compiles but terminates with a bus error; what's wrong with the lambda syntax?
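A likely culprit (my reading; the thread gives no answer): [&] captures ii and num_threads by reference, so each worker reads ii from the main thread while the loop keeps incrementing it, and once the loop ends ii is destroyed, leaving a dangling reference; the computed iterator offsets are then garbage. Capturing the indices by value should fix it:

threads.emplace_back([&v, ii, num_threads]{
    fill(v.begin() + ii*v.size()/num_threads,
         v.begin() + (ii+1)*v.size()/num_threads, 0);
});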