1D Finite Difference Wave Equation in CUDA/C++

I am new to CUDA. I am trying to solve the wave equation with an initial condition in the form of a Ricker wavelet. The code achieves 12 GFlops, although my GPU's peak performance is about 3900 GFlops. Why is my code so inefficient, and how can I fix it?
main.cu
#include <iostream>
#include <cmath>
#include "step.cu"
#include <cuda.h>
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
    if (argc <= 3)
    {
        perror("Error in argc: argc<=3 (wait h, tau, C) \n");
        exit(1);
    }
    char *eptr;
    errno = 0;
    long long int size, tmax;
    double tau, cour, h, C, cour2;
    h = std::strtod(argv[1], &eptr);
    tau = std::strtod(argv[2], &eptr);
    C = std::strtod(argv[3], &eptr);
    tmax = 2000;
    cour = C*tau/h;
    cour2 = cour*cour;
    size = 18*13*1024;
    double *nxt_layer = nullptr;
    double *layer_1 = nullptr;
    double *layer_2 = nullptr;
    double *rev_layer = nullptr;
    dim3 blockSize = dim3(1024);
    dim3 gridSize = dim3(size/blockSize.x);
    float time;
    cudaTimer timer;
    cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);
    if (ret != cudaSuccess)
    {
        std::cout << cudaGetErrorString(ret) << std::endl;
        return 1;
    }
    ret = cudaMallocManaged(&layer_1, sizeof(double) * size);
    if (ret != cudaSuccess)
    {
        std::cout << cudaGetErrorString(ret) << std::endl;
        return 1;
    }
    ret = cudaMallocManaged(&layer_2, sizeof(double) * size);
    if (ret != cudaSuccess)
    {
        std::cout << cudaGetErrorString(ret) << std::endl;
        return 1;
    }
    for (int i = 0; i < size; ++i)
    {
        layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
    }
    for (int i = 1; i < size/2; ++i)
    {
        nxt_layer[i] = layer_1[i+1]+0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
    }
    nxt_layer[0] = 0; nxt_layer[size-1] = 0;
    for (int i = size/2; i < size-1; ++i)
    {
        nxt_layer[i] = layer_1[i+1]+0.25*0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
    }
    for (int i = 0; i < size-1; ++i)
    {
        layer_2[i] = layer_1[i];
        layer_1[i] = nxt_layer[i];
    }
    nxt_layer[0] = 0; nxt_layer[size-1] = 0;
    timer.start();
    for (double t = 0; t < tmax; t = t + tau)
    {
        step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
        if (CHECK_ERROR(cudaDeviceSynchronize()))
            throw(-1);
        nxt_layer[size-1] = 0;
        nxt_layer[0] = 0;
    }
    time = timer.stop();
    for (int i = 0; i < size; ++i)
    {
        cout << i*h << " " << nxt_layer[i] << endl;
    }
}
step.cu
inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
    // __fma_rd is the double-precision FMA; __fmaf_rd (as originally written)
    // is the float version and silently truncates double arguments.
    return __fma_rd(cour2, layer_1_tmp[0] + layer_1_tmp[2],
                    __fma_rd(2.0 - 2*cour2, layer_1_tmp[1], -layer_2_tmp));
}

__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
    int node = threadIdx.x + blockDim.x * blockIdx.x;
    if (node >= Nx-1 || node <= 0) return;

    double layer_1_tmp[3];
    layer_1_tmp[0] = layer_1[node-1];
    layer_1_tmp[1] = layer_1[node];
    layer_1_tmp[2] = layer_1[node+1];
    double layer_2_tmp = layer_2[node];

    if (node <= Nx/2)
    {
        tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
    }
    else
    {
        tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
    }

    layer_2[node] = layer_1[node];
    layer_1[node] = tmp_layer[node];
}
I calculate GFlops as:
long long int performance = size*tmax/tau;        // total node updates
long long int perftime = 1000*performance/time;   // time is in ms
double gflops = (8*performance/time)/1000000;     // ~8 flops per node update
I would be grateful for any of your comments and tips.

In the kernel, each thread is doing only a few multiplications and additions. That is negligible compared to the kernel launch overhead per CUDA thread and the memory access latency per layer_1 element; it's the equivalent of measuring a few nanoseconds within microseconds of kernel time. Try a clock measurement around the compute() calls, inside the kernel. It would at least give some "cycles per compute" figure, and from that you can estimate the performance of the compute call itself.
// Device code, inside the kernel: clock() reads the SM's cycle counter.
// timings would be an additional device buffer, one slot per thread.
clock_t c1 = clock();
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
clock_t c2 = clock();
timings[node] = c2 - c1;
Even this is not a true performance measurement, as it doesn't take pipelining into consideration when multiple compute calls are made one after another. You may add another compute call after the first one and gain even more performance per call due to pipelining and latency hiding, as in the sketch below.
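A minimal sketch of that idea (not the author's kernel; step2 is a hypothetical variant that assumes the grid is launched with half as many threads, and it omits the boundary halving of cour2 and the layer rotation):
__global__ void step2(double *tmp_layer, double *layer_1, double *layer_2,
                      double cour2, int Nx)
{
    // Two independent stencil updates per thread, so the second can issue
    // while the first is still waiting on its loads.
    int node = 2 * (threadIdx.x + blockDim.x * blockIdx.x) + 1;
    if (node + 1 >= Nx - 1) return;

    double a = cour2 * (layer_1[node-1] + layer_1[node+1])
             + (2.0 - 2.0*cour2) * layer_1[node]   - layer_2[node];
    double b = cour2 * (layer_1[node]   + layer_1[node+2])
             + (2.0 - 2.0*cour2) * layer_1[node+1] - layer_2[node+1];

    tmp_layer[node]     = a;
    tmp_layer[node + 1] = b;
}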

Many (more consumer-oriented or semi-professional) graphics cards have better single precision than double precision performance. The single precision performance of the GTX 970 is 32x as high as its double precision performance.
Change the used data types from double to float.
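A minimal sketch of that change for the compute helper (the host allocations and kernel signature would switch from double to float in the same way; fmaf is the single-precision FMA):
inline __device__ float compute(const float *layer_1_tmp, float layer_2_tmp, float cour2)
{
    // Same stencil as before, in single precision.
    return fmaf(cour2, layer_1_tmp[0] + layer_1_tmp[2],
                fmaf(2.0f - 2.0f*cour2, layer_1_tmp[1], -layer_2_tmp));
}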

Related

Why is multi-threading of matrix calculation not faster than single-core?

This is my first time using multi-threading to speed up a heavy calculation.
Background: The idea is to calculate a Kernel Covariance matrix, by reading a list of 3D points x_test and calculating the corresponding matrix, which has dimensions x_test.size() x x_test.size().
I already sped up the calculations by only calculating the lower triangular matrix. Since all the calculations are independent of each other, I tried to speed up the process (x_test.size() = 27000 in my case) by splitting the calculation of the matrix entries row-wise, assigning a range of rows to each thread.
On a single core the calculations took about 280 seconds each time, on 4 cores it took 270-290 seconds.
main.cpp
int main(int argc, char *argv[]) {
    double sigma0sq = 1;
    double lengthScale [] = {0.7633, 0.6937, 3.3307e+07};
    const std::vector<std::vector<double>> x_test = parse2DCsvFile(inputPath);

    /* Finding data slices of similar size */
    //This piece of code works, each thread is assigned roughly the same number of matrix entries
    int numElements = x_test.size()*x_test.size()/2;
    const int numThreads = 4;
    int elemsPerThread = numElements / numThreads;
    std::vector<int> indices;
    int j = 0;
    for(std::size_t i=1; i<x_test.size()+1; ++i){
        int prod = i*(i+1)/2 - j*(j+1)/2;
        if (prod > elemsPerThread) {
            i--;
            j = i;
            indices.push_back(i);
            if(indices.size() == numThreads-1)
                break;
        }
    }
    indices.insert(indices.begin(), 0);
    indices.push_back(x_test.size());

    /* Spreading calculations to multiple threads */
    std::vector<std::thread> threads;
    for(std::size_t i = 1; i < indices.size(); ++i){
        threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices.at(i-1), indices.at(i)));
    }
    for(auto & th: threads){
        th.join();
    }
    return 0;
}
As you can see, each thread performs the following calculations on the data assigned to it:
void calculateKMatrixCpp(const std::vector<std::vector<double>> xtest, double lengthScale[], double sigma0sq, int threadCounter, int start, int stop){
    char buffer[8192];
    std::ofstream out("lower_half_matrix_" + std::to_string(threadCounter) +".csv");
    out.rdbuf()->pubsetbuf(buffer, 8192); // must match the buffer size above
    for(int i = start; i < stop; ++i){
        for(int j = 0; j < i+1; ++j){
            double kij = seKernel(xtest.at(i), xtest.at(j), lengthScale, sigma0sq);
            if (j != 0)
                out << ',';
            out << kij;
        }
        if(i != xtest.size()-1)
            out << '\n';
    }
    out.close();
}
and
double seKernel(const std::vector<double> x1, const std::vector<double> x2, double lengthScale[], double sigma0sq) {
    double sum(0);
    for(std::size_t i=0; i<x1.size(); i++){
        sum += pow((x1.at(i)-x2.at(i))/lengthScale[i], 2);
    }
    return sigma0sq*exp(-0.5*sum);
}
Aspects I considered
locking by simultaneous access to data vector -> I don't pass a reference to the threads, but a copy of the data. I know this is not optimal in terms of RAM usage, but as far as I know this should prevent simultaneous data access since every thread has its own copy
Output -> every thread writes its part of the lower triangular matrix to its own file. My task manager doesn't indicate a full SSD utilization in the slightest
Compiler and machine
Windows 11
GNU GCC Compiler
Code::Blocks (although I don't think that should be of importance)
There are many details that can be improved in your code, but I think the two biggest issues are:
using vectors of vectors, which leads to fragmented data;
writing each piece of data to file as soon as its value is computed.
The first point is easy to fix: use something like std::vector<std::array<double, 3>>. In the code below I use an alias to make it more readable:
using Point3D = std::array<double, 3>;
std::vector<Point3D> x_test;
The second point is slightly harder to address. I assume you wanted to write to the disk inside each thread because you couldn't manage to write to a shared buffer that you could then write to a file.
Here is a way to do exactly that:
void calculateKMatrixCpp(
    std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq,
    int threadCounter, int start, int stop, std::vector<double>& kMatrix
) {
    // ...
    double& kij = kMatrix[i * xtest.size() + j];
    kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
    // ...
}

// ...
threads.push_back(std::thread(
    calculateKMatrixCpp, x_test, lengthScale, sigma0sq,
    i, indices[i-1], indices[i], std::ref(kMatrix)
));
Here, kMatrix is the shared buffer and represents the whole matrix you are trying to compute. You need to pass it to the thread via std::ref. Each thread will write to a different location in that buffer, so there is no need for any mutex or other synchronization.
Once you make these changes and try to write kMatrix to the disk, you will realize that this is the part that takes the most time, by far.
Below is the full code I tried on my machine, and the computation time was about 2 seconds whereas the writing-to-file part took 300 seconds! No amount of multithreading can speed that up.
If you truly want to write all that data to the disk, you may have some luck with file mapping. Computing the exact size needed should be easy enough if all values have the same number of digits, and it looks like you could then write the values with multithreading. I have never done anything like that, so I can't really say much more about it, but it looks to me like the fastest way to write multiple gigabytes of memory to the disk. (A rough sketch of the idea follows the full code below.)
#include <vector>
#include <thread>
#include <iostream>
#include <string>
#include <cmath>
#include <array>
#include <random>
#include <fstream>
#include <chrono>

using Point3D = std::array<double, 3>;

auto generateSampleData() -> std::vector<Point3D> {
    static std::minstd_rand g(std::random_device{}());
    std::uniform_real_distribution<> d(-1.0, 1.0);
    std::vector<Point3D> data;
    data.reserve(27000);
    for (auto i = 0; i < 27000; ++i) {
        data.push_back({ d(g), d(g), d(g) });
    }
    return data;
}

double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
    double sum = 0.0;
    for (auto i = 0u; i < 3u; ++i) {
        double distance = (x1[i] - x2[i]) / lengthScale[i];
        sum += distance*distance;
    }
    return sigma0sq * std::exp(-0.5*sum);
}

void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::vector<double>& kMatrix) {
    std::cout << "start of thread " << threadCounter << "\n" << std::flush;
    for(int i = start; i < stop; ++i) {
        for(int j = 0; j < i+1; ++j) {
            double& kij = kMatrix[i * xtest.size() + j];
            kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
        }
    }
    std::cout << "end of thread " << threadCounter << "\n" << std::flush;
}

int main() {
    double sigma0sq = 1;
    Point3D lengthScale = {0.7633, 0.6937, 3.3307e+07};
    const std::vector<Point3D> x_test = generateSampleData();

    /* Finding data slices of similar size */
    //This piece of code works, each thread is assigned roughly the same number of matrix entries
    int numElements = x_test.size()*x_test.size()/2;
    const int numThreads = 4;
    int elemsPerThread = numElements / numThreads;
    std::vector<int> indices;
    int j = 0;
    for(std::size_t i = 1; i < x_test.size()+1; ++i){
        int prod = i*(i+1)/2 - j*(j+1)/2;
        if (prod > elemsPerThread) {
            i--;
            j = i;
            indices.push_back(i);
            if(indices.size() == numThreads-1)
                break;
        }
    }
    indices.insert(indices.begin(), 0);
    indices.push_back(x_test.size());

    auto start = std::chrono::system_clock::now();
    std::vector<double> kMatrix(x_test.size() * x_test.size(), 0.0);
    std::vector<std::thread> threads;
    for (std::size_t i = 1; i < indices.size(); ++i) {
        threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::ref(kMatrix)));
    }
    for (auto& t : threads) {
        t.join();
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed_seconds = std::chrono::duration<double>(end - start).count();
    std::cout << "computation time: " << elapsed_seconds << "s" << std::endl;

    start = std::chrono::system_clock::now();
    constexpr int buffer_size = 131072;
    char buffer[buffer_size];
    std::ofstream out("matrix.csv");
    out.rdbuf()->pubsetbuf(buffer, buffer_size);
    for (int i = 0; i < x_test.size(); ++i) {
        for (int j = 0; j < i + 1; ++j) {
            if (j != 0) {
                out << ',';
            }
            out << kMatrix[i * x_test.size() + j];
        }
        if (i != x_test.size() - 1) {
            out << '\n';
        }
    }
    end = std::chrono::system_clock::now();
    elapsed_seconds = std::chrono::duration<double>(end - start).count();
    std::cout << "writing time: " << elapsed_seconds << "s" << std::endl;
}
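Purely as an illustration of the file-mapping idea mentioned above, here is a hedged sketch assuming POSIX mmap and a fixed field width, so every (i, j) entry has a computable byte offset; writeMapped and valueWidth are hypothetical names, valueWidth must be large enough for the "%.6g" output (e.g. 13), and error checks are omitted:
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstring>

// Sketch: pre-size a file, map it, and format entries into disjoint byte
// ranges; no locks needed because every field's offset is known in advance.
void writeMapped(const char* path, const double* kMatrix,
                 std::size_t rows, std::size_t cols, int valueWidth)
{
    const std::size_t fieldLen = valueWidth + 1;   // padded value + separator
    const std::size_t fileSize = rows * cols * fieldLen;
    int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    ftruncate(fd, fileSize);                       // pre-size the file
    char* map = static_cast<char*>(
        mmap(nullptr, fileSize, PROT_WRITE, MAP_SHARED, fd, 0));
    for (std::size_t i = 0; i < rows; ++i) {       // this loop can be split across threads
        for (std::size_t j = 0; j < cols; ++j) {
            char buf[64];
            std::snprintf(buf, sizeof buf, "%*.6g", valueWidth,
                          kMatrix[i * cols + j]);
            char* dst = map + (i * cols + j) * fieldLen;
            std::memcpy(dst, buf, valueWidth);     // fixed-width field
            dst[valueWidth] = (j == cols - 1) ? '\n' : ',';
        }
    }
    munmap(map, fileSize);
    close(fd);
}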
OK, I've written an implementation with optimized formatting.
Using @Nelfeal's code, a run took around 250 seconds on my system, with the write time dominating by far (or rather, the std::ofstream formatting taking most of the time).
I've written a C++20 version via std::format_to/std::format. It is a multi-threaded version that takes around 25-40 seconds to complete all the computations, formatting, and writing. Run in a single thread, it takes around 70 seconds on my system. The same performance should be achievable via the fmt library on C++11/14/17.
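For that pre-C++11/14/17 route, the buffered row formatting would look roughly like this with {fmt} (a sketch; writeRow is a hypothetical helper, and the API shown is the modern fmt::format_to/back_inserter form):
#include <fmt/format.h>
#include <fstream>
#include <iterator>

// Same buffered-formatting idea as the std::format version below,
// expressed with fmt::memory_buffer.
void writeRow(std::ofstream& out, const double* row, std::size_t n) {
    fmt::memory_buffer buffer;
    for (std::size_t j = 0; j + 1 < n; ++j)
        fmt::format_to(std::back_inserter(buffer), "{:.6g}, ", row[j]);
    fmt::format_to(std::back_inserter(buffer), "{:.6g}\n", row[n - 1]);
    out.write(buffer.data(), buffer.size());
}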
Here is the code:
import <vector>;
import <thread>;
import <iostream>;
import <string>;
import <cmath>;
import <array>;
import <random>;
import <fstream>;
import <chrono>;
import <format>;
import <filesystem>;

using Point3D = std::array<double, 3>;

auto generateSampleData(Point3D scale) -> std::vector<Point3D>
{
    static std::minstd_rand g(std::random_device{}());
    std::uniform_real_distribution<> d(-1.0, 1.0);
    std::vector<Point3D> data;
    data.reserve(27000);
    for (auto i = 0; i < 27000; ++i)
    {
        data.push_back({ d(g)*scale[0], d(g)*scale[1], d(g)*scale[2] });
    }
    return data;
}

double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
    double sum = 0.0;
    for (auto i = 0u; i < 3u; ++i) {
        double distance = (x1[i] - x2[i]) / lengthScale[i];
        sum += distance * distance;
    }
    return sigma0sq * std::exp(-0.5 * sum);
}

void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::filesystem::path localPath)
{
    using namespace std::string_view_literals;
    std::vector<char> buffer;
    buffer.reserve(15'000);
    std::ofstream out(localPath);
    std::cout << std::format("starting thread {}: from {} to {}\n"sv, threadCounter, start, stop);
    for (int i = start; i < stop; ++i)
    {
        for (int j = 0; j < i; ++j)
        {
            double kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
            std::format_to(std::back_inserter(buffer), "{:.6g}, "sv, kij);
        }
        double kii = seKernel(xtest[i], xtest[i], lengthScale, sigma0sq);
        std::format_to(std::back_inserter(buffer), "{:.6g}\n"sv, kii);
        out.write(buffer.data(), buffer.size());
        buffer.clear();
    }
}

int main() {
    double sigma0sq = 1;
    Point3D lengthScale = { 0.7633, 0.6937, 3.3307e+07 };
    const std::vector<Point3D> x_test = generateSampleData(lengthScale);

    /* Finding data slices of similar size */
    //This piece of code works, each thread is assigned roughly the same number of matrix entries
    int numElements = x_test.size() * (x_test.size()+1) / 2;
    const int numThreads = 3;
    int elemsPerThread = numElements / numThreads;
    std::vector<int> indices;
    int j = 0;
    for (std::size_t i = 1; i < x_test.size() + 1; ++i) {
        int prod = i * (i + 1) / 2 - j * (j + 1) / 2;
        if (prod > elemsPerThread) {
            i--;
            j = i;
            indices.push_back(i);
            if (indices.size() == numThreads - 1)
                break;
        }
    }
    indices.insert(indices.begin(), 0);
    indices.push_back(x_test.size());

    auto start = std::chrono::system_clock::now();
    std::vector<std::thread> threads;
    using namespace std::string_view_literals;
    for (std::size_t i = 1; i < indices.size(); ++i)
    {
        threads.push_back(std::thread(calculateKMatrixCpp, std::ref(x_test), lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::format("./matrix_{}.csv"sv, i-1)));
    }
    for (auto& t : threads)
    {
        t.join();
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed_seconds = std::chrono::duration<double>(end - start);
    std::cout << std::format("total elapsed time: {}"sv, elapsed_seconds);
    return 0;
}
Note: I used 6 digits of precision here as it is the default for std::ofstream. More digits means more writing time to disk and lower performance.

What are some optimization tricks to make my code run faster?

I'm moving outside my comfort zone and trying to make a random number distribution program, while also making sure it is still somewhat uniform. Here is my code.
This is the RandomDistribution.h file:
#pragma once
#include <vector>
#include <random>
#include <iostream>

static float randy(float low, float high) {
    static std::random_device rd;
    static std::mt19937 random(rd());
    std::uniform_real_distribution<float> ran(low, high);
    return ran(random);
}

typedef std::vector<float> Vfloat;

class RandomDistribution
{
public:
    RandomDistribution();
    RandomDistribution(float percent, float contents, int container);
    ~RandomDistribution();
    void setvariables(float percent, float contents, int container);
    Vfloat RunDistribution();
private:
    float divider;
    float _percent;
    int jar_limit;
    float _contents;
    float _maxdistribution;
    Vfloat Jar;
    bool is0;
};
This is my RandomDistribution.cpp:
#include "RandomDistribution.h"

RandomDistribution::RandomDistribution() {
}

RandomDistribution::RandomDistribution(float percent, float contents, int containers) : _contents(contents), jar_limit(containers)
{
    Jar.resize(containers);
    if (percent < 0)
        _percent = 0;
    else {
        _percent = percent;
    }
    divider = jar_limit * percent;
    is0 = false;
}

RandomDistribution::~RandomDistribution()
{
}

void RandomDistribution::setvariables(float percent, float contents, int container) {
    if (jar_limit != container)
        Jar.resize(container);
    _contents = contents;
    jar_limit = container;
    is0 = false;
    if (percent < 0)
        _percent = 0;
    else {
        _percent = percent;
    }
    divider = jar_limit * percent;
}

Vfloat RandomDistribution::RunDistribution() {
    for (int i = 0; i < jar_limit; i++) {
        if (!is0) {
            if (i + 1 >= jar_limit || _contents < 2) {
                Jar[i] = _contents;
                _contents -= Jar[i];
                is0 = true;
            }
            if (_percent > 0) { // making sure it does not get the whole container at once
                _maxdistribution = (_contents / divider) * (i + 1);
            }
            else {
                _maxdistribution = _contents;
            }
            Jar[i] = randy(0, _maxdistribution);
            if (Jar[i] < 1) {
                Jar[i] = 0;
                continue;
            }
            _contents -= Jar[i];
        }
        else {
            Jar[i] = 0; // nothing left to distribute
        }
        // mixing Jar so it is randomly spaced out instead of all at the top
        int swapper = randy(0, i);
        float hold = Jar[i];
        Jar[i] = Jar[swapper];
        Jar[swapper] = hold;
    }
    return Jar;
}
Source code:
#include <chrono>
#include <iostream>
#include "RandomDistribution.h"
using namespace std;

int main() {
    RandomDistribution distribution[100];
    for (int i = 0; i < 100; i++) {
        distribution[i] = { RandomDistribution(1.0f, 5000.0f, 2000) };
    }
    Vfloat k;
    k.resize(200);
    for (int i = 0; i < 10; i++) {
        auto t3 = chrono::steady_clock::now();
        for (int b = 0; b < 100; b++) {
            k = distribution[b].RunDistribution();
            distribution[b].setvariables(1.0f, 5000.0f, 2000);
        }
        auto t4 = chrono::steady_clock::now();
        auto time_span = chrono::duration_cast<chrono::duration<double>>(t4 - t3);
        cout << time_span.count() << " seconds\n";
    }
}
What prints out is usually between 1 and 2 seconds for each cycle. I want to bring it down to a tenth of a second if possible, because this is going to be only one step of the process, and I want to run it a lot more than 100 times. What can I do to speed this up? Any trick or something I'm just missing here?
Here is a sample of the timings:
4.71113 seconds
1.35444 seconds
1.45008 seconds
1.74961 seconds
2.59192 seconds
2.76171 seconds
1.90149 seconds
2.2822 seconds
2.36768 seconds
2.61969 seconds
Cheinan Marks has some benchmarks and performance tips related to random generators and friends in his CppCon 2016 talk I Just Wanted a Random Integer! He mentions some fast generators as well, IIRC. I'd start there.
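One concrete thing worth trying along those lines, as a hedged sketch (profile first to confirm randy is actually hot): swap the Mersenne Twister for a cheaper engine and reuse a single distribution object, re-parameterizing it per call instead of constructing a new one each time.
#include <random>

// Sketch: cheaper engine than std::mt19937, plus one reused distribution
// that is re-parameterized with the requested [low, high) range per call.
static float randy(float low, float high) {
    static std::minstd_rand engine{std::random_device{}()};
    static std::uniform_real_distribution<float> dist;
    return dist(engine,
                std::uniform_real_distribution<float>::param_type{low, high});
}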

Compute pi using the Monte Carlo method with multithreading

I am trying to find the value of pi using the Monte Carlo method, with parallel C code. I have written the serial code and it works fine, but the parallel code gives me wrong values of pi, sometimes 0 or negative values.
My code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define NUM_THREADS 4       //number of threads
#define TOT_COUNT 10000055  //total number of iterations

void *doCalcs(void *threadid)
{
    long longTid;
    longTid = (long)threadid;
    int tid = (int)longTid;  //obtain the integer value of thread id

    //using malloc for the return variable in order to make
    //sure that it is not destroyed once the thread call is finished
    float *in_count = (float *)malloc(sizeof(float));
    *in_count = 0;

    unsigned int rand_state = rand();

    //get the total number of iterations for a thread
    float tot_iterations = TOT_COUNT/NUM_THREADS;

    int counter = 0;

    //calculation
    for(counter = 0; counter < tot_iterations; counter++){
        //float x = (double)random()/RAND_MAX;
        //float y = (double)random()/RAND_MAX;
        //float result = sqrt((x*x) + (y*y));
        double x = rand_r(&rand_state) / ((double)RAND_MAX + 1) * 2.0 - 1.0;
        double y = rand_r(&rand_state) / ((double)RAND_MAX + 1) * 2.0 - 1.0;
        float result = sqrt((x*x) + (y*y));

        if(result < 1){
            *in_count += 1;  //check if the generated value is inside a unit circle
        }
    }

    //get the remaining iterations calculated by thread 0
    if(tid == 0){
        float remainder = TOT_COUNT%NUM_THREADS;
        for(counter = 0; counter < remainder; counter++){
            float x = (double)random()/RAND_MAX;
            float y = (double)random()/RAND_MAX;
            float result = sqrt((x*x) + (y*y));

            if(result < 1){
                *in_count += 1;  //check if the generated value is inside a unit circle
            }
        }
    }
}

int main(int argc, char *argv[])
{
    pthread_t threads[NUM_THREADS];
    int rc;
    long t;
    void *status;
    float tot_in = 0;

    for(t = 0; t < NUM_THREADS; t++){
        rc = pthread_create(&threads[t], NULL, doCalcs, (void *)t);
        if (rc){
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    //join the threads
    for(t = 0; t < NUM_THREADS; t++){
        pthread_join(threads[t], &status);
        //printf("Return from thread %ld is : %f\n", t, *(float*)status);
        tot_in += *(float*)status;  //keep track of the total in count
    }

    printf("Value for PI is %f \n", 1, 4*(tot_in/TOT_COUNT));
    /* Last thing that main() should do */
    pthread_exit(NULL);
}
This is a solution using async and future, as suggested by @vladon.
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <cmath>   // for std::abs on doubles

using namespace std;

long random_circle_sampling(long n_samples){
    std::random_device rd;  //Will be used to obtain a seed for the random number engine
    std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
    std::uniform_real_distribution<> dis(0.0, 1.0);

    long points_inside = 0;
    for(long i = 0; i < n_samples; ++i){
        double x = dis(gen);
        double y = dis(gen);
        if(x*x + y*y <= 1.0){
            ++points_inside;
        }
    }
    return points_inside;
}

double approximate_pi(long tot_samples, int n_threads){
    long samples_per_thread = tot_samples / n_threads;
    // Used to store the future results
    vector<future<long>> futures;
    for(int t = 0; t < n_threads; ++t){
        // Start a new asynchronous task
        futures.emplace_back(async(launch::async, random_circle_sampling, samples_per_thread));
    }
    long tot_points_inside = 0;
    for(future<long>& f : futures){
        // Wait for the result to be ready
        tot_points_inside += f.get();
    }
    double pi = 4.0 * (double) tot_points_inside / (double) tot_samples;
    return pi;
}

int main() {
    cout.precision(32);
    long tot_samples = 1e6;
    int n_threads = 8;
    double pi = 3.14159265358979323846;
    double approx_pi = approximate_pi(tot_samples, n_threads);
    double abs_diff = abs(pi - approx_pi);
    cout << "pi\t\t" << pi << endl;
    cout << "approx_pi\t" << approx_pi << endl;
    cout << "abs_diff\t" << abs_diff << endl;
    return 0;
}
You can simply run it with:
$ g++ -std=c++11 -O3 pi.cpp -o pi && time ./pi
pi 3.1415926535897931159979634685442
approx_pi 3.1427999999999998159694314381341
abs_diff 0.0012073464102066999714679695898667
./pi 0.04s user 0.00s system 27% cpu 0.163 total
Your code is not C++; it's bad, very bad, plain old C.
This is C++:
#include <cmath>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <vector>

constexpr auto num_threads = 4;        //number of threads
constexpr auto total_count = 10000055; //total number of iterations

void doCalcs(int total_iterations, int & in_count_result)
{
    auto seed = std::random_device{}();
    auto gen = std::mt19937{ seed };
    auto dist = std::uniform_real_distribution<>{ 0, 1 };
    auto in_count{ 0 };

    //calculation
    for (auto counter = 0; counter < total_iterations; ++counter) {
        auto x = dist(gen);
        auto y = dist(gen);
        auto result = std::sqrt(std::pow(x, 2) + std::pow(y, 2));
        if (result < 1) {
            ++in_count; //check if the generated value is inside a unit circle
        }
    }

    in_count_result = in_count;
}

int main()
{
    std::vector<std::thread> threads;
    threads.reserve(num_threads);
    std::vector<int> in_count(num_threads);

    for (size_t i = 0; i < num_threads; ++i) {
        int total_iterations = total_count / num_threads;
        if (i == 0) {
            total_iterations += total_count % num_threads; // the remaining iterations are calculated by thread 0
        }
        threads.emplace_back(doCalcs, total_iterations, std::ref(in_count[i]));
    }

    for (auto & thread : threads) {
        if (thread.joinable()) {
            thread.join();
        }
    }

    double pi_value = 4.0 * static_cast<double>(std::accumulate(in_count.begin(), in_count.end(), 0)) / static_cast<double>(total_count);
    std::cout << "Value of PI is: " << pi_value << std::endl;
}
P.S. It is also not that good; read about futures, promises, and std::async.

Performance of map pattern in multithreaded program lower than expected (4x speedup vs 8x)

I'm getting started in multithreaded programming so please excuse me if the following seems obvious. I am adding multithreading to an image processing program and the speedup isn't exactly the one I expected.
I'm currently getting a 4x speedup on a CPU with 4 physical cores and hyperthreading (8 logical cores), so I'd like to know if this kind of speedup is expected. The only explanation I can think of is that it may make sense if both hyperthreads of a single physical core have to share some sort of memory bus.
Being new to multithreading, it's not entirely clear to me if this would be considered an I/O-bound program, considering that all memory is allocated in RAM (I understand that the virtual memory manager of my OS is the one deciding to page this memory in and out of the heap). My machine has 16 GB of RAM, in case it helps decide whether paging/swapping can be an issue.
I've written a test program showcasing the serial case and two parallel cases using QThreadPool and tbb::parallel_for
The current program as you can see has no real operations other than setting a supposed image from black to white and it's done on purpose to know what the baseline is before any real operations are applied to the image.
I'm attaching the program in the hope that someone can explain to me whether my quest for a roughly 8x speedup is a lost cause in this kind of processing algorithm. Note that I'm not interested in other kinds of optimizations such as SIMD, as my real concern is not just to make it faster, but to make it faster using purely multithreading, without getting into SSE or processor-cache-level optimizations.
#include <iostream>
#include <algorithm>   // std::find, std::min
#include <cstdlib>     // atoi
#include <cstring>     // memset
#include <sys/time.h>
#include <vector>
#include <QThreadPool>
#include "/usr/local/include/tbb/tbb.h"
#define LOG(x) (std::cout << x << std::endl)

struct col4
{
    unsigned char r, g, b, a;
};

class QTileTask : public QRunnable
{
public:
    void run()
    {
        for(uint32_t y = m_yStart; y < m_yEnd; y++)
        {
            int rowStart = y * m_width;
            for(uint32_t x = m_xStart; x < m_xEnd; x++)
            {
                int index = rowStart + x;
                m_pData[index].r = 255;
                m_pData[index].g = 255;
                m_pData[index].b = 255;
                m_pData[index].a = 255;
            }
        }
    }

    col4* m_pData;
    uint32_t m_xStart;
    uint32_t m_yStart;
    uint32_t m_xEnd;
    uint32_t m_yEnd;
    uint32_t m_width;
};

struct TBBTileTask
{
    void operator()()
    {
        for(uint32_t y = m_yStart; y < m_yEnd; y++)
        {
            int rowStart = y * m_width;
            for(uint32_t x = m_xStart; x < m_xEnd; x++)
            {
                int index = rowStart + x;
                m_pData[index].r = 255;
                m_pData[index].g = 255;
                m_pData[index].b = 255;
                m_pData[index].a = 255;
            }
        }
    }

    col4* m_pData;
    uint32_t m_xStart;
    uint32_t m_yStart;
    uint32_t m_xEnd;
    uint32_t m_yEnd;
    uint32_t m_width;
};

struct TBBCaller
{
    TBBCaller(std::vector<TBBTileTask>& t)
        : m_tasks(t)
    {}

    TBBCaller(TBBCaller& e, tbb::split)
        : m_tasks(e.m_tasks)
    {}

    void operator()(const tbb::blocked_range<size_t>& r) const
    {
        for (size_t i = r.begin(); i != r.end(); ++i)
            m_tasks[i]();
    }

    std::vector<TBBTileTask>& m_tasks;
};
inline double getcurrenttime( void )
{
    timeval t;
    gettimeofday(&t, NULL);
    return static_cast<double>(t.tv_sec) + (static_cast<double>(t.tv_usec) / 1000000.0);
}

char* getCmdOption(char ** begin, char ** end, const std::string & option)
{
    char ** itr = std::find(begin, end, option);
    if (itr != end && ++itr != end)
    {
        return *itr;
    }
    return 0;
}

bool cmdOptionExists(char** begin, char** end, const std::string& option)
{
    return std::find(begin, end, option) != end;
}

void baselineSerial(col4* pData, int resolution)
{
    double t = getcurrenttime();
    for(int y = 0; y < resolution; y++)
    {
        int rowStart = y * resolution;
        for(int x = 0; x < resolution; x++)
        {
            int index = rowStart + x;
            pData[index].r = 255;
            pData[index].g = 255;
            pData[index].b = 255;
            pData[index].a = 255;
        }
    }
    LOG((getcurrenttime() - t) * 1000 << " ms. (Serial)");
}
void baselineParallelQt(col4* pData, int resolution, uint32_t tileSize)
{
    double t = getcurrenttime();
    QThreadPool pool;
    for(int y = 0; y < resolution; y += tileSize)
    {
        for(int x = 0; x < resolution; x += tileSize)
        {
            uint32_t xEnd = std::min<uint32_t>(x + tileSize, resolution);
            uint32_t yEnd = std::min<uint32_t>(y + tileSize, resolution);

            QTileTask* t = new QTileTask;
            t->m_pData = pData;
            t->m_xStart = x;
            t->m_yStart = y;
            t->m_xEnd = xEnd;
            t->m_yEnd = yEnd;
            t->m_width = resolution;

            pool.start(t);
        }
    }
    pool.waitForDone();
    LOG((getcurrenttime() - t) * 1000 << " ms. (QThreadPool)");
}

void baselineParallelTBB(col4* pData, int resolution, uint32_t tileSize)
{
    double t = getcurrenttime();
    std::vector<TBBTileTask> tasks;
    for(int y = 0; y < resolution; y += tileSize)
    {
        for(int x = 0; x < resolution; x += tileSize)
        {
            uint32_t xEnd = std::min<uint32_t>(x + tileSize, resolution);
            uint32_t yEnd = std::min<uint32_t>(y + tileSize, resolution);

            TBBTileTask t;
            t.m_pData = pData;
            t.m_xStart = x;
            t.m_yStart = y;
            t.m_xEnd = xEnd;
            t.m_yEnd = yEnd;
            t.m_width = resolution;

            tasks.push_back(t);
        }
    }

    TBBCaller caller(tasks);
    tbb::task_scheduler_init init;
    tbb::parallel_for(tbb::blocked_range<size_t>(0, tasks.size()), caller);
    LOG((getcurrenttime() - t) * 1000 << " ms. (TBB)");
}
int main(int argc, char** argv)
{
    int resolution = 1;
    uint32_t tileSize = 64;

    char * pResText = getCmdOption(argv, argv + argc, "-r");
    if (pResText)
    {
        resolution = atoi(pResText);
    }

    char * pTileSizeChr = getCmdOption(argv, argv + argc, "-b");
    if (pTileSizeChr)
    {
        tileSize = atoi(pTileSizeChr);
    }

    if(resolution > 16)
        resolution = 16;
    resolution = resolution << 10;

    uint32_t tileCount = resolution/tileSize + 1;
    tileCount *= tileCount;

    LOG("Resolution: " << resolution << " Tile Size: " << tileSize);
    LOG("Tile Count: " << tileCount);

    uint64_t pixelCount = resolution*resolution;
    col4* pData = new col4[pixelCount];

    memset(pData, 0, sizeof(col4)*pixelCount);
    baselineSerial(pData, resolution);

    memset(pData, 0, sizeof(col4)*pixelCount);
    baselineParallelQt(pData, resolution, tileSize);

    memset(pData, 0, sizeof(col4)*pixelCount);
    baselineParallelTBB(pData, resolution, tileSize);

    delete[] pData;
    return 0;
}
Yes, a 4x speedup is expected. Hyperthreading is a kind of time-sharing implemented in hardware, so you can't expect to benefit from it if one thread is already using all the superscalar pipelines available on the core, as is the case here. The other hyperthread necessarily has to wait.
You can expect an even lower speedup if your memory bus bandwidth is saturated before all available cores are busy. That usually happens when there are many cores, as in this question:
Why doesn't this code scale linearly?
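To put rough numbers on the bandwidth argument (assuming ~20 GB/s of sustained memory bandwidth, a typical desktop figure; the real value depends on your machine): at -r 16 the image is 16384 x 16384 pixels x 4 bytes = 1 GiB, so every fill pass has to move 1 GiB to RAM and cannot complete in much under ~50 ms no matter how many threads issue the stores. Once one or two cores can already generate stores at that rate, additional threads just queue up on the memory bus.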

Simple test to measure cache lines size

Starting from this article - Gallery of Processor Cache Effects by Igor Ostrovsky - I wanted to play with his examples on my own machine.
This is my code for the first example, that looks at how touching different cache lines affect running time:
#include <iostream>
#include <time.h>
using namespace std;

int main(int argc, char* argv[])
{
    int step = 1;
    const int length = 64 * 1024 * 1024;
    int* arr = new int[length];

    timespec t0, t1;
    clock_gettime(CLOCK_REALTIME, &t0);

    for (int i = 0; i < length; i += step)
        arr[i] *= 3;

    clock_gettime(CLOCK_REALTIME, &t1);

    long int duration = (t1.tv_nsec - t0.tv_nsec);
    if (duration < 0)
        duration = 1000000000 + duration;

    cout << step << ", " << duration / 1000 << endl;
    return 0;
}
Using various values for step, I don't see the jump in the running time:
step, microseconds
1, 451725
2, 334981
3, 287679
4, 261813
5, 254265
6, 246077
16, 215035
32, 207410
64, 202526
128, 197089
256, 195154
I would expect to see something similar to the chart in the article, where the running time stays roughly flat until the step exceeds the cache line size. But from 16 onwards, the running time is halved each time we double the step.
I tested it on Ubuntu 13 with a Xeon X5450, compiling with g++ -O0.
Is something flawed in my code, or are the results actually OK?
Any insight on what I'm missing would be highly appreciated.
Since you want to observe the effect of cache line size, I recommend the cachegrind tool, part of the valgrind tool suite. Your approach is right, but the raw timings won't get you close to the effect you're after.
#include <iostream>
#include <time.h>
#include <stdlib.h>
using namespace std;

int main(int argc, char* argv[])
{
    int step = atoi(argv[1]);
    const int length = 64 * 1024 * 1024;
    int* arr = new int[length];

    for (int i = 0; i < length; i += step)
        arr[i] *= 3;

    return 0;
}
Run valgrind --tool=cachegrind ./a.out $step for each step size and you should see the results. After plotting the cache-miss counts you will get the desired curve with good accuracy. Happy experimenting!
public class CacheLine {
    public static void main(String[] args) {
        CacheLine cacheLine = new CacheLine();
        cacheLine.startTesting();
    }

    private void startTesting() {
        byte[] array = new byte[128 * 1024];
        for (int testIndex = 0; testIndex < 10; testIndex++) {
            testMethod(array);
            System.out.println("--------- // ---------");
        }
    }

    private void testMethod(byte[] array) {
        for (int len = 8192; len <= array.length; len += 8192) {
            long t0 = System.nanoTime();
            for (int i = 0; i < 10000; i++) {
                for (int k = 0; k < len; k += 64) {
                    array[k] = 1;
                }
            }
            long dT = System.nanoTime() - t0;
            System.out.println("len: " + len / 1024 + " dT: " + dT + " dT/stepCount: " + (dT) / len);
        }
    }
}
This code helps you determine the L1 data cache size. You can read about it in more detail here: https://medium.com/@behzodbekqodirov/threading-in-java-194b7db6c1de
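For readers following along in C++, a rough equivalent of the same sweep might look like this (a sketch; the 64-byte stride and 8 KiB increments mirror the Java version, and the final print keeps the stores from being optimized away):
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<unsigned char> array(128 * 1024);
    // Grow the working set in 8 KiB steps, touching one byte per 64-byte
    // cache line; time per touch should jump once the set exceeds L1.
    for (std::size_t len = 8192; len <= array.size(); len += 8192) {
        auto t0 = std::chrono::steady_clock::now();
        for (int i = 0; i < 10000; ++i)
            for (std::size_t k = 0; k < len; k += 64)
                array[k] = 1;
        auto dT = std::chrono::steady_clock::now() - t0;
        long long ns = std::chrono::duration_cast<std::chrono::nanoseconds>(dT).count();
        std::printf("len: %zu KiB  dT/stepCount: %lld ns\n",
                    len / 1024, ns / (long long)len);
    }
    std::printf("%d\n", array[0]); // keep the writes observable
}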