CUDA - CUBLAS: issues solving many (3x3) dense linear systems - C++

I am trying to solve about 1,200,000 linear systems (3x3, Ax=B) with CUDA 10.1, in particular using the cuBLAS library. I took a cue from this (useful!) post and rewrote the suggested code in a Unified Memory version. The algorithm first performs an LU factorization using cublasDgetrfBatched(), followed by two consecutive invocations of cublasDtrsmBatched(), which solve the lower and upper triangular linear systems. The code is attached below. It works correctly up to about 10,000 matrices; in that case, it takes ~570 ms to perform the LU factorization (on an NVIDIA GeForce 930MX) and ~311 ms to solve the systems.
My issues/questions are:
Overload issue: it crashes allocating memory for more than 10k matrices. Why? How can I improve my code in order to solve the whole batch of 1.2 million matrices?
Time issue: my goal would be to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?
Would it be possible and/or useful, and if yes how, to use 'streams' of batches of 10k matrices?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
#include <ctime>
#include <ratio>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cusolverDn.h>
//#include "Utilities.cuh"
using namespace std;
using namespace std::chrono;
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double** vec, int* pivotArray, int N, int numMatrices) {
    for (int nm = 0; nm < numMatrices; nm++) {
        for (int i = 0; i < N; i++) {
            double temp = vec[nm][i];
            vec[nm][i] = vec[nm][pivotArray[N * i + nm] - 1];
            vec[nm][pivotArray[N * i + nm] - 1] = temp;
        }
    }
}
/************************************/
/* MAIN */
/************************************/
int main() {
    const int N = 3;
    const int numMatrices = 10000; // I want 1200000

    // random generator to fill matrices and coefficients
    random_device device;
    mt19937 generator(device());
    uniform_real_distribution<double> distribution(1., 5.);

    // ALLOCATE MEMORY - using unified memory
    double** h_A;
    cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
    for (int nm = 0; nm < numMatrices; nm++) {
        cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
    }
    double** h_b;
    cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
    for (int nm = 0; nm < numMatrices; nm++) {
        cudaMallocManaged(&(h_b[nm]), sizeof(double) * N);
    }
    cout << " memory allocated" << endl;

    // FILL MATRICES
    for (int nm = 0; nm < numMatrices; nm++) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                h_A[nm][j * N + i] = distribution(generator); // column-major storage
            }
        }
    }
    cout << " Matrix filled " << endl;

    // FILL COEFFICIENTS
    for (int nm = 0; nm < numMatrices; nm++) {
        for (int i = 0; i < N; i++) {
            h_b[nm][i] = distribution(generator);
        }
    }
    cout << " Coeff. vector filled " << endl;
    cout << endl;

    // --- CUDA solver initialization
    cublasHandle_t cublas_handle;
    cublasCreate_v2(&cublas_handle);
    int* PivotArray;
    cudaMallocManaged(&PivotArray, N * numMatrices * sizeof(int));
    int* infoArray;
    cudaMallocManaged(&infoArray, numMatrices * sizeof(int));

    // CUBLAS LU SOLVER
    high_resolution_clock::time_point t1 = high_resolution_clock::now();
    cublasDgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numMatrices);
    cudaDeviceSynchronize();
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
    cout << "It took " << time_span.count() * 1000. << " milliseconds." << endl;

    for (int i = 0; i < numMatrices; i++)
        if (infoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
        }

    // rearrange coefficients
    // (temporarily on the CPU; this step will be a GPU kernel as well)
    high_resolution_clock::time_point tA = high_resolution_clock::now();
    rearrange(h_b, PivotArray, N, numMatrices);
    high_resolution_clock::time_point tB = high_resolution_clock::now();
    duration<double> time_spanA = duration_cast<duration<double>>(tB - tA);
    cout << "rearrangement took " << time_spanA.count() * 1000. << " milliseconds." << endl;

    // INVERT UPPER AND LOWER TRIANGULAR MATRICES
    // --- Function solves the triangular linear system with multiple right-hand sides
    // --- Function overwrites b with the result
    const double alpha = 1.0;
    high_resolution_clock::time_point t3 = high_resolution_clock::now();
    cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
    cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
    cudaDeviceSynchronize();
    high_resolution_clock::time_point t4 = high_resolution_clock::now();
    duration<double> time_span2 = duration_cast<duration<double>>(t4 - t3);
    cout << "second step took " << time_span2.count() * 1000. << " milliseconds." << endl;

    // --- Free resources
    if (h_A) cudaFree(h_A);
    if (h_b) cudaFree(h_b);
    cudaDeviceReset();
    return 0;
}

Overload issue: it crashes allocating memory for more than 10k matrices. Why? How can I improve my code in order to solve the whole batch of 1.2 million matrices?
In my opinion, the biggest problem in your code is that you are making horribly inefficient use of managed memory in these key allocation loops:
//ALLOCATE MEMORY - using unified memory
double** h_A;
cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
    cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
}
double** h_b;
cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
    cudaMallocManaged(&(h_b[nm]), sizeof(double) * N);
}
The problem is that each call to cudaMallocManaged has a minimum granularity. That means that if you request to allocate 1 byte, it will actually use up something like 4 KB of memory (I believe that is the Linux allocation granularity; it looks like you are on Windows, and I believe the Windows allocation granularity may be larger). In addition, this places a huge, inefficient data-transfer load on the managed memory subsystem when you launch a kernel (and kernels will be launched by your cuBLAS calls).
A much better way to do this is to do a single large allocation, rather than the allocation-in-a-loop, and then just subdivide that allocation using pointer arithmetic. The code could look like this:
//ALLOCATE MEMORY - using unified memory
double** h_A;
cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
cudaMallocManaged(&(h_A[0]), sizeof(double) * numMatrices * N * N);
for (int nm = 1; nm < numMatrices; nm++) {
    h_A[nm] = h_A[nm - 1] + N * N;
}
double** h_b;
cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
cudaMallocManaged(&(h_b[0]), sizeof(double) * numMatrices * N);
for (int nm = 1; nm < numMatrices; nm++) {
    h_b[nm] = h_b[nm - 1] + N;
}
Another benefit of this is that the allocation process runs quite a bit faster.
Time issue: my goal would be to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?
With that change to your code, I am able to run successfully on a 1GB GPU (GeForce GT640), with:
const int numMatrices = 1200000;
with output like this:
$ ./t81
memory allocated
Matrix filled
Coeff. vector filled
It took 70.3032 milliseconds.
rearrangement took 60.02 milliseconds.
second step took 156.067 milliseconds.
Your GPU may be somewhat slower, but I think the overall timing should easily come in at less than 1 second.
Would it be possible and/or useful, and if yes how, to use 'streams' of batches of 10k matrices?
With the above change, I don't think you need to worry about this. Streams won't help here with overlap of compute operations. They could help with copy/compute overlap (although maybe not much on your GPU), but this would be hard to architect on Windows with managed memory. For Windows usage, I would probably suggest switching to ordinary CUDA separation of host and device memory if you want to explore copy/compute overlap.
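If you do go that route, here is a hypothetical sketch of what chunking the batch across streams could look like (illustration only: error checks are omitted, the buffer and pointer-array names are invented, and the pointer-array setup is only indicated):

// Assumed setup: explicit (non-managed) memory so the H2D copies on one
// stream can overlap with the getrf kernels running on another
const int N = 3, chunk = 10000, nChunks = 120, nStreams = 4; // 120 x 10k = 1.2M
double *h_data, *d_data; // flat storage for all matrices
cudaMallocHost(&h_data, sizeof(double) * N * N * chunk * nChunks); // pinned
cudaMalloc(&d_data, sizeof(double) * N * N * chunk * nChunks);
double** d_ptrs; // device-side array of per-matrix pointers
cudaMalloc(&d_ptrs, sizeof(double*) * chunk * nChunks);
// ... fill h_data with the matrices and set d_ptrs[i] = d_data + i * N * N
//     (e.g. via a host staging array copied once with cudaMemcpy) ...
int *d_pivots, *d_infos;
cudaMalloc(&d_pivots, sizeof(int) * N * chunk * nChunks);
cudaMalloc(&d_infos, sizeof(int) * chunk * nChunks);
cudaStream_t streams[nStreams];
for (int s = 0; s < nStreams; s++) cudaStreamCreate(&streams[s]);
for (int c = 0; c < nChunks; c++) {
    cudaStream_t st = streams[c % nStreams];
    cudaMemcpyAsync(d_data + (size_t)c * chunk * N * N,
                    h_data + (size_t)c * chunk * N * N,
                    sizeof(double) * N * N * chunk,
                    cudaMemcpyHostToDevice, st);
    cublasSetStream(cublas_handle, st); // the getrf kernels follow the copy on st
    cublasDgetrfBatched(cublas_handle, N, d_ptrs + (size_t)c * chunk, N,
                        d_pivots + (size_t)c * N * chunk,
                        d_infos + c * chunk, chunk);
}
cudaDeviceSynchronize();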
As an aside, you may be able to get a set of cuBLAS calls that will do the work even more quickly by using direct inversion. cuBLAS has a batched direct inversion method (cublas<t>matinvBatched, for small matrix dimensions). I normally wouldn't suggest this for the solution of linear equations, but it may be something to consider for a set of 3x3 or 4x4 inversions, where you could easily check for singularity with the determinant method. Here is an example.
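For reference, a minimal sketch of that direct-inversion path, assuming h_A still holds the original (unfactored) matrices and that h_Ainv and h_x are hypothetical pointer arrays set up the same way as h_A and h_b above (matinvBatched is restricted to small matrix dimensions, which N = 3 easily satisfies):

// Invert every A directly, then form x = inv(A) * b as a batched GEMM with an
// N x 1 right-hand side. Error checks omitted.
int* infoInv;
cudaMallocManaged(&infoInv, numMatrices * sizeof(int));
cublasDmatinvBatched(cublas_handle, N, (const double* const*)h_A, N,
                     h_Ainv, N, infoInv, numMatrices);
const double one = 1.0, zero = 0.0;
cublasDgemmBatched(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, 1, N, &one,
                   (const double* const*)h_Ainv, N,
                   (const double* const*)h_b, N,
                   &zero, h_x, N, numMatrices);
cudaDeviceSynchronize();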

Related

Need Help Understanding OpenMP Matrix Multiplication C++ code

Here is my matrix multiplication C++ OpenMP code that I have written. I am trying to use OpenMP to optimize the program. The sequential code ran in 7 seconds, but when I added the OpenMP statements it only got faster by 3 seconds. I thought it was going to get much faster and don't understand if I'm doing it right.
The OpenMP statements are in the fill_random function and in the matrix multiplication triple for loop section in main.
I would appreciate any help or advice you can give to understand this!
#include <iostream>
#include <cassert>
#include <omp.h>
#include <chrono>
using namespace std::chrono;
double** fill_random(int rows, int cols)
{
    double** mat = new double*[rows]; // Allocate rows.
    #pragma omp parallell collapse(2)
    for (int i = 0; i < rows; ++i)
    {
        mat[i] = new double[cols]; // added
        for (int j = 0; j < cols; ++j)
        {
            mat[i][j] = rand() % 10;
        }
    }
    return mat;
}

double** create_matrix(int rows, int cols)
{
    double** mat = new double*[rows]; // Allocate rows.
    for (int i = 0; i < rows; ++i)
    {
        mat[i] = new double[cols](); // Allocate each row and zero initialize..
    }
    return mat;
}

void destroy_matrix(double** &mat, int rows)
{
    if (mat)
    {
        for (int i = 0; i < rows; ++i)
        {
            delete[] mat[i]; // delete each row..
        }
        delete[] mat; // delete the rows..
        mat = nullptr;
    }
}

int main()
{
    int rowsA = 1000; // number of rows
    int colsA = 1000; // number of columns
    double** matA = fill_random(rowsA, colsA);
    int rowsB = 1000; // number of rows
    int colsB = 1000; // number of columns
    double** matB = fill_random(rowsB, colsB);
    // Checking matrix multiplication qualification
    assert(colsA == rowsB);
    double** matC = create_matrix(rowsA, colsB);
    // measure the multiply only
    const auto start = high_resolution_clock::now();
    // Multiplication
    #pragma omp parallel for
    for (int i = 0; i < rowsA; ++i)
    {
        for (int j = 0; j < colsB; ++j)
        {
            for (int k = 0; k < colsA; ++k) // ColsA..
            {
                matC[i][j] += matA[i][k] * matB[k][j];
            }
        }
    }
    const auto stop = high_resolution_clock::now();
    const auto duration = duration_cast<seconds>(stop - start);
    std::cout << "Time taken by function: " << duration.count() << " seconds" << std::endl;
    // Clean up..
    destroy_matrix(matA, rowsA);
    destroy_matrix(matB, rowsB);
    destroy_matrix(matC, rowsA);
    return 0;
}
Your problem is rather small.
The collapse in the matrix creation does nothing because the loops are not perfectly nested. On the other hand, in the multiplication routine you should add a collapse(2) directive.
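For instance, a sketch of the question's multiplication loop with that directive added (the two outer loops are perfectly nested, so collapse(2) is legal here):

#pragma omp parallel for collapse(2)
for (int i = 0; i < rowsA; ++i)
{
    for (int j = 0; j < colsB; ++j)
    {
        for (int k = 0; k < colsA; ++k) // each (i, j) cell is written by exactly one thread
        {
            matC[i][j] += matA[i][k] * matB[k][j];
        }
    }
}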
Creating a matrix with an array of pointers means that the expression matB[k][j] dances all over memory. Allocate your matrices as a single array and then use i*N+j as an indexing expression. (Of course I would put that in a macro or so.)
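A sketch of that idea, reusing the question's naming (create_matrix_flat is a hypothetical replacement for create_matrix):

#include <cstddef>

// One contiguous block; element (i, j) lives at mat[i * cols + j]
double* create_matrix_flat(int rows, int cols)
{
    return new double[static_cast<std::size_t>(rows) * cols](); // zero-initialized
}
// in the multiply: matC[i * colsB + j] += matA[i * colsA + k] * matB[k * colsB + j];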
A 1000x1000 matrix with a double (64-bit) element type requires 8 MB of data. Multiplying two matrices means reading 16 MB, and writing to the third matrix brings the total accessed to 24 MB.
If the L3 cache is smaller than 24 MB, then RAM is the bottleneck. A single thread may not have fully used its bandwidth, but with OpenMP the RAM bandwidth is fully consumed; in your case there was only about 50% headroom.
The naive version does not use the cache well. You need to swap the order of the two inner loops to gain more caching:

loop i
  loop k
    loop j
      C[i][j] += A[i][k] * B[k][j]
Although incrementing C does not reuse a register in this optimized version, it reuses cache, which is more important in this case. If you do this, you should get ~100-200 milliseconds of computation time even single-threaded.
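Concretely, a sketch of that loop order on the question's data structures:

// i-k-j order: matA[i][k] is invariant in the innermost loop and can stay in
// a register, while the matB[k] and matC[i] rows are walked contiguously
for (int i = 0; i < rowsA; ++i)
{
    for (int k = 0; k < colsA; ++k)
    {
        const double aik = matA[i][k];
        for (int j = 0; j < colsB; ++j)
        {
            matC[i][j] += aik * matB[k][j];
        }
    }
}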
Also, if you need performance, don't do this:
//Allocate each row and zero initialize..
Allocate the whole matrix at once, so that your matrix is not scattered in memory.
To add more threads efficiently, you can compute the full matrix multiplication from sub-matrix multiplications. Scan-line multiplication is not good for load balancing between threads. Multiplying sub-matrices gives a better load distribution, due to caching and a higher number of floating-point operations per element fetched from memory.
Edit:
Swapping the order of the loops also makes the compiler able to vectorize the innermost loop, because one of the input matrix elements becomes a constant during the innermost loop.

Erroneous single thread memory bandwidth benchmark

In an attempt to measure the bandwidth of the main memory, I have come up with the following approach.
Code (for the Intel compiler)
#include <omp.h>
#include <iostream> // std::cout
#include <limits> // std::numeric_limits
#include <cstdlib> // std::free
#include <unistd.h> // sysconf
#include <stdlib.h> // posix_memalign
#include <random> // std::mt19937
int main()
{
    // test-parameters
    const auto size = std::size_t{150 * 1024 * 1024} / sizeof(double);
    const auto experiment_count = std::size_t{500};

    //+/////////////////
    // access a data-point 'on a whim'
    //+/////////////////

    // warm-up
    for (auto counter = std::size_t{}; counter < experiment_count / 2; ++counter)
    {
        // garbage data allocation and memory page loading
        double* data = nullptr;
        posix_memalign(reinterpret_cast<void**>(&data), sysconf(_SC_PAGESIZE), size * sizeof(double));
        if (data == nullptr)
        {
            std::cerr << "Fatal error! Unable to allocate memory." << std::endl;
            std::abort();
        }
        //#pragma omp parallel for simd safelen(8) schedule(static)
        for (auto index = std::size_t{}; index < size; ++index)
        {
            data[index] = -1.0;
        }
        //#pragma omp parallel for simd safelen(8) schedule(static)
        #pragma omp simd safelen(8)
        for (auto index = std::size_t{}; index < size; ++index)
        {
            data[index] = 10.0;
        }
        // deallocate resources
        free(data);
    }

    // timed run
    auto min_duration = std::numeric_limits<double>::max();
    for (auto counter = std::size_t{}; counter < experiment_count; ++counter)
    {
        // garbage data allocation and memory page loading
        double* data = nullptr;
        posix_memalign(reinterpret_cast<void**>(&data), sysconf(_SC_PAGESIZE), size * sizeof(double));
        if (data == nullptr)
        {
            std::cerr << "Fatal error! Unable to allocate memory." << std::endl;
            std::abort();
        }
        //#pragma omp parallel for simd safelen(8) schedule(static)
        for (auto index = std::size_t{}; index < size; ++index)
        {
            data[index] = -1.0;
        }

        const auto dur1 = omp_get_wtime() * 1E+6;
        //#pragma omp parallel for simd safelen(8) schedule(static)
        #pragma omp simd safelen(8)
        for (auto index = std::size_t{}; index < size; ++index)
        {
            data[index] = 10.0;
        }
        const auto dur2 = omp_get_wtime() * 1E+6;
        const auto run_duration = dur2 - dur1;
        if (run_duration < min_duration)
        {
            min_duration = run_duration;
        }

        // deallocate resources
        free(data);
    }

    // REPORT
    const auto traffic = size * sizeof(double) * 2; // 1x load, 1x write
    std::cout << "Using " << omp_get_max_threads() << " threads. Minimum duration: " << min_duration << " us;\n"
              << "Maximum bandwidth: " << traffic / min_duration * 1E-3 << " GB/s;" << std::endl;
    return 0;
}
Notes on the code:
This is assumed to be a 'naive' approach and is Linux-only, but it should still serve as a rough indicator of model performance.
Built using ICC with the compiler flags -O3 -ffast-math -march=coffeelake.
size (150 MiB) is much bigger than the lowest-level cache of the system (9 MiB on the i5-8400 Coffee Lake), with 2x 16 GiB DDR4-3200 DIMMs.
New allocations on each iteration should invalidate all cache lines from the previous one (to eliminate cache hits).
The minimum latency is recorded to counteract the effects of interrupts and OS scheduling: threads being taken off cores for a short while, etc.
A warm-up run is done to counteract the effects of dynamic frequency scaling (a kernel feature; it can alternatively be turned off by using the userspace governor).
Results of code
On my machine, I am getting 90 GB/s. Intel Advisor, which runs its own benchmarks, has calculated or measured this bandwidth to actually be 25 GB/s. (See my previous question: Intel Advisor's bandwidth information where a previous version of this code was getting page-faults inside the timed region.)
Assembly: here's a link to the assembly generated for the above code: https://godbolt.org/z/Ma7PY49bE
I am not able to understand how I'm getting such an unreasonably high result with my bandwidth. Any tips to help facilitate my understanding would be greatly appreciated.
Actually, the question seems to be, "why is the obtained bandwidth so high?", to which I have gotten quite a lot of input from @PeterCordes and @Sebastian. That information needs to be digested in its own time.
I can still offer an auxiliary 'answer' to the topic of interest. By substituting the write operation (which, as I now understand, cannot be properly modeled in a benchmark without delving into the assembly) with something cheap, e.g. a bitwise operation, we can prevent the compiler from doing its job a little too well.
Updated code
#include <omp.h>
#include <iostream> // std::cout
#include <limits> // std::numeric_limits
#include <cstdlib> // std::free
#include <unistd.h> // sysconf
#include <stdlib.h> // posix_memalign
int main()
{
    // test-parameters
    const auto size = std::size_t{100 * 1024 * 1024};
    const auto experiment_count = std::size_t{250};

    //+/////////////////
    // access a data-point 'on a whim'
    //+/////////////////

    // allocate for exp. data and load the memory pages
    char* data = nullptr;
    posix_memalign(reinterpret_cast<void**>(&data), sysconf(_SC_PAGESIZE), size);
    if (data == nullptr)
    {
        std::cerr << "Fatal error! Unable to allocate memory." << std::endl;
        std::abort();
    }
    for (auto index = std::size_t{}; index < size; ++index)
    {
        data[index] = 0;
    }

    // timed run
    auto min_duration = std::numeric_limits<double>::max();
    for (auto counter = std::size_t{}; counter < experiment_count; ++counter)
    {
        // run
        const auto dur1 = omp_get_wtime() * 1E+6;
        #pragma omp parallel for simd safelen(8) schedule(static)
        for (auto index = std::size_t{}; index < size; ++index)
        {
            data[index] ^= 1;
        }
        const auto dur2 = omp_get_wtime() * 1E+6;
        const auto run_duration = dur2 - dur1;
        if (run_duration < min_duration)
        {
            min_duration = run_duration;
        }
    }

    // deallocate resources
    free(data);

    // REPORT
    const auto traffic = size * 2; // 1x load, 1x write
    std::cout << "Using " << omp_get_max_threads() << " threads. Minimum duration: " << min_duration << " us;\n"
              << "Maximum bandwidth: " << traffic / min_duration * 1E-3 << " GB/s;" << std::endl;
    return 0;
}
The benchmark remains a 'naive' one and shall only serve as an indicator of the model's performance (as opposed to a program which can exactly calculate the memory bandwidth).
With the updated code, I get 24 GiB/s for single thread and 37 GiB/s when all 6 cores get involved. When compared to Intel Advisor's measured values of 25.5 GiB/s and 37.5 GiB/s, I think this is acceptable.
@PeterCordes I have retained the warm-up loop to once do an exactly identical run of the whole procedure, so as to counteract unknown effects (healthy programmer's paranoia).
Edit: In this case, the warm-up loop is indeed redundant, because the minimum duration is being clocked.

How to calculate the sum of an array in parallel using C++ and OpenMP?

My task is to parallelize the creation, doubling, and summation of the array seen in my code below using C++ and OpenMP. However, I cannot get the summation to work properly in parallel. This is my first time using OpenMP, and I am also quite new to C++. I have tried what can be seen in my code below, as well as other variations (having the sum outside of the for loop, defining a sum in parallel to add to the global sum, what is suggested here, etc.). The sum should be 4.15362e-14, but when I use multiple threads, I get different, incorrect results each time. What is the proper way to achieve this?
P.S. We have only been taught the critical, master, barrier, and single constructs thus far, so I would appreciate it if answers did not include any others. Thanks!
#include <iostream>
#include <cmath>
#include <omp.h>
using namespace std;
int main()
{
    const int size = 256;
    double* sinTable = new double[256];
    double sum = 0.0;
    // parallelized
    #pragma omp parallel
    {
        for (int n = 0; n < size; n++)
        {
            sinTable[n] = std::sin(2 * M_PI * n / size); // calculate and insert element into array
            sinTable[n] = sinTable[n] * 2;               // double current element in array
            #pragma omp critical
            sum += sinTable[n];                          // add element to total sum (one thread at a time)
        }
    }
    // print sum and exit
    cout << "Sum: " << sum << endl;
    return 0;
}
Unfortunately, your code is not OK, because you run the entire for loop once per thread instead of distributing the work. You should use:
#pragma omp parallel for
to distribute the work among threads.
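Since the question is restricted to the critical, master, barrier, and single constructs, here is a sketch that distributes the loop and accumulates a per-thread partial sum, flushing it to the shared sum once per thread under critical:

#pragma omp parallel
{
    double local_sum = 0.0; // private to each thread
    #pragma omp for
    for (int n = 0; n < size; n++)
    {
        sinTable[n] = 2.0 * std::sin(2 * M_PI * n / size); // compute and double
        local_sum += sinTable[n];
    }
    #pragma omp critical
    sum += local_sum; // the shared sum is touched only once per thread
}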
Another alternative is to use reduction:
#include <iostream>
#include <cmath>
#include <omp.h>
using namespace std;

int main()
{
    const int size = 256;
    const double step = (2.0 * M_PI) / static_cast<double>(size);
    double* sinTable = new double[size];
    double sum = 0.0;
    // parallelized
    #pragma omp parallel for reduction(+:sum)
    for (int n = 0; n < size; n++)
    {
        sinTable[n] = std::sin(static_cast<double>(n) * step); // calculate and insert element into array
        sinTable[n] = sinTable[n] * 2.0;                       // double current element in array
        sum += sinTable[n];                                    // add to sum (reduced across threads)
    }
    // print sum and exit
    cout << "Sum: " << sum << endl;
    delete[] sinTable;
    return 0;
}
Note that in theory the sum should be zero. The value you obtain depends on the order of the additions, so slight differences can be observed due to rounding errors:
size=256 sum(openmp)=2.84217e-14 sum(no openmp)= 4.15362e-14
size=512 sum(openmp)=5.68434e-14 sum(no openmp)= 5.68434e-14
size=1024 sum(openmp)=0 sum(no openmp)=-2.83332e-14
Here is the link to CodeExplorer.

Efficient circular buffer in C++ which will be passed to C-style array function parameter

I'm seeking advice about my approach to the following problem. I have a constant input of data that I need to add to my buffer, and at every iteration I need to pass the buffered data to a function that accepts a C-style array through a pointer.
I'm worried about efficiency, so I pondered how I could store and manage the data in some sort of circular buffer, but also get it out as sequential raw data to pass to the said function.
My current approach can be summarized in the following example:
#include <iostream>
#include <array>
#include <algorithm>
void foo(double* arr, int size)
{
    for (int k = 0; k < size; k++)
        std::cout << arr[k] << ", ";
    std::cout << std::endl;
}

int main()
{
    const int size = 20;
    std::array<double, size> buffer{};
    for (double data = 0.0; data < 50.0; data += 1.0)
    {
        std::move(std::next(std::begin(buffer)), std::end(buffer), std::begin(buffer));
        buffer.back() = data;
        foo(buffer.data(), size);
    }
}
In the real use case, the buffer also needs to be padded to a "const" size of data at the beginning (I use quotes here because the size may or may not be known at compile time, but once it is known, it will never change).
I store data in the std::array (or in std::vector if the size will not be known at compile-time) since the data is sequential in memory. When I need to insert new data, I use forward std::move to shift everything, and then I manually replace the last item. Finally, I just pass std::array::data() and its size to the function.
While at first glance this should work efficiently, reason tells me that because the data is stored sequentially, the whole buffer will still be copied with std::move, so each insert will be O(n).
The real buffer size will probably be only in the hundreds, and data arrives at 100 Hz max, but the problem is that I need the result of the called function as soon as possible, so I don't want to lose time on buffer management (even if we are talking about a few ms, or even less). I have many questions about this, but the short-list is as follows:
Is my approach too naive?
Is my reasoning about O(n) correct?
Are there any other pitfalls with this approach?
Do you have suggestions for some other approach that I should look into?
Thank you for the answer Werner. When I run this solution on Repl.it, I get:
it took an average of 21us and a max of 57382us
For comparison, my original idea with the same buffer size has the following result:
it took an average of 19us and a max of 54129us
This means that my initial approach indeed was naive :)
In the meantime, while waiting for the answer, I've come up with the following solution:
#include <iostream>
#include <array>
#include <algorithm>
#include <chrono>
void foo(double* arr, int size)
{
    for (int k = 0; k < size; k++)
        std::cout << arr[k] << ", ";
    std::cout << std::endl;
}

int main()
{
    const int buffer_size = 20;
    std::array<double, buffer_size * 2> buffer{};
    int buffer_idx = buffer_size;
    for (double data = 0.0; data < 100.0; data += 1.0)
    {
        buffer.at(buffer_idx - buffer_size) = data;
        buffer.at(buffer_idx++) = data;
        foo(buffer.data() + buffer_idx - buffer_size, buffer_size);
        buffer_idx -= buffer_size * (buffer_idx == buffer_size * 2);
    }
}
Since the size of the buffer is not a problem, I allocate twice the memory needed and insert data at two places, offset by the buffer size. When I reach the end, I just go back like a typewriter. The idea is that I fake the circular buffer by storing one extra copy of the data, so it can be read as if it had wrapped around a full circle.
For a buffer size of 50000, this gives me the following result, which is exactly what I wanted:
it took an average of 0us and a max of 23us
Besides the answer by stribor14 I have two other suggestions. These are only based on performance, so readable or maintainable code will not really be found here.
My first idea when reading the problem was to also allocate twice the amount of storage but only write it once. When all the places are written, the second half is copied over to the first half. My first instinct said this could perform better: the same total number of writes happens, but all of the writes are sequential (instead of every second write jumping to another place in the array).
#include <cstddef>
#include <cstring>
#include <array>

void foo(double* arr, int size); // as defined in the question

const size_t buffer_size = 50'000;

int main()
{
    std::array<double, 2 * buffer_size> buffer{};
    double* index = buffer.data();
    double* mid = index + buffer_size;
    for (double data = 0.0; data < 10 * buffer_size; data += 1.0)
    {
        if (index == mid)
        {
            index = buffer.data();
            std::memcpy(index, mid, buffer_size * sizeof(double));
        }
        *(index++ + buffer_size) = data;
        foo(index, buffer_size);
    }
}
Alternatively, I thought it would be possible to optimize OP's own answer to remove the array accesses. The idea is that buffer[buffer_idx - buffer_size] takes two additions to calculate the location of that value, namely *(buffer + buffer_idx - buffer_size). If buffer_idx contains a pointer, only one addition is needed. This gives the following code:
#include <cstddef>
#include <array>

void foo(double* arr, int size); // as defined in the question

const size_t buffer_size = 50'000;

int main()
{
    std::array<double, buffer_size * 2> buffer{};
    double* index = buffer.data();
    double* mid = buffer.data() + buffer_size;
    for (double data = 0.0; data < 10 * buffer_size; data += 1.0)
    {
        *index = data;
        *(index + buffer_size) = data;
        ++index;
        index -= buffer_size * (index == mid);
        foo(index, buffer_size);
    }
}
It was now that I noticed I was going down the rabbit hole of C++ optimization, so we couldn't stop there. To choose which implementation to use, I wanted to run a benchmark. Werner Pirkl gave a good starting point, but running it on our optimized code is nonsensical because the measured times are 0 µs. So let's change it a bit: I wrote an extra loop inside the benchmark to give it some runtime, and came up with:
const int repeats = 1000;
volatile double* ptr;
int duration = 0;
const size_t buffer_size = 50'000;

// ... Set up of the buffers and indices

for (int i = 0; i < repeats; ++i)
{
    auto t1 = std::chrono::high_resolution_clock::now();
    for (double data = 0.0; data < 10 * buffer_size; data += 1.0)
    {
        // ... add data to circular buffer
        ptr = // ... the start of the array
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    duration += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
}
(Note the use of a volatile double * to ensure that the raw pointer to the contiguous array is not optimized out.)
While running these tests, I noticed they are very dependent on compiler flags (-O2, -O3, -march=native, ...). I will give some results, but like all C++ benchmarks, take them with a grain of salt and run your own with a real-world workload. (The reported times are average ns per insertion.)
                  | with `memcpy` | stribor14 | `operator[]` | with pointers |
------------------|---------------|-----------|--------------|---------------|
-O2               |     1.38      |   1.57    |     1.41     |     1.15      |
-O3               |     1.37      |   1.63    |     1.36     |     1.09      |
-O3 -march=native |     1.35      |   1.61    |     1.34     |     1.09      |
Needless to say: I was quite disappointed about what I thought should perform the best. But as earlier stated, this benchmark is in no way representative of any real-world performance.
You'll always have to copy your data, as a "contiguous" ring buffer doesn't exist (maybe in some fancy silicon it does).
Also, you can't instantiate a std::array template with a runtime-defined size.
You could use a vector to achieve this:
#include <iostream>
#include <chrono>
#include <deque>
#include <vector>

int main() {
    std::vector<double> v;
    // pre-fill it a little
    for (double data = 0.0; data > -50000.0; data -= 1.0) {
        v.push_back(data);
    }
    size_t cnt = 0;
    int duration = 0;
    int max = 0;
    for (double data = 0.0; data < 50000.0; data += 1.0, ++cnt) {
        auto t1 = std::chrono::high_resolution_clock::now();
        v.push_back(data);
        v.erase(v.begin());
        // foo(v.data(), v.size());
        auto t2 = std::chrono::high_resolution_clock::now();
        auto delta = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
        duration += delta;
        if (max == 0 || max < delta) {
            max = delta;
        }
    }
    std::cout << "it took an average of " << duration / cnt << "us and a max of " << max << " us" << std::endl;
    return 0;
}
Output:
it took an average of 11us and a max of 245 us

How to optimize the following common loop?

I have this code:
#include <iostream>
#include <vector>
#include <ctime>
using namespace std;
void foo(int n, double* a, double* b, double* c, double* d, double* e, double* f, double* g)
{
    for (int i = 0; i < n; ++i)
        a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}

int main()
{
    int m = 1001001;
    vector<double> a(m), b(m), c(m), d(m), f(m);
    clock_t start = std::clock();
    for (int i = 0; i < 1000; ++i)
        foo(1000000, &a[0], &b[0], &c[0], &d[0], &d[1], &f[0], &f[1000]);
    double duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "Finished in " << duration << " seconds [CPU Clock] " << endl;
}
Can you give me a workable example of how to optimize it for better performance? Any compiler is fine, like the Intel C++ compiler or the Visual C++ compiler. Please also suggest a CPU with good performance for such a job.
The code in question is useless. It does lots of calculations with uninitialised variables and then ignores the results. Compilers are getting more and more clever at figuring out that kind of thing and removing all the code for this. So don't be surprised if code like this doesn't take any time at all.
In C, you would declare the pointers as const double* restrict (except a, which would be double* restrict), telling the compiler that all pointers except the first one point to data that isn't going to be modified during the loop; this allows the compiler to vectorise. Unfortunately, this is not a C++ feature, as far as I know.
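That said, most mainstream C++ compilers (GCC, Clang, MSVC) accept a non-standard __restrict extension with the same meaning; a sketch:

// Non-standard but widely supported: __restrict promises the compiler that
// writes through a never alias the other arguments, enabling vectorisation
void foo(int n, double* __restrict a,
         const double* __restrict b, const double* __restrict c,
         const double* __restrict d, const double* __restrict e,
         const double* __restrict f, const double* __restrict g)
{
    for (int i = 0; i < n; ++i)
        a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}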
If this was your real problem, you would just swap the inner and outer loop, and remove loop invariants like this:
void foo(int iter, int n, double* a, double* b, double* c, double* d, double* e, double* f, double* g)
{
    for (int i = 0; i < n; ++i) {
        double xa = a[i];
        double xb = b[i];
        double xr = c[i] * (d[i] + e[i] + f[i] + g[i]);
        for (int j = 0; j < iter; ++j)
            xa = xb * xa + xr;
        a[i] = xa;
    }
}
You'd probably do four iterations in parallel to hide the floating-point latency.
But in a real-life situation, you would observe that in each call you read about 40 MB, which is way beyond any cache, so you are limited by RAM speed. The usual solution is to split the work into smaller parts, for example 500 elements at a time, so everything fits into the L1 cache, then perform the operation on the same data 1000 times.
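A sketch of that splitting, applied to the call from the question (a chunk of 500 doubles keeps all seven streams within a typical 32 KB L1 cache; the results are identical because each element i is independent):

const int CHUNK = 500;
for (int base = 0; base + CHUNK <= 1000000; base += CHUNK)
    for (int iter = 0; iter < 1000; ++iter) // reuse the chunk while it is hot
        foo(CHUNK, &a[base], &b[base], &c[base], &d[base], &d[base + 1],
            &f[base], &f[base + 1000]);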
On Apple Clang, I tried:
using __restrict__ on the arguments to convince the compiler that there was no aliasing.
result: no change
distributing the computation over 8 threads in foo()
result: computation time increased from ~3 seconds to ~18 seconds!
using #pragma omp parallel for
result: the compiler ignored me and stayed with the original solution. ~3 seconds.
setting the command line option -march=native to allow the CPU's full awesomeness to shine
result: different assembler output (vectorisation applied), but the run time was still unchanged at ~3 s
Initial conclusions:
This problem is bound by memory access, not by the CPU.
You could experiment with prefetching the vectors into cache lines and then operating on them in lumps of 8 (8 doubles will fit into every cache line); see the sketch after this list.
Make sure that while you are operating on x[i] to x[i+7] you are prefetching x[i+8] to x[i+15].
This might not help, as you are using only additions and multiplications, which are so fast that your RAM may not be able to keep up anyway.
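A sketch of that prefetching idea inside foo, using the GCC/Clang builtin __builtin_prefetch (the prefetch distance of one cache line is illustrative and would need tuning):

// Work on one 64-byte cache line (8 doubles) per array while requesting the
// next line of each input stream
for (int i = 0; i + 8 <= n; i += 8)
{
    __builtin_prefetch(&a[i + 8]); __builtin_prefetch(&b[i + 8]);
    __builtin_prefetch(&c[i + 8]); __builtin_prefetch(&d[i + 8]);
    __builtin_prefetch(&e[i + 8]); __builtin_prefetch(&f[i + 8]);
    __builtin_prefetch(&g[i + 8]);
    for (int k = i; k < i + 8; ++k)
        a[k] = b[k] * a[k] + c[k] * (d[k] + e[k] + f[k] + g[k]);
}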
I think you should use multithreading. Change foo to take fromIndex and toIndex instead of n, and distribute the vectors over threads:
void foo(int fromIndex, int toIndex, double* a, double* b, double* c, double* d, double* e, double* f, double* g)
{
    for (int i = fromIndex; i < toIndex; ++i)
        a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}
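A sketch of dispatching that version over std::thread (the thread count and the even split are illustrative):

#include <thread>
#include <vector>

// Split [0, n) into one contiguous range per hardware thread, then join
void run_parallel(int n, double* a, double* b, double* c, double* d,
                  double* e, double* f, double* g)
{
    int nThreads = (int)std::thread::hardware_concurrency();
    if (nThreads == 0) nThreads = 1;
    std::vector<std::thread> workers;
    for (int t = 0; t < nThreads; ++t)
    {
        int from = n / nThreads * t;
        int to = (t == nThreads - 1) ? n : n / nThreads * (t + 1);
        workers.emplace_back(foo, from, to, a, b, c, d, e, f, g);
    }
    for (std::thread& w : workers)
        w.join();
}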