I am trying to run the C++ FFT code from this web page:
https://www.nayuki.io/page/free-small-fft-in-multiple-languages
I am pretty new to C++, so I don't know how to run it. Essentially, I want to pass a REAL vector and an IMAG vector to the program and get REAL and IMAG vectors back as output.
Say my REAL_VEC = {1, 2, 3, 4, 5}
Say my IMAG_VEC = {0, 1, 0, 1, 0}
I am pasting the code that I have, and it compiles. But where do I give the input, and how do I get the output (for the above vectors)?
//FftRealPairTest.cpp
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
#include "FftRealPair.hpp"
using std::cout;
using std::endl;
using std::vector;
// Private function prototypes
static void testFft(int n);
static vector<double> randomReals(int n);
// Mutable global variable
static double maxLogError = -INFINITY;
// Random number generation
std::default_random_engine randGen((std::random_device())());
int main() {
// Test diverse size FFTs
for (int i = 0, prev = 0; i <= 4; i++) {
int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
if (n > prev) {
testFft(n);
prev = n;
}
}
cout << endl;
cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
return EXIT_SUCCESS;
}
static void testFft(int n) {
vector<double> inputreal(randomReals(n));
vector<double> inputimag(randomReals(n));
vector<double> actualoutreal(inputreal);
vector<double> actualoutimag(inputimag);
Fft::transform(actualoutreal, actualoutimag);
}
static vector<double> randomReals(int n) {
std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
vector<double> result;
for (int i = 0; i < n; i++)
result.push_back(valueDist(randGen));
return result;
}
/////////////////
//FftRealPair.cpp
/*
* Free FFT and convolution (C++)
*
* Copyright (c) 2017 Project Nayuki. (MIT License)
* https://www.nayuki.io/page/free-small-fft-in-multiple-languages
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
* - The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* - The Software is provided "as is", without warranty of any kind, express or
* implied, including but not limited to the warranties of merchantability,
* fitness for a particular purpose and noninfringement. In no event shall the
* authors or copyright holders be liable for any claim, damages or other
* liability, whether in an action of contract, tort or otherwise, arising from,
* out of or in connection with the Software or the use or other dealings in the
* Software.
*/
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include "FftRealPair.hpp"
using std::size_t;
using std::vector;
// Private function prototypes
static size_t reverseBits(size_t x, int n);
void Fft::transform(vector<double> &real, vector<double> &imag) {
size_t n = real.size();
if (n != imag.size())
throw "Mismatched lengths";
if (n == 0)
return;
else if ((n & (n - 1)) == 0) // Is power of 2
transformRadix2(real, imag);
else // More complicated algorithm for arbitrary sizes
transformBluestein(real, imag);
}
void Fft::inverseTransform(vector<double> &real, vector<double> &imag) {
transform(imag, real);
}
void Fft::transformRadix2(vector<double> &real, vector<double> &imag) {
// Length variables
size_t n = real.size();
if (n != imag.size())
throw "Mismatched lengths";
int levels = 0; // Compute levels = floor(log2(n))
for (size_t temp = n; temp > 1U; temp >>= 1)
levels++;
if (static_cast<size_t>(1U) << levels != n)
throw "Length is not a power of 2";
// Trigonometric tables
vector<double> cosTable(n / 2);
vector<double> sinTable(n / 2);
for (size_t i = 0; i < n / 2; i++) {
cosTable[i] = std::cos(2 * M_PI * i / n);
sinTable[i] = std::sin(2 * M_PI * i / n);
}
// Bit-reversed addressing permutation
for (size_t i = 0; i < n; i++) {
size_t j = reverseBits(i, levels);
if (j > i) {
std::swap(real[i], real[j]);
std::swap(imag[i], imag[j]);
}
}
// Cooley-Tukey decimation-in-time radix-2 FFT
for (size_t size = 2; size <= n; size *= 2) {
size_t halfsize = size / 2;
size_t tablestep = n / size;
for (size_t i = 0; i < n; i += size) {
for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
size_t l = j + halfsize;
double tpre = real[l] * cosTable[k] + imag[l] * sinTable[k];
double tpim = -real[l] * sinTable[k] + imag[l] * cosTable[k];
real[l] = real[j] - tpre;
imag[l] = imag[j] - tpim;
real[j] += tpre;
imag[j] += tpim;
}
}
if (size == n) // Prevent overflow in 'size *= 2'
break;
}
}
void Fft::transformBluestein(vector<double> &real, vector<double> &imag) {
// Find a power-of-2 convolution length m such that m >= n * 2 + 1
size_t n = real.size();
if (n != imag.size())
throw "Mismatched lengths";
size_t m = 1;
while (m / 2 <= n) {
if (m > SIZE_MAX / 2)
throw "Vector too large";
m *= 2;
}
// Trigonometric tables
vector<double> cosTable(n), sinTable(n);
for (size_t i = 0; i < n; i++) {
unsigned long long temp = static_cast<unsigned long long>(i) * i;
temp %= static_cast<unsigned long long>(n) * 2;
double angle = M_PI * temp / n;
// Less accurate alternative if long long is unavailable: double angle = M_PI * i * i / n;
cosTable[i] = std::cos(angle);
sinTable[i] = std::sin(angle);
}
// Temporary vectors and preprocessing
vector<double> areal(m), aimag(m);
for (size_t i = 0; i < n; i++) {
areal[i] = real[i] * cosTable[i] + imag[i] * sinTable[i];
aimag[i] = -real[i] * sinTable[i] + imag[i] * cosTable[i];
}
vector<double> breal(m), bimag(m);
breal[0] = cosTable[0];
bimag[0] = sinTable[0];
for (size_t i = 1; i < n; i++) {
breal[i] = breal[m - i] = cosTable[i];
bimag[i] = bimag[m - i] = sinTable[i];
}
// Convolution
vector<double> creal(m), cimag(m);
convolve(areal, aimag, breal, bimag, creal, cimag);
// Postprocessing
for (size_t i = 0; i < n; i++) {
real[i] = creal[i] * cosTable[i] + cimag[i] * sinTable[i];
imag[i] = -creal[i] * sinTable[i] + cimag[i] * cosTable[i];
}
}
void Fft::convolve(const vector<double> &x, const vector<double> &y, vector<double> &out) {
size_t n = x.size();
if (n != y.size() || n != out.size())
throw "Mismatched lengths";
vector<double> outimag(n);
convolve(x, vector<double>(n), y, vector<double>(n), out, outimag);
}
void Fft::convolve(
const vector<double> &xreal, const vector<double> &ximag,
const vector<double> &yreal, const vector<double> &yimag,
vector<double> &outreal, vector<double> &outimag) {
size_t n = xreal.size();
if (n != ximag.size() || n != yreal.size() || n != yimag.size()
|| n != outreal.size() || n != outimag.size())
throw "Mismatched lengths";
vector<double> xr(xreal);
vector<double> xi(ximag);
vector<double> yr(yreal);
vector<double> yi(yimag);
transform(xr, xi);
transform(yr, yi);
for (size_t i = 0; i < n; i++) {
double temp = xr[i] * yr[i] - xi[i] * yi[i];
xi[i] = xi[i] * yr[i] + xr[i] * yi[i];
xr[i] = temp;
}
inverseTransform(xr, xi);
for (size_t i = 0; i < n; i++) { // Scaling (because this FFT implementation omits it)
outreal[i] = xr[i] / n;
outimag[i] = xi[i] / n;
}
}
static size_t reverseBits(size_t x, int n) {
size_t result = 0;
for (int i = 0; i < n; i++, x >>= 1)
result = (result << 1) | (x & 1U);
return result;
}
///////////
//FftRealPair.hpp
/*
* Free FFT and convolution (C++)
*
* Copyright (c) 2017 Project Nayuki. (MIT License)
* https://www.nayuki.io/page/free-small-fft-in-multiple-languages
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
* - The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* - The Software is provided "as is", without warranty of any kind, express or
* implied, including but not limited to the warranties of merchantability,
* fitness for a particular purpose and noninfringement. In no event shall the
* authors or copyright holders be liable for any claim, damages or other
* liability, whether in an action of contract, tort or otherwise, arising from,
* out of or in connection with the Software or the use or other dealings in the
* Software.
*/
#pragma once
#include <vector>
namespace Fft {
/*
* Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
* The vector can have any length. This is a wrapper function.
*/
void transform(std::vector<double> &real, std::vector<double> &imag);
/*
* Computes the inverse discrete Fourier transform (IDFT) of the given complex vector, storing the result back into the vector.
* The vector can have any length. This is a wrapper function. This transform does not perform scaling, so the inverse is not a true inverse.
*/
void inverseTransform(std::vector<double> &real, std::vector<double> &imag);
/*
* Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
* The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
*/
void transformRadix2(std::vector<double> &real, std::vector<double> &imag);
/*
* Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
* The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
* Uses Bluestein's chirp z-transform algorithm.
*/
void transformBluestein(std::vector<double> &real, std::vector<double> &imag);
/*
* Computes the circular convolution of the given real vectors. Each vector's length must be the same.
*/
void convolve(const std::vector<double> &x, const std::vector<double> &y, std::vector<double> &out);
/*
* Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
*/
void convolve(
const std::vector<double> &xreal, const std::vector<double> &ximag,
const std::vector<double> &yreal, const std::vector<double> &yimag,
std::vector<double> &outreal, std::vector<double> &outimag);
}
If you look at the .hpp file that you posted, the first function, transform(), takes two inputs: your real and imaginary vectors. The FFT is done 'in place', so the result is returned in the same vectors.
If you want to give it a try, look at testFft() and initialize inputreal and inputimag with your data. The vectors are then copied into actualoutreal and actualoutimag (to avoid overwriting the original data) and passed to transform().
After that you should have your output in those same vectors (actualoutreal and actualoutimag).
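For instance, here is a minimal sketch of that approach (my adaptation of the test program above, keeping the original input vectors intact):
#include <cstddef>
#include <iostream>
#include <vector>
#include "FftRealPair.hpp"
int main() {
    // Original data, kept untouched
    std::vector<double> inputreal{1, 2, 3, 4, 5};
    std::vector<double> inputimag{0, 1, 0, 1, 0};
    // Work on copies so the inputs are not overwritten
    std::vector<double> actualoutreal(inputreal);
    std::vector<double> actualoutimag(inputimag);
    // In-place FFT on the copies
    Fft::transform(actualoutreal, actualoutimag);
    // Print the frequency-domain result
    for (std::size_t i = 0; i < actualoutreal.size(); i++)
        std::cout << actualoutreal[i] << " " << actualoutimag[i] << std::endl;
    return 0;
}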
This code does precisely what you want (requires C++11):
#include <cstddef>
#include <iostream>
#include <vector>
#include "FftRealPair.hpp"
int main() {
// Declare input
std::vector<double> real{1, 2, 3, 4, 5};
std::vector<double> imag{0, 1, 0, 1, 0};
// Do FFT
Fft::transform(real, imag);
// Print result
for (std::size_t i = 0; i < real.size(); i++) {
std::cout << real[i] << " " << imag[i] << std::endl;
}
return 0;
}
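If you also need to go back to the time domain, note the comment in the header above: inverseTransform() omits the 1/n scaling, so you have to divide by the length yourself. A minimal sketch of the round trip (my addition, following that comment):
#include <cstddef>
#include <iostream>
#include <vector>
#include "FftRealPair.hpp"
int main() {
    std::vector<double> real{1, 2, 3, 4, 5};
    std::vector<double> imag{0, 1, 0, 1, 0};
    Fft::transform(real, imag);         // forward FFT (in place)
    Fft::inverseTransform(real, imag);  // unscaled inverse FFT (in place)
    // Divide by n to recover the original values, since the library omits this scaling
    const std::size_t n = real.size();
    for (std::size_t i = 0; i < n; i++) {
        real[i] /= n;
        imag[i] /= n;
        std::cout << real[i] << " " << imag[i] << std::endl;
    }
    return 0;
}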
Related
I am following the eigendecomposition example from here:
https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/syevd/cusolver_syevd_example.cu
I need to do it for a Hermitian complex matrix. The problem is that the eigenvectors do not match the Matlab result at all.
Does anyone have any idea why this mismatch is happening?
I have also tried the cusolverDn SVD method to get the eigenvalues and eigenvectors, and that gives yet another result.
My code is here for convenience,
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "cusolver_utils.h"
int N = 16;
void BuildMatrix(cuComplex* input);
int main()
{
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
printf("*******************\n");
cuComplex* h_idata = (cuComplex*)malloc(sizeof(cuComplex) * N);
cuComplex* h_eigenVector = (cuComplex*)malloc(sizeof(cuComplex) * N); // eigen vector
float* h_eigenValue = (float*)malloc(sizeof(float) * 4); // eigen Value
BuildMatrix(h_idata);
int count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_idata[count].x, h_idata[count].y);
count++;
}
printf("\n");
}
printf("\n*****************\n");
/* step 1: create cusolver handle, bind a stream */
CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
// step 2: reserve memory in cuda and copy input data from host to device
cuComplex* d_idata;
float* d_eigenValue = nullptr;
int* d_info = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_idata, N * sizeof(cuComplex)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_eigenValue), N * sizeof(float)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
CUDA_CHECK(cudaMemcpyAsync(d_idata, h_idata, N * sizeof(cuComplex), cudaMemcpyHostToDevice, stream));
// step 3: query working space of syevd
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
int lwork = 0; /* size of workspace */
cuComplex* d_work = nullptr; /* device workspace*/
const int m = 4;
const int lda = m;
cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, &lwork);
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
// step 4: compute spectrum
cusolverDnCheevd(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, d_work, lwork, d_info);
CUDA_CHECK(
cudaMemcpyAsync(h_eigenVector, d_idata, N * sizeof(cuComplex), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(
cudaMemcpyAsync(h_eigenValue, d_eigenValue, 4 * sizeof(float), cudaMemcpyDeviceToHost, stream)); // eigenvalues are float, so copy sizeof(float), not sizeof(double)
int info = 0;
CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
std::printf("after syevd: info = %d\n", info);
if (0 > info)
{
std::printf("%d-th parameter is wrong \n", -info);
exit(1);
}
count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_eigenVector[count].x, h_eigenVector[count].y);
count++;
}
printf("\n");
}
printf("\n");
for (int i = 0; i < N / 4; i++)
{
std::cout << h_eigenValue[i] << std::endl;
}
printf("\n*****************\n");
/* free resources */
CUDA_CHECK(cudaFree(d_idata));
CUDA_CHECK(cudaFree(d_eigenValue));
CUDA_CHECK(cudaFree(d_info));
CUDA_CHECK(cudaFree(d_work));
CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaDeviceReset());
}
//0.5560 + 0.0000i - 0.4864 + 0.0548i 0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i 0.4317 + 0.0000i - 0.7318 - 0.2698i 1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i 1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i 1.3255 - 0.3344i - 2.4578 - 0.2609i 4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };
for (int i = 0; i < N; i++)
{
input[i].x = realVector.at(i) * std::pow(10, 11);
input[i].y = imagVector.at(i) * std::pow(10, 11);
}
}
I raised this issue on their GitHub (https://github.com/NVIDIA/CUDALibrarySamples/issues/58), but unfortunately no one is answering.
If anyone can help me solve this, that would be very helpful.
Please follow this post for the full answer:
https://forums.developer.nvidia.com/t/eigen-decomposition-of-hermitian-matrix-using-cusolver-does-not-match-the-result-with-matlab/204157
In theory, A*V - lambda*V = 0 should hold; in practice it will not be exactly zero. My expectation was that it would be very close to zero, something on the order of 1e-14. If the equation gives a value close to zero, the result is acceptable.
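To make that check concrete, here is a minimal host-side sketch (my own illustration, not part of the original program). It assumes the variable layout from the code above: a column-major m-by-m matrix A (h_idata, which is only overwritten on the device), the eigenvectors returned by cusolverDnCheevd stored one per column (h_eigenVector), and the eigenvalues (h_eigenValue). Note that the residual scales with the magnitude of A's entries.
#include <cstdio>
#include <cuComplex.h>
// Computes max |A*v_k - lambda_k*v_k| over all eigenpairs of an m-by-m
// Hermitian matrix A stored in column-major order.
float maxEigenResidual(const cuComplex* A, const cuComplex* V, const float* lambda, int m)
{
    float maxRes = 0.0f;
    for (int k = 0; k < m; k++) {          // eigenpair index
        for (int r = 0; r < m; r++) {      // row of A*v_k
            cuComplex acc = make_cuComplex(0.0f, 0.0f);
            for (int c = 0; c < m; c++)
                acc = cuCaddf(acc, cuCmulf(A[r + m * c], V[c + m * k]));
            cuComplex lv = make_cuComplex(lambda[k] * V[r + m * k].x, lambda[k] * V[r + m * k].y);
            float res = cuCabsf(cuCsubf(acc, lv));
            if (res > maxRes) maxRes = res;
        }
    }
    return maxRes;
}
// Example call after cudaStreamSynchronize(stream) in the program above:
// printf("max |A*v - lambda*v| = %g\n", maxEigenResidual(h_idata, h_eigenVector, h_eigenValue, 4));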
There are different algorithms for eigendecomposition, such as the Jacobi algorithm, Cholesky factorization, and so on. The program I provided in my post uses the function cusolverDnCheevd, which is based on LAPACK. The LAPACK documentation says it uses a divide-and-conquer algorithm for Hermitian matrices. Here is the link: http://www.netlib.org/lapack/explore-html/d9/de3/group__complex_h_eeigen_ga6084b0819f9642f0db26257e8a3ebd42.html#ga6084b0819f9642f0db26257e8a3ebd42
This is an optimized implementation of matrix multiplication; the routine performs the operation
C := C + A * B (where A, B, and C are n-by-n matrices stored in column-major format).
On exit, A and B keep their input values.
void matmul_optimized(int n, int *A, int *B, int *C)
{
    // For an effective bitwise calculation the matrices are stored as plain int arrays:
    // "multiplication" of elements is the & operator and "addition" is the ^ operator.
    int i, j, k;
    int cij;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            cij = C[i + j * n]; // accumulate in a local variable instead of writing to C inside the inner loop
            for (k = 0; k < n; ++k) {
                cij ^= A[i + k * n] & B[k + j * n]; // "multiply" with &, "add" with ^
            }
            C[i + j * n] = cij; // write the final result back into C
        }
    }
}
How do I further speed up this matrix multiplication, based on the function/method above?
The function has been tested up to 2048 by 2048 matrices.
matmul_optimized is compared against the reference matmul implementation in the driver below.
#include <stdio.h>
#include <stdlib.h>
#include "cpucycles.c"
#include "helper_functions.c"
#include "matmul_reference.c"
#include "matmul_optimized.c"
int main()
{
int i, j;
int n = 1024; // Number of rows or columns in the square matrices
int *A, *B; // Input matrices
int *C1, *C2; // Output matrices from the reference and optimized implementations
// Performance and correctness measurement declarations
long int CLOCK_start, CLOCK_end, CLOCK_total, CLOCK_ref, CLOCK_opt;
long int COUNTER, REPEAT = 5;
int difference;
float speedup;
// Allocate memory for the matrices
A = malloc(n * n * sizeof(int));
B = malloc(n * n * sizeof(int));
C1 = malloc(n * n * sizeof(int));
C2 = malloc(n * n * sizeof(int));
// Fill bits in A, B, C1
fill(A, n * n);
fill(B, n * n);
fill(C1, n * n);
// Initialize C2 = C1
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
C2[i * n + j] = C1[i * n + j];
// Measure performance of the reference implementation
CLOCK_total = 0;
for (COUNTER = 0; COUNTER < REPEAT; COUNTER++)
{
CLOCK_start = cpucycles();
matmul_reference(n, A, B, C1);
CLOCK_end = cpucycles();
CLOCK_total = CLOCK_total + CLOCK_end - CLOCK_start;
}
CLOCK_ref = CLOCK_total / REPEAT;
printf("n=%d Avg cycle count for reference implementation = %ld\n", n, CLOCK_ref);
// Measure performance of the optimized implementation
CLOCK_total = 0;
for (COUNTER = 0; COUNTER < REPEAT; COUNTER++)
{
CLOCK_start = cpucycles();
matmul_optimized(n, A, B, C2);
CLOCK_end = cpucycles();
CLOCK_total = CLOCK_total + CLOCK_end - CLOCK_start;
}
CLOCK_opt = CLOCK_total / REPEAT;
printf("n=%d Avg cycle count for optimized implementation = %ld\n", n, CLOCK_opt);
speedup = (float)CLOCK_ref / (float)CLOCK_opt;
// Check correctness by comparing C1 and C2
difference = 0;
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
difference = difference + C1[i * n + j] - C2[i * n + j];
if (difference == 0)
printf("Speedup factor = %.2f\n", speedup);
if (difference != 0)
printf("Reference and optimized implementations do not match\n");
//print(C2, n);
free(A);
free(B);
free(C1);
free(C2);
return 0;
}
You can try an algorithm like Strassen or Coppersmith-Winograd (there is also a good example of this available).
Or maybe try parallel computing, for example with std::async/std::future or std::thread; a sketch follows.
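As an illustration of the threading suggestion, here is a minimal sketch (mine, not a tuned implementation): split the columns of C across hardware threads, each thread running the same bitwise inner kernel as matmul_optimized above.
#include <thread>
#include <vector>

// Parallel version of the bitwise matmul above: each thread handles a block of columns of C.
void matmul_threaded(int n, const int *A, const int *B, int *C)
{
    unsigned num_threads = std::thread::hardware_concurrency();
    if (num_threads == 0) num_threads = 4; // fallback if the runtime cannot tell
    int cols_per_thread = (n + (int)num_threads - 1) / (int)num_threads;
    std::vector<std::thread> workers;

    for (unsigned t = 0; t < num_threads; ++t) {
        int j_begin = (int)t * cols_per_thread;
        int j_end = (j_begin + cols_per_thread < n) ? j_begin + cols_per_thread : n;
        workers.emplace_back([=]() {
            for (int j = j_begin; j < j_end; ++j) {      // columns owned by this thread
                for (int i = 0; i < n; ++i) {
                    int cij = C[i + j * n];
                    for (int k = 0; k < n; ++k)
                        cij ^= A[i + k * n] & B[k + j * n];
                    C[i + j * n] = cij;
                }
            }
        });
    }
    for (auto &w : workers) w.join();
}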
Optimizing matrix-matrix multiplication requires careful attention to be paid to a number of issues:
First, you need to be able to use vector instructions. Only vector instructions can access parallelism inherent in the architecture. So, either your compiler needs to be able to automatically map to vector instructions, or you have to do so by hand, for example by calling the vector intrinsic library for AVX-2 instructions (for x86 architectures).
Next, you need to pay careful attention to the memory hierarchy. Your performance can easily drop to less than 5% of peak if you don't do this.
Once you do this right, you will hopefully have broken the computation up into small enough computational chunks that you can also parallelize via OpenMP or pthreads.
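For example, here is a minimal sketch (my own, keeping the bitwise operation from the question) that combines simple cache blocking with OpenMP parallelization; compile with -fopenmp (GCC/Clang). The innermost loop runs over contiguous memory in the column-major layout, which gives the compiler's auto-vectorizer a chance.
// Blocked version of the bitwise matmul: C := C + A * B over n-by-n int matrices
// in column-major layout, with & as "multiply" and ^ as "add".
void matmul_blocked_omp(int n, const int *A, const int *B, int *C)
{
    const int BS = 64; // block size; tune for the target cache sizes
    #pragma omp parallel for collapse(2) schedule(static)
    for (int jj = 0; jj < n; jj += BS) {
        for (int ii = 0; ii < n; ii += BS) {
            for (int kk = 0; kk < n; kk += BS) {
                int j_max = (jj + BS < n) ? jj + BS : n;
                int i_max = (ii + BS < n) ? ii + BS : n;
                int k_max = (kk + BS < n) ? kk + BS : n;
                for (int j = jj; j < j_max; ++j) {
                    for (int k = kk; k < k_max; ++k) {
                        int b = B[k + j * n];
                        for (int i = ii; i < i_max; ++i)   // contiguous in A and C
                            C[i + j * n] ^= A[i + k * n] & b;
                    }
                }
            }
        }
    }
}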
A document that carefully steps through what is required can be found at http://www.cs.utexas.edu/users/flame/laff/pfhp/LAFF-On-PfHP.html. (This is very much a work in progress.) At the end of it all, you will have an implementation that gets close to the performance attained by high-performance libraries like Intel's Math Kernel Library (MKL) or the BLAS-like Library Instantiation Software (BLIS).
(And, actually, you CAN then also effectively incorporate Strassen's algorithm. But that is another story, told in Unit 3.5.3 of these notes.)
You may find the following thread relevant: How does BLAS get such extreme performance?
I am trying to build a sparse matrix using the Eigen or Armadillo library in C++ to solve a system of linear equations Ax=b. A is the coefficient matrix with dimension n*n, and b is the right-hand-side vector with dimension n.
The sparse matrix A looks like this, see the figure.
I had a look through the Eigen documentation, but I have a problem with defining and filling the sparse matrix in C++.
Could you please give me example code that defines the sparse matrix and fills in its values using the Eigen library in C++?
Consider, for example, a simple sparse matrix A:
1 2 0 0
0 3 0 0
0 0 4 5
0 0 6 7
int main()
{
SparseMatrix<double> A;
// fill the A matrix ????
VectorXd b, x;
SparseCholesky<SparseMatrix<double> > solver;
solver.compute(A);
x = solver.solve(b);
return 0;
}
The sparse matrix could be filled with the values mentioned in the post by using the .coeffRef() member function, as shown in this routine:
SparseMatrix<double> fillMatrix() {
int N = 4;
int M = 4;
SparseMatrix<double> m1(N,M);
m1.reserve(VectorXi::Constant(M, 4)); // 4: estimated number of non-zero entries per column
m1.coeffRef(0,0) = 1;
m1.coeffRef(0,1) = 2.;
m1.coeffRef(1,1) = 3.;
m1.coeffRef(2,2) = 4.;
m1.coeffRef(2,3) = 5.;
m1.coeffRef(3,2) = 6.;
m1.coeffRef(3,3) = 7.;
m1.makeCompressed();
return m1;
}
However, the SparseCholesky module (SimplicialCholesky<SparseMatrix<double> >) won't work in this case because the matrix is not Hermitian. The system could be solved with an LU or BiCGStab solver. Also note that the sizes of x and b need to be defined:
VectorXd b(A.rows()), x(A.cols());
In the case of larger sparse matrices, you may also want to look at the .reserve() function in order to allocate memory before filling in the elements. The .reserve() function can be used to provide an estimate of the number of non-zero entries per column (or per row, depending on the storage order; the default is column-major). In the example above that estimate is 4, which does not make much sense for such a small matrix, but the documentation states that it is preferable to overestimate the number of non-zeros per column.
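To make that concrete, here is a minimal sketch (my own, assuming Eigen 3.x and the fillMatrix() routine shown above) that solves the 4x4 example system with SparseLU, which handles non-symmetric matrices:
#include <iostream>
#include <Eigen/Dense>
#include <Eigen/Sparse>
using namespace Eigen;

// fillMatrix() is the routine defined above.
SparseMatrix<double> fillMatrix();

int main()
{
    SparseMatrix<double> A = fillMatrix();
    VectorXd b(A.rows()), x(A.cols());
    b << 1, 2, 3, 4;                         // example right-hand side

    SparseLU<SparseMatrix<double> > solver;  // LU works for non-symmetric sparse matrices
    solver.compute(A);
    if (solver.info() != Success) {
        std::cerr << "decomposition failed" << std::endl;
        return 1;
    }
    x = solver.solve(b);
    if (solver.info() != Success) {
        std::cerr << "solving failed" << std::endl;
        return 1;
    }
    std::cout << x << std::endl;
    return 0;
}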
Since this question also asks about Armadillo, here is the corresponding Armadillo-based code. Best to use Armadillo version 9.100+ or later, and link with SuperLU.
#include <armadillo>
using namespace arma;
int main()
{
sp_mat A(4,4); // don't need to explicitly reserve the number of non-zeros
// fill with direct element access
A(0,0) = 1.0;
A(0,1) = 2.0;
A(1,1) = 3.0;
A(2,2) = 4.0;
A(2,3) = 5.0;
A(3,2) = 6.0;
A(3,3) = 7.0; // etc
// or load the sparse matrix from a text file with the data stored in coord format
sp_mat AA;
AA.load("my_sparse_matrix.txt", coord_ascii);
vec b; // ... fill b here ...
vec x = spsolve(A,b); // solve sparse system
return 0;
}
See also the documentation for SpMat, element access, .load(), spsolve().
The coord file format is simple. It stores the non-zero values.
Each line contains:
row col value
The row and column indices start at zero. Example:
0 0 1.0
0 1 2.0
1 1 3.0
2 2 4.0
2 3 5.0
3 2 6.0
3 3 7.0
1000 2000 9.0
Values not explicitly listed are assumed to be zero.
#include <vector>
#include <iostream>
#include <Eigen/Dense>
#include <Eigen/Sparse>
#include <Eigen/Core>
#include <cstdlib>
using namespace Eigen;
using namespace std;
int main()
{
double L = 5; // Length
const int N = 120; // No of cells
double L_cell = L / N;
double k = 100; // Thermal Conductivity
double T_A = 100.;
double T_B = 200.;
double S = 1000.;
Vector<double, N> d, D, A, aL, aR, aP, S_u, S_p;
vector<double> xp;
xp.push_back((0 + L_cell) / 2.0);
double xm = xp[0];
for (int i = 0; i < N - 1; i++)
{
xm = xm + L_cell;
xp.push_back(xm);
}
for (int i = 0; i < N; i++)
{
A(i) = .1;
d(i) = L_cell;
D(i) = k / d(i);
}
aL(0) = 0;
aR(0) = D(0) * A(0);
S_p(0) = -2 * D(0) * A(0);
aP(0) = aL(0) + aR(0) - S_p(0);
S_u(0) = 2 * D(0) * A(0) * T_A + S * L_cell * A(0);
for (int i = 1; i < N - 1; i++)
{
aL(i) = D(i) * A(i);
aR(i) = D(i) * A(i);
S_p(i) = 0;
aP(i) = aL(i) + aR(i) - S_p(i);
S_u(i) = S * A(i) * L_cell;
}
aL(N - 1) = D(N - 1) * A(N - 1);
aR(N - 1) = 0;
S_p(N - 1) = -2 * D(N - 1) * A(N - 1);
aP(N - 1) = aL(N - 1) + aR(N - 1) - S_p(N - 1);
S_u(N - 1) = 2 * D(N - 1) * A(N - 1) * T_B + S * L_cell * A(N - 1);
typedef Eigen::Triplet<double> T;
std::vector<T> tripletList;
tripletList.reserve(N * 3);
Matrix<double, N, 3> v; // v is declared here
v << (-1) * aL, aP, (-1) * aR;
for (int i = 0, j = 0; i < N && j < N; i++, j++)
{
tripletList.push_back(T(i, j, v(i, 1)));
if (i + 1 < N && j + 1 < N)
{
tripletList.push_back(T(i + 1, j, v(i + 1, 0)));
tripletList.push_back(T(i, j + 1, v(i, 2)));
}
}
SparseMatrix<double> coeff(N, N);
coeff.setFromTriplets(tripletList.begin(), tripletList.end());
SimplicialLDLT<SparseMatrix<double> > solver;
solver.compute(coeff);
if (solver.info() != Success) {
cout << "decomposition failed" << endl;
return 1;
}
Vector<double, N> temperature;
temperature = solver.solve(S_u);
if (solver.info() != Success)
{
cout << "Solving failed" << endl;
return 1;
}
vector<double> Te = {}, x = {};
Te.push_back(T_A);
x.push_back(0);
for (int i = 0; i < N; i++)
{
Te.push_back(temperature(i));
x.push_back(xp[i]);
}
Te.push_back(T_B);
x.push_back(L);
for (int i = 0; i < N + 2; i++)
{
cout << x[i] << " " << Te[i] << endl;
}
return 0;
}
Here is full code for a solution of a numerical problem which uses SparseMatrix. Look at the matrix v: it holds the values of all the non-zero elements of the coeff matrix, which is yet to be defined at that point. In the next loop I make a series of tripletList.push_back(...) calls, adding a triplet consisting of the row index, the column index, and the corresponding value taken from v for each non-zero element of coeff. Then a SparseMatrix coeff of the appropriate size is declared, and its method setFromTriplets (see the documentation) sets its non-zero elements from the triplets in tripletList.
I have a program that solves for 1D Brownian motion using Euler's method.
Since it is a stochastic process, I want to average it over many particles. But I find that as I ramp up the number of particles, it overloads and I get a std::bad_alloc error, which I understand is a memory error.
Here is my full code:
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <ctime>
using namespace std;
// Box-Muller Method to generate gaussian numbers
double generateGaussianNoise(double mu, double sigma) {
const double epsilon = std::numeric_limits<double>::min();
const double tau = 2.0 * 3.14159265358979323846;
static double z0, z1;
static bool generate;
generate = !generate;
if (!generate) return z1 * sigma + mu;
double u1, u2;
do {
u1 = rand() * (1.0 / RAND_MAX);
u2 = rand() * (1.0 / RAND_MAX);
} while (u1 <= epsilon);
z0 = sqrt(-2.0 * log(u1)) * cos(tau * u2);
z1 = sqrt(-2.0 * log(u1)) * sin(tau * u2);
return z0 * sigma + mu;
}
int main() {
// Initialize Variables
double gg; // Gaussian Number Picked from distribution
// Integrator
double t0 = 0; // Setting the Time Window
double tf = 10;
double n = 5000; // Number of Steps
double h = (tf - t0) / n; // Time Step Size
// Set Constants
const double pii = atan(1) * 4; // pi
const double eta = 1; // viscous constant
const double m = 1; // mass
const double aa = 1; // radius
const double Temp = 30; // Temperature in Kelvins
const double KB = 1; // Boltzmann Constant
const double alpha = (6 * pii * eta * aa);
// More Constants
const double mu = 0; // Gaussian Mean
const double sigma = 1; // Gaussian Std Deviation
const double ng = n; // No. of pts to generate for Gauss distribution
const double npart = 1000; // No. of Particles
// Initial Conditions
double x0 = 0;
double y0 = 0;
double t = t0;
// Vectors
vector<double> storX; // Vector that keeps displacement values
vector<double> storY; // Vector that keeps velocity values
vector<double> storT; // Vector to store time
vector<double> storeGaussian; // Vector to store Gaussian numbers generated
vector<double> holder; // Placeholder Vector for calculation operations
vector<double> mainstore; // Vector that holds the final value desired
storT.push_back(t0);
// Prepares mainstore
for (int z = 0; z < (n+1); z++) {
mainstore.push_back(0);
}
for (int NN = 0; NN < npart; NN++) {
holder.clear();
storX.clear();
storY.clear();
storT.clear();
storT.push_back(0);
// Prepares holder
for (int z = 0; z < (n+1); z++) {
holder.push_back(0);
storX.push_back(0);
storY.push_back(0);
}
// Gaussian Generator
srand(time(NULL));
for (double iiii = 0; iiii < ng; iiii++) {
gg = generateGaussianNoise(0, 1); // generateGaussianNoise(mu,sigma)
storeGaussian.push_back(gg);
}
// Solver
for (int ii = 0; ii < n; ii++) {
storY[ii + 1] =
storY[ii] - (alpha / m) * storY[ii] * h +
(sqrt(2 * alpha * KB * Temp) / m) * sqrt(h) * storeGaussian[ii];
storX[ii + 1] = storX[ii] + storY[ii] * h;
holder[ii + 1] =
pow(storX[ii + 1], 2); // Finds the displacement squared
t = t + h;
storT.push_back(t);
}
// Updates the Main Storage
for (int z = 0; z < storX.size(); z++) {
mainstore[z] = mainstore[z] + holder[z];
}
}
// Average over the number of particles
for (int z = 0; z < storX.size(); z++) {
mainstore[z] = mainstore[z] / (npart);
}
// Outputs the data
ofstream fout("LangevinEulerTest.txt");
for (int jj = 0; jj < storX.size(); jj++) {
fout << storT[jj] << '\t' << mainstore[jj] << '\t' << storX[jj] << endl;
}
return 0;
}
As you can see, npart is the variable that I change to vary the number of particles. But after each iteration I clear my storage vectors like storX, storY, ... so on paper the number of particles should not affect memory: I am only making the loop repeat many more times and adding onto the main storage vector mainstore. I am running my code on a computer with 4 GB of RAM.
I would greatly appreciate it if anyone could point out my errors in logic or suggest improvements.
Edit: Currently the number of particles is set to npart = 1000.
When I try to ramp it up to, say, npart = 20000 or npart = 50000, it gives me memory errors.
Edit 2: I've edited the code to allocate an extra index to each of the storage vectors, but it does not seem to fix the memory overflow.
There is an out-of-bounds access in the solver part. storY has size n and you access index ii+1, where ii goes up to n-1. So, for the code provided, storY has size 5000; it may be accessed with indices between 0 and 4999 (inclusive), but you try to access index 5000. The same goes for storX, holder and mainstore.
Also, storeGaussian does not get cleared before new values are added. It grows by n on every pass of the npart loop, even though you only ever access its first n values in the solver part.
Please note that vector::clear removes all elements from the vector but does not necessarily change the vector's capacity (i.e. its storage array); see the documentation.
This won't cause the problem here, because you'll reuse the same array in the next runs, but it's something to be aware of when using vectors.
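A minimal sketch of the corresponding fix (my illustration, as changes to the program above): either clear storeGaussian at the top of the particle loop so its memory is reused, or size it once up front and overwrite it in place, so memory no longer grows with npart.
// Option 1: inside the particle loop, right before the Gaussian generator:
storeGaussian.clear();  // drop the previous particle's numbers; the capacity is reused

// Option 2: allocate once, before the particle loop ...
std::vector<double> storeGaussian(static_cast<std::size_t>(ng));
// ... and overwrite it in place inside the loop:
for (std::size_t k = 0; k < storeGaussian.size(); k++)
    storeGaussian[k] = generateGaussianNoise(mu, sigma);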
I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.
struct atom
{
    double x, y, z;
};

int main()
{
    // N: number of atoms in a molecule
    double DistanceMatrix[N][N];
    atom atoms[N];
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            DistanceMatrix[i][j] = (atoms[i].x - atoms[j].x) * (atoms[i].x - atoms[j].x) +
                                   (atoms[i].y - atoms[j].y) * (atoms[i].y - atoms[j].y) +
                                   (atoms[i].z - atoms[j].z) * (atoms[i].z - atoms[j].z);
}
Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.
If you meant to calculate the Euclidean distance, your calculation is not correct. You need the 3D version of the Pythagorean theorem.
I would use a SoA for storing the coordinates.
You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies within the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if the block's x dimension (blockDim.x) is divisible by 32, only threads sharing the same threadIdx.x / 32 value are within the same warp.
I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.
Addresses generated from the j index are less important, as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if blockDim.x is divisible by 32).
#include "cuda_runtime.h"
#include <iostream>
using namespace std;
const int N(20);
#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
int div_up(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z);
int main(int argc, char **argv)
{
double* atoms_x_h;
check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));
double* atoms_y_h;
check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));
double* atoms_z_h;
check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));
for (int i(0); i < N; ++i) {
atoms_x_h[i] = i;
atoms_y_h[i] = i;
atoms_z_h[i] = i;
}
double* atoms_x_d;
check(cudaMalloc(&atoms_x_d, N * sizeof(double)));
double* atoms_y_d;
check(cudaMalloc(&atoms_y_d, N * sizeof(double)));
double* atoms_z_d;
check(cudaMalloc(&atoms_z_d, N * sizeof(double)));
check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));
double* distances_d;
check(cudaMalloc(&distances_d, N * N * sizeof(double)));
// The kernel indexes i from the x dimension and j from the y dimension, so launch a 2D grid of 2D blocks.
dim3 threads_per_block(32, 8);
dim3 n_blocks(div_up(N, threads_per_block.x), div_up(N, threads_per_block.y));
calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);
check(cudaPeekAtLastError());
check(cudaDeviceSynchronize());
double* distances_h;
check(cudaMallocHost(&distances_h, N * N * sizeof(double)));
check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int i(0); i < N; ++i) {
for (int j(0); j < N; ++j) {
cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
}
}
check(cudaFree(distances_d));
check(cudaFreeHost(distances_h));
check(cudaFree(atoms_x_d));
check(cudaFreeHost(atoms_x_h));
check(cudaFree(atoms_y_d));
check(cudaFreeHost(atoms_y_h));
check(cudaFree(atoms_z_d));
check(cudaFreeHost(atoms_z_h));
return 0;
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z)
{
int i(threadIdx.x + blockIdx.x * blockDim.x);
int j(threadIdx.y + blockIdx.y * blockDim.y);
if (i >= N || j >= N) {
return;
}
distances[i + N * j] =
(atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
(atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
(atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
}