Arranging the grid size and block size - c++

I have 200 matrices A[i] (each of dimension 4096*48) and 48 vectors v[j] (each of dimension 48*1). I want to calculate A[i]*v[j] for i = 0:199, j = 0:47.
I have been thinking since yesterday about how to arrange my grid size and block size, but I haven't figured out an answer yet. Could anyone give me some advice?
The maximum number of threads per block is 512 in my working environment.
The following is my code. It works correctly; I have checked it. But it is slower than Matlab :(
#include <iostream>
#include <cstdlib>
#include <limits>
#include <mat.h>
#include <time.h>
#include <cuda_runtime.h>
#include "cuda.h"
using namespace std;
#define kernel_size 48
////////////////////////////////////////////
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc(&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(1,B.height);
dim3 dimGrid(A.height, C.width);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockCol = blockIdx.y;
int blockRow = blockIdx.x;
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[1][kernel_size];
__shared__ float Bs[kernel_size][1];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[0][row] = A.elements[blockRow * A.stride + row+B.height*blockCol];
Bs[row][0] = B.elements[row];
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < B.height; ++e)
{
Cvalue += As[0][e] * Bs[e][0];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// Write Csub to device memory
// Each thread writes one element
C.elements[blockRow * C.stride +blockCol]= Cvalue;
}
//////////////////
float * gen_matrix(int n /*row*/, int m /*col*/){
float *A;
//srand(1023);
A = (float *) malloc(n*m*sizeof(float));
for(int row = 0;row < n;row++)
for(int col = 0;col < m;col++) {
A[row*m+col] = rand()%10;
}
/*
// print matrix elements.
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j)
cout << " [" << i << "," << j << "] " << A[i*m+j] ;
cout << endl;
}
*/
return A;
}
int main()
{
int k=kernel_size;
int s=2000;
int m =4096;
//int m=2;
//int s=1;
int n = k*s;
float *Ae = gen_matrix(m,n);
float *Be= gen_matrix(k,1);
float *Ce=(float *) malloc(m*s*sizeof(float));
Matrix A ={n,m,n,Ae};
Matrix B ={1,k,1,Be};
Matrix C ={s,m,s,Ce};
const clock_t begin_time = clock();
MatMul(A, B, C);
std::cout << float( clock () - begin_time ) / CLOCKS_PER_SEC;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j <7; ++j)
cout << " [" << i << "," << j << "] " << Ce[i*m+j] ;
cout << endl;
}
//check
float *Ce2=(float *) malloc(s*m*sizeof(float));
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
Ce2[i*s+j]=0;
}
}
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
for (int ind = 0; ind < k; ind++)
{
Ce2[i*s+j]=Ce2[i*s+j]+Ae[j*k+ind+i*k*s]*Be[ind];
// printf("%f---****%f\n",Ae[j*k+ind+i*k*s],Be[ind]);
}
if (Ce2[i*s+j]!= Ce[i*s+j])
{
printf("%f----%f\n",Ce2[i*s+j],Ce[i*s+j]);
}
}
}
free(Ae);
free(Be);
free(Ce);
}

This is just a matrix-matrix multiplication problem. If you want things to run fast, you should not be writing your own matrix-matrix multiply code. Use CUBLAS Sgemm.
Conceptually, if you arrange your A matrices like this:
[A0]
[A1]
[A2]
...
[A199]
then you will have a new matrix AA that is (4096*200) rows x 48 columns.
Arrange your 48 V vectors (48x1) in a 48x48 matrix (VV):
[V0][V1][V2]...[V47]
(each V vector is a column of the new matrix VV)
You now have a single matrix multiplication problem (AA*VV): a (4096*200)x48 matrix multiplied by a 48x48 matrix yields a (4096*200)x48 result. Each column of this result is a vector of length 4096*200 that contains the 200 individual matrix-vector products stacked on top of each other; the 200 results per column times 48 columns give you all of the results your original problem would produce. The first column contains the results of [V0] multiplied by each of the 200 A matrices, the second column contains the results of [V1] multiplied by each of the 200 A matrices, and so on.
Once you have arranged your data like this, using CUBLAS Sgemm should be the quickest possible approach on the GPU. Note that CUBLAS expects the underlying storage to be column-major, so if you are rearranging your data, you will probably want to keep this in mind. There is a CUDA sample code for CUBLAS matrix multiplication.
In your code it appears you actually have 2000 A matrices, but your question refers to 200. I have used 200 in the example in my answer, but the concept is the same with 2000 A matrices.
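For reference, here is a minimal sketch of that single Sgemm call, assuming the stacked (4096*200)x48 matrix AA, the 48x48 matrix VV, and the (4096*200)x48 result CC have already been packed in column-major order and copied to the device (d_AA, d_VV, d_CC and the function name are illustrative; error checking is omitted for brevity):

#include <cublas_v2.h>

void stacked_matvec(const float* d_AA, const float* d_VV, float* d_CC)
{
    const int M = 4096 * 200;   // rows of AA (all A matrices stacked vertically)
    const int N = 48;           // columns of VV (number of v vectors)
    const int K = 48;           // columns of AA = rows of VV
    const float alpha = 1.0f, beta = 0.0f;

    cublasHandle_t handle;
    cublasCreate(&handle);
    // CC = alpha * AA * VV + beta * CC, all matrices column-major on the device
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                M, N, K,
                &alpha,
                d_AA, M,    // lda = number of rows of AA
                d_VV, K,    // ldb = number of rows of VV
                &beta,
                d_CC, M);   // ldc = number of rows of CC
    cublasDestroy(handle);
}

In real code you should also check the status values returned by the CUBLAS calls.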

Related

Why is Armadillo so slow compared to a C-style array in a simple row-wise computational task

I'm currently computing a small quantity for each value of a big matrix (millions of rows, fewer than 1000 columns) while considering each row independently.
More precisely, for each value M(i,j) in row i, column j of this matrix, the quantity is simply [ M(i,j) - mean(i,s) ] / std(i,s), where s is the subset M(i,:) - j,
in other words, s is the set of all values of row i except the value at column j.
I compared two implementations, one using C-style arrays and one using Armadillo, and the Armadillo one is roughly twice as slow in terms of execution time. I expected a similar or slightly slower execution time, but plain C arrays seem to dramatically improve the performance.
Is there any particular reason, or is there something that I missed somewhere? Here is an example compiled with: -O2 -lstdc++ -DARMA_DONT_USE_WRAPPER -lopenblas -llapack -lm. I also tried ARMA_NO_DEBUG without success.
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <armadillo>
#include <chrono>
using namespace std::chrono;
/***************************
* main()
***************************/
int main( int argc, char *argv[] )
{
unsigned nrows = 2000000; //number of rows
unsigned ncols = 100; //number of cols
const arma::mat huge_mat = arma::randn(nrows, ncols); //create huge matrix
const arma::uvec vec = arma::linspace<arma::uvec>( 0, huge_mat.n_cols-1, huge_mat.n_cols); //create a vector of [0,...,n]
arma::rowvec inds = arma::zeros<arma::rowvec>( huge_mat.n_cols-1 ); //-1 since we remove only one value at each step.
arma::colvec simuT = arma::zeros<arma::colvec>( ncols ); //let's store the results in this simuT vector.
high_resolution_clock::time_point t1 = high_resolution_clock::now();
//compute some normalization over each value of line of this huge matrix:
for(unsigned i=0; i < nrows; i++) {
const arma::rowvec current_line = huge_mat.row(i); //extract current line
//for each observation in current_line:
for(unsigned j=0; j < ncols; j++) {
//Take care of side effects first:
if( j == 0 )
inds = current_line(arma::span(1, ncols-1));
else
if( j == 1 ) {
inds(0) = current_line(0);
inds(arma::span(1, ncols-2)) = current_line( arma::span(2, ncols-1) );
} else
inds(arma::span(0, j-1)) = current_line( arma::span(0, j-1) );
//Let's do some computation: huge_mat(i,j) - mean[huge_mat(i,:)] / std([huge_mat(i,:)]) //can compute the mean and std first... for each line.
simuT(j) = (current_line(j) - arma::mean(inds)) / ( std::sqrt( 1+1/((double) ncols-1) ) * arma::stddev(inds) );
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "ARMADILLO: " << duration << " secs\n";
//------------------PLAIN C Array
double *Mat_full;
double *output;
unsigned int i,j,k;
double mean=0, stdd=0;
double sq_diff_sum = 0, sum=0;
double diff = 0;
Mat_full = (double *) malloc(ncols * nrows * sizeof(double));
output = (double *) malloc(nrows * ncols * sizeof(double));
std::vector< std::vector<double> > V(huge_mat.n_rows);
//Some UGLY copy from arma::mat to double* using a vector:
for (size_t i = 0; i < huge_mat.n_rows; ++i)
V[i] = arma::conv_to< std::vector<double> >::from(huge_mat.row(i));
//then dump to Mat_full array:
for (i=0; i < V.size(); i++)
for (j=0; j < V[i].size(); j++)
Mat_full[i + huge_mat.n_rows * j] = V[i][j];
t1 = high_resolution_clock::now();
for(i=0; i < nrows; i++)
for(j=0; j < ncols; j++)
{
//compute mean of subset-------------------
sum = 0;
for(k = 0; k < ncols; k++)
if(k!=j)
{
sum = sum + Mat_full[i+k*nrows];
}
mean = sum / (ncols-1);
//compute standard deviation of subset-----
sq_diff_sum = 0;
for(k = 0; k < ncols; k++)
if(k!=j)
{
diff = Mat_full[i+k*nrows] - mean;
sq_diff_sum += diff * diff;
}
stdd = sqrt(sq_diff_sum / (ncols-2));
//export to plain C array:
output[i*ncols+j] = (Mat_full[i+j*nrows] - mean) / (sqrt(1+1/(((double) ncols)-1))*stdd);
}
t2 = high_resolution_clock::now();
duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "C ARRAY: " << duration << " secs\n";
}
In particular, the calls to arma::mean and arma::stddev seem to perform poorly when comparing execution times. I did not perform any in-depth analysis of how the matrix size affects performance, but it seems that for small values of nrows the plain C version tends to be (very much) faster. For a simple test using this setup I got:
ARMADILLO: 111 secs
C ARRAY: 79 secs
in execution time.
EDIT
Here is a modification where we work column-wise instead of row-wise and treat each column independently, as suggested by @rubenvb and @mtall. The resulting execution time is slightly decreased (ARMADILLO: 104 secs now), showing some improvement over working row-wise:
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <armadillo>
#include <chrono>
using namespace std::chrono;
/***************************
* main()
***************************/
int main( int argc, char *argv[] )
{
unsigned nrows = 100; //number of rows
unsigned ncols = 2000000; //number of cols
const arma::mat huge_mat = arma::randn(nrows, ncols); //create huge matrix
const arma::uvec vec = arma::linspace<arma::uvec>( 0, huge_mat.n_rows-1, huge_mat.n_rows); //create a vector of [0,...,n]
arma::colvec inds = arma::zeros<arma::colvec>( huge_mat.n_rows-1 ); //-1 since we remove only one value at each step.
arma::rowvec simuT = arma::zeros<arma::rowvec>( nrows ); //let's store the results in this simuT vector.
high_resolution_clock::time_point t1 = high_resolution_clock::now();
//compute some normalization over each value of line of this huge matrix:
for(unsigned i=0; i < ncols; i++) {
const arma::colvec current_line = huge_mat.col(i); //extract current line
//for each observation in current_line:
for(unsigned j=0; j < nrows; j++) {
//Take care of side effects first:
if( j == 0 )
inds = current_line(arma::span(1, nrows-1));
else
if( j == 1 ) {
inds(0) = current_line(0);
inds(arma::span(1, nrows-2)) = current_line( arma::span(2, nrows-1) );
} else
inds(arma::span(0, j-1)) = current_line( arma::span(0, j-1) );
//Let's do some computation: huge_mat(i,j) - mean[huge_mat(i,:)] / std([huge_mat(i,:)]) //can compute the mean and std first... for each line.
simuT(j) = (current_line(j) - arma::mean(inds)) / ( std::sqrt( 1+1/((double) nrows-1) ) * arma::stddev(inds) );
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "ARMADILLO: " << duration << " secs\n";
}
The reason is that Armadillo uses column-major ordering in mat, while your C array uses row-major ordering. This matters because your processor can use instruction vectorization to process multiple elements at once, and that requires contiguous memory chunks.
To verify whether this is the cause, do the same calculation but for columns instead of rows, and check the difference.
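To illustrate the point in isolation, here is a small, hypothetical micro-benchmark (not part of the original post) that reduces the same arma::mat once column by column and once row by row; with Armadillo's column-major storage the first loop walks contiguous memory while the second one is strided:

#include <armadillo>
#include <chrono>
#include <iostream>

int main() {
    const arma::uword n = 4000;
    arma::mat M = arma::randn(n, n);        // Armadillo stores this column-major
    using clk = std::chrono::high_resolution_clock;

    auto t1 = clk::now();
    double s_col = 0;
    for (arma::uword j = 0; j < n; ++j)
        s_col += arma::mean(M.col(j));      // contiguous access
    auto t2 = clk::now();

    double s_row = 0;
    for (arma::uword i = 0; i < n; ++i)
        s_row += arma::mean(M.row(i));      // strided access
    auto t3 = clk::now();

    std::cout << "column-wise: " << std::chrono::duration<double>(t2 - t1).count() << " s\n"
              << "row-wise:    " << std::chrono::duration<double>(t3 - t2).count() << " s\n"
              << s_col << " " << s_row << "\n"; // print the sums so the loops are not optimized away
    return 0;
}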

How to do a matrix multiplication using <threads> and a 1-D array in c++?

I am trying to do a matrix multiplication using threads. But I do not get correct values. Since the matrix can be very large I use the heap memory. The matrix is thus stored in a 1-D array.
The matrix is always a square matrix thus the number of rows and the number of columns is equal to the square root of the array length. If the array length is 16 then the numbers of rows is 4 and the number of columns is also 4.
I cannot use std::vector, which is why I use std::unique_ptr.
There are 4 threads, and each of them receives 1/4 of the original array to work with. This doesn't work due to the nature of matrix multiplication, and I can't seem to find the right solution. How can I split the task among 4 threads?
auto matrixmultiplication(float* &array1, float* &array2, int arrayLength) {
unique_ptr<float[]> arrayOut(new float[arrayLength]);
auto numberOfThreads = 4;
auto widthMatrix = (int)sqrt(arrayLength);
auto elementsPerThread = (int)sqrt(arrayLength / numberOfThreads);
auto mul = [](auto* array1, auto* array2, auto* array3, auto dimension) {
for (auto x = 0; x < dimension; x++) {
for (auto y = 0; y < dimension; y++) {
array3[dimension * x + y] = 0;
for (auto z = 0; z < dimension; z++) {
array3[dimension * x + y] += array1[dimension * x + z] * array2[dimension * z + y];
}
}
}
};
vector<thread> threads;
for (auto i = 0; i < numberOfThreads; i++) {
threads.push_back(
thread(
mul,
array1 + i * elementsPerThread,
array2,
arrayOut.get() + i * elementsPerThread,
elementsPerThread
)
);
}
for (auto &thread : threads) {
thread.join();
}
return arrayOut;
};
I would have the threads start processing from consecutive rows of the first matrix, i.e. the 0th thread processes the 0th row, the 1st thread the 1st row, and so on for all n threads.
After a thread has processed a row, it jumps ahead by the number of threads, i.e. with 2 threads, after the 0th thread has finished processing the 0th row it jumps to the 2nd row and processes it.
Let's see it in a working example:
#include <iostream>
#include <memory>
#include <vector>
#include <thread>
// multiplies the specified row and column from specified matrices
void multiply(const int* m_1, const int* m_2,
std::size_t size, std::size_t row, std::size_t col, int* m_res) {
for(std::size_t i = 0; i < size; ++i)
m_res[row * size + col] += m_1[row * size + i] * m_2[i * size + col];
}
int main() {
constexpr int N = 3, THREAD_NUM = 2;
// matrices to multiply and a matrix for result
std::unique_ptr<int[]> A(new int[N * N] {
11, 12, 13, 21, 22, 23, 31, 32, 33
});
std::unique_ptr<int[]> B(new int[N * N] {
1, 0, 0, 0, 1, 0, 0, 0, 1
});
std::unique_ptr<int[]> C(new int[N * N] {});
// create vector for running threads then assign threads to its elements
std::vector<std::thread> thread_group(THREAD_NUM);
for(int thread_i = 0; thread_i < THREAD_NUM; ++thread_i)
thread_group[thread_i] = std::thread([&, thread_i]() {
// each thread starts from consecutive rows then steps by
// the number of threads
for(int row = thread_i; row < N; row += THREAD_NUM) {
for(int col = 0; col < N; ++col)
multiply(A.get(), B.get(), N, row, col, C.get());
}
});
for(auto& t : thread_group)
t.join();
// show the result
for(int i = 0; i < N; ++i) {
for(int j = 0; j < N; ++j)
std::cout << (j ? "\t" : "") << C[i * N + j];
std::cout << std::endl;
}
}
If you have two matrices that you want to multiply, let's call them A and B, you just need to split matrix A row-wise into 4 parts and pass each part to the corresponding thread. For matrix B, you need to pass a reference to the whole matrix to each thread, as you need all of its elements to calculate each row of A*B. This is thread-safe because the threads only read from matrix B without modifying it; a sketch of this split follows.
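A minimal sketch of that contiguous row-wise split (illustrative names, assuming square N x N matrices stored row-major in 1-D arrays): each thread computes a disjoint block of rows of the result and only reads A and B, so no synchronization beyond join() is needed.

#include <algorithm>
#include <thread>
#include <vector>

// compute rows [rowBegin, rowEnd) of C = A * B for n x n row-major matrices
void multiplyRows(const float* A, const float* B, float* C,
                  int n, int rowBegin, int rowEnd) {
    for (int r = rowBegin; r < rowEnd; ++r)
        for (int c = 0; c < n; ++c) {
            float sum = 0.0f;
            for (int k = 0; k < n; ++k)
                sum += A[r * n + k] * B[k * n + c];
            C[r * n + c] = sum;
        }
}

void parallelMultiply(const float* A, const float* B, float* C,
                      int n, int numberOfThreads = 4) {
    std::vector<std::thread> threads;
    const int rowsPerThread = (n + numberOfThreads - 1) / numberOfThreads;
    for (int t = 0; t < numberOfThreads; ++t) {
        const int begin = t * rowsPerThread;
        const int end = std::min(n, begin + rowsPerThread);
        if (begin >= end) break;   // more threads than rows
        threads.emplace_back(multiplyRows, A, B, C, n, begin, end);
    }
    for (auto& th : threads)
        th.join();
}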

Pass cv::Mat to labview

I am trying to build a DLL that includes the OpenCV PCA so that it can be used from LabVIEW.
I have defined the function:
extern "C" __declspec(dllexport) int __cdecl doPCA(float *input,int32_t input_rows,int32_t input_cols,double maxComponents,float *output);
And wrote it like:
int __cdecl doPCA(float *input,int32_t input_rows, int32_t input_cols,double maxComponents,float *output)
{
Mat pcaset = Mat(input_rows,input_cols, CV_32FC1, &input); //CV_32FC1 is for float valued pixel
PCA pca(pcaset, // pass the data
Mat(), // no pre-computed mean vector, so let the PCA engine compute it
CV_PCA_DATA_AS_ROW, // the vectors are stored as matrix rows (use PCA::DATA_AS_COL if they are the matrix columns)
2 // specify how many principal components to retain
);
int i, j;
for(i = 0; i < input_rows; i++)
{
for(j = 0; j < input_cols; j++)
{
output[(i * input_cols) + j] = pca.eigenvectors.data[(i * input_cols) + j]; // Write Values to 1D output array
}
}
if(pca.eigenvectors.empty()){return 0;} // is empty
if(!pca.eigenvectors.empty()){return 1;} // is not empty
}
On the LabVIEW side I access the function through the compiled DLL.
But I can't figure out how to pass the values from the pca.eigenvectors cv::Mat to the 1D float output array.
int i, j;
for(i = 0; i < input_rows; i++)
{
for(j = 0; j < input_cols; j++)
{
output[(i * input_cols) + j] = pca.eigenvectors.data[(i * input_cols) + j]; // Write Values to 1D output array
}
}
Could anyone give a hint?
I learned how to do PCA from the page that Miki gave.
This is my code to do a similar thing.
///! Convert pointer to cv::Mat, do PCA, and convert back.
///! 2017.10.05 10:28:25 CST
int doPCA(float* data, int rows, int cols, int maxC, float* eigenvecs ) {
// convert pointer to Mat, CV_32FC1 is for float valued pixel.
Mat pcaset = Mat(rows,cols, CV_32FC1, data);
// let OpenCV compute the eigenvectors, treating the data as rows, and retain the first maxC principal components.
// pca.mean : mean of the samples as a row matrix
// pca.eigenvectors : eigenvectors as row matrix
maxC = (maxC >0 && maxC <= rows)?maxC:rows;
PCA pca(pcaset, Mat(), CV_PCA_DATA_AS_ROW,maxC);
cout << "Mean:\n"<< pca.mean <<endl;
cout << "Eigen vectors:\n"<<pca.eigenvectors<<endl;
if(pca.eigenvectors.empty()) {
return 0; // is empty
}
float *pvec = eigenvecs;
// get the eigenvectors in reversed order
for(int i=maxC-1; i>=0; --i){
for(int j=0; j<cols; ++j){
*pvec++ = pca.eigenvectors.at<float>(i,j);
}
}
return 1;
}
int testPCA(){
// row first
float data[4] = {1.0,2.0,2.0,5.0};
int cols = 2;
int rows = 2;
// alloc two eigenvectors length: 2x2=4
float eigenvecs[4]={0};
// max components nums
int maxC = 2;
int res = doPCA(data, rows, cols, maxC, eigenvecs);
Mat principalComponents(Size(cols, rows), CV_32FC1, eigenvecs);
cout << "Flag:\n" << res << endl;
cout << "Principal components:\n"<< principalComponents<<endl;
return 0;
}
Result:
Mean:
[1.5, 3.5]
Eigen vectors:
[0.31622776, 0.94868332;
0.94868332, -0.31622776]
Flag:
1
Principal components:
[0.94868332, -0.31622776;
0.31622776, 0.94868332]
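If the end goal is still a LabVIEW-callable DLL, the doPCA() above can be wrapped in an exported function with the same calling convention the question uses. A minimal sketch (the wrapper name doPCA_dll is illustrative, MSVC syntax as in the question):

#include <cstdint>

// doPCA() is the function defined in the answer above
int doPCA(float* data, int rows, int cols, int maxC, float* eigenvecs);

extern "C" __declspec(dllexport) int __cdecl doPCA_dll(float* input, int32_t input_rows,
                                                       int32_t input_cols, int32_t maxComponents,
                                                       float* output)
{
    return doPCA(input, (int)input_rows, (int)input_cols, (int)maxComponents, output);
}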

Vectors and matrices in C++ for generating a spectrogram

This is my first attempt to generate a spectrogram of a sinusoidal signal with C++.
To generate the spectrogram:
I divided the real sinusoidal signal into B blocks
Applied Hanning window on each block (I assumed there is no overlap). This should give me the inputs for the fft, in[j][k] where k is the block number
Apply fft on in[j][k] for each block and store it.
Here is the script:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <fftw3.h>
#include <iostream>
#include <cmath>
#include <fstream>
using namespace std;
int main(){
int i;
int N = 500; // sampled
int Windowsize = 100;
double Fs = 200; // sampling frequency
double T = 1 / Fs; // sample time
double f = 50; // frequency
double *in;
fftw_complex *out;
double t[N]; // time vector
fftw_plan plan_forward;
std::vector<double> signal(N);
int B = N / Windowsize; //number of blocks
in = (double*)fftw_malloc(sizeof(double) * N);
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
//Generating the signal
for(int i = 0; i <= N; i++){
t[i] = i * T;
signal[i] = 0.7 * sin(2 * M_PI * f * t[i]);// generate sine waveform
}
//Applying the Hanning window function on each block B
for(int k = 0; i <= B; k++){
for(int j = 0; j <= Windowsize; j++){
double multiplier = 0.5 * (1 - cos(2 * M_PI * j / (N-1))); // Hanning Window
in[j][k] = multiplier * signal[j];
}
plan_forward = fftw_plan_dft_r2c_1d (Windowsize, in, out, FFTW_ESTIMATE );
fftw_execute(plan_forward);
v[j][k]=(20 * log(sqrt(out[i][0] * out[i][0] + out[i][1] * out[i][1]))) / N;
}
fftw_destroy_plan(plan_forward);
fftw_free(in);
fftw_free(out);
return 0;
}
So, the question is: what is the correct way to declare the in[j][k] and v[j][k] variables?
Update: I have declared v[j][k] as a matrix, double v[5][249], according to this site: http://www.cplusplus.com/doc/tutorial/arrays/ so now my script looks like:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <fftw3.h>
#include <iostream>
#include <cmath>
#include <fstream>
using namespace std;
int main()
{
int i;
double y;
int N=500;//Number of pints acquired inside the window
double Fs=200;//sampling frequency
int windowsize=100;
double dF=Fs/N;
double T=1/Fs;//sample time
double f=50;//frequency
double *in;
fftw_complex *out;
double t[N];//time vector
double tt[5];
double ff[N];
fftw_plan plan_forward;
double v [5][249];
in = (double*) fftw_malloc(sizeof(double) * N);
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
plan_forward = fftw_plan_dft_r2c_1d ( N, in, out, FFTW_ESTIMATE );
for (int i=0; i<= N;i++)
{
t[i]=i*T;
in[i] =0.7 *sin(2*M_PI*f*t[i]);// generate sine waveform
}
for (int k=0; k< 5;k++){
for (int i = 0; i<windowsize; i++){
double multiplier = 0.5 * (1 - cos(2*M_PI*i/(windowsize-1)));//Hanning Window
in[i] = multiplier * in[i+k*windowsize];
fftw_execute ( plan_forward );
for (int i = 0; i<= (N/2); i++)
{
v[k][i]=(20*log10(sqrt(out[i][0]*out[i][0]+ out[i][1]*out[i][1])));//Here I have calculated the y axis of the spectrum in dB
}
}
}
for (int k=0; k< 5;k++)//Center time for each block
{
tt[k]=(2*k+1)*T*(windowsize/2);
}
fstream myfile;
myfile.open("example2.txt",fstream::out);
myfile << "plot '-' using 1:2" << std::endl;
for (int k=0; k< 5;k++){
for (int i = 0; i<= ((N/2)-1); i++)
{
myfile << v[k][i]<< " " << tt[k]<< std::endl;
}
}
myfile.close();
fftw_destroy_plan ( plan_forward );
fftw_free ( in );
fftw_free ( out );
return 0;
}
I do not get errors anymore but the spectrogram plot is not right.
As indicated in FFTW's documentation, the size of the output (out in your case) when using fftw_plan_dft_r2c_1d is not the same as the size of the input. More specifically for an input of N real samples, the output consists of N/2+1 complex values. You may then allocate out with:
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (N/2 + 1));
For the spectrogram output you will then similarly have (N/2+1) magnitudes for each of the B blocks, resulting in the 2D array:
double** v = new double*[B];
for (int i = 0; i < B; i++){
v[i] = new double[(N/2+1)];
}
Also, note that you may reuse the input buffer in for each iteration (filling it with data for a new block). However since you have chosen to compute an N-point FFT and will be storing smaller blocks of Windowsize samples (in this case N=500 and Windowsize=100), make sure to initialize the remaining samples with zeros:
in = (double*)fftw_malloc(sizeof(double) * N);
for (int i = 0; i < N; i++){
in[i] = 0;
}
Note that in addition to the declaration and allocation of the in and v variables, the code you posted suffers from a few additional issues:
When computing the Hanning window, you should divide by the Windowsize-1 not N-1 (since in your case N correspond to the FFT size).
You are taking the FFT of the same block of signal over and over again since you are always indexing with j in the [0,Windowsize] range. You would most likely want to add an offset each time you process a different block.
Since the FFT size does not change, you only need to create the plan once. At the very least if you are going to create your plan at every iteration, you should similarly destroy it (with fftw_destroy_plan) at every iteration.
And a few additional points which may require some thoughts:
Scaling the log-scaled magnitudes by dividing by N might not do what you think. You are much more likely to want to scale the linear-scale magnitudes (i.e. divide the magnitude before taking the logarithm). Note that this will result in a constant offset of the spectrum curve, which for many applications is not that significant. If the scaling is important for your application, you may have a look at another answer of mine for more details.
The common formula 20*log10(x) typically used to convert linear scale to decibels uses a base-10 logarithm instead of the natural log (base e~2.7182) function which you've used. This would result in a multiplicative scaling (stretching), which may or may not be significant depending on your application.
To summarize, the following code might be more in line with what you are trying to do:
// Allocate & initialize buffers
in = (double*)fftw_malloc(sizeof(double) * N);
for (int i = 0; i < N; i++){
in[i] = 0;
}
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (N/2 + 1));
double** v = new double*[B];
for (int i = 0; i < B; i++){
v[i] = new double[(N/2+1)];
}
// Generate the signal
...
// Create the plan once
plan_forward = fftw_plan_dft_r2c_1d (Windowsize, in, out, FFTW_ESTIMATE);
// Applying the Hanning window function on each block B
for(int k = 0; k < B; k++){
for(int j = 0; j < Windowsize; j++){
// Hanning Window
double multiplier = 0.5 * (1 - cos(2 * M_PI * j / (Windowsize-1)));
in[j] = multiplier * signal[j+k*Windowsize];
}
fftw_execute(plan_forward);
for (int j = 0; j <= N/2; j++){
// Factor of 2 is to account for the fact that we are only getting half
// the spectrum (the other half is not returned by an R2C plan due to symmetry)
v[k][j] = 2*(out[j][0] * out[j][0] + out[j][1] * out[j][1])/(N*N);
}
// DC component and at Nyquist frequency do not have a corresponding symmetric
// value, so should not have been doubled up above. Correct those special cases.
v[k][0] *= 0.5;
v[k][N/2] *= 0.5;
// Convert to decibels
for (int j = 0; j <= N/2; j++){
// 20*log10(sqrt(x)) is equivalent to 10*log10(x)
// also use some small epsilon (e.g. 1e-5) to avoid taking the log of 0
v[k][j] = 10 * log10(v[k][j] + epsilon);
}
}
// Clean up
fftw_destroy_plan(plan_forward);
fftw_free(in);
fftw_free(out);
// Delete this last one after you've done something useful with the spectrogram
for (int i = 0; i < B; i++){
delete[] v[i];
}
delete[] v;
Looks like you're missing the initial declaration for 'v' altogether, and 'in' is not declared properly.
See this page for a related question about creating 2D arrays in C++. As I understand, fftw_malloc() is basically new() or malloc() but aligns the variable properly for the FFTW algorithm.
Since you're not supplying 'v' to anything related to FFTW, you could use standard malloc() for that.
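A minimal sketch of that idea (illustrative, assuming B blocks and a Windowsize-point real-to-complex FFT): keep fftw_malloc for the buffers FFTW actually touches, and use an ordinary C++ container for v.

#include <vector>
#include <fftw3.h>

void allocate_buffers(int B, int Windowsize) {
    // FFTW buffers must come from fftw_malloc so the planner gets properly aligned memory
    double* in = (double*) fftw_malloc(sizeof(double) * Windowsize);
    fftw_complex* out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (Windowsize / 2 + 1));

    // the spectrogram matrix is never passed to FFTW, so ordinary C++ storage works
    std::vector<std::vector<double>> v(B, std::vector<double>(Windowsize / 2 + 1, 0.0));

    // ... create the plan once, loop over the B blocks, window into in, execute,
    //     and write the magnitudes into v[k][j] as in the answer above ...

    fftw_free(in);
    fftw_free(out);
}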

Sparse matrix-vector multiplication in CUDA

I'm trying to implement a matrix-vector Multiplication on GPU (using CUDA).
In my C++ code (CPU), I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication using CUDA. I'm also using shared memory to improve the performance.
How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?
Below is my C++ function to load the matrix:
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
unsigned int w, h, x, y, num_entries;
float val;
std::ifstream file( filename );
if ( file )
{
file >> h >> w >> num_entries;
cout << w << " " << h << " " << num_entries << "\n";
assert( w == h || w == 1 || h == 1 );
if( dim != NULL ) *dim = std::max( w, h );
matrix = new float[ w * h ];
unsigned int i;
for( i = 0; i < num_entries; i++ ){
if( file.eof() ) break;
file >> y >> x >> val;
if( majority == ROW_MAJOR ){
matrix[ w * y + x ] = val;
} else if( majority == COLUMN_MAJOR ){
matrix[ h * x + y ] = val;
}
}
file.close();
if( i == num_entries )
std::cout << "\nFile read successfully\n";
else
std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";
// print first few elements
if( w == h ){
for( unsigned int i = 0; i < w; i++ ){
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j + w * i ] );
}
}
}
else{
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j ] );
}
}
} else {
std::cout << "Unable to open file\n";
return false;
}
return true;
}
Below is my CUDA Kernel function that handles the matrix-vector multiplication:
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
extern __shared__ float vec[];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.0;
int vOffs = 0;
//load vector into shared memory
for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) {
vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
}
//make sure all threads are synchronized
__syncthreads();
if (idx < dim) {
temp = 0.0;
//dot product (multiplication)
for (int i = 0; i < dim; i++){
temp += A[idx * dim + i] * vec[i];
}
x[idx] = temp;
}
}
What are the necessary changes that I have to make on my CUDA code to take into account that my matrix is a sparse matrix?
I found out from a forum that we can also use padding to be able to optimize the performance, but this requires me to change the way I read the matrix / sort the matrix. Any ideas how to implement this padding in the way I read the matrix and perform the calculation?
This is a very old post, and I want to highlight that cuSPARSE (for some time now) provides routines for the multiplication between sparse matrices or between a sparse matrix and a dense vector.
For the CSR format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below is a fully worked example showing its use.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(double));
double *h_x_dense = (double*)malloc(N * sizeof(double));
double *h_y_dense = (double*)malloc(N * sizeof(double));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Initializing the data and result vectors
for (int k = 0; k < N; k++) {
h_x_dense[k] = 1.;
h_y_dense[k] = 0.;
}
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N * sizeof(double)));
double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n");
const double alpha = 1.;
const double beta = 0.;
cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
&beta, d_y_dense));
gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nResult vector\n\n");
for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n");
}
You might want to have a look at the very good CUSP library. It implements sparse matrices in a variety of formats (COO, CSR, ELLPACK, diagonal, and a hybrid between ELLPACK and COO), each with its own advantages as described in the documentation. Most of them are standard sparse matrix formats about which you can find more information online. Not a complete answer to your question perhaps, but it should provide a starting point.
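For example, with CUSP the whole sparse matrix-vector product reduces to a few lines. A minimal sketch based on the CUSP examples (exact headers may differ between CUSP versions; the 5-point Poisson matrix is just a stand-in for your own data):

#include <cusp/csr_matrix.h>
#include <cusp/array1d.h>
#include <cusp/multiply.h>
#include <cusp/gallery/poisson.h>

int main() {
    // sparse matrix in CSR format stored on the device (example: 2D Poisson stencil)
    cusp::csr_matrix<int, float, cusp::device_memory> A;
    cusp::gallery::poisson5pt(A, 256, 256);

    // dense input and output vectors on the device
    cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1.0f);
    cusp::array1d<float, cusp::device_memory> y(A.num_rows, 0.0f);

    // y = A * x
    cusp::multiply(A, x, y);
    return 0;
}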