I need to fill a matrix with values returned from the function below:
__device__ float calc(float *ar, int m, float sum, int i, int j)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < m)
    {
        ar[idx] = __powf(ar[idx], i + j);
        atomicAdd(&sum, ar[idx]);
    }
    return sum;
}
The matrix is set up as a one-dimensional array and is filled through this kernel:
__global__ void createMatrix(float *A, float *arr, int size)
{
    A[threadIdx.y*size + threadIdx.x] = /*some number*/;
}
In theory it should be something like this:
__global__ void createMatrix(float *A, float *arr, int size)
{
    float sum = 0;
    A[threadIdx.y*size + threadIdx.x] = calc(arr, size, sum, threadIdx.x, threadIdx.y);
}
but it doesn't work that way; calc() always returns 0. Is there any way I can fill the matrix using a __global__ function? Thanks in advance.
You're passing sum by value rather than by reference, so none of your atomicAdd() calls have any effect on the zero-initialized value in the kernel.
However, even if you were to pass it by reference, this would still be a poorly designed kernel. You don't need the atomics at all when each thread keeps its own private sum variable (which it does). Also, your calc() function adds only a single value to each sum, while it seems you expect it to accumulate more than once.
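A minimal sketch of the per-thread approach (this assumes the intended math is that each element A[y][x] should hold the sum of arr[k]^(x+y) over the whole input array, which the question doesn't state explicitly):

// Hypothetical sketch: the accumulator is a local variable, so no atomics
// are needed, and each thread loops over the whole input array itself.
__device__ float calc(const float *ar, int m, int i, int j)
{
    float sum = 0.0f;
    for (int k = 0; k < m; k++)
        sum += __powf(ar[k], (float)(i + j));
    return sum;
}

__global__ void createMatrix(float *A, const float *arr, int size)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    if (x < size && y < size)
        A[y * size + x] = calc(arr, size, x, y);
}

Note this version no longer overwrites arr in place; writing ar[idx] while other threads read it, as the original calc() does, would be a race.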
I have a sparse matrix in cuSparse and I want to extract the diagonal. I can't seem to find a way to do it other than copying it back to CPU memory into an Eigen SparseMatrix, using the .diagonal() provided by Eigen, and then copying the result back to the GPU. Obviously this is pretty inefficient, so I am wondering if there's a way to do it directly on the GPU. Please see the code below for reference:
void CuSparseTransposeToEigenSparse(
    const int *d_row,
    const int *d_col,
    const double *d_val,
    const int num_non0,
    const int mat_row,
    const int mat_col,
    Eigen::SparseMatrix<double> &mat)
{
    std::vector<int> outer(mat_col + 1);
    std::vector<int> inner(num_non0);
    std::vector<double> value(num_non0);

    cudaMemcpy(outer.data(), d_row, sizeof(int) * (mat_col + 1), cudaMemcpyDeviceToHost);
    cudaMemcpy(inner.data(), d_col, sizeof(int) * num_non0, cudaMemcpyDeviceToHost);
    cudaMemcpy(value.data(), d_val, sizeof(double) * num_non0, cudaMemcpyDeviceToHost);

    Eigen::Map<Eigen::SparseMatrix<double>> mat_map(
        mat_row, mat_col, num_non0, outer.data(), inner.data(), value.data());
    mat = mat_map.eval();
}
int main(){
    int *d_A_row;
    int *d_A_col;
    double *d_A_val;
    int A_len;
    int num_A_non0;
    double *d_A_diag;
    // these values are filled with some computation

    // current solution
    Eigen::SparseMatrix<double> A;
    CuSparseTransposeToEigenSparse(
        d_A_row, d_A_col, d_A_val, num_A_non0, A_len, A_len, A);
    Eigen::VectorXd A_diag = A.diagonal();
    cudaMemcpy(d_A_diag, A_diag.data(), sizeof(double) * A_len, cudaMemcpyHostToDevice);
    // is there a way to fill in d_A_diag without copying back to CPU?
    return 0;
}
Just in case anyone is interested: I figured it out for the case of a CSR matrix. The custom kernel to do it looks like this:
__global__ static void GetDiagFromSparseMat(const int *A_row,
                                            const int *A_col,
                                            const double *A_val,
                                            const int A_len,
                                            double *A_diag){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < A_len){
        const int num_non0_row = A_row[x + 1] - A_row[x];
        A_diag[x] = 0.0;
        for (int i = 0; i < num_non0_row; i++){
            if (A_col[i + A_row[x]] == x){
                A_diag[x] = A_val[i + A_row[x]];
                break;
            }
        }
    }
}
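For completeness, a launch of this kernel could look like the sketch below; the block size of 256 is an arbitrary choice on my part, not something from the answer above:

// Hypothetical launch: one thread per matrix row, grid rounded up.
const int block = 256;
const int grid = (A_len + block - 1) / block;
GetDiagFromSparseMat<<<grid, block>>>(d_A_row, d_A_col, d_A_val, A_len, d_A_diag);
cudaDeviceSynchronize();  // then check cudaGetLastError() for launch errors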
So I was trying to work on a code about Jordan elimination when I encountered a problem:
Cannot convert 'float' to 'float (*)[100]' for argument '1' to 'int diabase(float (*)[100])'
What does this mean? And how can I fix it?
#include <stdio.h>
#define N 100

int read(float A[N][N]);
int jordan(float A[N][N], int n);
int print(float A[N][N], int n);

int main()
{
    int i, j, k, n, y;
    float A[N][N+1], c, x[N];
    n = read(A[N][N]);
    jordan(A[N][N]], n);
    print(A[N][N], n);
    return(0);
}
int read(float A[N][N]){
    int n, i, j;
    printf("Enter the size of matrix: ");
    scanf("%d", &n);
    printf("Enter the elements of augmented matrix row-wise:");
    for(i=1; i<=n; i++)
    {
        for(j=1; j<=(n+1); j++)
        {
            printf(" A[%d][%d]:", i, j);
            scanf("%f", &A[i][j]);
        }
    }
    return n;
}
int print(float A[N][N], int n){
    int i;
    float x[n];
    printf("The solution is:");
    for(i=1; i<=n; i++)
    {
        x[i] = A[i][n+1] / A[i][i];
        printf("\n x%d=%f\n", i, x[i]);
    }
    return 0;
}
int jordan(float A[N][N], int n){
    int i, j, k;
    float c;
    for(j=1; j<=n; j++)
    {
        for(i=1; i<=n; i++)
        {
            if(i != j)
            {
                c = A[i][j] / A[j][j];
                for(k=1; k<=n+1; k++)
                {
                    A[i][k] = A[i][k] - c * A[j][k];
                }
            }
        }
    }
    return 0;
}
In all of these function calls
n = read(A[N][N]);
jordan(A[N][N]], n);
print(A[N][N], n);
you are passing an element of the array instead of the array. If you want to pass the array, just use the name of the array without any indexes:
n = read(A);
jordan(A, n);
print(A, n);
You are going to run into another problem with this, though, as your functions expect a 2D array of 100 x 100 but the array you created in main() is 100 x 101, which is not going to match. You either need to make the array in main() 100 x 100 or change your functions to take arrays of 100 x 101, for example as sketched below.
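A sketch of the second option (matching the prototypes to the augmented matrix, rather than the poster's original declarations):

/* Sketch: make every prototype take the N x (N+1) augmented matrix
   that main() actually declares. */
int read(float A[N][N+1]);
int jordan(float A[N][N+1], int n);
int print(float A[N][N+1], int n);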
When you do function(A[N][N]) you are passing a single float value to function, so here:
n = read(A[N][N]);
jordan(A[N][N], n);
print(A[N][N], n);
you are passing single values to read, jordan and print. And by the way, you are accessing cells that do not exist (A[N] is out of bounds, since A is float [N][N+1]).
If you want to pass A to your functions, simply do:
n = read(A);
jordan(A, n);
print(A, n);
Apart from that, there are some mistakes in your code:
Your functions want a float [N][N] but A is float [N][N+1], so you will have to modify something.
You are accessing cells from 1 to n/n+1, but C arrays have cells from 0 to n-1/n, so your loops should go from 0 to n-1/n (see the sketch after the corrected main below).
Your main should be
int main()
{
    float A[N][N];
    int n = read(A);
    jordan(A, n);
    print(A, n);
    return(0);
}
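And to illustrate the 0-based indexing point: a sketch (not the poster's exact code) of the read loops rewritten to run from 0, while still prompting the user with 1-based labels:

/* Sketch: loop 0..n-1 over rows and 0..n over the augmented columns. */
for(i = 0; i < n; i++)
{
    for(j = 0; j < n + 1; j++)
    {
        printf(" A[%d][%d]:", i + 1, j + 1);
        scanf("%f", &A[i][j]);
    }
}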
I was able to get pointers for 1D memoryviews using this StackOverflow question, but applying the same method to 2D memoryviews gives me a "Cannot assign type 'double *' to 'double **'" error.
cdef extern from "dgesvd.h" nogil:
    void dgesvd(double **A, int m, int n, double *S, double **U, double **VT)

cdef:
    double[:] S
    double[:,:] A, U, VT

A = np.ascontiguousarray(np.zeros((N,N)))
S = np.zeros(N)
U = np.zeros(N)
VT = np.zeros(N)

dgesvd(&A[0,0], N, N, &S[0], &U[0], &VT[0])
EDIT: I got it to compile successfully by doing:
cdef:
    double[:] S
    double[:,:] A, U, VT

U = np.zeros((N,N))
VT = np.zeros((N,N))
A = np.zeros((N,N))
S = np.zeros(N)

A_p = <double *> malloc(sizeof(double) * N)
U_p = <double *> malloc(sizeof(double) * N)
VT_p = <double *> malloc(sizeof(double) * N)

for i in range(N):
    A_p = &A[i, 0]
    U_p = &U[i, 0]
    VT_p = &VT[i, 0]

dgesvd(&A_p, N, N, &S[0], &U_p, &VT_p)

free(A_p)
free(U_p)
free(VT_p)
BUT I get a segfault when I try to run it, so I probably did this wrong.
Here are the contents of "dgesvd.h" (I did not write it, but I know it works):
/*
  This file has my implementation of the LAPACK routine dgesdd for
  C++. This program solves for the singular value decomposition of a
  rectangular matrix A. The function call is of the form

    void dgesdd(double **A, int m, int n, double *S, double *U, double *VT)

    A: the m by n matrix that we are decomposing
    m: the number of rows in A
    n: the number of columns in A (generally, n<m)
    S: a min(m,n) element array to hold the singular values of A
    U: a [m, min(m,n)] element rectangular array to hold the right
       singular vectors of A. These vectors will be the columns of U,
       so that U[i][j] is the ith element of vector j.
    VT: a [min(m,n), n] element rectangular array to hold the left
       singular vectors of A. These vectors will be the rows of VT
       (it is a transpose of the vector matrix), so that VT[i][j] is
       the jth element of vector i.

  Note that S, U, and VT must be initialized before calling this
  routine, or there will be an error. Here is a quick sample piece of
  code to perform this initialization; in many cases, it can be lifted
  right from here into your program.

    S = new double[minmn];
    U = new double*[m]; for (int i=0; i<m; i++) U[i] = new double[minmn];
    VT = new double*[minmn]; for (int i=0; i<minmn; i++) VT[i] = new double[n];

  Scot Shaw
  24 January 2000 */
void dgesvd(double **A, int m, int n, double *S, double **U, double **VT);
double *dgesvd_ctof(double **in, int rows, int cols);
void dgesvd_ftoc(double *in, double **out, int rows, int cols);

extern "C" void dgesvd_(char *jobu, char *jobvt, int *m, int *n,
                        double *a, int *lda, double *s, double *u,
                        int *ldu, double *vt, int *ldvt, double *work,
                        int *lwork, int *info);
void dgesvd(double **A, int m, int n, double *S, double **U, double **VT)
{
    char jobu, jobvt;
    int lda, ldu, ldvt, lwork, info;
    double *a, *u, *vt, *work;
    int minmn, maxmn;

    jobu = 'S'; /* Specifies options for computing U.
                   A: all M columns of U are returned in array U;
                   S: the first min(m,n) columns of U (the left
                      singular vectors) are returned in the array U;
                   O: the first min(m,n) columns of U (the left
                      singular vectors) are overwritten on the array A;
                   N: no columns of U (no left singular vectors) are
                      computed. */

    jobvt = 'S'; /* Specifies options for computing VT.
                    A: all N rows of V**T are returned in the array VT;
                    S: the first min(m,n) rows of V**T (the right
                       singular vectors) are returned in the array VT;
                    O: the first min(m,n) rows of V**T (the right
                       singular vectors) are overwritten on the array A;
                    N: no rows of V**T (no right singular vectors) are
                       computed. */

    lda = m; // The leading dimension of the matrix a.
    a = dgesvd_ctof(A, lda, n); /* Convert the matrix A from double pointer
                                   C form to single pointer Fortran form. */

    /* Since A is not a square matrix, we have to make some decisions
       based on which dimension is shorter. */
    if (m >= n) { minmn = n; maxmn = m; } else { minmn = m; maxmn = n; }

    ldu = m; // Left singular vector matrix
    u = new double[ldu*minmn];

    ldvt = minmn; // Right singular vector matrix
    vt = new double[ldvt*n];

    lwork = 5*maxmn; // Set up the work array, larger than needed.
    work = new double[lwork];

    dgesvd_(&jobu, &jobvt, &m, &n, a, &lda, S, u,
            &ldu, vt, &ldvt, work, &lwork, &info);

    dgesvd_ftoc(u, U, ldu, minmn);
    dgesvd_ftoc(vt, VT, ldvt, n);

    delete[] a;
    delete[] u;
    delete[] vt;
    delete[] work;
}
double* dgesvd_ctof(double **in, int rows, int cols)
{
    double *out;
    int i, j;

    out = new double[rows*cols];
    for (i=0; i<rows; i++)
        for (j=0; j<cols; j++)
            out[i + j*rows] = in[i][j];
    return out;
}

void dgesvd_ftoc(double *in, double **out, int rows, int cols)
{
    int i, j;
    for (i=0; i<rows; i++)
        for (j=0; j<cols; j++)
            out[i][j] = in[i + j*rows];
}
You don't want to be using the "pointer-to-pointer" form. All Cython/numpy arrays are stored as a single contiguous array together with a few length parameters that let Cython do 2D access. You're probably best off replicating the dgesvd wrapper in Cython (allocating the working arrays, but skipping the ftoc and ctof conversions entirely).
I've had a go, below, but it's untested so there may be bugs. It's more for the gist of what to do than to be copied outright.
import numpy as np

def dgesvd(double [:,:] A):
    """All sizes implicit in A; returns a tuple of U, S, V."""
    # start by ensuring we have Fortran-style ordering
    cdef double[::1, :] A_f = A.copy_fortran()

    # work out the sizes - it's possible I've got this the wrong way round!
    cdef int m = A.shape[0]
    cdef int n = A.shape[1]

    cdef char jobu = b'S'
    cdef char jobvt = b'S'

    cdef double[::1, :] U
    cdef double[::1, :] Vt
    cdef double[::1] S
    cdef double[::1] work

    cdef int minmn, maxmn
    cdef int info, lwork, lda, ldu, ldvt

    if m >= n:
        minmn = n
        maxmn = m
    else:
        minmn = m
        maxmn = n

    lda = m
    ldu = m
    U = np.zeros((ldu, minmn), order='F')
    ldvt = minmn
    Vt = np.zeros((ldvt, n), order='F')
    S = np.zeros(minmn)  # not absolutely sure - check this!

    lwork = 5*maxmn
    work = np.zeros(lwork)

    dgesvd_(&jobu, &jobvt, &m, &n, &A_f[0,0], &lda, &S[0], &U[0,0],
            &ldu, &Vt[0,0], &ldvt, &work[0], &lwork, &info)

    return U, S, Vt.T  # transpose Vt on the way out
The way you call dgesdd is not consistent with its prototype. Apart from that, this should work. See, for instance, this example, which performs a dgemm call from Cython in a similar way.
Also note that Scipy 0.16 will include a Cython API for BLAS/LAPACK, which will probably be the best approach in the future.
Due to a memory limit, I need to use gsl_matrix_float instead of gsl_matrix, which stores data of type double. However, I want to use gsl_linalg_LU_decomp and gsl_linalg_LU_invert, which only support gsl_matrix, and I did not find any other method in GSL that supports a float version of the decomposition and inversion.
Is there any way to resolve this dilemma, or can I only convert from float to double and then back? Thanks in advance!
The best you can probably do is, as you suggest, convert from float to double and back. Here is example code to perform the inversion (only the essential components are given; you have to fill in the blanks):
#include <gsl/gsl_blas.h>
#include <gsl/gsl_linalg.h>

void matrix_invert(gsl_matrix_float *, gsl_matrix_float *, int);

int main()
{
    gsl_matrix_float *X = gsl_matrix_float_alloc(N, N);    // N defined elsewhere
    gsl_matrix_float *invX = gsl_matrix_float_alloc(N, N);

    matrix_invert(X, invX, N); // invX = inv(X)

    return 0;
}

void matrix_invert(gsl_matrix_float *matrix, gsl_matrix_float *inverse, int N)
{
    int i = 0, j = 0, signum = 0;

    /* Promote the float matrix to a double-precision working copy. */
    gsl_matrix *DM = gsl_matrix_alloc(N, N);
    gsl_matrix *DM_I = gsl_matrix_alloc(N, N);

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            gsl_matrix_set(DM, i, j, gsl_matrix_float_get(matrix, i, j));

    /* LU-decompose and invert in double precision. */
    gsl_permutation *p = gsl_permutation_alloc(N);
    gsl_linalg_LU_decomp(DM, p, &signum);
    gsl_linalg_LU_invert(DM, p, DM_I);
    gsl_permutation_free(p);
    gsl_matrix_free(DM);

    /* Demote the result back to single precision and clean up. */
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            gsl_matrix_float_set(inverse, i, j, gsl_matrix_get(DM_I, i, j));

    gsl_matrix_free(DM_I);
}
This is a difficult issue, but I'm not sure where to turn. To sum it up from the start: I'm having trouble with arrays in C++. To head off your inevitable response, I have to use arrays; I can't use vectors. The reason is that I will eventually be interfacing this with CUDA, which can't accept vectors.
Anyway, I have written my own class to handle 2D arrays and all the stuff that goes on behind the scenes. The header file is reproduced at the bottom of this question. I use this class to define a 6 x 10 array. I then loop over many items, primarily using the add method of my class. At some point in this complicated loop, the size of my array switches from 6 x 10 to 0 x 1074266112. I have tried to debug my code and figure out what the issue is, but for the life of me I cannot find it.
I've printed out all the values at the moment the array size changes, and none of them are out of the norm, and I'm never indexing the array outside of the 6 x 10 size. In fact, it never even happens at the same point of the loop; it just seems to happen randomly on each run. The only constant I can find between occurrences is that the new array size is always exactly 0 x 1074266112.
Sorry I can't provide a minimal working example, but this issue only crops up in my large program and I can't reproduce it in a smaller one. I was at least hoping someone could see whether I'm doing anything wrong in my Matrix class below and possibly suggest a method of debugging.
EDIT: If I change this to use a vector rather than an array, the issue goes away. I.e., if I change the relevant parts to vector<double> data and, upon instantiating, data = *(vector<double>(x * y)), the issue mentioned above is no longer a problem. However, I have no idea what the problem with making this an array could be.
#ifndef MATRIX_H
#define MATRIX_H

#include <vector>
#include <iostream>

using std::vector; using std::cout; using std::endl;

class Matrix {
    //Define the private variables associated with any instance of this class.
    double * data; //The 1D pointer which points to the array
    int w, h;      //The width and height of the 2D array that the 1D data array represents

public:
    Matrix(){}
    Matrix(int x, int y){ setSize(x,y); }

    void setSize(int x, int y){ w = x; h = y; data = new double[x * y]; setAll(0); }

    //Two methods to get the height and width of the effective 2D array
    int getWidth(){ return w; }
    int getHeight(){ return h; }

    //Several methods used to set and get the values of elements within the array
    //as well as extracting rows and columns as vectors.
    void set(int x, int y, double value){ data[y*w + x] = value; }
    void setAll(double value);
    double get(int x, int y){ return data[y*w + x]; }
    vector<double> getCol(int x);
    vector<double> getRow(int y);

    //Several methods to adjust the current value by the input
    void increment(int x, int y){ data[y*w + w] += 1; }
    void add(int x, int y, double value){ data[y*w + x] += value; }
    void subtract(int x, int y, double value){ data[y*w + x] -= value; }
    void multiply(int x, int y, double value){ data[y*w + x] *= value; }
    void divide(int x, int y, double value){ data[y*w + x] /= value; }
};

void Matrix::setAll(double value){
    for (int i = 0; i < w*h; i++){
        data[i] = value;
    }
}

vector<double> Matrix::getCol(int x){
    vector<double> column(h);
    for (int i = 0; i < h; i++){ column[i] = data[i*w + x]; }
    return column;
}

vector<double> Matrix::getRow(int y){
    vector<double> row(w);
    for (int i = 0; i < w; i++){ row[i] = data[y*w + i]; }
    return row;
}

#endif /* MATRIX_H */
Your increment method is wrong: it depends only on y, when from context it looks like you intended to also use x in the array-index calculation. If you're calling increment anywhere in your program, you're probably writing to memory off in la-la land somewhere. I don't know whether that's the only cause of your issue, but either way it will corrupt your heap in probably unhelpful ways.
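Concretely, the one-character fix looks like this (matching the indexing used by the other element-wise methods in the class):

// Original: data[y*w + w] += 1;  -- indexes column w (one past the end of the row)
// for every x, so each call writes into the next row or off the end of the array.
void increment(int x, int y){ data[y*w + x] += 1; }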