this is my third post and attempt to solve this problem, which first
showed up using numpy.dot(A, A.T) where A is large, 150,000 x 265 elements.
With numpy, I got back an array with many missing values, that were just zeros.
I've tried to call BLAS thru CBLAS. I'm getting a segmentation fault error
with large arrays.
I'm running this on a machine with about 250 GB free memory.
Thanks for reading...
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
#include <cblas.h> /* C BLAS BLAS */
#include "blaio.h"
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
// row_order transform transform rowsA colsB K alpha a lda b ldb beta c ldc
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, row, row, col, 1.0f, a, col, b, row, 0.0f, c, row);
int num_bad = 0;
for(i=0; i<(row*row); i++)
{
if (c[i] != col)
{
printf("Bad value found: %f, at index: %i\n", c[i], i );
num_bad += 1;
}
}
printf("Number of bad values found: %i \n\n", num_bad);
//printMatrix(CblasRowMajor, row, row, c, 8, 3, NULL, NULL, NULL, NULL, NULL, "c = ");
return 0;
} /* end func main */
UPDATE:
Ray has expertly noticed that the blas I'm using via cblas, must be 32 bit and not able to access the array indices. Therefore, I've installed blas64.x86_64 and blas64-devel.x86_64.
Then, rewrote a few lines of the code above to use the direct call to sgemm without cblas.
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
float alpha = 1.0, beta = 1.0;
sgemm_('N','N', &row, &row, &col, &alpha, &a[0], &col, &b[0], &row, &beta, &c[0], &row);
I compiled with:
gcc sgemm_test_fortran.c -o test -L /usr/lib64 -lblas64
The code compiled and I think it might run.. :)
The problem is that the size of your output matrix (100,000x100,000 = 1e10 elements) can't be stored in an int (2.14e9). You can fix this in your C++ code by switching the types to size_t, but you're going to run into the same problem inside the BLAS library.
What you need to to do is use a BLAS library that is compiled to use 8-byte integers; most BLAS libraries are compiled with 4-byte integers. You don't mention what BLAS library you're linking to, so it's hard to guess what the correct library name is (if it even exists) on your system.
Related
I am following the example of eigen decomposition from here,
https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/syevd/cusolver_syevd_example.cu
I need to do it for Hermatian complex matrix. The problem is the eigen vector is not matching at all with the result with Matlab result.
Does anyone have any idea about it why this mismatch is happening?
I have also tried cusolverdn svd method to get eigen values and vector that is giving another result.
My code is here for convenience,
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "cusolver_utils.h"
int N = 16;
void BuildMatrix(cuComplex* input);
void main()
{
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
printf("*******************\n");
cuComplex* h_idata = (cuComplex*)malloc(sizeof(cuComplex) * N);
cuComplex* h_eigenVector = (cuComplex*)malloc(sizeof(cuComplex) * N); // eigen vector
float* h_eigenValue = (float*)malloc(sizeof(float) * 4); // eigen Value
BuildMatrix(h_idata);
int count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_idata[count].x, h_idata[count].y);
count++;
}
printf("\n");
}
printf("\n*****************\n");
/* step 1: create cusolver handle, bind a stream */
CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
// step 2: reserve memory in cuda and copy input data from host to device
cuComplex* d_idata;
float* d_eigenValue = nullptr;
int* d_info = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_idata, N * sizeof(cuComplex)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_eigenValue), N * sizeof(float)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
CUDA_CHECK(cudaMemcpyAsync(d_idata, h_idata, N * sizeof(cuComplex), cudaMemcpyHostToDevice, stream));
// step 3: query working space of syevd
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
int lwork = 0; /* size of workspace */
cuComplex* d_work = nullptr; /* device workspace*/
const int m = 4;
const int lda = m;
cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, &lwork);
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
// step 4: compute spectrum
cusolverDnCheevd(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, d_work, lwork, d_info);
CUDA_CHECK(
cudaMemcpyAsync(h_eigenVector, d_idata, N * sizeof(cuComplex), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(
cudaMemcpyAsync(h_eigenValue, d_eigenValue, 4 * sizeof(double), cudaMemcpyDeviceToHost, stream));
int info = 0;
CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
std::printf("after syevd: info = %d\n", info);
if (0 > info)
{
std::printf("%d-th parameter is wrong \n", -info);
exit(1);
}
count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_eigenVector[count].x, h_eigenVector[count].y);
count++;
}
printf("\n");
}
printf("\n");
for (int i = 0; i < N / 4; i++)
{
std::cout << h_eigenValue[i] << std::endl;
}
printf("\n*****************\n");
/* free resources */
CUDA_CHECK(cudaFree(d_idata));
CUDA_CHECK(cudaFree(d_eigenValue));
CUDA_CHECK(cudaFree(d_info));
CUDA_CHECK(cudaFree(d_work));
CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaDeviceReset());
}
//0.5560 + 0.0000i - 0.4864 + 0.0548i 0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i 0.4317 + 0.0000i - 0.7318 - 0.2698i 1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i 1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i 1.3255 - 0.3344i - 2.4578 - 0.2609i 4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };
for (int i = 0; i < N; i++)
{
input[i].x = realVector.at(i) * std::pow(10, 11);
input[i].y = imagVector.at(i) * std::pow(10, 11);
}
}
I raised this issue in their git ( https://github.com/NVIDIA/CUDALibrarySamples/issues/58), but unfortunately no one is answering.
If anyone can help me to solve this that will be very helpful.
Please follow the post for the clear answer,
https://forums.developer.nvidia.com/t/eigen-decomposition-of-hermitian-matrix-using-cusolver-does-not-match-the-result-with-matlab/204157
The theory tells, A*V-lamda*V=0 should satisfy, however it might not be perfect zero. My thinking was it will very very close to zero or e-14 somethng like this. If the equation gives a value close to zero then it is acceptable.
There are different algorithms for solving eigen decomposition, like Jacobi algorithm, Cholesky factorization... The program I provided in my post uses the function cusolverDnCheevd which is based on LAPACK. LAPACK doc tells that it uses divide and conquer algorithm to solve Hermitian matrix. Here is the link, http://www.netlib.org/lapack/explore-html/d9/de3/group__complex_h_eeigen_ga6084b0819f9642f0db26257e8a3ebd42.html#ga6084b0819f9642f0db26257e8a3ebd42
I have like 3 different size of matrices and want to transpose them parallel.
Firstly I put these in a 2D array using malloc and then use cudaMalloc to transfer array from host(h_B) to device (d_B).
Using threadIdx to find each address of matrix in the array. The cublas function is used.
Here are my code.
The code can be compiled but I cannot get result. It seems that in global function float *A = new float[m*n] is not a good way.
Dose anyone have ideas of this?
Thanks so much!
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include<iostream>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
{
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS)
{
return;
}
const float d_alpha = 1.0f;
const float d_beta = 0.0f;
int idx = threadIdx.x;
if(idx<N){
int m = M_A[idx]; //A_row
int n = N_A[idx]; //A_col
float *A = new float[m*n];
float *C = new float[m*n];
A = ptrA+address[idx];
C = ptrC+address[idx];
cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &d_alpha, (const float*)A, n, &d_beta, (const float *)A, n, C, m);
delete[] A;
delete[] C;
}
cublasDestroy(cnpHandle);
}
int main()
{
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
float b[6] = { 1,2,
3,4,
5,6};
float c[8] = { 1,2,3,1,
2,3,4,5 };
float **h_B = (float**)malloc(N * sizeof(float*));
float **h_BT = (float**)malloc(N * sizeof(float*));
h_B[0] = a, h_BT[0] = a;
h_B[1] = b, h_BT[1] = b;
h_B[2] = c, h_BT[2] = c;
int NUM_B = 20; // total number of elements
int address[] = {0,6,12};
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_B, h_B, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_BT, h_BT, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
transposeCublasSgeam<<<1,N>>>(M_B, N_B, d_B,d_BT, N,address);
checkCudaErrors(cudaMemcpy(h_BT, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(d_B);
cudaFree(d_BT);
delete[] h_B;
delete[] h_BT;
return 0;
}
There were a number of errors in your code. I will probably miss some in my description.
Note that this cublas-in-device-code functionality is no longer available in newer CUDA versions.
Every pointer that is passed to device code needs an allocation with cudaMalloc. You had done cudaMalloc for a few pointers, but not all of them.
You're confused about pointers and arrays of pointers. I won't be able to sort all of that out for you. Your kernel design really doesn't need the complexity of using arrays of pointers. So I've removed all that.
In CUDA dynamic parallelism (CDP), pointers to the local address space cannot be passed to child kernels. You can't use alpha and beta in the local address space, and pass pointers to those to CUBLAS in CDP.
To do a pure transpose, study the CUBLAS Sgeam documentation for the recommended parameters to use.
I believe there were other things I fixed. Please study this example:
$ cat t1433.cu
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include<iostream>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
{
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("thread: %d, error1: %d\n", threadIdx.x, (int)status);
return;
}
float *d_alpha = new float; // a pointer to device-heap, not local memory
*d_alpha = 1.0f;
float *d_beta = new float;
*d_beta = 0.0f;
int idx = threadIdx.x;
if(idx<N){
int m = M_A[idx]; //A_row
int n = N_A[idx]; //A_col
status = cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, d_alpha, ptrA+address[idx], n, d_beta, ptrC+address[idx], m, ptrC+address[idx], m);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("thread: %d, error2: %d\n", threadIdx.x, (int)status);
return;
}
}
cublasDestroy(cnpHandle);
}
int main()
{
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
float b[6] = { 1,2,
3,4,
5,6};
float c[8] = { 1,2,3,1,
2,3,4,5 };
float *h_Bdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
float *h_BTdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
memcpy(h_Bdata, a, sizeof(a));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0])), b, sizeof(b));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0]))+(sizeof(b)/sizeof(b[0])), c, sizeof(c));
int NUM_B = 20; // total number of elements
int address[] = {0,6,12};
int *d_address;
cudaMalloc(&d_address, sizeof(address));
cudaMemcpy(d_address, address, sizeof(address), cudaMemcpyHostToDevice);
int *d_M_B, *d_N_B;
cudaMalloc(&d_M_B, sizeof(M_B));
cudaMalloc(&d_N_B, sizeof(N_B));
cudaMemcpy(d_M_B, M_B, sizeof(M_B), cudaMemcpyHostToDevice);
cudaMemcpy(d_N_B, N_B, sizeof(N_B), cudaMemcpyHostToDevice);
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_B, h_Bdata, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
transposeCublasSgeam<<<1,N>>>(d_M_B, d_N_B, d_B,d_BT, N,d_address);
checkCudaErrors(cudaMemcpy(h_BTdata, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "B , BT" << std::endl;
for (int i = 0; i < NUM_B; i++){
std::cout << h_Bdata[i] << " , " << h_BTdata[i] << std::endl;}
cudaFree(d_B);
cudaFree(d_BT);
return 0;
}
$ /usr/local/cuda-8.0/bin/nvcc -I/usr/local/cuda-8.0/samples/common/inc t1433.cu -rdc=true -lcublas_device -lcudadevrt -arch=sm_35 -o t1433
$ LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64 CUDA_VISIBLE_DEVICES="3" cuda-memcheck ./t1433
========= CUDA-MEMCHECK
B , BT
1 , 1
2 , 4
3 , 2
4 , 5
5 , 3
6 , 6
1 , 1
2 , 3
3 , 5
4 , 2
5 , 4
6 , 6
1 , 1
2 , 2
3 , 2
1 , 3
2 , 3
3 , 4
4 , 1
5 , 5
========= ERROR SUMMARY: 0 errors
$
I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing few problems with some particular matrix, for which both methods do no work. I isolated this big matrix and build a MCVE available here (you can compile it with nvcc mcve.cu -lcublas. As you can see the results in both cases are totally wrong: host result is totally different, I am trying to figure out what's going on. I do not see any error in code but maybe i should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}
I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.
struct atom
{
double x,y,z;
}
int main()
{
//N number of atoms in a molecule
double DistanceMatrix[N][N];
double d;
atom Atoms[N];
for (int i = 0; i < N; i ++)
for (int j = 0; j < N; j++)
DistanceMatrix[i][j] = (atoms[i].x -atoms[j].x)*(atoms[i].x -atoms[j].x) +
(atoms[i].y -atoms[j].y)* (atoms[i].y -atoms[j].y) + (atoms[i].z -atoms[j].z)* (atoms[i].z -atoms[j].z;
}
Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.
If you meant to calculate the Euclidean distance, your calculation is not correct. You need the 3D version of the Pythagorean theorem.
I would use a SoA for storing the coordinates.
You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies within the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if x is divisible by 32, only threads sharing the same x / 32 value are within the same warp.
I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.
Addresses generated from the j index are less important as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if threadIdx.x is divisible by 32).
#include "cuda_runtime.h"
#include <iostream>
using namespace std;
const int N(20);
#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, char *file, int line)
{
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
int div_up(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z);
int main(int argc, char **argv)
{
double* atoms_x_h;
check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));
double* atoms_y_h;
check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));
double* atoms_z_h;
check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));
for (int i(0); i < N; ++i) {
atoms_x_h[i] = i;
atoms_y_h[i] = i;
atoms_z_h[i] = i;
}
double* atoms_x_d;
check(cudaMalloc(&atoms_x_d, N * sizeof(double)));
double* atoms_y_d;
check(cudaMalloc(&atoms_y_d, N * sizeof(double)));
double* atoms_z_d;
check(cudaMalloc(&atoms_z_d, N * sizeof(double)));
check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));
double* distances_d;
check(cudaMalloc(&distances_d, N * N * sizeof(double)));
const int threads_per_block(256);
dim3 n_blocks(div_up(N, threads_per_block));
calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);
check(cudaPeekAtLastError());
check(cudaDeviceSynchronize());
double* distances_h;
check(cudaMallocHost(&distances_h, N * N * sizeof(double)));
check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int i(0); i < N; ++i) {
for (int j(0); j < N; ++j) {
cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
}
}
check(cudaFree(distances_d));
check(cudaFreeHost(distances_h));
check(cudaFree(atoms_x_d));
check(cudaFreeHost(atoms_x_h));
check(cudaFree(atoms_y_d));
check(cudaFreeHost(atoms_y_h));
check(cudaFree(atoms_z_d));
check(cudaFreeHost(atoms_z_h));
return 0;
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z)
{
int i(threadIdx.x + blockIdx.x * blockDim.x);
int j(threadIdx.y + blockIdx.y * blockDim.y);
if (i >= N || j >= N) {
return;
}
distances[i + N * j] =
(atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
(atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
(atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
}
I just started learning CUDA and I have been looking at examples on NVIDIA's website. Specifically, I have implemented the non-shared version of the matrix multiply (the first sample is the non-shared version even though it is in the shared memory section):
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory
I am having a problem with the output when I change the block sizes. NVIDIA's code has a default block size of 16 and this gives me the correct output when I multiply two matrices. However, if I change the block size to anything above 16 (while still being a multiple of 16), I get an output of zero for all elements in the matrix. I tested this on my laptop too and noticed the same results for anything over 32 rather than 16. Could someone explain what is happening? I have two 9800GTX+ video cards in SLI and so I should have a maximum block size of (512,512,1). Why can I only do 16?
Also, I am noticing the same behavior in the shared version of the matrix multiplication (also on the NVIDIA page).
I didn't post my code because I get the same problem if I directly copy the code from the NVIDIA site.
I would really appreciate any help with this or with resources to learn more about these kinds of CUDA details.
Thank you!
I have attached the code as requested:
#include "stdio.h"
#include <cuda.h>
#include <assert.h>
#include <time.h>
#include <math.h>
// This is an example CUDA program that compares the timings of a matrix multiplication.
// The comparisons are between the CPU, GPU, and the GPU with shared memory.
#define BLOCK_SIZE 32
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
typedef void (*FuncPtr)(Matrix& A, Matrix& B, Matrix& C);
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C);
// Helper declarations
void initializeMatrix(Matrix& A, int rows, int cols, float val);
void copyMatrix(Matrix& dest, Matrix& src);
void freeMatrix(Matrix& A);
void printError(cudaError_t err);
void printMat(Matrix& A);
void setVal(Matrix& A, float val);
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters);
// CUDA declarations
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C);
int main() {
printf("Beginning Matrix Multiplication Comparison\n");
// Initialize matrix
Matrix A, B, C;
int rowsA = 32;
int colsA = 32;
int colsB = 32;
initializeMatrix(A, rowsA, colsA, 5.0f);
initializeMatrix(B, colsA, colsB, 2.0f);
initializeMatrix(C, rowsA, colsB, 0.0f);
// C = A * B using CPU, GPU, and GPU with shared memory
FuncPtr gpuMatMult = &multiplyMatrix;
int numOfIterations = 100;
double multTime = applyMultFunc(gpuMatMult, A, B, C, numOfIterations);
printMat(C);
// Update user
printf("Normal Mat Mult Time: %f\n", multTime);
// Cleanup
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
printf("\nPress Enter to continue...\n");
getchar();
return 0;
}
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C) {
// Initialize device matrices
Matrix deviceA, deviceB, deviceC;
copyMatrix(deviceA, A);
copyMatrix(deviceB, B);
copyMatrix(deviceC, C);
// Initialize number of blocks and threads
dim3 numOfThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
int xSize = (C.width + numOfThreadsPerBlock.x - 1) / numOfThreadsPerBlock.x;
int ySize = (C.height + numOfThreadsPerBlock.y - 1) / numOfThreadsPerBlock.y;
dim3 numOfBlocks(xSize, ySize);
// Call CUDA kernel
cudaMultMat<<<numOfBlocks, numOfThreadsPerBlock>>>(deviceA, deviceB, deviceC);
printError(cudaThreadSynchronize());
printError(cudaMemcpy(C.elements, deviceC.elements, C.height * C.width * sizeof(float), cudaMemcpyDeviceToHost));
// Free cuda memory
printError(cudaFree(deviceA.elements));
printError(cudaFree(deviceB.elements));
printError(cudaFree(deviceC.elements));
}
// CUDA definitions
// GPU matrix multiplication (non-shared memory)
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C) {
// If the matrices are of the wrong size then return
if(A.width != B.height) {
return;
}
// Initialize the indexes into the grid
int col = (blockDim.x * blockIdx.x) + threadIdx.x;
int row = (blockDim.y * blockIdx.y) + threadIdx.y;
// Initialize the result
float cVal = 0.0f;
// Find the result for the dot product of a row of A and a column of B
for(int i = 0; i < A.width; i++) {
cVal += A.elements[row * A.width + i] * B.elements[i * B.width + col];
}
// If we are in bounds then save the result
if(row < C.height && col < C.width) {
C.elements[row * C.width + col] = cVal;
}
}
// Helper functions
void initializeMatrix(Matrix& A, int rows, int cols, float val) {
A.width = cols;
A.height = rows;
A.stride = A.width;
int numOfElements = A.width * A.height;
A.elements = (float*) malloc(numOfElements * sizeof(float));
for(int i = 0; i < numOfElements; i++) {
A.elements[i] = val;
}
}
void copyMatrix(Matrix& dest, Matrix& src) {
dest.width = src.width;
dest.height = src.height;
dest.stride = src.stride;
int size = src.width * src.height * sizeof(float);
printError(cudaMalloc(&dest.elements, size));
printError(cudaMemcpy(dest.elements, src.elements, size, cudaMemcpyHostToDevice));
}
void freeMatrix(Matrix& A) {
free(A.elements);
}
void printError(cudaError_t err) {
if(err != 0) {
printf("CUDA ERROR: %s\n", cudaGetErrorString(err));
getchar();
}
}
void printMat(Matrix& A) {
printf("*********************************\n");
for(int i = 0; i < A.height; i++) {
for(int j = 0; j < A.width; j++) {
int index = i * A.width + j;
printf("%2.1f, ", A.elements[index]);
}
printf("\n");
}
}
void setVal(Matrix& A, float val) {
for(int i = 0; i < A.width * A.height; i++) {
A.elements[i] = val;
}
}
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters) {
clock_t startTime = clock();
for(int i = 0; i < numOfIters; i++) {
func(A, B, C);
}
clock_t endTime = clock();
return (double) (endTime - startTime) / CLOCKS_PER_SEC;
}
You're exceeding the threads per block specification of your GPU when you increase the block sizes.
The 9800GTX has a limit of 512 threads per block, regardless of how you create the block. 16*16 = 256 which is OK. 32 x 32 = 1024 which is not OK. In this case the kernel fails to run and so the output is not correct.
Your laptop probably has a newer GPU which supports 1024 threads per block, so 32 x 32 is OK but anything larger is not.
If you add proper cuda error checking to the code you can confirm this. Note that this code appears to have cuda error checking, but the checking implemented on the kernel call is incoomplete. Study the link I gave and you will see the difference. If you modify the code with complete error checking, you will see the error.
if your GPU's compute capability is 1.0/1.1, you can have at most 512 threads per block. But in new GPU device, every block can have at most 1024 threads.