I previously posted a question regarding matrix-vector multiplication in CUDA and about writing my own kernel. After doing this, I decided to implement my problem using CUBLAS, as suggested by some users on SO (thanks @Robert Crovella), in the hope of achieving higher performance (my project is performance-driven).
Just to clarify: I want to multiply an NxN matrix with a 1xN vector.
I've been looking at the code pasted below for a couple of days now and I can't figure out why the multiplication is giving me an incorrect result. I fear that I am causing problems by using <vector> arrays (this is part of a much larger system that uses these data types). I don't mean to use this thread as a debugging tool, but I think this will also be helpful to other users trying to achieve this, as I have not come across a particularly comprehensive source on the internet for my particular problem (and for the cublas v2 API). Thanks in advance!
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>
//#include "timenow.cu"
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// random data filler
void fillvector(float *data, int N){
for(int i=0; i<N; i++){
data[i] = float(rand() % 10);
}
}
//printer
void printer(bool printOut, float *data, int N){
if(printOut == true){
for(int i=0; i<N; i++){
printf("%2.1f ", data[i]);
}
printf("\n");
}
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
int main(){
bool printOut = true;
int N;
std::cout << "Enter N: " ;
std::cin >> N;
std::vector<float> x0;
x0.resize(N);
std::vector<float> p;
p.resize(N*N);
// matrix A
std::vector<float> A[N];
for(int i=0;i<N;i++){
A[i].resize(N);
fillvector(A[i].data(), N);
printer(printOut, A[i].data(), N);
}
printf("\n");
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);
printf("\nStarting CUDA computation...");
///double startTime = timenow();
// device pointers
float *d_A, *d_p, *d_b, *d_x0, *d_v, *d_temp;
cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");
// might need to flatten A...
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
//cudaMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasSetMatrix(N, N, sizeof(float), &A, N, d_A, N);
cudaCheckErrors("cuda memcpy of A or x0 fail");
float *temp;
temp = (float *)malloc(N*sizeof(temp));
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_N, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("returning to host failed");
printf("\n");
printer(printOut, temp, N);
/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
printf("%2.1f ",v[i]);
}*/
printf("\nFinished CUDA computations...");
//double endTime = timenow();
//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);
cudaFree(d_temp);
cudaFree(d_A);
cudaFree(d_p);
cudaFree(d_x0);
return 0;
}
We don't reference the first element of a vector this way:
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
Instead you should do this:
cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1);
And likewise for your SetMatrix call referencing A:
cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N);
Your GetVector call has 2 errors:
cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
You have your temp and d_temp parameters reversed (you are copying from device to host) and you should not take the address of temp: it is already a pointer. So do this:
cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1);
You're not doing proper error checking on all cublas calls, such as your get/set matrix/vector calls. Use the same method you are using on other cublas calls for these also.
You are creating A as an array of vectors. This won't work with cublasSetMatrix. Instead we need to create A as a flat vector, of sufficient size (N*N) to store the entire matrix.
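For example, a minimal sketch (the fixed code further below does exactly this):
std::vector<float> A(N*N); // flat storage: matrix element (i,j) lives at A[i*N+j]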
Finally, cublas expects the matrices it uses to be stored in column-major order. If you pass C-style arrays in row-major order, you should use the transpose for that matrix in cublasSgemv:
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
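To see why the transpose is the right choice, consider how the two conventions index the same buffer (a short explanatory sketch, not part of the original code):
// row-major (C-style): element (i,j) sits at A[i*N + j]
// column-major (cublas): the value at A[i*N + j] is interpreted as element (j,i)
// so cublas effectively sees A^T, and CUBLAS_OP_T undoes that:
// y = alpha * (A^T)^T * x = alpha * A * x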
The following code has these various problems fixed:
$ cat t235.cu
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>
//#include "timenow.cu"
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// random data filler
void fillvector(float *data, int N){
for(int i=0; i<N; i++){
data[i] = float(rand() % 10);
}
}
//printer
void printer(bool printOut, float *data, int N){
if(printOut == true){
for(int i=0; i<N; i++){
printf("%2.1f ", data[i]);
}
printf("\n");
}
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
int main(){
bool printOut = true;
int N;
std::cout << "Enter N: " ;
std::cin >> N;
std::vector<float> x0;
x0.resize(N);
std::vector<float> p;
p.resize(N*N);
// matrix A
std::vector<float> A(N*N);
fillvector(A.data(), N*N);
for (int i=0; i< N; i++){
printer(printOut, &(A[(i*N)]), N);
printf("\n");}
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);
printf("\nStarting CUDA computation...");
///double startTime = timenow();
// device pointers
float *d_A, *d_x0, *d_temp;
cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");
// might need to flatten A...
cublasCheckErrors(cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1));
//cudaMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasCheckErrors(cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N));
//cudaCheckErrors("cuda memcpy of A or x0 fail");
float *temp;
temp = (float *)malloc(N*sizeof(float));
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
cublasCheckErrors(cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1));
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
//cudaCheckErrors("returning to host failed");
printf("\n");
printer(printOut, temp, N);
/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
printf("%2.1f ",v[i]);
}*/
printf("\nFinished CUDA computations...\n");
//double endTime = timenow();
//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);
cudaFree(d_temp);
cudaFree(d_A);
//cudaFree(d_p);
cudaFree(d_x0);
return 0;
}
$ nvcc -arch=sm_20 -O3 -o t235 t235.cu -lcublas
$ ./t235
Enter N: 5
3.0 6.0 7.0 5.0 3.0
5.0 6.0 2.0 9.0 1.0
2.0 7.0 0.0 9.0 3.0
6.0 0.0 6.0 2.0 6.0
1.0 8.0 7.0 9.0 2.0
0.0 2.0 3.0 7.0 5.0
Starting CUDA computation...
83.0 86.0 92.0 62.0 110.0
Finished CUDA computations...
$
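As a quick sanity check of the first output element (my arithmetic, using the data printed above): the first row of A dotted with x0 is 3*0 + 6*2 + 7*3 + 5*7 + 3*5 = 0 + 12 + 21 + 35 + 15 = 83, which matches the first value of the result vector.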
Related
This is my first time working with CUDA. I am running some calculations involving cufft and two simple kernels on an NxNxN mesh (N=128). It seems to work fine until some time between 4040 and 4050 loops, when the values of my mesh points become nan. On a smaller mesh, it can complete more loops before failing. This makes me think there is a memory leak somewhere. I tried running cuda-memcheck but it returned no errors. Can you spot any problems that could be causing this? I have reduced the code to a minimum but it is still long, my apologies. Thank you for your help.
#define _USE_MATH_DEFINES
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data3[i].x = pow(data[i].x, 3);
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
data[i].y = 0;
}
__syncthreads();
}
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
for (float t = 0; t < tmax; t += dt) {
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
Here is my instrumentation of your code so far. When I enabled the device assert in my_assert, it indicated that the d_data3 input at the nan5 point was failing (i.e. it was nan). That indicated that the cufftExecC2C call on d_data3 immediately prior was producing nan data. If you have invalid inputs, I believe an FFT can produce out-of-range results.
The code is instrumented to allow you to dump the data and look at it. You will have to modify dump_data to display whatever it is you wish to see.
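For instance, a minimal file-writing variant of the loop inside dump_data might look like this (my sketch; it additionally requires <fstream>, and the file name is arbitrary):
std::ofstream f("dump.txt");
for (int i = 0; i < len; i++)
    f << d1[i].x << "," << d1[i].y << "   " << d2[i].x << "," << d2[i].y << "\n";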
When I run the code below, it eventually prints out:
4850.14
4851.14
4852.14
4853.14
4854.14
4855.14
4856.14
4857.14
4858.14
4859.14
4860.14
d_data3 output nan check failed
$
So the nan first occurs on iteration 4860, and the d_data3 input check did not fail, so the nan occurs in d_data3 as a result of the FFT operation in loop iteration 4860. You'll need to study the input and output data to see if you can determine why. There may be some modification to the d_data3 data in the Cube kernel that is causing this. For example, since you are repetitively cubing the data, doesn't it seem reasonable at some point that it would exceed float range?
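As a rough illustration of that last point (my numbers, not taken from your data): repeatedly applying x -> x^3 starting from 2 gives 2 -> 8 -> 512 -> 1.3e8 -> 2.4e24 -> 1.4e73, which blows past the float maximum of roughly 3.4e38 after only five cubings. Your loop interleaves FFTs and scaling, so the growth is slower, but the data dump further below already shows magnitudes around 1e19 at the point of failure.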
Here's my instrumented code:
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <assert.h>
#include <stdio.h>
using namespace std;
__host__ __device__ void my_assert(bool cond){
//assert(cond);
}
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp = data[i].x;
if (isnan(temp)) {printf("nan1: %d\n", i); my_assert(0);}
data3[i].x = pow(data[i].x, 3);
if (isnan(data3[i].x)) {printf("nan2: %d %f\n", i, data[i].x); my_assert(0);}
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp1 = w[i];
if (isnan(temp1)) {printf("nan3: %d\n", i); my_assert(0);}
float temp2 = data[i].x;
if (isnan(temp2)) {printf("nan4: %d\n", i); my_assert(0);}
float temp3 = data3[i].x;
if (isnan(temp3)) {printf("nan5: %d\n", i); my_assert(0);}
float temp4 = v[i];
if (isnan(temp4)) {printf("nan6: %d\n", i); my_assert(0);}
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
if (isnan(data[i].x)) {printf("nan7: %d, %f, %f, %f, %f, %d\n",i, temp1, temp2, temp3, temp4, n); my_assert(0);}
data[i].y = 0;
}
__syncthreads();
}
__global__ void nan_kernel(cufftComplex *d, int len, bool *res){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len)
if (isnan(d[idx].x) || isnan(d[idx].y)) *res = true;
}
bool *d_nan;
bool checknan(cufftComplex *d, int len){
bool h_nan = false;
cudaMemcpy(d_nan, &h_nan, sizeof(bool), cudaMemcpyHostToDevice);
nan_kernel<<<(len/1024)+1, 1024>>>(d, len, d_nan);
cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost);
return h_nan;
}
void dump_data(cufftComplex *d1, cufftComplex *d2, int len)
{
// add code here to spit out the data however you would like to see it
// perhaps to a file
std::cout << "input: output: " << std::endl;
for (int i = 0; i < len; i++)
std::cout << d1[i].x << "," << d1[i].y << " " << d2[i].x << "," << d2[i].y << std::endl;
};
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
cufftResult res;
cudaMalloc(&d_nan, sizeof(bool));
cufftComplex *i3, *o3;
i3 = (cufftComplex *)malloc(mem_size);
o3 = (cufftComplex *)malloc(mem_size);
std::cout << "start loop" << std::endl;
for (float t = 0; t < tmax; t += dt) {
std::cout << t/dt << std::endl;
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cudaMemcpy(i3, d_data3, mem_size, cudaMemcpyDeviceToHost);
if (checknan(d_data3, n)) {std::cout << "d_data3 input nan check failed" << std::endl; return -1;}
res = cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft1 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
if (checknan(d_data3, n)) {std::cout << "d_data3 output nan check failed" << std::endl; cudaMemcpy(o3, d_data3, mem_size, cudaMemcpyDeviceToHost); dump_data(i3, o3, n); return -1;}
res = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft2 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
res = cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
if (res != CUFFT_SUCCESS) {std::cout << "cufft3 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
cudaError_t cres = cudaGetLastError();
if (cres != cudaSuccess) std::cout << "cuda error: " << cudaGetErrorString(cres) << std::endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
EDIT:
After some addition of printout code to dump_data (see modification above) I see this:
...
4859.14
4860.14
d_data3 output nan check failed
input: output:
3.37127e+19,0 nan,nan
3.21072e+19,0 nan,nan
2.76453e+19,0 nan,nan
2.13248e+19,0 nan,nan
1.44669e+19,0 nan,nan
8.37214e+18,0 nan,nan
3.93645e+18,0 nan,nan
1.35501e+18,0 nan,nan
2.55741e+17,0 nan,nan
5.96468e+15,0 nan,nan
-1.36656e+16,0 nan,nan
-2.33688e+17,0 nan,nan
-8.37407e+17,0 nan,nan
-1.79915e+18,0 nan,nan
-2.96302e+18,0 nan,nan
-4.11485e+18,0 nan,nan
-5.03876e+18,0 nan,nan
-5.57617e+18,0 nan,nan
-5.65307e+18,0 nan,nan
-5.28957e+18,0 nan,nan
-4.5872e+18,0 nan,nan
-3.68309e+18,0 nan,nan
...
I'm not an FFT expert, but it may be the case that doing an FFT in float precision on a large array filled with large values can overflow. If you only need to get to 5000 iterations and you're failing at 4860, you might get there if you change all your datatypes from float to double, but I'm not sure about the numerical sense of what you are doing here.
Finally, note that both cufft and fftw perform un-normalized transforms. This may be playing a role in the seeming growth of magnitudes in your data set. As I stated already, I'm not familiar with the arithmetic or algorithm you are trying to implement here.
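If you want to try the double-precision experiment, the changes are mostly mechanical. Here is a hedged sketch of the substitutions (not a drop-in patch; every float buffer and kernel parameter needs the same treatment):
// cufftComplex -> cufftDoubleComplex, float -> double
// CUFFT_C2C -> CUFFT_Z2Z, cufftExecC2C -> cufftExecZ2Z
cufftDoubleComplex *d_data;
cudaMalloc((void**)&d_data, sizeof(cufftDoubleComplex)*n);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_Z2Z);
cufftExecZ2Z(plan, d_data, d_data, CUFFT_FORWARD);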
Is it possible that you have a float underflow happening around iteration 4040? Taking the cube of your data3 would lead me to check out that possibility. It is pretty easy to spiral into an underflow on a float32 if you're not careful. You could throw a check in there to limit your value to some minimum epsilon to prevent this.
I have two matrices
#define MATRIX_SIZE 20
#define BLOCK_SIZE 2
#define TILE_SIZE 2
double** A;
double** B;
Matrix A is dense, Matrix B is tridiagonal. I have created a vectorized representation of A
/* sz = A.rowlen = B.rowlen = A.collen = B.collen */
double* A1d = matrix_to_vector(sz, A);
I have also created a compressed representation of B with the following code
double* l_array = new double[sz - 1];
double* m_array = new double[sz];
double* r_array = new double[sz - 1];
int current_l_idx = 0;
int current_m_idx = 0;
int current_r_idx = 0;
for (int i = 0; i < sz; i++) {
for (int j = 0; j < sz; j++) {
if ((i == j+1) || (i-1 == j)) {
l_array[current_l_idx] = B[i][j];
current_l_idx++;
}
else if ((i == j-1) || (i+1 == j)) {
r_array[current_r_idx] = B[i][j];
current_r_idx++;
}
else if (i == j) {
m_array[current_m_idx] = B[i][j];
current_m_idx++;
}
}
}
I then create an empty 2D vectorized matrix E as well as all my objects for CUDA
double* E1d = matrix_to_vector(sz, E);
double* d_A;
double* d_B_l;
double* d_B_m;
double* d_B_r;
double* d_E;
size_t sizeA = sz * sz * sizeof(double);
size_t sizeB_lr = (sz - 1) * sizeof(double);
size_t sizeB_m = sz * sizeof(double);
cudaMalloc(&d_A, sizeA);
cudaMalloc(&d_B_l, sizeB_lr);
cudaMalloc(&d_B_m, sizeB_m);
cudaMalloc(&d_B_r, sizeB_lr);
cudaMalloc(&d_E, sizeA);
cudaMemcpy(d_A, A1d, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, l_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, m_array, sizeB_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, r_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_E, E1d, sizeA, cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(MATRIX_SIZE / threads.x, MATRIX_SIZE / threads.y);
cudakernel<<<grid, threads>>>(sz, d_A, d_B_l, d_B_m, d_B_r, d_E);
I can perform this multiplication serially, but unfortunately I have no idea how to implement this on the CUDA device.
Assumptions
A and B are always square
sz will always be evenly divisible by BLOCK_SIZE and TILE_SIZE
BLOCK_SIZE will always equal TILE_SIZE
I suspect, based on your setup code, that you are looking for a tiled shared-memory approach to this kind of matrix multiplication. I don't really want to do your homework for you, so I'll demonstrate an example that doesn't use shared memory.
If you understand how matrix multiplication works, and you also understand how to create an ordinary shared memory GPU matrix multiply kernel, converting the following code to use shared memory should be relatively straightforward:
#include <stdio.h>
#define DSIZE 256
#define BSIZE 32
#define TOL 0.0001
typedef double mytype;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// C = A x B
// A,B,C are all dense
template <typename T>
__global__ void mm(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
for (int i = 0; i < sz; i++)
temp += A[idy*sz+i]*B[i*sz+idx];
C[idy*sz+idx] = temp;}
}
// C = A x B
// A,C are dense, B is tridiagonal
template <typename T>
__global__ void mmt(const T * __restrict__ A, const T * __restrict__ B_l, const T * __restrict__ B_m, const T * __restrict__ B_r, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
if (idx > 0) temp += A[idy*sz+(idx-1)]*B_r[idx-1];
temp += A[idy*sz+(idx) ]*B_m[idx];
if (idx < (sz-1)) temp += A[idy*sz+(idx+1)]*B_l[idx];
C[idy*sz+idx] = temp;}
}
int main(){
mytype *d_A, *h_A, *d_B, *h_B, *d_C, *h_Cd, *h_Cs, *d_B_l, *h_B_l, *d_B_m, *h_B_m, *d_B_r, *h_B_r;
size_t msz = DSIZE*DSIZE;
size_t mszb = msz*sizeof(mytype);
// host side allocations
h_A = (mytype *)malloc(mszb);
h_B = (mytype *)malloc(mszb);
h_Cd =(mytype *)malloc(mszb);
h_Cs =(mytype *)malloc(mszb);
h_B_l = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_r = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_m = (mytype *)malloc( DSIZE*sizeof(mytype));
if (!h_A || !h_B || !h_Cd || !h_Cs || !h_B_l || !h_B_r || !h_B_m) {printf("malloc fail\n"); return -1;}
// device side allocations
cudaMalloc(&d_A, mszb);
cudaMalloc(&d_B, mszb);
cudaMalloc(&d_C, mszb);
cudaMalloc(&d_B_l, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_r, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_m, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMalloc fail");
// prepare A, B matrices
/*
|1 1 1 ...|
A = |2 2 2 ...|
|3 3 3 ...|
|4 4 4 ...|
|... |
|2 1 0 ...| B_l = left/lower subdiagonal (i.e. all 3's)
B = |3 2 1 ...| B_m = middle/main diagonal (i.e. all 2's)
|0 3 2 ...| B_r = right/upper superdiagonal (i.e. all 1's)
|0 0 3 ...|
|... |
*/
for (int i = 0; i < DSIZE; i++){
if (i < DSIZE-1){
h_B_r[i] = 1;
h_B_l[i] = 3;}
h_B_m[i] = 2;
for (int j = 0; j < DSIZE; j++){
h_A[i*DSIZE+j] = i+1;
if (j==i+1) h_B[i*DSIZE+j] = 1;
else if (j==i) h_B[i*DSIZE+j] = 2;
else if (j==i-1) h_B[i*DSIZE+j] = 3;
else h_B[i*DSIZE+j] = 0;}}
// copy data to device
cudaMemcpy(d_A, h_A, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, h_B_l, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, h_B_r, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, h_B_m, DSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
// perform dense-dense multiply
dim3 block(BSIZE,BSIZE);
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
cudaMemset(d_C, 0, mszb);
mm<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaMemcpy(h_Cd, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
// perform dense-sparse multiply
cudaMemset(d_C, 0, mszb);
mmt<<<grid, block>>>(d_A, d_B_l, d_B_m, d_B_r, d_C, DSIZE);
cudaMemcpy(h_Cs, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 3/kernel fail");
// compare results
for (int i = 0; i < DSIZE; i++)
for (int j = 0; j < DSIZE; j++)
if (abs(h_Cs[i*DSIZE+j] - h_Cd[i*DSIZE+j]) > TOL) {printf("results mismatch at (%d, %d) dense: %f sparse: %f\n", i, j, h_Cd[i*DSIZE+j], h_Cs[i*DSIZE+j]); return -1;}
printf("Success!\n");
return 0;
}
Notes:
All of the global memory accesses in the mmt kernel (i.e. for A, the B vectors, and C) should properly coalesce across threads. Therefore, a conversion to use shared memory should also easily yield non-bank-conflicted access to shared memory.
While studying this code may be useful for learning, I recommend any serious sparse-dense matrix multiplication be done with routines from CUSPARSE such as csrmm. It will almost certainly be much more efficient (faster) than the above code, and likely faster than any shared memory conversion of the above code as well.
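For reference, CSR is the storage format that routines like csrmm consume. A minimal sketch of building the CSR arrays for an sz x sz tridiagonal matrix from the l/m/r diagonal arrays in the question (variable names beyond those are mine):
int nnz = 3*sz - 2; // nonzeros in a tridiagonal matrix
double *csrVal = new double[nnz];
int *csrRowPtr = new int[sz+1];
int *csrColInd = new int[nnz];
int k = 0;
for (int i = 0; i < sz; i++){
    csrRowPtr[i] = k; // where row i starts in csrVal
    if (i > 0) { csrVal[k] = l_array[i-1]; csrColInd[k] = i-1; k++; } // subdiagonal
    csrVal[k] = m_array[i]; csrColInd[k] = i; k++; // main diagonal
    if (i < sz-1) { csrVal[k] = r_array[i]; csrColInd[k] = i+1; k++; } // superdiagonal
}
csrRowPtr[sz] = k; // k == nnz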
I'm trying to invert a matrix composed of complex numbers, using the matrix inversion code for real numbers posted in the following link by 'user':
cuda matrix inverse gaussian jordan
The code compiles with no bugs, but the problem is that the output is wrong! I don't know where I went wrong.
Can anyone please help?
Thank you in advance!
Here is the complete code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n = 3;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCimag(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *d_L, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
dim3 numBlocks(16,16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost );
cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCimag(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
Thank you @RobertCrovella for your fast and very insightful suggestion! Regarding your answer to my question: I changed my threadsPerBlock(4,4) and numBlocks(1,1), so I'll be using 1 block with 16 threads for my 4x4 matrix. My input matrix is the following:
1 0 0 0
0 2 0 0
0 0 3 0
0 0 0 4
All numbers here are real, so the expected inverted matrix should look like
1 0 0 0
0 1/2 0 0
0 0 1/3 0
0 0 0 1/4
and I'm not getting this at all. I ran the cuda-memcheck tool to see whether my kernel was failing to launch, but it didn't show any error messages. I started learning CUDA very recently and don't have much experience. Can anyone give a more detailed response? Thank you!
Here is my modified code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != 0 ){
if( cuCreal(d_A[x*h+x]) != 0 ){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
DISCLAIMER: I am not an expert on matrix inversion. I have not worked through the details of the differences between real matrix inversion and complex matrix inversion (there shouldn't be many differences, I think). As suggested already, there are probably better/faster ways to invert matrices.
The immediate problem seems to be in your dev kernel, particularly here:
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
This is requiring that both the real and imaginary parts of the d_A matrix element in question be non-zero in order for the dev kernel to do any work. However I don't think this condition should be necessary. For division, we probably only require that either the real or the imaginary part be non-zero. I think in the complex domain we are actually dividing by zero only if both real and imaginary parts are zero. If you inspect the cuCdiv function provided in cuComplex.h, you can ascertain for yourself under what conditions it will "blow up" and therefore what conditions need to be tested for and avoided. I'm confident your test is not correct.
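For reference, complex division can be written as z / w = z * conj(w) / (Re(w)^2 + Im(w)^2) (cuCdiv actually uses a scaled variant of this to guard against intermediate overflow), so the denominator vanishes only when both the real and imaginary parts of w are zero.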
The following modified code works correctly for me, for your simple test case:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <math.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if(( cuCimag(d_A[x*h+x]) != 0 ) || ( cuCreal(d_A[x*h+x]) != 0 )){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
__syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
return 0;
}
FINAL DISCLAIMER: I'm not saying this is a fully-validated approach to inversion of matrices of arbitrary dimensions. I'm simply pointing out a critical bug that seems to make it fail for your simple test case. I also expressed some reservations in the previous question you linked.
I have a program that calculates 1-10 million scalar products.
It looks like this. ts and A are arrays of about 1000-10000 3D points (each element is a 3x1 vector). For the moment, with ts.size() = 10,000 and A.size() = 1000, my code takes about 41ms. I have not done any parallelization so far. Will the calculations be much faster, for example, in CUDA? I have no such experience. Or is there any other way? Thanks.
for(int i = 0; i< ts.size(); i++){
for(int j = 0; j< A.size(); j++){
if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
score[i] +=1;
}
}
}
This is my implementation of the scalar product.
double scalarProduct(const Point &p1,const Point &p2)
{
return (p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ()) ;
}
Could I use Lapack or Eigen instead, formulating the problem as matrix multiplication? I've done that in Matlab and it is only 5 times slower. Any speedup would be great. With OpenMP I guess I could be 4x faster.
This answer consists of two parts:
Accelerating the calculation of many independent scalar products;
Solving your specific problem.
PART 1
The problem of calculating a large number of independent scalar products is an embarrassingly parallel problem. If you aim at accelerating only the mentioned scalar products, retaining the rest of the computation on the CPU, then I agree with Calvin that most of the time will be spent in the device-to-host memory transfer of the large N*M result matrix. However, if you purge your timing of the mentioned transfer, accelerating the calculations will be worth it. This is shown by the code below, tested on an Intel Xeon E5-2650 2.00 GHz eight-core processor equipped with an NVIDIA Kepler K20c card, with the following timing:
CPU: 27ms; GPU (without D2H transaction): 0.08ms; GPU (with D2H transaction): 23ms
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
output[idy * N + idx] = scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy]);
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*M*sizeof(float));
float *D = (float*)malloc(N*M*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*M*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / (float)RAND_MAX;
Ay[i] = rand() / (float)RAND_MAX;
Az[i] = rand() / (float)RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / (float)RAND_MAX;
By[i] = rand() / (float)RAND_MAX;
Bz[i] = rand() / (float)RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
C[i*M + j] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]);
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*M*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N*M; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
PART 2
For solving your specific problem, CUDA will be worth it, even considering the D2H memory transaction (which is very cheap). This is confirmed by the code below, tested on the same system as above, with the following timing:
CPU: 46ms; GPU (with D2H transaction): 0.31ms;
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
if(abs(scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy])) < 0.01f)
output[idx] = 1.f;
else
output[idx] = 0.f;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*sizeof(float));
float *D = (float*)malloc(N*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / (float)RAND_MAX;
Ay[i] = rand() / (float)RAND_MAX;
Az[i] = rand() / (float)RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / (float)RAND_MAX;
By[i] = rand() / (float)RAND_MAX;
Bz[i] = rand() / (float)RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
if(abs(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < 0.01f)
C[i] = 1.f;
else
C[i] = 0.f;
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<iDivUp(N, BLOCKSIZE_X), BLOCKSIZE_X>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
Instead of optimising the arithmetic, you should use a better algorithm first.
In most practical situations ts and A are not totally random each cycle, and you may be able to organise (sort) them spatially and greatly reduce the need to evaluate the spatial metric.
Now, if you insist on sticking with the current algorithm, you can enable the compiler to emit SSE code; this should give some instant boost without any programming work.
Since you have to ask this question, the chance that you can further squeeze out cycles by manually coding with compiler intrinsics is relatively slim.
As for CUDA: for just 10 million dot products, the overhead of the CPU-RAM to GPU-RAM communication is significant and not worth all the trouble.
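For the SSE route, something like the following is usually enough (GCC/Clang flags shown; your toolchain is an assumption on my part):
g++ -O3 -march=native -funroll-loops myprog.cpp
-O3 turns on auto-vectorization, and -march=native lets the compiler use every vector instruction set the build machine supports.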
To parallelize this using MIMD with OpenMP you can do this:
#pragma omp parallel for
for(int i = 0; i< ts.size(); i++){
for(int j = 0; j< A.size(); j++){
if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
score[i] +=1;
}
}
}
You could also consider using SIMD. In that case you should change your data structure and store blocks of points equal to the SIMD width (4 for SSE with floats). Something like
class PointBlock4 {
float x[4];
float y[4];
float z[4];
//
}
Each block has four points. This is obviously more complicated, but it's achievable. You could get a speedup of four as well. Combining SIMD and MIMD you could get a speedup of 16x (with four cores). But for large n your algorithm will become memory bound rather than compute bound, so you will achieve a much lower speedup. In fact, your algorithm may already be memory bound, so you might not achieve much with SIMD or MIMD. I would test OpenMP first to see if you gain much.
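If you do go the intrinsics route, here is a minimal sketch of one block evaluation (SSE, single precision; it assumes the x/y/z members above are made public or PointBlock4 is declared as a struct, and the function name is mine):
#include <xmmintrin.h>
// four dot products against a single point (p2x, p2y, p2z) at a time
void scalarProduct4(const PointBlock4 &b, float p2x, float p2y, float p2z, float out[4])
{
    __m128 sx = _mm_mul_ps(_mm_loadu_ps(b.x), _mm_set1_ps(p2x));
    __m128 sy = _mm_mul_ps(_mm_loadu_ps(b.y), _mm_set1_ps(p2y));
    __m128 sz = _mm_mul_ps(_mm_loadu_ps(b.z), _mm_set1_ps(p2z));
    _mm_storeu_ps(out, _mm_add_ps(sx, _mm_add_ps(sy, sz)));
}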
I'm trying to pass a 2d array to a kernel so that each thread can access index = threadIdx.x + (blockIdx.x * blockDim.x) but I'm having trouble figuring out just how to do this and how to copy the data back over.
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
for (num_blocks)
for(block_size)
h_array[block][thread] should be 1
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
What am I doing wrong, here?
Your cudaMemset2D is accessing a bigger memory space than you previously allocated with cudaMallocPitch, and your cudaMemcpy2D is copying only a small portion of that memory.
You should use the function in the following way:
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)); <- This size is bigger than the previously declared
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* you forgot this here */, num_blocks, cudaMemcpyDeviceToHost);
Here's a complete code that passes a basic test, with the errors mentioned by @hidrargyro fixed:
$ cat t236.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
int main(){
int *d_array, *h_array;
int block_size = 256;
int num_blocks = 256;
int grid_size = num_blocks;
h_array=(int *)malloc(block_size*num_blocks*sizeof(int));
if (h_array==0) {printf("malloc fail\n"); return 1;}
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMallocPitch fail");
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMemset2D fail");
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2D fail");
for (int i = 0; i<num_blocks; i++)
for(int j = 0; j<block_size; j++)
if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;}
printf("success\n");
return 0;
}
$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$