Unable to get the cublasSgelsBatched function to work - C++

I'm currently trying to get cublasSgelsBatched (https://docs.nvidia.com/cuda/cublas/index.html) to work. I started by making a small test case to see exactly which parameters are needed and how they must be passed in. After much trial and error I still can't get it to work: I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED, a very vague error. Other cuBLAS test cases run fine, and I also tested the input matrix in MATLAB, which does find a least-squares solution.
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;
    // create handle
    stat = cublasCreate(&m_cuBLAS);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(M * sizeof(float));
    srand(100);
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };
    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    //run cublas
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
                                               CUBLAS_OP_N,
                                               C,
                                               M,
                                               1,
                                               &gAmat,
                                               lda, //or 1
                                               &gYmat,
                                               lda,
                                               &info,
                                               NULL,
                                               1);
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cublasDestroy(m_cuBLAS);
    return 0;
}
I'm on Windows 10, building in Microsoft Visual Studio with CUDA 9.0.
I'd really appreciate some help.

As pointed out in the comments, you are not creating a proper array of pointers on the device. For its data parameters, the batched function expects an array of pointers that lives in device memory; for example, the documentation describes Aarray as:
Aarray device input/output array of pointers to array, with each array of dim. m x n with lda>=max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.
Passing &gAmat, for example, seems to satisfy the type requirement, but that pointer does not point to device memory.
The following modifications to your code focused on proper handling of gAmat and gYmat seem to run without error for me:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;
    // create handle
    stat = cublasCreate(&m_cuBLAS);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(M * sizeof(float));
    srand(100);
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };
    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    //run cublas
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
                                               CUBLAS_OP_N,
                                               C,
                                               M,
                                               1,
                                               ggAmat,
                                               lda, //or 1
                                               ggYmat,
                                               lda,
                                               &info,
                                               NULL,
                                               1);
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$
Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers for each of A and Y, as sketched below.
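A rough sketch of that (my addition, following the same pattern as the code above; the names are illustrative and all matrices are assumed to share the same dimensions):

const int batch = 4;
float *dA[batch], *dY[batch]; // device data pointers, tracked on the host
for (int i = 0; i < batch; i++) {
    cudaMalloc(&dA[i], M * C * sizeof(float));
    cudaMalloc(&dY[i], C * sizeof(float));
    // ... copy the i-th A matrix and Y vector to the device here ...
}
float **dAarr, **dYarr; // device-resident arrays of the device pointers
cudaMalloc(&dAarr, batch * sizeof(float*));
cudaMalloc(&dYarr, batch * sizeof(float*));
cudaMemcpy(dAarr, dA, batch * sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(dYarr, dY, batch * sizeof(float*), cudaMemcpyHostToDevice);
// then: cublasSgelsBatched(handle, CUBLAS_OP_N, C, M, 1, dAarr, lda, dYarr, lda, &info, NULL, batch);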
Based on comments below, here is a version of the code using non-random input:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t status;
    // create handle
    status = cublasCreate(&m_cuBLAS);
    std::cout << "status = " << status << std::endl;
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };
    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    //run cublas
    status = cublasSgelsBatched(m_cuBLAS,
                                CUBLAS_OP_N,
                                C,
                                M,
                                1,
                                ggAmat,
                                lda, //or 1
                                ggYmat,
                                lda,
                                &info,
                                NULL,
                                1);
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$
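As a quick sanity check (my addition, not part of the original answer): with column-major storage and lda = 3, A is the 3x2 matrix with columns (6, 7, 6) and (5, 5, 5), and y = (9, 3, 10). The normal equations A^T A x = A^T y become

| 121  95 | | x1 |   | 135 |
|  95  75 | | x2 | = | 110 |

with determinant 121*75 - 95*95 = 50, so x1 = (135*75 - 95*110)/50 = -6.5 and x2 = (121*110 - 95*135)/50 = 9.7, matching the first two printed values. Note that gels-style routines overwrite the right-hand side and only the first M = 2 entries hold the solution; the third printed value is not part of it (for an overdetermined system the trailing entries relate to the residual).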

Related

How to access dynamically allocated array in CUDA

Here I'm trying to access a dynamically allocated array in CUDA. However, after running, the output is c[0][0] = 0. Am I accessing the allocated array correctly? I think the way I'm copying the arrays is correct, and yet for some reason the value of C is not being changed on the device.
#include<iostream>
using namespace std;

__global__ void add_matrix(float *A, float *B, float *C, int n) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i < n) && (j < n)){
        C[i*n+j] = A[i*n+j] + B[i*n+j];
    }
}

int main(){
    const size_t N = 1024;
    const size_t size = N * N * sizeof(float);
    float *A, *B, *C;
    A = (float*) malloc(size);
    B = (float*) malloc(size);
    C = (float*) malloc(size);
    for (size_t i=0; i<N*N; i++){
        A[i] = 5.0;
        B[i] = 6.0;
    }
    float *A_d, *B_d, *C_d;
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, size);
    cudaMalloc((void**)&C_d, size);
    auto code = cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying A to device" << endl;
    }
    code = cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying B to device" << endl;
    }
    dim3 threads(N, N);
    dim3 blocks(1,1);
    add_matrix<<<blocks, threads>>>(A_d, B_d, C_d, N);
    code = cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);
    if (code != cudaSuccess){
        cout << "Error copying C from device" << endl;
    }
    std::cout << "C[0][0] : " << C[0] << std::endl;
    free(A); free(B); free(C);
    cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
    return 0;
}
The problem was the arrangement of the blocks. I totally forgot that each block can run only a limited number of threads. We can obtain the maximum threads per block by querying the maxThreadsPerBlock attribute via cudaDeviceGetAttribute. It seems the Colab GPU supports 1024 threads per block, so I changed the arrangement this way:
dim3 threads(32,32);
dim3 blocks(32,32);
And it worked
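For reference, here is a small sketch (my addition, not part of the original answer) of querying that limit at runtime instead of hard-coding it; cudaDevAttrMaxThreadsPerBlock is the relevant CUDA runtime attribute:

int dev = 0, maxThreads = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, dev); // e.g. 1024
int side = (int)std::sqrt((double)maxThreads); // e.g. 32; needs <cmath>
dim3 threads(side, side);
dim3 blocks((N + side - 1) / side, (N + side - 1) / side); // enough blocks to cover all N x N elements
add_matrix<<<blocks, threads>>>(A_d, B_d, C_d, N);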

Trying to run a cusolverDnSSgels test case, however it is not working

I'm busy working on an LS method; I manually implemented a conjugate gradient solver, but after updating my CUDA version I saw that there is a new function (cusolverDnSSgels), which I assume is faster than my manual implementation. My first task was to try to run it on a test case (see below); I'd expect the result to be -6.5, 9.7 according to MATLAB. Unfortunately I cannot find what I did wrong, and I also cannot find an example, because it is a relatively new function.
The output says that niter = -3, which according to the documentation would suggest too many iterations. However, that makes no sense, as this is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"
int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;
    // create handle
    stat = cusolverDnCreate(&cusolverH);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float *gdwork;
    size_t work_bytes;
    stat = cusolverDnSSgels_bufferSize(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
    std::cout << "Status = " << stat << std::endl;
    int niter = 0;
    int dinfo = 0;
    cudaMalloc(&gdwork, work_bytes * sizeof(float));
    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
    std::cout << "Status = " << stat << std::endl;
    std::cout << "niter = " << niter << std::endl;
    std::cout << "dinfo = " << dinfo << std::endl;
    cudaDeviceSynchronize();
    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cusolverDnDestroy(cusolverH);
    return 0;
}
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
You have a problem with your dinfo parameter usage. Referring to the documentation, we see that:
Parameters of cusolverDngels() functions:

parameter   Memory   In/out   Meaning
dinfo       device   output   Status of the IRS solver on the return. If 0 - solve was successful. If dinfo = -i then i-th argument is not valid.
The dinfo parameter is expected to live in device memory, but you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;
    // create handle
    stat = cusolverDnCreate(&cusolverH);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float *gdwork;
    size_t work_bytes;
    stat = cusolverDnSSgels_bufferSize(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
    std::cout << "Status = " << stat << std::endl;
    int niter = 0;
    int *dinfo, hinfo;
    cudaMalloc(&gdwork, work_bytes * sizeof(float));
    cudaMalloc(&dinfo, sizeof(int));
    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
    cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Status = " << stat << std::endl;
    std::cout << "niter = " << niter << std::endl;
    std::cout << "dinfo = " << hinfo << std::endl;
    cudaDeviceSynchronize();
    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cusolverDnDestroy(cusolverH);
    return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$
Notes:
I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
You can get a hint as to the problem by running your code with cuda-memcheck
It was fairly quick to spot the problem by reviewing your parameter usage against the table of parameter locations (host/device) given in the documentation. You had a similar problem here, which could likewise be narrowed down by checking your parameter locations against the documentation's table. This may be a good thing to check to save yourself time in the future; a minimal error-checking sketch follows.
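As an aside (my addition, not from the original answer), a small error-checking macro around runtime API calls makes mistakes like this surface earlier and pairs well with cuda-memcheck:

#define CUDA_CHECK(call) do {                                  \
    cudaError_t e = (call);                                    \
    if (e != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error '%s' at %s:%d\n",          \
                cudaGetErrorString(e), __FILE__, __LINE__);    \
        exit(EXIT_FAILURE);                                    \
    }                                                          \
} while (0)
// usage: CUDA_CHECK(cudaMalloc(&dinfo, sizeof(int)));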

Got the initial value from GPU programming in OpenCL

I developed a small code to add two small vector using GPU by OpenCL library. The main code vectorAdd.cc is as follows:
#include <iostream>
#include <CL/cl.hpp>
#include <cassert>
#include <fstream>
#include <time.h>
#include <cmath>
void randomInit(float *data, int size)
{
    for (unsigned int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

int main()
{
    //get all platforms (drivers)
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    assert(platforms.size() > 0);
    cl::Platform myPlatform = platforms[0];
    std::cout << "Using platform: " << myPlatform.getInfo<CL_PLATFORM_NAME>() << "\n";
    //get default device of the default platform
    std::vector<cl::Device> devices;
    myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    assert(devices.size() > 0);
    cl::Device myDevice = devices[0];
    std::cout << "Using device: " << myDevice.getInfo<CL_DEVICE_NAME>() << "\n";
    std::ifstream vectorAddFile("vector_add_kernel.cl");
    std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
    cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
    cl::Context context(myDevice);
    cl::Program program(context, sources);
    int szVec = 10;
    float* A = new float[szVec];
    float* B = new float[szVec];
    randomInit(A, szVec);
    randomInit(B, szVec);
    float* C = new float[szVec];
    std::fill_n(C, szVec, 0);
    // create buffers on the device
    cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
    cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
    cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);
    //create queue to which we will push commands for the device.
    cl::CommandQueue queue(context, myDevice);
    //write arrays A and B to the device
    //queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
    //queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);
    auto err = program.build("cl.std.CL1.2");
    // run the kernel
    cl::Kernel kernel(program, "vector_add", &err);
    kernel.setArg(0, buffer_A);
    kernel.setArg(1, buffer_B);
    kernel.setArg(2, buffer_C);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
    queue.finish();
    //read result C from the device to array C
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);
    std::cout << " result: \n";
    for(int i = 0; i < szVec; i++)
    {
        std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
    }
    std::cout << std::endl;
    return 0;
}
and the kernel code vector_add_kernel.cl is as follows:
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
{
    // Get the index of the current element
    int i = get_global_id(0);
    // Do the operation
    C[i] = A[i] + B[i];
}
and the result i got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
result:
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem, as you can see, is that the result is always whatever I initialized vector C to, and I do not understand why. I also initialized vector C with some other values, and again the result was just the initial values.
It's probably just a syntax error.
auto err = program.build("cl.std.CL1.2");
should be
auto err = program.build("-cl-std=CL1.2");
The documentation on clBuildProgram has more information about the supported options.
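For completeness, a sketch (my addition) of the corrected call together with a build-log dump, which usually pinpoints a bad option or a kernel compile error; cl::Program::getBuildInfo is part of the OpenCL C++ wrapper:

cl_int err = program.build("-cl-std=CL1.2");
if (err != CL_SUCCESS) {
    // the build log names the offending option or the kernel line that failed to compile
    std::string log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(myDevice);
    std::cerr << "Build failed (" << err << "):\n" << log << std::endl;
}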
The problem stems from building the program with this command
auto err = program.build("cl.std.CL1.2");
and by replacing the command above with
auto err = program.build();
the problem was solved.
But I still do not know why this happened. Any idea?

Incorrect output when transforming from complex to real numbers using CUDA cuFFT

I am using the cuFFT library from CUDA version 7.5 to perform some FFTs and inverse FFTs.
I have a problem when performing an inverse FFT using the cufftExecC2R(.,.) function.
When I use batch_size = 1 in cufftPlan1d(,) I get the correct result. However, when I increase the batch size, the results are incorrect.
I am pasting a minimal sample code to illustrate this. Please ignore the dirtiness of the code, as I just created it quickly.
#include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctime>
#include <iostream>
typedef float2 Complex;
void iTest(int argc, char** argv);
#define SIGNAL_SIZE 9
#define BATCH_SIZE 2
int main(int argc, char** argv) {
    iTest(argc, argv);
    return 0;
}

void iProcess(Complex *x, double *y, size_t n) {
    cufftComplex *deviceData;
    cudaMalloc(reinterpret_cast<void**>(&deviceData),
               SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyHostToDevice);
    cufftResult cufftStatus;
    cufftHandle handle;
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
        printf("cufftPlan1d failed!");
    }
    cufftComplex *d_complex;
    cudaMalloc(reinterpret_cast<void**>(&d_complex),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2C(handle, deviceData, d_complex, CUFFT_FORWARD);
    if (cufftStatus != cudaSuccess) {
        printf("cufftExecR2C failed!");
    }
    cufftComplex *hostOutputData = (cufftComplex*)malloc(
        (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(hostOutputData, d_complex,
               SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyDeviceToHost);
    std::cout << "\nPrinting COMPLEX" << "\n";
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
        printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
    //! convert complex to real
    cufftHandle c2r_handle;
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
        printf("cufftPlan1d failed!");
    }
    cufftReal *d_odata;
    cudaMalloc(reinterpret_cast<void**>(&d_odata),
               sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2R(c2r_handle, d_complex, d_odata);
    cufftReal odata[SIGNAL_SIZE * BATCH_SIZE];
    cudaMemcpy(odata, d_odata, sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE,
               cudaMemcpyDeviceToHost);
    std::cout << "\nPrinting REAL" << "\n";
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
        std::cout << i << " \t" << odata[i]/(SIGNAL_SIZE) << "\n";
    }
    cufftDestroy(handle);
    cudaFree(deviceData);
}

void iTest(int argc, char** argv) {
    Complex* h_signal = reinterpret_cast<Complex*>(
        malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));
    std::cout << "\nPrinting INPUT" << "\n";
    for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
        h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
        h_signal[i].y = 0;
        std::cout << i << "\t" << h_signal[i].x << "\n";
    }
    std::cout << "\n";
    double y[SIGNAL_SIZE * BATCH_SIZE];
    iProcess(h_signal, y, 1);
}
I cannot figure out where the bug is in my code or what information I am missing.
Sample output when using BATCH_SIZE = 1
Sample output when using BATCH_SIZE = 2
The information that you are missing is that the input data layout expected for a C2C transform differs from that expected for a C2R (or R2C) transform.
You should start by reading this section and this section of the CUFFT documentation.
Note that it says:
Each of those functions demands different input data layout
But you are passing input data that was correct for a C2C transform directly to a C2R transform. That won't work.
The most direct solution, in my opinion, is to convert all of your work to C2C transform types. A C2C transform supports both the forward (e.g. "real-to-complex") and inverse (e.g. "complex-to-real") directions. The C2R transform type you are using can also go "complex-to-real", but the data arrangement it expects differs from the arrangement a C2C plan uses on the inverse path, for what is otherwise the same transform. You have not accounted for this.
Here is a worked example showing a modified version of your code that uses C2C for both the forward and inverse paths, and correctly reproduces the input for a batch size of 2:
$ cat t19.cu
#include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctime>
#include <iostream>
typedef float2 Complex;
void iTest(int argc, char** argv);
#define SIGNAL_SIZE 9
#define BATCH_SIZE 2
int main(int argc, char** argv) {
    iTest(argc, argv);
    return 0;
}

void iProcess(Complex *x, double *y, size_t n) {
    cufftComplex *deviceData;
    cudaMalloc(reinterpret_cast<void**>(&deviceData),
               SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyHostToDevice);
    cufftResult cufftStatus;
    cufftHandle handle;
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
        printf("cufftPlan1d failed!");
    }
    cufftComplex *d_complex;
    cudaMalloc(reinterpret_cast<void**>(&d_complex),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2C(handle, deviceData, d_complex, CUFFT_FORWARD);
    if (cufftStatus != cudaSuccess) {
        printf("cufftExecR2C failed!");
    }
    cufftComplex *hostOutputData = (cufftComplex*)malloc(
        (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(hostOutputData, d_complex,
               SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyDeviceToHost);
    std::cout << "\nPrinting COMPLEX" << "\n";
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
        printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
    //! convert complex to real
    /* cufftHandle c2r_handle;
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
        printf("cufftPlan1d failed!");
    }
    */
    cufftComplex *d_odata;
    cudaMalloc(reinterpret_cast<void**>(&d_odata),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2C(handle, d_complex, d_odata, CUFFT_INVERSE);
    cufftComplex odata[SIGNAL_SIZE * BATCH_SIZE];
    cudaMemcpy(odata, d_odata, sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE,
               cudaMemcpyDeviceToHost);
    std::cout << "\nPrinting REAL" << "\n";
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
        std::cout << i << " \t" << odata[i].x/(SIGNAL_SIZE) << "\n";
    }
    cufftDestroy(handle);
    cudaFree(deviceData);
}

void iTest(int argc, char** argv) {
    Complex* h_signal = reinterpret_cast<Complex*>(
        malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));
    std::cout << "\nPrinting INPUT" << "\n";
    for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
        h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
        h_signal[i].y = 0;
        std::cout << i << "\t" << h_signal[i].x << "\n";
    }
    std::cout << "\n";
    double y[SIGNAL_SIZE * BATCH_SIZE];
    iProcess(h_signal, y, 1);
}
$ nvcc -arch=sm_61 -o t19 t19.cu -lcufft
t19.cu: In function ‘void iProcess(Complex*, double*, size_t)’:
t19.cu:34:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
if (cufftStatus != cudaSuccess) {
^
t19.cu:43:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
if (cufftStatus != cudaSuccess) {
^
$ cuda-memcheck ./t19
========= CUDA-MEMCHECK
Printing INPUT
0 0.840188
1 0.394383
2 0.783099
3 0.79844
4 0.911647
5 0.197551
6 0.335223
7 0.76823
8 0.277775
9 0.55397
10 0.477397
11 0.628871
12 0.364784
13 0.513401
14 0.95223
15 0.916195
16 0.635712
17 0.717297
Printing COMPLEX
0 5.306536 0.000000
1 0.015338 -0.734991
2 -0.218001 0.740248
3 0.307508 -0.706533
4 1.022732 0.271765
5 1.022732 -0.271765
6 0.307508 0.706533
7 -0.218001 -0.740248
8 0.015338 0.734991
9 5.759857 0.000000
10 -0.328981 0.788566
11 0.055356 -0.521014
12 -0.127504 0.581872
13 0.014066 0.123027
14 0.014066 -0.123027
15 -0.127504 -0.581872
16 0.055356 0.521014
17 -0.328981 -0.788566
Printing REAL
0 0.840188
1 0.394383
2 0.783099
3 0.79844
4 0.911647
5 0.197551
6 0.335223
7 0.76823
8 0.277775
9 0.55397
10 0.477397
11 0.628871
12 0.364784
13 0.513401
14 0.95223
15 0.916195
16 0.635712
17 0.717297
========= ERROR SUMMARY: 0 errors
$
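Alternatively, if you want to keep the C2R path, here is a sketch (my addition, assuming the signal is real to begin with): each batch element must then hold the packed half-spectrum of SIGNAL_SIZE/2 + 1 complex values produced by a matching R2C plan, not the full spectrum a C2C forward transform produces:

cufftReal *d_real;     // SIGNAL_SIZE * BATCH_SIZE reals (device)
cufftComplex *d_half;  // (SIGNAL_SIZE/2 + 1) * BATCH_SIZE complex values (device)
cudaMalloc(reinterpret_cast<void**>(&d_real), sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE);
cudaMalloc(reinterpret_cast<void**>(&d_half), sizeof(cufftComplex) * (SIGNAL_SIZE/2 + 1) * BATCH_SIZE);
cufftHandle r2c, c2r;
cufftPlan1d(&r2c, SIGNAL_SIZE, CUFFT_R2C, BATCH_SIZE);
cufftPlan1d(&c2r, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
cufftExecR2C(r2c, d_real, d_half); // packed ("Hermitian") half-spectrum per batch element
cufftExecC2R(c2r, d_half, d_real); // like C2C, the round trip is unnormalized: divide by SIGNAL_SIZE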

How to call an existing host function from a device function in CUDA [closed]

I have seen a similar question here.
However, I could not get an exact answer there, and it was written in 2012.
I am trying to call the cublasStatus_t cublasSgbmv(...) function, which is declared in "cublas_v2.h", from a __global__ function. However, I could not get the dynamic parallelism feature to work. I only have one source.cu file, but I have read that I should compile with separate compilation so that device and host code are split, and then link the outputs.
Does anyone know how to do this, or a good source that explains it?
Thanks in advance.
edit: if downvoted, please at least explain the reason so I can learn from my mistake.
edit2:
My specific problem is that I'm using the following code in my Source.cu:
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cstdio>
#include <stdio.h>
#include <device_launch_parameters.h>
#include <stdlib.h> //srand(), rand()
#include <time.h>
#include <builtin_types.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define IDX2C(i ,j , ld ) ((( j )*( ld ))+( i ))
#define HEIGHT 4
#define WIDTH 4
#define V 4
#define KL 2
#define KU 1
#define THREADS_PER_BLOCK 512
#pragma comment(lib, "cublas")
//#pragma comment(lib, "helper_cuda")
using namespace std;
void create_Matrix(int* matrix, int width, int height){
    int i, len;
    len = height * width;
    srand(time(NULL));
    for (i = 0; i < len; i++){
        matrix[i] = rand() % 10 + 1; //generates number between 1-10
    }
}

template <typename T>
void print_vector(T* vector, int len){
    for (int i = 0; i < len; i++)
        cout << vector[i] << " ";
    cout << endl;
}

template <typename T>
void creating_bandedMatrix(T* bandedMatrix, int height, int width, int ku, int kl){
    //fill matrix with zeros at the beginning
    int i, len;
    len = height * width;
    for (i = 0; i < len; i++){
        bandedMatrix[i] = 0;
    }
    srand(time(NULL));
    //filling banded diagonal
    int start, end;
    for (int i = 0; i < height; i++){
        start = i - kl;
        if (start < 0)
            start = 0;
        end = i + ku + 1;
        if (end > width)
            end = width;
        for (int j = start; j < end; j++){
            *(bandedMatrix + (i*width) + j) = (float)(rand() % (10) + 1); //rand() / (T)RAND_MAX;;
        }
    }
}

template <typename T>
void print_matrix(T* matrix, int width, int height){
    int len = width*height;
    cout << "asdsffffff" << endl;
    for (int i = 0; i < len; i++){
        if (!(i%width))
            cout << endl;
        cout << i << ":" << matrix[i] << " ";
    }
    cout << endl;
}

template <typename T>
void computeMatrixVectorMultiplication(T* bandedMatrix, T* vector2){
    T row_sum = 0;
    T* bandedHostResult = (T*)malloc(WIDTH * sizeof(T));
    for (int i = 0; i < HEIGHT; i++){
        row_sum = 0;
        for (int j = 0; j < WIDTH; j++){
            row_sum += (*(bandedMatrix + i*WIDTH + j)) * vector2[j];
        }
        bandedHostResult[i] = row_sum;
    }
    //printing the result
    cout << "\n\nBanded Host Result...\n";
    print_vector(bandedHostResult, WIDTH);
}

template <typename T>
void fillLapackMatrix(T* lapack_matrix, T* bandedMatrix, int kl, int ku, int banded_w, int banded_h, int lapack_w, int lapack_h){
    int i, j, lapack_i;
    int len = lapack_h * lapack_w;
    for (i = 0; i < len; i++){
        lapack_matrix[i] = 0;
    }
    for (i = 0; i < banded_w; i++){
        for (j = 0; j < banded_h; j++){
            lapack_i = ku + i - j;
            *(lapack_matrix + lapack_i*lapack_w + j) = *(bandedMatrix + i*banded_w + j);
            //lapack_matrix[lapack_i*lapack_w + j] = bandedMatrix[i*bandedMatrix + j];
        }
    }
}

__global__ void device_cublasSgbmv(int m, int n, int kl, int ku, float* alpha, float* A, int lda, float* B, int ldb, float* R, int ldr, float* beta){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasOperation_t trans = CUBLAS_OP_N;
    float* dev_x;
    cudaMalloc((void**)&dev_x, sizeof(float) * n);
    if(index < ldr){
        cublasSgbmv(handle, trans, m, n, kl, ku, alpha, A, m, B+index*n, 1, beta, R+index*n, 1);
        index = 0;
    }
}

void fillNormalMatrix(float* B, int h, int w){
    for(int i = 0; i < h; i++){
        for(int j = 0; j < w; j++){
            B[i*w + j] = 1;
        }
    }
}

int main()
{
    cublasStatus_t status;
    float *A;
    float *x, *y;
    float *dev_x, *dev_y;
    int incx, incy;
    float *dev_A = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int matrixSize = WIDTH * HEIGHT;
    int i, j;
    cublasHandle_t handle;
    /* Initialize CUBLAS */
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    //Allocate host memory for the matrices
    A = (float *)malloc(matrixSize * sizeof(float));
    //Allocate memory for host vectors
    x = (float *)malloc(WIDTH * sizeof(float));
    y = (float*)malloc(WIDTH * sizeof(float));
    // Fill the matrices with test data
    creating_bandedMatrix(A, WIDTH, HEIGHT, KU, KL);
    cout << "Banded Matrix\n";
    print_matrix(A, WIDTH, HEIGHT);
    //Fill the vectors with random data
    for (i = 0; i < WIDTH; i++){
        x[i] = 1;// (float)(rand() % (10) + 1);
        y[i] = (float)(rand() % (10) + 1);
    }
    cout << "\nvector x...\n";
    print_vector(x, WIDTH);
    //cout << "\nvector y...\n";
    //print_vector(y, WIDTH);
    //Allocate device memory for the matrix
    if (cudaMalloc((void **)&dev_A, matrixSize * sizeof(float)) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }
    //Allocate device memory for vectors
    if (cudaMalloc((void**)&dev_x, WIDTH * sizeof(float)) != cudaSuccess){
        fprintf(stderr, "Device Vector Allocation PROBLEM\n");
        return EXIT_FAILURE;
    }
    if (cudaMalloc((void**)&dev_y, WIDTH * sizeof(float)) != cudaSuccess){
        fprintf(stderr, "Device Vector Allocation PROBLEM\n");
        return EXIT_FAILURE;
    }
    // Initialize the device vectors with the host vectors
    status = cublasSetVector(WIDTH, sizeof(float), x, 1, dev_x, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write x vector)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(WIDTH, sizeof(float), y, 1, dev_y, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write y vector)\n");
        return EXIT_FAILURE;
    }
    //initialize matrix with lapack format
    int lapack_width = WIDTH > HEIGHT ? HEIGHT : WIDTH;
    int lapack_height = KL + KU + 1;
    int lapackSize = lapack_height * lapack_width;
    float* lapack_matrix = (float*)malloc(lapackSize * sizeof(float));
    fillLapackMatrix(lapack_matrix, A, KL, KU, WIDTH, HEIGHT, lapack_width, lapack_height);
    cout << "\n\nLAPACK Matrix\n";
    print_matrix(lapack_matrix, lapack_width, lapack_height);
    //convert to column-major matrix
    float* col = (float*)malloc(lapackSize * sizeof(float));
    for (i = 0; i < WIDTH; i++){
        for (j = 0; j < HEIGHT; j++){
            col[i + WIDTH*j] = lapack_matrix[WIDTH*i + j];
        }
    }
    cout << "Lapack Column Based Matrix\n";
    print_matrix(col, HEIGHT-1, WIDTH);
    //status = cublasSetVector(lapackSize, sizeof(float), A, 1, dev_A, 1);
    cublasSetMatrix(HEIGHT, WIDTH, sizeof(float), col, HEIGHT, dev_A, HEIGHT);
    cublasOperation_t trans = CUBLAS_OP_N;
    incy = incx = 1;
    ///////////////////////// Banded Matrix-Matrix Multiplication /////////////////////////
    float *B, *dev_B, *dev_R, *R;
    B = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
    R = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
    fillNormalMatrix(B, WIDTH, HEIGHT);
    cudaMalloc((void**)&dev_B, matrixSize*sizeof(*B));
    cudaMalloc((void**)&dev_R, matrixSize*sizeof(*R));
    cublasSetMatrix(HEIGHT, WIDTH, sizeof(*B), B, HEIGHT, dev_B, HEIGHT);
    cout << "Matrix B\n";
    print_matrix(B, HEIGHT, WIDTH);
    cout << "gfsdf\n";
    device_cublasSgbmv<<<1,4>>>(HEIGHT, WIDTH, KL, KU, &alpha, dev_A, WIDTH, dev_B, HEIGHT, dev_R, HEIGHT, &beta);
    cout << "after\n";
    cublasGetMatrix(HEIGHT, WIDTH, sizeof(*R), dev_R, WIDTH, R, WIDTH);
    getchar();
    return 0;
}
and compile it like this:
nvcc -gencode=arch=compute_35,code=sm_35 -lcublas -lcudadevrt -O3 Source.cu -o Source.o -dc
g++ Source.o -lcublas -lcudart
then I get the following:
In function `__sti____cudaRegisterAll_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a()':
tmpxft_00001f1e_00000000-3_lapack_vector.cudafe1.cpp:(.text.startup+0x575): undefined reference to `__cudaRegisterLinkedBinary_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a'
collect2: error: ld returned 1 exit status
You can compile and link the code you have now shown with a single command like this:
nvcc -arch=sm_35 -rdc=true -lcublas -lcublas_device -lcudadevrt -o test Source.cu
You may get some warnings like this:
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_ssyrk.o'
Those can be safely ignored.
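If you prefer the separate compile-and-link flow you were attempting, an equivalent sketch is (the CUDA install path is an assumption; adjust for your system):

nvcc -arch=sm_35 -rdc=true -dc Source.cu -o Source.o
nvcc -arch=sm_35 -dlink Source.o -lcublas_device -lcudadevrt -o link.o
g++ Source.o link.o -L/usr/local/cuda/lib64 -lcublas -lcublas_device -lcudadevrt -lcudart -o test

The missing piece in your original two commands was the device-link step (-dlink); skipping it is what produces the undefined __cudaRegisterLinkedBinary... reference at link time.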