I had asked doubt error: calling a __host__ function from a __global__ function is not allowed and i got the ans . accordingly i have modified my code bt i am unable to access d_point[i]. how can i access that.
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width,int height, int min_distance,int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i <= size)
{
float2 point = (d_points)[i];
int x = floorf(point.x);
int y = floorf(point.y);
printf(" ( %d %d )",x,y);
if(x < d_x_max && y < d_y_max)
{
x /= min_distance;
y /= min_distance;
(d_counters)[y*width+x]++;
__syncthreads();
}
}
}
void DenseSample(const Mat& grey, std::vector<Point2f>& points, const double quality, const int min_distance)
{
int width = grey.cols/min_distance;
int height = grey.rows/min_distance;
Mat eig;
cornerMinEigenVal(grey, eig, 3, 3);
double maxVal = 0;
minMaxLoc(eig, 0, &maxVal);
const double threshold = maxVal*quality;
std::vector<int> counters(width*height);
int x_max = min_distance*width;
int y_max = min_distance*height;
printf("in descriptor size:%ld ",points.size());
int *d_counters;
float2 *d_points;
cudaMalloc(&d_counters,counters.size()*width*height*sizeof(int));
printf("in cuda point size:%d ",points.size());
cudaMalloc(&d_points,points.size()*sizeof(float2));
cout<<"points.size() : "<<points.size()<<endl;
cudaMemcpy(d_points, &points, points.size()*sizeof(float2), cudaMemcpyHostToDevice);
int blk=cvFloor(points.size()/1024)+1;
cout<<"blk : "<<blk<<endl;
if(points.size()>0)
{
densefun<<<blk,1024>>>(d_counters,d_points,x_max,y_max,width,height,min_distance, points.size());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy(&counters, d_counters, counters.size()* width*height*sizeof(int), cudaMemcpyDeviceToHost);
}
cudaFree(d_counters);
cudaFree(d_points);
points.clear();
int index = 0;
int offset = min_distance/2;
for(int i = 0; i < height; i++)
for(int j = 0; j < width; j++, index++)
{
if(counters[index] <= 0)
{
int x = j*min_distance+offset;
int y = i*min_distance+offset;
if(eig.at<float>(y, x) > threshold)
points.push_back(Point2f(float(x), float(y)));
}
}
}
output is:
in descriptor size:1605 in cuda point size:1605 points.size() : 1605
blk : 2
Error: an illegal memory access was encountered
in descriptor size:918 in cuda point size:918 points.size() : 918
blk : 1
Error: an illegal memory access was encountered
You create a thread gird with block length 1024 and grid length equal to
int blk=cvFloor(points.size()/1024)+1;
Which basically means that the number of threads will be multiple of 1024 greater than points.size(). In this case using:
int i = blockDim.x * blockIdx.x + threadIdx.x;
float2 point = (d_points)[i];
cannot be successful, because you can be almost certain that you will get out of bounds memory access. Add some conditional to ensure that it won't happen.
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width, int height, int min_distance)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < width * height)
{
//rest of the code
}
}
Also, you don't allocate enugh memory for d_points:
float2 *d_points;
cudaMalloc(&d_points,points.size()*sizeof(float));
If you want to allocate array of float2 (or copy to it) you need to use sizeof(float2).
Related
Disclaimer: Im a cuda beginner.
typedef struct
{
int row_;
int col_;
float* element_;
int step;
}Matrix_t;
#define BLOCK_SIZE 64
__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);
the Matrix_t are used to link a cv::cuda::GpuMat to the C interface via the ptr() operator to get the GPU pointer to element.
__device__ float getElement(const Matrix_t A, int row, int col)
{
return A.element_[row* A.step + col];
}
__device__ void setElement(Matrix_t A, int row, int col, float value)
{
A.element_[row*A.step + col] = value;
}
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
{
Matrix_t A_sub;
A_sub.row_ = BLOCK_SIZE;
A_sub.col_ = BLOCK_SIZE;
A_sub.step = A.step;
A_sub.element_ = &A.element_[A.step * BLOCK_SIZE * row + BLOCK_SIZE * col];
return A_sub;
}
Here is the kernel:
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
float SubDotValue = 0.0f;
int row = threadIdx.y;
int col = threadIdx.x;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, blockRow, m);
Matrix_t B_sub = getSubMat(B, blockRow, m);
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[row] = SubDotValue;
}
and the wrapper:
float Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
{
float retval = 0;
float* retval_partial;
float* retval_device;
Matrix_t A;
A.col_ = Number_of_cols;
A.row_ = Number_of_rows;
A.element_ = M_dev_1;
A.step = step;
Matrix_t B;
B.col_ = Number_of_cols;
B.row_ = Number_of_rows;
B.element_ = M_dev_2;
B.step = step;
retval_partial = (float*)malloc( B.row_*sizeof(float) );
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
printf("\n Cuda malloc: %s", cudaGetErrorString(err));
std::cout<<std::flush;
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);
MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);
err = cudaThreadSynchronize();
std::cout<<std::flush;
printf("\n Cuda kernel run: %s", cudaGetErrorString(err));
err = cudaMemcpy(retval_partial, retval_device, B.row_ / BLOCK_SIZE* sizeof(float), cudaMemcpyDeviceToHost);
printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
err = cudaFree(retval_device);
printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));
for(int i = 0; i<B.row_/ BLOCK_SIZE ; ++i)
{
retval+=retval_partial[i];
}
free(retval_partial);
return retval;
}
and the main:
int main(int argc, const char * argv[])
{
cv::cuda::DeviceInfo devInfo;
cv::cuda::setDevice(devInfo.deviceID());
cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);
double tr = (double) cv::getTickCount();
double res = cudatestA.dot(cudatestB);
tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();
cv::cuda::GpuMat ctA(cudatestA);
cv::cuda::GpuMat ctB(cudatestB);
double tm_ = (double) cv::getTickCount();
float res_m = 0;
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();
printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
return 0;
}
I'm currently stuck on various points:
1) it always output 0.
2) it can only work for matrix M*N Multiple of the defined BLOCK_SIZE (64).
for 1) I can't figure where my logic break, I could get the dot product to work on vector without any troubles but the matrix problem induced by the stride between each row prevent me to use the code (code deleted as the site tell me that there is too much code).
Partial answer:
In your kernel you aren't doing the good sum, nor taking the good elements, and your dim seems inverted
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;
float SubDotValue = 0.0f;
//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;
}
And your wrapper isn't also allocating the size you need:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
should be:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );
Note that other allocation related have to change too (Lazy me).
And your call in main need to divide the GpuMat step by the size of one element of the GpuMat
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemsize1());
You might also want to change your Matrix_t structure to use const float* instead of float to be able to use:
GpuMat_.ptr<float>();
instead of:
(float*)GpuMat.ptr();
Note that for a matrix of N rows you are starting N^2 threads doing the same thing. I don't have enough knowledge on Cuda to fix that.
I'm confused whether an image is stored in row-major or column-major order in global memory of the device.
I'am getting two different outputs of an image while accessing the image in both the orders.
When accessing in row-major order-
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int m = numCols * y + x;
if (x >= numCols || y >= numRows)
return;
//marking column boundaries
if (x <= 2){
d_Image[m].x = 255;
d_Image[m].y = 0;
d_Image[m].z = 0;
}
else if (x >= numCols-2){
d_Image[m].x = 0;
d_Image[m].y = 0;
d_Image[m].z = 255;
}
else{
d_Image[m].x = d_sample[m].x;
d_Image[m].y = d_sample[m].y;
d_Image[m].z = d_sample[m].z;
}
d_Image[m].w = d_sample[m].w;
output using row-major
when accessing in column-major order-
int m = x * numRows + y;
output using col-major
Dimensions-
const dim3 blockSize(16,16);
const dim3 gridSize(numCols/16+1, numRows/16+1, 1);
blur << < gridSize, blockSize >> >(d_Image, d_sample, numRows, numCols);
I'm loading and saving the image using opencv.
In the first output red and blue dots are scattered all over the image. And in the second output(col-major) the boundary rows are marked while i'm trying to mark the columns. I'm too much confused.
Edit
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols);
cv::Mat sample;
cv::Mat Image;
size_t numRows() { return sample.rows; }
size_t numCols() { return sample.cols; }
__global__ void blur(const uchar4 *d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int m = y*numCols + x;
if (x >= numCols || y >= numRows)
return;
if (x <= 2){
d_Image[m].x = 255;
d_Image[m].y = 0;
d_Image[m].z = 0;
}
else if (x >= (numCols-2)){
d_Image[m].x = 0;
d_Image[m].y = 0;
d_Image[m].z = 255;
}
else{
d_Image[m].x = d_sample[m].x;
d_Image[m].y = d_sample[m].y;
d_Image[m].z = d_sample[m].z;
}
d_Image[m].w = d_sample[m].w;
}
int main(){
uchar4 *h_sample, *d_sample, *d_Image, *h_Image;
int filter[9];
sample = cv::imread("sample.jpg", CV_LOAD_IMAGE_COLOR);
if (sample.empty()){
std::cout << "error in loading image.";
system("pause");
}
cv::cvtColor(sample,sample,CV_BGR2RGBA);
Image.create(numRows(), numCols(), CV_8UC4);
if (!sample.isContinuous() || !Image.isContinuous()) {
std::cerr << "Images aren't continuous!! Exiting." << std::endl;
system("pause");
exit(1);
}
cv::cvtColor(Image,Image,CV_BGR2RGBA);
h_sample = (uchar4*)sample.data;
h_Image = (uchar4*)Image.data;
size_t numPixels = numRows() * numCols();
//allocate mmeory on device
checkCudaErrors(cudaMalloc((void**)&d_sample, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMalloc((void**)&d_Image, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_sample, 0, sizeof(uchar4) * numPixels));
checkCudaErrors(cudaMemset(d_Image, 0, sizeof(uchar4) * numPixels));
//copy to device
checkCudaErrors(cudaMemcpy(d_sample, h_sample, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
helper(d_sample, d_Image, numCols(), numRows());
//copy back to host
checkCudaErrors(cudaMemcpy(h_Image, d_Image, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
cv::cvtColor(Image,Image,CV_RGBA2BGR);
cv::namedWindow("Image", CV_WINDOW_AUTOSIZE);
cv::imshow("Image", Image);
cv::waitKey(0);
cv::imwrite("sample.jpg", Image);
return 0;
}
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
const dim3 blockSize(16,16);
const dim3 gridSize(numCols/16+1, numRows/16+1, 1);
blur << < gridSize, blockSize >> >(d_sample, d_Image, numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
void helper(uchar4* d_sample, uchar4* d_Image, size_t numRows, size_t numCols){
and you call
helper(d_sample, d_Image, numCols(), numRows());
I think you may have switched cols and rows when you call helper...
Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Closed 8 years ago.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
We don’t allow questions seeking recommendations for books, tools, software libraries, and more. You can edit the question so it can be answered with facts and citations.
Improve this question
I have seen a similar question here
However,I could not get an exact answer here, and it is written in 2012.
I am trying to call cublasStatus_t cublasSgbmv(...) function, which is defined in "cublas_v2.h", in a __global__ function. However, I could not use the dynamic parallelism feature. I only have 1 source.cu file. However, I have read that I should compile it in a dynamic way so that it separates device and host functions, then I can link these outputs.
Is there anyone who knows how to do it, or a good source to explain it?
Thanks in advance
edit : if undervoted, please explain the reason at least for me to learn my mistake?
edit2 :
my specific problem is, I'm using the following code in my Source.cu :
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cstdio>
#include <stdio.h>
#include <device_launch_parameters.h>
#include <stdlib.h> //srand(), rand()
#include <time.h>
#include <builtin_types.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define IDX2C(i ,j , ld ) ((( j )*( ld ))+( i ))
#define HEIGHT 4
#define WIDTH 4
#define V 4
#define KL 2
#define KU 1
#define THREADS_PER_BLOCK 512
#pragma comment(lib, "cublas")
//#pragma comment(lib, "helper_cuda")
using namespace std;
void create_Matrix(int* matrix, int width, int height){
int i, len;
len = height * width;
srand(time(NULL));
for (i = 0; i < len; i++){
matrix[i] = rand() % 10 + 1; //generates number between 1-10
}
}
template <typename T>
void print_vector(T* vector, int len){
for (int i = 0; i < len; i++)
cout << vector[i] << " ";
cout << endl;
}
template <typename T>
void creating_bandedMatrix(T* bandedMatrix, int height, int width, int ku, int kl){
//fill matrix with zeros at the beginning
int i, len;
len = height * width;
for (i = 0; i < len; i++){
bandedMatrix[i] = 0; //generates number between 1-10
}
srand(time(NULL));
//filling banded diagonal
int start, end;
for (int i = 0; i < height; i++){
start = i - kl;
if (start < 0)
start = 0;
end = i + ku + 1;
if (end > width)
end = width;
for (int j = start; j < end; j++){
*(bandedMatrix + (i*width) + j) = (float)(rand() % (10) + 1); //rand() / (T)RAND_MAX;;
}
}
}
template <typename T>
void print_matrix(T* matrix, int width, int height){
int len = width*height;
cout << "asdsffffff" << endl;
for (int i = 0; i < len; i++){
if (!(i%width))
cout << endl;
cout << i << ":" <<matrix[i] << " ";
}
cout << endl;
}
template <typename T>
void computeMatrixVectorMultiplication(T* bandedMatrix, T* vector2){
T row_sum = 0;
T* bandedHostResult = (T*)malloc(WIDTH * sizeof(T));
for (int i = 0; i < HEIGHT; i++){
row_sum = 0;
for (int j = 0; j < WIDTH; j++){
row_sum += (*(bandedMatrix + i*WIDTH + j)) * vector2[j];
}
bandedHostResult[i] = row_sum;
}
//priting the result
cout << "\n\nBanded Host Result...\n";
print_vector(bandedHostResult, WIDTH);
}
template <typename T>
void fillLapackMatrix(T* lapack_matrix, T* bandedMatrix, int kl, int ku, int banded_w, int banded_h, int lapack_w, int lapack_h){
int i, j, lapack_i;
int len = lapack_h * lapack_w;
for (i = 0; i < len; i++){
lapack_matrix[i] = 0; //generates number between 1-10
}
for (i = 0; i < banded_w; i++){
for (j = 0; j < banded_h; j++){
lapack_i = ku + i - j;
*(lapack_matrix + lapack_i*lapack_w + j) = *(bandedMatrix + i*banded_w + j);
//lapack_matrix[lapack_i*lapack_w + j] = bandedMatrix[i*bandedMatrix + j];
}
}
}
__global__ void device_cublasSgbmv(int m,int n,int kl, int ku,float* alpha, float* A, int lda ,float* B,int ldb,float*R, int ldr, float* beta){
int index = blockIdx.x * blockDim.x + threadIdx.x;
cublasHandle_t handle;
cublasCreate(&handle);
cublasOperation_t trans = CUBLAS_OP_N;
float* dev_x;
cudaMalloc((void**)&dev_x,sizeof(float) * n);
if(index < ldr){
cublasSgbmv(handle, trans,m, n, kl, ku, alpha, A, m, B+index*n, 1, beta, R+index*n, 1);
index = 0;
}
}
void fillNormalMatrix(float* B,int h,int w){
for(int i = 0; i < h;i++){
for(int j = 0; j < w;j++){
B[i*w + j] = 1;
}
}
}
int main()
{
cublasStatus_t status;
float *A;
float *x, *y;
float *dev_x, *dev_y;
int incx, incy;
float *dev_A = 0;
float alpha = 1.0f;
float beta = 0.0f;
int matrixSize = WIDTH * HEIGHT;
int i, j;
cublasHandle_t handle;
/* Initialize CUBLAS */
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
//Allocate host memory for the matrices
A = (float *)malloc(matrixSize* sizeof(float));
//Allocate memory for host vectors
x = (float *)malloc(WIDTH * sizeof(float));
y = (float*)malloc(WIDTH * sizeof(float));
// Fill the matrices with test data
creating_bandedMatrix(A, WIDTH, HEIGHT, KU, KL);
cout << "Banded Matrix\n";
print_matrix(A, WIDTH, HEIGHT);
//Fill the vectors with random data
for (i = 0; i < WIDTH; i++){
x[i] = 1;// (float)(rand() % (10) + 1);:
y[i] = (float)(rand() % (10) + 1);
}
cout << "\nvector x...\n";
print_vector(x, WIDTH);
//cout << "\nvector y...\n";
//print_vector(y, WIDTH);
//Allocate device memory for the matrix
if (cudaMalloc((void **)&dev_A, matrixSize * sizeof(float)) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
//Allocate device memory for vectors
if (cudaMalloc((void**)&dev_x, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&dev_y, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
// Initialize the device vectors with the host vectors
status = cublasSetVector(WIDTH, sizeof(float), x, 1, dev_x, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write x vector)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(WIDTH, sizeof(float), y, 1, dev_y, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write y vector)\n");
return EXIT_FAILURE;
}
//initialize matrix with lapack format
int lapack_width = WIDTH > HEIGHT ? HEIGHT : WIDTH;
int lapack_height = KL + KU + 1;
int lapackSize = lapack_height * lapack_width;
float* lapack_matrix = (float*)malloc(lapackSize * sizeof(float));
fillLapackMatrix(lapack_matrix, A, KL, KU, WIDTH, HEIGHT, lapack_width, lapack_height);
cout << "\n\nLAPACK MAtrix\n";
print_matrix(lapack_matrix, lapack_width, lapack_height);
//convert to column column matrix
float* col = (float*)malloc(lapackSize * sizeof(float));
for (i = 0; i < WIDTH; i++){
for (j = 0; j < HEIGHT; j++){
col[i + WIDTH*j] = lapack_matrix[WIDTH*i + j];
}
}
cout << "Lapack Column Based Matrix\n";
print_matrix(col,HEIGHT-1,WIDTH);
//status = cublasSetVector(lapackSize, sizeof(float), A, 1, dev_A, 1);
cublasSetMatrix(HEIGHT, WIDTH, sizeof(float), col, HEIGHT, dev_A, HEIGHT);
cublasOperation_t trans = CUBLAS_OP_N;
incy = incx = 1;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////// Banded Matrix Matrix Multipllicatio ///////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
float* B,*dev_B,*dev_R,*R;
B = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
R = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
fillNormalMatrix(B,WIDTH,HEIGHT);
cudaMalloc((void**)&dev_B,matrixSize*sizeof(*B));
cudaMalloc((void**)&dev_R,matrixSize*sizeof(*R));
cublasSetMatrix(HEIGHT, WIDTH, sizeof(*B), B, HEIGHT, dev_B, HEIGHT);
cout << "Matrix B\n";
print_matrix(B,HEIGHT,WIDTH);
cout << "gfsdf\n";
device_cublasSgbmv<<<1,4>>>(HEIGHT, WIDTH, KL, KU, &alpha, dev_A, WIDTH, dev_B, HEIGHT, dev_R, HEIGHT,&beta);
cout << "after\n";
cublasGetMatrix(HEIGHT,WIDTH, sizeof (*R) ,dev_R ,WIDTH,R,WIDTH);
getchar();
return 0;
}
and compile it like :
nvcc -gencode=arch=compute_35,code=sm_35 -lcublas -lcudadevrt -O3 Source.cu -o Source.o -dc
g++ Source.o -lcublas -lcudart
then, I get the following :
In function `__sti____cudaRegisterAll_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a()':
tmpxft_00001f1e_00000000-3_lapack_vector.cudafe1.cpp:(.text.startup+0x575): undefined reference to `__cudaRegisterLinkedBinary_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a'
collect2: error: ld returned 1 exit status
You can compile and link the code you have now shown with a single command like this:
nvcc -arch=sm_35 -rdc=true -lcublas -lcublas_device -lcudadevrt -o test Source.cu
You may get some warnings like this:
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_ssyrk.o'
Those can be safely ignored.
I have written a code on Nsight that compiles and can be executed but the first launch can't be completed.
The strange thing is that when I run it in debug mode, it works perfectly but it is too slow.
Here is the part of the code before entering the function that access the GPU (where i think there is an error I can't find) :
void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
size_t inputBytes = range*128*sizeof(unsigned char);
size_t baseBytes = cardBase*128*sizeof(unsigned char);
size_t outputBytes = range*sizeof(int);
unsigned char * data_d;
unsigned char * descBase_d;
int * cardBase_d;
int * dataReturned_d;
cudaMalloc((void **) &data_d, inputBytes);
cudaMalloc((void **) &descBase_d, baseBytes);
cudaMalloc((void **) &cardBase_d, sizeof(int));
cudaMalloc((void **) &dataReturned_d, outputBytes);
int blockSize = 196;
int nBlocks = range/blockSize + (range%blockSize == 0?0:1);
cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice);
cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice);
FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);
cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost);
cudaFree(data_d);
cudaFree(descBase_d);
cudaFree(cardBase_d);
cudaFree(dataReturned_d);
}
And the function entering the GPU (I don't think the error is here) :
__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned char descriptor1[128], descriptor2[128];
int part = 0;
int result = 0;
int winner = 0;
int minDistance = 0;
int itelimit = *cardBase;
for (int k = 0; k < 128; k++)
{
descriptor1[k] = data[idx*128+k];
}
// initialize minDistance
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[k];
}
for (int k = 0; k < 128; k++)
{
part = (descriptor1[k]-descriptor2[k]);
part *= part;
minDistance += part;
}
// test all descriptors in the base :
for (int i = 1; i < itelimit; i++)
{
result = 0;
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[i*128+k];
// Calculate squared l2 distance :
part = (descriptor1[k]-descriptor2[k]);
part *= part;
result += part;
}
// Compare to minDistance
if (result < minDistance)
{
minDistance = result;
winner = i;
}
}
// Write the result in dataReturned
dataReturned[idx] = winner;
}
Thank you in advance if you can help me.
EDIT : the last cudaMemcpy returns the error "the launch timed out and was terminated".
linux has a watchdog mechanism. If your kernel runs for a long time (you say it is slow in debug mode) you can hit the linux watchdog, and receive the "launch timed out and was terminated" error.
In this case you have several things you might try. The options are covered here.
The following is a CUDA programming example which is basically C but with NVidia CUDA functions within. I've been trying to interpret this code example and figure out what it is trying to do. My question is this the program compiles just fine, but what arguments does it take? For example this CUDA program is being run in a linux emulator however upon running ./program it returns:
Usage: ./program number
Segmentation fault
What are the programs input arguments. Thank you.
#include <assert.h>
#include <stdio.h>
//#define N 100000
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
for (int i = 0; i < length; ++i)
y[i] = alpha*x[i] + y[i];
}
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
int i;
i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < length) y[i] = alpha*x[i]+y[i];
__syncthreads();
}
int main(int argc, char* argv[]) {
if (argc != 2) {
printf("Usage: %s number\n", argv[0]);
return -1;
}
int N = atoi(argv[1]);
// host data
float alpha = 0.5;
float x[N], xback[N];
float y[N], yback[N];
int size;
int i;
int blocks;
// determining size
size = sizeof(float)*N;
// device data
float * dxp, * dyp;
// fill host data
for (i = 0; i < N; i++) {
x[i] = (float) (rand () % 128);
y[i] = (float) (rand () % 256);
}
// Allocating and Moving data to device
cudaMalloc((void**) &dxp, size);
cudaMalloc((void**) &dyp, size);
cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
// size of thread blocks
blocks = (N + 31)/32;
saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
// bring back data
cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);
// Calculating host SAXPY
saxpy_host (N, alpha, (float *) &x, (float *) &y);
// checking computation on host matches computation on GPU
for (i = 0; i < N; i++) {
assert (yback[i] == y[i]) ;
//printf ("%i %f %f \n", i, yback[i], y[i]);
}
// free device data
cudaFree(dxp); cudaFree(dyp);
return 0;
}
int N = atoi(argv[1]);
The program takes a single integer as a command line argument. (Try calling it as ./program 5, for example.)
It then calculates a SAXPY (An old term originating from early BLAS implementations, but it stuck. It means "single (precision, aka float) real alpha x plus y".) with vectors of dimension N.