I have seen a similar question here.
However, I could not find an exact answer there, and it was written in 2012.
I am trying to call the cublasStatus_t cublasSgbmv(...) function, which is declared in "cublas_v2.h", from a __global__ function, but I could not get the dynamic parallelism feature to work. I only have one Source.cu file. I have read that I should compile with separate (relocatable device code) compilation so that device and host code are produced as separate outputs, which I can then link together.
Does anyone know how to do this, or a good source that explains it?
Thanks in advance
Edit: if you downvote, please at least explain the reason so I can learn from my mistake.
Edit 2:
My specific problem is the following. I'm using this code in my Source.cu:
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cstdio>
#include <stdio.h>
#include <device_launch_parameters.h>
#include <stdlib.h> //srand(), rand()
#include <time.h>
#include <builtin_types.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define IDX2C(i ,j , ld ) ((( j )*( ld ))+( i ))
#define HEIGHT 4
#define WIDTH 4
#define V 4
#define KL 2
#define KU 1
#define THREADS_PER_BLOCK 512
#pragma comment(lib, "cublas")
//#pragma comment(lib, "helper_cuda")
using namespace std;
void create_Matrix(int* matrix, int width, int height){
int i, len;
len = height * width;
srand(time(NULL));
for (i = 0; i < len; i++){
matrix[i] = rand() % 10 + 1; //generates number between 1-10
}
}
template <typename T>
void print_vector(T* vector, int len){
for (int i = 0; i < len; i++)
cout << vector[i] << " ";
cout << endl;
}
template <typename T>
void creating_bandedMatrix(T* bandedMatrix, int height, int width, int ku, int kl){
//fill matrix with zeros at the beginning
int i, len;
len = height * width;
for (i = 0; i < len; i++){
bandedMatrix[i] = 0; // initialize the matrix with zeros
}
srand(time(NULL));
//filling banded diagonal
int start, end;
for (int i = 0; i < height; i++){
start = i - kl;
if (start < 0)
start = 0;
end = i + ku + 1;
if (end > width)
end = width;
for (int j = start; j < end; j++){
*(bandedMatrix + (i*width) + j) = (float)(rand() % (10) + 1); //rand() / (T)RAND_MAX;;
}
}
}
template <typename T>
void print_matrix(T* matrix, int width, int height){
int len = width*height;
cout << "asdsffffff" << endl;
for (int i = 0; i < len; i++){
if (!(i%width))
cout << endl;
cout << i << ":" <<matrix[i] << " ";
}
cout << endl;
}
template <typename T>
void computeMatrixVectorMultiplication(T* bandedMatrix, T* vector2){
T row_sum = 0;
T* bandedHostResult = (T*)malloc(WIDTH * sizeof(T));
for (int i = 0; i < HEIGHT; i++){
row_sum = 0;
for (int j = 0; j < WIDTH; j++){
row_sum += (*(bandedMatrix + i*WIDTH + j)) * vector2[j];
}
bandedHostResult[i] = row_sum;
}
//priting the result
cout << "\n\nBanded Host Result...\n";
print_vector(bandedHostResult, WIDTH);
}
template <typename T>
void fillLapackMatrix(T* lapack_matrix, T* bandedMatrix, int kl, int ku, int banded_w, int banded_h, int lapack_w, int lapack_h){
int i, j, lapack_i;
int len = lapack_h * lapack_w;
for (i = 0; i < len; i++){
lapack_matrix[i] = 0; // initialize the matrix with zeros
}
for (i = 0; i < banded_w; i++){
for (j = 0; j < banded_h; j++){
lapack_i = ku + i - j;
*(lapack_matrix + lapack_i*lapack_w + j) = *(bandedMatrix + i*banded_w + j);
//lapack_matrix[lapack_i*lapack_w + j] = bandedMatrix[i*bandedMatrix + j];
}
}
}
__global__ void device_cublasSgbmv(int m,int n,int kl, int ku,float* alpha, float* A, int lda ,float* B,int ldb,float*R, int ldr, float* beta){
int index = blockIdx.x * blockDim.x + threadIdx.x;
cublasHandle_t handle;
cublasCreate(&handle);
cublasOperation_t trans = CUBLAS_OP_N;
float* dev_x;
cudaMalloc((void**)&dev_x,sizeof(float) * n);
if(index < ldr){
cublasSgbmv(handle, trans,m, n, kl, ku, alpha, A, m, B+index*n, 1, beta, R+index*n, 1);
index = 0;
}
}
void fillNormalMatrix(float* B,int h,int w){
for(int i = 0; i < h;i++){
for(int j = 0; j < w;j++){
B[i*w + j] = 1;
}
}
}
int main()
{
cublasStatus_t status;
float *A;
float *x, *y;
float *dev_x, *dev_y;
int incx, incy;
float *dev_A = 0;
float alpha = 1.0f;
float beta = 0.0f;
int matrixSize = WIDTH * HEIGHT;
int i, j;
cublasHandle_t handle;
/* Initialize CUBLAS */
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
//Allocate host memory for the matrices
A = (float *)malloc(matrixSize* sizeof(float));
//Allocate memory for host vectors
x = (float *)malloc(WIDTH * sizeof(float));
y = (float*)malloc(WIDTH * sizeof(float));
// Fill the matrices with test data
creating_bandedMatrix(A, WIDTH, HEIGHT, KU, KL);
cout << "Banded Matrix\n";
print_matrix(A, WIDTH, HEIGHT);
//Fill the vectors with random data
for (i = 0; i < WIDTH; i++){
x[i] = 1;// (float)(rand() % (10) + 1);:
y[i] = (float)(rand() % (10) + 1);
}
cout << "\nvector x...\n";
print_vector(x, WIDTH);
//cout << "\nvector y...\n";
//print_vector(y, WIDTH);
//Allocate device memory for the matrix
if (cudaMalloc((void **)&dev_A, matrixSize * sizeof(float)) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
//Allocate device memory for vectors
if (cudaMalloc((void**)&dev_x, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&dev_y, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
// Initialize the device vectors with the host vectors
status = cublasSetVector(WIDTH, sizeof(float), x, 1, dev_x, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write x vector)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(WIDTH, sizeof(float), y, 1, dev_y, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write y vector)\n");
return EXIT_FAILURE;
}
//initialize matrix with lapack format
int lapack_width = WIDTH > HEIGHT ? HEIGHT : WIDTH;
int lapack_height = KL + KU + 1;
int lapackSize = lapack_height * lapack_width;
float* lapack_matrix = (float*)malloc(lapackSize * sizeof(float));
fillLapackMatrix(lapack_matrix, A, KL, KU, WIDTH, HEIGHT, lapack_width, lapack_height);
cout << "\n\nLAPACK MAtrix\n";
print_matrix(lapack_matrix, lapack_width, lapack_height);
//convert to column column matrix
float* col = (float*)malloc(lapackSize * sizeof(float));
for (i = 0; i < WIDTH; i++){
for (j = 0; j < HEIGHT; j++){
col[i + WIDTH*j] = lapack_matrix[WIDTH*i + j];
}
}
cout << "Lapack Column Based Matrix\n";
print_matrix(col,HEIGHT-1,WIDTH);
//status = cublasSetVector(lapackSize, sizeof(float), A, 1, dev_A, 1);
cublasSetMatrix(HEIGHT, WIDTH, sizeof(float), col, HEIGHT, dev_A, HEIGHT);
cublasOperation_t trans = CUBLAS_OP_N;
incy = incx = 1;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////// Banded Matrix Matrix Multiplication ///////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
float* B,*dev_B,*dev_R,*R;
B = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
R = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
fillNormalMatrix(B,WIDTH,HEIGHT);
cudaMalloc((void**)&dev_B,matrixSize*sizeof(*B));
cudaMalloc((void**)&dev_R,matrixSize*sizeof(*R));
cublasSetMatrix(HEIGHT, WIDTH, sizeof(*B), B, HEIGHT, dev_B, HEIGHT);
cout << "Matrix B\n";
print_matrix(B,HEIGHT,WIDTH);
cout << "gfsdf\n";
device_cublasSgbmv<<<1,4>>>(HEIGHT, WIDTH, KL, KU, &alpha, dev_A, WIDTH, dev_B, HEIGHT, dev_R, HEIGHT,&beta);
cout << "after\n";
cublasGetMatrix(HEIGHT,WIDTH, sizeof (*R) ,dev_R ,WIDTH,R,WIDTH);
getchar();
return 0;
}
and compile it like this:
nvcc -gencode=arch=compute_35,code=sm_35 -lcublas -lcudadevrt -O3 Source.cu -o Source.o -dc
g++ Source.o -lcublas -lcudart
then I get the following:
In function `__sti____cudaRegisterAll_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a()':
tmpxft_00001f1e_00000000-3_lapack_vector.cudafe1.cpp:(.text.startup+0x575): undefined reference to `__cudaRegisterLinkedBinary_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a'
collect2: error: ld returned 1 exit status
You can compile and link the code you have now shown with a single command like this:
nvcc -arch=sm_35 -rdc=true -lcublas -lcublas_device -lcudadevrt -o test Source.cu
You may get some warnings like this:
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_ssyrk.o'
Those can be safely ignored.
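If you would rather keep separate compile and link steps, as in your original two commands, the device code also has to be linked by nvcc before the final host link. A minimal sketch of that workflow (the library path is an assumption for a default Linux CUDA install; adjust it as needed):
nvcc -arch=sm_35 -dc Source.cu -o Source.o
nvcc -arch=sm_35 -dlink Source.o -lcublas_device -lcudadevrt -o Source_dlink.o
g++ Source.o Source_dlink.o -o test -L/usr/local/cuda/lib64 -lcublas -lcublas_device -lcudadevrt -lcudart
The -dlink step links the relocatable device code (including the device-side cuBLAS library) into an extra object, and that object is then passed to the host linker together with the original one.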
Related
I am trying to extract patches from an image in parallel, with a pixel shift/overlap. I have written the CPU version of the code, but I have not been able to convert the for loop that increments by the pixel shift. I have included the part of the code where that loop is used; the CreatePatchDataSet function contains the for loop with the pixel-shift increment. Please help me convert this function to CUDA. I have provided the code below.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>
using namespace std;
using namespace cv;
#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)
void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
for (int i = 0; i < input.rows; i++)
{
double *src = input.ptr<double>(i);
for (int j = 0; j < input.cols; j++)
{
output[input.cols * input.channels() * i + input.channels() * j + 0] = src[j];
}
}
}
void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
*num_of_patch_col = 0;
int len_nb = 0;
while (len_nb < width) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_col)++;
}
len_nb = 0;
*num_of_patch_row = 0;
while (len_nb < height) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_row)++;
}
*num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}
void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int counter_row = 0;
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
for (int i = 0; i < height; i += pixel_shift) {
int counter_col = 0;
for (int j = 0; j < width; j += pixel_shift) {
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0,
cv::INTER_LANCZOS4);
double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
free(orgimageH);
free(FY);
return 0;
}
The results I got for the first 10 values in the CPU version:
patch numbers:
16129
238,240,240,235,237,230,227,229,228,227
I have tried to convert this function to a CUDA kernel, but it goes into an infinite loop. As I am very new to CUDA, could you please help me find the problem in the code?
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
int i = threadIdx.x + (blockDim.x*blockIdx.x);
int j = threadIdx.y + (blockDim.y*blockIdx.y);
while (i<height && j< width)
{
int counter_row = 0;
int counter_col = 0;
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
//
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
double *d_orgimageH;
gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
double *d_FY;
gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH * num_of_patch_image));
gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH , sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
// cout<<"Hello world";
free(orgimageH);
free(FY);
cudaFree(d_FY);
cudaFree(d_orgimageH);
return 0;
}
Image I have used: https://i.stack.imgur.com/Ywg7p.png
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
is outside the while loop in your kernel. Since i and j never change inside the while loop, it never stops. There could be more problems here, but this is the most prominent one.
EDIT: Another problem I found is that you have only one while loop over both i and j instead of one loop for each. You should probably use for loops like in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
i < height;
i += pixel_shift * blockDim.x * gridDim.x) {
for (j = ...; j < ...; j += ...) {
/* ... */
}
}
EDIT 2:
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
counter_row < num_of_patch_row;
counter_row += blockDim.y * gridDim.y) {
i = counter_row * pixel_shift;
if (i > height)
break;
for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
counter_col < num_of_patch_col;
counter_col += blockDim.x * gridDim.x) {
j = counter_col * pixel_shift;
if (j > width)
break;
/* ... */
}
}
I have also swapped the x/y fields of the execution configuration between the inner and the outer loop, as that seemed more appropriate given that the x field is contiguous within a warp (memory access benefits).
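For completeness, a minimal sketch of the whole kernel written with this structure; the patch-copying body is taken from your original code, and the sketch is untested:
__global__ void CreatePatchDataSet(const double *original_data, double *patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row)
{
    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    // one (counter_row, counter_col) pair per thread, covered by grid-stride loops
    for (int counter_row = threadIdx.y + blockDim.y * blockIdx.y;
         counter_row < num_of_patch_row;
         counter_row += blockDim.y * gridDim.y) {
        int i = counter_row * pixel_shift;              // top-left row of this patch
        for (int counter_col = threadIdx.x + blockDim.x * blockIdx.x;
             counter_col < num_of_patch_col;
             counter_col += blockDim.x * gridDim.x) {
            int j = counter_col * pixel_shift;          // top-left column of this patch
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    int out = num_of_patch_image * (patch_size * ii + jj)
                            + num_of_patch_col * counter_row + counter_col;
                    if ((i + ii) < height && (j + jj) < width)
                        patch_data[out] = original_data[width * (i + ii) + (j + jj)];
                    else
                        patch_data[out] = 0.;           // zero-pad patches at the image border
                }
            }
        }
    }
}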
I'm currently struggling to work properly with 2D arrays in my CUDA kernel. 1D was fine, but so far I have had no luck moving on to 2D. Here are my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
Below is my example to test it. It prints the first 10 values of C correctly, but all the others remain 0. I believe the problem is within the kernel, where it can't find the correct values because of the pitch, but I'm not sure how to solve it correctly.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes:
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, but we will keep this example simple.)
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version:
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
I had previously asked about the error "calling a __host__ function from a __global__ function is not allowed" and got an answer. Accordingly, I have modified my code, but I am unable to access d_points[i]. How can I access it?
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width,int height, int min_distance,int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i <= size)
{
float2 point = (d_points)[i];
int x = floorf(point.x);
int y = floorf(point.y);
printf(" ( %d %d )",x,y);
if(x < d_x_max && y < d_y_max)
{
x /= min_distance;
y /= min_distance;
(d_counters)[y*width+x]++;
__syncthreads();
}
}
}
void DenseSample(const Mat& grey, std::vector<Point2f>& points, const double quality, const int min_distance)
{
int width = grey.cols/min_distance;
int height = grey.rows/min_distance;
Mat eig;
cornerMinEigenVal(grey, eig, 3, 3);
double maxVal = 0;
minMaxLoc(eig, 0, &maxVal);
const double threshold = maxVal*quality;
std::vector<int> counters(width*height);
int x_max = min_distance*width;
int y_max = min_distance*height;
printf("in descriptor size:%ld ",points.size());
int *d_counters;
float2 *d_points;
cudaMalloc(&d_counters,counters.size()*width*height*sizeof(int));
printf("in cuda point size:%d ",points.size());
cudaMalloc(&d_points,points.size()*sizeof(float2));
cout<<"points.size() : "<<points.size()<<endl;
cudaMemcpy(d_points, &points, points.size()*sizeof(float2), cudaMemcpyHostToDevice);
int blk=cvFloor(points.size()/1024)+1;
cout<<"blk : "<<blk<<endl;
if(points.size()>0)
{
densefun<<<blk,1024>>>(d_counters,d_points,x_max,y_max,width,height,min_distance, points.size());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy(&counters, d_counters, counters.size()* width*height*sizeof(int), cudaMemcpyDeviceToHost);
}
cudaFree(d_counters);
cudaFree(d_points);
points.clear();
int index = 0;
int offset = min_distance/2;
for(int i = 0; i < height; i++)
for(int j = 0; j < width; j++, index++)
{
if(counters[index] <= 0)
{
int x = j*min_distance+offset;
int y = i*min_distance+offset;
if(eig.at<float>(y, x) > threshold)
points.push_back(Point2f(float(x), float(y)));
}
}
}
The output is:
in descriptor size:1605 in cuda point size:1605 points.size() : 1605
blk : 2
Error: an illegal memory access was encountered
in descriptor size:918 in cuda point size:918 points.size() : 918
blk : 1
Error: an illegal memory access was encountered
You create a thread grid with a block length of 1024 and a grid length equal to
int blk=cvFloor(points.size()/1024)+1;
This basically means that the number of threads will be a multiple of 1024 that is greater than points.size(). In this case, using:
int i = blockDim.x * blockIdx.x + threadIdx.x;
float2 point = (d_points)[i];
cannot succeed, because you can be almost certain that you will get an out-of-bounds memory access. Add a conditional to ensure that this does not happen:
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width, int height, int min_distance)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < width * height)
{
//rest of the code
}
}
Also, you don't allocate enough memory for d_points:
float2 *d_points;
cudaMalloc(&d_points,points.size()*sizeof(float));
If you want to allocate an array of float2 (or copy to it), you need to use sizeof(float2).
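For example, a minimal sketch of the corrected allocation and copy (untested; note that it also passes points.data() instead of &points, since &points is the address of the std::vector object itself, not of its elements):
float2 *d_points;
// allocate storage for float2 elements, not float
cudaMalloc(&d_points, points.size() * sizeof(float2));
// cv::Point2f is two packed floats, so the vector's storage can be copied as float2
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float2), cudaMemcpyHostToDevice);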
I'm using Qt Creator to code an algorithm that I have already written in MATLAB.
When coding this program, I get two errors. The first one (APPCRASH) appears only when I build and run the program normally, not when I debug it (a Heisenbug), and it occurs in the function 'matriceA'. I tried making the variables volatile and writing the matrix A term formulas in another function, hoping that would stop the compiler optimization (I think compiler optimization might be causing the problem), but I have not been able to solve it. I have not tried compiling the project with -O0, because my professor (it's a university project) has to be able to compile it normally (without special options).
The second one is a SIGSEGV segmentation fault. It happens when the code reaches "DestroyFloatArray(&b, width);" in InpaintingColor.
And here is the code:
clanu_process.cpp (it's a little messy because I've tried a lot of things...)
#include "clanu_process.h"
#include "iomanip"
void InpaintingColor(float **Rout, float **Gout, float **Bout, float **Rin, float **Gin, float **Bin, float **Mask, int width, int height, double param)
{
cout << "1" << endl;
float alphak = 0, bethak = 0, res = 0;
float **b = 0, **xk = 0, **dk = 0, **rk = 0, **Ark = 0, **tmp1 = 0,**tmp2 = 0,**tmp3 = 0;
Ark = AllocateFloatArray( width, height);
tmp1 = AllocateFloatArray( width, height);
tmp2 = AllocateFloatArray( width, height);
tmp3 = AllocateFloatArray( width, height);
xk = AllocateFloatArray( width, height);
dk = AllocateFloatArray( width, height);
rk = AllocateFloatArray( width, height);
b = AllocateFloatArray( width, height);
cout << "2" << endl;
res = 1e8;
matrixProductByScalar(b,1.0/(3.0*256),Rin,width,height);
matrixDuplicate(xk, b, width, height);
// APPCRASH error
matriceA(Ark,xk,Mask,width,height);
//More code
// SIGSEGV error
DestroyFloatArray(&b, width);
DestroyFloatArray(&xk, width);
DestroyFloatArray(&dk, width);
DestroyFloatArray(&rk, width);
DestroyFloatArray(&Ark, width);
DestroyFloatArray(&tmp1, width);
DestroyFloatArray(&tmp2, width);
DestroyFloatArray(&tmp3, width);
}
float** matriceA(float **A, float **I, float **Masque, int N2, int N1){
volatile bool bool_iplus = false, bool_imoins = false, bool_jmoins = false, bool_jplus = false;
volatile int iplus = 0, imoins = 0, jplus = 0, jmoins = 0;
for(int i = 1; i <= N1; i++){
bool_iplus = i<N1;
iplus = i+1 < N1 ? i+1 : N1;
bool_imoins = i>1;
imoins = i-1 > 1 ? i-1 : 1;
for(int j = 1; j <= N2; j++){
bool_jplus = j<N2;
jplus = j+1 < N2 ? j+1 : N2;
bool_jmoins = j>1;
jmoins = j -1 > 1 ? j-1 : 1;
if(Masque[i-1][j-1]!=0){
//cout << "if - " << i << ", " << j<< endl;
A[i-1][j-1] = (1.0/36)*(16*I[i-1][j-1]
+ 4*(
(bool_iplus?I[iplus-1][j-1]:0)
+ (bool_imoins?I[imoins-1][j-1]:0)
+ (bool_jplus?I[i-1][jplus-1]:0)
+ (bool_jmoins?I[i-1][jmoins-1]:0)
)+(
(bool_iplus&&bool_jplus?I[iplus-1][jplus-1]:0)
+ (bool_imoins&&bool_jplus?I[imoins-1][jplus-1]:0)
+ (bool_imoins&&bool_jmoins?I[imoins-1][jmoins-1]:0))
+ (bool_iplus&&bool_jmoins?I[iplus-1][jmoins-1]:0));
}else{
//cout << "else - " << i << ", " << j << endl;
A[i-1][j-1]=
-(1.0*N1*N2)*(
-8.0*I[i-1][j-1]
+ I[iplus-1][j-1]
+ I[imoins-1][j-1]
+ I[i-1][jplus-1]
+ I[i-1][jmoins-1]
+ I[iplus-1][jplus-1]
+ I[imoins-1][jplus-1]
+ I[imoins-1][jmoins-1]
+ I[iplus-1][jmoins-1]);
}
}
}
return A;
}
The functions AllocateFloatArray and DestroyFloatArray
float ** AllocateFloatArray(int width, int height)
{
float ** r = new float*[width];
for(int i=0; i<width; i++)
r[i] = new float[height];
return r;
}
void DestroyFloatArray(float ***a, int width)
{
if( *a == 0 ) return;
for(int i=0; i<width; i++)
delete[] a[0][i];
delete[] *a;
*a = 0;
}
Thank you for your time.
I'm not sure that it's the cause of your problem, but...
Your "matrix operations" functions (sum(), matrixSubstraction(), matrixAddition(), matrixProductByElement(), matrixProductByScalar(), and matrixDuplicate()) range the first index from zero to width and the second one from zero to height.
If I'm not wrong, this is correct and is consistent with allocation/deallocation (AllocateFloatArray() and DestroyFloatArray()).
But look at the two matriceA() functions; they are defined as
float** matriceA(float **A, float **I, int N2, int N1)
float** matriceA(float **A, float **I, float **Masque, int N2, int N1)
In both functions the first index ranges from zero to N1 and the second one from zero to N2; for example
for(int i = 1; i <= N1; i++){
// ...
for(int j = 1; j <= N2; j++){
// ...
A[i-1][j-1] = (1.0/36)*(16*I[i-1][j-1] // ...
Good. But you call matriceA() this way:
matriceA(Ark,rk,Mask,width,height);
Briefly: you allocate your matrices as width * height; your "matrix operations" use them as width * height matrices, but your matriceA() functions use them as height * width.
A wonderful way to devastate the memory.
I suppose the solution could be to
1) switch N1 and N2 in the matriceA() definition, or
2) switch width and height in the matriceA() calls.
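For example, option 2 would only touch the call sites; a sketch of the idea (not tested against the rest of your code):
// N2 becomes height and N1 becomes width, so A[i-1][j-1] stays inside the width * height allocation
matriceA(Ark, xk, Mask, height, width);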
p.s.: sorry for my bad English.
I'm trying to implement a matrix-vector multiplication on the GPU (using CUDA).
In my C++ code (CPU), I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication using CUDA. I'm also using shared memory to improve the performance.
How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?
Below is my C++ function to load the matrix:
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
unsigned int w, h, x, y, num_entries;
float val;
std::ifstream file( filename );
if ( file )
{
file >> h >> w >> num_entries;
cout << w << " " << h << " " << num_entries << "\n";
assert( w == h || w == 1 || h == 1 );
if( dim != NULL ) *dim = std::max( w, h );
matrix = new float[ w * h ];
unsigned int i;
for( i = 0; i < num_entries; i++ ){
if( file.eof() ) break;
file >> y >> x >> val;
if( majority == ROW_MAJOR ){
matrix[ w * y + x ] = val;
} else if( majority == COLUMN_MAJOR ){
matrix[ h * x + y ] = val;
}
}
file.close();
if( i == num_entries )
std::cout << "\nFile read successfully\n";
else
std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";
// print first few elements
if( w == h ){
for( unsigned int i = 0; i < w; i++ ){
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j + w * i ] );
}
}
}
else{
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j ] );
}
}
} else {
std::cout << "Unable to open file\n";
return false;
}
return true;
}
Below is my CUDA Kernel function that handles the matrix-vector multiplication:
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
extern __shared__ float vec[];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.0;
int vOffs = 0;
//load vector into shared memory
for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) {
vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
}
//make sure all threads are synchronized
__syncthreads();
if (idx < dim) {
temp = 0.0;
//dot product (multiplication)
for (int i = 0; i < dim; i++){
temp += A[idx * dim + i] * vec[i];
}
x[idx] = temp;
}
}
What changes do I need to make in my CUDA code to take into account that my matrix is a sparse matrix?
I found out from a forum that we can also use padding to optimize performance, but this requires changing the way I read / sort the matrix. Any ideas on how to implement this padding in the way I read the matrix and perform the calculation?
This is a very old post, and I want to highlight that cuSPARSE has (for some time now) provided routines for multiplication between sparse matrices, or between a sparse matrix and a dense vector.
For the CSR format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below is a fully worked example showing its use.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(double));
double *h_x_dense = (double*)malloc(N * sizeof(double));
double *h_y_dense = (double*)malloc(N * sizeof(double));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Initializing the data and result vectors
for (int k = 0; k < N; k++) {
h_x_dense[k] = 1.;
h_y_dense[k] = 0.;
}
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N * sizeof(double)));
double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n");
const double alpha = 1.;
const double beta = 0.;
cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
&beta, d_y_dense));
gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nResult vector\n\n");
for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n");
}
You might want to have a look at the very good CUSP library. It implements sparse matrices in a variety of formats (COO, CSR, ELLPACK, diagonal, and a hybrid between ELLPACK and COO), each with its own advantages as described in the documentation. Most of them are "standard" sparse matrix formats about which you can find more information online. Perhaps not a complete answer to your question, but it should provide a starting point.
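If it helps as a starting point, here is a small hand-worked illustration of the CSR layout that CUSP and cuSPARSE use (just the standard three-array representation, not tied to either library's API):
// Dense 4 x 4 matrix:
//   10  0  0  2
//    3  9  0  0
//    0  7  8  0
//    0  0  0  5
// CSR keeps only the nonzeros, scanned row by row:
float values[]      = { 10, 2, 3, 9, 7, 8, 5 };  // the nonzero values
int   col_indices[] = {  0, 3, 0, 1, 1, 2, 3 };  // column index of each value
int   row_offsets[] = {  0, 2, 4, 6, 7 };        // row i spans values[row_offsets[i]] .. values[row_offsets[i+1]-1]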