I'm currently struggling to properly work with 2D arrays within my CUDA kernel. 1D was fine but so far had no luck with it moving on to 2D. Here is my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
And below my example to test it. It prints out the first 10 values of C correctly but all others remain 0. I believe the problem is within the kernel. Where it can't find the correct values due to the pitch, but not sure how to solve it correctly though.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, be we will keep this example simple).
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
Related
I'm creating a Matrix math library with CUDA to improve my CNNs performance (and to understand C++ better).
I would like to be able to add error handling and tell the user (me) what has gone wrong when using the matrix class.
This can be seen in my main file as, in this case, I'm trying to add a 10 * 10 matrix to a 15 * 15 matrix. This is an impossible action and would like some output to tell the user. for example
Error in file "Main.cu" on line: 9 (Dimensions inconsistent)
If you check inside the function the line number is line number of the check and I've looked at using macros to check but I'm wondering if there is another way without having to call the macro every time I add two matrices together.
Main.cu
#include "Matrix.cuh"
int main() {
double* init;
cudaMallocManaged(&init, sizeof(double));
Matrix A(10, 10, 2);
Matrix B(15, 15, 3);
Matrix C = A + B;
A.printM("A");
B.printM("B");
C.printM("C");
//cudaFree(init);
return 0;
}
Matrix.cu
#include "Matrix.cuh"
__global__
void sumMatrix(Matrix* A, Matrix* B, Matrix* C)
{
int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
if (x < A->ColumnCount && y < A->RowCount)
{
C->VALUES[y * A->ColumnCount + x] = A->VALUES[y * A->ColumnCount + x] + B->VALUES[y * A->ColumnCount + x];
}
}
__global__
void matrixInit(Row* rows, int R, int C, double* VALUES, double val) {
int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
if (x < C && y < R)
{
if (x == 0)
{
rows[y].Count = C;
rows[y].values = VALUES + C * y;
}
VALUES[y * C + x] = val;
}
}
Matrix::Matrix(int R, int C, double val)
{
cudaMallocManaged(&VALUES, R * C * sizeof(double));
cudaMallocManaged(&rows, R * sizeof(Row));
RowCount = R;
ColumnCount = C;
dim3 gridDim(ceil(C / (double)BLOCK_SIZE), ceil(R / (double)BLOCK_SIZE), 1);
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
matrixInit << <gridDim, blockDim >> > (rows, R, C, VALUES, val);
cudaDeviceSynchronize();
cudaCheckErrors("MATRIX INIT VAL");
}
Matrix::Matrix(int R, int C)
{
cudaMallocManaged(&VALUES, R * C * sizeof(double));
cudaMallocManaged(&rows, R * sizeof(Row));
RowCount = R;
ColumnCount = C;
dim3 gridDim(ceil(C / (double)BLOCK_SIZE), ceil(R / (double)BLOCK_SIZE), 1);
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
matrixInit << <gridDim, blockDim >> > (rows, R, C, VALUES, 0);
cudaDeviceSynchronize();
cudaCheckErrors("MATRIX INIT VAL");
}
void Matrix::updatePointers()
{
for (size_t i = 0; i < RowCount; i++)
{
rows[i].values = VALUES + (i * ColumnCount);
}
}
void Matrix::removePointers()
{
VALUES = nullptr;
rows = nullptr;
}
void Matrix::printM(const char* msg)
{
std::cout << "Matrix " << msg << ": " << RowCount << "*" << ColumnCount << std::endl;
for (size_t i = 0; i < RowCount; i++)
{
for (size_t j = 0; j < ColumnCount; j++)
{
std::cout << rows[i][j] << " ";
}
std::cout << std::endl;
}
}
Matrix Matrix::sum(Matrix B)
{
Matrix* A_p, * B_p, * C_p;
Matrix C(RowCount, ColumnCount);
cudaMallocManaged(&A_p, sizeof(Matrix));
cudaMallocManaged(&B_p, sizeof(Matrix));
cudaMallocManaged(&C_p, sizeof(Matrix));
memcpy(A_p, this, sizeof(Matrix));
memcpy(B_p, &B, sizeof(Matrix));
memcpy(C_p, &C, sizeof(Matrix));
dim3 gridDim(ceil(ColumnCount / (double)BLOCK_SIZE), ceil(RowCount / (double)BLOCK_SIZE), 1);
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
sumMatrix << < gridDim, blockDim >> > (A_p, B_p, C_p);
cudaDeviceSynchronize();
cudaCheckErrors("SUM");
B.removePointers();
C.removePointers();
return *C_p;
}
Row& Matrix::operator[](size_t i)
{
if (i >= RowCount)
{
std::cout << "OUT OF BOUNDS";
std::exit(1);
}
return rows[i];
}
Matrix& Matrix::operator+(Matrix B)
{
Matrix C = sum(B);
Matrix* C_p;
cudaMallocManaged(&C_p, sizeof(Matrix));
memcpy(C_p, &C, sizeof(Matrix));
B.removePointers();
C.removePointers();
return *C_p;
}
Matrix::~Matrix()
{
if (VALUES != nullptr && rows != nullptr)
{
cudaFree(VALUES);
cudaFree(rows);
}
}
I have tried to extract patches from an image parallelly with pixel shift/overlapping. I have written the CPU version of the code. But I could not able to convert the for loop which has an increment of pixel shift. I have given the part of the code where for loop is being used. CreatePatchDataSet function has the "for loop " which has an increment of pixel shift. Please help me out to convert this function into Cuda. I have provided the following code.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>
using namespace std;
using namespace cv;
#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)
void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
for (int i = 0; i < input.rows; i++)
{
double *src = input.ptr<double>(i);
for (int j = 0; j < input.cols; j++)
{
output[input.cols * input.channels() * i + input.channels() * j + 0] = src[j];
}
}
}
void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
*num_of_patch_col = 0;
int len_nb = 0;
while (len_nb < width) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_col)++;
}
len_nb = 0;
*num_of_patch_row = 0;
while (len_nb < height) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_row)++;
}
*num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}
void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int counter_row = 0;
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
for (int i = 0; i < height; i += pixel_shift) {
int counter_col = 0;
for (int j = 0; j < width; j += pixel_shift) {
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0,
cv::INTER_LANCZOS4);
double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
free(orgimageH);
free(FY);
return 0;
}
The results I got for first 10 values in CPU version:
patch numbers:
16129
238,240,240,235,237,230,227,229,228,227
I have tried to convert this function to Kernel function using cuda:. But it goes into the infinite loop. As I am very new to this CUDA field, could you please help me to find out the problem in the code ?
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
int i = threadIdx.x + (blockDim.x*blockIdx.x);
int j = threadIdx.y + (blockDim.y*blockIdx.y);
while (i<height && j< width)
{
int counter_row = 0;
int counter_col = 0;
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
//
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
double *d_orgimageH;
gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
double *d_FY;
gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH * num_of_patch_image));
gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH , sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
// cout<<"Hello world";
free(orgimageH);
free(FY);
cudaFree(d_FY);
cudaFree(d_orgimageH);
return 0;
}
Image I have used: [1]: https://i.stack.imgur.com/Ywg7p.png
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
is outside the while loop in your kernel. As i and j never change inside the while loop, it isn't stopping. There could be more problems here, but this is the most prominent one.
EDIT: Another one that I found, is that you have only one while over both i and j instead of one for each. You should probably use for loops like in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
i < height;
i += pixel_shift * blockDim.x * gridDim.x) {
for (j = ...; j < ...; j += ...) {
/* ... */
}
}
EDIT 2:
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
counter_row < num_of_patch_row;
counter_row += blockDim.y * gridDim.y) {
i = counter_row * pixel_shift;
if (i > height)
break;
for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
counter_col < num_of_patch_col;
counter_col += blockDim.x * gridDim.x) {
j = counter_col * pixel_shift;
if (j > width)
break;
/* ... */
}
}
I have also exchanged the x/y fields of the execution parameters between the inner and the outer loop, as it seemed more appropriate considering that the x field is continuous in warps (memory access benefits).
I'm trying to implement Floyd Warshall algorithm using cuda but I'm having syncrhornization problem.
This is my code:
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
This is my initial variables:
#define GRAPH_SIZE 2000
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16 // Each block have 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE
This is how I'm generating the graph:
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 40;
if (r > 20) {
r = INF;
}
D(i, j) = r;
}
}
}
}
When I set GRAPH_SIZE to a smaller number like 100 the result is incorrect.
I have written the algorithm sequentially on the cpu like the code bellow:
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
And I run and test it like this:
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
}
else {
fprintf(stderr, "SUCCESS!\n");
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
Can anyone give me an ideia how to solve this?
The main problem I could find seems to be that your grid sizing is not done correctly.
With N=2000 and thread block side dimension of 16, that happens to be whole-number divisible. But if you reduce N to 100, it is not.
We can fix that by "rounding up" your grid dimensions:
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
And adding a thread-check to your kernel:
if ((i < graph_size) && (j < graph_size))
Here's a modified code that seems to run correctly for me:
$ cat t92.cu
#include <cstdio>
#include <cassert>
#define GRAPH_SIZE 100
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
#define HANDLE_ERROR(x) x
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if ((i < graph_size) && (j < graph_size))
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 1000;
if (r > 20) {
D(i, j) = INF;
}
else
D(i, j) = r+10;
}
}
}
}
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
int qq = 0;
for (int i = 0; i < GRAPH_SIZE*GRAPH_SIZE; i++)
if (output_cpu[i] != output_gpu[i]) {qq++; printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);}
printf("# mismatches: %d\n", qq);
}
else {
fprintf(stderr, "SUCCESS!\n");
// for (int i = 0; i < 100; i++)
// printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
$ nvcc -o t92 t92.cu
$ vi t92.cu
$ cuda-memcheck ./t92
========= CUDA-MEMCHECK
SUCCESS!
========= ERROR SUMMARY: 0 errors
$
(I've modified your test case slightly as it was producing an output matrix that was mostly zero. )
I'm trying to implement a matrix-vector Multiplication on GPU (using CUDA).
In my C++ code (CPU), I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication using CUDA. I'm also using shared memory to improve the performance.
How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?
Below is my C++ function to load the matrix:
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
unsigned int w, h, x, y, num_entries;
float val;
std::ifstream file( filename );
if ( file )
{
file >> h >> w >> num_entries;
cout << w << " " << h << " " << num_entries << "\n";
assert( w == h || w == 1 || h == 1 );
if( dim != NULL ) *dim = std::max( w, h );
matrix = new float[ w * h ];
unsigned int i;
for( i = 0; i < num_entries; i++ ){
if( file.eof() ) break;
file >> y >> x >> val;
if( majority == ROW_MAJOR ){
matrix[ w * y + x ] = val;
} else if( majority == COLUMN_MAJOR ){
matrix[ h * x + y ] = val;
}
}
file.close();
if( i == num_entries )
std::cout << "\nFile read successfully\n";
else
std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";
// print first few elements
if( w == h ){
for( unsigned int i = 0; i < w; i++ ){
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j + w * i ] );
}
}
}
else{
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j ] );
}
}
} else {
std::cout << "Unable to open file\n";
return false;
}
return true;
}
Below is my CUDA Kernel function that handles the matrix-vector multiplication:
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
extern __shared__ float vec[];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.0;
int vOffs = 0;
//load vector into shared memory
for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) {
vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
}
//make sure all threads are synchronized
__syncthreads();
if (idx < dim) {
temp = 0.0;
//dot product (multiplication)
for (int i = 0; i < dim; i++){
temp += A[idx * dim + i] * vec[i];
}
x[idx] = temp;
}
}
What are the necessary changes that I have to make on my CUDA code to take into account that my matrix is a sparse matrix?
I found out from a forum that we can also use padding to be able to optimize the performance, but this requires me to change the way I read the matrix / sort the matrix. Any ideas how to implement this padding in the way I read the matrix and perform the calculation?
This is a very old post and I want to highlight that cuSPARSE (since some time now) makes routines for the multiplication between sparse matrices or between a sparse matrix and a dense vector available.
For the csr format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below, a fully worked example showing its use.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(double));
double *h_x_dense = (double*)malloc(N * sizeof(double));
double *h_y_dense = (double*)malloc(N * sizeof(double));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Initializing the data and result vectors
for (int k = 0; k < N; k++) {
h_x_dense[k] = 1.;
h_y_dense[k] = 0.;
}
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N * sizeof(double)));
double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n");
const double alpha = 1.;
const double beta = 0.;
cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
&beta, d_y_dense));
gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nResult vector\n\n");
for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n");
}
You might want to have a look at the very good CUSP library. They implement sparse matrices in a variety of formats (coo, csr, ellpack, diagonal and a hybrid between ellpack and coo). Each with their own advantages as described in the documentation. Most of them are "standard" sparse matrix formats about which you can find more information online. Not a complete answer to your question perhaps, but it should provide a starting point.
Is there a way to speed up this 1D convolution ? I tried to make the dy cache efficient
but compiling with g++ and -O3 gave worse performances.
I am convolving with [-1. , 0., 1] in both directions.
Is not homework.
#include<iostream>
#include<cstdlib>
#include<sys/time.h>
void print_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
std::cout << matrix[j * width + i] << ",";
}
std::cout << std::endl;
}
}
void fill_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
matrix[j * width + i] = ((float)rand() / (float)RAND_MAX) ;
}
}
}
#define RESTRICT __restrict__
void dx_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[1];
for (int j=0; j < height; j++){
float* row = in_matrix + j * width;
for (int i=1; i < width-1; i++){
float res = -1.F * row[i-1] + row[i+1]; /* -1.F * value + 0.F * value + 1.F * value; */
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}
void dy_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[ width + 1];
for (int j=1; j < height-1; j++){
for (int i=0; i < width; i++){
float res = -1.F * in_matrix[ (j-1) * width + i] + in_matrix[ (j+1) * width + i] ;
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}
double now (void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
int main(int argc, char **argv){
int width, height;
float *in_matrix;
float *out_matrix;
if(argc < 3){
std::cout << argv[0] << "usage: width height " << std::endl;
return -1;
}
srand(123);
width = atoi(argv[1]);
height = atoi(argv[2]);
std::cout << "Width:"<< width << " Height:" << height << std::endl;
if (width < 3){
std::cout << "Width too short " << std::endl;
return -1;
}
if (height < 3){
std::cout << "Height too short " << std::endl;
return -1;
}
in_matrix = (float *) malloc( height * width * sizeof(float));
out_matrix = (float *) malloc( height * width * sizeof(float));
fill_matrix(height, width, in_matrix);
//print_matrix(height, width, in_matrix);
float min, max;
double a = now();
dx_matrix(height, width, in_matrix, out_matrix, &min, &max);
std::cout << "dx min:" << min << " max:" << max << std::endl;
dy_matrix(height, width, in_matrix, out_matrix, &min, &max);
double b = now();
std::cout << "dy min:" << min << " max:" << max << std::endl;
std::cout << "time: " << b-a << " sec" << std::endl;
return 0;
}
Use local variables for computing the min and max. Every time you do this:
if (res > *max ) *max = res;
if (res < *min ) *min = res;
max and min have to get written to memory. Adding restrict on the pointers would help (indicating the writes are independent), but an even better way would be something like
//Setup
float tempMin = ...
float tempMax = ...
...
// Inner loop
tempMin = (res < tempMin) ? res : tempMin;
tempMax = (res > tempMax) ? res : tempMax;
...
// End
*min = tempMin;
*max = tempMax;
First of all, I would rewrite the dy loop to get rid of "[ (j-1) * width + i]" and "in_matrix[ (j+1) * width + i]", and do something like:
float* p, *q, *out;
p = &in_matrix[(j-1)*width];
q = &in_matrix[(j+1)*width];
out = &out_matrix[j*width];
for (int i=0; i < width; i++){
float res = -1.F * p[i] + q[i] ;
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out[i] = res;
}
But that is a trivial optimization that the compiler may already be doing for you.
It will be slightly faster to do "q[i]-p[i]" instead of "-1.f*p[i]+q[i]", but, again, the compiler may be smart enough to do that behind your back.
The whole thing would benefit considerably from SSE2 and multithreading. I'd bet on at least a 3x speedup from SSE2 right away. Multithreading can be added using OpenMP and it will only take a few lines of code.
The compiler might notice this but you are creating/freeing a lot of variables on the stack as you go in and out of the scope operators {}. Instead of:
for (int j=0; j < height; j++){
float* row = in_matrix + j * width;
for (int i=1; i < width-1; i++){
float res = -1.F * row[i-1] + row[i+1];
How about:
int i, j;
float *row;
float res;
for (j=0; j < height; j++){
row = in_matrix + j * width;
for (i=1; i < width-1; i++){
res = -1.F * row[i-1] + row[i+1];
Well, the compiler might be taking care of these, but here are a couple of small things:
a) Why are you multiplying by -1.F? Why not just subtract? For instance:
float res = -1.F * row[i-1] + row[i+1];
could just be:
float res = row[i+1] - row[i-1];
b) This:
if (res > *max ) *max = res;
if (res < *min ) *min = res;
can be made into
if (res > *max ) *max = res;
else if (res < *min ) *min = res;
and in other places. If the first is true, the second can't be so let's not check it.
Addition:
Here's another thing. To minimize your multiplications, change
for (int j=1; j < height-1; j++){
for (int i=0; i < width; i++){
float res = -1.F * in_matrix[ (j-1) * width + i] + in_matrix[ (j+1) * width + i] ;
to
int h = 0;
int width2 = 2 * width;
for (int j=1; j < height-1; j++){
h += width;
for (int i=h; i < h + width; i++){
float res = in_matrix[i + width2] - in_matrix[i];
and at the end of the loop
out_matrix[i + width] = res;
You can do similar things in other places, but hopefully you get the idea. Also, there is a minor bug,
*min = *max = -1.F * in_matrix[0] + in_matrix[ width + 1 ];
should be just in_matrix[ width ] at the end.
Profiling this with -O3 and -O2 using versions of both the clang and g++ compilers on OS X, I found that
30% of the time was spent filling the initial matrix
matrix[j * width + i] = ((float)rand() / (float)RAND_MAX) ;
40% of the time was spent in dx_matrix on the line.
out_matrix[j * width + i] = row[i+1] -row[i-1];
About 9% of the time was spent in the conditionals in dx_matrix .. I separated them into a separate loop to see if that helped, but it didn't change anything much.
Shark gave the suggestion that this could be improved through the use of SSE instructions.
Interestingly only about 19% of the time was spent in the dy_matrix routine.
This was running on 10k by 10k matrix ( about 1.6 seconds )
Note your results may be different if you're using a different compiler, different OS etc.