Floyd-Warshall algorithm in parallel using CUDA - C++

I'm trying to implement the Floyd-Warshall algorithm using CUDA, but I'm having a synchronization problem.
This is my code:
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    if (D(i, k) + D(k, j) < D(i, j)) {
        D(i, j) = D(i, k) + D(k, j);
    }
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
    int *dev_output;
    HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
    cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
    dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
    dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
    int k;
    for (k = 0; k < graph_size; k++) {
        run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
    }
    cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
    cudaFree(dev_output);
}
These are my initial variables:
#define GRAPH_SIZE 2000
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16 // Each block has 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE
This is how I'm generating the graph:
void generate_random_graph(int *output, int graph_size) {
    int i, j;
    srand(0xdadadada);
    for (i = 0; i < graph_size; i++) {
        for (j = 0; j < graph_size; j++) {
            if (i == j) {
                D(i, j) = 0;
            }
            else {
                int r;
                r = rand() % 40;
                if (r > 20) {
                    r = INF;
                }
                D(i, j) = r;
            }
        }
    }
}
When I set GRAPH_SIZE to a smaller number, like 100, the result is incorrect.
I have written the algorithm sequentially on the CPU, as in the code below:
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
    int i, j, k;
    memcpy(output, graph, sizeof(int) * graph_size * graph_size);
    for (k = 0; k < graph_size; k++) {
        for (i = 0; i < graph_size; i++) {
            for (j = 0; j < graph_size; j++) {
                if (D(i, k) + D(k, j) < D(i, j)) {
                    D(i, j) = D(i, k) + D(k, j);
                }
            }
        }
    }
}
And I run and test it like this:
int main(int argc, char **argv) {
    int *graph, *output_cpu, *output_gpu;
    int size;
    size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
    graph = (int *)malloc(size);
    output_cpu = (int *)malloc(size);
    assert(output_cpu);
    memset(output_cpu, 0, size);
    output_gpu = (int *)malloc(size);
    generate_random_graph(graph, GRAPH_SIZE);
    floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
    floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
    if (memcmp(output_cpu, output_gpu, size) != 0) {
        fprintf(stderr, "FAIL!\n");
    }
    else {
        fprintf(stderr, "SUCCESS!\n");
    }
    free(graph);
    free(output_cpu);
    free(output_gpu);
    return 0;
}
Can anyone give me an idea of how to solve this?

The main problem I could find is that your grid sizing is not done correctly.
With N = 2000 and a thread block side dimension of 16, N divides evenly, so the grid exactly covers the matrix. But if you reduce N to 100, it does not: 100/16 truncates to 6 blocks per side, i.e. 96 threads per side, so the last 4 rows and columns are never processed.
We can fix that by "rounding up" your grid dimensions:
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
And adding a thread-check to your kernel:
if ((i < graph_size) && (j < graph_size))
Here's a modified version of the code that seems to run correctly for me:
$ cat t92.cu
#include <cstdio>
#include <cstdlib>   // malloc/free, rand/srand
#include <cstring>   // memcpy/memset
#include <cassert>

#define GRAPH_SIZE 100
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
#define HANDLE_ERROR(x) x

__global__ void run_on_gpu(const int graph_size, int *output, int k) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    if ((i < graph_size) && (j < graph_size))
        if (D(i, k) + D(k, j) < D(i, j)) {
            D(i, j) = D(i, k) + D(k, j);
        }
}

void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
    int *dev_output;
    HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
    cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
    dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
    dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
    int k;
    for (k = 0; k < graph_size; k++) {
        run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
    }
    cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
    cudaFree(dev_output);
}

void generate_random_graph(int *output, int graph_size) {
    int i, j;
    srand(0xdadadada);
    for (i = 0; i < graph_size; i++) {
        for (j = 0; j < graph_size; j++) {
            if (i == j) {
                D(i, j) = 0;
            }
            else {
                int r;
                r = rand() % 1000;
                if (r > 20) {
                    D(i, j) = INF;
                }
                else
                    D(i, j) = r + 10;
            }
        }
    }
}

void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
    int i, j, k;
    memcpy(output, graph, sizeof(int) * graph_size * graph_size);
    for (k = 0; k < graph_size; k++) {
        for (i = 0; i < graph_size; i++) {
            for (j = 0; j < graph_size; j++) {
                if (D(i, k) + D(k, j) < D(i, j)) {
                    D(i, j) = D(i, k) + D(k, j);
                }
            }
        }
    }
}

int main(int argc, char **argv) {
    int *graph, *output_cpu, *output_gpu;
    int size;
    size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
    graph = (int *)malloc(size);
    output_cpu = (int *)malloc(size);
    assert(output_cpu);
    memset(output_cpu, 0, size);
    output_gpu = (int *)malloc(size);
    generate_random_graph(graph, GRAPH_SIZE);
    floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
    floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
    if (memcmp(output_cpu, output_gpu, size) != 0) {
        fprintf(stderr, "FAIL!\n");
        int qq = 0;
        for (int i = 0; i < GRAPH_SIZE*GRAPH_SIZE; i++)
            if (output_cpu[i] != output_gpu[i]) { qq++; printf("i: %d, cpu: %d, gpu: %d\n", i, output_cpu[i], output_gpu[i]); }
        printf("# mismatches: %d\n", qq);
    }
    else {
        fprintf(stderr, "SUCCESS!\n");
        // for (int i = 0; i < 100; i++)
        //     printf("i: %d, cpu: %d, gpu: %d\n", i, output_cpu[i], output_gpu[i]);
    }
    free(graph);
    free(output_cpu);
    free(output_gpu);
    return 0;
}
$ nvcc -o t92 t92.cu
$ vi t92.cu
$ cuda-memcheck ./t92
========= CUDA-MEMCHECK
SUCCESS!
========= ERROR SUMMARY: 0 errors
$
(I've modified your test case slightly, as it was producing an output matrix that was mostly zero.)
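One more note: HANDLE_ERROR is stubbed out as a no-op in the listing above so that it compiles standalone. In real code you would want it to actually inspect the returned status, and you would also check for launch errors with cudaGetLastError() after the kernel launches. A minimal sketch of such a macro (the name is kept from the question; this particular body is my own, not the original):

#define HANDLE_ERROR(x)                                              \
    do {                                                             \
        cudaError_t err_ = (x);  /* evaluate the CUDA call once */   \
        if (err_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)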

Related

How to make my CUDA kernel run on bigger matrices?

So my code is supposed to work like this:
- take in_matrix of NxN elements and an R factor
- give back a matrix of size (N-2R)x(N-2R), with each element being the sum of the in_matrix elements within radius R; it should work like this for N=4, R=1
Even though my code works for smaller matrices, for bigger ones like 1024 or 2048, or even bigger R factors, it gives back a matrix of 0's. Is it a problem inside my code, or is it just that my GPU can't compute that many calculations?
Code: (for testing purposes the initial matrix is filled with 1's, so every element of out_matrix should equal (2R+1)^2)
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;
__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int out_local = 0;
for (int col = tx; col <= tx + 2 * d_R ; col++)
for (int row = ty; row <= ty + 2 * d_R ; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
void random_ints(int tab[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
tab[i][j] = 1;
}
int main() {
static int tab[N][N];
random_ints(tab);
int tab_size = sizeof(int) * N * N;
int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);
dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE )), ceil((float)N / (float)(THREAD_BLOCK_SIZE )));
void** d_tab;
void** d_out;
cudaMalloc((void**)&d_tab, tab_size);
cudaMalloc((void**)&d_out, out_size);
cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);
int* d_tab_begin = (int*)(d_tab);
int* d_out_begin = (int*)(d_out);
MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);
int* out = (int*)malloc(out_size);
cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int col = 0; col < N - 2 * R; col++)
{
for (int row = 0; row < N - 2 * R; row++)
{
cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
}
cout << endl;
}
}
Finally, thanks to Robert, I found how to make the code work: by adding an if statement
if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
    for (int col = tx; col <= tx + 2 * d_R; col++)
        for (int row = ty; row <= ty + 2 * d_R; row++)
            out_local += *(d_tab_begin + col * d_N + row);
    *(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
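Put together, the guarded kernel looks like this (a sketch assembled from the snippets above, with the host macro R replaced by the kernel parameter d_R in the output index; the launch configuration is unchanged):

__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    int ty = threadIdx.y + blockIdx.y * blockDim.y;
    // The grid covers the full NxN input, so threads that do not map to a
    // cell of the (N-2R)x(N-2R) output must do nothing.
    if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
        int out_local = 0;
        for (int col = tx; col <= tx + 2 * d_R; col++)
            for (int row = ty; row <= ty + 2 * d_R; row++)
                out_local += *(d_tab_begin + col * d_N + row);
        *(d_out_begin + tx * (d_N - 2 * d_R) + ty) = out_local;
    }
}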

Problem in converting the "for loop" in CUDA

I have tried to extract patches from an image in parallel, with pixel shift/overlapping. I have written the CPU version of the code, but I was not able to convert the for loop which has an increment of pixel shift. The CreatePatchDataSet function contains that loop. Please help me convert this function to CUDA. I have provided the following code.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>

using namespace std;
using namespace cv;

#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)

void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
    for (int i = 0; i < input.rows; i++)
    {
        double *src = input.ptr<double>(i);
        for (int j = 0; j < input.cols; j++)
        {
            output[input.cols * input.channels() * i + input.channels() * j + 0] = src[j];
        }
    }
}

void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
    *num_of_patch_col = 0;
    int len_nb = 0;
    while (len_nb < width) {
        if (len_nb != 0) {
            len_nb += patch_size - (patch_size - pixel_shift);
        }
        else {
            len_nb += patch_size;
        }
        (*num_of_patch_col)++;
    }
    len_nb = 0;
    *num_of_patch_row = 0;
    while (len_nb < height) {
        if (len_nb != 0) {
            len_nb += patch_size - (patch_size - pixel_shift);
        }
        else {
            len_nb += patch_size;
        }
        (*num_of_patch_row)++;
    }
    *num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}

void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    int counter_row = 0;
    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    for (int i = 0; i < height; i += pixel_shift) {
        int counter_col = 0;
        for (int j = 0; j < width; j += pixel_shift) {
            //Get Low Resolution Image
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    if ((i + ii) < height && (j + jj) < width) {
                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
                    }
                    else {
                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
                    }
                }
            }
            counter_col++;
            if (counter_col == num_of_patch_col) {
                break;
            }
        }
        counter_row++;
        if (counter_row == num_of_patch_row) {
            break;
        }
    }
}

int main()
{
    int ratio = 2;
    cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
    cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
    cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
    double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
    ConvertMat2DoubleArray(imageH, orgimageH);
    int widthH = imageH.cols;
    int heightH = imageH.rows;
    int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE * (int)image.channels();
    int dimL = (int)PATCH_SIZE / ratio * (int)PATCH_SIZE / ratio * (int)image.channels();
    //3. Create training data set=========================
    int num_of_patch_image = 0;
    int num_of_patch_col = 0;
    int num_of_patch_row = 0;
    GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
    cout << "patch numbers: \n " << num_of_patch_image << endl;
    double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
    CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
    free(orgimageH);
    free(FY);
    return 0;
}
The results I got for the first 10 values in the CPU version:
patch numbers:
16129
238,240,240,235,237,230,227,229,228,227
I have tried to convert this function to a CUDA kernel, but it goes into an infinite loop. As I am very new to CUDA, could you please help me find the problem in the code?
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    int i = threadIdx.x + (blockDim.x*blockIdx.x);
    int j = threadIdx.y + (blockDim.y*blockIdx.y);
    while (i < height && j < width)
    {
        int counter_row = 0;
        int counter_col = 0;
        //Get Low Resolution Image
        for (int ii = 0; ii < patch_size; ii++) {
            for (int jj = 0; jj < patch_size; jj++) {
                if ((i + ii) < height && (j + jj) < width) {
                    patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
                }
                else {
                    patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
                }
            }
        }
        counter_col++;
        if (counter_col == num_of_patch_col) {
            break;
        }
        counter_row++;
        if (counter_row == num_of_patch_row) {
            break;
        }
    }
    i += blockDim.x * gridDim.x;
    j += blockDim.y * gridDim.y;
}
int main()
{
    int ratio = 2;
    cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
    cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
    cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
    double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
    ConvertMat2DoubleArray(imageH, orgimageH);
    int widthH = imageH.cols;
    int heightH = imageH.rows;
    //
    int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE * (int)image.channels();
    int dimL = (int)PATCH_SIZE / ratio * (int)PATCH_SIZE / ratio * (int)image.channels();
    //3. Create training data set=========================
    int num_of_patch_image = 0;
    int num_of_patch_col = 0;
    int num_of_patch_row = 0;
    GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
    cout << "patch numbers: \n " << num_of_patch_image << endl;
    double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
    double *d_orgimageH;
    gpuErrchk(cudaMalloc((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
    double *d_FY;
    gpuErrchk(cudaMalloc((void**)&d_FY, sizeof(double) * dimH * num_of_patch_image));
    gpuErrchk(cudaMemcpy(d_orgimageH, orgimageH, sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
    dim3 dimBlock(16, 16);
    dim3 dimGrid;
    dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
    dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
    CreatePatchDataSet<<<dimGrid, dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
    gpuErrchk(cudaMemcpy(FY, d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
    // cout<<"Hello world";
    free(orgimageH);
    free(FY);
    cudaFree(d_FY);
    cudaFree(d_orgimageH);
    return 0;
}
Image I have used: https://i.stack.imgur.com/Ywg7p.png
The lines
    i += blockDim.x * gridDim.x;
    j += blockDim.y * gridDim.y;
are outside the while loop in your kernel. As i and j never change inside the while loop, the loop never stops. There could be more problems here, but this is the most prominent one.
EDIT: Another problem I found is that you have only one while loop over both i and j instead of one for each. You should probably use for loops, as in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
     i < height;
     i += pixel_shift * blockDim.x * gridDim.x) {
    for (j = ...; j < ...; j += ...) {
        /* ... */
    }
}
EDIT 2:
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
     counter_row < num_of_patch_row;
     counter_row += blockDim.y * gridDim.y) {
    i = counter_row * pixel_shift;
    if (i > height)
        break;
    for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
         counter_col < num_of_patch_col;
         counter_col += blockDim.x * gridDim.x) {
        j = counter_col * pixel_shift;
        if (j > width)
            break;
        /* ... */
    }
}
I have also exchanged the x/y fields of the execution parameters between the inner and the outer loop, as that seemed more appropriate given that the x field is contiguous within warps (memory access benefits).
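For reference, combining the loop structure from EDIT 2 with the patch-copying body from the question gives a kernel along these lines (a sketch under those assumptions; I have not run it against the original data):

__global__ void CreatePatchDataSet(double *original_data, double *patch_data,
                                   const int width, const int height,
                                   const int pixel_shift, const int patch_size,
                                   const int num_of_patch_col, const int num_of_patch_row)
{
    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    // Grid-stride loops over patch indices: each thread handles whole patches.
    for (int counter_row = threadIdx.y + blockDim.y * blockIdx.y;
         counter_row < num_of_patch_row;
         counter_row += blockDim.y * gridDim.y) {
        int i = counter_row * pixel_shift;
        if (i > height)
            break;
        for (int counter_col = threadIdx.x + blockDim.x * blockIdx.x;
             counter_col < num_of_patch_col;
             counter_col += blockDim.x * gridDim.x) {
            int j = counter_col * pixel_shift;
            if (j > width)
                break;
            // Copy one patch_size x patch_size patch; pixels falling outside
            // the image are zero-padded, exactly as in the CPU version.
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    double v = ((i + ii) < height && (j + jj) < width)
                                   ? original_data[width * (i + ii) + (j + jj)]
                                   : 0.;
                    patch_data[num_of_patch_image * (patch_size * ii + jj)
                               + num_of_patch_col * counter_row + counter_col] = v;
                }
            }
        }
    }
}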

2D array CUDA problems

I'm currently struggling to work properly with 2D arrays within my CUDA kernel. 1D was fine, but so far I've had no luck moving on to 2D. Here are my host function and kernel:
__global__ void add_d2D(double *x, double *y, double *z, int n, int m){
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
        for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
            z[i*m + j] = x[i*m + j] + y[i*m + j];
        }
    }
}

__host__ void add2D(double *a, double *b, double *result, int N, int M){
    double *a_d, *b_d, *c_d;
    size_t pitcha;
    size_t pitchb;
    size_t pitchc;
    cudaErrchk(cudaMallocPitch(&a_d, &pitcha, M*sizeof(double), N));
    cudaErrchk(cudaMallocPitch(&b_d, &pitchb, M*sizeof(double), N));
    cudaErrchk(cudaMallocPitch(&c_d, &pitchc, M*sizeof(double), N));
    cudaErrchk(cudaMemcpy2D(a_d, M*sizeof(double), a, pitcha, M*sizeof(double), N, cudaMemcpyHostToDevice));
    cudaErrchk(cudaMemcpy2D(b_d, M*sizeof(double), b, pitchb, M*sizeof(double), N, cudaMemcpyHostToDevice));
    dim3 threadsPerBlock(2, 2);
    dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
    add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d, N, M);
    cudaDeviceSynchronize();
    cudaErrchk(cudaMemcpy2D(result, M*sizeof(double), c_d, pitchc, M*sizeof(double), N, cudaMemcpyDeviceToHost));
    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
}
Below is my example to test it. It prints out the first 10 values of c correctly, but all others remain 0. I believe the problem is within the kernel, where it can't find the correct values due to the pitch, but I'm not sure how to solve it correctly.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++){
    for (int j = 0; j < 10; j++){
        a[i][j] = 0 + rand() % 10;
        b[i][j] = 0 + rand() % 10;
    }
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++){
    for (int j = 0; j < 10; j++){
        std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
    }
}
You have two mistakes:
1. Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want each thread to do more, but we will keep this example simple.)
2. You had the destination and source pitches switched when loading the data onto the device.
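For reference, the signature is cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind): the second argument is the pitch of the destination allocation and the fourth is the pitch of the source. So for a host-to-device copy, the pitch returned by cudaMallocPitch() belongs in the dpitch slot and the packed host row size M * sizeof(double) in the spitch slot, which is what the corrected code below does.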
Here is a working version:
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>

#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )

void cuda_safe_call(const cudaError err, const char *file, const int line)
{
    if (cudaSuccess != err)
    {
        std::stringstream error_msg;
        error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
        const auto error_msg_str = error_msg.str();
        std::cout << error_msg_str << std::endl;
        throw std::runtime_error(error_msg_str);
    }
}

__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < m)
    {
        auto idx = row * m_pitch_elements + col;
        z[idx] = x[idx] + y[idx];
        //z[idx] = idx;
    }
}

__host__ void add2D(const double *a, const double *b, double *result, int N, int M) {
    double *a_d, *b_d, *c_d;
    size_t pitcha, pitchb, pitchc;
    CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
    CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
    CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
    CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
    CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
    dim3 threadsPerBlock(2, 2);
    auto safediv = [](auto a, auto b) { return static_cast<unsigned int>(ceil(a / (b*1.0))); };
    dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv(M, threadsPerBlock.y));
    //all the pitches should be the same
    auto pitch_elements = pitcha / sizeof(double);
    add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d, N, M, pitch_elements);
    CUDASAFECALL(cudaDeviceSynchronize());
    CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
    CUDASAFECALL(cudaFree(a_d));
    CUDASAFECALL(cudaFree(b_d));
    CUDASAFECALL(cudaFree(c_d));
}

int main()
{
    double a[4][10];
    double b[4][10];
    double c[4][10];
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 10; j++) {
            a[i][j] = 0 + rand() % 10;
            b[i][j] = 0 + rand() % 10;
        }
    }
    add2D((double *)a, (double *)b, (double *)c, 4, 10);
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 10; j++) {
            std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << "|" << a[i][j] + b[i][j] << std::endl;
        }
    }
    return 0;
}
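A note on the pitch handling: the version above converts the pitch to elements once on the host (pitcha / sizeof(double)) so the kernel can index with plain row * m_pitch_elements + col. The pattern shown in the CUDA documentation instead passes the byte pitch and advances the base pointer by whole rows of bytes; a minimal sketch of that alternative (assuming, as the comment in the code notes, that all three pitches come out the same):

__global__ void add_d2D_bytepitch(const double *x, const double *y, double *z,
                                  int n, int m, size_t pitch_bytes)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < m)
    {
        // A row starts pitch_bytes after the previous one, so compute the
        // row base address in bytes before indexing by column.
        const double *xrow = (const double *)((const char *)x + row * pitch_bytes);
        const double *yrow = (const double *)((const char *)y + row * pitch_bytes);
        double       *zrow = (double *)((char *)z + row * pitch_bytes);
        zrow[col] = xrow[col] + yrow[col];
    }
}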

"Access Violation Reading/Writing location" exception each time at a different malloc/memcpy line

The following code is for 2D Wavelet Decomposition:
wfb2dec.cpp
#include "global.h"
#include <opencv2/opencv.hpp>
#include <opencv2/core.hpp>
void MatConCat(imgpel **, imgpel **, imgpel **, imgpel **, imgpel **, int, int);
void BorderWrap(imgpel **, imgpel **, int, int, int, int);
void Conv2D(imgpel **, imgpel **, double *, char *, int, int, int, int);
void VSubSample(imgpel **, imgpel **, int, int);
void HSubSample(imgpel **, imgpel **, int, int);
int wfb2dec(double *h, double *g, int m, int n, imgpel **frameI_dwt, int lvl)
{
// h is the Low Pass analysis filter;
int len_h0 = (int)h[0]; //sizeof(h) / sizeof(double);
int ext_h0 = (int)floor(len_h0 / 2);
double *h0 = (double *) malloc(len_h0 * sizeof(double));
memcpy(h0, &h[1], len_h0 * sizeof(double));
// h1 is the High Pass analysis filter; (needs to be derived from g);
int len_h1 = (int)g[0];
int c = (int)floor((len_h1 + 1) / 2);
if ((len_h1 % 2) == 0)
c += 1;
int ext_h1 = len_h1 - c + 1;
double *h1 = (double *) malloc(len_h1 * sizeof(double));
memcpy(h1, g + 1, len_h1 * sizeof(double));
//h1 = g;
analysis_hpf(h1, h1, len_h1, c);
imgpel **x = frameI_dwt;
//Wavelet Decomposition:-
for (int i = 0; i < lvl; i++)
{
int m1 = (int)(m / pow(2,i));
int n1 = (int)(n / pow(2,i));
//Row-wise filtering:-
imgpel **x_L = (imgpel **) malloc((m1*n1) * sizeof(imgpel));
imgpel **x_H = (imgpel **) malloc((m1*n1) * sizeof(imgpel));
imgpel **x_Ls = (imgpel **)malloc((m1*n1/2) * sizeof(imgpel));
imgpel **x_Hs = (imgpel **)malloc((m1*n1/2) * sizeof(imgpel));
x_L = rowfiltering(x, h0, ext_h0, len_h0, m1, n1);
VSubSample(x_L, x_Ls, m1, n1);
x_H = rowfiltering(x, h1, ext_h1, len_h1, m1, n1);
VSubSample(x_H, x_Hs, m1, n1);
free(x_L); free(x_H);
//Column-wise filtering:-
imgpel **x_LL = (imgpel **) malloc((m1*n1 / 2) * sizeof(imgpel));
imgpel **x_LH = (imgpel **) malloc((m1*n1 / 2) * sizeof(imgpel));
imgpel **x_HL = (imgpel **) malloc((m1*n1 / 2) * sizeof(imgpel));
imgpel **x_HH = (imgpel **) malloc((m1*n1 / 2) * sizeof(imgpel));
imgpel **x_LLs = (imgpel **)malloc((m1*n1 / 4) * sizeof(imgpel));
imgpel **x_LHs = (imgpel **)malloc((m1*n1 / 4) * sizeof(imgpel));
imgpel **x_HLs = (imgpel **)malloc((m1*n1 / 4) * sizeof(imgpel));
imgpel **x_HHs = (imgpel **)malloc((m1*n1 / 4) * sizeof(imgpel));
imgpel **x_L_tr = (imgpel **)malloc((m1*n1 / 2) * sizeof(imgpel));
cvTranspose(x_Ls, x_L_tr);
free(x_Ls);
imgpel **x_H_tr = (imgpel **)malloc((m1*n1 / 2) * sizeof(imgpel));
cvTranspose(x_Hs, x_H_tr);
free(x_Hs);
x_LL = rowfiltering(x_L_tr, h0, ext_h0, len_h0, m1, n1/2);
HSubSample(x_LL, x_LLs, m1, n1 / 2);
x_LH = rowfiltering(x_L_tr, h1, ext_h1, len_h1, m1, n1/2);
HSubSample(x_LH, x_LHs, m1, n1 / 2);
x_HL = rowfiltering(x_H_tr, h0, ext_h0, len_h0, m1, n1/2);
HSubSample(x_HL, x_HLs, m1, n1 / 2);
x_HH = rowfiltering(x_H_tr, h1, ext_h1, len_h1, m1, n1/2);
HSubSample(x_HH, x_HHs, m1, n1 / 2);
free(x_L_tr); free(x_H_tr);
free(x_LL); free(x_LH); free(x_HL); free(x_HH);
MatConCat(x_LLs, x_LHs, x_HLs, x_HHs, x, m1, n1);
x = (imgpel **) realloc(x, (m1 / 2) * (n1 / 2) * sizeof(imgpel));
x = x_LLs;
free(x_LLs); free(x_LHs); free(x_HLs); free(x_HHs);
}
return 1;
}
void analysis_hpf(double *h1, double *g, int len_h1, int c)
{
for (int i = 0; i < len_h1; i++)
{
h1[i] = -1 * g[i] * pow(-1, i - c + 1);
}
}
imgpel ** rowfiltering(imgpel **x, double *f, int ext, int len, int m1, int n1)
{
int ext2 = len - ext - 1;
imgpel **x_Bor;
int memory_size = get_mem2Dpel(&x_Bor, m1, n1 + ext + ext2);
//x = [x(:, end-ext1+1:end) x x(:, 1:ext2)];
BorderWrap(x, x_Bor, m1, n1, ext, ext2);
//cv::Mat A = cv::Mat(m1, n1, CV_16U, x);
//cv::Mat Apad = cv::Mat(m1, n1 + ext + ext2, CV_16U, x_Bor);
//cv::copyMakeBorder(A, Apad, 0, 0, ext, ext2, CV_HAL_BORDER_WRAP);
//y = conv2(x, f, 'valid');
//imgpel **y = (imgpel **)malloc(m1*n1 * sizeof(imgpel));
imgpel **y;
memory_size = get_mem2Dpel(&y, m1, n1 + ext + ext2);
Conv2D(x_Bor, y, f, (char *)"valid", m1, n1, ext, len);
return y;
}
void VSubSample(imgpel **x, imgpel **xs, int m1, int n1)
{
for (int i = 0; i < m1; i++)
{
for (int j = 0; j < n1; j += 2)
{
xs[i][j / 2] = x[i][j];
}
}
}
void HSubSample(imgpel **x, imgpel **xs, int m1, int n1)
{
for (int i = 0; i < n1; i++)
{
for (int j = 0; j < m1; j += 2)
{
xs[j / 2][i] = x[j][i];
}
}
}
void MatConCat(imgpel **x_LL, imgpel **x_LH, imgpel **x_HL, imgpel **x_HH, imgpel **x, int m1, int n1)
{
//memcpy(x, x_LL, (m1*n1 / 4) * sizeof(imgpel));
for (int i = 0; i < m1/2; i++)
{
for (int j = 0; j < n1/2; j++)
{
x[i][j] = x_LL[i][j];
}
}
//memcpy(((frameI_dwt + 0) + n1 / 2), x_LH, (m1*n1 / 4) * sizeof(imgpel));
for (int i = 0; i < m1/2; i++)
{
for (int j = (n1/2)+1; i < n1; j++)
{
x[i][j] = x_LH[i][j - (n1 / 2) - 1];
}
}
//memcpy(((frameI_dwt + m1 / 2) + 0), x_HL, (m1*n1 / 4) * sizeof(imgpel));
for (int i = (m1/2)+1; i < m1; i++)
{
for (int j = 0; j < n1/2;j++)
{
x[i][j] = x_HL[i - (m1 / 2) - 1][j];
}
}
//memcpy(((frameI_dwt + m1 / 2) + n1 / 2), x_HH, (m1*n1 / 4) * sizeof(imgpel));
for (int i = (m1/2)+1; i < m1; i++)
{
for (int j = (n1/2)+1; j < n1; j++)
{
x[i][j] = x_HH[i - (m1 / 2) - 1][j - (n1 / 2) - 1];
}
}
}
void BorderWrap(imgpel **x, imgpel **x_Bor, int m1, int n1, int ext, int ext2)
{
for (int i = ext; i > 0; i--)
{
for (int j = 0; j < m1; j++)
{
x_Bor[j][ext - i] = x[j][n1 - i];
}
}
for (int i = 0; i < ext2; i++)
{
for (int j = 0; j < m1; j++)
{
x_Bor[j][n1 + i + 1] = x[j][i];
}
}
for (int i = 0; i < n1; i++)
{
for (int j = 0; j < m1; j++)
{
x_Bor[j][ext + i] = x[j][i];
}
}
}
void Conv2D(imgpel **x_Bor, imgpel **y, double *f, char *a, int m1, int n1, int ext, int len)
{
int ext2 = len - ext - 1;
for (int i = 0; i < m1; i++)
{
for (int j = 0; j < n1; i++)
{
y[i][j] = 0;
//flip filter f for convolution:-
for (int k = 0; k < len; k++)
{
y[i][j] += x_Bor[i][j + k] * f[len - k - 1];
}
}
}
}
The above function is called from main() in encoder.cpp. Variables/functions which are not declared in wfb2dec.cpp are in the header global.h.
I run into an "Access Violation Reading/Writing Location" exception each time at a different location whenever I run the code. Where could the problem be?

CUDA function doesn't change data

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define SZ_INT sizeof(int)
#define CELL_SZ 1
#define CELL_VALUE(a,x) (((a) << 1) | x)
#define FROM(a) ((a) & 1)
#define LENGTH(a) ((a) >> 1)
#define INDEX(i,j,m) ((i) * (m + 1) + j)
//FROM: 1 if L[i][j] took value from L[i - 1][j], 0 if L[i][j] took value from L[i][j - 1]
#define CUDA_CHECK_ERROR(err) \
if (err != cudaSuccess) { \
printf("Cuda error: %s\n", cudaGetErrorString(err)); \
printf("Error in file: %s, line: %i\n", __FILE__, __LINE__); \
}
__global__ void Find_L_entry (int *L, int *A, int n, int *B, int m, int diag) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
int i = diag - j;
if (i >= 0 && i < n && j >= 0 && j < m) {
if (A[i] == B[j]) {
L[INDEX(i, j, m)] = CELL_VALUE(LENGTH(L[INDEX(i - 1, j - 1, m)]) + 1, 0);
} else {
L[INDEX(i, j, m)] = (LENGTH(L[INDEX(i - 1, j, m)]) > LENGTH(L[INDEX(i, j - 1, m)])) ?
CELL_VALUE(LENGTH(L[INDEX(i - 1, j, m)]), 1) :
CELL_VALUE(LENGTH(L[INDEX(i, j - 1, m)]), 0);
}
}
}
__host__ void output_sequence(int *L, int *A, int n, int *B, int m) {
int len = LENGTH(L[INDEX(n - 1, m - 1, m)]);
int i = n - 1, j = m - 1;
int *lcs = (int*) malloc(len * SZ_INT);
int top = 0;
while (i >= 0 && j >= 0) {
if (A[i] == B[j]) {
lcs[top++] = A[i];
i--; j--;
} else {
if (FROM(L[INDEX(i, j, m)]) == 1)
i--;
else
j--;
}
}
printf("Length: %d\nSequence: ", len);
for (int i = len - 1; i >= 0; i--) {
printf("%d%c", lcs[i], i ? ' ' : '\n');
}
free(lcs);
}
__host__ void read_sequence(int *&A, int &n, int num) {
printf("Enter number of elements in sequence %d\n", num);
scanf("%d", &n);
A = (int*) malloc(n * sizeof(int));
printf("Enter %d elements of sequence %d\n", n, num);
for (int i = 0; i < n; i++)
scanf("%d", A + i);
}
int main ( int argc, char **argv ) {
int number_of_blocks = atoi(argv[1]), threads_in_block = atoi(argv[2]);
int n, m;
int *A, *B;
read_sequence(A, n, 1);
read_sequence(B, m, 2);
int *d_A, *d_B;
cudaMalloc((void**)&d_A, n * SZ_INT);
cudaMalloc((void**)&d_B, m * SZ_INT);
CUDA_CHECK_ERROR(cudaMemcpy(d_A, A, n * SZ_INT, cudaMemcpyHostToDevice));
CUDA_CHECK_ERROR(cudaMemcpy(d_B, B, m * SZ_INT, cudaMemcpyHostToDevice));
int *big_L = (int*) malloc((n + 1) * (m + 1) * CELL_SZ * SZ_INT);
for (int i = 0; i < (n + 1) * (m + 1) * CELL_SZ; i++)
big_L[i] = 0;
int *L = &big_L[(m + 2) * CELL_SZ];
int *dev_L;
cudaMalloc((void**)&dev_L, (n + 1) * (m + 1) * SZ_INT);
int *d_L = &dev_L[(m + 2) * CELL_SZ];
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, (n * (m + 1) - 1) * SZ_INT, cudaMemcpyHostToDevice));
int diag_count = n + m - 1;
for (int diag = 0; diag < diag_count; diag++) {
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, SZ_INT, cudaMemcpyHostToDevice));
Find_L_entry<<<number_of_blocks, threads_in_block>>>(d_L, d_A, n, d_B, m, diag);
CUDA_CHECK_ERROR(cudaPeekAtLastError());
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
CUDA_CHECK_ERROR(cudaDeviceSynchronize());
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
printf("%d%c", L[INDEX(i,j,m)], j == m - 1 ? '\n' : ' ');
system("pause");
CUDA_CHECK_ERROR(cudaThreadSynchronize());
}
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
output_sequence(L, A, n, B, m);
cudaFree(d_L);
cudaFree(d_A);
cudaFree(d_B);
free(A); free(B); free(big_L);
return 0;
}
The code doesn't run properly: after calling the function Find_L_entry, the array d_L doesn't change.
I'm compiling via cmd:
nvcc -g -G -arch=sm_21 -o lcs.exe lcs.cu
When I run it, I get a runtime error: "Cuda error: invalid device function, line 94"
The runtime error you are receiving occurs because the CUDA runtime API cannot find or create code which can run on your GPU.
The underlying reason is that you are compiling your code for an architecture (compute capability 2.1) which is incompatible with your GPU. You have stated you have a GT 310M, which is a compute capability 1.2 device. The CUDA toolchain supports backwards code compatibility (i.e. old code will run on a newer device), but not the other way around.
You should build your code something like this:
nvcc -g -G -arch=sm_12 -o lcs.exe lcs.cu
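(A side note beyond the original answer: on toolkits old enough to still support compute capability 1.2 devices, you can also embed code for several architectures in a single binary by passing multiple -gencode options, for example:

nvcc -g -G -gencode arch=compute_12,code=sm_12 -gencode arch=compute_20,code=sm_21 -o lcs.exe lcs.cu

That way the same executable runs on both the GT 310M and a Fermi-class (sm_21) device.)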