Hi, I would like to convert Jos Stam's fluid code from the CPU to a GPU version. It's not really necessary to understand the whole code, so I will present just fragments; if someone is interested, everything (source code and description) can be found here:
http://www.dgp.toronto.edu/people/stam/reality/Research/pub.html => "Real-Time Fluid Dynamics for Games".
It is probably quite an easy task, but I haven't used C++ in a long time and am only just learning CUDA, so it is hard for me. I have been trying for a long time, with no results.
CPU version (works):
#define IX(i,j) ((i)+(N+2)*(j))
...
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
for (int i = 1; i <= N; i++)
{
for (int j = 1; j <= N; j++)
{
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
}
set_bnd(N, b, x);
}
}
my GPU version (doesn't compile):
#define IX(i,j) ((i)+(N+2)*(j))
__global__
void GPU_lin_solve(int *N, int *b, float * x, float * x0, float *a, float *c)
{
int i = threadIdx.x * blockIdx.x + threadIdx.x;
int j = threadIdx.y * blockIdx.y + threadIdx.y;
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
int *d_N, *d_b;
float **d_x, **d_x0;
float *d_a, *d_c, *d_xx, *d_xx0;
*d_xx = **d_x;
*d_xx0 = **d_x0;
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_b, sizeof(int));
cudaMalloc(&d_xx, sizeof(float));
cudaMalloc(&d_xx0, sizeof(float));
cudaMalloc(&d_a, sizeof(float));
cudaMalloc(&d_c, sizeof(float));
cudaMemcpy(d_N, &N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx, &*x, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx0, &*x0, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_a, &a, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, &c, sizeof(float), cudaMemcpyHostToDevice);
GPU_lin_solve<<<1, 1>>>(d_N, d_b, d_xx, d_xx0, d_a, d_c);
// the compiler reports the problem in the line above
// Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
cudaMemcpy(&*x, d_xx, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_N);
cudaFree(d_b);
cudaFree(d_xx);
cudaFree(d_xx0);
cudaFree(d_a);
cudaFree(d_c);
set_bnd(N, b, x);
}
}
The compiler is reporting an error:
Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
at the kernel launch
GPU_lin_solve<<<1, 1>>>(d_N, d_b, d_xx, d_xx0, d_a, d_c);
What I am doing wrong?
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
N in your condition and in the IX macro is a pointer, but you're treating it as though it's an integer.
Try dereferencing it? (Or, more simply, pass the scalars by value, as in the sketch below.)
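In case it helps, here is a minimal, untested sketch of one way the kernel and host loop could look once the scalars (N, b, a, c) are passed by value, so nothing needs dereferencing, and the full (N+2)*(N+2) arrays are copied instead of a single float. set_bnd is the existing CPU function from the original code, and the block/grid sizes are only placeholders:
#define IX(i,j) ((i)+(N+2)*(j))

__global__ void GPU_lin_solve(int N, int b, float *x, const float *x0, float a, float c)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x + 1; // skip the boundary layer
    int j = blockIdx.y * blockDim.y + threadIdx.y + 1;
    if (i <= N && j <= N)
        // note: unlike the serial version, neighbouring cells may be read
        // before or after they are updated within the same iteration
        x[IX(i, j)] = (x0[IX(i, j)] + a * (x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}

void lin_solve(int N, int b, float *x, float *x0, float a, float c)
{
    size_t bytes = (size_t)(N + 2) * (N + 2) * sizeof(float);
    float *d_x, *d_x0;
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_x0, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_x0, x0, bytes, cudaMemcpyHostToDevice);
    dim3 block(16, 16);                                    // placeholder block size
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    for (int k = 0; k < 20; k++) {
        GPU_lin_solve<<<grid, block>>>(N, b, d_x, d_x0, a, c);
        cudaMemcpy(x, d_x, bytes, cudaMemcpyDeviceToHost); // bring x back for set_bnd
        set_bnd(N, b, x);                                  // existing CPU boundary routine
        cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    }
    cudaFree(d_x);
    cudaFree(d_x0);
}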
Related
I have a parent kernel that calls a child kernel as shown below. How can I make sure that the child kernel has completed all of its threads' calculations before continuing with the parent's calculations?
I know that I can't use cudaDeviceSynchronize() inside the parent kernel, as it may result in problems. What can I do?
__device__ void child(double* A, double* B, int p)
{
int r;
r = blockIdx.x * blockDim.x + threadIdx.x;
B[p + r] += A[r];
}
__global__ void parent(double* A, double* B, double* C)
{
int i, r;
r = blockIdx.x * blockDim.x + threadIdx.x;
child<<<1, 5>>>(A, B, 7 * r);
for (i = 0; i < 5; i++)
C[r] += B[r * 7 + i];
}
int main()
{
parent<<<1, 2>>>(A, B, C);
}
According to Robert Crovella's comment, it seems the best solution for now is to refactor the code to avoid any unpredictable results, like this:
__global__ void child(double* A, double* B)
{
int r, c;
r = blockIdx.x * blockDim.x + threadIdx.x;
c = blockIdx.y * blockDim.y + threadIdx.y;
B[7 * r + c] += A[c];
}
__global__ void parent(double* A, double* B, double* C)
{
int i, r;
r = blockIdx.x * blockDim.x + threadIdx.x;
for (i = 0; i < 5; i++)
C[r] += B[r * 7 + i];
}
int main()
{
dim3 Grid(1, 1);
dim3 Block(2, 5);
child<<<Grid, Block>>>(A, B);
cudaDeviceSynchronize();
parent<<<1, 2>>>(A, B, C);
}
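The snippet leaves out how A, B and C are allocated. Purely as an illustration — the sizes below are guesses inferred from the indexing in the kernels (A[c] with c < 5, B[7 * r + c] with r < 2, C[r]) — the device buffers could be set up like this:
// Guessed sizes based on the indexing above: child uses A[c] with c in [0,5)
// and B[7*r + c] with r in [0,2); parent reads B[r*7 + i] and writes C[r].
double *A, *B, *C;
cudaMalloc(&A, 5 * sizeof(double));
cudaMalloc(&B, 2 * 7 * sizeof(double));
cudaMalloc(&C, 2 * sizeof(double));
cudaMemset(A, 0, 5 * sizeof(double));
cudaMemset(B, 0, 2 * 7 * sizeof(double));
cudaMemset(C, 0, 2 * sizeof(double));
// ... launch child and parent as in main() above, then copy the results back
// with cudaMemcpy and release the buffers with cudaFree(A); cudaFree(B); cudaFree(C);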
I have two vectors a and b. Each vector contains the coordinates of 3D points (x, y, z) as a Vector3f:
struct Vector3f
{
float x;
float y;
float z;
};
Vector a has a size of n = 5000 points and vector b has a size of m = 4000. I need to compute a tensor vector product between them, like on the right side of the picture. The resulting vector should have a length of 5000 * 4000 and contain floats, with the results stored in c.
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// int j = blockIdy.y * blockDim.y + threadIdx.y;
//check if the idx is out of range
if (i < n) {
for (int j = 0; j < m; j++) {
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
}
dim3 blockSize(32, 1, 1);
dim3 gridSize((n + blockSize.x - 1) / blockSize.x, 1, 1);
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I get a high execution time on the Volta architecture.
My question is: how can I optimize the kernel to reduce the execution time, which is mainly due to the for loop inside the kernel? I know that all the global reads and writes here are not coalesced.
You can make the kernel move through both a and b simultaneously, like this:
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < n && j < m)
{
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
dim3 blockSize(32, 32);
dim3 gridSize((int)ceil(n / 32.0), (int)ceil(m / 32.0));
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
Update
I tried to modify the code to use a single array, with and without shared memory; the version without shared memory was consistently 3 or 4 times faster.
With shared memory:
#define BLOCK_SIZE 32
void tensor3dProdcut(const int n, const int m, const float* a, const float* b, float* c)
{
float* d_a;
size_t size = (uint64_t)n * 3 * sizeof(float);
cudaMalloc(&d_a, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
float* d_b;
size = (uint64_t)m * 3 * sizeof(float);
cudaMalloc(&d_b, size);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
float* d_c;
size = (uint64_t)n * m * sizeof(float);
cudaMalloc(&d_c, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((int)ceil((double)n / BLOCK_SIZE), (int)ceil((double)m / BLOCK_SIZE));
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
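// Note: threads that return above never reach the __syncthreads() below, so if
// n or m is not a multiple of BLOCK_SIZE the block hits the barrier divergently.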
__shared__ double as[BLOCK_SIZE][3];
__shared__ double bs[BLOCK_SIZE][3];
for (i = 0; i < 3; i++)
{
as[row][i] = a[(BLOCK_SIZE * blockRow + row) * 3 + i];
bs[col][i] = b[(BLOCK_SIZE * blockCol + col) * 3 + i];
}
__syncthreads();
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += as[row][i] * bs[col][i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
Without shared memory:
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += a[(BLOCK_SIZE * blockRow + row) * 3 + i] * b[(BLOCK_SIZE * blockCol + col) * 3 + i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
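For reference, here is a small sketch (not from the original post) of how each kernel run can be timed with CUDA events for this kind of comparison, reusing the variable names from the host code above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed kernel time in milliseconds
printf("kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);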
I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if it is not zero, copy the row into another array B. How can I get the index into the entries of B without using a for loop? Please see the code below.
I would probably need to define b_ptr somehow to make it static (similar to what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filter flags. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
__syncthreads();
for (int i = 1; i < size_A; i <<= 1) {
int add = 0;
if (tid < size_A && tid >= i) {
add = filter[tid - i];
}
__syncthreads();
if (tid < size_A && tid >= i) {
filter[tid] += add;
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filter[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as is, filter needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download the dense parts of each portion from the device using size_B).
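To make that last suggestion concrete, here is a rough, untested sketch of the host-side gather for the multi-block case. It assumes size_B has become one int per block (d_size_B below), that each block wrote its surviving rows densely at the start of its own rows_per_block-row region of d_B, and that num_blocks, rows_per_block and h_B are whatever your launch used — all of these names are placeholders:
int *counts = (int *)malloc(num_blocks * sizeof(int));
cudaMemcpy(counts, d_size_B, num_blocks * sizeof(int), cudaMemcpyDeviceToHost);
int out_row = 0;
for (int b = 0; b < num_blocks; ++b) {
    // append block b's dense rows right after the rows gathered so far
    cudaMemcpy(h_B + (size_t)out_row * W,
               d_B + (size_t)b * rows_per_block * W,
               (size_t)counts[b] * W * sizeof(float),
               cudaMemcpyDeviceToHost);
    out_row += counts[b];
}
free(counts);
// out_row now holds the total number of filtered rows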
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
This problem is complicated and can't be done with CUDA in one step; you can't search for the desired rows and put them into array B hoping that they will end up in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you run a kernel that locates the zeros within the third column (whose index is 2, not 3, by the way) and marks those rows with a value of 1 in an array P. After that, a simple for loop counts these locations and stores them in another array Ind. Finally, a second kernel copies the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
Ind = new int[size_A];
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}
I'm currently struggling to work properly with 2D arrays within my CUDA kernel. 1D was fine, but so far I've had no luck moving on to 2D. Here are my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
Below is my example to test it. It prints out the first 10 values of C correctly, but all the others remain 0. I believe the problem is within the kernel, where it can't find the correct values due to the pitch, but I'm not sure how to solve it correctly.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes:
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, but we will keep this example simple.)
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version:
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
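As a side note, since each row here is only M = 10 doubles, pitched allocation is optional; a non-pitched variant (sketch only, add_d2D_flat is a made-up name) simply indexes with the logical width:
__global__ void add_d2D_flat(const double *x, const double *y, double *z, int n, int m)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < m)
    {
        auto idx = row * m + col;   // logical width, no pitch involved
        z[idx] = x[idx] + y[idx];
    }
}

// Host side: one linear allocation per matrix and plain cudaMemcpy, e.g.
// cudaMalloc(&a_d, N * M * sizeof(double));
// cudaMemcpy(a_d, a, N * M * sizeof(double), cudaMemcpyHostToDevice);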
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define SZ_INT sizeof(int)
#define CELL_SZ 1
#define CELL_VALUE(a,x) (((a) << 1) | x)
#define FROM(a) ((a) & 1)
#define LENGTH(a) ((a) >> 1)
#define INDEX(i,j,m) ((i) * (m + 1) + j)
//FROM: 1 if L[i][j] took value from L[i - 1][j], 0 if L[i][j] took value from L[i][j - 1]
#define CUDA_CHECK_ERROR(err) \
if (err != cudaSuccess) { \
printf("Cuda error: %s\n", cudaGetErrorString(err)); \
printf("Error in file: %s, line: %i\n", __FILE__, __LINE__); \
}
__global__ void Find_L_entry (int *L, int *A, int n, int *B, int m, int diag) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
int i = diag - j;
if (i >= 0 && i < n && j >= 0 && j < m) {
if (A[i] == B[j]) {
L[INDEX(i, j, m)] = CELL_VALUE(LENGTH(L[INDEX(i - 1, j - 1, m)]) + 1, 0);
} else {
L[INDEX(i, j, m)] = (LENGTH(L[INDEX(i - 1, j, m)]) > LENGTH(L[INDEX(i, j - 1, m)])) ?
CELL_VALUE(LENGTH(L[INDEX(i - 1, j, m)]), 1) :
CELL_VALUE(LENGTH(L[INDEX(i, j - 1, m)]), 0);
}
}
}
__host__ void output_sequence(int *L, int *A, int n, int *B, int m) {
int len = LENGTH(L[INDEX(n - 1, m - 1, m)]);
int i = n - 1, j = m - 1;
int *lcs = (int*) malloc(len * SZ_INT);
int top = 0;
while (i >= 0 && j >= 0) {
if (A[i] == B[j]) {
lcs[top++] = A[i];
i--; j--;
} else {
if (FROM(L[INDEX(i, j, m)]) == 1)
i--;
else
j--;
}
}
printf("Length: %d\nSequence: ", len);
for (int i = len - 1; i >= 0; i--) {
printf("%d%c", lcs[i], i ? ' ' : '\n');
}
free(lcs);
}
__host__ void read_sequence(int *&A, int &n, int num) {
printf("Enter number of elements in sequence %d\n", num);
scanf("%d", &n);
A = (int*) malloc(n * sizeof(int));
printf("Enter %d elements of sequence %d\n", n, num);
for (int i = 0; i < n; i++)
scanf("%d", A + i);
}
int main ( int argc, char **argv ) {
int number_of_blocks = atoi(argv[1]), threads_in_block = atoi(argv[2]);
int n, m;
int *A, *B;
read_sequence(A, n, 1);
read_sequence(B, m, 2);
int *d_A, *d_B;
cudaMalloc((void**)&d_A, n * SZ_INT);
cudaMalloc((void**)&d_B, m * SZ_INT);
CUDA_CHECK_ERROR(cudaMemcpy(d_A, A, n * SZ_INT, cudaMemcpyHostToDevice));
CUDA_CHECK_ERROR(cudaMemcpy(d_B, B, m * SZ_INT, cudaMemcpyHostToDevice));
int *big_L = (int*) malloc((n + 1) * (m + 1) * CELL_SZ * SZ_INT);
for (int i = 0; i < (n + 1) * (m + 1) * CELL_SZ; i++)
big_L[i] = 0;
int *L = &big_L[(m + 2) * CELL_SZ];
int *dev_L;
cudaMalloc((void**)&dev_L, (n + 1) * (m + 1) * SZ_INT);
int *d_L = &dev_L[(m + 2) * CELL_SZ];
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, (n * (m + 1) - 1) * SZ_INT, cudaMemcpyHostToDevice));
int diag_count = n + m - 1;
for (int diag = 0; diag < diag_count; diag++) {
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, SZ_INT, cudaMemcpyHostToDevice));
Find_L_entry<<<number_of_blocks, threads_in_block>>>(d_L, d_A, n, d_B, m, diag);
CUDA_CHECK_ERROR(cudaPeekAtLastError());
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
CUDA_CHECK_ERROR(cudaDeviceSynchronize());
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
printf("%d%c", L[INDEX(i,j,m)], j == m - 1 ? '\n' : ' ');
system("pause");
CUDA_CHECK_ERROR(cudaThreadSynchronize());
}
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
output_sequence(L, A, n, B, m);
cudaFree(d_L);
cudaFree(d_A);
cudaFree(d_B);
free(A); free(B); free(big_L);
return 0;
}
The code doesn't run properly. After calling the function Find_L_entry, the array d_L doesn't change.
I'm compiling via cmd.
nvcc -g -G -arch=sm_21 -o lcs.exe lcs.cu
When I run it, I get a runtime error: "Cuda error: invalid device function", reported at line 94.
The runtime error you are receiving occurs because the runtime API can neither find nor create code which can run on your GPU.
The underlying reason is that you are compiling your code for an architecture (compute capability 2.1) which is incompatible with your GPU. You have stated you have a GT 310M, which is a compute capability 1.2 device. The CUDA toolchain supports backwards code compatibility (i.e. old code will run on a new device), but not the other way around.
You should build your code something like this:
nvcc -g -G -arch=sm_12 -o lcs.exe lcs.cu
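If you are not sure which -arch value matches your GPU, a small standalone program like the sketch below (compile it with nvcc) prints the compute capability of device 0:
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}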