#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define SZ_INT sizeof(int)
#define CELL_SZ 1
#define CELL_VALUE(a,x) (((a) << 1) | x)
#define FROM(a) ((a) & 1)
#define LENGTH(a) ((a) >> 1)
#define INDEX(i,j,m) ((i) * (m + 1) + j)
//FROM: 1 if L[i][j] took value from L[i - 1][j], 0 if L[i][j] took value from L[i][j - 1]
#define CUDA_CHECK_ERROR(err) \
if (err != cudaSuccess) { \
printf("Cuda error: %s\n", cudaGetErrorString(err)); \
printf("Error in file: %s, line: %i\n", __FILE__, __LINE__); \
}
__global__ void Find_L_entry (int *L, int *A, int n, int *B, int m, int diag) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
int i = diag - j;
if (i >= 0 && i < n && j >= 0 && j < m) {
if (A[i] == B[j]) {
L[INDEX(i, j, m)] = CELL_VALUE(LENGTH(L[INDEX(i - 1, j - 1, m)]) + 1, 0);
} else {
L[INDEX(i, j, m)] = (LENGTH(L[INDEX(i - 1, j, m)]) > LENGTH(L[INDEX(i, j - 1, m)])) ?
CELL_VALUE(LENGTH(L[INDEX(i - 1, j, m)]), 1) :
CELL_VALUE(LENGTH(L[INDEX(i, j - 1, m)]), 0);
}
}
}
__host__ void output_sequence(int *L, int *A, int n, int *B, int m) {
int len = LENGTH(L[INDEX(n - 1, m - 1, m)]);
int i = n - 1, j = m - 1;
int *lcs = (int*) malloc(len * SZ_INT);
int top = 0;
while (i >= 0 && j >= 0) {
if (A[i] == B[j]) {
lcs[top++] = A[i];
i--; j--;
} else {
if (FROM(L[INDEX(i, j, m)]) == 1)
i--;
else
j--;
}
}
printf("Length: %d\nSequence: ", len);
for (int i = len - 1; i >= 0; i--) {
printf("%d%c", lcs[i], i ? ' ' : '\n');
}
free(lcs);
}
__host__ void read_sequence(int *&A, int &n, int num) {
printf("Enter number of elements in sequence %d\n", num);
scanf("%d", &n);
A = (int*) malloc(n * sizeof(int));
printf("Enter %d elements of sequence %d\n", n, num);
for (int i = 0; i < n; i++)
scanf("%d", A + i);
}
int main ( int argc, char **argv ) {
int number_of_blocks = atoi(argv[1]), threads_in_block = atoi(argv[2]);
int n, m;
int *A, *B;
read_sequence(A, n, 1);
read_sequence(B, m, 2);
int *d_A, *d_B;
cudaMalloc((void**)&d_A, n * SZ_INT);
cudaMalloc((void**)&d_B, m * SZ_INT);
CUDA_CHECK_ERROR(cudaMemcpy(d_A, A, n * SZ_INT, cudaMemcpyHostToDevice));
CUDA_CHECK_ERROR(cudaMemcpy(d_B, B, m * SZ_INT, cudaMemcpyHostToDevice));
int *big_L = (int*) malloc((n + 1) * (m + 1) * CELL_SZ * SZ_INT);
for (int i = 0; i < (n + 1) * (m + 1) * CELL_SZ; i++)
big_L[i] = 0;
int *L = &big_L[(m + 2) * CELL_SZ];
int *dev_L;
cudaMalloc((void**)&dev_L, (n + 1) * (m + 1) * SZ_INT);
int *d_L = &dev_L[(m + 2) * CELL_SZ];
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, (n * (m + 1) - 1) * SZ_INT, cudaMemcpyHostToDevice));
int diag_count = n + m - 1;
for (int diag = 0; diag < diag_count; diag++) {
CUDA_CHECK_ERROR(cudaMemcpy(d_L, L, SZ_INT, cudaMemcpyHostToDevice));
Find_L_entry<<<number_of_blocks, threads_in_block>>>(d_L, d_A, n, d_B, m, diag);
CUDA_CHECK_ERROR(cudaPeekAtLastError());
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
CUDA_CHECK_ERROR(cudaDeviceSynchronize());
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
printf("%d%c", L[INDEX(i,j,m)], j == m - 1 ? '\n' : ' ');
system("pause");
CUDA_CHECK_ERROR(cudaThreadSynchronize());
}
CUDA_CHECK_ERROR(cudaMemcpy(L, d_L, (n * (m + 1) - 1) * CELL_SZ * SZ_INT, cudaMemcpyDeviceToHost));
output_sequence(L, A, n, B, m);
cudaFree(d_L);
cudaFree(d_A);
cudaFree(d_B);
free(A); free(B); free(big_L);
return 0;
}
The code doesn't run properly. After calling funciton Find_L_entry aray d_L doesn't changes.
I'm compiling via cmd.
nvcc -g -G -arch=sm_21 -o lcs.exe lcs.cu
When I run it, I get a runtime error: "Cuda error: invalid device in function, line 94"
The runtime error you are receiving is occurring because the runtime API cannot either find or create code which can run on your GPU.
The underlying reason is that you are compiling your code for an architecture (compute capability 2.1) which is incompatible with your GPU. You have stated you have a GT310M, which you can see from here is a compute capability 1.2 device. The CUDA tool chain supports backwards code compatibility (ie old code will run on a new device), but not the other way around.
You should build your code something like this:
nvcc -g -G -arch=sm_12 -o lcs.exe lcs.cu
Related
So my code is suppsed to work like this:
-take in_martix of NxN elements and R factor
-it should give back a matrix of size [N-2R]*[N-2R] with each element being a sum of in_matrix elements in R radius it should work like this for N=4 R=1
Even though my code works for smaller matrixes, for bigger ones like 1024 or 2048 or even bigger R factors it gives back a matrix of 0's. Is it a problem inside my code or just my GPU can't compute more calculations ?
Code: (for testing purposes initial matrix is filled with 1's so every element of out_matrix should == (2R+1)^2
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;
__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int out_local = 0;
for (int col = tx; col <= tx + 2 * d_R ; col++)
for (int row = ty; row <= ty + 2 * d_R ; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
void random_ints(int tab[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
tab[i][j] = 1;
}
int main() {
static int tab[N][N];
random_ints(tab);
int tab_size = sizeof(int) * N * N;
int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);
dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE )), ceil((float)N / (float)(THREAD_BLOCK_SIZE )));
void** d_tab;
void** d_out;
cudaMalloc((void**)&d_tab, tab_size);
cudaMalloc((void**)&d_out, out_size);
cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);
int* d_tab_begin = (int*)(d_tab);
int* d_out_begin = (int*)(d_out);
MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);
int* out = (int*)malloc(out_size);
cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int col = 0; col < N - 2 * R; col++)
{
for (int row = 0; row < N - 2 * R; row++)
{
cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
}
cout << endl;
}
}
Finally thanks to Robert I found how to make the code work - by adding if statment
if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
for (int col = tx; col <= tx + 2 * d_R; col++)
for (int row = ty; row <= ty + 2 * d_R; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
I'm trying to implement Floyd Warshall algorithm using cuda but I'm having syncrhornization problem.
This is my code:
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
This is my initial variables:
#define GRAPH_SIZE 2000
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16 // Each block have 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE
This is how I'm generating the graph:
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 40;
if (r > 20) {
r = INF;
}
D(i, j) = r;
}
}
}
}
When I set GRAPH_SIZE to a smaller number like 100 the result is incorrect.
I have written the algorithm sequentially on the cpu like the code bellow:
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
And I run and test it like this:
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
}
else {
fprintf(stderr, "SUCCESS!\n");
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
Can anyone give me an ideia how to solve this?
The main problem I could find seems to be that your grid sizing is not done correctly.
With N=2000 and thread block side dimension of 16, that happens to be whole-number divisible. But if you reduce N to 100, it is not.
We can fix that by "rounding up" your grid dimensions:
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
And adding a thread-check to your kernel:
if ((i < graph_size) && (j < graph_size))
Here's a modified code that seems to run correctly for me:
$ cat t92.cu
#include <cstdio>
#include <cassert>
#define GRAPH_SIZE 100
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
#define HANDLE_ERROR(x) x
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if ((i < graph_size) && (j < graph_size))
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 1000;
if (r > 20) {
D(i, j) = INF;
}
else
D(i, j) = r+10;
}
}
}
}
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
int qq = 0;
for (int i = 0; i < GRAPH_SIZE*GRAPH_SIZE; i++)
if (output_cpu[i] != output_gpu[i]) {qq++; printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);}
printf("# mismatches: %d\n", qq);
}
else {
fprintf(stderr, "SUCCESS!\n");
// for (int i = 0; i < 100; i++)
// printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
$ nvcc -o t92 t92.cu
$ vi t92.cu
$ cuda-memcheck ./t92
========= CUDA-MEMCHECK
SUCCESS!
========= ERROR SUMMARY: 0 errors
$
(I've modified your test case slightly as it was producing an output matrix that was mostly zero. )
I'm new to MPI; so forgive me if my code is clumsy. I want to convert a n*n matrix to upper triangular form and then calculate its determinant. My code works with one processor, but with more than one processor, it doesn't work. I am hoping someone can give me some advice about it.
My code:
#include <stdio.h>
#include <string.h> /* For strlen */
#include <mpi.h> /* For MPI functions, etc */
#include<stdlib.h> /* for rand */
#include<math.h>
#define n 10
#define N 100
int main(void)
{
srand(0);
double A[N];
double temp[N];
int i;
int j;
int k;
int m=100;
int s=1;
int y;
double z;
double det=1;
int pid;
int np;
int share=0;
for (i = 0; i < N; i++)
{
A[i] = rand();
}
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &np);
if (np == 1)
{
for (i = 0; i < n; i++)
{
if (A[i * n + i] == 0)
{
s = 0;
for (j = i + 1; j < n; j++)
{
if (A[j * n + i] != 0)
{
for (k = i; k < n; k++)
{
A[i * n + k] = A[i * n + k] + A[j * n + k];
}
s = 1;
}
if (s == 1)
break;
}
}
if (s != 1)
{
det = 0;
break;
}
z = A[i * n + i];
det = det * z;
for (k = i; k < n; k++)
{
A[i * n + k] = A[i * n + k] / z;
}
for (j = i + 1; j < n; j++)
{
z = A[j * n + i];
for (k = i; k < n; k++)
{
A[j * n + k] = A[j * n + k] - z * A[i * n + k];
}
}
}
for (i = 0; i < N; i++)
{
printf("element %d of matrix is %G \n", i, A[i]);
}
printf("det is %G", det);
}
else
{
for (i = 0; i < n ; i++)
{
if (A[i * n + i] == 0)
{
s = 0;
for (j = i + 1; j < n; j++)
{
if (A[j * n + i] != 0)
{
for (k = i; k < n; k++)
{
A[i * n + k] = A[i * n + k] + A[j * n + k];
}
s = 1;
}
if (s == 1)
break;
}
}
if (s != 1)
{
det = 0;
break;
}
z = A[i * n + i];
det = det * z;
for (k = i; k < n; k++)
{
A[i * n + k] = A[i * n + k] / z;
}
m = n - i;
if (m > 1)
{
if (np >= m)
{
if (pid == 0)
{
for (k = 1; k < m; k++)
{
MPI_Send(&m, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&A[(k + i) * n + i], m, MPI_DOUBLE, k, 0, MPI_COMM_WORLD);
}
for (k = 1; k < m; k++)
{
MPI_Recv(&temp[0], m + 1, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (j = 0; j < m; j++)
{
A[(int(temp[m]) + i) * n + i + j] = temp[j];
}
}
printf("det is %G", det);
}
else
{
double local_A[N];
double local_z;
int local_m;
MPI_Recv(&local_m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_A[0], local_m, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
local_z = local_A[0];
for (k = 0; k < m; k++)
{
local_A[k] = local_A[k] - (local_z * A[(i * n) + i + k]);
}
local_A[k + 1] = pid;
MPI_Send(&local_A[0], local_m + 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
}
}
if (np < m)
{
if (m % np == 0)
{
if (pid == 0)
{
share = m / np;
for (k = 1; k < np; k++)
{
MPI_Send(&m, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&share, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&A[share * k * n], share * n, MPI_DOUBLE, k, 0, MPI_COMM_WORLD);
}
for (k = 1; k < share; k++)
{
z = A[(k + i) * n + i];
for (j = 0; j < m; j++)
{
A[(k + i) * n + i + j] = A[(k + i) * n + i + j] - z;
}
}
for (k = 1; k < np; k++)
{
MPI_Recv(&temp[0], share * m + 1, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (j = 0; j < share * m; j++)
{
A[(int(temp[share * m]) + i) * n * share + i + j] = temp[j];
}
}
printf("det is %G", det);
}
else
{
double local_A[N];
double local_z;
int local_share;
int local_m;
MPI_Recv(&local_m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_share, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_A[0], local_share * n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (k = 0; k < local_share; k++)
{
local_z = local_A[(k + i) * n + i];
for (j = 0; j < n; j++)
{
local_A[k * n + j] = local_A[(k + i) * n + i + j] - local_z;
}
}
local_A[local_share * local_m] = pid;
MPI_Send(&local_A[0], share * local_m + 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
}
}
if (m % np != 0)
{
if (pid == 0)
{
share = m / np;
for (k = 1; k < np; k++)
{
MPI_Send(&m, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&share, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&A[share * k * n], share * n, MPI_DOUBLE, k, 0, MPI_COMM_WORLD);
}
for (k = 1; k < share; k++)
{
z = A[(k + i) * n + i];
for (j = 0; j < m; j++)
{
A[(k + i) * n + i + j] = A[(k + i) * n + i + j] - z;
}
}
for (k = 1; k < np; k++)
{
MPI_Recv(&temp[0], share * m + 1, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (j = 0; j < share * m; j++)
{
A[(int(temp[share * m]) + i) * n * share + i + j] = temp[j];
}
}
for (k = 1; k < (m % np) + 1; k++)
{
y = (share * np) + k;
MPI_Send(&y, 1, MPI_INT, k, 0, MPI_COMM_WORLD);
MPI_Send(&A[(share * np + i + k) * n + i], m, MPI_DOUBLE, k, 0, MPI_COMM_WORLD);
}
for (k = 1; k < m; k++)
{
MPI_Recv(&temp[0], m + 1, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (j = 0; j < n; j++)
{
A[int(temp[n]) * n + i + j] = temp[j];
}
}
printf("det is %G", det);
}
else
{
double local_A[N];
double local_z;
double local_y;
int local_share;
int local_m;
MPI_Recv(&local_m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_share, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_A[0], local_share * n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (k = 0; k < local_share; k++)
{
local_z = local_A[(k + i) * n + i];
for (j = 0; j < n; j++)
{
local_A[k * n + j] = local_A[(k + i) * n + i + j] - local_z;
}
}
local_A[local_share * local_m] = pid;
MPI_Send(&local_A[0], local_share * local_m + 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
MPI_Recv(&local_y, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&local_A[0], m, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
local_z = local_A[0];
for (j = 0; j < local_m; j++)
{
local_A[j] = local_A[j] - local_z;
}
local_A[j + 1] = local_y;
MPI_Send(&local_A[0], m + 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
}
}
}
}
}
}
MPI_Finalize();
}
Even with two or treee processor my code doesn't work, and I don't know why it doesn't work.
I'm currently struggling to properly work with 2D arrays within my CUDA kernel. 1D was fine but so far had no luck with it moving on to 2D. Here is my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
And below my example to test it. It prints out the first 10 values of C correctly but all others remain 0. I believe the problem is within the kernel. Where it can't find the correct values due to the pitch, but not sure how to solve it correctly though.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, be we will keep this example simple).
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
Hi I would like to modify Steam Code from CPU to GPU version. It's not really necessary to understand whole code. So, I will present just fragments if someone is interested, everything (source code and description) can find here:
http://www.dgp.toronto.edu/people/stam/reality/Research/pub.html => "Real-Time Fluid Dynamics for Games".
It is probably quite easy task. But I long time no used C++ and just studying CUDA, so it's hard for me. Trying from long time, but no effects.
CPU version (works):
#define IX(i,j) ((i)+(N+2)*(j))
...
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
for (int i = 1; i <= N; i++)
{
for (int j = 1; j <= N; j++)
{
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
}
set_bnd(N, b, x);
}
}
my GPU version (doesn't compile):
#define IX(i,j) ((i)+(N+2)*(j))
__global__
void GPU_lin_solve(int *N, int *b, float * x, float * x0, float *a, float *c)
{
int i = threadIdx.x * blockIdx.x + threadIdx.x;
int j = threadIdx.y * blockIdx.y + threadIdx.y;
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
int *d_N, *d_b;
float **d_x, **d_x0;
float *d_a, *d_c, *d_xx, *d_xx0;
*d_xx = **d_x;
*d_xx0 = **d_x0;
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_b, sizeof(int));
cudaMalloc(&d_xx, sizeof(float));
cudaMalloc(&d_xx0, sizeof(float));
cudaMalloc(&d_a, sizeof(float));
cudaMalloc(&d_c, sizeof(float));
cudaMemcpy(d_N, &N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx, &*x, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx0, &*x0, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_a, &a, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, &c, sizeof(float), cudaMemcpyHostToDevice);
GPU_lin_solve << <1, 1 >> > (d_N, d_b, d_xx, d_xx0, d_a, d_c);
// compilator showing problem in the line above
// Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
cudaMemcpy(&*x, d_xx, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_N);
cudaFree(d_b);
cudaFree(d_xx);
cudaFree(d_xx0);
cudaFree(d_a);
cudaFree(d_c);
set_bnd(N, b, x);
}
}
The compiler is reporting an error:
Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
at the kernel launch
GPU_lin_solve << <1, 1 >> > (d_N, d_b, d_xx, d_xx0, d_a, d_c);
What I am doing wrong?
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
N in your condition and macro is a pointer, you're treating as though it's an integer.
Try dereferencing it?