I am trying to convolve an image using CUDA, but I cannot get a result. cuda-gdb does not work properly on my system so I cannot tell what is happening inside the CUDA kernel. The CUDA kernel I am using is the following:
__global__
void
convolve_component_EXTEND_kern(const JSAMPLE *data, // image data
ssize_t data_width, // image width
ssize_t data_height, // image height
const float *kern, // convolution kernel data
ssize_t kern_w_f, // convolution kernel has a width of 2 * kern_w_f + 1
ssize_t kern_h_f, // convolution_kernel has a height of 2 * kern_h_f + 1
JSAMPLE *res) // array to store the result
{
ssize_t i = ::blockIdx.x * ::blockDim.x + ::threadIdx.x;
ssize_t j = ::blockIdx.y * ::blockDim.y + ::threadIdx.y;
float value = 0;
for (ssize_t m = 0; m < 2 * kern_w_f + 1; m++) {
for (ssize_t n = 0; n < 2 * kern_h_f + 1; n++) {
ssize_t x = i + m - kern_w_f; // column index for this contribution to convolution sum for (i, j)
ssize_t y = j + n - kern_h_f; // row index for ...
x = x < 0 ? 0 : (x >= data_width ? data_width - 1 : x);
y = y < 0 ? 0 : (y >= data_height ? data_height - 1 : y);
value += ((float) data[data_width * y + x]) * kern[(2 * kern_w_f + 1) * n + m];
}
}
res[data_width * j + i] = (JSAMPLE) value;
}
and I am invoking it in this function
void
convolve_component_EXTEND_cuda(const JSAMPLE *data,
ssize_t data_width,
ssize_t data_height,
const float *kern,
ssize_t kern_w_f,
ssize_t kern_h_f,
JSAMPLE *res)
{
JSAMPLE *d_data;
cudaMallocManaged(&d_data,
data_width * data_height * sizeof(JSAMPLE));
cudaMemcpy(d_data,
data,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyHostToDevice);
float *d_kern;
cudaMallocManaged(&d_kern,
(2 * kern_w_f + 1) * (2 * kern_h_f + 1) * sizeof(float));
cudaMemcpy(d_kern,
kern,
(2 * kern_w_f + 1) * (2 * kern_h_f + 1) * sizeof(float),
cudaMemcpyHostToDevice);
JSAMPLE *d_res;
cudaMallocManaged(&d_res,
data_width * data_height * sizeof(JSAMPLE));
dim3 threadsPerBlock(16, 16); // can be adjusted to 32, 32 (1024 threads per block is the maximum)
dim3 numBlocks(data_width / threadsPerBlock.x,
data_height / threadsPerBlock.y);
convolve_component_EXTEND_kern<<<numBlocks, threadsPerBlock>>>(d_data,
data_width,
data_height,
d_kern,
kern_w_f,
kern_h_f,
d_res);
cudaDeviceSynchronize();
cudaMemcpy(d_res,
res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_kern);
cudaFree(d_res);
}
In this context, the image data is contained in the array called data in such a way that the pixel at (i, j) is accessed by indexing into the array at data_width * j + i. the kernel data is in the array called kern, and it has a width of 2 * kern_w_f + 1 and a height of 2 * kern_h_f + 1. The element at (i, j) is accessed by indexing into the kern array at (2 * w_f + 1) * j + i, just like the data array. The array res is used to store the result of the convolution, and is allocated using calloc() before being passed to the function.
When I invoke the second function on an image's data, all the image's pixels are converted to 0 instead of the convolution being applied. Can anyone please point out the problem?
Just after calling the kernel, and performing the convolution you try to copy your data back to the res array.
cudaDeviceSynchronize();
cudaMemcpy(d_res,
res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
this should be
cudaDeviceSynchronize();
cudaMemcpy(res,
d_res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
as the first argument of cudaMemcpy is the destination-pointer.
cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
Related
I have two vectors a and b. Each vector contains the coordinates of a 3d points x, y, z vector3f.
struct Vector3f
{
float x;
float y;
float z;
}
vector a has a size of n = 5000 points and vector b has a size of m = 4000. I need to do a tensor vector product between them like on the right side of the picture. the resulted vector should have a length size of 5000 * 4000 and contain float point where results are stored at c.
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// int j = blockIdy.y * blockDim.y + threadIdx.y;
//check if the idx is out of range
if (i < n) {
for (int j = 0; j < m; j++) {
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
}
dim3 blockSize(32, 1, 1);
dim3 gridSize((n + blockSize.x - 1) / blockSize.x, 1, 1);
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I get high execution time on Volta arch which is a lot.
My question is how can I optimize the kernel to reduce time which is mainly because of the for loop inside the kernel. I know here that all global reads and writes are not coalesced.
You can make the kernel move through both a and b simultaneously, like this:
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdy.y * blockDim.y + threadIdx.y;
if (i < n && j < m)
{
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
dim3 blockSize(32, 32);
dim3 gridSize((int)ceil(n / 32.0), (int)ceil(m / 32.0));
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
Update
I tried to modify the code to use a single array with and without shared memory, the code without shared memory with always faster 3 or 4 times.
With shared memory:
#define BLOCK_SIZE 32
void tensor3dProdcut(const int n, const int m, const float* a, const float* b, float* c)
{
float* d_a;
size_t size = (uint64_t)n * 3 * sizeof(float);
cudaMalloc(&d_a, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
float* d_b;
size = (uint64_t)m * 3 * sizeof(float);
cudaMalloc(&d_b, size);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
float* d_c;
size = (uint64_t)n * m * sizeof(float);
cudaMalloc(&d_c, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((int)ceil((double)n / BLOCK_SIZE), (int)ceil((double)m / BLOCK_SIZE));
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
__shared__ double as[BLOCK_SIZE][3];
__shared__ double bs[BLOCK_SIZE][3];
for (i = 0; i < 3; i++)
{
as[row][i] = a[(BLOCK_SIZE * blockRow + row) * 3 + i];
bs[col][i] = b[(BLOCK_SIZE * blockCol + col) * 3 + i];
}
__syncthreads();
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += as[row][i] * bs[col][i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
Without shared memory:
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += a[(BLOCK_SIZE * blockRow + row) * 3 + i] * b[(BLOCK_SIZE * blockCol + col) * 3 + i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if that is not zero, copy the row into another array B. Can I have the index to the entries of B without using a for loop, please see the below code?
I probably would need to define b_ptr somehow to make it static (similar to the what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as is, filtered needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download from device the dense parts from each portion using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
This problem is complicated and can't be done with CUDA in one step, you can't search for the desired rows and put them in array B hoping that they will be in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you will run a kernel that will locate the zeros within the third column, whose index is 2 not 3 by the way, then mark these rows with value of 1 in an array P. After that, a simple for loop will count these locations and store them in another array Ind. Finally, a second kernel will copy the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}
I meet a problem when i used constant memory. It will happen the error:
ERROR: an illegal memory access was encountered
It seem the kernel function doesn't execute.
But if I don't chose the constant memory, everything are ok. So it makes me so confused. I had thought very long time. But I still don't the reason. Can you help me to solve the problem? Thank you very much.
If the variable s is not used constant memory, everything are ok. But if the s is used constant memory, the program will break.
the variable that used constant memory define as followed:
#ifdef USE_CONST_MEM
__constant__ Sphere s[SPHERES];
#else
Sphere *s;
#endif
the kernel function defined as followed:
#ifdef USE_CONST_MEM
__global__ void kernel(unsigned char *ptr) {
printf("ok2");
#else
__global__ void kernel(Sphere *s, unsigned char *ptr) {
#endif
// map from threadIdx/BlockIdx to pixel position
printf("ok2");
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
REAL ox = (x - DIM / 2);
REAL oy = (y - DIM / 2);
REAL r = 0, g = 0, b = 0;
REAL maxz = -INF;
__syncthreads();
for (int i = 0; i<SPHERES; i++) {
REAL n;
REAL t = s[i].hit(ox, oy, &n);
if (t > maxz) {
REAL fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
printf("r: %.2f g: %.2f, b %.2f\n", r, g, b);
}
}
__syncthreads();
ptr[offset * 4 + 0] = (int)(r * 255);
ptr[offset * 4 + 1] = (int)(g * 255);
ptr[offset * 4 + 2] = (int)(b * 255);
ptr[offset * 4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
CPUAnimBitmap *bitmap;
};
there is the function that call the kernel function.
void generate_frame(DataBlock *d, int ticks) {
//START_GPU
//movin the spheres
kernelMoving << <128, 32 >> >(s, SPHERES);
printf("ok0\n");
// generate a bitmap from our sphere data
dim3 grids(DIM / 16, DIM / 16);
dim3 threads(16, 16);
#ifdef USE_CONST_MEM
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
printf("ok0-1\n");
kernel << <grids, threads >> >(s, d->dev_bitmap);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
if(error!=cudaSuccess)
{
fprintf(stderr,"ERROR: %s\n", cudaGetErrorString(error) );
exit(-1);
}
printf("ok0-1-1\n");
#else
printf("ok0-2\n");
kernel << <grids, threads >> >(s, d->dev_bitmap);
#endif
printf("ok1\n");
//END_GPU
HANDLE_ERROR(cudaMemcpy(d->bitmap->get_ptr(),
d->dev_bitmap,
d->bitmap->image_size(),
cudaMemcpyDeviceToHost));
}
the initialzation code as followed:
#ifdef USE_CONST_MEM
#else
HANDLE_ERROR(cudaMalloc((void**)&s,
sizeof(Sphere) * SPHERES));
#endif
// allocate temp memory, initialize it, copy to constant
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
for (int i = 0; i<SPHERES; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(10.0f) + 5;
temp_s[i].dx = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
temp_s[i].dy = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
temp_s[i].dz = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
}
#ifdef USE_CONST_MEM
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s,
sizeof(Sphere) * SPHERES));
#else
HANDLE_ERROR(cudaMemcpy(s, temp_s, sizeof(Sphere)*SPHERES, cudaMemcpyHostToDevice));
#endif
free(temp_s);
the version of cuda is 8.0. the system is ubuntu 16.04.
Yeah, I know where I am wrong. When I used constant memory, I also try to change it's value in the function kernel_moving that try to modify the constant value. So the program will break. Now, I change to this, it works.
#ifdef USE_CONST_MEM
//printf("the number of SPHERES is %d\n", SPHERES);
Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
HANDLE_ERROR(cudaMemcpyFromSymbol(temp_s, s, sizeof(Sphere) * SPHERES,0, cudaMemcpyDeviceToHost));
Sphere* dev_temp_s;
cudaMalloc((void**)&dev_temp_s, sizeof(Sphere) * SPHERES);
cudaMemcpy(dev_temp_s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice);
kernelMoving << <128, 32 >> >(dev_temp_s, SPHERES);
cudaMemcpy(temp_s, dev_temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyDeviceToHost);
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere) * SPHERES));
free(temp_s);
cudaFree(dev_temp_s);
#else
kernelMoving << <128, 32 >> >(s, SPHERES);
#endif
Hi I would like to modify Steam Code from CPU to GPU version. It's not really necessary to understand whole code. So, I will present just fragments if someone is interested, everything (source code and description) can find here:
http://www.dgp.toronto.edu/people/stam/reality/Research/pub.html => "Real-Time Fluid Dynamics for Games".
It is probably quite easy task. But I long time no used C++ and just studying CUDA, so it's hard for me. Trying from long time, but no effects.
CPU version (works):
#define IX(i,j) ((i)+(N+2)*(j))
...
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
for (int i = 1; i <= N; i++)
{
for (int j = 1; j <= N; j++)
{
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
}
set_bnd(N, b, x);
}
}
my GPU version (doesn't compile):
#define IX(i,j) ((i)+(N+2)*(j))
__global__
void GPU_lin_solve(int *N, int *b, float * x, float * x0, float *a, float *c)
{
int i = threadIdx.x * blockIdx.x + threadIdx.x;
int j = threadIdx.y * blockIdx.y + threadIdx.y;
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
void lin_solve(int N, int b, float * x, float * x0, float a, float c)
{
for (int k = 0; k<20; k++)
{
int *d_N, *d_b;
float **d_x, **d_x0;
float *d_a, *d_c, *d_xx, *d_xx0;
*d_xx = **d_x;
*d_xx0 = **d_x0;
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_b, sizeof(int));
cudaMalloc(&d_xx, sizeof(float));
cudaMalloc(&d_xx0, sizeof(float));
cudaMalloc(&d_a, sizeof(float));
cudaMalloc(&d_c, sizeof(float));
cudaMemcpy(d_N, &N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx, &*x, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_xx0, &*x0, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_a, &a, sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, &c, sizeof(float), cudaMemcpyHostToDevice);
GPU_lin_solve << <1, 1 >> > (d_N, d_b, d_xx, d_xx0, d_a, d_c);
// compilator showing problem in the line above
// Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
cudaMemcpy(&*x, d_xx, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_N);
cudaFree(d_b);
cudaFree(d_xx);
cudaFree(d_xx0);
cudaFree(d_a);
cudaFree(d_c);
set_bnd(N, b, x);
}
}
The compiler is reporting an error:
Error 23 error : argument of type "int *" is incompatible with parameter of type "int"
at the kernel launch
GPU_lin_solve << <1, 1 >> > (d_N, d_b, d_xx, d_xx0, d_a, d_c);
What I am doing wrong?
if (i < N && j < N)
x[IX(i, j)] = (x0[IX(i, j)] + a*(x[IX(i - 1, j)] + x[IX(i + 1, j)] + x[IX(i, j - 1)] + x[IX(i, j + 1)])) / c;
}
N in your condition and macro is a pointer, you're treating as though it's an integer.
Try dereferencing it?
In this example, I am trying to create an 10x8 array using values from a 10x9 array. It looks like I am accessing memory incorrectly but I am not sure where my error is.
The code in C++ would be something like
for (int h = 0; h < height; h++){
for (int i = 0; i < (width-2); i++)
dd[h*(width-2)+i] = hi[h*(width-1)+i] + hi[h*(width-1)+i+1];
}
This is what I am trying in CUDA:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#define TILE_WIDTH 4
using namespace std;
__global__ void cudaOffsetArray(int height, int width, float *HI, float *DD){
int x = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int y = blockIdx.y * blockDim.y + threadIdx.y; // Row // height
int grid_width = gridDim.x * blockDim.x;
//int index = y * grid_width + x;
if ((x < (width - 2)) && (y < (height)))
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
}
int main(){
int height = 10;
int width = 10;
float *HI = new float [height * (width - 1)];
for (int i = 0; i < height; i++){
for (int j = 0; j < (width - 1); j++)
HI[i * (width - 1) + j] = 1;
}
float *gpu_HI;
float *gpu_DD;
cudaMalloc((void **)&gpu_HI, (height * (width - 1) * sizeof(float)));
cudaMalloc((void **)&gpu_DD, (height * (width - 2) * sizeof(float)));
cudaMemcpy(gpu_HI, HI, (height * (width - 1) * sizeof(float)), cudaMemcpyHostToDevice);
dim3 dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
cudaOffsetArray<<<dimGrid,dimBlock>>>(height, width, gpu_HI, gpu_DD);
float *result = new float[height * (width - 2)];
cudaMemcpy(result, gpu_DD, (height * (width - 2) * sizeof(float)), cudaMemcpyDeviceToHost);
for (int i = 0; i < height; i++){
for (int j = 0; j < (width - 2); j++)
cout << result[i * (width - 2) + j] << " ";
cout << endl;
}
cudaFree(gpu_HI);
cudaFree(gpu_DD);
delete[] result;
delete[] HI;
system("pause");
}
I've also tried this in the global function:
if ((x < (width - 2)) && (y < (height)))
DD[y * (grid_width - 2) + (blockIdx.x - 2) * blockDim.x + threadIdx.x] =
(HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x] +
HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x + 1]);
To "fix" your code, change each use of grid_width to width in this line in your kernel:
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
Like this:
DD[y * (width - 2) + x] = (HI[y * (width - 1) + x] + HI[y * (width - 1) + x + 1]);
Explanation:
Your grid_width:
dim3 dimGrid((width * 2 - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
doesn't actually correspond to your array size (10x10, or 10x9, or 10x8). I"m not sure why you're launching 2*width threads in the x dimension, but this means that your thread array is considerably larger than your data array.
So when you use grid_width in the kernel:
DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
the indexing will be a problem. If you instead change each instance of grid_width above to just width (which corresponds to the actual width of your data array) you'll get better indexing, I think. Normally it's not an issue to launch "extra threads" because you have a thread check line in your kernel:
if ((x < (width - 2)) && (y < (height)))
but when you launch extra threads, it is making your grid larger, and so you can't use grid dimensions to index properly into your data array.