Since copying data to the GPU is known to be slow, I was wondering what specifically "counts" as passing data to the GPU.
__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number
    extern float* a;   // float* of [size] allocated on the GPU
    extern float* b;   // float* of [size] allocated on the GPU
    extern float* c;   // float* of [size] allocated on the GPU

    for (int i = 0; i < reps; ++i)
        add_kernel<<<blocks, threads>>>(a, b, c, size);
}
Does something such as passing size to the kernel incur (significant) overhead? Or does "data transfer" refer more specifically to copying large arrays from the heap to the GPU?
I.e., would this variant be (much) faster:
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number
    extern float* a;   // float* of [size] allocated on the GPU
    extern float* b;   // float* of [size] allocated on the GPU
    extern float* c;   // float* of [size] allocated on the GPU

    add_kernel<<<blocks, threads>>>(a, b, c, size, reps);
}
I.e. (again), in "ideal" CUDA programs, should programmers attempt to write the large majority of the computation purely inside CUDA kernels, or should they write CUDA kernels that are repeatedly called from the CPU (in the case that passing values from the stack does not incur significant overhead)?
Edited for clarity.
Everything counts. In order to run a kernel, the CPU needs to tell the GPU somehow which kernel to call and with which parameters. At the "micro" level, if your kernel only performs a few operations, these costs are considerable. In real life, if your kernels do a lot of work, they are negligible.
The overhead can also be relatively large if such small operations are not pipelined. You can see this in NVIDIA's Visual Profiler. I don't remember the exact numbers, but the order of magnitude is as follows. Bandwidth between CPU and GPU can be around 1 GB/s, i.e. roughly 1 byte per nanosecond. But actually sending a 4-byte packet and getting an acknowledgement takes something like 1 microsecond, so sending 10,000 bytes takes something like 11 microseconds.
Also, execution is optimized for massively parallel work on the GPU, so executing 10 consecutive operations with a single 32-thread warp can take something like 200 GPU clock cycles (about 0.2 microseconds). Add, say, 0.5 microseconds for sending the kernel-launch command before the kernel even starts.
In real life the problem is usually that, to sum 100 million numbers, you'll spend about 0.4 seconds because of the bandwidth limitation and, say, 0.1 milliseconds on the calculation itself, because a top GPU can perform about 1000 operations per cycle, each cycle being roughly 1 nanosecond long.
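If you want to put a number on the launch overhead for your own machine, a minimal sketch (not the code from the question; the exact figures will vary by GPU, driver and OS) is to time a stream of empty kernel launches with CUDA events and divide by the number of launches:
#include <cstdio>

__global__ void empty_kernel() { }

int main()
{
    const int launches = 10000;

    // Warm-up launch so context creation is not counted.
    empty_kernel<<<1, 1>>>();
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < launches; ++i)
        empty_kernel<<<1, 1>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("average launch overhead: %.3f microseconds\n", 1000.0f * ms / launches);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}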
Hi, I have benchmarked the two versions. Simply launching CUDA kernels DOES have a noticeable overhead.
This is the output --
Calculating... (BlackCat_Tensors) reps outside
It took me 27.359249 clicks (27.359249 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 10.855168 clicks (10.855168 seconds).
This is my benchmark --
/*
* test_area.cu
*
* Created on: Jan 11, 2018
* Author: joseph
*/
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_
#include <omp.h>
#include <stdio.h>
int threads() {
return 256;
}
int blocks(int size) {
return (size + threads() - 1) / threads();
}
__global__
void add_kernel(float* a, float* b, float* c, int size) {
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
for (int j = 0; j < reps; ++j)
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 10000; //Or any arbitrarily large number
float* a; //float* of [size] allocated on the GPU
float* b; //float* of [size] allocated on the GPU
float* c; //float* of [size] allocated on the GPU
cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);
float t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Here's a secondary benchmark:
I imagine the thread count for the reps-inside version could be raised, since it is doing more work per launch, and so the disparity in performance should be even greater.
/*
* test_area.cu
*
* Created on: Jan 11, 2018
* Author: joseph
*/
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_
#include <omp.h>
#include <stdio.h>
int threads() {
return 256;
}
int blocks(int size) {
return (size + threads() - 1) / threads();
}
__global__
void add_kernel(float* a, float* b, float* c, int size) {
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
for (int j = 0; j < reps; ++j)
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 1000; //Or any arbitrarily large number
float* a; //float* of [size] allocated on the GPU
float* b; //float* of [size] allocated on the GPU
float* c; //float* of [size] allocated on the GPU
cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);
float t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Calculating... (BlackCat_Tensors) reps outside
It took me 14.969501 clicks (14.969501 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 13.060688 clicks (13.060688 seconds).
Related
I have two matrices
#define MATRIX_SIZE 20
#define BLOCK_SIZE 2
#define TILE_SIZE 2
double** A;
double** B;
Matrix A is dense, Matrix B is tridiagonal. I have created a vectorized representation of A
/* sz = A.rowlen = B.rowlen = A.collen = B.collen */
double* A1d = matrix_to_vector(sz, A);
I have also created a compressed representation of B with the following code
double* l_array = new double[sz - 1];
double* m_array = new double[sz];
double* r_array = new double[sz - 1];
int current_l_idx = 0;
int current_m_idx = 0;
int current_r_idx = 0;
for (int i = 0; i < sz; i++) {
for (int j = 0; j < sz; j++) {
if ((i == j+1) || (i-1 == j)) {
l_array[current_l_idx] = B[i][j];
current_l_idx++;
}
else if ((i == j-1) || (i+1 == j)) {
r_array[current_r_idx] = B[i][j];
current_r_idx++;
}
else if (i == j) {
m_array[current_m_idx] = B[i][j];
current_m_idx++;
}
}
}
I then create an empty 2D vectorized matrix E as well as all my objects for CUDA
double* E1d = matrix_to_vector(sz, E);
double* d_A;
double* d_B_l;
double* d_B_m;
double* d_B_r;
double* d_E;
size_t sizeA = sz * sz * sizeof(double);
size_t sizeB_lr = (sz - 1) * sizeof(double);
size_t sizeB_m = sz * sizeof(double);
cudaMalloc(&d_A, sizeA);
cudaMalloc(&d_B_l, sizeB_lr);
cudaMalloc(&d_B_m, sizeB_m);
cudaMalloc(&d_B_r, sizeB_lr);
cudaMalloc(&d_E, sizeA);
cudaMemcpy(d_A, A1d, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, l_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, m_array, sizeB_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, r_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_E, E1d, sizeA, cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(MATRIX_SIZE / threads.x, MATRIX_SIZE / threads.y);
cudakernel<<<grid, threads>>>(sz, d_A, d_B_l, d_B_m, d_B_r, d_E);
I can perform this multiplication serially but I, unfortunately, have NO idea how to implement this on the CUDA device
Assumptions
A and B are always square
sz will always be evenly divisible by BLOCK_SIZE and TILE_SIZE
BLOCK_SIZE will always equal TILE_SIZE
I suspect, based on your setup code, that you are looking for a tiled shared-memory approach to this kind of matrix multiplication, and I don't really want to do your homework for you, so I'll demonstrate an example that doesn't use shared memory.
If you understand how matrix multiplication works, and you also understand how to create an ordinary shared memory GPU matrix multiply kernel, converting the following code to use shared memory should be relatively straightforward:
#include <stdio.h>
#define DSIZE 256
#define BSIZE 32
#define TOL 0.0001
typedef double mytype;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// C = A x B
// A,B,C are all dense
template <typename T>
__global__ void mm(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
for (int i = 0; i < sz; i++)
temp += A[idy*sz+i]*B[i*sz+idx];
C[idy*sz+idx] = temp;}
}
// C = A x B
// A,C are dense, B is tridiagonal
template <typename T>
__global__ void mmt(const T * __restrict__ A, const T * __restrict__ B_l, const T * __restrict__ B_m, const T * __restrict__ B_r, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
if (idx > 0) temp += A[idy*sz+(idx-1)]*B_r[idx-1];
temp += A[idy*sz+(idx) ]*B_m[idx];
if (idx < (sz-1)) temp += A[idy*sz+(idx+1)]*B_l[idx];
C[idy*sz+idx] = temp;}
}
int main(){
mytype *d_A, *h_A, *d_B, *h_B, *d_C, *h_Cd, *h_Cs, *d_B_l, *h_B_l, *d_B_m, *h_B_m, *d_B_r, *h_B_r;
size_t msz = DSIZE*DSIZE;
size_t mszb = msz*sizeof(mytype);
// host side allocations
h_A = (mytype *)malloc(mszb);
h_B = (mytype *)malloc(mszb);
h_Cd =(mytype *)malloc(mszb);
h_Cs =(mytype *)malloc(mszb);
h_B_l = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_r = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_m = (mytype *)malloc( DSIZE*sizeof(mytype));
if (!h_A || !h_B || !h_Cd || !h_Cs || !h_B_l || !h_B_r || !h_B_m) {printf("malloc fail\n"); return -1;}
// device side allocations
cudaMalloc(&d_A, mszb);
cudaMalloc(&d_B, mszb);
cudaMalloc(&d_C, mszb);
cudaMalloc(&d_B_l, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_r, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_m, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMalloc fail");
// prepare A, B matrices
/*
|1 1 1 ...|
A = |2 2 2 ...|
|3 3 3 ...|
|4 4 4 ...|
|... |
|2 1 0 ...| B_l = left/lower subdiagonal (i.e. all 3's)
B = |3 2 1 ...| B_m = middle/main diagonal (i.e. all 2's)
|0 3 2 ...| B_r = right/upper superdiagonal (i.e. all 1's)
|0 0 3 ...|
|... |
*/
for (int i = 0; i < DSIZE; i++){
if (i < DSIZE-1){
h_B_r[i] = 1;
h_B_l[i] = 3;}
h_B_m[i] = 2;
for (int j = 0; j < DSIZE; j++){
h_A[i*DSIZE+j] = i+1;
if (j==i+1) h_B[i*DSIZE+j] = 1;
else if (j==i) h_B[i*DSIZE+j] = 2;
else if (j==i-1) h_B[i*DSIZE+j] = 3;
else h_B[i*DSIZE+j] = 0;}}
// copy data to device
cudaMemcpy(d_A, h_A, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, h_B_l, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, h_B_r, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, h_B_m, DSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
// perform dense-dense multiply
dim3 block(BSIZE,BSIZE);
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
cudaMemset(d_C, 0, mszb);
mm<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaMemcpy(h_Cd, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
// perform dense-sparse multiply
cudaMemset(d_C, 0, mszb);
mmt<<<grid, block>>>(d_A, d_B_l, d_B_m, d_B_r, d_C, DSIZE);
cudaMemcpy(h_Cs, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 3/kernel fail");
// compare results
for (int i = 0; i < DSIZE; i++)
for (int j = 0; j < DSIZE; j++)
if (abs(h_Cs[i*DSIZE+j] - h_Cd[i*DSIZE+j]) > TOL) {printf("results mismatch at (%d, %d) dense: %f sparse: %f\n", i, j, h_Cd[i*DSIZE+j], h_Cs[i*DSIZE+j]); return -1;}
printf("Success!\n");
return 0;
}
Notes:
All of the global memory accesses in the mmt kernel (i.e. for A, the B vectors, and C) should properly coalesce across threads. Therefore, a conversion to use shared memory should also easily yield non-bank-conflicted access to shared memory.
While studying this code may be useful for learning, I recommend any serious sparse-dense matrix multiplication be done with routines from CUSPARSE such as csrmm. It will almost certainly be much more efficient (faster) than the above code, and likely faster than any shared memory conversion of the above code as well.
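If you do go the CUSPARSE route, note that csrmm-style routines expect B in CSR form rather than as three separate diagonals. As a minimal host-side sketch (a hypothetical helper, not part of the answer's code), converting the l/m/r diagonal arrays from the question into CSR could look like this:
// Builds CSR arrays (rowPtr, colInd, val) for an sz x sz tridiagonal matrix
// whose lower, main and upper diagonals are l, m and r, laid out as in the
// question (l[i-1] = B[i][i-1], m[i] = B[i][i], r[i] = B[i][i+1]).
// The caller is assumed to have sized colInd/val for 3*sz - 2 nonzeros
// and rowPtr for sz + 1 entries.
void tridiag_to_csr(int sz, const double *l, const double *m, const double *r,
                    int *rowPtr, int *colInd, double *val)
{
    int nnz = 0;
    for (int i = 0; i < sz; ++i) {
        rowPtr[i] = nnz;
        if (i > 0) {                     // lower subdiagonal entry
            colInd[nnz] = i - 1;
            val[nnz++]  = l[i - 1];
        }
        colInd[nnz] = i;                 // main diagonal entry
        val[nnz++]  = m[i];
        if (i < sz - 1) {                // upper superdiagonal entry
            colInd[nnz] = i + 1;
            val[nnz++]  = r[i];
        }
    }
    rowPtr[sz] = nnz;                    // total nonzeros: 3*sz - 2
}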
I have a program that calculates 1-10 million scalar products.
It looks like this. ts and A are arrays of about 1000-10000 3D points (each element is a 3x1 vector). For the moment, with ts.size() = 10,000 and A.size() = 1000, my code takes about 41ms. I have not done any parallelization so far. Will the calculations be much faster, for example, in CUDA? I have no such experience. Or is there any other way? Thanks.
for(int i = 0; i< ts.size(); i++){
for(int j = 0; j< A.size(); j++){
if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
score[i] +=1;
}
}
}
This is my implementation of the scalar product.
double scalarProduct(const Point &p1,const Point &p2)
{
return (p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ()) ;
}
Could I use Lapack or Eigen instead, formulating the problem as matrix multiplication? I've done that in Matlab and it is only 5 times slower. Any speedup would be great. With OpenMP I guess I could be 4x faster.
This answer consists of two parts:
Accelerating the calculation of many independent scalar products;
Solving your specific problem.
PART 1
The problem of calculating a large number of independent scalar products is an embarrassingly parallel problem. If you aim at accelerating only the mentioned scalar products, keeping the rest of the computation on the CPU, then I agree with Calvin that most of the time will be spent in the device-to-host memory transaction of the large N*M result matrix. However, if you exclude that transaction from your timing, accelerating the calculations will be worthwhile. This is shown by the code below, tested on an Intel Xeon E5-2650 2.00 GHz eight-core processor equipped with an NVIDIA Kepler K20c card, whose timing is the following:
CPU: 27ms; GPU (without D2H transaction): 0.08ms; GPU (with D2H transaction): 23ms
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
output[idy * N + idx] = scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy]);
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*M*sizeof(float));
float *D = (float*)malloc(N*M*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*M*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / RAND_MAX;
Ay[i] = rand() / RAND_MAX;
Az[i] = rand() / RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / RAND_MAX;
By[i] = rand() / RAND_MAX;
Bz[i] = rand() / RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
C[i*M + j] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]);
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*M*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N*M; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
PART 2
For solving your specific problem, CUDA will be worthwhile, even when the D2H memory transaction (which is very cheap here) is taken into account. This is confirmed by the code below, tested on the same system as above, whose timing is the following:
CPU: 46ms; GPU (with D2H transaction): 0.31ms;
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
if(abs(scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy])) < 0.01f)
output[idx] = 1.f;
else
output[idx] = 0.f;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*sizeof(float));
float *D = (float*)malloc(N*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / RAND_MAX;
Ay[i] = rand() / RAND_MAX;
Az[i] = rand() / RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / RAND_MAX;
By[i] = rand() / RAND_MAX;
Bz[i] = rand() / RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
if(abs(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < 0.01f)
C[i] = 1.f;
else
C[i] = 0.f;
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<iDivUp(N, BLOCKSIZE_X), BLOCKSIZE_X>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
Instead of optimising the arithmetic, you should use a better algorithm first.
In most practical situations ts and A are not totally random on each cycle, and you may be able to organise (sort) them spatially and greatly reduce the need to evaluate the spatial metric.
Now, if you insist on sticking with the current algorithm, you can enable the compiler to emit SSE code; this should give an instant boost without any programming work.
Since you have to ask this question, the chance that you can squeeze out further cycles by hand-coding with compiler intrinsics is relatively slim.
As for CUDA, for just 10 million dot products the overhead of the CPU RAM to GPU RAM communication is significant and not worth all the trouble.
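To illustrate the point about letting the compiler emit SSE code, here is a minimal sketch (the flat array layout and the function name are assumptions, not how the question stores its points) whose inner loop most compilers can auto-vectorize at -O3 with a suitable target architecture:
#include <cmath>

// Same computation as the question's double loop, but over flat contiguous
// arrays so the inner loop is easy for the compiler to vectorize
// (e.g. compiled with -O3 -march=native).
void count_matches(const double *tx, const double *ty, const double *tz, int nT,
                   const double *ax, const double *ay, const double *az, int nA,
                   double epsilon, int *score)
{
    for (int i = 0; i < nT; ++i) {
        int s = 0;
        for (int j = 0; j < nA; ++j) {
            double d = tx[i] * ax[j] + ty[i] * ay[j] + tz[i] * az[j];
            if (std::fabs(d) < epsilon)
                ++s;
        }
        score[i] += s;
    }
}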
To parallelize this using MIMD with OpenMP you can do this:
#pragma omp parallel for
for(int i = 0; i< ts.size(); i++){
for(int j = 0; j< A.size(); j++){
if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
score[i] +=1;
}
}
}
You could also consider using SIMD. In that case you should change your data structure and store blocks of points equal to the SIMD width (4 for SSE with floats). Something like
class PointBlock4 {
    float x[4];
    float y[4];
    float z[4];
    // ...
};
Each block has four points. This is obviously more complicated but it's achievable. You could get a speedup of four as well. Combining SIMD and MIMD you could get a speedup of 16x (with four cores). But for large n your algorithm will become memory bound rather than compute bound, so you will achieve a much lower speedup. In fact, your algorithm may already be memory bound, so you might not achieve much with SIMD or MIMD. I would test OpenMP first to see if you gain much.
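For what it's worth, a minimal sketch of the explicit-SSE version of this idea (the struct is redeclared here so the sketch is self-contained, and count_block is a hypothetical helper name, not something from the answer above):
#include <immintrin.h>

// Assumed layout matching the answer's PointBlock4.
struct PointBlock4 {
    float x[4];
    float y[4];
    float z[4];
};

// Counts how many of the 4 points in blk satisfy |dot(p, blk[k])| < epsilon
// for a single query point p = (px, py, pz).
static int count_block(float px, float py, float pz,
                       const PointBlock4 &blk, float epsilon)
{
    __m128 dot = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(_mm_set1_ps(px), _mm_loadu_ps(blk.x)),
                   _mm_mul_ps(_mm_set1_ps(py), _mm_loadu_ps(blk.y))),
        _mm_mul_ps(_mm_set1_ps(pz), _mm_loadu_ps(blk.z)));
    __m128 absdot = _mm_andnot_ps(_mm_set1_ps(-0.0f), dot);  // clear sign bits
    __m128 mask   = _mm_cmplt_ps(absdot, _mm_set1_ps(epsilon));
    return __builtin_popcount(_mm_movemask_ps(mask));        // 0..4 matches
}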
I'm new to CUDA. I want to add two 2D arrays into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays.
All I want to do in my kernel code is use two for loops and put the result in the third array, like:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call be for using a 2D array?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
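For illustration, this is roughly how memory from cudaMallocPitch is normally indexed inside a kernel (a minimal sketch, not the asker's code): the pitch is a byte stride, so each row is found by byte-level pointer arithmetic.
__global__ void add_pitched(int *dev_a, int *dev_b, int *dev_c,
                            size_t pitch, int rows, int cols)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows && col < cols) {
        // pitch is in bytes, so step through rows via a char* cast
        int *a_row = (int *)((char *)dev_a + row * pitch);
        int *b_row = (int *)((char *)dev_b + row * pitch);
        int *c_row = (int *)((char *)dev_c + row * pitch);
        c_row[col] = a_row[col] + b_row[col];
    }
}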
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
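For comparison, here is a minimal sketch (not part of the example above) of the same 2x2 addition using ordinary linear memory with flattened row-major indexing, which is the alternative the answer recommends:
__global__ void add_linear(const int *a, const int *b, int *c, int rows, int cols)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < rows * cols)
        c[idx] = a[idx] + b[idx];
}

// Host side: a single allocation and copy per matrix replaces the nested ones.
//   int *d_a; cudaMalloc((void **)&d_a, 4 * sizeof(int));
//   cudaMemcpy(d_a, &aa[0][0], 4 * sizeof(int), cudaMemcpyHostToDevice);
//   ...
//   add_linear<<<1, 4>>>(d_a, d_b, d_c, 2, 2);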
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
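Note that the <<<N,N>>> launch above maps one block per row and one thread per column, which only works while N does not exceed the per-block thread limit (1024 on current hardware). A minimal sketch of the same addition with a 2D grid and a bounds check (a variant for illustration, not part of the answer above) removes that restriction:
__global__ void matrixAdd2D(const float *A, const float *B, float *C, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // row
    if (i < n && j < n)
        C[n * j + i] = A[n * j + i] + B[n * j + i];
}

// Example launch:
//   dim3 block(16, 16);
//   dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
//   matrixAdd2D<<<grid, block>>>(dev_a, dev_b, dev_c, N);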
Here is my issue:
I have a 3D array of float3 on my device:
int size[3] = {416,464,512};
cudaExtent extent = make_cudaExtent(size[0]*sizeof(float3),size[1],size[2]);
cudaPitchedPtr renderedVolume;
int ret = cudaMalloc3D(&renderedVolume, extent);
size_t pitch = renderedVolume.pitch; //pitch = 5,120
size_t slicePitch = pitch * size[1]; //slicePitch = 2,375,680
Then I work with it and make it full of outstanding data.
After that I wish to copy it on a 1D linear memory on my host:
float *host_memory = (float*)malloc(size[0]*size[1]*size[2]*sizeof(float3));
cudaMemcpy3DParms p = {0};
p.srcPtr = renderedVolume;
p.dstPtr = make_cudaPitchedPtr(host_memory,size[0]*sizeof(float3),size[0],size[1]);
p.extent = make_cudaExtent(size[0]*sizeof(float3),size[1],size[2]);
p.srcPos = make_cudaPos(0,0,0);
p.dstPos = make_cudaPos(0,0,0);
p.kind=cudaMemcpyDeviceToHost;
cudaMemcpy3D(&p);
I am comparing the result in host_memory with the data I initially wrote to renderedVolume (my_data) and with the data I read from my 3D memory, slice by slice:
float* test1 = (float*)malloc(size[0]*size[1]*sizeof(float3));
cudaMemcpy(test1, myData, size[0]*size[1]*sizeof(float3) , cudaMemcpyDeviceToHost);
float* test2 = (float*)malloc(size[0]*size[1]*sizeof(float3));
cudaMemcpy(test2,(char*)renderedVolume.ptr + slicePitch * i,size[0]*size[1]*sizeof(float3), cudaMemcpyDeviceToHost);
Problem:
The first slice (i=0) is ok, I have the same data in host_memory, test1 and test2.
In the second slice, I have the same data in test1 and test2. However, I should find this data at host_memory+579072 (= the number of floats per slice, also height*pitch of the destination pitched pointer), and instead I find it at host_memory+577504. It is off by 1568 bytes, which corresponds to nothing that I am aware of, which is why I would very much appreciate it if any of you have an idea of what the problem in my code might be.
This is a late answer provided to remove this question from the unanswered list.
Below, I'm providing full code showing how to allocate 3D memory with cudaMalloc3D, move host-allocated 1D memory to 3D device memory with cudaMemcpy3D, perform some operations on the 3D device data with the test_kernel_3D __global__ function, and move the 3D result data back to 1D host memory, again with cudaMemcpy3D.
The __global__ function test_kernel_3D squares each element of the 3D device memory. In particular, each thread of a 2D grid takes care of performing a for loop along the "depth" dimension.
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
#define N 128
#define M 64
#define W 16
/*****************/
/* CUDA MEMCHECK */
/*****************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/******************/
/* TEST KERNEL 3D */
/******************/
__global__ void test_kernel_3D(cudaPitchedPtr devPitchedPtr)
{
int tidx = blockIdx.x*blockDim.x+threadIdx.x;
int tidy = blockIdx.y*blockDim.y+threadIdx.y;
char* devPtr = (char*) devPitchedPtr.ptr;
size_t pitch = devPitchedPtr.pitch;
size_t slicePitch = pitch * N;
for (int w = 0; w < W; w++) {
char* slice = devPtr + w * slicePitch;
float* row = (float*)(slice + tidy * pitch);
row[tidx] = row[tidx] * row[tidx];
}
}
/********/
/* MAIN */
/********/
int main()
{
float a[N][M][W];
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
for (int w=0; w<W; w++) {
a[i][j][w] = 3.f;
//printf("row %i column %i depth %i value %f \n",i,j,w,a[i][j][w]);
}
// --- 3D pitched allocation and host->device memcopy
cudaExtent extent = make_cudaExtent(M * sizeof(float), N, W);
cudaPitchedPtr devPitchedPtr;
gpuErrchk(cudaMalloc3D(&devPitchedPtr, extent));
cudaMemcpy3DParms p = { 0 };
p.srcPtr.ptr = a;
p.srcPtr.pitch = M * sizeof(float);
p.srcPtr.xsize = M;
p.srcPtr.ysize = N;
p.dstPtr.ptr = devPitchedPtr.ptr;
p.dstPtr.pitch = devPitchedPtr.pitch;
p.dstPtr.xsize = M;
p.dstPtr.ysize = N;
p.extent.width = M * sizeof(float);
p.extent.height = N;
p.extent.depth = W;
p.kind = cudaMemcpyHostToDevice;
gpuErrchk(cudaMemcpy3D(&p));
dim3 GridSize(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));
dim3 BlockSize(BLOCKSIZE_y,BLOCKSIZE_x);
test_kernel_3D<<<GridSize,BlockSize>>>(devPitchedPtr);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
p.srcPtr.ptr = devPitchedPtr.ptr;
p.srcPtr.pitch = devPitchedPtr.pitch;
p.dstPtr.ptr = a;
p.dstPtr.pitch = M * sizeof(float);
p.kind = cudaMemcpyDeviceToHost;
gpuErrchk(cudaMemcpy3D(&p));
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
for (int w=0; w<W; w++)
printf("row %i column %i depth %i value %f\n",i,j,w,a[i][j][w]);
getch();
return 0;
}
The following is a CUDA programming example which is basically C but with NVIDIA CUDA functions within. I've been trying to interpret this code example and figure out what it is trying to do. My question is this: the program compiles just fine, but what arguments does it take? For example, this CUDA program is being run in a Linux emulator; however, upon running ./program it returns:
Usage: ./program number
Segmentation fault
What are the program's input arguments? Thank you.
#include <assert.h>
#include <stdio.h>
//#define N 100000
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
for (int i = 0; i < length; ++i)
y[i] = alpha*x[i] + y[i];
}
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
int i;
i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < length) y[i] = alpha*x[i]+y[i];
__syncthreads();
}
int main(int argc, char* argv[]) {
if (argc != 2) {
printf("Usage: %s number\n", argv[0]);
return -1;
}
int N = atoi(argv[1]);
// host data
float alpha = 0.5;
float x[N], xback[N];
float y[N], yback[N];
int size;
int i;
int blocks;
// determining size
size = sizeof(float)*N;
// device data
float * dxp, * dyp;
// fill host data
for (i = 0; i < N; i++) {
x[i] = (float) (rand () % 128);
y[i] = (float) (rand () % 256);
}
// Allocating and Moving data to device
cudaMalloc((void**) &dxp, size);
cudaMalloc((void**) &dyp, size);
cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
// size of thread blocks
blocks = (N + 31)/32;
saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
// bring back data
cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);
// Calculating host SAXPY
saxpy_host (N, alpha, (float *) &x, (float *) &y);
// checking computation on host matches computation on GPU
for (i = 0; i < N; i++) {
assert (yback[i] == y[i]) ;
//printf ("%i %f %f \n", i, yback[i], y[i]);
}
// free device data
cudaFree(dxp); cudaFree(dyp);
return 0;
}
int N = atoi(argv[1]);
The program takes a single integer as a command line argument. (Try calling it as ./program 5, for example.)
It then calculates a SAXPY (an old term originating from early BLAS implementations, but it stuck; it means "single-precision (i.e. float) real alpha x plus y", i.e. y = alpha*x + y) with vectors of dimension N.