I have a program that calculates 1-10 million scalar products.
The code is shown below. ts and A are arrays of about 1000-10,000 3D points (each element is a 3x1 vector). At the moment, with ts.size() = 10,000 and A.size() = 1000, my code takes about 41 ms. I have not done any parallelization so far. Would the calculations be much faster in, for example, CUDA? I have no experience with it. Or is there any other way? Thanks.
for (int i = 0; i < ts.size(); i++) {
    for (int j = 0; j < A.size(); j++) {
        if (abs(scalarProduct(ts.at(i), A.at(j))) < epsilon) {
            score[i] += 1;
        }
    }
}
This is my implementation of the scalar product.
double scalarProduct(const Point &p1, const Point &p2)
{
    return p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ();
}
Could I use LAPACK or Eigen instead, formulating the problem as matrix multiplication? I've done that in MATLAB and it is only about 5 times slower. Any speedup would be great. With OpenMP I guess I could get about 4x faster.
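For reference, here is a minimal sketch of the matrix formulation with Eigen, assuming the points are repacked into one-row-per-point matrices (the names P, Q and computeScores are illustrative, not from my actual code):

#include <Eigen/Dense>

// P: N x 3 matrix, one point of ts per row; Q: M x 3 matrix, one point of A per row.
Eigen::VectorXi computeScores(const Eigen::MatrixXd &P,
                              const Eigen::MatrixXd &Q,
                              double epsilon)
{
    // One matrix product yields all N*M scalar products: D(i, j) = <ts[i], A[j]>.
    Eigen::MatrixXd D = P * Q.transpose();
    Eigen::VectorXi score(P.rows());
    for (int i = 0; i < P.rows(); ++i)   // count |D(i, j)| < epsilon per row
        score(i) = static_cast<int>((D.row(i).array().abs() < epsilon).count());
    return score;
}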
This answer consists of two parts:
Accelerating the calculation of many independent scalar products;
Solving your specific problem.
PART 1
The problem of calculating a large number of independent scalar products is an embarrassingly parallel problem. If you aim at accelerating only the mentioned scalar products, retaining the rest of the computation on the CPU, then I agree with Calvin that most of the time will be spent in the device-to-host memory transfer of the large N*M result matrix. However, if you exclude that transfer from your timing, accelerating the calculations is worthwhile. This is shown by the code below, tested on an Intel Xeon E5-2650 2.00 GHz eight-core processor equipped with an NVIDIA Kepler K20c card, with the following timing:
CPU: 27ms; GPU (without D2H transaction): 0.08ms; GPU (with D2H transaction): 23ms
#include <stdio.h>
#include <math.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
output[idy * N + idx] = scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy]);
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*M*sizeof(float));
float *D = (float*)malloc(N*M*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*M*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
    Ax[i] = rand() / (float)RAND_MAX;
    Ay[i] = rand() / (float)RAND_MAX;
    Az[i] = rand() / (float)RAND_MAX;
}
for (int i=0; i<M; i++) {
    Bx[i] = rand() / (float)RAND_MAX;
    By[i] = rand() / (float)RAND_MAX;
    Bz[i] = rand() / (float)RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
    for (int j=0; j<M; j++)
        C[j*N + i] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]); // same layout as the kernel output
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*M*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N*M; i++) {
    if (fabsf(D[i] - C[i]) > 1e-5f) { // tolerate tiny CPU/GPU differences, e.g. from FMA contraction
        printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
        return 1;
    }
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
PART 2
For your specific problem, CUDA is worthwhile even when the D2H memory transfer is taken into account (here it is very cheap, since only N values are copied back). This is confirmed by the code below, tested on the same system as above, with the following timing:
CPU: 46ms; GPU (with D2H transaction): 0.31ms;
#include <stdio.h>
#include <math.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
                       const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
                       float* __restrict__ score, const int N, const int M) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // --- Each thread accumulates the score of one point of the first set
        float count = 0.f;
        for (int j = 0; j < M; j++)
            if (fabsf(scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[j], p2y[j], p2z[j])) < 0.01f)
                count += 1.f;
        score[idx] = count;
    }
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*sizeof(float));
float *D = (float*)malloc(N*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
    Ax[i] = rand() / (float)RAND_MAX;
    Ay[i] = rand() / (float)RAND_MAX;
    Az[i] = rand() / (float)RAND_MAX;
}
for (int i=0; i<M; i++) {
    Bx[i] = rand() / (float)RAND_MAX;
    By[i] = rand() / (float)RAND_MAX;
    Bz[i] = rand() / (float)RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++) {
    C[i] = 0.f;
    for (int j=0; j<M; j++)
        if (fabsf(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < 0.01f)
            C[i] += 1.f;
}
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<iDivUp(N, BLOCKSIZE_X), BLOCKSIZE_X>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
Instead of optimising for arithmetic, you should use a better algorithm first.
In most practical situations ts and A are not totally random on each cycle, so you may be able to organise (sort) them spatially and greatly reduce the need for evaluating the spatial metric.
Now, if you insist on sticking with the current algorithm, you can enable the compiler to emit SSE code; this should give some instant boost without any programming work.
Given that you had to ask this question, the chance that you can squeeze out further cycles by hand-coding with compiler intrinsics is relatively slim.
As for CUDA, for just 10 million dot products the overhead of the CPU-to-RAM-to-video-RAM-to-GPU communication is significant and not worth all the trouble.
To parallelize this using MIMD with OpenMP you can do this:
#pragma omp parallel for
for (int i = 0; i < ts.size(); i++) {
    for (int j = 0; j < A.size(); j++) {
        if (abs(scalarProduct(ts.at(i), A.at(j))) < epsilon) {
            score[i] += 1;
        }
    }
}
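(Each thread owns a distinct range of i, so the updates to score[i] do not race; just remember to build with OpenMP enabled, e.g. -fopenmp on GCC/Clang.)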
You could also consider using SIMD. In that case you should change your data structure and store blocks of points equal to the SIMD width (4 for SSE with floats), something like this:
class PointBlock4 {
    float x[4];
    float y[4];
    float z[4];
    // ...
};
Each block has four points. This is obviously more complicated, but it's achievable. You could get a speedup of four as well. Combining SIMD and MIMD you could get a speedup of 16x (with four cores). But for large n your algorithm will become memory bound rather than compute bound, so you will achieve a much lower speedup. In fact, your algorithm may already be memory bound, so you might not achieve much with SIMD or MIMD. I would test OpenMP first to see if you gain much. A sketch of the per-block test is given below.
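To make the per-block test concrete, here is a minimal SSE sketch (my own illustration; PointBlock4 and countClose are assumed names, and epsilon is the threshold from the question):

#include <xmmintrin.h> // SSE intrinsics

struct PointBlock4 {
    float x[4], y[4], z[4];
};

// Counts, for one query point (tx, ty, tz), how many of the four points in the
// block satisfy |dot product| < epsilon. All four dot products are computed at once.
inline int countClose(float tx, float ty, float tz, const PointBlock4 &b, float epsilon)
{
    __m128 dot = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(_mm_set1_ps(tx), _mm_loadu_ps(b.x)),
                   _mm_mul_ps(_mm_set1_ps(ty), _mm_loadu_ps(b.y))),
        _mm_mul_ps(_mm_set1_ps(tz), _mm_loadu_ps(b.z)));
    __m128 absdot = _mm_andnot_ps(_mm_set1_ps(-0.0f), dot);          // clear the sign bits
    int mask = _mm_movemask_ps(_mm_cmplt_ps(absdot, _mm_set1_ps(epsilon)));
    return ((mask >> 0) & 1) + ((mask >> 1) & 1) + ((mask >> 2) & 1) + ((mask >> 3) & 1);
}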
Related
As it is known that copying data to the GPU is slow, I was wondering what specifically "counts" as passing data to the GPU.
__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

int main() {
    int size = 100000; //Or any arbitrarily large number
    int reps = 1000; //Or any arbitrarily large number
    extern float* a; //float* of [size] allocated on the GPU
    extern float* b; //float* of [size] allocated on the GPU
    extern float* c; //float* of [size] allocated on the GPU
    for (int i = 0; i < reps; ++i)
        add_kernel<<<blocks, threads>>>(a, b, c, size);
}
Does something such as passing size to the kernel incur (significant) overhead? Or does "data transfer" refer more specifically to copying large arrays from the heap to the GPU?
I.e., would this variant be (much) faster:
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
}

int main() {
    int size = 100000; //Or any arbitrarily large number
    int reps = 1000; //Or any arbitrarily large number
    extern float* a; //float* of [size] allocated on the GPU
    extern float* b; //float* of [size] allocated on the GPU
    extern float* c; //float* of [size] allocated on the GPU
    add_kernel<<<blocks, threads>>>(a, b, c, size, reps);
}
I.e. (again), in "ideal" CUDA programs, should programmers attempt to write the large majority of the computation purely in CUDA kernels, or should they write CUDA kernels that are repeatedly called from the CPU (in the case that passing values from the stack does not incur significant overhead)?
Everything counts. In order to run a kernel, the CPU needs to pass, one way or another, which kernel to call and with which parameters. At the micro level, if your kernel performs only a few operations, these are considerable expenses. In real life, if your kernels do a lot of work, they are negligible.
Relatively large service expenses can also arise if such small operations are not pipelined. You can see this in NVIDIA's Visual Profiler. I don't remember the exact numbers, but the orders of magnitude are as follows. Bandwidth between CPU and GPU can be around 1 GB/s, i.e. 1 byte per nanosecond. But in practice, sending a 4-byte packet and getting an acknowledgement takes about 1 microsecond. So sending 10,000 bytes takes about 11 microseconds.
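In other words, these figures follow a simple latency-plus-bandwidth model: t ≈ t_latency + size / bandwidth ≈ 1 µs + 10,000 B / (1 B/ns) ≈ 1 µs + 10 µs = 11 µs.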
Also, execution of operations is optimized for massive throughput on the GPU, so executing 10 consecutive operations with a single 32-thread warp can take around 200 GPU clock cycles (about 0.2 microseconds), plus, say, 0.5 microseconds for sending the kernel-launch command before execution starts.
In real life the problem is usually that, to sum 100 million numbers, you will spend about 0.4 seconds because of the bandwidth limitation and, say, 0.1 milliseconds for the calculation itself, because a top GPU can perform about 1000 operations per cycle, with each cycle lasting about a nanosecond.
Hi, I have benchmarked the two versions. Simply calling CUDA functions DOES have a noticeable overhead.
This is the output --
Calculating... (BlackCat_Tensors) reps outside
It took me 27.359249 clicks (27.359249 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 10.855168 clicks (10.855168 seconds).
This is my benchmark --
/*
* test_area.cu
*
* Created on: Jan 11, 2018
* Author: joseph
*/
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_
#include <omp.h>
#include <stdio.h>
int threads() {
return 256;
}
int blocks(int size) {
return (size + threads() - 1) / threads();
}
__global__
void add_kernel(float* a, float* b, float* c, int size) {
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
for (int j = 0; j < reps; ++j)
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 10000; //Or any arbitrarily large number
float* a; //float* of [size] allocated on the GPU
float* b; //float* of [size] allocated on the GPU
float* c; //float* of [size] allocated on the GPU
cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);
double t = omp_get_wtime(); // omp_get_wtime() returns a double; keep full precision
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Here's a secondary benchmark.
I imagine the thread count for the inside-loop version could be made higher, as each launch is calculating more, and so the disparity in performance should be even greater.
/*
* test_area.cu
*
* Created on: Jan 11, 2018
* Author: joseph
*/
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_
#include <omp.h>
#include <stdio.h>
int threads() {
return 256;
}
int blocks(int size) {
return (size + threads() - 1) / threads();
}
__global__
void add_kernel(float* a, float* b, float* c, int size) {
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
for (int j = 0; j < reps; ++j)
for (int i = 0; i < size; ++i) {
a[i] = b[i] + c[i];
}
}
int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 1000; //Or any arbitrarily large number
float* a; //float* of [size] allocated on the GPU
float* b; //float* of [size] allocated on the GPU
float* c; //float* of [size] allocated on the GPU
cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);
double t = omp_get_wtime(); // omp_get_wtime() returns a double; keep full precision
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Calculating... (BlackCat_Tensors) reps outside
It took me 14.969501 clicks (14.969501 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 13.060688 clicks (13.060688 seconds).
This is my first time working with CUDA. I am running some calculations involving cuFFT and two simple kernels on an N x N x N mesh (N = 128). It seems to work fine until, some time between 4040 and 4050 loop iterations, the values of my mesh points become nan. On a smaller mesh, it can complete more loops before failing. This makes me think there is a memory leak somewhere. I tried running cuda-memcheck but it returned no errors. Can you spot any problems that could be causing this? I have reduced the code to a minimum but it is still long, my apologies. Thank you for your help.
#define _USE_MATH_DEFINES
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data3[i].x = pow(data[i].x, 3);
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
data[i].y = 0;
}
__syncthreads();
}
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
for (float t = 0; t < tmax; t += dt) {
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
Here is my instrumentation of your code so far. When I enabled the device assert in my_assert, it indicated that the d_data3 input at the nan5 point was failing (i.e. it was nan). That indicated that the cufftExecC2C call on d_data3 immediately prior was producing nan data. If you have invalid inputs, I believe an FFT can produce out-of-range results.
The code is instrumented to allow you to dump the data and look at it. You will have to modify dump_data to display whatever it is you wish to see.
When I run the code below, it eventually prints out:
4850.14
4851.14
4852.14
4853.14
4854.14
4855.14
4856.14
4857.14
4858.14
4859.14
4860.14
d_data3 output nan check failed
$
So the nan first occurs on iteration 4860, and the d_data3 input check did not fail, so the nan occurs in d_data3 as a result of the FFT operation in loop iteration 4860. You'll need to study the input and output data to see if you can determine why. There may be some modification to the d_data3 data in the Cube kernel that is causing this. For example, since you are repetitively cubing the data, doesn't it seem reasonable at some point that it would exceed float range?
Here's my instrumented code:
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <assert.h>
#include <stdio.h>
using namespace std;
__host__ __device__ void my_assert(bool cond){
//assert(cond);
}
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp = data[i].x;
if (isnan(temp)) {printf("nan1: %d\n", i); my_assert(0);}
data3[i].x = pow(data[i].x, 3);
if (isnan(data3[i].x)) {printf("nan2: %d %f\n", i, data[i].x); my_assert(0);}
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp1 = w[i];
if (isnan(temp1)) {printf("nan3: %d\n", i); my_assert(0);}
float temp2 = data[i].x;
if (isnan(temp2)) {printf("nan4: %d\n", i); my_assert(0);}
float temp3 = data3[i].x;
if (isnan(temp3)) {printf("nan5: %d\n", i); my_assert(0);}
float temp4 = v[i];
if (isnan(temp4)) {printf("nan6: %d\n", i); my_assert(0);}
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
if (isnan(data[i].x)) {printf("nan7: %d, %f, %f, %f, %f, %d\n",i, temp1, temp2, temp3, temp4, n); my_assert(0);}
data[i].y = 0;
}
__syncthreads();
}
__global__ void nan_kernel(cufftComplex *d, int len, bool *res){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len)
if (isnan(d[idx].x) || isnan(d[idx].y)) *res = true;
}
bool *d_nan;
bool checknan(cufftComplex *d, int len){
bool h_nan = false;
cudaMemcpy(d_nan, &h_nan, sizeof(bool), cudaMemcpyHostToDevice);
nan_kernel<<<(len/1024)+1, 1024>>>(d, len, d_nan);
cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost);
return h_nan;
}
void dump_data(cufftComplex *d1, cufftComplex *d2, int len)
{
// add code here to spit out the data however you would like to see it
// perhaps to a file
std::cout << "input: output: " << std::endl;
for (int i = 0; i < len; i++)
std::cout << d1[i].x << "," << d1[i].y << " " << d2[i].x << "," << d2[i].y << std::endl;
};
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
cufftResult res;
cudaMalloc(&d_nan, sizeof(bool));
cufftComplex *i3, *o3;
i3 = (cufftComplex *)malloc(mem_size);
o3 = (cufftComplex *)malloc(mem_size);
std::cout << "start loop" << std::endl;
for (float t = 0; t < tmax; t += dt) {
std::cout << t/dt << std::endl;
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cudaMemcpy(i3, d_data3, mem_size, cudaMemcpyDeviceToHost);
if (checknan(d_data3, n)) {std::cout << "d_data3 input nan check failed" << std::endl; return -1;}
res = cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft1 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
if (checknan(d_data3, n)) {std::cout << "d_data3 output nan check failed" << std::endl; cudaMemcpy(o3, d_data3, mem_size, cudaMemcpyDeviceToHost); dump_data(i3, o3, n); return -1;}
res = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft2 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
res = cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
if (res != CUFFT_SUCCESS) {std::cout << "cufft3 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
cudaError_t cres = cudaGetLastError();
if (cres != cudaSuccess) std::cout << "cuda error: " << cudaGetErrorString(cres) << std::endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
EDIT:
After some addition of printout code to dump_data (see modification above) I see this:
...
4859.14
4860.14
d_data3 output nan check failed
input: output:
3.37127e+19,0 nan,nan
3.21072e+19,0 nan,nan
2.76453e+19,0 nan,nan
2.13248e+19,0 nan,nan
1.44669e+19,0 nan,nan
8.37214e+18,0 nan,nan
3.93645e+18,0 nan,nan
1.35501e+18,0 nan,nan
2.55741e+17,0 nan,nan
5.96468e+15,0 nan,nan
-1.36656e+16,0 nan,nan
-2.33688e+17,0 nan,nan
-8.37407e+17,0 nan,nan
-1.79915e+18,0 nan,nan
-2.96302e+18,0 nan,nan
-4.11485e+18,0 nan,nan
-5.03876e+18,0 nan,nan
-5.57617e+18,0 nan,nan
-5.65307e+18,0 nan,nan
-5.28957e+18,0 nan,nan
-4.5872e+18,0 nan,nan
-3.68309e+18,0 nan,nan
...
I'm not an FFT expert, but it might be the case that if you do an FFT on a large array filled with large values, using float precision, overflow may occur. If you only need to get to 5000 iterations and you're failing at 4860, you might get there by changing all your datatypes from float to double, but I'm not sure about the numerical sense of what you are doing here.
Finally, note that both cufft and fftw perform un-normalized transforms. This may be playing a role in the seeming growth of magnitudes in your data set. As I stated already, I'm not familiar with the arithmetic or algorithm you are trying to implement here.
Is it possible that you have a float underflow happening around iteration 4040? Taking the cube of your data3 would lead me to check out that possibility. It is pretty easy to spiral into an underflow on a float32 if you're not careful. You could throw a check in there to limit your value to some minimum epsilon to prevent this.
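A minimal sketch of the suggested guard, placed inside the Cube kernel (the 1e-30f threshold is an illustrative choice, not from the original code):

// Flush tiny magnitudes to zero explicitly instead of drifting into denormals.
float cube = powf(data[i].x, 3.0f);
if (fabsf(cube) < 1e-30f) cube = 0.0f;
data3[i].x = cube;
data3[i].y = 0;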
Suppose we have four float arrays to be used on the host side, as well as its four counterparts to be used on the device side:
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
Now assume that we have a very simple kernel, taken from one of the examples on NVIDIA's blog:
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}
This kernel is called from the host side inside a for loop, like the following:
for (int r = 0; r < LOOP_N; r++)
{
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
And then I compare the execution time of such loop against its pure-CPU version:
for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}
Now, what I don't understand is the following. For instance, with ARRAYS_SIZE = 1000000 and LOOP_N = 1000, when I run both loops in the versions shown above, I get a ratio between the execution time of the CPU version and that of the CUDA version of around 6. That is, the CUDA version is approximately 6 times faster.
However, if I comment out one of the calls to saxpy inside the CUDA version of the loop and one of the calculations inside the CPU version of the loop, the ratio between CPU and CUDA becomes around 210. That is, the CUDA version is approximately 210 times faster.
What is the technical reason for such performance loss when merely repeating the call to a kernel, if no memory is being transferred to / from the device? Are there any workarounds to this?
A (hopefully) fully reproducible code example goes below:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;
//Pretty simple kernel, from the example in Nvidia's blog
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}
// Main loop
int main(void)
{
timers t0, t1, t2;
timers tfinal0, tfinal1, tfinal2;
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
//Initializing arrays at the host side:
for (int i = 0; i < ARRAYS_SIZE; i++) {
x[i] = 1.0f;
x2[i] = 1.0f;
y[i] = 2.0f;
y2[i] = 2.0f;
}
// GPU memory allocation:
cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));
// Transfering arrays from host to device:
cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
//////////////////
// CPU run //
//////////////////
t0 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
//comment one of the following out to see the point of my question:
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}
tfinal0 = std::chrono::high_resolution_clock::now();
auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;
//////////////////
// GPU-CUDA run //
//////////////////
// Perform SAXPY kernel on ARRAYS_SIZE elements, for LOOP_N times
t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
//comment one of the following out to see the point of my question:
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
tfinal1 = std::chrono::high_resolution_clock::now();
auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;
//Display performance ratio CPU / GPU-CUDA
std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;
//Freeing memory used by arrays:
cudaFree(d_x);
cudaFree(d_x2);
cudaFree(d_y);
cudaFree(d_y2);
delete[] x; // allocated with new[], so delete[] rather than free()
delete[] x2;
delete[] y;
delete[] y2;
return 0;
}
You are not waiting for the kernels to finish. As all kernel launches are asynchronous, you need to explicitly call cudaDeviceSynchronize() before stopping your timer.
The differences you are observing with variants of your current code likely stem from the fact that the queue of kernels waiting to launch is finite, so at some point your code will start waiting for part of your kernels anyway.
On Windows, kernel batching also plays into this: up to some number of kernels (or a timeout), the driver will not even start to launch them.
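A minimal sketch of the corresponding fix to the timing loop from the question (only the synchronize call is new):

t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
cudaDeviceSynchronize(); // wait for all queued kernels before reading the clock
tfinal1 = std::chrono::high_resolution_clock::now();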
A simple change solves the problem, but I would still very much appreciate learning the technical reasons for all this.
The solution is to merely change, in my toy example above, the kernel to:
__global__
void saxpy(int n, float a, float *x, float *y, float *x2, float *y2)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
y2[i] = a*x2[i] + y2[i];
}
}
And then call it only once, like the following:
for (int r = 0; r < LOOP_N; r++)
{
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
Now the performance difference against the CPU implementation is just the same - which should be expected.
If someone can jump in with an answer as to why this makes a difference, please post it and I will favor it over mine.
I have implemented a cascaded addition function for a large vector of float values on my GPU and my CPU. That simply means that all elements of this vector shall be summed up into one result. The CPU algorithm is quite trivial and works fine, but the GPU algorithm is always 35200 off the desired result.
The minimal working code for the algorithm and comparison to the CPU is below.
The output is always this:
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
I checked it with cuda-memcheck but no errors occurred in that run. I have tried many, many different things but none of them worked. It is not due to the inaccuracy of the float datatype, because I changed all floats to ints and still got exactly the same result.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
}
void reductionCPU(float *result, const float *input)
{
for (int i = 0; i < SIZE; i++)
*result += input[i];
}
__global__ void reductionKernelImproved(float *result, const float *input)
{
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
__syncthreads();
#pragma unroll ILP
for (i = 0; i < ILP; i++)
{
if (index < SIZE)
{
atomicAdd(&interResult, input[index]);
index++;
}
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
}
void reductionWithCudaImproved(float *result, const float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return;
}
I think you have overlapping indices in your kernel call:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
If I am not mistaken, your blockDim.x = 4 and BLOCK_COUNT_X_IMPR = 100, so each row will jump 400 indices.
However, your col can go as high as 400 * 8.
Consider:
blockIdx  = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
   row = 0
   index = 408

blockIdx  = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
   row = 1
   index = 1 * 400 + 8 = 408
So I guess you should rewrite your index
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;
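If you want to convince yourself of the collision (and of the fix) without touching the GPU, you can enumerate the base index of every thread on the host; a minimal sketch, with the launch geometry hard-coded to match the question (blockDim = 4 x 32, gridDim.x = 100; the function name is mine):

#include <set>

// Counts how many base indices are produced more than once.
int countDuplicateIndices(int gridY, bool useFixedFormula)
{
    const int ILP = 8, BX = 4, BY = 32, GX = 100;
    std::set<long long> seen;
    int duplicates = 0;
    for (int by = 0; by < gridY; ++by)
        for (int bx = 0; bx < GX; ++bx)
            for (int ty = 0; ty < BY; ++ty)
                for (int tx = 0; tx < BX; ++tx) {
                    long long col = (long long)(BX * bx + tx) * ILP;
                    long long row = (long long)BY * by + ty;
                    long long stride = useFixedFormula ? (long long)BX * GX * ILP
                                                       : (long long)BX * GX;
                    if (!seen.insert(row * stride + col).second) ++duplicates;
                }
    return duplicates;
}

With the original formula this returns a nonzero count; with the corrected stride it returns zero.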
The following is a CUDA programming example, which is basically C but with NVIDIA CUDA functions within. I've been trying to interpret this code example and figure out what it is trying to do. My question is this: the program compiles just fine, but what arguments does it take? For example, this CUDA program is being run in a Linux emulator; however, upon running ./program it returns:
Usage: ./program number
Segmentation fault
What are the program's input arguments? Thank you.
#include <assert.h>
#include <stdio.h>
//#define N 100000
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
for (int i = 0; i < length; ++i)
y[i] = alpha*x[i] + y[i];
}
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
int i;
i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < length) y[i] = alpha*x[i]+y[i];
__syncthreads();
}
int main(int argc, char* argv[]) {
if (argc != 2) {
printf("Usage: %s number\n", argv[0]);
return -1;
}
int N = atoi(argv[1]);
// host data
float alpha = 0.5;
float x[N], xback[N];
float y[N], yback[N];
int size;
int i;
int blocks;
// determining size
size = sizeof(float)*N;
// device data
float * dxp, * dyp;
// fill host data
for (i = 0; i < N; i++) {
x[i] = (float) (rand () % 128);
y[i] = (float) (rand () % 256);
}
// Allocating and Moving data to device
cudaMalloc((void**) &dxp, size);
cudaMalloc((void**) &dyp, size);
cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
// size of thread blocks
blocks = (N + 31)/32;
saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
// bring back data
cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);
// Calculating host SAXPY
saxpy_host (N, alpha, (float *) &x, (float *) &y);
// checking computation on host matches computation on GPU
for (i = 0; i < N; i++) {
assert (yback[i] == y[i]) ;
//printf ("%i %f %f \n", i, yback[i], y[i]);
}
// free device data
cudaFree(dxp); cudaFree(dyp);
return 0;
}
int N = atoi(argv[1]);
The program takes a single integer as a command line argument. (Try calling it as ./program 5, for example.)
It then calculates a SAXPY (an old term originating from early BLAS implementations, but it stuck; it means "single-precision (i.e. float) real alpha x plus y") with vectors of dimension N.
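In formula form, the operation performed on both host and device is y[i] = alpha * x[i] + y[i] for i = 0, ..., N-1, with alpha = 0.5 here.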