Related
I found this question about the speed of Structure of Array (SoA) and Array of Struct (AoS) implementation and the top answer states that the speed of each implementation depends on access pattern. To test this, I've created 3 kernels where one is for raw array of doubles, another is for AoS pattern, and the last is for SoA pattern. Each method adds 1.0 to each respective "element" in some column vector if you will. What I'm finding is that the straight array case is more 3x faster than both SoA and AoS implementations. I cannot understand why the straight array is this much faster when the access patterns are identical between all three methods? The CUDA profiler outputs the same message for each kernel which is
The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To further improve performance, work will likely need to be shifted from the most utilized to another unit. Start by analyzing workloads in the Compute Workload Analysis section.
Here is a minimal reproducible example:
#include <iostream>
#include <chrono>
#include <vector>
using namespace std::chrono;
constexpr int nvars = 4;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct AoS {
double t, x, y, z;
__device__ AoS& operator+=(const AoS &vec) {
t += vec.t;
x += vec.x;
y += vec.y;
z += vec.z;
return *this;
};
};
struct SoA {
size_t size;
bool host_allocated, device_allocated;
double* t;
double* x;
double* y;
double* z;
double* gpu_t;
double* gpu_x;
double* gpu_y;
double* gpu_z;
SoA* device_ptr;
SoA(int elements) : size(elements * sizeof(double)), host_allocated(false), device_allocated(false) {};
~SoA(){
if (host_allocated){
free(t);
free(x);
free(y);
free(z);
}
if (device_allocated) {
cudaFree(gpu_t);
cudaFree(gpu_x);
cudaFree(gpu_y);
cudaFree(gpu_z);
cudaFree(device_ptr);
}
}
void host_allocate() {
t = (double*)malloc(size);
x = (double*)malloc(size);
y = (double*)malloc(size);
z = (double*)malloc(size);
host_allocated = true;
};
void device_allocate() {
if (!host_allocated) {
host_allocate();
}
gpuErrchk(cudaMalloc((void**)&device_ptr, sizeof(SoA)));
gpuErrchk(cudaMalloc((void**)&gpu_t, size));
gpuErrchk(cudaMalloc((void**)&gpu_x, size));
gpuErrchk(cudaMalloc((void**)&gpu_y, size));
gpuErrchk(cudaMalloc((void**)&gpu_z, size));
device_allocated = true;
};
void copy_to_device() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(gpu_t, t, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_x, x, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_y, y, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_z, z, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->t), &gpu_t, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->x), &gpu_x, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->y), &gpu_y, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->z), &gpu_z, sizeof(double *), cudaMemcpyHostToDevice));
}
void copy_to_host() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(t, gpu_t, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(x, gpu_x, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(y, gpu_y, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(z, gpu_z, size, cudaMemcpyDeviceToHost));
}
SoA* get_ptr(){
if (device_allocated) {
return device_ptr;
}
return this;
}
};
__global__ void arr_add(double* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
#pragma unroll
for (int var = 0; var < nvars; var++) {
vec[ii + var] = vec[ii + var] + 1.0;
}
};
__global__ void aos_add(AoS* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec[ii] += AoS{1.0, 1.0, 1.0, 1.0};
};
__global__ void soa_add(SoA *vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec->t[ii] = vec->t[ii] + 1.0;
vec->x[ii] = vec->x[ii] + 1.0;
vec->y[ii] = vec->y[ii] + 1.0;
vec->z[ii] = vec->z[ii] + 1.0;
};
int main() {
constexpr int n = 1 << 25;
constexpr int block_size = 128;
high_resolution_clock::time_point t1, t2;
duration<double> dt1, dt2, dt3;
SoA hybrid_soa(n);
hybrid_soa.device_allocate();
hybrid_soa.copy_to_device();
std::vector<AoS> host_vec_aos(n);
std::vector<double> host_arr(n * nvars);
AoS *dev_aos;
double *dev_arr;
gpuErrchk(cudaMalloc((void**)&dev_aos, n * sizeof(AoS)));
gpuErrchk(cudaMalloc((void**)&dev_arr, n * nvars * sizeof(double)));
gpuErrchk(cudaMemcpy(dev_aos, host_vec_aos.data(), n * sizeof(AoS), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(dev_arr, host_arr.data(), n * nvars * sizeof(double), cudaMemcpyHostToDevice));
const int nblocks = (n + block_size - 1) / block_size;
std::cout << "Size of AoS struct is " << sizeof(AoS) << " bytes" << "\n";
std::cout << "Size of SoA struct is " << sizeof(SoA) << " bytes" << "\n";\
t1 = high_resolution_clock::now();
arr_add<<<nblocks, block_size>>>(dev_arr, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt1 = t2 - t1;
std::cout << "SrA took: " << std::scientific << dt1.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
aos_add<<<nblocks, block_size>>>(dev_aos, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt2 = t2 - t1;
std::cout << "AoS took: " << std::scientific << dt2.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
soa_add<<<nblocks, block_size>>>(hybrid_soa.get_ptr(), n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt3 = t2 - t1;
std::cout << "SoA took: " << std::scientific << dt3.count() << " seconds" << "\n";
gpuErrchk(cudaMemcpy(host_vec_aos.data(), dev_aos, n * sizeof(AoS), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(host_arr.data(), dev_arr, n * nvars * sizeof(double), cudaMemcpyDeviceToHost));
hybrid_soa.copy_to_host();
std::cout << "Straight array is: " << dt2.count() / dt1.count() << " times faster than AoS" << "\n";
std::cout << "Straight array is: " << dt3.count() / dt1.count() << " times faster than SoA" << "\n";
cudaFree(dev_aos);
cudaFree(dev_arr);
return 0;
}
I was expecting the speed to be near identical.
I'm currently struggling to properly work with 2D arrays within my CUDA kernel. 1D was fine but so far had no luck with it moving on to 2D. Here is my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
And below my example to test it. It prints out the first 10 values of C correctly but all others remain 0. I believe the problem is within the kernel. Where it can't find the correct values due to the pitch, but not sure how to solve it correctly though.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, be we will keep this example simple).
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
This is my first time working with cuda. I am running some calculations involving cufft and two simple kernels on an NxNxN mesh (N=128). It seems to work fine until some time between 4040 and 4050 loops, the values of my mesh points become nan. On a smaller mesh, it can complete more loops before failing. This makes me think there is a memory leak somewhere. I tried running cuda-memcheck but it returned no errors. Can you spot any problems that could be causing this? I have reduced the code to a minimum but it is still long, my apologies. Thank you for your help.
#define _USE_MATH_DEFINES
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data3[i].x = pow(data[i].x, 3);
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
data[i].y = 0;
}
__syncthreads();
}
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
for (float t = 0; t < tmax; t += dt) {
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
Here is my instrumentation of your code so far. When I enabled the device assert in my_assert, it indicated that d_data3 input at the nan5 point was failing (i.e. it was nan). That indicated that the cufftExecC2C call on d_data3 immmediately prior was producing nan data. If you have invalid inputs, I believe an FFT can produce out-of-range results.
The code is instrumented to allow you to dump the data and look at it. You will have to modify dump_data to display whatever it is you wish to see.
When I run the code below, it eventually prints out:
4850.14
4851.14
4852.14
4853.14
4854.14
4855.14
4856.14
4857.14
4858.14
4859.14
4860.14
d_data3 output nan check failed
$
So the nan first occurs on iteration 4860, and the d_data3 input check did not fail, so the nan occurs in d_data3 as a result of the FFT operation in loop iteration 4860. You'll need to study the input and output data to see if you can determine why. There may be some modification to the d_data3 data in the Cube kernel that is causing this. For example, since you are repetitively cubing the data, doesn't it seem reasonable at some point that it would exceed float range?
Here's my instrumented code:
#include <iostream>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <assert.h>
#include <stdio.h>
using namespace std;
__host__ __device__ void my_assert(bool cond){
//assert(cond);
}
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp = data[i].x;
if (isnan(temp)) {printf("nan1: %d\n", i); my_assert(0);}
data3[i].x = pow(data[i].x, 3);
if (isnan(data3[i].x)) {printf("nan2: %d %f\n", i, data[i].x); my_assert(0);}
data3[i].y = 0;
}
__syncthreads();
}
__global__ void Spectral (cufftComplex *data, cufftComplex *data3, float *w, float *v, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n){
float temp1 = w[i];
if (isnan(temp1)) {printf("nan3: %d\n", i); my_assert(0);}
float temp2 = data[i].x;
if (isnan(temp2)) {printf("nan4: %d\n", i); my_assert(0);}
float temp3 = data3[i].x;
if (isnan(temp3)) {printf("nan5: %d\n", i); my_assert(0);}
float temp4 = v[i];
if (isnan(temp4)) {printf("nan6: %d\n", i); my_assert(0);}
data[i].x = (w[i] * data[i].x + data3[i].x * v[i]) / n;
if (isnan(data[i].x)) {printf("nan7: %d, %f, %f, %f, %f, %d\n",i, temp1, temp2, temp3, temp4, n); my_assert(0);}
data[i].y = 0;
}
__syncthreads();
}
__global__ void nan_kernel(cufftComplex *d, int len, bool *res){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len)
if (isnan(d[idx].x) || isnan(d[idx].y)) *res = true;
}
bool *d_nan;
bool checknan(cufftComplex *d, int len){
bool h_nan = false;
cudaMemcpy(d_nan, &h_nan, sizeof(bool), cudaMemcpyHostToDevice);
nan_kernel<<<(len/1024)+1, 1024>>>(d, len, d_nan);
cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost);
return h_nan;
}
void dump_data(cufftComplex *d1, cufftComplex *d2, int len)
{
// add code here to spit out the data however you would like to see it
// perhaps to a file
std::cout << "input: output: " << std::endl;
for (int i = 0; i < len; i++)
std::cout << d1[i].x << "," << d1[i].y << " " << d2[i].x << "," << d2[i].y << std::endl;
};
float ran();
int main (int argc, char **argv) {
float QQ, C;
float tmax = 5000;
int N = 128;
int n = N*N*N;
float dn = M_PI/8;
float dt = .075;
float psi0 = -0.175;
float r = -0.1;
tmax *= dt;
//setup cuda complex arrays
int mem_size = sizeof(cufftComplex)*n;
int float_mem_size = sizeof(float)*n;
cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data;
cudaMalloc((void**)&d_data, mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cufftComplex *d_data3;
cudaMalloc((void**)&d_data3, mem_size);
float * h_w = (float*)malloc(float_mem_size);
float *d_w;
cudaMalloc(&d_w, float_mem_size);
float * h_v = (float*)malloc(float_mem_size);
float *d_v;
cudaMalloc(&d_v, float_mem_size);
for (int i=0; i<n; i++){
h_data[i].x = psi0 + r * ran();
h_data[i].y = 0;
}
int nx, ny, nz;
float B = -4 * M_PI * M_PI / ( pow((N*dn),2));
for (int i=0; i<n; i++){
nx = (i % N);
ny = (i / N) % N;
nz = i / (N * N);
if (nx > (N / 2)) {
nx = (N - nx);
}
if (ny > (N / 2)) {
ny = (N - ny);
}
if (nz > (N / 2)) {
nz = (N - nz);
}
QQ = B * (pow(nx, 2.0) + pow(ny, 2.0) + pow(nz, 2.0));
C = -r - 2.0 * QQ - pow(QQ, 2.0);
h_w[i] = exp(QQ * (1.0 - C) * dt);
h_v[i] = (h_w[i] - 1.0) / (1.0 - C);
}
cudaMemcpy(d_w, h_w, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_v, h_v, float_mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, N, N, N, CUFFT_C2C);
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = n/maxThreads;
cufftResult res;
cudaMalloc(&d_nan, sizeof(bool));
cufftComplex *i3, *o3;
i3 = (cufftComplex *)malloc(mem_size);
o3 = (cufftComplex *)malloc(mem_size);
std::cout << "start loop" << std::endl;
for (float t = 0; t < tmax; t += dt) {
std::cout << t/dt << std::endl;
Cube <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, n);
cudaDeviceSynchronize();
cudaMemcpy(i3, d_data3, mem_size, cudaMemcpyDeviceToHost);
if (checknan(d_data3, n)) {std::cout << "d_data3 input nan check failed" << std::endl; return -1;}
res = cufftExecC2C(plan, d_data3, d_data3, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft1 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
if (checknan(d_data3, n)) {std::cout << "d_data3 output nan check failed" << std::endl; cudaMemcpy(o3, d_data3, mem_size, cudaMemcpyDeviceToHost); dump_data(i3, o3, n); return -1;}
res = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {std::cout << "cufft2 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
Spectral <<<numBlocks, threadsPerBlock>>> (d_data, d_data3, d_w, d_v, n);
cudaDeviceSynchronize();
res = cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE);
if (res != CUFFT_SUCCESS) {std::cout << "cufft3 error: " << (int)res << " , " << t/dt << std::endl; return 1;}
cudaDeviceSynchronize();
}
//check output (should be a number)
cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost);
cout <<h_data[0].x <<endl;
cudaError_t cres = cudaGetLastError();
if (cres != cudaSuccess) std::cout << "cuda error: " << cudaGetErrorString(cres) << std::endl;
//clean up
cufftDestroy(plan);
cudaFree(d_data);
cudaFree(d_data3);
cudaFree(d_w);
cudaFree(d_v);
free(h_w);
free(h_v);
free(h_data);
free(h_data3);
return 0;
}
float ran(){ //random in range [-1,1]
float u= float (rand())/(RAND_MAX);
//return round(u);
return 2*u-1;
}
EDIT:
After some addition of printout code to dump_data (see modification above) I see this:
...
4859.14
4860.14
d_data3 output nan check failed
input: output:
3.37127e+19,0 nan,nan
3.21072e+19,0 nan,nan
2.76453e+19,0 nan,nan
2.13248e+19,0 nan,nan
1.44669e+19,0 nan,nan
8.37214e+18,0 nan,nan
3.93645e+18,0 nan,nan
1.35501e+18,0 nan,nan
2.55741e+17,0 nan,nan
5.96468e+15,0 nan,nan
-1.36656e+16,0 nan,nan
-2.33688e+17,0 nan,nan
-8.37407e+17,0 nan,nan
-1.79915e+18,0 nan,nan
-2.96302e+18,0 nan,nan
-4.11485e+18,0 nan,nan
-5.03876e+18,0 nan,nan
-5.57617e+18,0 nan,nan
-5.65307e+18,0 nan,nan
-5.28957e+18,0 nan,nan
-4.5872e+18,0 nan,nan
-3.68309e+18,0 nan,nan
...
I'm not an FFT expert, but it might be the case that if you do an FFT on a large array filled with large values, using float precision, that overflow may occur. If you only need to get to 5000 iterations and you're failing at 4860, you might get there if you change all your datatypes to double from float, but I'm not sure about the numerical sense of what you are doing here.
Finally, note that both cufft and fftw perform un-normalized transforms. This may be playing a role in the seeming growth of magnitudes in your data set. As I stated already, I'm not familiar with the arithmetic or algorithm you are trying to implement here.
Is it possible that you have a float underflow happening around iteration 4040? Taking the cube of your data3 would lead me to check out that possibility. It is pretty easy to spiral into an underflow on a float32 if your not careful. You could throw a check in there to limit your value to some minimum epsilon to prevent this.
I am new to cuda and trying to write a little code which should generate random points on a sphere. Here is the code.
__global__
void setup_kernel(curandStateMRG32k3a *state)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
int threads = 256;
dim3 grid = size / threads;
setup_kernel<<<grid,threads>>>(devStates);
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
This code works if the number of elements in the vectors is less than 4000 (I tried 1K,2K,3K and 4K). Than it gives me cuda Error Illegal Address in the first cudaMemcpy. I don't think I run out of memory, I am working with gtx 980 (4GB of global memory). Any idea how to fix this?
EDIT: The code after the suggested modifications is the following:
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if(id < numberOfElements) curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
int threads = 512;
dim3 grid = (h_x.size() + threads - 1) / threads;
setup_kernel<<<grid,threads>>>(devStates, size / sizeof(float));
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
cudaDeviceSynchronize();
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
I feel sorry for keeping posting here but I think by understanding what are my mistakes now I think I might get a better understanding of cuda.
So, now I am getting errorIllegalAdress on cudaMemcpy device->host when h_x.size() is 20k. I still do not understand how the code works for small numbers but not for big ones.
The problem is here:
size_t size = h_x.size() * sizeof(float);
...
int threads = 256;
dim3 grid = size / threads;
Your size variable is scaled by the number of bytes. So that is not the correct variable to use for the grid size. You should compute the grid size like this:
dim3 grid = h_x.size() / threads;
or similar. Also note that this construct won't properly initialize all curand state unless the vector length (h_x.size()) is evenly divisible by threads i.e. 256. The method to address this would be to include a thread check in your setup_kernel similar to the one in your other kernel:
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id < size)
curand_init(0, id, 0, &state[id]);
}
and launch enough threads to cover the vector size:
dim3 grid = (h_x.size()+threads-1) / threads;
I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D among many others including NVIDIA docs and I can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix in an array format (Matrix[width*height]) along with a simple array to perform Matrix * vector operations. It is not optional for me to use cudaMallocPitch in order to avoid conflicts and have a better performance.
So, I started by just trying to copy the matrix (vector in my case) to the device and check if it was correctly copied but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < N)
{
double e = *(double *)(((char *) A + idx * mpitch) + N);
printf("(%f)", e);
}
}
int main()
{
int N = 1500;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaFree(d_A);
delete [] A;
return 0;
}
Error checking can make a big difference in debugging. You should always use it before coming here.
It wasn't clear if you wanted a row or column vector i.e. a matrix of [1xN] or [Nx1]
I've added an explanation on Talomnies suggestion, but first the 'working slabs of code'
Here's [Nx1]
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
double e = *(double *)(((char *) A + idx * mpitch));
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
[1xN]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
int row=0;//only one row
double *row_ptr = (double *)( (char *) (A + mpitch * row) );
double e = row_ptr[idx];
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
Explanation
Firslty, Error Handling:
Considering how easy error handling is in CUDA there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.
The reoccurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so lets walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).
Allocating:
Your allocation was written out as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
This is correct for the row vector as the API is cudaMallocPitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) However if we would like to do a column vector correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
Accessing:
For accessing you were mixing up accessing a row, and accessing an element in the row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Once again stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
for us this translates into
int column=0;
double element=(double*) ((char*)A + idx * mpitch) + column;
I've used column = 0 for completeness since we do not have more than one column.
Copying:
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
For this case this is correct. API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);