Installing CUDA C++ library?

Apologies for sounding like a complete n00b, but I have learnt that I can call CUDA extension functions from C++ and have the GPU do the calculation. However, I can't seem to find instructions on how to download the library (nor which library I need to download). Strangely enough, I have a great example but I don't know how to get the libraries!
Just so my post is more useful, this is the example I wish to implement:
#define N 512

int main(void) {
    int *a, *b, *c;          // host copies of a, b, c
    int *d_a, *d_b, *d_c;    // device copies of a, b, c
    int size = N * sizeof(int);

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

You can find the CUDA SDK here: Cuda SDK
It wasn't very hard to find, to be honest. If you face this kind of problem in the future, you will usually find the libraries by searching for their name (here, CUDA) followed by "SDK" on Google. It should always be among the first results.
If you want to get started, NVIDIA provides very good documentation in my opinion, as well as a getting-started section that includes an introduction to parallel programming: Getting Started
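Note also that the snippet in your question is incomplete as posted: the add kernel and the random_ints helper are not shown. Minimal versions consistent with the NVIDIA introductory slides it comes from might look like this (a sketch, untested):

// one block per element, matching the add<<<N,1>>> launch in the question
__global__ void add(int *a, int *b, int *c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

// simple stand-in for the random_ints helper (needs <stdlib.h> for rand)
void random_ints(int *p, int n) {
    for (int i = 0; i < n; i++)
        p[i] = rand();
}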

Related

Memcopy multiple gpus in cuda programming [duplicate]

How can I use two devices to improve, for example, the performance of the following code (sum of vectors)?
Is it possible to use more devices "at the same time"?
If yes, how can I manage the allocation of the vectors in the global memory of the different devices?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>

#define NB 32
#define NT 500
#define N NB*NT

__global__ void add( double *a, double *b, double *c);

//===========================================
__global__ void add( double *a, double *b, double *c){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while(tid < N){
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

//============================================
//BEGIN
//===========================================
int main( void ) {
    double *a, *b, *c;
    double *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a=(double *)malloc(N*sizeof(double));
    b=(double *)malloc(N*sizeof(double));
    c=(double *)malloc(N*sizeof(double));

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * sizeof(double) );
    cudaMalloc( (void**)&dev_b, N * sizeof(double) );
    cudaMalloc( (void**)&dev_c, N * sizeof(double) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = (double)i;
        b[i] = (double)i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_b, b, N * sizeof(double), cudaMemcpyHostToDevice);

    for(int i=0;i<10000;++i)
        add<<<NB,NT>>>( dev_a, dev_b, dev_c );

    // copy the array 'c' back from the GPU to the CPU
    cudaMemcpy( c, dev_c, N * sizeof(double), cudaMemcpyDeviceToHost);

    // display the results
    // for (int i=0; i<N; i++) {
    //     printf( "%g + %g = %g\n", a[i], b[i], c[i] );
    // }
    printf("\nGPU done\n");

    // free the memory allocated on the GPU
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    // free the memory allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}
Thank you in advance.
Michele
Since CUDA 4.0 was released, multi-GPU computations of the type you are asking about are relatively easy. Prior to that, you would have needed to use a multi-threaded host application with one host thread per GPU and some sort of inter-thread communication system in order to use multiple GPUs inside the same host application.
Now it is possible to do something like this for the memory allocation part of your host code:
double *dev_a[2], *dev_b[2], *dev_c[2];
const int Ns[2] = {N/2, N-(N/2)};

// allocate the memory on the GPUs
for(int dev=0; dev<2; dev++) {
    cudaSetDevice(dev);
    cudaMalloc( (void**)&dev_a[dev], Ns[dev] * sizeof(double) );
    cudaMalloc( (void**)&dev_b[dev], Ns[dev] * sizeof(double) );
    cudaMalloc( (void**)&dev_c[dev], Ns[dev] * sizeof(double) );
}
(disclaimer: written in browser, never compiled, never tested, use at own risk).
The basic idea here is that you use cudaSetDevice to select between devices when you are performing operations on a device. So in the above snippet, I have assumed two GPUs and allocated memory on each [(N/2) doubles on the first device and N-(N/2) on the second].
The transfer of data from the host to device could be as simple as:
// copy the arrays 'a' and 'b' to the GPUs
for(int dev=0, pos=0; dev<2; pos+=Ns[dev], dev++) {
    cudaSetDevice(dev);
    cudaMemcpy( dev_a[dev], a+pos, Ns[dev] * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_b[dev], b+pos, Ns[dev] * sizeof(double), cudaMemcpyHostToDevice);
}
(disclaimer: written in browser, never compiled, never tested, use at own risk).
The kernel launching section of your code could then look something like:
for(int i=0;i<10000;++i) {
    for(int dev=0; dev<2; dev++) {
        cudaSetDevice(dev);
        add<<<NB,NT>>>( dev_a[dev], dev_b[dev], dev_c[dev], Ns[dev] );
    }
}
(disclaimer: written in browser, never compiled, never tested, use at own risk).
Note that I have added an extra argument to your kernel call, because each instance of the kernel may be called with a different number of array elements to process. I will leave it to you to work out the modifications required.
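For example, the reworked kernel might look something like this (same disclaimer: written in browser, never compiled, never tested, use at own risk):

__global__ void add( double *a, double *b, double *c, int n){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while(tid < n){
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}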
But, again, the basic idea is the same: use cudaSetDevice to select a given GPU, then run kernels on it in the normal way, with each kernel getting its own unique arguments.
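Copying the results back to the host then mirrors the host-to-device loop (disclaimer: written in browser, never compiled, never tested, use at own risk):

// copy each device's portion of 'c' back to the correct offset on the host
for(int dev=0, pos=0; dev<2; pos+=Ns[dev], dev++) {
    cudaSetDevice(dev);
    cudaMemcpy( c+pos, dev_c[dev], Ns[dev] * sizeof(double), cudaMemcpyDeviceToHost);
}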
You should be able to put these parts together to produce a simple multi-GPU application. There are a lot of other features in recent CUDA versions and hardware which can assist multi-GPU applications (such as unified addressing and the peer-to-peer facilities), but this should be enough to get you started. There is also a simple multi-GPU application in the CUDA SDK you can look at for more ideas.

CUDA kernel returns nothing

I'm using CUDA Toolkit 8 with Visual Studio Community 2015. When I try the simple vector addition from NVIDIA's PDF manual (minus the error checking, which I don't have the *.h's for), the output always comes back as undefined values, which means the output array was never filled. When I pre-fill it with 0's, that's all I get at the end.
Others have had this problem, and some people are saying it's caused by compiling for the wrong compute capability. However, I am using an NVIDIA GTX 750 Ti, which is supposed to be compute capability 5.0. I have tried compiling for compute capability 2.0 (the minimum for my SDK) and 5.0.
I also cannot make any of the precompiled examples work, such as vectoradd.exe which says, "Failed to allocate device vector A (error code initialization error)!" And oceanfft.exe says, "Error unable to find GLSL vertex and fragment shaders!" which doesn't make sense because GLSL and fragment shading are very basic features.
My driver version is 361.43 and other apps such as Blender Cycles in CUDA mode and Stellarium work perfectly.
Here is the code that should work:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#define N 10
__global__ void add(int *a, int *b, int *c) {
int tid = blockIdx.x; // handle the data at this index
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main(void) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
// fill the arrays 'a' and 'b' on the CPU
for (int i = 0; i<N; i++) {
a[i] = -i;
b[i] = i * i;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy(dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice);
add << <N, 1 >> >(dev_a, dev_b, dev_c);
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy(c, dev_c, N * sizeof(int),cudaMemcpyDeviceToHost);
// display the results
for (int i = 0; i<N; i++) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
// free the memory allocated on the GPU
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
I'm trying to develop CUDA apps so any help would be greatly appreciated.
This was apparently caused by using a driver version incompatible with the CUDA 8 toolkit. Installing the driver distributed with the version 8 toolkit solved the problem.
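For future reference, CUDA error checking does not require any helper headers from the PDF manual; only the runtime API is needed. A minimal sketch (the CHECK name is arbitrary, and exit needs <stdlib.h>):

#define CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage:
CHECK(cudaMalloc((void**)&dev_a, N * sizeof(int)));
add<<<N, 1>>>(dev_a, dev_b, dev_c);
CHECK(cudaGetLastError());        // catches launch errors
CHECK(cudaDeviceSynchronize());   // catches errors during kernel execution

With checks like these in place, the incompatible driver would have been reported on the very first CUDA call instead of failing silently.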
[Answer assembled from comments and added as a community wiki entry to get the question off the unanswered queue for the CUDA tag]

C++ class dll with CUDA member?

I have a C++ class-based dll. I'd like to convert some of the class members to CUDA-based operations.
I am using VS2012, Windows 7, CUDA 6.5, sm_20.
Say the original SuperProjector.h file looks like this:
class __declspec(dllexport) SuperProjector
{
public:
    SuperProjector(){};
    ~SuperProjector(){};
    void sumVectors(float* c, float* a, float* b, int N);
};
and the original sumVectors() function in SuperProjector.cpp:
void SuperProjector::sumVectors(float* c, float* a, float* b, int N)
{
    for (int n = 0; n < N; n++)
        c[n] = a[n] + b[n];
}
I am stuck on how I should convert sumVectors() to CUDA. Specifically:
I read some posts saying that adding the __global__ or __device__ keywords in front of class members will work, but do I then need to change the suffix of the .cpp file to .cu?
I also tried to create a CUDA project from scratch, but it seems VS2012 does not give me the option of creating a dll once I choose to create a CUDA project.
I am very confused about the best way to convert some of the members of this C++ class-based dll into CUDA kernel functions. I would appreciate it if anyone could offer some ideas, or better, some very simple examples.
Create a CUDA project, let's call it cudaSuperProjector, and add two files: cudaSuperProjector.cu and cudaSuperProjector.h.
cudaSuperProjector.h
class __declspec(dllexport) cudaSuperProjector {
public:
    cudaSuperProjector(){ }
    ~cudaSuperProjector(){ }
    void sumVectors(float* c, float* a, float* b, int N);
};
cudaSuperProjector.cu
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cudaSuperProjector.h"

__global__ void addKernel(float *c, const float *a, const float *b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int size) {
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(float));

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice);

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost);

    // Free the device buffers.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return cudaStatus;
}

void cudaSuperProjector::sumVectors(float* c, float* a, float* b, int N) {
    cudaError_t cudaStatus = addWithCuda(c, a, b, N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSuperProjector::sumVectors failed!");
    }
}
Note: in the properties of the file cudaSuperProjector.cu, the Item Type should be CUDA C/C++.
Go to the properties of the project and, under General, set the value of Configuration Type to Dynamic Library (.dll). Everything needed to create the library is now ready. Compile the project, and in the output folder you will find cudaSuperProjector.dll and cudaSuperProjector.lib. Create a directory cudaSuperProjector\lib and copy cudaSuperProjector.dll and cudaSuperProjector.lib there. Also create cudaSuperProjector\include and copy cudaSuperProjector.h into it.
Create another Visual C++ project, let's call it SuperProjector, and add the file SuperProjector.cpp to it.
SuperProjector.cpp
#include <stdio.h>
#include "cudaSuperProjector/cudaSuperProjector.h"

int main(int argc, char** argv) {
    float a[6] = { 0, 1, 2, 3, 4, 5 };
    float b[6] = { 1, 2, 3, 4, 5, 6 };
    float c[6] = { };

    cudaSuperProjector csp;
    csp.sumVectors(c, a, b, 6);

    printf("c = {%f, %f, %f, %f, %f, %f}\n",
           c[0], c[1], c[2], c[3], c[4], c[5]);
    return 0;
}
In the properties of the project, add the path to the dll and lib files under VC++ Directories -> Library Directories (for example D:\cudaSuperProjector\lib;), and add the path to the header under VC++ Directories -> Include Directories (for example D:\cudaSuperProjector\include;). Then go to Linker -> Input and add cudaSuperProjector.lib;.
Now your project should compile fine, but when you run it, it will show the error
The program can't start because cudaSuperProjector.dll is missing from
your computer. Try reinstalling the program to fix this problem.
You need to copy cudaSuperProjector.dll to the output folder of the project, so it will be under the same folder as SuperProjector.exe. You can do it manually or add
copy D:\cudaSuperProjector\lib\cudaSuperProjector.dll $(SolutionDir)$(Configuration)\
in Build Events -> Post-Build Events -> Command Line, where $(SolutionDir)$(Configuration)\ is the output path for the solution (see Configuration Properties -> General -> Output Directory).

CUDA cudaMemCpy doesn't appear to copy despite CudaSuccess

I'm just starting with CUDA and this is my very first project. I've searched for this issue, and while other people have had similar problems, none of the suggestions seemed relevant to my specific issue or helped in my case.
As an exercise, I'm trying to write an n-body simulation using CUDA. At this stage I'm not interested in whether my specific implementation is efficient; I'm just looking for something that works, and I can refine it later. I'll also need to update the code later, once it's working, to work on my SLI configuration.
Here's a brief outline of the process:
Create X and Y position, velocity, acceleration vectors.
Create same vectors on GPU and copy values across
In a loop: (i) calculate acceleration for the iteration, (ii) apply acceleration to velocities and positions, and (iii) copy positions back to host for display.
(Display not implemented yet. I'll do this later)
Don't worry about the acceleration calculation function for now, here is the update function:
__global__ void apply_acc(double* pos_x, double* pos_y, double* vel_x, double* vel_y, double* acc_x, double* acc_y, int N)
{
    int i = threadIdx.x;
    if (i < N);
    {
        vel_x[i] += acc_x[i];
        vel_y[i] += acc_y[i];
        pos_x[i] += vel_x[i];
        pos_y[i] += vel_y[i];
    }
}
And here's some of the code in the main method:
cudaError t;
t = cudaMalloc(&d_pos_x, N * sizeof(double));
t = cudaMalloc(&d_pos_y, N * sizeof(double));
t = cudaMalloc(&d_vel_x, N * sizeof(double));
t = cudaMalloc(&d_vel_y, N * sizeof(double));
t = cudaMalloc(&d_acc_x, N * sizeof(double));
t = cudaMalloc(&d_acc_y, N * sizeof(double));

t = cudaMemcpy(d_pos_x, pos_x, N * sizeof(double), cudaMemcpyHostToDevice);
t = cudaMemcpy(d_pos_y, pos_y, N * sizeof(double), cudaMemcpyHostToDevice);
t = cudaMemcpy(d_vel_x, vel_x, N * sizeof(double), cudaMemcpyHostToDevice);
t = cudaMemcpy(d_vel_y, vel_y, N * sizeof(double), cudaMemcpyHostToDevice);
t = cudaMemcpy(d_acc_x, acc_x, N * sizeof(double), cudaMemcpyHostToDevice);
t = cudaMemcpy(d_acc_y, acc_y, N * sizeof(double), cudaMemcpyHostToDevice);

while (true)
{
    calc_acc<<<1, N>>>(d_pos_x, d_pos_y, d_vel_x, d_vel_y, d_acc_x, d_acc_y, N);
    apply_acc<<<1, N>>>(d_pos_x, d_pos_y, d_vel_x, d_vel_y, d_acc_x, d_acc_y, N);

    t = cudaMemcpy(pos_x, d_pos_x, N * sizeof(double), cudaMemcpyDeviceToHost);
    t = cudaMemcpy(pos_y, d_pos_y, N * sizeof(double), cudaMemcpyDeviceToHost);

    std::cout << pos_x[0] << std::endl;
}
Every loop iteration, cout writes the same value: whatever random value the position arrays held when they were originally created. If I change the code in apply_acc to something like:
__global__ void apply_acc(double* pos_x, double* pos_y, double* vel_x, double* vel_y, double* acc_x, double* acc_y, int N)
{
    int i = threadIdx.x;
    if (i < N);
    {
        pos_x[i] += 1.0;
        pos_y[i] += 1.0;
    }
}
then it still gives the same value, so either apply_acc isn't being called or the cudaMemcpy isn't copying the data back.
All the cudaMalloc and cudaMemcpy calls return cudaSuccess.
Here's a PasteBin link to the complete code. It should be fairly simple to follow as there's a lot of repetition for the various arrays.
Like I said, I've never written CUDA code before, and I wrote this based on the #2 CUDA example video from NVIDIA, where the presenter writes the parallel array addition code. I'm not sure if it makes any difference, but I'm using two GTX 970s with the latest NVIDIA drivers and CUDA 7.0 RC, and I chose not to install the bundled drivers when installing CUDA as they were older than what I had.
This won't work:
const int N = 100000;
...
calc_acc<<<1, N>>>(...);
apply_acc<<<1, N>>>(...);
The second parameter of a kernel launch config (<<<...>>>) is the threads-per-block parameter. It is limited to either 512 or 1024, depending on how you are compiling. These kernels will not launch, and the type of error this produces needs to be caught with correct CUDA error checking. Simply looking at the return values of subsequent CUDA API calls will not indicate the presence of this type of error (which is why you are seeing cudaSuccess afterwards).
Regarding the concept itself, I suggest you learn more about the CUDA thread and block hierarchy. To launch a large number of threads, you need to use both parameters of the kernel launch config (i.e. neither of the first two parameters should be 1). This is usually advisable from a performance perspective as well; a sketch of such a launch follows.
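For this code, that might look like the following (illustrative values, not compiled; calc_acc/apply_acc and the d_* pointers are the ones from the question):

const int N = 100000;
const int threadsPerBlock = 256;  // safely under the 512/1024 per-block limit
const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

calc_acc<<<blocks, threadsPerBlock>>>(d_pos_x, d_pos_y, d_vel_x, d_vel_y, d_acc_x, d_acc_y, N);
apply_acc<<<blocks, threadsPerBlock>>>(d_pos_x, d_pos_y, d_vel_x, d_vel_y, d_acc_x, d_acc_y, N);

with the global index inside each kernel computed from both values, and the i < N guard kept (note it must not be followed by a stray semicolon, as it is in the posted apply_acc, or the guard has no effect):

int i = blockIdx.x * blockDim.x + threadIdx.x;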

Cuda "invalid argument" 2d array - Cellular automata

I'm trying to calculate a 2D cellular automata redistribution using CUDA. I'm completely new to it, so I have no idea what I'm doing wrong. I've tried many solutions that I've seen here, but they all give "invalid argument" when I call the kernel.
Here is a simplified version of the kernel:
//kernel definition
__global__ void stepCalc(float B[51][51], int L, int flag, float m, float en)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
    flag = 0;
    if (i < L-2 && j < L-2 && i>2 && j>2 && abs(g)>m)
    {
        flag = 1;
        en+=-16*g*g+8*B[i][j]*abs(g);
        B[i][j]+=-4*f*g;
        B[i+1][j]+=f*g;
        B[i-1][j]+=f*g;
        B[i][j+1]+=f*g;
        B[i][j-1]+=f*g;
    }
}
The main function looks like this:
#define L 50
float B[L+1][L+1];
//initialize B[i][j]
float g=0;
int flag = 1;
float m=0.1;
float en = 0;

while (flag==1)
{
    float (*dB)[L+1];
    int *dFlag=NULL;
    float *dEn=NULL;
    cudaMalloc((void **)&dFlag,sizeof(int));
    cudaMalloc((void **)&dEn,sizeof(float));
    cudaMalloc((void **)&dB, ((L+1)*(L+1))*sizeof(float));
    cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
    cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dEn, &en, sizeof(float), cudaMemcpyDeviceToHost);

    dim3 threadsPerBlock(16,16);
    dim3 numBlocks((L+1)/threadsPerBlock.x,(L+1)/threadsPerBlock.y);
    stepCalc<<<numBlocks, threadsPerBlock>>>(dB, L, dFlag, m, dEn);
    GPUerrchk(cudaPeekAtLastError()); //gives "invalid argument" at this line

    cudaMemcpy(B, (dB), sizeB, cudaMemcpyDeviceToHost);
    cudaMemcpy(&flag, dFlag, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&en, dEn, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dB);
    cudaFree(dFlag);
    cudaFree(dEn);
}
I need to extract the new array B, the flag value, and the sum 'en' over all threads. Am I even close to how a solution should look? Is it even possible? I've also tried making the host array B a float** with no luck.
There are various problems with your code.
You may be overlooking the difference between passing a value to a kernel and passing a pointer:
__global__ void stepCalc(float B[51][51], int L, int flag, float m, float en)
                               ^                      ^
                               |                      |
                           a pointer               a value
We'll come back to B in a moment, but for values like flag and en, passing these by value to a kernel has similar implications to passing by value to a C function: it is a one-way communication path. Since it's evident from your code that you want to use the values modified by the kernel later in host code, you will need to pass pointers instead. In a few cases, you have already allocated pointers for this purpose, so you have an additional type of error in that in some cases (dFlag) you are passing a pointer whereas the kernel definition expects a value.
Regarding B, passing a 2D array from host to device can be more difficult than you might initially expect, due to the deep copy problem. Without covering all that ground here, search on "CUDA 2D array" in the upper right hand corner of this page, and you'll get a lot of information about it and various ways to deal with it. Since you seem to be willing to consider an array of fixed width (known at compile-time), we can simplify the handling of a 2D array by leveraging the compiler to help us with a particular typedef.
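Concretely, the trick as used in the reworked code at the end of this answer looks like this (a sketch; my_L is the compile-time dimension used below):

typedef float farray[my_L+1];   // one row of the grid; the width is known at compile time

farray *dB;                     // device pointer to an array of such rows
cudaMalloc((void **)&dB, (my_L+1)*sizeof(farray));

// a kernel parameter declared as "farray B[]" can then be indexed as B[i][j]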
When you're having trouble with a CUDA code, it's good practice to do rigorous CUDA error checking throughout your code, not just in one place. One reason for this is that a CUDA error incurred at a particular place will often be returned at any subsequent place in the code. This makes things confusing if you don't check every CUDA API call, as a particular "invalid argument" error might not be due to the kernel itself, but to some API call that occurred previously.
You typically don't want cudaMalloc operations in a data-processing while loop. These are normally operations you do once, at the beginning of your code. Doing the cudaMalloc at each iteration of the while loop has several negative effects: you may eventually run out of memory (although you have cudaFree statements, so perhaps not), you are effectively throwing away your data at each iteration, and it will negatively impact your performance.
You have some of your cudaMemcpy transfer directions wrong, like here:
cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyDeviceToHost);
Setting flag to zero in your kernel code will be problematic. Warps can execute in any order, and after some warps have already set flag to 1 later in the kernel, other warps could begin executing and set flag to zero again. This is probably not what you want. One possible fix is to set flag to zero before executing the kernel (i.e. in host code, and copy it to the device).
Your kernel will generate out-of-bounds indexing here:
float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
(just ask yourself what happens when i=0 and j=0). The fix is to move that line of code inside the bounds-checking if statement that follows it.
Your kernel uses a variable f which is defined nowhere that I can see, for example here:
B[i+1][j]+=f*g;
The following code is my attempt to rework your code, create a complete example, and remove the above issues. It doesn't do anything useful, but it compiles without errors and runs without errors for me. I haven't provided any data, so it's just a proof-of-concept at this point. I'm sure it still contains data processing errors.
#include <stdio.h>

#define my_L 50
typedef float farray[my_L+1];

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

//kernel definition
__global__ void stepCalc(farray B[], int L, int *flag, float m, float *en)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    //float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
    // flag = 0;
    float f = 1.0f;
    if (i < L-2 && j < L-2 && i>2 && j>2){
        float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
        if (abs(g)>m)
        {
            *flag = 1;
            *en+=-16*g*g+8*B[i][j]*abs(g);
            B[i][j]+=-4*f*g;
            B[i+1][j]+=f*g;
            B[i-1][j]+=f*g;
            B[i][j+1]+=f*g;
            B[i][j-1]+=f*g;
        }
    }
}

int main(){
    farray B[my_L+1];
    //initialize B[i][j]
    farray *dB;
    int flag = 1;
    float m=0.1;
    float en = 0;
    int *dFlag=NULL;
    float *dEn=NULL;

    cudaMalloc((void **)&dFlag,sizeof(int));
    cudaCheckErrors("1");
    cudaMalloc((void **)&dEn,sizeof(float));
    cudaCheckErrors("2");
    size_t sizeB = (my_L+1)*sizeof(farray);
    cudaMalloc((void **)&dB, sizeB);
    cudaCheckErrors("3");
    cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
    cudaCheckErrors("4");
    cudaMemcpy(dEn, &en, sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("5");

    dim3 threadsPerBlock(16,16);
    dim3 numBlocks((my_L+1)/threadsPerBlock.x,(my_L+1)/threadsPerBlock.y);

    while (flag==1)
    {
        flag = 0;
        cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("6");
        stepCalc<<<numBlocks, threadsPerBlock>>>(dB, my_L, dFlag, m, dEn);
        cudaDeviceSynchronize();
        cudaCheckErrors("7");
        cudaMemcpy(&flag, dFlag, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("8");
    }

    cudaMemcpy(B, (dB), sizeB, cudaMemcpyDeviceToHost);
    cudaCheckErrors("9");
    cudaMemcpy(&en, dEn, sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("10");
    // process B

    cudaFree(dB);
    cudaFree(dFlag);
    cudaFree(dEn);
}
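(One caveat worth repeating: neighbouring threads in stepCalc update overlapping elements of B and accumulate into *en without any atomics or synchronization, so the kernel above still contains data races. Treat it as a structural template for the allocation, launch, and flag-handling pattern rather than as a numerically correct solver.)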