I've been spending a lot of time trying to figure out the cause of this problem. The following code attempts to generate a sequence of normally distributed random variables using curand on the device. It seems to generate a few successfully, but then crashes with an "an illegal memory access was encountered" error. Any help is much appreciated.
main.cu
#include <stdio.h>
#include <cuda.h>
#include <curand_kernel.h>
class A {
public:
__device__ A(const size_t& seed) {
printf("\nA()");
curandState state;
curand_init(seed, 0, 0, &state);
for(size_t i = 0; i < 1000; ++i)
printf("\n%f", curand_normal(&state));
}
__device__ ~A() { printf("\n~A()"); }
};
/// Kernel
__global__ void kernel(const size_t& seed) {
printf("\nHello from Kernel...");
A a(seed);
return;
}
int main(void) {
kernel<<<1,1>>>(1);
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)
printf("kernel launch failed with error \"%s\".\n",
cudaGetErrorString(cudaerr));
return 0;
}
Output
Hello from Kernel...
A()
0.292537
-0.718359
0.958011
0.633711kernel launch failed with error "an illegal memory access was encountered".
I have run this both on my machine (CUDA 7.0) and on a supercomputing cluster (CUDA 6.5), and the same result unfolds.
Get rid of the pass-by-reference on the kernel parameter (&).
You are not allowed to write GPU kernels that have pass-by-reference parameters: the reference means the kernel receives the address of a host variable, and dereferencing that address on the device is an illegal memory access. A GPU kernel cannot read or modify a host variable this way. (Ignoring Unified Memory, zero-copy, and related mechanisms, which are not at issue here.)
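A minimal sketch of the fix (only the kernel signature changes; everything else in the posted code stays the same): take the seed by value, so the kernel receives its own copy rather than the address of a host variable.
/// Kernel: seed passed by value, copied to the device at launch
__global__ void kernel(size_t seed) {
    printf("\nHello from Kernel...");
    A a(seed);
}
// launched exactly as before: kernel<<<1,1>>>(1);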
Related
I am working on parallel solving of identical ordinary differential equations with different initial conditions. I have solved this problem with OpenMP and now I want to implement similar code on GPU. Specifically, I want to allocate memory on device for floats in class constructor and then deallocate it in destructor. It doesn't work for me since I get my executable "terminated by signal SIGSEGV (Address boundary error)". Is it possible to use classes, constructors and destructors in CUDA?
By the way, I am a newbie in CUDA and not very experienced in C++ either.
I attach the code in case I have described my problem poorly.
#include <cmath>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <random>
#include <string>
#include <chrono>
#include <ctime>
using namespace std;
template<class ode_sys>
class solver: public ode_sys
{
public:
int *nn;
float *t,*tt,*dt,*x,*xx,*m0,*m1,*m2,*m3;
using ode_sys::rhs_sys;
__host__ solver(int n): ode_sys(n)
{ //here I try to allocate memory. It works with malloc() but doesn't with cudaMalloc()
size_t size=sizeof(float)*n;
cudaMalloc((void**)&nn,sizeof(int));
*nn=n;
cudaMalloc((void**)&t,sizeof(float));
cudaMalloc((void**)&tt,sizeof(float));
cudaMalloc((void**)&dt,sizeof(float));
cudaMalloc((void**)&x,size);
cudaMalloc((void**)&xx,size);
cudaMalloc((void**)&m0,size);
cudaMalloc((void**)&m1,size);
cudaMalloc((void**)&m2,size);
cudaMalloc((void**)&m3,size);
}
__host__ ~solver()
{
cudaFree(nn);
cudaFree(t);
cudaFree(tt);
cudaFree(dt);
cudaFree(x);
cudaFree(xx);
cudaFree(m0);
cudaFree(m1);
cudaFree(m2);
cudaFree(m3);
}
__host__ __device__ void rk4()
{//this part is not important now.
}
};
class ode
{
private:
int *nn;
public:
float *eps,*d;
__host__ ode(int n)
{
cudaMalloc((void**)&nn,sizeof(int));
*nn=n;
cudaMalloc((void**)&eps,sizeof(float));
size_t size=sizeof(float)*n;
cudaMalloc((void**)&d,size);
}
__host__ ~ode()
{
cudaFree(nn);
cudaFree(eps);
cudaFree(d);
}
__host__ __device__ float f(float x_,float y_,float z_,float d_)
{
return d_+*eps*(sinf(x_)+sinf(z_)-2*sinf(y_));
}
__host__ __device__ void rhs_sys(float *t,float *dt,float *x,float *dx)
{
}
};
//const float pi=3.14159265358979f;
__global__ void solver_kernel(int m,int n,solver<ode> *sys_d)
{
int index = threadIdx.x;
int stride = blockDim.x;
//actually ode numerical evaluation should be here
for (int l=index;l<m;l+=stride)
{//this is just to check that i can run kernel
printf("%d Hello \n", l);
}
}
int main ()
{
auto start = std::chrono::system_clock::now();
std::time_t start_time = std::chrono::system_clock::to_time_t(start);
cout << "started computation at " << std::ctime(&start_time);
int m=128,n=4,l;// i want to run 128 threads, n is dimension of ode
size_t size=sizeof(solver<ode>(n));
solver<ode> *sys_d; //an array of objects
cudaMalloc(&sys_d,size*m); //nvprof shows that this array is allocated
for (l=0;l<m;l++)
{
new (sys_d+l) solver<ode>(n); //it doesn't work as it's meant to
}
solver_kernel<<<1,m>>>(m,n,sys_d);
for (l=0;l<m;l++)
{
(sys_d+l)->~solver<ode>(); //it doesn't work as it's meant to
}
cudaFree(sys_d); //it works
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
std::cout << "finished computation at " << std::ctime(&end_time) << "elapsed time: " << elapsed_seconds.count() << "s\n";
return 0;
}
//end of file
Distinguish host-side and device-side memory
As the other answers also state:
GPU (global) memory you allocate with cudaMalloc() is not accessible by code running on the CPU; and
System memory (a.k.a. host memory) you allocate in plain C++ (with std::vector, with std::make_unique, with new etc.) is not accessible by code running on the GPU.
So, you need to allocate both host-side and device-side memory. For a simple example of working with both device-side and host-side memory see the CUDA vectorAdd sample program.
(Actually, you can also make a special kind of allocation which is accessible from both the device and the host; this is Unified Memory. But let's ignore that for now since we're dealing with the basics.)
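For orientation, here is a bare-bones sketch of that pattern (modeled loosely on the vectorAdd sample; the names are mine and error checking is omitted for brevity):
#include <cuda_runtime.h>
#include <vector>

int main() {
    const int n = 4;
    std::vector<float> h_x(n, 1.0f);                 // host-side memory
    float *d_x = nullptr;
    cudaMalloc(&d_x, n * sizeof(float));             // device-side memory
    cudaMemcpy(d_x, h_x.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    // ... launch kernels that read/write d_x ...
    cudaMemcpy(h_x.data(), d_x, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    return 0;
}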
Don't live in the kingdom of nouns
Specifically, I want to allocate memory on device for floats in class constructor and then deallocate it in destructor.
I'm not sure you really want to do that. You seem to be taking a more Java-esque approach, in which everything you do is noun-centric, i.e. classes are used for everything: You don't solve equations, you have an "equation solver". You don't "do X", you have an "XDoer" class etc. Why not just have a (templated) function which solves an ODE system, returning the solution? Are you using your "solver" in any other way?
(this point is inspired by Steve Yegge's blog post, Execution in the Kingdom of Nouns.)
Try to avoid allocating and de-allocating yourself
In well-written modern C++, we try to avoid direct, manual allocation of memory (that's a link to the C++ Core Guidelines, by the way). Now, it's true that you free your memory with the destructor, so it's not all that bad, but I'd really consider using std::unique_ptr on the host and something equivalent on the device (like cuda::memory::unique_ptr from my Modern-C++ CUDA API wrappers library, cuda-api-wrappers); or a GPU-oriented container class like thrust's device_vector.
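As a sketch of what that could look like with thrust (not the asker's solver, just an illustration of containers owning the memory):
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <vector>

int main() {
    const int n = 4;
    std::vector<float> h_x(n, 1.0f);                 // host storage, freed automatically
    thrust::device_vector<float> d_x = h_x;          // device storage + host-to-device copy
    thrust::device_vector<float> d_scratch(n);       // device-only scratch space
    // kernels can take thrust::raw_pointer_cast(d_x.data()) if needed
    thrust::copy(d_x.begin(), d_x.end(), h_x.begin());  // device-to-host copy
    return 0;                                        // all memory released by destructors
}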
Check for errors
You really must check for errors after you call CUDA API functions, and this is doubly necessary after you launch a kernel. When C++ standard library code fails, it throws an exception; CUDA's runtime API is C-like and doesn't know about exceptions. It just fails and sets an error value that you need to check.
So, either you write error checks, like in the vectorAdd() sample I linked to above, or you get some library to exhibit more standard-library-like behavior. cuda-api-wrappers and thrust will both do that - on different levels of abstraction; and so will other libraries/frameworks.
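A common pattern is a small checking macro, for example (this is a generic sketch, not code from the question):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error \"%s\" at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// usage:
//   CUDA_CHECK(cudaMalloc(&d_x, size));
//   my_kernel<<<blocks, threads>>>(/* ... */);
//   CUDA_CHECK(cudaGetLastError());        // catches launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches asynchronous execution errors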
You need an array on the host side and one on the device side.
Initialize the host array, then copy it to the device array with cudaMemcpy. The destruction has to be done on the host side again.
An alternative would be to initialize the array from the device: put __device__ in front of your constructor and then just use malloc.
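A minimal sketch of that alternative, assuming the object is constructed inside a kernel so that the constructor and destructor both run on the device (the class name here is hypothetical):
class dev_ode {
public:
    int n;
    float *d;
    __device__ dev_ode(int n_) : n(n_) {
        d = (float*)malloc(n * sizeof(float));   // device-heap allocation, not cudaMalloc
    }
    __device__ ~dev_ode() { free(d); }
};

__global__ void build_and_use(int n) {
    dev_ode sys(n);                              // constructed and destroyed on the GPU
    for (int i = 0; i < n; ++i) sys.d[i] = 0.0f;
}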
You cannot dereference a pointer to device memory in host code:
__host__ ode(int n)
{
cudaMalloc((void**)&nn,sizeof(int));
*nn=n; // !!! ERROR
cudaMalloc((void**)&eps,sizeof(float));
size_t size=sizeof(float)*n;
cudaMalloc((void**)&d,size);
}
You will have to copy the values with cudaMemcpy.
(Or use the parameters of a __global__ function.)
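For example, a sketch of the constructor with the host-side dereference replaced by a copy (member names taken from the question):
__host__ ode(int n)
{
    cudaMalloc((void**)&nn, sizeof(int));
    cudaMemcpy(nn, &n, sizeof(int), cudaMemcpyHostToDevice);   // instead of *nn = n
    cudaMalloc((void**)&eps, sizeof(float));
    size_t size = sizeof(float)*n;
    cudaMalloc((void**)&d, size);
}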
I am having trouble trying to make a CUDA program manage an array of lambdas by their index. An example code that reproduces the problem
#include <cuda.h>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <cassert>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n",
cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
template<typename Lambda>
__global__ void kernel(Lambda f){
int t = blockIdx.x * blockDim.x + threadIdx.x;
printf("device: thread %i: ", t);
printf("f() = %i\n", f() );
}
int main(int argc, char **argv){
// arguments
if(argc != 2){
fprintf(stderr, "run as ./prog i\nwhere 'i' is function index");
exit(EXIT_FAILURE);
}
int i = atoi(argv[1]);
// lambdas
auto lam0 = [] __host__ __device__ (){ return 333; };
auto lam1 = [] __host__ __device__ (){ return 777; };
// make vector of functions
std::vector<int(*)()> v;
v.push_back(lam0);
v.push_back(lam1);
// host: calling a function by index
printf("host: f() = %i\n", (*v[i])() );
// device: calling a function by index
kernel<<< 1, 1 >>>( v[i] ); // does not work
//kernel<<< 1, 1 >>>( lam0 ); // does work
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
return EXIT_SUCCESS;
}
Compiling with
nvcc -arch sm_60 -std=c++11 --expt-extended-lambda main.cu -o prog
The error I get when running is
➜ cuda-lambda ./prog 0
host: f() = 333
device: GPUassert: invalid program counter main.cu 53
It seems that CUDA cannot manage the int(*)() function pointer form (while host C++ handles it properly). On the other hand, each lambda is treated as a distinct data type, no matter whether they are identical in code and have the same contract. Then, how can we achieve function by index in CUDA?
There are a few considerations here.
Although you suggest wanting to "manage an array of lambdas", you are actually relying on the graceful conversion of a lambda to a function pointer (possible when the lambda does not capture).
When you mark something as __host__ __device__, you are declaring to the compiler that two copies of said item need to be compiled (with two obviously different entry points): one for the CPU, and one for the GPU.
When we take a __host__ __device__ lambda and ask it to degrade to a function pointer, we are then left with the question "which function pointer (entry point) to choose?" The compiler no longer has the option of carrying the experimental lambda object around, so it must choose one or the other (host or device, CPU or GPU) for your vector. Whichever one it chooses, the vector could (will) break if used in the wrong environment.
One takeaway from this is that your two test cases are not the same. In one case (broken) you are passing a function pointer to the kernel (so the kernel is templated to accept a function pointer argument) and in the other case (working) you are passing a lambda to the kernel (so the kernel is templated to accept a lambda argument).
The problem here, in my view, is not simply arising out of use of a container, but arising out of the type of container you are using. I can demonstrate this in a simple way (see below) by converting your vector to a vector of actual lambda type. In that case, we can make the code "work" (sort of), but since every lambda has a unique type, this is an uninteresting demonstration. We can create a multi-element vector, but the only element we can store in it is one of your two lambdas (not both at the same time).
If we use a container that can handle dissimilar types (e.g. std::tuple), perhaps we can make some progress here, but I know of no direct method to index through the elements of such a container. Even if we could, the template kernel accepting lambda as argument/template type would have to be instantiated for each lambda.
In my view, function pointers avoid this particular type "messiness".
Therefore, as an answer to this question:
Then, how can we achieve function by index in CUDA?
I would suggest for the time being that function by index in host code be separated (e.g. two separate containers) from function by index in device code, and for function by index in device code, you use any of the techniques (which don't use or depend on lambdas) covered in other questions, such as this one.
Here is a worked example (I think) demonstrating the note above, that we can create a vector of lambda "type", and use the resultant element(s) from that vector as lambdas in both host and device code:
$ cat t64.cu
#include <cuda.h>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <cassert>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n",
cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
template<typename Lambda>
__global__ void kernel(Lambda f){
int t = blockIdx.x * blockDim.x + threadIdx.x;
printf("device: thread %i: ", t);
printf("f() = %i\n", f() );
}
template <typename T>
std::vector<T> fill(T L0, T L1){
std::vector<T> v;
v.push_back(L0);
v.push_back(L1);
return v;
}
int main(int argc, char **argv){
// arguments
if(argc != 2){
fprintf(stderr, "run as ./prog i\nwhere 'i' is function index");
exit(EXIT_FAILURE);
}
int i = atoi(argv[1]);
// lambdas
auto lam0 = [] __host__ __device__ (){ return 333; };
auto lam1 = [] __host__ __device__ (){ return 777; };
auto v = fill(lam0, lam0);
// make vector of functions
// std::vector< int(*)()> v;
// v.push_back(lam0);
// v.push_back(lam1);
// host: calling a function by index
printf("host: f() = %i\n", (*v[i])() );
// device: calling a function by index
kernel<<< 1, 1 >>>( v[i] ); // does not work
//kernel<<< 1, 1 >>>( lam0 ); // does work
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
return EXIT_SUCCESS;
}
$ nvcc -arch sm_61 -std=c++11 --expt-extended-lambda t64.cu -o t64
$ cuda-memcheck ./t64 0
========= CUDA-MEMCHECK
host: f() = 333
device: thread 0: f() = 333
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t64 1
========= CUDA-MEMCHECK
host: f() = 333
device: thread 0: f() = 333
========= ERROR SUMMARY: 0 errors
$
As mentioned above already, this is not sensible code; it is presented to demonstrate a particular point.
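For reference, a minimal sketch of the function-pointer-table approach alluded to earlier (no lambdas involved; the function names here are hypothetical):
#include <cstdio>

typedef int (*func_t)();

__device__ int f0() { return 333; }
__device__ int f1() { return 777; }

// a table of device function pointers, indexable from device code
__device__ func_t d_funcs[2] = { f0, f1 };

__global__ void call_by_index(int i) {
    printf("device: f() = %d\n", d_funcs[i]());
}

// host side: call_by_index<<<1,1>>>(i);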
I'm a bit of a newcomer to CUDA and thrust. I seem to be unable to get the thrust::for_each algorithm to work when supplied with a counting_iterator.
Here is my simple functor:
struct print_Functor {
print_Functor(){}
__host__ __device__
void operator()(int i)
{
printf("index %d\n", i);
}
};
Now if I call this with a host-vector prefilled with a sequence, it works fine:
thrust::host_vector<int> h_vec(10);
thrust::sequence(h_vec.begin(),h_vec.end());
thrust::for_each(h_vec.begin(),h_vec.end(), print_Functor());
However, if I try to do this with thrust::counting_iterator it fails:
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first+10;
for(thrust::counting_iterator<int> it=first;it!=last;it++)
printf("Value %d\n", *it);
printf("Launching for_each\n");
thrust::for_each(first,last,print_Functor());
What I get is that the for loop executes correctly, but the for_each fails with the error message:
after cudaFuncGetAttributes: unspecified launch failure
I tried to do this by making the iterator type a template argument:
thrust::for_each<thrust::counting_iterator<int>>(first,last, print_Functor());
but the same error results.
For completeness, I'm calling this from a MATLAB mex file (64 bit).
I've been able to get other thrust algorithms to work with the counting iterator (e.g. thrust::reduce gives the right result).
As a newcomer I'm probably doing something really stupid and missing something obvious - can anyone help?
Thanks for the comments so far; I have taken them on board. The worked example (outside MATLAB) worked correctly and produced output, but when made into a mex file it still did not work: the first time it produced no output at all, and the second time it just produced the same error message as before (only fixed by a recompile, after which it goes back to no output).
However, there is a similar problem with it not executing the functor from thrust::for_each even outside MATLAB, at a plain (DOS) command prompt. Here is a complete example:
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
struct sum_Functor {
int *sum;
sum_Functor(int *s){sum = s;}
__host__ __device__
void operator()(int i)
{
*sum+=i;
printf("In functor: i %d sum %d\n",i,*sum);
}
};
int main(){
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first+10;
int sum = 0;
sum_Functor sf(&sum);
printf("After constructor: value is %d\n", *(sf.sum));
for(int i=0;i<5;i++){
sf(i);
}
printf("Initiating for_each call - current value %d\n", (*(sf.sum)));
thrust::for_each(first,last,sf);
cudaDeviceSynchronize();
printf("After for_each: value is %d\n",*(sf.sum));
}
This is compiled under a DOS prompt with:
nvcc -o pf pf.cu
The output produced is:
After constructor: value is 0
In functor: i 0 sum 0
In functor: i 1 sum 1
In functor: i 2 sum 3
In functor: i 3 sum 6
In functor: i 4 sum 10
Initiating for_each call - current value 10
After for_each: value is 10
In other words, the functor's overloaded operator() is called correctly from the for loop but is never called by the thrust::for_each algorithm. The only way to get the for_each to execute the functor when using the counting iterator is to omit the member variable.
( I should add that after years of using pure Matlab, my C++ is very rusty, so I could be missing something obvious ...)
In your comments you say that you want your code to be executed on the host side.
The "unspecified launch failure" error, and the fact that your functor is declared __host__ __device__, make me think Thrust wants to execute it on your device.
Can you add an execution policy to make sure where your code is executed?
replace :
thrust::for_each(first,last,sf);
with
thrust::for_each(thrust::host, first,last,sf);
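For completeness, a trimmed-down sketch of the question's program with the host policy applied (note the extra include that provides thrust::host):
#include <cstdio>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>   // provides thrust::host / thrust::device

struct sum_Functor {
    int *sum;
    sum_Functor(int *s) { sum = s; }
    __host__ __device__ void operator()(int i) { *sum += i; }
};

int main() {
    thrust::counting_iterator<int> first(0), last = first + 10;
    int sum = 0;
    sum_Functor sf(&sum);
    thrust::for_each(thrust::host, first, last, sf);   // explicitly run on the CPU
    printf("After for_each: value is %d\n", sum);      // 0+1+...+9 = 45
    return 0;
}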
To be able to run on the GPU, your result must be allocated in device memory (through cudaMalloc) and then copied back to the host:
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>
struct sum_Functor {
int *sum;
sum_Functor(int *s){sum=s;}
__host__ __device__
void operator()(int i)
{
atomicAdd(sum, 1);
}
};
int main(int argc, char**argv){
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first+atoi(argv[1]);
int *d_sum;
int h_sum = 0;
cudaMalloc(&d_sum,sizeof(int));
cudaMemcpy(d_sum,&h_sum,sizeof(int),cudaMemcpyHostToDevice);
thrust::for_each(thrust::device,first,last,sum_Functor(d_sum));
cudaDeviceSynchronize();
cudaMemcpy(&h_sum,d_sum,sizeof(int),cudaMemcpyDeviceToHost);
printf("sum = %d\n", *h_sum);
cudaFree(d_sum);
}
Code update: to get the correct result on the device, you must use an atomic operation, because many threads update the same location concurrently.
I need to transform an existing SPH (Smoothed Particle Hydrodynamics) code into a code that can run on a GPU.
Unfortunately, it has a lot of data structures that I need to copy from the CPU to the GPU. I already looked it up on the web and thought I did the right thing with my copying code, but unfortunately I get an error (something about an unhandled exception).
When I opened the debugger, I saw that no information is passed to the variables that should be copied to the GPU. It just says "The memory could not be read".
So here is an example of one data structure that needs to be copied to the GPU:
__device__ struct d_particle_data
{
float Pos[3]; /*!< particle position at its current time */
float PosMap[3]; /*!< initial boundary particle postions */
float Mass; /*!< particle mass */
float Vel[3]; /*!< particle velocity at its current time */
float GravAccel[3]; /*!< particle acceleration due to gravity */
}*d_P;
and I pass it to the GPU with the following:
cudaMalloc((void**)&d_P, N*sizeof(sph_particle_data));
cudaMemcpy(d_P, P, N*sizeof(d_sph_particle_data), cudaMemcpyHostToDevice);
The data structure P looks the same as the data structure d_P. Can anybody help me?
EDIT
So, here's a pretty small part of that code:
First, the headers I have to use in the code:
Allvars.h: Variables that I need on the host
struct particle_data
{
float a;
float b;
}
*P;
proto.h: Header with all the functions
extern void main_GPU(int N, int Ntask);
Allvars_gpu.h: all the variables that have to be on the GPU
__device__ struct d_particle_data
{
float a;
float b;
}
*d_P;
So now I call into the .cu file from the .cpp file:
hydra.cpp:
#include <stdio.h>
#include <cuda_runtime.h>
extern "C" {
#include "proto.h"
}
int main(void) {
int N_gas = 100; // Number of particles
int NTask = 1; // Number of CPUs (Code has MPI-stuff included)
main_GPU(N_gas,NTask);
return 0;
}
Now the action takes place in the .cu file:
hydro_gpu.cu:
#include <cuda_runtime.h>
#include <stdio.h>
extern "C" {
#include "Allvars_gpu.h"
#include "allvars.h"
#include "proto.h"
}
__device__ void hydro_evaluate(int target, int mode, struct d_particle_data *P) {
int c = 5;
float a,b;
a = P[target].a;
b = P[target].b;
P[target].a = a+c;
P[target].b = b+c;
}
__global__ void hydro_particle(struct d_particle_data *P) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
hydro_evaluate(i,0,P);
}
void main_GPU(int N, int Ntask) {
int Blocks;
cudaMalloc((void**)&d_P, N*sizeof(d_particle_data));
cudaMemcpy(d_P, P, N*sizeof(d_particle_data), cudaMemcpyHostToDevice);
Blocks = (N+N-1)/N;
hydro_particle<<<Blocks,N>>>(d_P);
cudaMemcpy(P, d_P, N*sizeof(d_particle_data), cudaMemcpyDeviceToHost);
cudaFree(d_P);
}
The really short answer is probably: do not declare *d_P as a static __device__ symbol. Such symbols cannot be passed as device pointer arguments to cudaMalloc, cudaMemcpy, or kernel launches, and your use of __device__ is both unnecessary and incorrect in this example.
If you make that change, your code might start working. Note that I lost interest in trying to actually compile your MCVE code some time ago, and there might well be other problems, but I'm too bored with this question to look for them. This answer has mostly been added to get this question off the unanswered queue for the CUDA tag.
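A sketch of what that change could look like (hypothetical and untested; the host array is passed in as a parameter here rather than through the global P, and a size argument with a bounds check is added since the grid may overshoot N):
struct d_particle_data { float a; float b; };

__global__ void hydro_particle(d_particle_data *P, int n) {
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < n) {                          // bounds check
        P[i].a += 5.f;
        P[i].b += 5.f;
    }
}

void main_GPU(int N, d_particle_data *h_P) {
    d_particle_data *d_P = nullptr;       // ordinary host-side pointer, no __device__
    cudaMalloc((void**)&d_P, N*sizeof(d_particle_data));
    cudaMemcpy(d_P, h_P, N*sizeof(d_particle_data), cudaMemcpyHostToDevice);
    hydro_particle<<<(N + 255)/256, 256>>>(d_P, N);
    cudaMemcpy(h_P, d_P, N*sizeof(d_particle_data), cudaMemcpyDeviceToHost);
    cudaFree(d_P);
}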
Having been playing around with this grand CUDA experiment for a few months now, I find myself experimenting more and trying to pull away from the tutorial examples.
My question is this: if I just want to use arrays on the GPU for something like temporary storage, without copying them back to the host for display/output, can I simply create a device array with __device__ double array[numpoints];? Then, for anything I want to take back from the GPU, I need to do the whole cudaMalloc, cudaMemcpy spiel, right? Additionally, is there any difference between one method and the other? I thought they both create arrays in global memory.
See this description of the __device__ qualifier. If you declare the array __device__, you cannot access it from the host with an ordinary cudaMemcpy; you have to use the symbol-based mechanisms mentioned in the link (such as cudaMemcpyToSymbol).
Instead, what you can do is declare a plain global pointer (i.e., without __device__) in host code and allocate memory for it with cudaMalloc. You can then use the same pointer to copy the result back to the host with cudaMemcpy.
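A minimal sketch of that approach (error checking omitted; the names are mine):
#include <cstdio>

__global__ void fill(double *a, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) a[i] = 2.0 * i;
}

int main() {
    const int numpoints = 16;
    double *d_array = nullptr;                    // plain pointer, no __device__ qualifier
    cudaMalloc(&d_array, numpoints*sizeof(double));
    fill<<<1, numpoints>>>(d_array, numpoints);
    double h_array[numpoints];
    cudaMemcpy(h_array, d_array, numpoints*sizeof(double), cudaMemcpyDeviceToHost);
    for (int i = 0; i < numpoints; ++i) printf("%f\n", h_array[i]);
    cudaFree(d_array);
    return 0;
}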
You can create, fill, and use global memory arrays without using cudaMemcpy to copy data from the host for initialization, if this is what you are asking. In the following simple example, I create a global memory array that is initialized directly on the device and then released when it is no longer needed.
#include<stdio.h>
__global__ void init_temp_data(float* temp_data) {
temp_data[threadIdx.x] = 3.f;
}
__global__ void copy_global_data(float* temp_data, float* d_data) {
d_data[threadIdx.x] = temp_data[threadIdx.x];
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
float* data = (float*)malloc(16*sizeof(float));
float* d_data; gpuErrchk(cudaMalloc((void**)&d_data,16*sizeof(float)));
float* temp_data; gpuErrchk(cudaMalloc((void**)&temp_data,16*sizeof(float)));
init_temp_data<<<1,16>>>(temp_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
copy_global_data<<<1,16>>>(temp_data,d_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaFree(temp_data));
gpuErrchk(cudaMemcpy(data,d_data,16*sizeof(float),cudaMemcpyDeviceToHost));
for (int i=0; i<16; i++) printf("Element number %i is equal to %f\n",i,data[i]);
getchar();
return 0;
}