CUDA global memory - C++

This is my code:
#include "stdafx.h"
#include <iostream>
using namespace std;
#define n 10
__device__ int glMem[n];
__global__ void initVals()
{
for(int i=0;i<n;i++)
glMem[i] = 0;
}
__global__ void test(int *out)
{
for(int i=0;i<n;i++)
out[i] = 10;
}
int main()
{
const size_t sz = size_t(n)*sizeof(int);
initVals<<<1,1>>>();
int *devMem;
cudaMalloc((void **)&devMem, sz);
test<<<1, 1>>>(devMem);
int *hoMem=new int[n];
cudaMemcpy(hoMem, devMem,sz, cudaMemcpyDeviceToHost);
//print
for(int i=0;i<n;i++)
cout<<hoMem[i]<<endl;
return 0;
}
In this code I define glMem to have size n. If I don't know the size in advance, how can I define it? For example, I need to define it like this:

__device__ int *glMem;

That doesn't work. Please give a code sample.

In that case you need to allocate the memory on the device.

// size of data
unsigned int size_of_glMem = n * sizeof(int);
// allocate device memory for the result
int* glMem = NULL;
cudaMalloc((void**)&glMem, size_of_glMem);

Hope this helps.
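
If you specifically want kernels to keep referring to the array through a __device__ pointer symbol, as in the question, one common pattern is to cudaMalloc the buffer on the host and then copy the pointer value into the symbol with cudaMemcpyToSymbol. A minimal sketch, assuming the size is only known at run time in a host variable count:

__device__ int *glMem;   // device-side pointer, set from the host

__global__ void initVals(int count)
{
    for (int i = 0; i < count; i++)
        glMem[i] = 0;
}

int main()
{
    int count = 10;      // size determined at run time
    int *devBuf;
    cudaMalloc((void **)&devBuf, count * sizeof(int));
    // store the device address in the __device__ pointer symbol
    cudaMemcpyToSymbol(glMem, &devBuf, sizeof(devBuf));
    initVals<<<1,1>>>(count);
    cudaDeviceSynchronize();
    cudaFree(devBuf);
    return 0;
}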

Related

What is wrong with multiple arrays in C++ [duplicate]

#include <stdio.h>
#define N 1024

int main(){
    int i, j;
    int a[N][N];
    int b[N][N];
    for (i = 0; i < N; i++){
        a[i][i] = i;
        b[i][i] = i;
    }
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
        {
            printf("%d", a[i][j]);
            printf("%d", b[i][j]);
        }
    return 0;
}
This program causes a segmentation fault, but if I define N as 1023 the program works correctly. Why does this happen?
You are overflowing the stack. 2 * 1024 * 1024 * sizeof(int) bytes (8 MB with 4-byte int) is more than the default stack size on most systems.
The simplest solution would be to make the arrays static.
static int a[N][N];
static int b[N][N];
Other methods:
Make the arrays global (this is essentially the same as the above).
Use malloc in a loop, and of course remember to free:

int **a = malloc(N * sizeof *a);
for (i = 0; i < N; i++)
    a[i] = malloc(N * sizeof *a[i]);
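
Since the question is tagged C++, a heap-based alternative may also be worth noting: std::vector stores its elements on the heap, so the same N works without touching the stack. A minimal sketch (illustrative, not from the original answer):

#include <cstdio>
#include <vector>
#define N 1024

int main(){
    // vector storage lives on the heap, so N*N ints per array is fine
    std::vector<std::vector<int>> a(N, std::vector<int>(N, 0));
    std::vector<std::vector<int>> b(N, std::vector<int>(N, 0));
    for (int i = 0; i < N; i++){
        a[i][i] = i;
        b[i][i] = i;
    }
    printf("%d %d\n", a[10][10], b[10][10]);
    return 0;
}

Cleanup is automatic when the vectors go out of scope.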

Hi, Just Want to Know What This Error Means

"error C2660: 'storeInitialValues' : function does not take 1 arguments" shows up in the log of my code when I try to build. I've looked at some past errors posted here and I think it might be some kind of initialization error with either/all the usersize, v, dsize, and/or asize. I just want to see the error on the specific calling of storeInitialValues(usersize, v, dsize, asize); that's it. Thank you very much in advance.
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <ctime>
#include <cstdlib>
using namespace std;

struct vec
{
};
struct arr
{
};

void fillArray(int A[], int size);
void storeInitialValues(int * & arr, int & asize, int & dsize, vector<int>& v, int & usersize);

int main()
{
    int usersize, dsize, asize;
    vector<int> v;
    int * ptr = new int[10];
    cout << "How many values in data structures? Please enter values greater than 20." << endl;
    cin >> usersize;
    while (usersize < 21)
    {
        cout << "Error, enter values greater than 20!" << endl;
        cin >> usersize;
    }
    cout << "Alright, here are your numbers: " << endl;
    storeInitialValues(usersize, v, dsize, asize);
}

// fillArray stores sequential, unique, integer values into an array and
// then randomizes their order
void fillArray(int A[], int size)
{
    srand((int)time(0));
    for (int i = 0; i < size; i++)
    {
        A[i] = i + 1;
    }
    for (int k = size - 1; k > 1; k--)
    {
        swap(A[k], A[rand() % k]);
    }
}

// storeInitialValues calls fillArray to produce an array of unique randomly
// organized values and then inserts those values into a dynamically sized
// array and a vector.
void storeInitialValues(int * & arr, int & asize, int & dsize, vector<int>& v, int usersize)
{
    int * temp = new int[usersize];        // temporary array for randomized data
    fillArray(temp, usersize);             // get data
    for (int i = 0; i < usersize; i++)     // copy data into the dynamic data structures
    {
        add(arr, asize, dsize, temp[i]);
        v.push_back(temp[i]);
    }
    delete[] temp;                         // clean up temporary pointer
    temp = NULL;
}

void add(int & usersize, int & arr, int & dsize, int & temp[i])
{
}
void remove()
{
}
Nothing about your call to storeInitialValues matches the declaration. I think you might be confused into thinking the names of the variables are important. That's not the case. You have to pass variables that match the types in the function declaration, in the correct order; the names are irrelevant.
int * & arr is a very strange declaration. int *arr would be a pointer to an int that you could treat as an array. What exactly are you aiming for with int * &? Mixing * and & requires that you be very careful with your usage. But you are also using vector, which is a much safer way of dealing with arrays. Why not just use vectors? You also declare and allocate ptr in main, but you neither use it nor delete it.
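
For illustration, a call that would line up with the declaration as written (the variable names here are hypothetical; only the argument types and their order matter):

int *arr = NULL;
int asize = 0, dsize = 0, usersize = 0;
vector<int> v;
// matches: void storeInitialValues(int * & arr, int & asize, int & dsize,
//                                  vector<int> & v, int & usersize);
storeInitialValues(arr, asize, dsize, v, usersize);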

CUDA: using unified memory together with classes and arrays

I'm trying to get unified memory to work with classes, and to pass and manipulate arrays in unified memory with kernel calls. I want to pass everything by reference.
So I'm overloading operator new for classes and arrays so they are accessible by the GPU, but I think I need to add more code to have the arrays themselves in unified memory; I'm not quite sure how to do this. I get a memory access error when the fillArray() method is called.
If I have to do these sorts of operations (arithmetic on arrays and copying between different-sized arrays) hundreds of times, is unified memory a good approach, or should I stick with manually copying between CPU and GPU memory? Thank you very much!
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#define TILE_WIDTH 4
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
__global__ void add1(int height, int width, int *a, int *resultArray)
{
int w = blockIdx.x * blockDim.x + threadIdx.x; // Col // width
int h = blockIdx.y * blockDim.y + threadIdx.y;
int index = h * width + w;
if ((w < width) && (h < height))
resultArray[index] = a[index] + 1;
}
class Managed
{
public:
void *operator new(size_t len)
{
void *ptr;
cudaMallocManaged(&ptr, len);
return ptr;
}
void Managed::operator delete(void *ptr)
{
cudaFree(ptr);
}
void* operator new[] (size_t len) {
void *ptr;
cudaMallocManaged(&ptr, len);
return ptr;
}
void Managed::operator delete[] (void* ptr) {
cudaFree(ptr);
}
};
class testArray : public Managed
{
public:
testArray()
{
height = 16;
width = 8;
myArray = new int[height*width];
}
~testArray()
{
delete[] myArray;
}
CUDA_CALLABLE_MEMBER void runTest()
{
fillArray(myArray);
printArray(myArray);
dim3 dimGridWidth((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
add1<<<dimGridWidth,dimBlock>>>(height, width, myArray, myArray);
cudaDeviceSynchronize();
printArray(myArray);
}
private:
int *myArray;
int height;
int width;
void fillArray(int *myArray)
{
for (int i = 0; i < height; i++){
for (int j = 0; j < width; j++)
myArray[i*width+j] = i*width+j;
}
}
void printArray(int *myArray)
{
for (int i = 0; i < height; i++){
for (int j = 0; j < width; j++)
printf("%i ",myArray[i*width+j]);
printf("\n");
}
}
};
int main()
{
testArray *test = new testArray;
test->runTest();
//testArray test;
//test.runTest();
system("pause");
return 0;
}
I want to pass everything by reference so there's no copying.
__global__ void add1(int height, int width, int *&a, int *&resultArray)
Passing a pointer by reference has one use: to modify (reseat) the pointer in the caller's scope. Which you do not do. So the references are, in this case, superfluous. In fact, it's a pessimization, because you're introducing another level of indirection. Use the following signature instead:
__global__ void add1(int height, int width, int* a, int* resultArray)
This compiles and runs, but it seems that the +1 operation never occurs. Why is this?
I know I should have error-checking statements; this code is just a simple example.
Well, it's really unfortunate, because adding proper error checking would probably have helped you find the error. In the future, consider adding error checking before asking on SO.
Your kernel expects its arguments to be in an address space it can access. That means it must be a pointer that was obtained through a call to any of the cudaMalloc variants.
But what are you passing?
myArray = new int[height*width]; // Not a cudaMalloc* variant
[...]
add1<<<dimGridWidth,dimBlock>>>(height, width, myArray, myArray);
Therefore the pointer you pass to your kernel has no meaning, because it is not in a "CUDA address space". Your kernel probably faults on the first access.
I think your confusion may arise from the fact that the enclosing class of myArray (testArray) inherits from Managed. This means that new testArray will allocate a testArray in GPU-accessible address space, but it doesn't mean that using operator new on that class's members will allocate them in that address space too. They also need to be allocated through cudaMalloc* (for example, although not required, through an overloaded operator new that forwards the allocation to cudaMallocManaged). A simple solution is to allocate your array not with new but like this:
cudaMallocManaged(&myArray, width * height * sizeof(*myArray));
Replace the corresponding call to delete with cudaFree.
Additionally:
testArray test;
This does not allocate test on GPU-accessible space, because it is not allocated through operator new.
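
Putting the suggestion together, a minimal sketch of the corrected constructor and destructor (the rest of testArray stays as in the question):

testArray()
{
    height = 16;
    width = 8;
    // allocate the member array itself in managed, GPU-accessible memory
    cudaMallocManaged(&myArray, width * height * sizeof(*myArray));
}
~testArray()
{
    cudaFree(myArray);
}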

Type Qualifiers for a device class in CUDA

I'm currently attempting to write a piece of CUDA code with a class that will be used solely on the device side (i.e., the host doesn't need to know of its existence). However, I cannot work out the correct qualifiers for the class (deviceclass below):
__device__ float devicefunction (float *x) {return x[0]+x[1];}

class deviceclass {
private:
    float _a;
public:
    deviceclass(float *x) {_a = devicefunction(x);}
    float getvalue () {return _a;}
};

// Device code
__global__ void VecInit(float* A, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        deviceclass *test;
        test = new deviceclass(1.0, 2.0);
        A[i] = test->getvalue();
    }
}

// Standard CUDA guff below: Variables
float *h_A, *d_A;

// Host code
int main(int argc, char** argv)
{
    printf("Vector initialization...\n");
    int N = 10000;
    size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    cudaMalloc(&d_A, size);
    printf("Computing...\n");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    //...etc
}
Setting deviceclass as solely __device__ throws an error because it's used from a __global__ function; however, setting it as __device__ __host__ or __global__ seems unnecessary. Can someone point me in the right direction?
It turns out the qualifiers have to go on the member functions of the class; below is a fully working version:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;

void Cleanup(void);

// Functions to be pointed to
__device__ float Plus (float a, float b) {return a+b;}

class deviceclass {
private:
    float test;
public:
    __device__ deviceclass(float a, float b) {
        test = Plus(a, b);
    }
    __device__ float getvalue() {return test;}
};

// Device code
__global__ void VecInit(float* A, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        deviceclass test(1.0, 2.0);
        A[i] = test.getvalue();
    }
}

// Standard CUDA guff below: Variables
float *h_A, *d_A;

// Host code
int main(int argc, char** argv)
{
    printf("Vector initialization...\n");
    int N = 10000;
    size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    cudaMalloc(&d_A, size);
    printf("Computing...\n");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);

    // Verify result
    int i;
    for (i = 0; i < N; ++i) {
        cout << endl << h_A[i];
    }
    cout << endl;
    Cleanup();
}

void Cleanup(void)
{
    // Free device memory
    if (d_A)
        cudaFree(d_A);
    // Free host memory
    if (h_A)
        free(h_A);
    cudaThreadExit();
    exit(0);
}
I take it that the call new deviceclass(1.0, 2.0) in the question, where the constructor shown takes a float*, is a typo.
From the CUDA C Programming Guide, Section 3.1.5:

"However, only a subset of C++ is fully supported for the device code"

and Appendix D.6:

"Code compiled for devices with compute capability 2.x and higher may make use of C++ classes..."

I think your code is using incompatible C++.