#include <stdio.h>
#define N 1024
/* Fills the diagonals of two N x N int matrices and prints every element of both. */
int main(){
int i, j;
/* Both matrices have automatic storage, 2 * N * N * sizeof(int) bytes in total
   (8 MiB when N is 1024 and int is 4 bytes wide). */
int a[N][N];
int b[N][N];
/* Only the diagonal entries a[i][i] and b[i][i] are assigned. */
for (i=0;i<N;i++){
a[i][i]=i;
b[i][i]=i;
}
/* Prints all N * N entries of each matrix, including the off-diagonal ones
   that were never assigned above; no separator is written between values. */
for (i=0;i<N;i++)
for(j=0;j<N;j++)
{
printf("%d", a[i][j]);
printf("%d", b[i][j]);
}
return 0;
}
This program causes a segmentation fault, but if I define N as 1023, the program works correctly. Why does this happen?
You are overflowing the stack. 2 * 1024 * 1024 * sizeof(int) is a lot for most systems.
The simplest solution would be to make the arrays static.
static int a[N][N];
static int b[N][N];
Other methods:
Make the arrays global (this is essentially the same as the above)
Use malloc in a loop and of course remember to free
int **a = malloc(N * sizeof *a);
for (i = 0; i < N; i++)
a[i] = malloc(N * sizeof *a[i]);
Related
#include <stdio.h>
#define N 1024
/* Fills the diagonals of two N x N int matrices and prints every element of both. */
int main(){
int i, j;
/* Both matrices have automatic storage, 2 * N * N * sizeof(int) bytes in total
   (8 MiB when N is 1024 and int is 4 bytes wide). */
int a[N][N];
int b[N][N];
/* Only the diagonal entries a[i][i] and b[i][i] are assigned. */
for (i=0;i<N;i++){
a[i][i]=i;
b[i][i]=i;
}
/* Prints all N * N entries of each matrix, including the off-diagonal ones
   that were never assigned above; no separator is written between values. */
for (i=0;i<N;i++)
for(j=0;j<N;j++)
{
printf("%d", a[i][j]);
printf("%d", b[i][j]);
}
return 0;
}
This program causes a segmentation fault, but if I define N as 1023, the program works correctly. Why does this happen?
You are overflowing the stack. 2 * 1024 * 1024 * sizeof(int) is a lot for most systems.
The simplest solution would be to make the arrays static.
static int a[N][N];
static int b[N][N];
Other methods:
Make the arrays global (this is essentially the same as the above)
Use malloc in a loop and of course remember to free
int **a = malloc(N * sizeof *a);
for (i = 0; i < N; i++)
a[i] = malloc(N * sizeof *a[i]);
I'm still relatively new to c++. I am trying to write a program that takes an array of numbers and reverses the order of those numbers in the array with a function. The program is as follows:
#include <iostream>
using namespace std;
void reverse(int *array, int size);
// Builds a five-element int array and hands it, together with a computed
// element count, to reverse().
int main() {
int Array[] = { 1, 2, 3, 4, 5 };
// Byte size of the array divided by the literal 4: this equals the element
// count only when sizeof(int) is 4.
int size = sizeof(Array) / 4;
reverse(Array, size);
return 0;
}
// Intended to reverse the `size` ints that `array` points to.
// NOTE(review): as written it does not reverse anything; every access below
// uses the index i + size, which is never smaller than `size`.
void reverse(int *array, int size) {
// Scratch buffer with a fixed capacity of five ints, independent of `size`.
int Array2[5];
for (int i = 0; i < size; i++) {
// Array2[i + size] lies outside Array2 whenever i + size >= 5.
Array2[i + size] = array[i];
// array[i + size] lies at or beyond index `size`, i.e. outside the `size`
// elements this function was told about.
array[i + size] = Array2[i + size];
};
}
When I run this program, it crashes and I'm not sure why. If anyone can help me figure out why, it would be much appreciated. Thank you.
Zenith has it, but there are a few points and quick hacks worth noting to help you out.
#include <iostream>
//using namespace std; don't need this, and using namespace std is overkill and often
// causes problems. It pulls in a lot of stuff that may conflict, case in point
// std::reverse now becomes reverse. Which reverse will you get? Your reverse or the standard
// library's reverse? Only pull in what you need, for example
using std::cout; // still not used, but makes a good example.
// Reverses the first `size` elements of `array` in place.
//
// No second array is needed: each element of the lower half is exchanged with
// its mirror image in the upper half, so walking only half of the range
// reverses the whole of it. For an odd `size` the middle element stays put.
void reverse(int *array, int size)
{
    const int half = size / 2; // number of pairs to exchange
    int lo = 0;
    while (lo < half)
    {
        const int hi = size - 1 - lo; // mirror position of lo
        const int saved = array[hi];  // keep the upper element while it is overwritten
        array[hi] = array[lo];
        array[lo] = saved;
        // std::swap(array[lo], array[hi]) would do the same exchange.
        ++lo;
    }
}
// Reverses a small sample array in place.
int main() {
    int Array[] = { 1, 2, 3, 4, 5 };
    // Element count = bytes in the whole array / bytes in one element.
    // Dividing by the size of an element (instead of a hard-coded 4, as in
    // `sizeof(Array) / 4`) gives the right count whatever the width of int is.
    const int size = sizeof(Array) / sizeof(*Array);
    reverse(Array, size);
    return 0;
}
Array2[i + size]
You're accessing out-of-bounds, no matter the value of i.
You probably meant Array2[size - 1 - i] to iterate the array backwards. (size - 1 is the index of the last element.)
by using swap you will get a much nicer solution which is also more efficient
// Reverses the first `size` ints of `array` in place by swapping each element
// of the lower half with its mirror element in the upper half.
void reverse(int *array, int size) {
    int front = 0;
    while (front < size / 2) {
        const int back = size - 1 - front; // mirror index of front
        std::swap(array[front], array[back]);
        ++front;
    }
}
When you say int size = sizeof(Array) / 4;, size is now (5 * sizeof(int)) / 4. That's how the sizeof operator works (at least when applied to arrays).
So size is probably 5, assuming a 4-byte int. Now, you get to reverse and its argument size is also 5.
You get to the for loop, and even at the first iteration, you have Array2[5] = /* something */ and array[5] = /* something */, both of which are buffer overruns.
Also, your reverse function doesn't actually do any reversing. Try this:
void reverse(int *arr, int size);
// Reverses a five-element sample array in place.
int main()
{
    int Array[] = { 1, 2, 3, 4, 5 };
    // Number of elements: total bytes of the array over the bytes of one int.
    const int size = sizeof Array / sizeof(int);
    reverse(Array, size);
    return 0;
}
// Reverses the first `size` elements of `arr` in place.
//
// Works for any length: elements are exchanged pairwise from the two ends
// towards the middle, so no fixed-capacity scratch buffer is involved.
// A `size` of 0 or 1 (or a negative value) leaves `arr` untouched.
void reverse(int *arr, int size)
{
    if (size < 2)
        return; // nothing to exchange

    for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi)
    {
        const int saved = arr[lo];
        arr[lo] = arr[hi];
        arr[hi] = saved;
    }
}
I tried to program a merge sort function using only static arrays and of course I have an incomprehensible error.
So the first question is that when I compile my program it gives me a multiple definition error on line 9 of that code:
#include <cmath>
#include "include\sort.h"
/*
These functions accept only arrays of unsigned short int, with the length given as an unsigned int
*/
/*
Merge-sort attempt: copies the two halves of arr into local arrays, sorts each
half through recursive calls, merges the halves back into arr and returns arr.
NOTE(review): arr is a pointer here (an array parameter is adjusted to a
pointer), so sizeof(arr) below is the size of a pointer rather than the size of
the caller's array; arrSize does not depend on how many elements were passed.
*/
unsigned short int* sortByFusion(unsigned short int arr[]) { // HERE !!!!!!!!
const unsigned int arrSize = sizeof(arr)/sizeof(unsigned short int);
if (arrSize <= 1) {return arr;}
/*
Declarations and initializations of the two half arrays
*/
// arrSize/2 is already an integer division; floor() converts that result to
// floating point and the cast converts it back.
const unsigned int arrLeftSize = static_cast<unsigned int>(floor(arrSize/2));
const unsigned int arrRightSize = static_cast<unsigned int>(arrSize - arrLeftSize);
// NOTE(review): arrLeftSize comes out of floor(), so it is not a constant
// expression; arrays sized this way are not standard C++ (presumably accepted
// as a compiler extension - confirm).
unsigned short int arrLeft[arrLeftSize];
unsigned short int arrRight[arrRightSize];
for (unsigned int i = 0; i < arrLeftSize; i ++)
arrLeft[i] = arr[i];
for (unsigned int i = 0; i < arrRightSize; i ++)
arrRight[i] = arr[i + arrLeftSize];
/*
Sort the two arrays
*/
// Each iteration calls sortByFusion on the whole half again and then reads
// element i of the returned pointer.
for (unsigned int i = 0; i < arrLeftSize; i ++)
arrLeft[i] = *(sortByFusion(arrLeft) + i);
for (unsigned int i = 0; i < arrRightSize; i ++)
arrRight[i] = *(sortByFusion(arrRight) + i);
/*
And merge them
*/
// i walks arrLeft, j walks arrRight, k walks the destination arr.
unsigned int i(0), j(0);
for (unsigned int k = 0; k < arrSize; k ++) {
if (i >= arrLeftSize) {arr[k] = arrRight[j]; j ++;} // left half exhausted: take from the right
else if (j >= arrRightSize) {arr[k] = arrLeft[i]; i ++;} // right half exhausted: take from the left (these two guards avoid a segmentation fault)
else {
// <= takes the left element on ties.
if (arrLeft[i] <= arrRight[j]) {arr[k] = arrLeft[i]; i ++;}
else {arr[k] = arrRight[j]; j ++;}
}
}
return arr;
}
What did I do wrong? I tried to put some ifndef define endif but it did nothing more. It seems that everybody has a problem with multiple definition always a bit different on this forum.
Secondly, I used some static_cast but why does floor return a double when we pass a double as argument? Logically, it should give us an integer (the floor of a number is always an integer ...)?
For the compiler, it is GNU GCC, but I don't know how to find its version. And I work with Code::Blocks.
And this is the header file :
#ifndef SORT_H_INCLUDED
#define SORT_H_INCLUDED
unsigned short int* sortByFusion(unsigned short int arr[]);
#endif // SORT_H_INCLUDED
const unsigned int arrSize = sizeof(arr)/sizeof(unsigned short int);
This line does not work as expected, because C++ keeps no length information for an array passed as a function parameter - the parameter is really just a pointer to the first element. Thus sizeof(arr) returns sizeof(unsigned short int*), which is the size of a pointer.
As for why there is an error on line 9, I can't help you without seeing the header sort.h.
//static_cast<unsigned int>(floor(arrSize/2)); // Wrong
arrSize/2; // Right, C++ does floor integer divisions.
unsigned short int arrLeft[arrLeftSize];
unsigned short int arrRight[arrRightSize];
These two lines aren't valid C++ because the length of the declared arrays can not be determined at compile time.
I'm trying to get unified memory to work with classes, and to pass and manipulate arrays in unified memory with kernel calls. I want to pass everything by reference.
So I'm overriding the new method for classes and arrays so they are accessible by the GPU, but I think I need to add more code to have arrays in unified memory, but not quite sure how to do this. I get a memory access error when the fillArray() method is called.
If I have to do these sorts of operations (arithmetic on arrays and copying between different sized arrays) hundreds of times, is unified memory a good approach or should I stick with manually copying between cpu and gpu memory? Thank you very much!
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <new>
#include <stdio.h>
#define TILE_WIDTH 4
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
// Element-wise increment over a height x width array stored row by row:
// resultArray[k] = a[k] + 1 for every k = row * width + col.
//
// Launch layout: one thread per element on a 2D grid of 2D blocks, with x
// covering columns and y covering rows. Threads that land outside the
// height x width extent do nothing, so the grid may overshoot the data.
__global__ void add1(int height, int width, int *a, int *resultArray)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the overshooting part of the grid have no element to process.
    if (col >= width || row >= height)
        return;

    const int flat = row * width + col;
    resultArray[flat] = a[flat] + 1;
}
// Base class whose new/delete operators place objects of derived types in CUDA
// managed (unified) memory, obtained from cudaMallocManaged.
//
// Only the storage of the object itself is affected. Buffers that a derived
// class allocates for its members (for example `new int[count]`) are not
// objects of a Managed-derived type, so they do not go through these operators.
class Managed
{
public:
    // Allocates `len` bytes of managed memory for a single object.
    // Throws std::bad_alloc when the CUDA allocation fails.
    void *operator new(size_t len)
    {
        return allocateManaged(len);
    }

    // Releases storage obtained from operator new; a null `ptr` is a no-op.
    void operator delete(void *ptr)
    {
        // A deallocation function must not throw, so the status is not acted upon.
        cudaFree(ptr);
    }

    // Array form of operator new; same contract.
    void *operator new[](size_t len)
    {
        return allocateManaged(len);
    }

    // Array form of operator delete; same contract.
    void operator delete[](void *ptr)
    {
        cudaFree(ptr);
    }

private:
    // Shared implementation of both allocation operators.
    static void *allocateManaged(size_t len)
    {
        // cudaMallocManaged rejects a zero-byte request, while a zero-length
        // new[] must still yield a valid pointer.
        if (len == 0)
            len = 1;

        void *ptr = 0;
        if (cudaMallocManaged(&ptr, len) != cudaSuccess)
            throw std::bad_alloc(); // never hand back an indeterminate pointer
        return ptr;
    }
};
// Test fixture: owns a height x width int array, fills it, prints it, runs the
// add1 kernel on it in place and prints it again.
// Objects created with `new testArray` get their storage from the operator new
// inherited from Managed.
class testArray : public Managed
{
public:
// Sets the dimensions to 16 x 8 and allocates the element buffer.
testArray()
{
height = 16;
width = 8;
// NOTE(review): this new-expression creates ints, not an object of a class
// derived from Managed, so Managed::operator new[] is not selected here; the
// buffer comes from the ordinary operator new[], yet runTest() passes it to
// the add1 kernel.
myArray = new int[height*width];
}
// Releases the element buffer.
~testArray()
{
delete[] myArray;
}
// Fills and prints the array, launches add1 with the array as both input and
// output, waits for the device and prints the array again.
// NOTE(review): under nvcc CUDA_CALLABLE_MEMBER expands to __host__ __device__,
// but this body launches a kernel and calls fillArray/printArray, which carry
// no device qualifier - confirm this is meant to be host-only.
CUDA_CALLABLE_MEMBER void runTest()
{
fillArray(myArray);
printArray(myArray);
// Ceiling division: enough TILE_WIDTH x TILE_WIDTH blocks to cover
// width columns (x) and height rows (y).
dim3 dimGridWidth((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
add1<<<dimGridWidth,dimBlock>>>(height, width, myArray, myArray);
// Block the host until the kernel has finished before reading the array.
cudaDeviceSynchronize();
printArray(myArray);
}
private:
int *myArray; // element buffer, height * width ints, indexed as [i*width + j]
int height;   // number of rows
int width;    // number of columns
// Writes its own flat index into every element. The parameter shadows the
// member of the same name.
void fillArray(int *myArray)
{
for (int i = 0; i < height; i++){
for (int j = 0; j < width; j++)
myArray[i*width+j] = i*width+j;
}
}
// Prints the array, one line of output per row with a space after each value.
// The parameter shadows the member of the same name.
void printArray(int *myArray)
{
for (int i = 0; i < height; i++){
for (int j = 0; j < width; j++)
printf("%i ",myArray[i*width+j]);
printf("\n");
}
}
};
// Creates a testArray through operator new, runs its test and destroys it.
int main()
{
    // Allocate with a new-expression so that the operator new inherited from
    // Managed supplies the object's storage; the commented-out automatic
    // variable below would not go through that operator.
    testArray *test = new testArray;
    test->runTest();
    //testArray test;
    //test.runTest();

    // Run the destructor and hand the object's storage back.
    delete test;

    system("pause");
    return 0;
}
I want to pass everything by reference so there's no copying.
__global__ void add1(int height, int width, int *&a, int *&resultArray)
Passing a pointer by reference has one use: to modify (reseat) the pointer in the caller's scope. Which you do not do. So the references are, in this case, superfluous. In fact, it's a pessimization, because you're introducing another level of indirection. Use the following signature instead:
__global__ void add1(int height, int width, int* a, int* resultArray)
This compiles and runs, but it seems that the +1 operation never occurs. Why is this?
I know I should have catch error statements, this code is just a simple example.
Well, it's really unfortunate, because adding proper error checking would probably have helped you find the error. In the future, consider adding error checking before asking on SO.
Your kernel expects its arguments to be in an address space it can access. That means it must be a pointer that was obtained through a call to any of the cudaMalloc variants.
But what are you passing?
myArray = new int[height*width]; // Not a cudaMalloc* variant
[...]
add1<<<dimGridWidth,dimBlock>>>(height, width, myArray, myArray);
Therefore the pointer you pass to your kernel has no meaning, because it is not in a "CUDA address space". Your kernel probably segfaults immediately.
I think your confusion may arise from the fact that the enclosing class of myArray (testArray) inherits from Managed. This means that new testArray will allocate a testArray in GPU-accessible address space, but it doesn't mean that using operator new on that class members will allocate them in that address space, too. They too need to be allocated through cudaMalloc* (for example, although not required, through an overloaded operator new that forwards the allocation to cudaMallocManaged). A simple solution is to allocate your array not with new but like this:
cudaMallocManaged(&myArray, width * height* sizeof(*myArray));
Replace the corresponding call to delete with cudaFree.
Additionally:
testArray test;
This does not allocate test on GPU-accessible space, because it is not allocated through operator new.
this is my code
#include "stdafx.h"
#include <iostream>
using namespace std;
#define n 10
__device__ int glMem[n];
// Sets all n entries of the device array glMem to zero.
// No thread index is used: every thread that runs this kernel sweeps the whole
// array itself.
__global__ void initVals()
{
    int slot = 0;
    while (slot < n) {
        glMem[slot] = 0;
        ++slot;
    }
}
// Stores the value 10 into out[0] .. out[n-1].
// No thread index is used: every thread that runs this kernel writes all n
// entries itself.
__global__ void test(int *out)
{
    int *const end = out + n; // one past the last entry to write
    for (int *cursor = out; cursor != end; ++cursor)
        *cursor = 10;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Zeroes glMem on the device, fills a freshly allocated device buffer with the
// test kernel, copies it back to the host and prints one value per line.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main()
{
    const size_t sz = size_t(n)*sizeof(int);
    int exitCode = 1;   // becomes 0 only once every step has succeeded
    int *devMem = 0;
    int *hoMem = new int[n];

    initVals<<<1,1>>>();
    // A kernel launch returns nothing; launch errors are read back explicitly.
    if (cudaOk(cudaGetLastError(), "initVals launch")
        && cudaOk(cudaMalloc((void **)&devMem, sz), "cudaMalloc")) {
        test<<<1, 1>>>(devMem);
        // The blocking copy also waits for the kernels and reports any error
        // raised while they were running.
        if (cudaOk(cudaGetLastError(), "test launch")
            && cudaOk(cudaMemcpy(hoMem, devMem, sz, cudaMemcpyDeviceToHost), "cudaMemcpy")) {
            //print
            for (int i = 0; i < n; i++)
                cout << hoMem[i] << endl;
            exitCode = 0;
        }
    }

    // Release both buffers on every path; cudaFree accepts a null pointer.
    cudaFree(devMem);
    delete[] hoMem;
    return exitCode;
}
In this code I define
glMem
with size n. If I don't know the size in advance, how can I define it?
for example I need to define like this.
__device__ int *glMem;
It doesnt work. Please give some code sample..
In that case you need to allocate the memory on the device.
// size of data
unsigned int size_of_glMem = n * sizeof(int);
// allocate device memory for result
int* glMem = NULL;
cudaMalloc( (void**) &glMem, size_of_glMem );
Hope this helps.