I build a simple cuda kernel that performs a sum on elements. Each thread adds an input value to an output buffer. Each thread calculates one value. 2432 threads are being used (19 blocks * 128 threads).
The output buffer remains the same, the input buffer pointer is shifted by threadcount after each kernel execution. So in total, we have a loop invoking the add kernel until we computed all input data.
Example:
All my input values are set to 1. The output buffer size is 2432. The input buffer size is 2432 *2000.
2000 times the add kernel is called to add 1 to each field of output. The endresult in output is 2000 at every field. I call the function aggregate which contains a for loop, calling the kernel as often as needed to pass over the complete input data.
This works so far unless I call the kernel too often.
However, if I call the kernel 2500 times, I get an illegal memory access CUDA error.
As you can see, the runtime of the last successful kernel increases by 3 orders of magnitude. Afterwards my pointers are invalidated and the following invocations result in cudaErrorIllegalAddress.
I cleaned up the code to get a minimal working example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <vector>
#include <stdio.h>
#include <iostream>
using namespace std;
// Element-wise accumulate: every thread adds one element of `in` onto the
// element of `out` with the same global index.
// There is no bounds check, so the launch must not create more threads than
// either buffer has elements. The template parameter T is not used by the body.
template <class T> __global__ void addKernel_2432(int *in, int * out)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    out[idx] += in[idx];
}
// Repeatedly launches addKernel_2432 (19 blocks x 128 threads = 2432 threads)
// to accumulate the device buffer `array` into the device buffer `out`.
//   array - device pointer to the input values; advanced locally between launches
//   size  - number of ints in `array`
//   out   - device pointer to the accumulator buffer
// Always returns 1; launch errors are not checked here.
static int aggregate(int* array, size_t size, int* out) {
// Number of kernel launches (size / 2432), not a vector length.
size_t const vectorCount = size / 2432;
cout << "ITERATIONS: " << vectorCount << endl;
// vectorCount is unsigned: for size < 2432 it is 0 and vectorCount-1 wraps around.
for (size_t i = 0; i < vectorCount-1; i++)
{
addKernel_2432<int><<<19,128>>>(array, out);
// NOTE(review): each launch consumes 2432 elements, but the pointer is advanced
// by the launch count; the two only coincide when vectorCount == 2432.
array += vectorCount;
}
// Final launch for the last slice.
addKernel_2432<int> << <19, 128 >> > (array, out);
return 1;
}
// Host driver: builds an input buffer of 2432*2500 ones, uploads it, lets
// aggregate() accumulate it into a 2432-element device buffer, then copies the
// result back and prints either the sums or the CUDA error string.
// Every runtime call is checked; the first failure skips the remaining steps.
int main()
{
    const size_t vectorCount = 2432;
    const size_t datacount = 2432*2500;
    const size_t inBytes = datacount * sizeof(int);
    const size_t outBytes = vectorCount * sizeof(int);
    int* dev_in1 = 0;
    int* dev_out = 0;
    //create input buffer, filled with 1
    std::vector<int> hostvec(datacount, 1);
    //allocate input buffer and output buffer
    cudaError_t status = cudaMalloc(&dev_in1, inBytes);
    if (status == cudaSuccess)
        status = cudaMalloc(&dev_out, outBytes);
    //set output buffer to 0
    if (status == cudaSuccess)
        status = cudaMemset(dev_out, 0, outBytes);
    //copy input buffer to GPU
    if (status == cudaSuccess)
        status = cudaMemcpy(dev_in1, hostvec.data(), inBytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
    {
        //call kernel datacount / vectorcount times
        aggregate(dev_in1, datacount, dev_out);
        //return data to check for correctness; this blocking copy is issued once
        //and also reports faults raised by the preceding asynchronous launches
        status = cudaMemcpy(hostvec.data(), dev_out, outBytes, cudaMemcpyDeviceToHost);
    }
    if (status != cudaSuccess)
    {
        cout << " CUDA ERROR: " << cudaGetErrorString(status) << endl;
    }
    else
    {
        cout << "NO CUDA ERROR" << endl;
        cout << "RETURNED SUM DATA" << endl;
        for (size_t i = 0; i < vectorCount; i++)
        {
            cout << hostvec[i] << " ";
        }
    }
    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dev_in1);
    cudaFree(dev_out);
    cudaDeviceReset();
    return 0;
}
If you compile and run it, you get an error.
Change:
size_t datacount = 2432 * 2500;
to
size_t datacount = 2432 * 2400;
and it gives the correct results.
I am looking for any ideas, why it breaks after 2432 kernel invocations.
What I have found so far googling around:
Wrong target architecture set. I use a 1070ti. My target is set to: compute_61,sm_61 In visual studio project properties. That does not change anything.
Did I miss something? Is there a limit how many times a kernel can be called until cuda invalidates pointer? Thank you for your help. I used windows, Visual Studio 2019 and CUDA runtime 11.
This is the output in both cases, success and failure:
[
Error:
[
// Excerpt of the question's aggregate(), reduced to the pointer arithmetic.
static int aggregate(int* array, size_t size, int* out) {
size_t const vectorCount = size / 2432;
for (size_t i = 0; i < vectorCount-1; i++)
{
// advances by size / 2432 elements on every pass
array += vectorCount;
}
}
Despite its name, vectorCount here is not the vector length but the number of iterations, and you have accidentally been advancing the pointer by it. This stays inside the buffer (but yields wrong results) while vectorCount <= 2432, and results in a buffer overflow above that.
array += 2432 is what you intended to write.
Related
GCC version: gcc 4.8.5
copt: -std=c++11 -O3
SIZE = 50 * 1024 * 1024
The first piece of code:
// Timing variant 1: only dst is written (memset) before the timed copies;
// src is allocated but never written before memcpy reads it.
// Prints the elapsed time of five consecutive SIZE-byte copies.
int main() {
    char* const src = new char[SIZE];
    char* const dst = new char[SIZE];
    memset(dst, 'a', SIZE);
    size_t round = 0;
    while (round < 5) {
        const size_t begin = now();
        memcpy(dst, src, SIZE);
        cout << "timer:" << (now() - begin) << "ms" << endl;
        ++round;
    }
    return 0;
}
Output:
timer:5ms
timer:4ms
timer:5ms
timer:5ms
timer:4ms
The second piece of code:
// Timing variant 2: both src and dst are written (memset) before the timed
// copies. Prints the elapsed time of five consecutive SIZE-byte copies.
int main() {
    char* const src = new char[SIZE];
    char* const dst = new char[SIZE];
    memset(src, 'a', SIZE);
    memset(dst, 'a', SIZE);
    size_t round = 0;
    while (round < 5) {
        const size_t begin = now();
        memcpy(dst, src, SIZE);
        cout << "timer:" << (now() - begin) << "ms" << endl;
        ++round;
    }
    return 0;
}
Output:
timer:9ms
timer:8ms
timer:8ms
timer:8ms
timer:8ms
The third piece of code:
// Timing variant 3: neither src nor dst is written before the timed copies.
// Prints the elapsed time of five consecutive SIZE-byte copies.
int main() {
    char* const src = new char[SIZE];
    char* const dst = new char[SIZE];
    size_t round = 0;
    while (round < 5) {
        const size_t begin = now();
        memcpy(dst, src, SIZE);
        cout << "timer:" << (now() - begin) << "ms" << endl;
        ++round;
    }
    return 0;
}
Output:
timer:22ms
timer:4ms
timer:5ms
timer:5ms
timer:5ms
Summary:
Comparing the first and third cases: the first round of the 3rd case is slow because of minor page faults.
Questions:
Why in the 1st case, memcpy src wouldn't trigger any minor page fault?
Why in the 2nd case, 1x slower than 1st case. Any optimization in OS?
Memcpy is bounded by external memory throughput; it looks like the OS is able to allocate memory virtually into the page tables and performing Copy-on-write. This would explain both phenomena: there would be only one reserved block of physical memory for unmodified src, which would be located in the fastest cache in cases 2 and 3. In case one all memory access would go up and down to external memory. The 5x speed penalty in run 1 of case 2 is due to the virtually allocated src being copied on write to unique physical pages.
Timing the initial memsets N times in a row should confirm the hypothesis.
The copy-on-write technique can be extended to support efficient memory allocation by having a page of physical memory filled with zeros. When the memory is allocated, all the pages returned refer to the page of zeros and are all marked copy-on-write. This way, physical memory is not allocated for the process until data is written, allowing processes to reserve more virtual memory than physical memory and use memory sparsely, at the risk of running out of virtual address space.
I wrote a small OpenCL application which calculates the product of two matrices. Now I've noticed that if the size of the matrix exceeds 8192 x 8192 there is a significant performance drop (calculation for a 16384 x 16384 is ~80 times slower) and even the serial implementation is over 5 times faster. Here is the host code:
/*Make some includes and definitions here*/
#include "stdafx.h"
#include <CL/cl.hpp>
#include <vector>
#include <iostream>
#include "util.hpp" // utility library
#define __CL_ENABLE_EXCEPTIONS
#define ROWS (16384) // ROWS of vectors a, b, and c
#define COLUMNS (16384)
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
#include "metrics.h"
/*Start main()*/
int main(void)
{
int A;
// Fill vectors X and Y with random float values
float* h_x = new float[ROWS*COLUMNS];
for (int i = 0; i < ROWS; ++i){
for (int j = 0; j < COLUMNS; ++j){
h_x[j + i*COLUMNS] = rand() / (float)RAND_MAX;;
}
}
float* h_y = new float[ROWS*COLUMNS];
for (int i = 0; i < ROWS; ++i){
for (int j = 0; j < COLUMNS; ++j){
h_y[j + i*COLUMNS] = rand() / (float)RAND_MAX;;
}
}
float* h_s = new float[ROWS*COLUMNS];
for (int i = 0; i < ROWS; ++i){
for (int j = 0; j < COLUMNS; ++j){
h_s[j + i*COLUMNS] = 0.0;
}
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0){ // Check for issues
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
cl::Platform default_platform = all_platforms[0];
std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";
// Get default device of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0){ // Check for issues
std::cout << " No devices found. Check OpenCL installation!\n";
exit(1);
}
cl::Device default_device = all_devices[0];
std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";
// Create an OpenCL context
cl::Context context({ default_device });
cl::Program program(context, util::loadProgram("saxy_kernel.cl"), true);
if (program.build({ default_device }) != CL_SUCCESS){
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << "\n";
getchar();
exit(1);
}
// create buffers on the device
cl::Buffer buffer_X(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_Y(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_S(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int));
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_X, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_x[0]);
queue.enqueueWriteBuffer(buffer_Y, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_y[0]);
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int), &A);
StartCounter();
//run the kernel
cl::Kernel kernel_add = cl::Kernel(program, "simple_add");
kernel_add.setArg(0, buffer_X);
kernel_add.setArg(1, buffer_Y);
kernel_add.setArg(2, buffer_S);
kernel_add.setArg(3, buffer_A);
cl::NDRange global(ROWS*COLUMNS);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, global, cl::NullRange);
queue.finish();
std::cout << "Kernel execution time: " << GetCounter() << "ms \n";
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_S, CL_TRUE, 0, sizeof(float)*ROWS*COLUMNS, &h_s[0]);
/*Print vectors
std::cout << "\nMatrix #1: \n";
for (int i = 0; i<ROWS*COLUMNS; i++){
std::cout << "" << h_x[i] << "\t ";
}
std::cout << "\n\nMatrix #2: \n";
for (int i = 0; i<ROWS*COLUMNS; i++){
std::cout << "" << h_y[i] << "\t ";
}
std::cout << "\n\nResult: \n";
for (int i = 0; i<ROWS*COLUMNS; i++){
std::cout << "" << h_s[i] << "\t ";
}*/
getchar();
return 0;
}
and here is the kernel:
// Element-wise product: each work-item multiplies the X and Y elements at its
// global id (dimension 0) and stores the result in S at the same index.
// A is accepted but not used by the body.
__kernel void kernel simple_add(
    __global float* X,
    __global float* Y,
    __global float* S,
    __global int *A){
    const size_t gid = get_global_id(0);
    S[gid] = X[gid] * Y[gid];
}
Could you please explain me the reason? I know that I can achieve much better performance if I perform some algorithm optimizations, but I'm trying to figure out if this is the threshold of the "naive" implementation, or I'm doing something wrong (incorrect assignment of the work to groups).
EDIT: Because I was asked for in comments, the GPU I'm running the kernel is an AMD R9 270/2GB RAM. The CPU is an i7-4771 and the system has 8GB RAM.
Writing an answer about "how to do more calculations per thread" because code-formatting is non-existent in comments, and also covering a little on memory usage...
So, most OpenCL implementations will need to run more than a couple of instructions per thread (and the right number of threads) for efficient performance. But like I said in comments, this is HIGHLY dependent on the actual architecture of the processing unit (GPU, CPU, or OpenCL-capable magical unit weaved from unicorn hair, whatever it may be) - each manufacturer of GPUs, CPUs and unicorn weavers have their own ideas of how to make a very efficient unit, and they all tend to change their mind as time flows too... ;)
To do a little more work in one thread you could simply do:
#define NUM_PER_THREAD 16
// Element-wise product with NUM_PER_THREAD consecutive elements per work-item:
// work-item g handles indices [g*NUM_PER_THREAD, (g+1)*NUM_PER_THREAD).
// There is no bounds check, so the host must enqueue (element count /
// NUM_PER_THREAD) work-items and the element count must be a multiple of
// NUM_PER_THREAD. A is accepted but not used by the body.
__kernel void kernel simple_add(
    __global float* X,
    __global float* Y,
    __global float* S,
    __global int *A)
{
    // first element owned by this work-item
    const size_t base = get_global_id(0) * NUM_PER_THREAD;
    // the loop counter must be declared; OpenCL C has no implicit variables
    for (int i = 0; i < NUM_PER_THREAD; i++)
    {
        const size_t index = base + i;
        S[index] = X[index] * Y[index];
    }
}
[This will do 1 x 16 blocks. It gets a bit more fun to try to do 16 x 16 or something like that, but can be done if you know the size (width) of the matrix]
Regarding memory: GPU's that have dedicated local memory (in other words most graphics cards) will work MUCH faster if all the data fits in the graphics memory. Accessing "main" memory involves one of two approaches:
long access times for each cache-line when the GPU is reading over the PCI-express bus [or whatever infrastructure is used] - this can be 100 or 1000x slower than "local" memory. And the GPU also (most likely) has to ask the CPU if the memory content is in cache, and if so, wait further for the CPU to copy the data out to main memory...
"page in/out" where the GPU stops, sends an interrupt to the CPU,
the CPU finds some suitable lump [lump in this context is the technical term for "some amount of memory most likely around 4K or multiple thereof"] of memory to "remove" from the GPU
memory, and copies that out to main memory, then copies in the
required other lump of memory to the GPU memory - similar to when the OS is swapping memory to/from the hard-disk. And if you are unlucky, the GPU also has to do some interesting cache or TLB flushing to ensure that the correct data is being used.
Note that I still (in the last hour or so) haven't got any particular insight in how the AMD/ATI GPU's work, or how their OpenCL driver works. The above is a mixture of guessing/knowing how GPUs work in general, understanding of how OpenCL works in general, and calculating the memory needed to store the three different arrays of 16K x 16K using float.
I wrote some pretty simple GPU code here in CUDA C to copy an array, nums, into an array, vals. Nums is [4,7,1,9,2]. This is how I wanted to copy each element over:
__global__ void makeArray(int*);
// Host side of the makeArray experiment: allocates a 5-int device buffer,
// launches makeArray on it with 2 blocks of 16 threads, copies the buffer back
// into nums and prints "index value" for each element.
// The host values of nums are never copied to d_nums before the launch.
int main()
{
    int* d_nums;
    int nums[5] = {4, 7, 1, 9, 2};
    cudaMalloc(&d_nums, sizeof(nums));
    makeArray<<<2,16>>>(d_nums);
    cudaMemcpy(nums, d_nums, sizeof(nums), cudaMemcpyDeviceToHost);
    int idx = 0;
    while (idx < 5) {
        cout << idx << " " << nums[idx] << endl;
        ++idx;
    }
    return 0;
}
// Each thread copies nums[gid % 5] into its own 5-element local array `vals`
// (declared without __shared__, so it is private to the thread), waits at a
// block barrier, and then the threads with global index 0..4 write their copy
// back to nums at that same index.
__global__ void makeArray(int* nums)
{
    int vals[5];
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    const int slot = gid % 5;
    vals[slot] = nums[slot];
    __syncthreads();
    if (gid >= 5)
        return;
    nums[gid] = vals[gid];
}
In the long run, I want to transfer an array from the CPU to the GPU shared memory using this method, but I can't even get this simple practice file to work. I'm expecting the output to look something like this:
0 4
1 7
2 1
3 9
4 2
But I'm getting this:
0 219545856
1 219546112
2 219546368
3 219546624
4 219546880
My thought process is that by using the modulus of the thread index, which is greater than the number of elements in this array, I can cover all 5 data points, and not worry about over reading the array. I can also assign each array spot at the same time, one per thread, and then __syncthreads() at the end to make sure every thread is done copying over. Clearly, that isn't working. Help!
After your edit, we can see d_nums points to uninitialised memory. You just allocated it and didn't fill it with anything. If you want data accessible to the GPU, you have to copy it:
cudaMemcpy(d_nums, nums, sizeof(nums), cudaMemcpyHostToDevice);
before you run the kernel.
I am just beginning to play with CUDA so I tried out a textbook vector addition code. However, when I specify kernel calls to only add the first half of vector, the second half also gets added! This behavior stops when I include some thrust library header.
I am totally confused. Please see the code below:
#include <iostream>
using namespace std;
// Element-wise vector addition: the thread with global index g writes
// d_dataA[g] + d_dataB[g] to d_resultC[g]. No bounds check is performed, so
// exactly gridDim.x * blockDim.x elements are processed per launch.
__global__ void VecAdd(float *d_dataA, float *d_dataB, float *d_resultC)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    const float sum = d_dataA[gid] + d_dataB[gid];
    d_resultC[gid] = sum;
}
// Host driver for VecAdd: builds two 8192-element input vectors, uploads them,
// launches VecAdd on a single block of ARRAY_SIZE/8 threads, and prints the
// first and last element of h_resultC at three points (before the launch,
// after the launch, after the copy back). Return codes are not checked and
// nothing is freed.
int main()
{
const int ARRAY_SIZE = 8*1024;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
float *h_dataA, *h_dataB, *h_resultC;
float *d_dataA, *d_dataB, *d_resultC;
h_dataA = (float *)malloc(ARRAY_BYTES);
h_dataB = (float *)malloc(ARRAY_BYTES);
// h_resultC is not written by the host before the first prints below.
h_resultC = (float *)malloc(ARRAY_BYTES);
// A[i] = i+1, B[i] = 2*(i+1)
for(int i=0; i<ARRAY_SIZE;i++){
h_dataA[i]=i+1;
h_dataB[i]=2*(i+1);
};
cudaMalloc((void **)&d_dataA,ARRAY_BYTES);
cudaMalloc((void **)&d_dataB,ARRAY_BYTES);
// d_resultC is allocated but never cleared or written by the host.
cudaMalloc((void **)&d_resultC,ARRAY_BYTES);
cudaMemcpy(d_dataA, h_dataA,ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataB, h_dataB,ARRAY_BYTES, cudaMemcpyHostToDevice);
// Print 1: host buffer contents before any device work.
cout << h_resultC[0] << endl;
cout << h_resultC[ARRAY_SIZE-1] << endl;
// One block of ARRAY_SIZE/8 threads: the kernel covers indices 0 .. ARRAY_SIZE/8 - 1 only.
dim3 dimBlock(ARRAY_SIZE/8,1,1);
dim3 dimGrid(1,1,1);
VecAdd<<<dimGrid,dimBlock>>>(d_dataA, d_dataB, d_resultC);
// Print 2: still the host buffer; nothing has been copied back yet.
cout << h_resultC[0] << endl;
cout << h_resultC[ARRAY_SIZE-1] << endl;
// Copies the whole device result buffer, including the part the kernel did not write.
cudaMemcpy(h_resultC,d_resultC ,ARRAY_BYTES,cudaMemcpyDeviceToHost);
// Print 3: values as read back from the device.
cout << h_resultC[0] << endl;
cout << h_resultC[ARRAY_SIZE-1] << endl;
return 0;
}
Have you launched it first with ARRAY_SIZE threads and then with the half of them? (or 1/8)
You are not initializing d_resultC, so it's probably that d_resultC has the result of the previous executions. That would explain that behavior, but maybe it doesn't.
Add a cudaMemset over d_result_C and tell us what happens.
I can't answer for sure why your kernel is processing more elements than expected. It's processing one elements per thread, so the number of elements processed definitely should be blockDim.x*gridDim.x.
I want to point out though, that it's good practice to write kernels that use "grid stride loops" so they aren't so dependent on the block and thread count. The performance cost is negligible and if you are performance-sensitive, the blocking parameters are different for different GPUs.
http://cudahandbook.to/15QbFWx
So you should add a count parameter (the number of elements to process), then write something like:
// Element-wise vector addition over the first N elements. Each thread starts
// at its global index and advances by the total number of launched threads
// (blockDim.x * gridDim.x) until it passes N, so any launch configuration
// covers all N elements.
__global__ void VecAdd(float *d_dataA, float *d_dataB, float *d_resultC, int N)
{
    const unsigned int step = blockDim.x*gridDim.x;
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    while (i < N) {
        d_resultC[i] = d_dataA[i] + d_dataB[i];
        i += step;
    }
}
As others mentioned above, this may be caused by data remaining from your previous run. Not freeing the memory you allocated may be the reason for this odd situation.
I think you should free the allocated arrays on the host using free, and also free the memory on the GPU using cudaFree.
Also, I strongly recommend allocating the host memory with cudaMallocHost instead of malloc and freeing it at the end of the program with cudaFreeHost. This will give you faster copies. See here: cudaMallocHost
Anyway, don't forget to free heap memory on C/C++ program, whether with CUDA or not.
I think the easiest way to describe the problem is with a simple code. On each processor I have dynamically allocated '2D arrays' (achieved via the new*[rows],new[cols] formalism, see code below for clarification). Rightly or wrongly, I'm trying to use a committed MPI_Datatype to help me do MPI_Gatherv() to gather all the arrays into a single 2D array on the root processor.
Here's the code, and below it I highlight to salient points of it (it should be very easy to understand if compiled and ran - it asks for the dimensions of the array you desire):
#include <iostream>
#include <string>
#include <cmath>
#include <cstdlib>
#include <time.h>
#include "mpi.h"
using namespace std;
// A function that prints out the 2D arrays to the terminal.
// Prints a dim_rows x dim_cols array of row pointers to stdout: a blank line,
// then one line per row with every value followed by a single space, then
// another blank line. Rows with zero columns produce no line of their own.
void print_2Darray(int **array_in,int dim_rows, int dim_cols) {
    cout << endl;
    for (int r = 0; r < dim_rows; ++r) {
        for (int c = 0; c < dim_cols; ++c) {
            cout << array_in[r][c] << " ";
        }
        // terminate the row only if at least one value was printed
        if (dim_cols > 0) {
            cout << endl;
        }
    }
    cout << endl;
}
// MPI driver from the question: every rank builds a dim x dim array of
// i*j + rank, and rank 0 is meant to collect all of them with MPI_Gatherv
// using a row datatype. Both arrays are built from one allocation per row.
int main(int argc, char *argv[]) {
MPI::Init(argc, argv);
// Typical MPI incantations...
int size, rank;
size = MPI::COMM_WORLD.Get_size();
rank = MPI::COMM_WORLD.Get_rank();
cout << "size = " << size << endl;
cout << "rank = " << rank << endl;
sleep(1);
// Dynamically allocate a 2D square array of user-defined size 'dim'.
// Only rank 0 reads dim from stdin; the broadcast below hands it to the other ranks.
int dim;
if (rank == 0) {
cout << "Please enter dimensions of 2D array ( dim x dim array ): ";
cin >> dim;
cout << "dim = " << dim << endl;
}
MPI_Bcast(&dim,1,MPI_INT,0,MPI_COMM_WORLD);
// array2D is a table of dim row pointers; each row is its own allocation.
int **array2D;
array2D = new int*[dim];
for (int i=0; i<dim; i++) {
array2D[i] = new int[dim](); // the extra '()' initializes to zero.
}
// Fill the arrays with i*j+rank where i and j are the indices.
for (int i=0;i<dim;i++) {
for (int j=0;j<dim;j++) {
array2D[i][j] = i*j + rank;
}
}
// Print out the arrays.
print_2Darray(array2D,dim,dim);
// Commit a MPI_Datatype for these arrays.
// MPI_ARRAYROW describes dim contiguous MPI_INTs, i.e. one row.
MPI_Datatype MPI_ARRAYROW;
MPI_Type_contiguous(dim, MPI_INT, &MPI_ARRAYROW);
MPI_Type_commit(&MPI_ARRAYROW);
// Declare 'all_array2D[][]' which will contain array2D[][] from all procs.
// Allocated on every rank, again with one allocation per row (size*dim rows).
int **all_array2D;
all_array2D = new int*[size*dim];
for (int i=0; i<size*dim; i++) {
all_array2D[i] = new int[dim](); // the extra '()' initializes to zero.
}
// Print out the arrays.
print_2Darray(all_array2D,size*dim,dim);
// Displacement vector for MPI_Gatherv() call.
// displace[i] = i*dim and dim_list[i] = dim, both counted in rows.
int *displace;
displace = (int *)calloc(size,sizeof(int));
int *dim_list;
dim_list = (int *)calloc(size,sizeof(int));
int j = 0;
for (int i=0; i<size; i++) {
displace[i] = j;
cout << "displace[" << i << "] = " << displace[i] << endl;
j += dim;
dim_list[i] = dim;
}
// MPI_Gatherv call.
MPI_Barrier(MPI_COMM_WORLD);
// NOTE(review): the send and receive buffers passed here are the row-pointer
// tables (int**), not the int data they point to.
MPI_Gatherv(array2D,dim,MPI_ARRAYROW,all_array2D,&dim_list[rank],&displace[rank],MPI_ARRAYROW,0,MPI_COMM_WORLD);
// Print out the arrays.
print_2Darray(all_array2D,size*dim,dim);
MPI::Finalize();
return 0;
}
The code compiles, but runs into segmentation faults (I compile with 'mpic++' and used 'mpirun -np 2' to use 2 processors):
[unknown-78-ca-39-b4-09-4f:02306] *** Process received signal ***
[unknown-78-ca-39-b4-09-4f:02306] Signal: Segmentation fault (11)
[unknown-78-ca-39-b4-09-4f:02306] Signal code: Address not mapped (1)
[unknown-78-ca-39-b4-09-4f:02306] Failing at address: 0x0
[unknown-78-ca-39-b4-09-4f:02306] [ 0] 2 libSystem.B.dylib 0x00007fff844021ba _sigtramp + 26
[unknown-78-ca-39-b4-09-4f:02306] [ 1] 3 ??? 0x0000000000000001 0x0 + 1
[unknown-78-ca-39-b4-09-4f:02306] [ 2] 4 gatherv2Darrays.x 0x00000001000010c2 main + 1106
[unknown-78-ca-39-b4-09-4f:02306] [ 3] 5 gatherv2Darrays.x 0x0000000100000a98 start + 52
[unknown-78-ca-39-b4-09-4f:02306] *** End of error message ***
mpirun noticed that job rank 0 with PID 2306 on node unknown-78-ca-39-b4-09-4f.home exited on signal 11 (Segmentation fault).
1 additional process aborted (not shown)
The segmentation fault occurs upon execution of the 'print_2Darray(all_array2D,size*dim,dim)' function near the end of the code, where 'all_array2D' is 'supposed to' contain the gathered arrays. More specifically, the code seems to print the 'all_array2D' OK for the bit gathered from the master processor, but then gives the seg fault when the print_2Darray() function starts working on the bits from other processors.
Salient points of code:
I declare an MPI_Datatype that is a contiguous block of memory of sufficient size to store a single row of the 2D arrays. I then use MPI_Gatherv() to try and gathers these rows.
The code's sleep(1) call is just to help the user see the prompt for 'dims' more clearly, otherwise it get's buried between the 'size' and 'rank' couts.
The elements of the 2D array are initialized to values "i*j + rank" where i and j are the row and column indices respectively. My rationale is that the resulting numbers easily give away the rank of the processor that generated that array.
I guess it boils down to me not knowing how properly to MPI_Gatherv() dynamically allocated arrays... Should I be using MPI_Datatypes at all? It's quite important to me that the arrays are dynamically allocated.
I will be very grateful for any help/suggestions! I'm pretty much depleted of ideas!
MPI_Gatherv, MPI_Scatterv, and in fact all other MPI communication calls that take array arguments, expect that array elements are laid out consecutively in memory. This means that in the call MPI_Gatherv(array2D, dim, MPI_ARRAYROW, ...), MPI expects that the first element of type MPI_ARRAYROW starts at the memory location that array2D points to, the second element starts at (BYTE*)array2D + extent_of(MPI_ARRAYROW), the third element starts at (BYTE*)array2D + 2*extent_of(MPI_ARRAYROW), and so on. Here extent_of() is the extent of the MPI_ARRAYROW type, which can be obtained by calling MPI_Type_get_extent.
Clearly the rows of your 2D array are not consecutive in memory since each of them is allocated by a separate invocation of the new operator. Also array2D is not a pointer to the data, but rather a pointer to the vector of pointers to each row. This doesn't work in MPI and there are countless of other questions here on StackOverflow, where this fact is discussed - just search for MPI 2D and see for yourself.
The solution is to use a big chunk of singly allocated memory block with an accompanying dope vector - see this question and the arralloc() function mentioned in the answer.
This problem, involving array allocations, comes up all the time in dealing with C/C++ and MPI. This:
// Row-pointer table plus one separate heap allocation per row.
int **array2D;
array2D = new int*[dim];
for (int i=0; i<dim; i++) {
array2D[i] = new int[dim](); // the extra '()' initializes to zero.
}
allocates dim 1d arrays, each dim ints in length. However, there's no reason at all why these should be laid out next to each other - the dim arrays are likely scattered across memory. So even sending dim*dim ints from array2D[0] won't work. The all_array2D is the same; you are creating size*dim arrays, each of size dim, but where they are in relation to each other who knows, making your displacements likely wrong.
To make the arrays contiguous in memory, you need to do something like
// Contiguous layout: a single block of dim*dim ints (owned by array2D[0])
// plus a table of dim row pointers into that block.
int **array2D;
array2D = new int*[dim];
array2D[0] = new int[dim*dim];
for (int i=1; i<dim; i++) {
    // row i starts dim*i ints into the data block; index the data block
    // (array2D[0]), not the row-pointer table itself
    array2D[i] = &(array2D[0][dim*i]);
}
and similarly for all_array2D. Only then can you start reasoning about memory layouts.
I just wanted to summarise the solution which @HristoIliev and @JonathanDursi helped me get to.
MPI commands like MPI_Gatherv() work with contiguously allocated blocks of memory, hence building a 2D array from one 'new' per row and feeding it into MPI commands won't work, since separate allocations are not guaranteed to be contiguous. Instead, allocate all the data in a single block and point the rows into it (done with 'calloc' in the code below as an example).
An important point by @HristoIliev: The 1st and 4th arguments of MPI_Gatherv() must be pointers to the first elements of type MPI_ARRAYROW. Dereferencing the 2D arrays by one level, e.g. array2D[0], will achieve this (again, see modified working code below).
The final, working code is given below:
#include <iostream>
#include <string>
#include <cmath>
#include <cstdlib>
#include <time.h>
#include "mpi.h"
using namespace std;
// Prints a dim_rows x dim_cols array of row pointers to stdout: a blank line,
// then one line per row with every value followed by a single space, then
// another blank line. Rows with zero columns produce no line of their own.
void print_2Darray(int **array_in,int dim_rows, int dim_cols) {
    cout << endl;
    for (int r = 0; r < dim_rows; ++r) {
        for (int c = 0; c < dim_cols; ++c) {
            cout << array_in[r][c] << " ";
        }
        // terminate the row only if at least one value was printed
        if (dim_cols > 0) {
            cout << endl;
        }
    }
    cout << endl;
}
int main(int argc, char *argv[]) {
MPI::Init(argc, argv);
// Typical MPI incantations...
int size, rank;
size = MPI::COMM_WORLD.Get_size();
rank = MPI::COMM_WORLD.Get_rank();
cout << "size = " << size << endl;
cout << "rank = " << rank << endl;
sleep(1);
// Dynamically allocate a 2D square array of user-defined size 'dim'.
int dim;
if (rank == 0) {
cout << "Please enter dimensions of 2D array ( dim x dim array ): ";
cin >> dim;
cout << "dim = " << dim << endl;
}
MPI_Bcast(&dim,1,MPI_INT,0,MPI_COMM_WORLD);
// Use another way of declaring the 2D array which ensures it is contiguous in memory.
int **array2D;
array2D = (int **) calloc(dim,sizeof(int *));
array2D[0] = (int *) calloc(dim*dim,sizeof(int));
for (int i=1;i<dim;i++) {
array2D[i] = array2D[0] + i*dim;
}
// Fill the arrays with i*j+rank where i and j are the indices.
for (int i=0;i<dim;i++) {
for (int j=0;j<dim;j++) {
array2D[i][j] = i*j + rank;
}
}
// Print out the arrays.
print_2Darray(array2D,dim,dim);
// Commit a MPI_Datatype for these arrays.
MPI_Datatype MPI_ARRAYROW;
MPI_Type_contiguous(dim, MPI_INT, &MPI_ARRAYROW);
MPI_Type_commit(&MPI_ARRAYROW);
// Use another way of declaring the 2D array which ensures it is contiguous in memory.
int **all_array2D;
all_array2D = (int **) calloc(size*dim,sizeof(int *));
all_array2D[0] = (int *) calloc(dim*dim,sizeof(int));
for (int i=1;i<size*dim;i++) {
all_array2D[i] = all_array2D[0] + i*dim;
}
// Print out the arrays.
print_2Darray(all_array2D,size*dim,dim);
// Displacement vector for MPI_Gatherv() call.
int *displace;
displace = (int *)calloc(size,sizeof(int));
int *dim_list;
dim_list = (int *)calloc(size,sizeof(int));
int j = 0;
for (int i=0; i<size; i++) {
displace[i] = j;
cout << "displace[" << i << "] = " << displace[i] << endl;
j += dim;
dim_list[i] = dim;
cout << "dim_list[" << i << "] = " << dim_list[i] << endl;
}
// MPI_Gatherv call.
MPI_Barrier(MPI_COMM_WORLD);
cout << "array2D[0] = " << array2D[0] << endl;
MPI_Gatherv(array2D[0],dim,MPI_ARRAYROW,all_array2D[0],&dim_list[rank],&displace[rank],MPI_ARRAYROW,0,MPI_COMM_WORLD);
// Print out the arrays.
print_2Darray(all_array2D,size*dim,dim);
MPI::Finalize();
return 0;
}
Compile with mpic++.