CUDA: Filling a column-major matrix - c++

I am fairly new to CUDA, and I am trying to offload to the GPU some cumbersome computations for a performance-critical project. My machine has two NVS 510 graphics cards, but I am currently experimenting with only one.
I have a big column-major matrix (1,000-5,000 rows x 1-5 million columns) to fill. So far I have been able to write code that fills the matrix as if it were a flat array, and it works well for matrices of relatively small size.
__global__ void interp_kernel(fl_type* d_matrix, fl_type* weights, [other params],
                              int n_rows, int num_cols) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int column = index / n_rows;
    int row = index % n_rows;
    if (row > n_rows || column > num_cols) return;
    d_matrix[index] = …something(row, column, [other params]);
}
The kernel is called:
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
int num_blocks = (n_rows*n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel<<<num_blocks, block_size>>>(res, [other params], n_rows, n_columns);
and everything works just fine.
If I change the kernel to work with 2D threads:
__global__ void interp_kernel2D(fl_type* d_matrix, fl_type* weights, [other params],
                                int n_rows, int num_cols) {
    int column = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int index = column * n_rows + row;
    if (row > n_rows || column > num_cols) return;
    d_matrix[index] = …something(row, column, [other params]);
}
and I invoke it:
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (n_columns + block_size2 - 1) / block_size2;
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D<<<grid_size2, num_blocks2>>>(res, [other params], n_rows, n_columns);
the results are all zero and CUDA returns an unknown error. What am I missing? The actual code, which compiles without errors with VS2015 and CUDA 8.0, can be found here: https://pastebin.com/XBCVC7VV
Here is the code from the pastebin link:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
#include <cmath>
#include <cstdlib>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that node
pos_type * map_node2values_how_many; //vector that stores how many actions we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node simply has max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is the number of columns of the result
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keeps track of the columns to grab, and their weights in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
//2d
//int block_size2 = 32; //each block will have block_size2*block_size2 threads
//dim3 num_blocks2(block_size2, block_size2);
//int x_grid = (total_action_number + block_size2 - 1) / block_size2;
//int y_grid = (n_sim + block_size2 - 1) / block_size2;
//dim3 grid_size2(x_grid, y_grid);
//std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
//interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
Moreover, I would be extremely grateful for any direction on how to improve the performance of the code.

Any time you are having trouble with a CUDA code, I recommend doing proper CUDA error checking (which you mostly seem to be doing) and also running your code with cuda-memcheck. This last utility is similar to "enabling the memory checker" in Nsight VSE, but not quite the same; however, the Nsight VSE memory checker may have given you the same indication.
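"Proper CUDA error checking" can be as little as a helper macro wrapped around every runtime API call, plus a cudaGetLastError() check right after each kernel launch. A minimal sketch (the macro name is arbitrary):
#include <cstdio>
#include <cstdlib>

// Wrap every CUDA runtime call with checkCuda(...); after a kernel launch,
// call checkCuda(cudaGetLastError()) to pick up launch errors (add a
// cudaDeviceSynchronize() first to also catch asynchronous execution errors).
#define checkCuda(call) do {                                      \
    cudaError_t err_ = (call);                                    \
    if (err_ != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                cudaGetErrorString(err_), __FILE__, __LINE__);    \
        exit(EXIT_FAILURE);                                       \
    }                                                             \
} while (0)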
In C (or C++), array indexing generally starts at 0. Therefore, to test for an out-of-bounds index, we must check whether the generated index is equal to or greater than the size of the array. But in your case you are only testing for greater than:
if (row > n_sim || column > num_cols) return;
You make a similar error in both your 1D kernel and in your 2D kernel, and although you believe your 1D kernel is working correctly, it is actually making out-of-bounds accesses. You can verify this if you run with the aforementioned cuda-memcheck utility (or probably also with the memory checker that can be enabled in Nsight VSE).
When I modify your code in the pastebin link to use proper range/bounds checking, cuda-memcheck reports no errors, and your program reports the correct results. I've tested both cases, but the code below is modified from your pastebin link to uncomment the 2D case, and use that instead of the 1D case:
$ cat t375.cu | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
#include <cmath>
#include <cstdlib>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[
i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (f
loat(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (
float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that node
pos_type * map_node2values_how_many; //vector that stores how many actions we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node simply has max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is the number of columns of the result
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keeps track of the columns to grab, and their weights in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
#if 0
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif
//2d
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (total_action_number + block_size2 - 1) / block_size2;
int y_grid = (n_sim + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
$ nvcc -arch=sm_52 -o t375 t375.cu -std=c++11
$ cuda-memcheck ./t375
========= CUDA-MEMCHECK
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
$
Note that cuda-memcheck slows down the execution of your program on the GPU to do rigorous memory bounds checking. Therefore the performance may not match the ordinary case. This is what an "ordinary" run looks like:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
$

You are accessing memory beyond the allocated chunk. To keep the row and column indices within range:
if (row >= n_rows || column >= num_cols) return; // Do this
if (row > n_rows || column > num_cols) return; // Instead of this
In the flat version, int row = index % n_rows; keeps row below n_rows. You only access one column beyond the allocated memory, which for a small matrix could still be within the memory alignment. Python demo.
The second version accesses an extra column plus an extra element, and one extra element for each row (the first element of the following row), because this:
int row = blockIdx.y * blockDim.y + threadIdx.y;
no longer keeps the row index within the valid range. Python demo.
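To see the difference concretely, here is a small self-contained host-side sketch (with made-up sizes, not taken from the question) of which flat indices slip through a '>' test:
#include <iostream>

int main() {
    int n_rows = 4, num_cols = 3;   // valid flat indices: 0..11
    int launched = 16;              // thread count, rounded up by the launch
    for (int index = 0; index < launched; index++) {
        int column = index / n_rows;
        int row = index % n_rows;   // always < n_rows, so never out of range itself
        if (row > n_rows || column > num_cols) continue;  // the '>' test
        if (index >= n_rows * num_cols)
            std::cout << "index " << index << " (column " << column
                      << ") passes the test but is out of bounds\n";
    }
    return 0;
}
With '>=' in the test, none of the indices 12..15 would get through.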
Looking at your pastebin, this is probably the place where it breaks:
44. fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
^^^
45. for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
46. res += weights[w_p + inter_point] * \
grid_values[row + node_map[w_p + inter_point] * n_sim];
^^^
47. }

Related

CUDA straight array implementation 3x faster than Struct of Arrays (SoA) AND Array of Structs (AoS)

I found this question about the speed of Structure of Arrays (SoA) and Array of Structs (AoS) implementations, and the top answer states that the speed of each implementation depends on the access pattern. To test this, I've created 3 kernels: one for a raw array of doubles, another for the AoS pattern, and the last for the SoA pattern. Each method adds 1.0 to each respective "element" in some column vector, if you will. What I'm finding is that the straight-array case is more than 3x faster than both the SoA and AoS implementations. I cannot understand why the straight array is this much faster when the access patterns are identical between all three methods. The CUDA profiler outputs the same message for each kernel, which is
The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To further improve performance, work will likely need to be shifted from the most utilized to another unit. Start by analyzing workloads in the Compute Workload Analysis section.
Here is a minimal reproducible example:
#include <iostream>
#include <chrono>
#include <vector>
using namespace std::chrono;
constexpr int nvars = 4;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct AoS {
double t, x, y, z;
__device__ AoS& operator+=(const AoS &vec) {
t += vec.t;
x += vec.x;
y += vec.y;
z += vec.z;
return *this;
};
};
struct SoA {
size_t size;
bool host_allocated, device_allocated;
double* t;
double* x;
double* y;
double* z;
double* gpu_t;
double* gpu_x;
double* gpu_y;
double* gpu_z;
SoA* device_ptr;
SoA(int elements) : size(elements * sizeof(double)), host_allocated(false), device_allocated(false) {};
~SoA(){
if (host_allocated){
free(t);
free(x);
free(y);
free(z);
}
if (device_allocated) {
cudaFree(gpu_t);
cudaFree(gpu_x);
cudaFree(gpu_y);
cudaFree(gpu_z);
cudaFree(device_ptr);
}
}
void host_allocate() {
t = (double*)malloc(size);
x = (double*)malloc(size);
y = (double*)malloc(size);
z = (double*)malloc(size);
host_allocated = true;
};
void device_allocate() {
if (!host_allocated) {
host_allocate();
}
gpuErrchk(cudaMalloc((void**)&device_ptr, sizeof(SoA)));
gpuErrchk(cudaMalloc((void**)&gpu_t, size));
gpuErrchk(cudaMalloc((void**)&gpu_x, size));
gpuErrchk(cudaMalloc((void**)&gpu_y, size));
gpuErrchk(cudaMalloc((void**)&gpu_z, size));
device_allocated = true;
};
void copy_to_device() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(gpu_t, t, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_x, x, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_y, y, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_z, z, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->t), &gpu_t, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->x), &gpu_x, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->y), &gpu_y, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->z), &gpu_z, sizeof(double *), cudaMemcpyHostToDevice));
}
void copy_to_host() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(t, gpu_t, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(x, gpu_x, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(y, gpu_y, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(z, gpu_z, size, cudaMemcpyDeviceToHost));
}
SoA* get_ptr(){
if (device_allocated) {
return device_ptr;
}
return this;
}
};
__global__ void arr_add(double* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
#pragma unroll
for (int var = 0; var < nvars; var++) {
vec[ii + var] = vec[ii + var] + 1.0;
}
};
__global__ void aos_add(AoS* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec[ii] += AoS{1.0, 1.0, 1.0, 1.0};
};
__global__ void soa_add(SoA *vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec->t[ii] = vec->t[ii] + 1.0;
vec->x[ii] = vec->x[ii] + 1.0;
vec->y[ii] = vec->y[ii] + 1.0;
vec->z[ii] = vec->z[ii] + 1.0;
};
int main() {
constexpr int n = 1 << 25;
constexpr int block_size = 128;
high_resolution_clock::time_point t1, t2;
duration<double> dt1, dt2, dt3;
SoA hybrid_soa(n);
hybrid_soa.device_allocate();
hybrid_soa.copy_to_device();
std::vector<AoS> host_vec_aos(n);
std::vector<double> host_arr(n * nvars);
AoS *dev_aos;
double *dev_arr;
gpuErrchk(cudaMalloc((void**)&dev_aos, n * sizeof(AoS)));
gpuErrchk(cudaMalloc((void**)&dev_arr, n * nvars * sizeof(double)));
gpuErrchk(cudaMemcpy(dev_aos, host_vec_aos.data(), n * sizeof(AoS), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(dev_arr, host_arr.data(), n * nvars * sizeof(double), cudaMemcpyHostToDevice));
const int nblocks = (n + block_size - 1) / block_size;
std::cout << "Size of AoS struct is " << sizeof(AoS) << " bytes" << "\n";
std::cout << "Size of SoA struct is " << sizeof(SoA) << " bytes" << "\n";\
t1 = high_resolution_clock::now();
arr_add<<<nblocks, block_size>>>(dev_arr, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt1 = t2 - t1;
std::cout << "SrA took: " << std::scientific << dt1.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
aos_add<<<nblocks, block_size>>>(dev_aos, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt2 = t2 - t1;
std::cout << "AoS took: " << std::scientific << dt2.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
soa_add<<<nblocks, block_size>>>(hybrid_soa.get_ptr(), n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt3 = t2 - t1;
std::cout << "SoA took: " << std::scientific << dt3.count() << " seconds" << "\n";
gpuErrchk(cudaMemcpy(host_vec_aos.data(), dev_aos, n * sizeof(AoS), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(host_arr.data(), dev_arr, n * nvars * sizeof(double), cudaMemcpyDeviceToHost));
hybrid_soa.copy_to_host();
std::cout << "Straight array is: " << dt2.count() / dt1.count() << " times faster than AoS" << "\n";
std::cout << "Straight array is: " << dt3.count() / dt1.count() << " times faster than SoA" << "\n";
cudaFree(dev_aos);
cudaFree(dev_arr);
return 0;
}
I was expecting the speed to be near identical.

Why is my multithreading code slower than the serial version and what can I do to speed it up?

I've recently learnt the basics of threading and thought I'd take it for a spin. However, doing some tests, it seems that the threaded version of the code is actually slower than the serial code. Can anybody spot any problems with the following program? If there are none (highly doubtful), can you propose a strategy where I do observe speed-ups? The other thing that crossed my mind is that this is a simple problem, so maybe the overhead incurred by threading isn't worth the effort. In reality the BealeFunction below will be a system of ODEs, so computationally expensive to evaluate.
Here are the results:
/**
* Results
* -------
*
* In serial: fitness = 163.179; computation took: 4085 microseconds
* In serial: fitness = 163.179; computation took: 3606 microseconds
* In serial: fitness = 163.179; computation took: 4288 microseconds
*
* With threading: fitness = 163.179; computation took: 16893 microseconds
* With threading: fitness = 163.179; computation took: 14333 microseconds
* With threading: fitness = 163.179; computation took: 13636 microseconds
*
*/
And the code to generate them:
#include <chrono>
#include <cmath>
#include <random>
#include <iostream>
#include <thread>
#include <future>
#include <mutex>
#include <vector>
double BealeFunction(double *parameters) {
double x = parameters[0];
double y = parameters[1];
double first = pow(1.5 - x + x * y, 2);
double second = pow(2.25 - x + x * pow(y, 2), 2);
double third = pow(2.625 - x + x * pow(y, 3), 2);
return first + second + third;
};
double inSerial(std::vector<std::vector<double>> matrix){
double total = 0;
for (auto & i : matrix){
total += BealeFunction(i.data());
}
return total;
}
double withThreading(std::vector<std::vector<double>> matrix){
double total = 0;
std::mutex mtx;
std::vector<std::shared_future<double>> futures;
auto compute1 = [&](int startIndex, int endIndex) {
double sum = 0;
for (int i = startIndex; i <= endIndex; i++) {
sum += BealeFunction(matrix[i].data());
}
return sum;
};
// deal with situation where population size < hardware_concurrency.
int numThreads = 0;
if (matrix.size() < (int) std::thread::hardware_concurrency() - 1) {
numThreads = matrix.size() - 1; // account for 0 index
} else {
numThreads = (int) std::thread::hardware_concurrency() - 1; // account for main thread
}
int numPerThread = floor(matrix.size() / numThreads);
int remainder = matrix.size() % numThreads;
int startIndex = 0;
int endIndex = numPerThread;
for (int i = 0; i < numThreads; i++) {
if (i < remainder) {
// we need to add one more job for these threads
startIndex = i * (numPerThread + 1);
endIndex = startIndex + numPerThread;
} else {
startIndex = i * numPerThread + remainder;
endIndex = startIndex + (numPerThread - 1);
}
std::cout << "thread " << i << "; start index: " << startIndex << "; end index: " << endIndex << std::endl;
std::shared_future<double> f = std::async(std::launch::async, compute1, startIndex, endIndex);
futures.push_back(f);
}
// now collect the results from futures
for (auto &future : futures) {
total += future.get();
}
return total;
}
int main() {
auto start = std::chrono::steady_clock::now();
int N = 2000;
int M = 2;
// (setup code)
std::vector<std::vector<double>> matrix(N, std::vector<double>(M));
int seed = 5;
std::default_random_engine e(seed);
std::uniform_real_distribution<double> dist1(2.9, 3.1);
std::uniform_real_distribution<double> dist2(0.4, 0.6);
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
matrix[i][0] = dist1(e);
matrix[i][1] = dist2(e);
}
}
double total = withThreading(matrix);
// double total = inSerial(matrix);
auto end = std::chrono::steady_clock::now();
std::cout << "fitness: " << total << std::endl;
std::cout << "computation took: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
<< " microseonds" << std::endl;
}

Receiving char array via MPI_Recv results in the output image being partially black

I am calculating a picture and I am using OpenMPI to distribute my loop onto several cores.
Every core calculates its part of the loop correctly, but merging
both arrays on the core with rank 0 results in part of the picture being black.
I do not know if sending/receiving the char array corrupts the data or
if my merge is not correct.
(My virtual machine has 2 cores.)
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <xmmintrin.h>
#include "complex.h"
#include "lodepng.h"
using namespace std;
#include <sys/time.h>
#include <mpi.h>
int julia(int width, int height, int x, int y, const complex &c, const int max_iter) {
/*
* calculates the number of iterations (between 0 and 255) for a pixel (x,y)
*/
complex z(1.5f * (x - width / 2) / (0.5f * width), (y - height / 2) / (0.5f * height));
int n = 0;
for (int i = 0; ((z.abs() < 2.0f) && (i < max_iter)); i++) {
float tmp = z.re() * z.re() - z.im() * z.im() + c.re();
z.set_im(2.0f * z.re() * z.im() + c.im());
z.set_re(tmp);
n++;
}
return max_iter - n;
}
int main(int argc, char **argv) {
int rank, num_procs;
//Initialize the infrastructure necessary for communication
MPI_Init(&argc, &argv);
//Identify this process
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//Find out how many processes are active
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
const complex c(-0.7, 0.27015); // pretty
// const complex c( -0.8, 0.156); // alternative complex number
const int width = 2 * 1024, height = 2 * 1024; // image size
int max_iter = 256;
// Allocate aligned image buffer
unsigned char *image = NULL;
int err = posix_memalign((void **) &image, 64, width * height * 4 * sizeof(unsigned char));
if (err){
cout << "[Error] Couldn't allocate memory for regular Image." << endl;
}
int count = height / num_procs;
int remainder = height % num_procs;
int start, stop;
if (rank < remainder) {
// The first 'remainder' ranks get 'count + 1' tasks each
start = rank * (count + 1);
stop = start + count;
} else {
// The remaining 'size - remainder' ranks get 'count' task each
start = rank * count + remainder;
stop = start + (count - 1);
}
/*
* Creates a pictures of a julia set
*/
cout << "--------------------- " << endl;
cout << "my rank " << rank << endl;
cout << "number of procs " << num_procs << endl;
cout << "start " << start << endl;
cout << "end " << stop << endl;
cout << "--------------------- "<< endl;
// iterate over image pixels and calculate their value
for (int y = start; y <= stop; y++) {
for (int x = 0; x < width; x++) {
int a = julia(width, height, x, y, c, max_iter);
image[4 * width * y + 4 * (x + 0) + 0] = a; // R
image[4 * width * y + 4 * (x + 0) + 1] = a; // G
image[4 * width * y + 4 * (x + 0) + 2] = a; // B
image[4 * width * y + 4 * (x + 0) + 3] = 255; // Alpha
}
}
if(rank != 0)
{
//send image to rank 0
MPI_Send(image, sizeof(image), MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD);
}
if(rank == 0 && (num_procs > 0))
{
// Allocate aligned image buffer
unsigned char *lImage = (unsigned char *) malloc(width * height * 4 * sizeof(unsigned char));
/*
unsigned char *lImage = NULL;
int err = posix_memalign((void **) &lImage, 64, width * height * 4 * sizeof(unsigned char));
if (err){
cout << "[Error] Couldn't allocate memory for regular Image." << endl;
}
*/
for(int i = 1; i < num_procs; i++)
{
//lImage receives the image array from process i
MPI_Recv(lImage, sizeof(lImage), MPI_UNSIGNED_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
//calculate start and stop for the following for loop
int lCount = height / num_procs;
int lRemainder = height % num_procs;
int lStart, lStop;
if (i < lRemainder) {
// The first 'remainder' ranks get 'count + 1' tasks each
lStart = i * (lCount + 1);
lStop = lStart + lCount;
} else {
// The remaining 'size - remainder' ranks get 'count' task each
lStart = i * lCount + lRemainder;
lStop = lStart + (lCount - 1);
}
cout << "--------inside loop----------- " << endl;
cout << "my rank " << i << endl;
cout << "start " << lStart << endl;
cout << "end " << lStop << endl;
cout << "--------------------- "<< endl;
for (int y = lStart; y <= lStop; y++) {
for (int x = 0; x < width; x++) {
image[4 * width * y + 4 * (x + 0) + 0] = lImage[4 * width * y + 4 * (x + 0) + 0]; // R
image[4 * width * y + 4 * (x + 0) + 1] = lImage[4 * width * y + 4 * (x + 0) + 1]; // G
image[4 * width * y + 4 * (x + 0) + 2] = lImage[4 * width * y + 4 * (x + 0) + 2]; // B
image[4 * width * y + 4 * (x + 0) + 3] = 255; // Alpha
}
}
//receive image of the rank i
//merge received image array and rank 0 image array
}
free(lImage);
}
if(rank == 0)
{
/*Encode the image*/
unsigned error = lodepng_encode32_file("julia_openmp0.png", image, width, height);
if (error) {
std::cout << "[Error] " << error << " : " << lodepng_error_text(error) << std::endl;
}
}
if(rank == 1)
{
/*Encode the image*/
unsigned error = lodepng_encode32_file("julia_openmp1.png", image, width, height);
if (error) {
std::cout << "[Error] " << error << " : " << lodepng_error_text(error) << std::endl;
}
}
/*cleanup*/
free(image);
/* Tear down the communication infrastructure */
MPI_Finalize();
return 0;
}
Sorry, I had no chance to do proper checking, but I think it boils down to this line:
MPI_Recv(lImage, sizeof(lImage), MPI_UNSIGNED_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
You are trying to receive the messages in a particular order (process 1, then process 2), while you don't really know which one will be ready first, and MPI_Recv is a blocking receive operation. Can you try to rewrite the logic in such a way that it does not depend on the order of computation? One way to do it would be to use MPI_Gather(). Another option would be to include the process number in the message you send, then use MPI_ANY_SOURCE to receive the next incoming message and use that process number in the loop to handle the block correctly.
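A rough sketch of the MPI_ANY_SOURCE option, reusing the names from your code (note that the count argument here is the full buffer size; sizeof(lImage) is only the size of a pointer):
MPI_Status status;
int img_bytes = width * height * 4;   // number of MPI_UNSIGNED_CHAR elements
for (int i = 1; i < num_procs; i++) {
    // take whichever block arrives first; the sender's rank comes from the status
    MPI_Recv(lImage, img_bytes, MPI_UNSIGNED_CHAR, MPI_ANY_SOURCE, 0,
             MPI_COMM_WORLD, &status);
    int sender = status.MPI_SOURCE;
    // ...compute lStart/lStop from 'sender' instead of 'i', then copy those rows...
}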

cudaErrorIllegalAddress on cudaMemcpy

I am new to CUDA and trying to write a little code which should generate random points on a sphere. Here is the code.
#include <cstdlib>
#include <iostream>
#include <curand_kernel.h>
#include <thrust/host_vector.h>
__global__
void setup_kernel(curandStateMRG32k3a *state)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
int threads = 256;
dim3 grid = size / threads;
setup_kernel<<<grid,threads>>>(devStates);
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
This code works if the number of elements in the vectors is less than 4000 (I tried 1K, 2K, 3K and 4K). Then it gives me cudaErrorIllegalAddress on the first cudaMemcpy. I don't think I run out of memory; I am working with a GTX 980 (4GB of global memory). Any idea how to fix this?
EDIT: The code after the suggested modifications is the following:
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if(id < numberOfElements) curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
int threads = 512;
dim3 grid = (h_x.size() + threads - 1) / threads;
setup_kernel<<<grid,threads>>>(devStates, size / sizeof(float));
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
cudaDeviceSynchronize();
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
I am sorry to keep posting here, but I think that by understanding what my mistakes are, I might get a better understanding of CUDA.
So now I am getting cudaErrorIllegalAddress on the device-to-host cudaMemcpy when h_x.size() is 20k. I still do not understand why the code works for small numbers but not for big ones.
The problem is here:
size_t size = h_x.size() * sizeof(float);
...
int threads = 256;
dim3 grid = size / threads;
Your size variable is scaled by the number of bytes. So that is not the correct variable to use for the grid size. You should compute the grid size like this:
dim3 grid = h_x.size() / threads;
or similar. Also note that this construct won't properly initialize all curand state unless the vector length (h_x.size()) is evenly divisible by threads i.e. 256. The method to address this would be to include a thread check in your setup_kernel similar to the one in your other kernel:
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id < size)
curand_init(0, id, 0, &state[id]);
}
and launch enough threads to cover the vector size:
dim3 grid = (h_x.size()+threads-1) / threads;

Optimizing 1D Convolution

Is there a way to speed up this 1D convolution? I tried to make the dy pass cache-efficient,
but compiling with g++ and -O3 gave worse performance.
I am convolving with [-1., 0., 1.] in both directions.
It's not homework.
#include<iostream>
#include<cstdlib>
#include<sys/time.h>
void print_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
std::cout << matrix[j * width + i] << ",";
}
std::cout << std::endl;
}
}
void fill_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
matrix[j * width + i] = ((float)rand() / (float)RAND_MAX) ;
}
}
}
#define RESTRICT __restrict__
void dx_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[1];
for (int j=0; j < height; j++){
float* row = in_matrix + j * width;
for (int i=1; i < width-1; i++){
float res = -1.F * row[i-1] + row[i+1]; /* -1.F * value + 0.F * value + 1.F * value; */
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}
void dy_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[ width + 1];
for (int j=1; j < height-1; j++){
for (int i=0; i < width; i++){
float res = -1.F * in_matrix[ (j-1) * width + i] + in_matrix[ (j+1) * width + i] ;
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}
double now (void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
int main(int argc, char **argv){
int width, height;
float *in_matrix;
float *out_matrix;
if(argc < 3){
std::cout << argv[0] << "usage: width height " << std::endl;
return -1;
}
srand(123);
width = atoi(argv[1]);
height = atoi(argv[2]);
std::cout << "Width:"<< width << " Height:" << height << std::endl;
if (width < 3){
std::cout << "Width too short " << std::endl;
return -1;
}
if (height < 3){
std::cout << "Height too short " << std::endl;
return -1;
}
in_matrix = (float *) malloc( height * width * sizeof(float));
out_matrix = (float *) malloc( height * width * sizeof(float));
fill_matrix(height, width, in_matrix);
//print_matrix(height, width, in_matrix);
float min, max;
double a = now();
dx_matrix(height, width, in_matrix, out_matrix, &min, &max);
std::cout << "dx min:" << min << " max:" << max << std::endl;
dy_matrix(height, width, in_matrix, out_matrix, &min, &max);
double b = now();
std::cout << "dy min:" << min << " max:" << max << std::endl;
std::cout << "time: " << b-a << " sec" << std::endl;
return 0;
}
Use local variables for computing the min and max. Every time you do this:
if (res > *max ) *max = res;
if (res < *min ) *min = res;
max and min have to get written to memory. Adding restrict on the pointers would help (indicating the writes are independent), but an even better way would be something like
//Setup
float tempMin = ...
float tempMax = ...
...
// Inner loop
tempMin = (res < tempMin) ? res : tempMin;
tempMax = (res > tempMax) ? res : tempMax;
...
// End
*min = tempMin;
*max = tempMax;
First of all, I would rewrite the dy loop to get rid of "[ (j-1) * width + i]" and "in_matrix[ (j+1) * width + i]", and do something like:
float* p, *q, *out;
p = &in_matrix[(j-1)*width];
q = &in_matrix[(j+1)*width];
out = &out_matrix[j*width];
for (int i=0; i < width; i++){
float res = -1.F * p[i] + q[i] ;
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out[i] = res;
}
But that is a trivial optimization that the compiler may already be doing for you.
It will be slightly faster to do "q[i]-p[i]" instead of "-1.f*p[i]+q[i]", but, again, the compiler may be smart enough to do that behind your back.
The whole thing would benefit considerably from SSE2 and multithreading. I'd bet on at least a 3x speedup from SSE2 right away. Multithreading can be added using OpenMP and it will only take a few lines of code.
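For example, here is a rough OpenMP sketch of what the dy loop could look like (a sketch only, assuming OpenMP 3.1 or newer for the min/max reductions; each thread keeps a private running min/max that is combined at the end):
#include <cfloat>

float tempMin = FLT_MAX, tempMax = -FLT_MAX;
#pragma omp parallel for reduction(min:tempMin) reduction(max:tempMax)
for (int j = 1; j < height - 1; j++) {
    for (int i = 0; i < width; i++) {
        // row below minus row above, same arithmetic as dy_matrix
        float res = in_matrix[(j + 1) * width + i] - in_matrix[(j - 1) * width + i];
        if (res < tempMin) tempMin = res;
        if (res > tempMax) tempMax = res;
        out_matrix[j * width + i] = res;
    }
}
*min = tempMin;
*max = tempMax;
Compile with -fopenmp (on g++) so the pragma takes effect.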
The compiler might notice this but you are creating/freeing a lot of variables on the stack as you go in and out of the scope operators {}. Instead of:
for (int j=0; j < height; j++){
float* row = in_matrix + j * width;
for (int i=1; i < width-1; i++){
float res = -1.F * row[i-1] + row[i+1];
How about:
int i, j;
float *row;
float res;
for (j=0; j < height; j++){
row = in_matrix + j * width;
for (i=1; i < width-1; i++){
res = -1.F * row[i-1] + row[i+1];
Well, the compiler might be taking care of these, but here are a couple of small things:
a) Why are you multiplying by -1.F? Why not just subtract? For instance:
float res = -1.F * row[i-1] + row[i+1];
could just be:
float res = row[i+1] - row[i-1];
b) This:
if (res > *max ) *max = res;
if (res < *min ) *min = res;
can be made into
if (res > *max ) *max = res;
else if (res < *min ) *min = res;
and in other places. If the first is true, the second can't be, so let's not check it.
Addition:
Here's another thing. To minimize your multiplications, change
for (int j=1; j < height-1; j++){
for (int i=0; i < width; i++){
float res = -1.F * in_matrix[ (j-1) * width + i] + in_matrix[ (j+1) * width + i] ;
to
int h = 0;
int width2 = 2 * width;
for (int j=1; j < height-1; j++){
for (int i=h; i < h + width; i++){
float res = in_matrix[i + width2] - in_matrix[i];
writing the result inside the inner loop as
out_matrix[i + width] = res;
and advancing h += width; at the end of each outer iteration (so h stays at (j-1)*width).
You can do similar things in other places, but hopefully you get the idea. Also, there is a minor bug,
*min = *max = -1.F * in_matrix[0] + in_matrix[ width + 1 ];
should be just in_matrix[ width ] at the end.
Profiling this with -O3 and -O2 using versions of both the clang and g++ compilers on OS X, I found that
30% of the time was spent filling the initial matrix
matrix[j * width + i] = ((float)rand() / (float)RAND_MAX) ;
40% of the time was spent in dx_matrix on the line.
out_matrix[j * width + i] = row[i+1] -row[i-1];
About 9% of the time was spent in the conditionals in dx_matrix. I separated them into a separate loop to see if that helped, but it didn't change anything much.
Shark gave the suggestion that this could be improved through the use of SSE instructions (a sketch follows at the end).
Interestingly only about 19% of the time was spent in the dy_matrix routine.
This was running on a 10k by 10k matrix (about 1.6 seconds).
Note your results may be different if you're using a different compiler, different OS etc.
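Picking up the SSE suggestion above, this is a rough, untested sketch of the dy inner loop with SSE intrinsics (it assumes width is a multiple of 4; unaligned loads/stores keep it general, and the four lanes of vmin/vmax still have to be folded into scalar min/max after the loop):
#include <cfloat>
#include <xmmintrin.h>

__m128 vmin = _mm_set1_ps(FLT_MAX);
__m128 vmax = _mm_set1_ps(-FLT_MAX);
for (int j = 1; j < height - 1; j++) {
    const float* p = in_matrix + (j - 1) * width;  // row above
    const float* q = in_matrix + (j + 1) * width;  // row below
    float* out = out_matrix + j * width;
    for (int i = 0; i < width; i += 4) {
        __m128 res = _mm_sub_ps(_mm_loadu_ps(q + i), _mm_loadu_ps(p + i));  // q[i]-p[i], four lanes at a time
        vmin = _mm_min_ps(vmin, res);
        vmax = _mm_max_ps(vmax, res);
        _mm_storeu_ps(out + i, res);
    }
}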