Receiving char array via MPI_Recv results in the output image being partially black

Receiving char array via MPI_Recv results in the output image being partially black - c++

I am calculating a picture and i am using OpenMPI to destribute my loop onto several cores.
Every core calculates a part of the loop correctly, but merging
both arrays on the core with rank 0 results in part of the picture being black.
I do not know if sending/receiving the char array corrupts the data or
if my merge is not correct.
(My virtual machine has 2 cores).
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <xmmintrin.h>
#include "complex.h"
#include "lodepng.h"
using namespace std;
#include <sys/time.h>
#include <mpi.h>
int julia(int width, int height, int x, int y, const complex &c, const int max_iter) {
/*
* calculates the number of iterations (between 0 and 255) for a pixel (x,y)
*/
complex z(1.5f * (x - width / 2) / (0.5f * width), (y - height / 2) / (0.5f * height));
int n = 0;
for (int i = 0; ((z.abs() < 2.0f) && (i < max_iter)); i++) {
float tmp = z.re() * z.re() - z.im() * z.im() + c.re();
z.set_im(2.0f * z.re() * z.im() + c.im());
z.set_re(tmp);
n++;
}
return max_iter - n;
}
int main(int argc, char **argv) {
int rank, num_procs;
//Initialize the infrastructure necessary for coomunication
MPI_Init(&argc, &argv);
//Identify this process
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//Find out how many processes are active
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
const complex c(-0.7, 0.27015); // pretty
// const complex c( -0.8, 0.156); // alternative complex number
const int width = 2 * 1024, height = 2 * 1024; // image size
int max_iter = 256;
// Allocate aligned image buffer
unsigned char *image = NULL;
int err = posix_memalign((void **) &image, 64, width * height * 4 * sizeof(unsigned char));
if (err){
cout << "[Error] Couldn't allocate memory for regular Image." << endl;
}
int count = height / num_procs;
int remainder = height % num_procs;
int start, stop;
if (rank < remainder) {
// The first 'remainder' ranks get 'count + 1' tasks each
start = rank * (count + 1);
stop = start + count;
} else {
// The remaining 'size - remainder' ranks get 'count' task each
start = rank * count + remainder;
stop = start + (count - 1);
}
/*
* Creates a pictures of a julia set
*/
cout << "--------------------- " << endl;
cout << "my rank " << rank << endl;
cout << "number of procs " << num_procs << endl;
cout << "start " << start << endl;
cout << "end " << stop << endl;
cout << "--------------------- "<< endl;
// iterate over image pixels and calculate their value
for (int y = start; y <= stop; y++) {
for (int x = 0; x < width; x++) {
int a = julia(width, height, x, y, c, max_iter);
image[4 * width * y + 4 * (x + 0) + 0] = a; // R
image[4 * width * y + 4 * (x + 0) + 1] = a; // G
image[4 * width * y + 4 * (x + 0) + 2] = a; // B
image[4 * width * y + 4 * (x + 0) + 3] = 255; // Alpha
}
}
if(rank != 0)
{
//send image to rank 0
MPI_Send(image, sizeof(image), MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD);
}
if(rank == 0 && (num_procs > 0))
{
// Allocate aligned image buffer
unsigned char *lImage = (unsigned char *) malloc(width * height * 4 * sizeof(unsigned char));
/*
unsigned char *lImage = NULL;
int err = posix_memalign((void **) &lImage, 64, width * height * 4 * sizeof(unsigned char));
if (err){
cout << "[Error] Couldn't allocate memory for regular Image." << endl;
}
*/
for(int i = 1; i < num_procs; i++)
{
//lImage receives the image array from process i
MPI_Recv(lImage, sizeof(lImage), MPI_UNSIGNED_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
//calculate start and stop for the following for loop
int lCount = height / num_procs;
int lRemainder = height % num_procs;
int lStart, lStop;
if (i < lRemainder) {
// The first 'remainder' ranks get 'count + 1' tasks each
lStart = i * (lCount + 1);
lStop = lStart + lCount;
} else {
// The remaining 'size - remainder' ranks get 'count' task each
lStart = i * lCount + lRemainder;
lStop = lStart + (lCount - 1);
}
cout << "--------inside loop----------- " << endl;
cout << "my rank " << i << endl;
cout << "start " << lStart << endl;
cout << "end " << lStop << endl;
cout << "--------------------- "<< endl;
for (int y = lStart; y <= lStop; y++) {
for (int x = 0; x < width; x++) {
image[4 * width * y + 4 * (x + 0) + 0] = lImage[4 * width * y + 4 * (x + 0) + 0]; // R
image[4 * width * y + 4 * (x + 0) + 1] = lImage[4 * width * y + 4 * (x + 0) + 1]; // G
image[4 * width * y + 4 * (x + 0) + 2] = lImage[4 * width * y + 4 * (x + 0) + 2]; // B
image[4 * width * y + 4 * (x + 0) + 3] = 255; // Alpha
}
}
//receive image of the rank i
//merge received image array and rank 0 image array
}
free(lImage);
}
if(rank == 0)
{
/*Encode the image*/
unsigned error = lodepng_encode32_file("julia_openmp0.png", image, width, height);
if (error) {
std::cout << "[Error] " << error << " : " << lodepng_error_text(error) << std::endl;
}
}
if(rank == 1)
{
/*Encode the image*/
unsigned error = lodepng_encode32_file("julia_openmp1.png", image, width, height);
if (error) {
std::cout << "[Error] " << error << " : " << lodepng_error_text(error) << std::endl;
}
}
/*cleanup*/
free(image);
/* Tear down the communication infrastructure */
MPI_Finalize();
return 0;

Sorry I had no chance to do a proper checking, but I think it boils down to this line:
MPI_Recv(lImage, sizeof(lImage), MPI_UNSIGNED_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
You are trying to receive the messages in a particular order (process 1, process 2), while you don't really know which one is ready first. And MPI_Recv receive operation is blocking. Can you try to rewrite the logic in such a way that it does not depend on the order of computation? One way to do it would be to use MPI_Gather(). Another option would be to include process number into the message you send, then use MPI_ANY_SOURCE to receive the next incoming message and to use process number in the loop to handle the block correctly.

Related

MPI C++ Runtime Error: signal 11 (Segmentation fault) with multi-dimensional array creation

Making Mandelbrot with MPI
So I've made a Mandelbrot generator and everything worked fine. Now I'm throwing in a speedup from MPI. Process 0 generates a file name mbrot.ppm and adds the appropriate metadata, then divides up the workload into chunks.
Each process receives the chunk's starting and ending positions and gets to work calculating its portion of the Mandelbrot set. To write to the mbrot.ppm file, each process saves its data in an array so it doesn't write to the file before the previous process finishes.
My Problem
Its a runtime error that says:
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node Lenovo exited on signal 11 (Segmentation fault).
I believe it comes from the line int data[3][xrange][yrange]; (line 120) since the print statement after this line never executes. Would there be an obvious reason I'm missing why this multi-dimensional array is causing me problems?
Full Code
#include <iostream>
#include <mpi.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <fstream>
#define MCW MPI_COMM_WORLD
using namespace std;
struct Complex {
double r;
double i;
};
Complex operator + (Complex s, Complex t) {
Complex v;
v.r = s.r + t.r;
v.i = s.i + t.i;
return v;
};
Complex operator * (Complex s, Complex t) {
Complex v;
v.r = s.r * t.r - s.i * t.i;
v.i = s.r * t.i + s.i * t.r;
return v;
};
int rcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int gcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int bcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int mbrot(Complex c, int maxIters) {
int i = 0;
Complex z;
z = c;
while (i < maxIters && z.r * z.r + z.i * z.i < 4) {
z = z * z + c;
i++;
}
return i;
};
int main(int argc, char * argv[]) {
int rank, size;
MPI_Init( & argc, & argv);
MPI_Comm_rank(MCW, & rank);
MPI_Comm_size(MCW, & size);
if (size < 2) {
printf("Not an MPI process if only 1 process runs.\n");
exit(1);
}
if (size % 2 != 0) {
printf("Please use a even number\n");
exit(1);
}
Complex c1, c2, c;
char path[] = "brot.ppm";
int DIM;
int chunk[4];
c1.r = -1;
c1.i = -1;
c2.r = 1;
c2.i = 1;
if (rank == 0) { //start the file
ofstream fout;
fout.open(path);
DIM = 2000; // pixel dimensions
fout << "P3" << endl; // The file type .ppm
fout << DIM << " " << DIM << endl; // dimensions of the image
fout << "255" << endl; // color depth
fout.close();
// making dimesions marks
for (int i = 0; i < size; i++) {
chunk[0] = 0; // startX
chunk[1] = DIM; // endX
chunk[2] = (DIM / size) * i; // startY
chunk[3] = (DIM / size) * (i + 1); // endY
MPI_Send(chunk, 4, MPI_INT, i, 0, MCW);
};
};
MPI_Recv(chunk, 4, MPI_INT, 0, 0, MCW, MPI_STATUS_IGNORE);
printf("Process %d recieved chunk\n\t StartX: %d, EndX: %d\n\t StartY: %d, EndY: %d\n", rank, chunk[0], chunk[1], chunk[2], chunk[3]);
// do stuff save in array
// data[3 elements][Xs][Ys]
int xrange = chunk[1] - chunk[0];
int yrange = chunk[3] - chunk[2];
printf("Process %d, x: %d, y: %d\n", rank, xrange, yrange);
int data[3][xrange][yrange];
printf("done\n");
// generate data for mandlebrot
for (int j = chunk[2]; j < chunk[3]; ++j) {
for (int i = chunk[0]; i < chunk[1]; ++i) {
// calculate one pixel of the DIM x DIM image
c.r = (i * (c1.r - c2.r) / DIM) + c2.r;
c.i = (j * (c1.i - c2.i) / DIM) + c2.i;
int iters = mbrot(c, 255);
data[0][i][j] = rcolor(iters);
data[1][i][j] = gcolor(iters);
data[2][i][j] = bcolor(iters);
}
}
printf("here2\n");
// taking turns to write their data to file
for (int k = 0; k < size; k++) {
if (rank == k) {
ofstream fout;
fout.open(path, ios::app);
fout << rank << " was here" << endl;
for (int j = chunk[2]; j < chunk[3]; ++j) {
for (int i = chunk[0]; i < chunk[1]; ++i) {
fout << data[0][i][j] << " " << data[1][i][j] << " " << data[2][i][j] << " ";
}
fout << endl;
}
printf("Process %d done and waiting\n", rank);
} else {
MPI_Barrier(MCW);
}
}
MPI_Finalize();
};
How to Run
$ mpic++ -o mbrot.out mbrot.cpp
$ mpirun -np 4 mbrot.out

CUDA: Filling a column-major matrix

I am fairly new to CUDA, and I am trying to offload to the GPU some cumbersome computations I am doing for a performance-critical project. On my computer I have two NVS 510 Graphic cards, but I am currently experimenting with one only.
I have some big column-major matrix (1000-5000 rows x 1-5 M columns) to be filled. I was so far able to write the code to fill the matrix like it were an array, and it works well for matrices of relatively small size.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_rows;
int row = index % n_rows;
if (row > n_sim || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
The kernel is called:
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
int num_blocks = (n_rows* n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (res,[other params], n_rows,n_columns);
and everything works just fine.
If I change the kernel to work with 2D threads:
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column* n_rows + row;
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
and I invoke it
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (n_columns + block_size2 - 1) / block_size2;
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D <<< grid_size2, num_blocks2 >>> (res,[other params], n_rows,n_columns);
the results are all zero and CUDA returns unknown error. What am I missing? the actual code, which compiles without error with VS2015 and CUDA 8.0, can be found here: https://pastebin.com/XBCVC7VV
Here is the code from the pastebin link:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that done
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node as simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keep tracks of the columnt to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
//2d
//int block_size2 = 32; //each block will have block_size2*block_size2 threads
//dim3 num_blocks2(block_size2, block_size2);
//int x_grid = (total_action_number + block_size2 - 1) / block_size2;
//int y_grid = (n_sim + block_size2 - 1) / block_size2;
//dim3 grid_size2(x_grid, y_grid);
//std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
//interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
Moreover, I would extremely grateful for any direction on how to improve the performance of the code.

Any time you are having trouble with a CUDA code, I recommend doing proper CUDA error checking (which you mostly seem to be doing), and also run your code with cuda-memcheck. This last utility is similar to "enabling the memory checker" in Nsight VSE, but not quite the same. However the Nsight VSE memory checker may have given you the same indication.
In C (or C++) indexing of arrays generally starts at 0. Therefore, to test for an out-of-bounds index, I must check to see if the generated index is equal to or greater than the size of the array. But in your case you are only testing for greater than:
if (row > n_sim || column > num_cols) return;
You make a similar error in both your 1D kernel and in your 2D kernel, and although you believe your 1D kernel is working correctly, it is actually making out-of-bounds accesses. You can verify this if you run with the aforementioned cuda-memcheck utility (or probably also with the memory checker that can be enabled in Nsight VSE).
When I modify your code in the pastebin link to use proper range/bounds checking, cuda-memcheck reports no errors, and your program reports the correct results. I've tested both cases, but the code below is modified from your pastebin link to uncomment the 2D case, and use that instead of the 1D case:
$ cat t375.cu | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl
_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map,
int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, in
t& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_poi
nt] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* no
de_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + int
er_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type*
node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + int
er_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[
i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (f
loat(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (
float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that done
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node as simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keep tracks of the columnt to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
#if 0
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif
//2d
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (total_action_number + block_size2 - 1) / block_size2;
int y_grid = (n_sim + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
$ nvcc -arch=sm_52 -o t375 t375.cu -std=c++11
$ cuda-memcheck ./t375
========= CUDA-MEMCHECK
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
$
Note that cuda-memcheck slows down the execution of your program on the GPU to do rigorous memory bounds checking. Therefore the performance may not match the ordinary case. This is what an "ordinary" run looks like:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
$

You are accessing memory beyond the allocated chunk. To check if row and column indices are within the range:
if (row >= n_rows || column >= num_cols) return; // Do this
if (row > n_rows || column > num_cols) return; // Instead of this
In flat version this int row = index % n_rows; makes row stay below the n_rows. You only access one column beyond the allocated memory, which for small matrix could still be withing the memory alignment. Python demo.
The second version does access an extra column plus and extra element, and one extra element for each row (the first element of the following row), as this:
int row = blockIdx.y * blockDim.y + threadIdx.y;
no longer keeps row index within the valid range. Python demo.
Looking at your pastebin, this is probably the place where it breaks:
44. fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
^^^
45. for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
46. res += weights[w_p + inter_point] * \
grid_values[row + node_map[w_p + inter_point] * n_sim];
^^^
47. }

can I create a window in c++ using existing project code

I was wondering if i could create a window using my existing project code. This is a school project. However, I have completed the actual coding part and just wanted to make the project fancier, so to say. Thank you so much in advance for all the support.
Here is the actual code in case it would be of any help. It's a bit lengthy so be warned :) Once again, thank you in advance
#define NOMINMAX
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <windows.h>
#include <Windows.h>
#include <chrono>
#include <string>
#include <fstream>
#include <iomanip>
#include <cstdlib>
using namespace std;
int key[3][3];
double inverted[3][3];
int store[1][3] = { 0 };
int conv[666];
int random1()
{
unsigned long long int xRan;
srand(time(NULL));
xRan = rand() % 9999 + 1;
return xRan;
}
int random2()
{
unsigned long long int xRan;
xRan = rand() % 9999 + 1;
return xRan;
}
int random3()
{
int xRan;
xRan = rand() % 9999 + 1;
return xRan;
}
void clear_screen(char fill = ' ') {
COORD tl = { 0, 0 };
CONSOLE_SCREEN_BUFFER_INFO s;
HANDLE console = GetStdHandle(STD_OUTPUT_HANDLE);
GetConsoleScreenBufferInfo(console, &s);
DWORD written, cells = s.dwSize.X * s.dwSize.Y;
FillConsoleOutputCharacter(console, fill, cells, tl, &written);
FillConsoleOutputAttribute(console, s.wAttributes, cells, tl, &written);
SetConsoleCursorPosition(console, tl);
}
int convert(char letter) {
int conv;
conv = (int)letter;
return conv;
}
void reverseMult(double inv[3][3], int decode[1][3])
{
store[0][0] = decode[0][0] * inv[0][0] + decode[0][1] * inv[1][0] + decode[0][2] * inv[2][0] + 0.5;
store[0][1] = decode[0][0] * inv[0][1] + decode[0][1] * inv[1][1] + decode[0][2] * inv[2][1] + 0.5;
store[0][2] = decode[0][0] * inv[0][2] + decode[0][1] * inv[1][2] + decode[0][2] * inv[2][2] + 0.5;
}
void matrixMult(int q, int w, int e, int a[3][3])
{
int A[1][3] = { q, w, e };
int B[3][3] = {
{ a[0][0], a[0][1], a[0][2] },
{ a[1][0], a[1][1], a[1][2] },
{ a[2][0], a[2][1], a[2][2] }
};
store[0][0] = A[0][0] * B[0][0] + A[0][1] * B[1][0] + A[0][2] * B[2][0];
store[0][1] = A[0][0] * B[0][1] + A[0][1] * B[1][1] + A[0][2] * B[2][1];
store[0][2] = A[0][0] * B[0][2] + A[0][1] * B[1][2] + A[0][2] * B[2][2];
//cout << store[0][0] << endl << store[0][1] << endl << store[0][2] << endl << endl;
}
char reverseConv(int x){
char conv;
conv = (char)x;
return conv;
}
void inverse(int key[3][3], double det){
int cofactor[3][3] = {
{ (key[1][1] * key[2][2] - key[1][2] * key[2][1]), -(key[1][0] * key[2][2] - key[1][2] * key[2][0]), (key[1][0] * key[2][1] - key[1][1] * key[2][0]) },
{ -(key[0][1] * key[2][2] - key[0][2] * key[2][1]), (key[0][0] * key[2][2] - key[0][2] * key[2][0]), -(key[0][0] * key[2][1] - key[0][1] * key[2][0]) },
{ (key[0][1] * key[1][2] - key[0][2] * key[1][1]), -(key[0][0] * key[1][2] - key[0][2] * key[1][0]), (key[0][0] * key[1][1] - key[0][1] * key[1][0]) }
};
for (int i = 0; i < 3; ++i)
{
for (int j = 0; j < 3; ++j)
{
inverted[i][j] =det * cofactor[i][j];
}
}
}
int main()
{
while (1){
cout << "Would you like to encrypt or decrypt?(e/d)\n " << endl;
string ende;
cin >> ende;
clear_screen();
if (ende == "e")
{
cout << "Please enter a name for the message: " << endl << endl;
string file;
cin.ignore(numeric_limits<streamsize>::max(), '\n');
getline(cin, file);
clear_screen();
file += ".txt";
ofstream encrypt;
encrypt.open(file);
string message;
cout << "Please enter the message you would like to encrypt: " << endl << endl;
//cin.ignore(numeric_limits<streamsize>::max(), '\n'); --- Not needed anymore, uncomment if you cannot input message
getline(cin, message);
if (message.length() % 3 != 0)
message += ' ';
if (message.length() % 3 != 0)
message += ' ';
clear_screen();
for (int i = 0; i < message.length(); ++i)
{
conv[i] = convert(message[i]);
}
int det = 0;
while (1){
key[0][0] = random1(); //1
key[0][1] = random2(); //2
key[0][2] = random3(); //3
key[1][0] = random1() * 13 / 7; //4
key[1][1] = random2() * 23 / 7; //5
key[1][2] = random3() * 33 / 7; //6
key[2][0] = random1() * 18 / 15; //7
key[2][1] = random2() * 18 / 12; //8
key[2][2] = random3() * 18 / 10; //9
det = key[0][0] * key[1][1] * key[2][2] + key[0][1] * key[1][2] * key[2][0] + key[0][2] * key[1][0] * key[2][1]
- key[0][2] * key[1][1] * key[2][0] - key[0][0] * key[1][2] * key[2][1] - key[0][1] * key[1][0] * key[2][2];
if (det != 0)
break;
}
encrypt << key[0][0] << ' ' << key[0][1] << ' ' << key[0][2] << ' '
<< key[1][0] << ' ' << key[1][1] << ' ' << key[1][2] << ' '
<< key[2][0] << ' ' << key[2][1] << ' ' << key[2][2] << endl << endl;
int a, b, c;
int count = 0;
for (int i = 0; i < (message.length)() / 3; ++i)
{
int counting = 0;
a = conv[count];
count++;
b = conv[count];
count++;
c = conv[count];
count++;
matrixMult(a, b, c, key);
encrypt << store[0][counting] << ' ';
counting++;
encrypt << store[0][counting] << ' ';
counting++;
encrypt << store[0][counting] << endl;
}
encrypt.close();
Sleep(750);
cout << "Your message has been encrypted." << endl;
Sleep(750);
cout << "Please check " << file << " for the encrypted message and key" << endl << endl;
Sleep(750);
}
if (ende == "d")
{
cout << "Please enter the name of the file you would like to decrypt: " << endl << endl;
string file;
cin.ignore(numeric_limits<streamsize>::max(), '\n');
getline(cin, file);
clear_screen();
file += ".txt";
ifstream decrypt;
decrypt.open(file);
int key[3][3];
for (int i = 0; i < 3; ++i)
{
for (int k = 0; k < 3; ++k){
decrypt >> key[k][i];
}
}
int det = 0;
det = key[0][0] * key[1][1] * key[2][2] + key[0][1] * key[1][2] * key[2][0] + key[0][2] * key[1][0] * key[2][1] - key[0][2] * key[1][1] * key[2][0] - key[0][0] * key[1][2] * key[2][1] - key[0][1] * key[1][0] * key[2][2];
double detInv = 1;
detInv /= det;
//double inv;
inverse(key, detInv);
int out[1][3];
int count = 0;
while (!decrypt.eof()){
for (int i = 0; i < 3; ++i)
{
decrypt >> out[0][i];
}
reverseMult(inverted, out);
count++;
char a, b, c;
a = reverseConv(store[0][0]);
b = reverseConv(store[0][1]);
c = reverseConv(store[0][2]);
if (decrypt.eof())
break;
cout << a << b << c;
}
}
cout << endl << endl;
cout << "Would you like to continue?(y/n) ";
char again;
cin >> again;
if (again != 'y')
exit(0);
clear_screen();
}
return 0;
}

Yes, you can convert your project to a Windowing application. You have two choices:
Use Windows (native) API
Use graphics framework
Windows API
The Windows API is the direct method for creating windows. However, it is a lot of code, lots of chances for defects to be injected. It is a good learning experience about how the Windowing system works. Get Petzold's book.
GUI Framework
There are a lot of GUI frameworks out there. These C++ frameworks have simplified the GUI and Widget creation, using object oriented programming. There are many out there, so search the internet for "GUI Framework C++ review".
A Different Programming Perspective
In your present project, the OS executes the program and statements are executed in order. A windowing system is based on event driven programming. In summary, your GUI is waiting for an event to occur.
A simple example for your project is a window with a single button. When the User clicks on the button, the Windowing system sends a message to the button event handler. The event handler is a function that will execute your code.

As Thomas said, yes you can migrate your code to a Windows application, by either going native with Win32 or by either using a C++ GUI Framework (QT, wxWindows, ...).
However, you will need to invest time to learn one of the solution. I would suggest to learn a C++ Framework, programming with low-level Win32 api is not very used today.
Although it's off-topic, I would suggest some improvements to your code.
First, you shouldn't use goto, and replace them by a while. You can replace
again:
xRan = rand() % 9999 + 1;
if (xRan <1)
goto again;
By
do{
xRan = rand() % 9999 + 1;
} while (xRan < 1);
Note that in this case, a goto or a while is useless as xRan will always be superior or equal to 1 (rand() always returns a positive value)
Also you can replace convert and reverseConv very long functions by a constant array of struct values (struct contains a int and a const char*). convert and reverseConvert functions would only parse the array to find a proper match.

Write small bmp on console via C++

I'm trying to read painted 16x16 bmp, but there is no 1 pixel = 3 bits (RGB). Even if first 4-5 lines is white and the rest is black the document is still full of 255 255 255 for each pixel.
In my case I need to show this image in console by analyzing RGB layers of each pixel but have a lot of trouble with it.
int main()
{
FILE* f = fopen("image.bmp", "rb");
unsigned char info[54];
fread(info, sizeof(unsigned char), 54, f); // read the 54-byte header
// extract image height and width from header
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int size = 3 * width * height;
unsigned char * data = new unsigned char[size]; // allocate 3 bytes per pixel
fread(data, sizeof(unsigned char), size, f); // read the rest of the data at once
fclose(f);
for (int i = 0; i < size; i += 3)
{
unsigned char tmp = data[i];
data[i] = data[i + 2];
data[i + 2] = tmp;
}
unsigned int * byteData = new unsigned int[size];
for (int i = 0; i <= size; i++)
{
byteData[i] = (int) data[i];
}
for (int i = 0; i <= size / 3; i++)
{
cout << i << ".\t" << byteData[i] << "\t" << byteData[i + 1] << "\t" << byteData[i + 2] << endl;
}
cout << endl;
cout << "=======================" << endl;
for (int j = 0; j < width; j++)
{
cout << j + 1 << ".\t";
for (int i = 0; i < height; i++)
{
//if ((int)data[j * width + i] >= 100 && (int)data[j * width + i + 1] >= 100 && (int)data[j * width + i + 2] >= 100)
if (((int) data[j * width + i] + (int) data[j * width + i + 1] + (int) data[j * width + i + 2]) / 3 <= 170)
cout << " ";
else cout << "*";
}
cout << endl;
}
getchar();
return 0;
}
As I think problem with byte sequenses and reading memory frome garbage, but if you could explain where is leak?

The solution is next: bmp should be created with 24 bpp and top-down row order.
Correct code is for 16x16 bitmap:
#include <iostream>
using namespace std;
unsigned char* readBMP(char* filename);
int main()
{
unsigned char * data = readBMP("winLogo.bmp");
int size = 16*16*3;
unsigned int * byteData = new unsigned int[size];
for (int i = 0; i <= size; i++)
{
byteData[i] = (int)data[i];
}
int k = 0;
//uncomment to write a line of RGB values
//for (int i = 0; i < size; i += 3)
//{
// cout << k+1 << ".\t" << byteData[i] << "\t" << byteData[i + 1] << "\t" << byteData[i + 2] << endl;
// k++;
//}
for (int i = 0; i < size; i+= 3)
{
if (i % 16*3 == 0)
{
cout << endl << (i/(16*3))+1 << ".\t";
}
else {};
if (byteData[i] >= 200 && byteData[i + 1] >= 200 && byteData[i + 2] >= 200)
cout << " ";
else
cout << "#";
}
getchar();
return 0;
}
unsigned char* readBMP(char* filename)
{
int i;
FILE* f = fopen(filename, "rb");
unsigned char info[54];
fread(info, sizeof(unsigned char), 54, f); // read the 54-byte header
// extract image height and width from header
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int size = -3 * width * height;
unsigned char* data = new unsigned char[size]; // allocate 3 bytes per pixel
fread(data, sizeof(unsigned char), size, f); // read the rest of the data at once
fclose(f);
//BGR -> RGB
for (i = 0; i < size; i += 3)
{
unsigned char tmp = data[i];
data[i] = data[i + 2];
data[i + 2] = tmp;
}
return data;
}

class variable access within recursive function

for an intro program, we were asked to build a program that could find every single possible working magic square of a given size. I am having trouble modifying a class variable from within a recursive function. I am trying to increment the number of magic squares found every time the combination of numbers I am trying yields a magic square.
More specifically, I am trying to modify numSquares within the function recursiveMagic(). After setting a breakpoint at that specific line, the variable, numSquares does not change, even though I am incrementing it. I think it has something to do with the recursion, however, I am not sure. If you want to lend some advice, I appreciate it.
//============================================================================
// Name : magicSquare.cpp
// Author :
// Version :
// Copyright : Your copyright notice
// Description : Hello World in C++, Ansi-style
//============================================================================
#include <iostream>
using namespace std;
/**
* MagicSquare
*/
class MagicSquare {
private:
int magicSquare[9];
int usedNumbers[9];
int numSquares;
int N;
int magicInt;
public:
MagicSquare() {
numSquares = 0;
for (int i = 0; i < 9; i++)
usedNumbers[i] = 0;
N = 3; //default is 3
magicInt = N * (N * N + 1) / 2;
}
MagicSquare(int n) {
numSquares = 0;
for (int i = 0; i < 9; i++)
usedNumbers[i] = 0;
N = n;
magicInt = N * (N * N + 1) / 2;
}
void recursiveMagic(int n) {
for (int i = 1; i <= N * N + 1; i++) {
if (usedNumbers[i - 1] == 0) {
usedNumbers[i - 1] = 1;
magicSquare[n] = i;
if (n < N * N)
recursiveMagic(n + 1);
else {
if (isMagicSquare()) {
numSquares++; //this is the line that is not working correctly
printSquare();
}
}
usedNumbers[i - 1] = 0;
}
}
}
//To efficiently check all rows and collumns, we must convert the one dimensional array into a 2d array
//since the sudo 2d array looks like this:
// 0 1 2
// 3 4 5
// 6 7 8
//the following for-if loops convert the i to the appropriate location.
bool isMagicSquare() {
for (int i = 0; i < 3; i++) {
if ((magicSquare[i * 3] + magicSquare[i * 3 + 1] + magicSquare[i * 3 + 2]) != magicInt) //check horizontal
return false;
else if ((magicSquare[i] + magicSquare[i + 3] + magicSquare[i + 6]) != magicInt) // check vertical
return false;
}
if ((magicSquare[0] + magicSquare[4] + magicSquare[8]) != magicInt)
return false;
if ((magicSquare[6] + magicSquare[4] + magicSquare[2]) != magicInt)
return false;
return true;
}
/**
* printSquare: prints the current magic square combination
*/
void printSquare() {
for (int i = 0; i < 3; i++)
cout << magicSquare[i * 3] << " " << magicSquare[i * 3 + 1]
<< " " << magicSquare[i * 3 + 2] << endl;
cout << "------------------" << endl;
}
/**
* checkRow: checks to see if the current row will complete the magic square
* #param i - used to determine what row is being analyzed
* #return true if it is a working row, and false if it is not
*/
bool checkRow(int i) {
i = (i + 1) % 3 - 1;
return (magicSquare[i * 3] + magicSquare[i * 3 + 1] + magicSquare[i * 3 + 2]) == magicInt;
}
int getnumSquares() {
return numSquares;
}
}; //------End of MagicSquare Class-----
int main() {
MagicSquare square;
cout << "Begin Magic Square recursion:" << endl << "------------------"
<< endl;
square.recursiveMagic(0);
cout << "Done with routine, returned combinations: " << square.getnumSquares() << endl;
return 0;
}

The array is being overwritten leading to overwriting the numSquares field.
class MagicSquare {
private:
int magicSquare[9];
int usedNumbers[9];
Changes to
class MagicSquare {
private:
int magicSquare[10];
int usedNumbers[10];
Also in your initializer the loop says < 9 but what you want to say is < 10. Or just use memset is better for that purpose.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Receiving char array via MPI_Recv results in the output image being partially black - c++

Related

MPI C++ Runtime Error: signal 11 (Segmentation fault) with multi-dimensional array creation

CUDA: Filling a column-major matrix

can I create a window in c++ using existing project code

Write small bmp on console via C++

class variable access within recursive function

Categories

Resources