C++ memory leak, how to detect - c++

I am using SSE to implement matrix multiplication, but I found there exists memory leak(see the picture below), the memory usage is increasing from 400M to 1G or more.
But, I free the memory in the code.
The following are codes
main.cpp
#include "sse_matrix.h"
#include <ctime>
int main(int argc, char* argv[])
{
vector<float> left(size, 0);
vector<float> right(size, 0);
vector<float> result(size, 0);
// initialize value
for (int i = 0; i < dim; i ++)
{
for (int j = 0; j < dim; j ++)
{
left[i*dim + j] = j;
right[i*dim + j] = j;
}
}
cout << "1. INFO: value initialized, starting matrix multiplication" << endl;
// calculate the result
clock_t my_time = clock();
SSE_Matrix_Multiply(&left, &right, &result);
cout << "2. INFO: SSE matrix multiplication result has got" << endl;
/*for (int i = 0; i < dim; i ++)
{
for (int j = 0; j < dim; j ++)
{
cout << result[i * dim + j] << " ";
}
cout << endl;
}*/
cout << "3. INFO: " << float(clock() - my_time)/1000.0 << endl;
system("pause");
return 0;
}
sse_matrix.h
#ifndef __SSE_MATRIX_H
#define __SSE_MATRIX_H
#include <vector>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
using std::vector;
//#define dim 8
//#define size (dim * dim)
const int dim = 4096;
const int size = dim * dim;
struct Matrix_Info
{
vector<float> * A;
int ax, ay;
vector<float> * B;
int bx, by;
vector<float> * C;
int cx, cy;
int m;
int n;
};
void Transpose_Matrix_SSE(float * matrix)
{
__m128 row1 = _mm_loadu_ps(&matrix[0*4]);
__m128 row2 = _mm_loadu_ps(&matrix[1*4]);
__m128 row3 = _mm_loadu_ps(&matrix[2*4]);
__m128 row4 = _mm_loadu_ps(&matrix[3*4]);
_MM_TRANSPOSE4_PS(row1, row2, row3, row4);
_mm_storeu_ps(&matrix[0*4], row1);
_mm_storeu_ps(&matrix[1*4], row2);
_mm_storeu_ps(&matrix[2*4], row3);
_mm_storeu_ps(&matrix[3*4], row4);
}
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
__m128 _t1, _t2, _sum;
_sum = _mm_setzero_ps(); // set all value of _sum to zero
float * _result = new float[size];
float _res[4] = {0};
for (int i = 0; i < 4; i ++)
{
for (int j = 0; j < 4; j ++)
{
_t1 = _mm_loadu_ps(left + i * 4);
_t2 = _mm_loadu_ps(right + j * 4);
_sum = _mm_mul_ps(_t1, _t2);
_mm_storeu_ps(_res, _sum);
_result[i * 4 + j] = _res[0] + _res[1] + _res[2] + _res[3];
}
}
return _result;
}
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
int m = my_info->m;
int n = my_info->n;
int ax = my_info->ax;
int ay = my_info->ay;
int bx = my_info->bx;
int by = my_info->by;
//1. split Matrix A and Matrix B
float * _a = new float[16];
float * _b = new float[16];
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
_a[i*m + j] = (*my_info->A)[(i + ax) * n + j + ay];
_b[i*m + j] = (*my_info->B)[(i + bx) * n + j + by];
}
}
//2. transpose Matrix B
Transpose_Matrix_SSE(_b);
//3. calculate result and return a float pointer
return Shuffle_Matrix_Multiply(_a, _b);
}
int Matrix_Multiply(struct Matrix_Info * my_info)
{
int m = my_info->m;
int n = my_info->n;
int cx = my_info->cx;
int cy = my_info->cy;
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
float * temp = SSE_4_Matrix(my_info);
(*my_info->C)[(i + cx) * n + j + cy] += temp[i*m + j];
delete [] temp;
}
}
return 0;
}
void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result)
{
struct Matrix_Info my_info;
my_info.A = left;
my_info.B = right;
my_info.C = result;
my_info.n = dim;
my_info.m = 4;
// Matrix A row:i, column:j
for (int i = 0; i < dim; i += 4)
{
for (int j = 0; j < dim; j += 4)
{
// Matrix B row:j column:k
for (int k = 0; k < dim; k += 4)
{
my_info.ax = i;
my_info.ay = j;
my_info.bx = j;
my_info.by = k;
my_info.cx = i;
my_info.cy = k;
Matrix_Multiply(&my_info);
}
}
}
}
#endif
And I guess maybe the memory leak is in Shuffle_Matrix_Multiply function in sse_matrix.h file. But, I am not sure, and now, the memory usage is increasing and my system will crash.
Hope someone can help to figure out and thanks in advance.

You never free the _a and _b allocated in SSE_4_Matrix.
You also allocate a lot dynamically just to throw it away a bit later. For example the _a and _b could be arrays of 16 floats in stack.

I would like to use a header file to help me to check memory leak. The header file as follows:
MemoryLeakChecker.hpp
#ifndef __MemoryLeakChecker_H__
#define __MemoryLeakChecker_H__
#include <crtdbg.h>
#include <cassert>
//for memory leak check
#ifdef _DEBUG
#define DEBUG_CLIENTBLOCK new(_CLIENT_BLOCK,__FILE__,__LINE__)
#else
#define DEBUG_CLIENTBLOCK
#endif
#define _CRTDBG_MAP_ALLOC
#ifdef _DEBUG
#define new DEBUG_CLIENTBLOCK
#endif
inline void checkMemoryLeak() {
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
int m_count = _CrtDumpMemoryLeaks();
assert(m_count == 0);
}
#endif
In my project, i will use MemoryLeakChecker.hpp in the file including main function as follows:
MemoryLeakTest.cpp
#include "MemoryLeakChecker.hpp"
int main() {
//_crtBreakAlloc = 148; //if you only know the memory leak block number is 148 after checking memory leak log, use this to locate the code causing memory leak.
//do some things
atexit(checkMemoryLeak); //check all leak after main() function called
return 0;
}
Run your program in debug mode in Visual Studio, you can get the memory leak log in output window after your program exited. Also, you can find the place where memory leaked in the memory leak log.

Related

C++ 'unresolved errors' using Intel's oneAPI compiler, possibly involved with the Math Kernel Library?

I'm trying to run the following code on an Ubuntu machine:
/**
For compiling -->
export GOMP_CPU_AFFINITY='0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,260,264,268'
export OMP_NUM_THREADS=68
set MKL_NUM_THREADS = 68
icc -std=gnu++98 -O3 -qopenmp -xhost -ansi-alias -ipo -AVX512 mkl_2d_heat_fftw_P.cpp -o mkl_2d_heat_fftw_P -lm -mkl
For running -->
* ./mkl_2d_heat_fftw_P N T numThreads
* Example: ./mkl_2d_heat_fftw_P 1000 100000 1
*/
#include <iostream>
#include <vector>
#include <algorithm>
#include <cstring>
#include <complex.h>
#include "mkl_service.h"
#include "mkl_dfti.h"
#include <string>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <sys/time.h>
#include <cstdio>
#include <omp.h>
// #include <cilk/cilk.h>
// #include <cilk/cilk_api.h>
// #include "cilktime.h"
#ifdef USE_PAPI
#include <papi.h>
#include "papilib.h"
#endif
#ifdef POLYBENCH
#include <polybench.h>
#endif
using namespace std;
typedef vector<double> vd;
typedef vector<vector<double> > vvd;
#define PB push_back
#define SZ(x) (int)x.size()
#define MAXN 8010
int T, N, N_THREADS;
const int BASE = 1024;
double a1[MAXN][MAXN], a2[MAXN][MAXN];
// double *forward_input_buffer, *backward_output_buffer;
// double complex *forward_output_buffer, *backward_input_buffer;
double complex *a_complex, *odd_mults, *input_complex;
double *mkl_forward_input_buffer, *mkl_backward_output_buffer;
double complex *mkl_forward_output_buffer, *mkl_backward_input_buffer;
template<class T> void out(const vector<T> &a) { cout<<"array: "; for (int i=0;i<SZ(a);i++) cout<<a[i]<<" "; cout<<endl; cout.flush(); }
long getTime(){
struct timeval tp;
gettimeofday(&tp, NULL);
long int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;
return ms;
}
void pad_matrix(vvd &v, int rows, int cols){
int pad_rows = rows - SZ(v);
int pad_cols = cols - SZ(v[0]);
for (int i = 0; i < pad_rows; i++)
v.PB(vd(SZ(v[0]), 0.0));
for (int i = 0; i < SZ(v); i++){
for (int j = 0; j < pad_cols; j++)
v[i].PB(0.0);
}
}
// Resizing matrices to [r1+r2, c1+c2] for circular convolution
void pad_vectors(vd &input, vd &formula)
{
int n = SZ(input);
vd tmp = vd(n*3, 0);
for (int i = 0; i < n; i++)
tmp[i] = tmp[n + i] = tmp[n + n + i] = input[i];
input = tmp;
int diff = abs(SZ(input) - SZ(formula));
for (int i = 0; i < diff; i++)
if (SZ(input) < SZ(formula))
input.PB(0.0);
else
formula.PB(0.0);
}
void print_matrix(vvd v, string msg){
cout << msg << ": " << endl;
for (int i = 0; i < SZ(v); i++){
for (int j = 0; j < SZ(v[i]); j++)
cout << v[i][j] << " ";
cout << endl;
}
cout << endl;
}
void print_matrix_arr(double *v, int n, string msg){
cout << msg << ": " << endl;
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
cout << v[i*n + j] << " ";
cout << endl;
}
cout << endl;
}
void print_complex_matrix(double complex* input_buffer1, double complex* input_buffer2, int n, string msg){
cout << msg << ": " << endl;
for (int i = 0; i < n * n; i++){
if (i % n == 0)
cout << endl;
printf("ratio:%f\t%f%+fi\t \t%f%+fi\n", crealf(input_buffer1[i])/crealf(input_buffer2[i]), crealf(input_buffer1[i]), cimagf(input_buffer1[i]), crealf(input_buffer2[i]), cimagf(input_buffer2[i]));
// cout << (*input_buffer[i]).real() << " " << (*input_buffer[i]).imag() << ",\t";
}
cout << endl;
}
void print_vector(vd v, string msg){
cout << msg << ": ";
for (int i = 0; i < SZ(v); i++)
cout << v[i] << " ";
cout << endl;
}
// fftw_plan plan_forward, plan_backward;
DFTI_DESCRIPTOR_HANDLE my_desc1_handle = NULL, my_desc2_handle = NULL;
// double mkl_forward_input_buffer[MAXN * MAXN], mkl_backward_output_buffer[MAXN * MAXN];
// double complex mkl_forward_output_buffer[MAXN * MAXN], mkl_backward_input_buffer[MAXN * MAXN];
// DFT of real valued matrix. CAUTION: initialize the input array after creating the plan
void mkl_fft_forward(vvd &v, double complex *output_buffer, int n)
{
int sz_i = SZ(v), sz_j = SZ(v[0]);
#pragma omp parallel for
for (int i = 0; i < sz_i; i++)
for (int j = 0; j < sz_j; j++){
mkl_forward_input_buffer[i*n + j] = v[i][j];
}
#pragma omp parallel for
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
mkl_forward_output_buffer[i*n + j] = 0.0;
// print_matrix_arr(mkl_forward_input_buffer, n, "input bufer");
DftiComputeForward(my_desc1_handle, mkl_forward_input_buffer, mkl_forward_output_buffer);
#pragma omp parallel for
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++){
output_buffer[i*n + j] = mkl_forward_output_buffer[i*n + j];
// printf("%f+%f\n", crealf(output_buffer[i*n + j]), cimagf(output_buffer[i*n+j]));
}
}
}
// Inverse DFT of complex input array
void mkl_fft_backward(double complex* input_buffer, vvd &output, int n)
{
#pragma omp parallel for
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
{
mkl_backward_input_buffer[i*n + j] = input_buffer[i*n + j];
mkl_backward_output_buffer[i*n + j] = 0.0;
}
DftiComputeBackward(my_desc2_handle, mkl_backward_input_buffer, mkl_backward_output_buffer);
#pragma omp parallel for
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
output[i][j] = mkl_backward_output_buffer[i*n + j]/(n * n * 1.0);
}
// Takes two array(a_real and b_real) as input and writes the output to "res"
void convolution_fftw_2d(vvd &a_real, vvd &input, vvd &result)
{
if (T == 0)
return ;
int n_formula = N;
mkl_fft_forward(a_real, a_complex, n_formula);
// double complex* odd_mults = fftw_alloc_complex(n_formula * n_formula); // Do not need to allocate new space, we can use the space of a_complex or b_complex
bool is_initialized = false; // if odd_mult array is initialized
int t = T;
// ############# Repeated squaring - start ############
while (t > 1){
if (t & 1){
if (is_initialized == false){
#pragma omp parallel for
for(int i = 0; i < n_formula * n_formula; i++)
odd_mults[i] = a_complex[i];
is_initialized = true;
} else {
#pragma omp parallel for
for(int i = 0; i < n_formula * n_formula; i++)
odd_mults[i] = odd_mults[i] * a_complex[i];
}
}
#pragma omp parallel for
for(int i = 0; i < n_formula * n_formula; i++)
a_complex[i] = a_complex[i] * a_complex[i];
t /= 2;
}
if (is_initialized){
#pragma omp parallel for
for(int i = 0; i < n_formula * n_formula; i++)
a_complex[i] = a_complex[i] * odd_mults[i];
}
// ############# Repeated squaring - end ############
// while (--t > 0){
// cout << "t: " << t << endl;
// for(int i = 0; i < n_formula * n_formula; i++)
// pointwise_mult[i] = pointwise_mult[i] * a_complex[i];
// }
// fft_backward(a_complex, formula, n_formula);
// // Scale the output array according to number of samples
// #pragma omp parallel for
// for (int i = 0; i < SZ(formula); i++)
// for (int j = 0; j < SZ(formula[0]); j++){
// double r = formula[i][j] / (n_formula * n_formula);
// formula[i][j] = r;
// // formula[i][j] = (abs(r) < 1e-8? 0:r);
// }
// print_matrix(formula, "Formula");
// vvd input(N, vd(N, 0.0));
// #pragma omp parallel for
// for (int i = 0; i < N; i++)
// for (int j = 0; j < N; j++){
// input[i][j] = a1[i][j];
// }
// print_matrix(input, "Input");
// reverse(input.begin(), input.end());
// double complex* formula_complex = fftw_alloc_complex(N * N);
// fft_forward(formula, formula_complex, N);
// double complex* input_complex = fftw_alloc_complex(N * N);
mkl_fft_forward(input, input_complex, N);
// fft_forward(input, input_complex, N);
// double complex* result_complex = fftw_alloc_complex(n * n); // Do not need to allocate new space, we can use the space of a_complex or b_complex
#pragma omp parallel for
for (int i = 0; i < N * N; i++){
a_complex[i] = input_complex[i] * a_complex[i];
}
mkl_fft_backward(a_complex, result, N);
// fft_backward(a_complex, result, N);
// print_matrix(result, "Result (needs to be rotated)");
return ;
}
void mkl_init(int n)
{
MKL_LONG status;
MKL_LONG len[2] = {n, n};
len[0] = n; len[1] = n;
status = DftiCreateDescriptor(&my_desc1_handle, DFTI_DOUBLE, DFTI_REAL, 2, len);
status = DftiSetValue(my_desc1_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiSetValue(my_desc1_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
status = DftiSetValue( my_desc1_handle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT );
status = DftiCommitDescriptor(my_desc1_handle);
status = DftiCreateDescriptor(&my_desc2_handle, DFTI_DOUBLE, DFTI_REAL, 2, len);
status = DftiSetValue(my_desc2_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
status = DftiSetValue(my_desc2_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiSetValue( my_desc2_handle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT );
status = DftiCommitDescriptor(my_desc2_handle);
}
void initialize(){
mkl_init(N);
// forward_input_buffer = fftw_alloc_real(N * N);
// backward_output_buffer = fftw_alloc_real(N * N);
// forward_output_buffer = fftw_alloc_complex(N * N);
// backward_input_buffer = fftw_alloc_complex(N * N);
a_complex = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
odd_mults = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
input_complex = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
mkl_forward_input_buffer = (double *)malloc(sizeof(double) * N * N);
mkl_backward_output_buffer = (double *)malloc(sizeof(double) * N * N);
mkl_forward_output_buffer = (double complex *)malloc(sizeof(double complex) * N * N);
mkl_backward_input_buffer = (double complex *)malloc(sizeof(double complex) * N * N);
for (int i = 0; i < N+2; ++i)
for (int j = 0; j < N+2; j++)
a1[i][j] = a2[i][j] = 1.0 * (rand() % BASE);
}
void mkl_destroy(){
MKL_LONG status;
status = DftiFreeDescriptor(&my_desc1_handle);
status = DftiFreeDescriptor(&my_desc2_handle);
free(a_complex);
free(odd_mults);
free(input_complex);
free(mkl_forward_input_buffer);
free(mkl_backward_output_buffer);
free(mkl_forward_output_buffer);
free(mkl_backward_input_buffer);
}
#define getIdx(i, N) ((i + N) % N)
bool verify(vvd result){
for (int t = 0; t < T; ++t) {
// cout << "t: " << t << endl;
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; j++){
// a2[i] = 0.125 * (a1[i+1] - 2.0 * a1[i] + a1[i-1]);
// cout << i << " " << j << " : " << getIdx(i -1, N) << " " << getIdx(i + 1, N) << " " << getIdx(j - 1, N) << " " << getIdx(j + 1, N) << endl;
// a2[i][j] = a1[getIdx(i - 1, N)][getIdx(j, N)] + a1[getIdx(i, N)][getIdx(j + 1, N)]
// + a1[getIdx(i + 1, N)][getIdx(j, N)] + a1[getIdx(i, N)][getIdx(j - 1, N)];
a2[i][j] = 0.125*a1[getIdx(i - 1, N)][getIdx(j, N)] + 0.125*a1[getIdx(i, N)][getIdx(j + 1, N)]
+ 0.125*a1[getIdx(i + 1, N)][getIdx(j, N)] + 0.125*a1[getIdx(i, N)][getIdx(j - 1, N)]
+ (-2.0*(0.125*2.0) + 1.0)*a1[i][j];
}
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; j++)
a1[i][j] = a2[i][j];
}
// cout << "Final Answer (iter): ";
// for (int i = 0; i < N; i++){
// for (int j = 0; j < N; j++)
// cout << a1[i][j] << " ";
// cout << endl;
// }
// cout << endl;
int cnt = 0;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
if (fabs (a1[i][j] - result[i][j]) > 1e-8)
cnt++;
cout << "Number of Mismatched Cell: " << cnt << endl;
return 0;
}
int main(int argc, char *argv[])
{
double x;
int t, n, numThreads;
// vvd a, b;
if (argc < 4){
cout << "Enter: N T numThreads" << endl;
return 1;
}
if (argc > 1){
n = atoi(argv[1]);
}
if (argc > 2)
t = atoi(argv[2]);
numThreads = 1;
if (argc > 3){
numThreads = atoi(argv[3]);
omp_set_num_threads(numThreads);
}
N = n; T = t; N_THREADS = numThreads;
initialize();
#ifdef USE_PAPI
papi_init();
#endif
int sz_formula = 3;
// double formula[3][3] = {{0, 1, 0},
// {1, 0, 1},
// {0, 1, 0}};
double formula[3][3] = {{0, 0.125, 0},
{0.125, (-2.0*(0.125*2.0) + 1.0), 0.125},
{0, 0.125, 0}};
// double formula[3][3] = {{1, 0, 1},
// {0, 0, 0},
// {0, 0, 0}};
vvd a(sz_formula, vd(sz_formula));
for (int i = 0; i < sz_formula; i++)
for (int j = 0; j < sz_formula; j++)
a[i][j] = formula[i][j];
vvd input(n, vd(n)), result(n, vd(n,0.0));
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
input[i][j] = a1[i][j];
long start = getTime();
#ifdef POLYBENCH
/* Start timer. */
polybench_start_instruments;
#endif
convolution_fftw_2d(a, input, result);
// Result must be rotated (T mod N) indices
#ifdef POLYBENCH
/* Stop and print timer. */
polybench_stop_instruments;
polybench_print_instruments;
#endif
long end = getTime();
vvd rotated_result(n, vd(n, 0.0));
int k = 0;
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
rotated_result[i][j] = result[(i+(t%n)) % n][(j+(t%n)) % n];
// print_matrix(rotated_result, "rotated");
cout << N << "," << T << "," << numThreads << "," << (end - start) / 1000.0 << endl;
mkl_destroy();
#ifdef USE_PAPI
countTotalMiss(p);
PAPI_shutdown();
delete threadcounter;
for (int i = 0; i < p; i++) delete l2miss[i];
delete l2miss;
delete errstring;
delete EventSet;
delete eventCode;
#endif
long start_iter = getTime();
// verify(rotated_result);
long end_iter = getTime();
// cout << "Time (Iter): " << end_iter - start_iter << endl;
return 0;
}
I'm pretty new to running C++ overall (I'm most familiar with Python, which is an interpreted language), so I'm not sure how it all works. This was pre-made code that I have to test before I make my own. I'm on Ubuntu 20.04 and to begin with, I'm not sure how to work everything. I have Visual Studio Code installed, and I'm trying to run the command icc -std=gnu++98 -O3 -qopenmp -xhost -ansi-alias -ipo -AVX512 mkl_2d_heat_fftw_P.cpp -o mkl_2d_heat_fftw_P -lm -mkl through the terminal. I'm not even sure if this is right, but I'm getting the following error messages:
ipo: warning #11021 (6 times): unresolved DftiFreeDescriptor, DftiCommitDescriptor, DftiSetValue, DftiCreateDescriptor_d_md, DftiComputeBackward, DftiComputeForward
ld: cannot find (3 times): -lmkl_intel_lp64, -lmkl_intel_thread, -lmkl_core.
These are eventually meant to run on supercomputers at my institution, but is the compiling command wrong for my local computer? If it helps, I'm running this on a VirtualBox on Windows 10 Pro (I tried running Ubuntu 20.04 LTS on Hyper-V but I never figured out how to successfully connect it to the Internet because whenever I made the VM's switch my WiFi one, it made my computer overall have no Internet access).
Looking at it, it seems like the Math Kernel Library is the problem, though I'm not too sure. Did I mess up the installation of the oneAPI Base Toolkit? I have every oneAPI Toolkit installed (and all of their features too, including FPGA support).
Or am I just compiling it wrong as the script described listed in the top comment is for a supercomputer (I'm not sure what to do with the first three lines, like export GOMP_CPU_AFFINITY, export OMP_NUM_THREADS, and set MKL_NUM_THREADS).
Thanks for any help in advance!

Reading Bitmap file

I try to read a bitmap file. This my program:
#include<iostream>
#include<fstream>
#include <string>
#include<windows.h>
using namespace std;
#pragma pack(1)
struct header
{
char header[2];
int32_t filesize;
int16_t reser;
int16_t reser1;
int32_t dataoffset;
};
struct infoheader
{
int32_t headersize;
int32_t width;
int32_t height;
int16_t plans;
int16_t bpp;
int32_t compression;
int32_t datasize;
int32_t re;
int32_t ve;
int32_t color;
int32_t importantcolor;
};
struct PIxel
{
unsigned char G;
unsigned char B;
unsigned char R;
};
int main()
{
header h;
infoheader info;
PIxel *p;
ifstream file("bmp2.bmp", ios::binary);
if (file.is_open())
{
cout << "true" << endl;
file.read((char*)&h, sizeof(h));
file.read((char*)&info, sizeof(info));
cout << info.width << " " << info.height << " " << h.filesize << " " << info.bpp << endl;
int pa = info.width % 4;
int size = info.width * info.height * (info.bpp / 3) + pa * info.height;
char* arr = new char[size];
file.read(arr, size);
char* temp = arr;
int sizep = info.height * info.width;
p = new PIxel[sizep];
for (int i = 0; i < info.height; i++)
{
for (int j = 0; j < info.width; j++)
{
p[i * info.height + j].B = *(temp++);
p[i * info.height + j].G = *(temp++);
p[i * info.height + j].R = *(temp++);
//p = p + 3;
}
p += pa;
}
HWND consoleWindow = GetConsoleWindow();
HDC hdc = GetDC(consoleWindow);
for (int i = 0; i < info.height; i++)
{
for (int j = 0; j < info.width; j++)
{
PIxel m = p[i * info.height + j];
SetPixel(hdc, i, j, RGB(m.R, m.G, m.B));
}
}
ReleaseDC(consoleWindow, hdc);
}
}
It works but the image on my console is not right...
Can you help me to fix it?
int size = info.width * info.height * (info.bpp / 3) + pa * info.height;
The above calculation for size is incorrect. Bits per pixel should be divided by 8. The indexing in the for loops is also wrong. It ends ups multiplying height x height.
Also SetPixel(... i, j ...) should be changed to SetPixel(... j, i ...) since i in your case, refers to the y-axis.
As mentioned in previous answer, the padding has to be fixed too.
Note that you can use LoadImage and other Windows GDI functions to open and draw bitmaps.
int size = (info.width * (info.bpp / 8) + pa) * info.height;
...
for(int i = info.height - 1; i >= 0; i--)
{
for(int j = 0; j < info.width; j++)
{
int index = i * (info.width) + j;
p[index].B = *(temp++);
p[index].G = *(temp++);
p[index].R = *(temp++);
}
temp += pa;
}
for(int i = 0; i < info.height; i++)
{
for(int j = 0; j < info.width; j++)
{
int index = i * (info.width) + j;
PIxel m = p[index];
SetPixel(hdc, j, i, RGB(m.R, m.G, m.B));
}
}
I believe you have your padding adjustment on the wrong pointer. The padding is present on the source image. You don't want it on the destination image. You are accounting for the padding with p += pa; you should instead replace this line with temp += pa to account for the padding of the source image.

Multithread calculate mean and std does not improve efficiency

I am a novice in the field of C++ multithread programming and I try to use multithread to compute the mean and standard deviation of my data in parallel to reduce the cost of time. My function of calculation of mean and standard deviation is as the following.
void cal_mean_std(float* data, float* mean, float* sd, int N, int start_index, int span_cols)
{
int value;
for(int j = start_index; j < start_index + span_cols; j++){
mean[j] = 0;
sd[j] = 0;
for (int i = 0; i < N; i++) {
value = data[j * N + i];
mean[j] += value;
sd[j] += value * value;
}
mean[j] = mean[j] / N;
sd[j] = sqrt(sd[j] / N - mean[j] * mean[j]);
}
}
I specify the start index and calculation spans of each thread and I activate my thread_pool as the following.
x.mean = new float[x.M];
x.sd = new float[x.M];
std::vector<std::thread> thread_pool;
int h = 4;
thread_pool.reserve(h);
int SNIPs = static_cast<int>(x.M / h + 1);
int SNIPs_final = x.M - (h - 1) * SNIPs;
for (int i = 0; i < h - 1; i++)
{
thread_pool.push_back(std::thread(std::bind(cal_mean_std, x.data, x.mean, x.sd,
x.N, i*SNIPs, SNIPs)));
}
thread_pool.push_back(std::thread(std::bind(cal_mean_std, x.data, x.mean, x.sd,
x.N, (h-1)*SNIPs, SNIPs_final)));
for (int i = 0; i < h; i++)
thread_pool.at(i).join();
where the x.M is the total number of cols of my data. However, I found that implement in this way did not improve the program efficiency. I am not sure what the problem is.
Actually, we can simulate data to do the computation. My data size is 5k x 300k. The sequential calculation by using for loop all over the data one thread takes 15 seconds. My multithreading version sometimes takes 16 seconds.
The simulation code is as the following and I find that when I use h = 1, the program takes 6s to finish. However, when I use h = 4, the program takes 14s to finish.
#include <thread>
#include <vector>
#include <stdlib.h>
#include <vector>
#include <stdio.h>
#include <iostream>
#include <math.h>
void gen_matrix(int N, int P, float* data){
for (int i = 0; i < N * P; i++)
{
data[i] = rand() % 10;
}
}
void cal_mean_std(float* data, float* mean, float* sd, int N, int start_index, int span_cols)
{
int value;
for(int j = start_index; j < start_index + span_cols; j++){
mean[j] = 0;
sd[j] = 0;
for (int i = 0; i < N; i++) {
value = data[j * N + i];
mean[j] += value;
sd[j] += value * value;
}
mean[j] = mean[j] / N;
sd[j] = sqrt(sd[j] / N - mean[j] * mean[j]);
}
}
int main()
{
int N = 5000;
int P = 300000;
float* data = new float[N*P];
gen_matrix(N, P, data);
float* mean = new float[P];
float* std = new float[P];
std::vector<std::thread> thread_pool;
clock_t t1;
t1 = clock();
int h = 1;
thread_pool.reserve(h);
int SNIPs = static_cast<int>(P / h + 1);
int SNIPs_final = P - (h - 1) * SNIPs;
for (int i = 0; i < h - 1; i++)
{
thread_pool.push_back(std::thread(std::bind(cal_mean_std, data, mean, std,
N, i*SNIPs, SNIPs)));
}
thread_pool.push_back(std::thread(std::bind(cal_mean_std, data, mean, std,
N, (h-1)*SNIPs, SNIPs_final)));
for (int i = 0; i < h; i++)
thread_pool.at(i).join();
std::cout <<"Time for the cal mean and std is " << (clock() - t1) * 1.0/CLOCKS_PER_SEC << std::endl;
return 0;
}
Thank you, everyone. Finally, I found what the problem is with my code. The timer clock_t computes the CPU consumption time instead of wall time.

C++ contiguous memory operation

I have c++ program in which I am calculating determinant of a matrix using normal array which is as follows:
/* rand example: guess the number */
#include <stdio.h> /* printf, scanf, puts, NULL */
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <iostream>
#include <cstdlib>
#include <vector>
using namespace std;
int** generateStandardMatrix(int dimension);
void ijMinor(int *matrix[], int *minorMatrix[], int size, int row, int column);
int determinant(int *matrix[], int size);
void ijMinor(int *matrix[], int *minorMatrix[], int size, int row, int column) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
if (i < row) {
if (j < column)minorMatrix[i][j] = matrix[i][j];
else if (j == column)continue;
else minorMatrix[i][j - 1] = matrix[i][j];
}
else if (i == row)continue;
else {
if (j < column)minorMatrix[i - 1][j] = matrix[i][j];
else if (j == column)continue;
else minorMatrix[i - 1][j - 1] = matrix[i][j];
}
}
}
}
int determinant(int *matrix[], int size) {
if (size == 1)return matrix[0][0];
else {
int result = 0, sign = -1;
for (int j = 0; j < size; j++) {
int **minorMatrix;
minorMatrix = new int*[size - 1];
for (int k = 0 ; k < size - 1 ; k++)
minorMatrix[k] = new int[size - 1];
ijMinor(matrix, minorMatrix, size, 0, j);
sign *= -1;
result += sign * matrix[0][j] * determinant(minorMatrix, size - 1);
for (int i = 0; i < size - 1; i++) {
delete minorMatrix[i];
}
}
return result;
}
}
int main (int argc, char* argv[])
{
/* initialize random seed: */
srand (time(NULL));
// int iSecret, iGuess;
int dimension = atoi(argv[1]);
int rowCount = dimension , colCount = dimension;
//2d array storing the integer values
int** ary = new int*[dimension];
//vector of vector storing the indices across the array for the threads to pick up from
vector<vector<int> > vec;
ary = generateStandardMatrix(dimension);
printf("Array value : %d\n", ary[0][0]);
int detVal = determinant(ary, dimension);
printf("determinant value : %d\n", detVal);
return 0;
}
int** generateStandardMatrix(int dimension) {
int** ary = new int*[dimension];
int counter = 0;
for (int i = 0; i < dimension; ++i) {
ary[i] = new int[dimension];
counter = counter + 1;
for (int j = 0; j < dimension; ++j)
{
ary[i][j] = counter;
std::cout << ary[i][j] << "\t" << std::flush;
}
std::cout << std::endl;
}
return ary;
}
I want to replace it with code in which I allocate memory for the array before the start of the algorithm and then change the determinant and the ijMonor functions so that they don't create new array's but use the same array only.
The determinant will take parameter like: determinant(int *matrix, int *startOfMyWorkspace, int size) so that it knows where to start.
I am not good at c++ and so far I was not able to do it.
Can someone please provide some sample code.
I allocated some memory for array and created and array but was unable to change the ijMinor and determinant functions for that.
This is how I am allocating memory:
int main (int argc, char* argv[])
{
/* initialize random seed: */
srand (time(NULL));
// int iSecret, iGuess;
int dimension = atoi(argv[1]);
int *a;
size_t const N_BYTES = dimension * dimension * sizeof(int);
a = (int*)malloc(N_BYTES);
createData(dimension,a);
return 0;
}
void createData(int const dimension, int* const a)
{
int row, col;
srand((unsigned)time(NULL));
int counter;
for(int row = 0; row < dimension; row++) {
counter = counter + 1;
for(int col = 0; col < dimension; col++) {
int i = col + row * dimension;
a[i] = counter;
std::cout << a[i] << "\t" << std::flush;
}
std::cout << std::endl;
}
}
Try this.
Note if you use new to allocate an array, you need to use delete[] to free all of it. You'll get away with delete (i.e. it won't crash) but this will only free the first element. Your other functions are the same as you posted.
You're dynamically allocating space for minorMatrix in determinant function, but it's hard to see how that could be preallocated. I've modified determinant function to use allocate_arr and deallocate_arr.
int ** allocate_arr(int dimension)
{
int** a = new int*[dimension];
for (int i = 0; i < dimension; ++i)
a[i] = new int[dimension];
return a;
}
void deallocate_arr(int dimension, int **a)
{
for (int i = 0; i < dimension; ++i)
delete[] a[i];
delete[] a;
}
int determinant(int *matrix[], int size) {
if (size == 1)return matrix[0][0];
else {
int result = 0, sign = -1;
for (int j = 0; j < size; j++) {
int **minorMatrix = allocate_arr(size - 1);
ijMinor(matrix, minorMatrix, size, 0, j);
sign *= -1;
result += sign * matrix[0][j] * determinant(minorMatrix, size - 1);
deallocate_arr(size - 1, minorMatrix);
}
return result;
}
}
void generateStandardMatrix(int dimension, int**ary) {
int counter = 0;
for (int i = 0; i < dimension; ++i) {
counter = counter + 1;
for (int j = 0; j < dimension; ++j)
{
ary[i][j] = counter;
std::cout << ary[i][j] << "\t" << std::flush;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
srand(time(NULL));
int dimension = atoi(argv[1]);
int** a = allocate_arr(dimension);
generateStandardMatrix(dimension, a);
printf("Array value : %d\n", a[0][0]);
int detVal = determinant(a, dimension);
printf("determinant value : %d\n", detVal);
// ... do more computations here, reusing `a` ...
deallocate_arr(dimension, a);
return 0;
}

Trying to use std::aligned_storage with SSE and new

I wanted to try getting a square root of some floats using SSE instrincs in C++. But I get a exception when I try to store the result. Can I use std::aligned_storage like that?
#include <iostream>
#include <type_traits>
#include <xmmintrin.h>
using namespace std;
using float_storage = aligned_storage<4 * sizeof(float), 16>;
int main()
{
int N;
cin >> N;
float_storage * values = new float_storage[ N / 4 ]; // 4 floats in pack
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
for(int i = 0; i < 4; i++)
cin >> fptr[i];
}
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
__m128 x = _mm_loadu_ps(fptr);
x = _mm_sqrt_ps(x);
_mm_store_ps(fptr, x); // im getting an crash here
}
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
for(int i = 0; i < 4; i++)
cout << fptr[i] << endl;
}
delete[] values;
}
It's aligned_storage<size, align>::type. aligned_storage itself is just a metaprogramming struct.
Also, new is only rated to std::max_align_t, if I recall correctly, even if you new up a type with higher alignment requirements.