Multiplication with AVX - c++

Please this is my first time of using AVX and I'm trying to perform a simple multiplication on double precision numbers but I'm not getting all results correct.
I get just the first 4 results and the others are jargon.
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <time.h>
using namespace std;
int main() {
double *a, *b; // data pointers
double *pA,*pB; // work pointer
__m256d rA_AVX, rB_AVX; // variables for AVX
const int vector_size = 8;
a = (double*) _mm_malloc (vector_size*sizeof(double),64);
b = (double*) _mm_malloc (vector_size*sizeof(double),64);
for(int i = 0; i < vector_size; i++) {
a[i] = (rand() % 48);
b[i] = 0.0f;
cout << a[i] << endl;
}
for (int i = 0; i < vector_size; i += 8)
{
pA = a;
pB = b;
rA_AVX = _mm256_load_pd(pA);
rB_AVX = _mm256_mul_pd(rA_AVX,rA_AVX);
_mm256_store_pd(pB,rB_AVX);
pA += 8;
pB += 8;
}
for (int i=0; i<vector_size; i++){
cout << endl << b[i] << endl;
}
_mm_free(a);
_mm_free(b);
system("PAUSE");
return 0;
}

Related

OpenMP code is aborted

I'm trying to perform matrix multiplication using openMP as follows and I compile it using GCC : g++ -std=gnu++11 -g -Wall -fopenmp -o parallel_not_opt parallel_not_opt.cpp
But when I try to run it by using parallel_not_opt.exe, it aborts giving the typical Windows error parallel_not_opt.exe has stopped working...
Am I missing something?
#include "includes/stdafx.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
# include <omp.h>
#include <chrono>
#include <fstream>
#include <algorithm>
#include <immintrin.h>
#include <cfloat>
#include <limits>
#include <math.h>
using namespace std::chrono;
using namespace std;
//populate matrix with random values.
double** generateMatrix(int n){
double max = DBL_MAX;
double min = DBL_MIN;
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
double randVal = (double)rand() / RAND_MAX;
matA[i][j] = min + randVal * (max - min);
}
}
return matA;
}
//generate matrix for final result.
double** generateMatrixFinal(int n){
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
matA[i][j] = 0;
}
}
return matA;
}
//matrix multiplication - parallel
double matrixMultiplicationParallel(double** A, double** B, double** C, int n){
int i, j, k;
clock_t begin_time = clock();
# pragma omp parallel shared ( A,B,C,n ) // private ( i, j, k )
{
# pragma omp for
for (i = 0; i < n; i++) {
// cout<< i << ", " ;
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
double t = float(clock() - begin_time);
return t;
}
int _tmain(int argc, _TCHAR* argv[])
{
ofstream out("output.txt", ios::out | ios::app);
out << "--------------STARTED--------------" << "\n";
int start = 200, stop = 2000, step = 200;
for (int n = start; n <= stop; n += step)
{
srand(time(NULL));
cout << "\nn: " << n << "\n";
double t1 = 0;
int my_size = n;
double **A = generateMatrix(my_size);
double **B = generateMatrix(my_size);
double **C = generateMatrixFinal(my_size);
double single_sample_time = matrixMultiplicationParallel(A, B, C, n);
t1 += single_sample_time;
for (int i = 0; i < n; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
out << "-----------FINISHED-----------------" << "\n";
out.close();
return 0;
}
The private ( i, j, k ) declaration is not optional. Add it back, otherwise the inner loop variables j and k are shared, which completely messes up the inner loops.
It is better to declare variables as locally as possible. That makes reasoning about OpenMP code much easier:
clock_t begin_time = clock();
# pragma omp parallel
{
# pragma omp for
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
return float(clock() - begin_time);
In that case, A,B,C will be shared by default - coming from the outside, and j,k are private because they are declared within the parallel scope. The loop variable of a parallel for is always implicitly private.

pass empty matrix to function

I'm trying to pass a matrix without dimensions to a function and fill with data. Here is an example:
#include <iostream>
#include <stdlib.h>
using namespace std;
void fun(double **matrix);
int main(void)
{
double **matrix;
fun(matrix);
for(int i = 0; i < 10; i ++)
{
cout << "matrix = " << matrix[i][0] << "\t" << matrix[i][1] << endl;
}
}
void fun(double **matrix)
{
int rowCount = 10;
int colCount = 2;
matrix = new double*[rowCount];
for(int i = 0; i < rowCount; ++i)
matrix[i] = new double[colCount];
for(int i = 0; i < rowCount; i ++)
{
matrix[i][0] = 3.;
matrix[i][1] = 4.;
}
}
It compiles but when I execute it, it return the following error:
Illegal instruction: 4
Do you know why?
First of all if you are got a pointer it should point at something, so in order to not have a warnings just initialize it with nullptr. Second thing is that you should return a pointer to pointer.I fixed your code so you can look what you have made wrong,
#include <iostream>
#include <stdlib.h>
using namespace std;
double **fun(double **matrix);
int main(void)
{
double **matrix= nullptr;
matrix = fun(matrix);
for (int i = 0; i < 10; i++)
{
cout << "matrix = " << matrix[i][0] << "\t" << matrix[i][1] << endl;
}
getchar();
getchar();
}
double **fun(double **matrix)
{
int rowCount = 10;
int colCount = 2;
matrix = new double*[rowCount];
for (int i = 0; i < rowCount; ++i)
matrix[i] = new double[colCount];
for (int i = 0; i < rowCount; i++)
{
matrix[i][0] = 3.;
matrix[i][1] = 4.;
}
return matrix;
}
Your program fails because fun allocates memory, but does not return the newly allocated matrix array to main. It would be easier to do this by return value than by argument:
double** fun() {
// ...
double** matrix = new double*[...];
// ...
return matrix;
}
int main() {
double** matrix = fun();
// ...
}
There are a million other ways to do that (by passing a double*** into fun, or a reference, or by using some sort of object etc). Pick whatever you like best.

C++ memory leak, how to detect

I am using SSE to implement matrix multiplication, but I found there exists memory leak(see the picture below), the memory usage is increasing from 400M to 1G or more.
But, I free the memory in the code.
The following are codes
main.cpp
#include "sse_matrix.h"
#include <ctime>
int main(int argc, char* argv[])
{
vector<float> left(size, 0);
vector<float> right(size, 0);
vector<float> result(size, 0);
// initialize value
for (int i = 0; i < dim; i ++)
{
for (int j = 0; j < dim; j ++)
{
left[i*dim + j] = j;
right[i*dim + j] = j;
}
}
cout << "1. INFO: value initialized, starting matrix multiplication" << endl;
// calculate the result
clock_t my_time = clock();
SSE_Matrix_Multiply(&left, &right, &result);
cout << "2. INFO: SSE matrix multiplication result has got" << endl;
/*for (int i = 0; i < dim; i ++)
{
for (int j = 0; j < dim; j ++)
{
cout << result[i * dim + j] << " ";
}
cout << endl;
}*/
cout << "3. INFO: " << float(clock() - my_time)/1000.0 << endl;
system("pause");
return 0;
}
sse_matrix.h
#ifndef __SSE_MATRIX_H
#define __SSE_MATRIX_H
#include <vector>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
using std::vector;
//#define dim 8
//#define size (dim * dim)
const int dim = 4096;
const int size = dim * dim;
struct Matrix_Info
{
vector<float> * A;
int ax, ay;
vector<float> * B;
int bx, by;
vector<float> * C;
int cx, cy;
int m;
int n;
};
void Transpose_Matrix_SSE(float * matrix)
{
__m128 row1 = _mm_loadu_ps(&matrix[0*4]);
__m128 row2 = _mm_loadu_ps(&matrix[1*4]);
__m128 row3 = _mm_loadu_ps(&matrix[2*4]);
__m128 row4 = _mm_loadu_ps(&matrix[3*4]);
_MM_TRANSPOSE4_PS(row1, row2, row3, row4);
_mm_storeu_ps(&matrix[0*4], row1);
_mm_storeu_ps(&matrix[1*4], row2);
_mm_storeu_ps(&matrix[2*4], row3);
_mm_storeu_ps(&matrix[3*4], row4);
}
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
__m128 _t1, _t2, _sum;
_sum = _mm_setzero_ps(); // set all value of _sum to zero
float * _result = new float[size];
float _res[4] = {0};
for (int i = 0; i < 4; i ++)
{
for (int j = 0; j < 4; j ++)
{
_t1 = _mm_loadu_ps(left + i * 4);
_t2 = _mm_loadu_ps(right + j * 4);
_sum = _mm_mul_ps(_t1, _t2);
_mm_storeu_ps(_res, _sum);
_result[i * 4 + j] = _res[0] + _res[1] + _res[2] + _res[3];
}
}
return _result;
}
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
int m = my_info->m;
int n = my_info->n;
int ax = my_info->ax;
int ay = my_info->ay;
int bx = my_info->bx;
int by = my_info->by;
//1. split Matrix A and Matrix B
float * _a = new float[16];
float * _b = new float[16];
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
_a[i*m + j] = (*my_info->A)[(i + ax) * n + j + ay];
_b[i*m + j] = (*my_info->B)[(i + bx) * n + j + by];
}
}
//2. transpose Matrix B
Transpose_Matrix_SSE(_b);
//3. calculate result and return a float pointer
return Shuffle_Matrix_Multiply(_a, _b);
}
int Matrix_Multiply(struct Matrix_Info * my_info)
{
int m = my_info->m;
int n = my_info->n;
int cx = my_info->cx;
int cy = my_info->cy;
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
float * temp = SSE_4_Matrix(my_info);
(*my_info->C)[(i + cx) * n + j + cy] += temp[i*m + j];
delete [] temp;
}
}
return 0;
}
void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result)
{
struct Matrix_Info my_info;
my_info.A = left;
my_info.B = right;
my_info.C = result;
my_info.n = dim;
my_info.m = 4;
// Matrix A row:i, column:j
for (int i = 0; i < dim; i += 4)
{
for (int j = 0; j < dim; j += 4)
{
// Matrix B row:j column:k
for (int k = 0; k < dim; k += 4)
{
my_info.ax = i;
my_info.ay = j;
my_info.bx = j;
my_info.by = k;
my_info.cx = i;
my_info.cy = k;
Matrix_Multiply(&my_info);
}
}
}
}
#endif
And I guess maybe the memory leak is in Shuffle_Matrix_Multiply function in sse_matrix.h file. But, I am not sure, and now, the memory usage is increasing and my system will crash.
Hope someone can help to figure out and thanks in advance.
You never free the _a and _b allocated in SSE_4_Matrix.
You also allocate a lot dynamically just to throw it away a bit later. For example the _a and _b could be arrays of 16 floats in stack.
I would like to use a header file to help me to check memory leak. The header file as follows:
MemoryLeakChecker.hpp
#ifndef __MemoryLeakChecker_H__
#define __MemoryLeakChecker_H__
#include <crtdbg.h>
#include <cassert>
//for memory leak check
#ifdef _DEBUG
#define DEBUG_CLIENTBLOCK new(_CLIENT_BLOCK,__FILE__,__LINE__)
#else
#define DEBUG_CLIENTBLOCK
#endif
#define _CRTDBG_MAP_ALLOC
#ifdef _DEBUG
#define new DEBUG_CLIENTBLOCK
#endif
inline void checkMemoryLeak() {
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
int m_count = _CrtDumpMemoryLeaks();
assert(m_count == 0);
}
#endif
In my project, i will use MemoryLeakChecker.hpp in the file including main function as follows:
MemoryLeakTest.cpp
#include "MemoryLeakChecker.hpp"
int main() {
//_crtBreakAlloc = 148; //if you only know the memory leak block number is 148 after checking memory leak log, use this to locate the code causing memory leak.
//do some things
atexit(checkMemoryLeak); //check all leak after main() function called
return 0;
}
Run your program in debug mode in Visual Studio, you can get the memory leak log in output window after your program exited. Also, you can find the place where memory leaked in the memory leak log.

AVX2 instruction interrupt in Visual Studio 2013

Here's the c++ code:
#include <stdio.h>
#include <iostream>
#include <immintrin.h>
using namespace std;
int main(int argc, char* argv[]) {
char a[100];
for (int i = 0; i < 32; i++)
a[i] = 1;
for (int i = 32; i < 64; i++)
a[i] = 0;
__m256i x = _mm256_loadu_si256((__m256i *)(a));
__m256i y = _mm256_loadu_si256((__m256i *)(a + 32));
for (int i = 0; i < 32; i++)
cout << (int)x.m256i_i8[i];
cout << endl;
for (int i = 0; i < 32; i++)
cout << (int)y.m256i_i8[i];
cout << endl;
__m256i z = _mm256_xor_si256(x, y);
for (int i = 0; i < 32; i++)
cout << (int)z.m256i_i8[i];
cout << endl;
return 0;
}
When I run this code in VS2013, the xor operation will interrupt.
And I think my cpu can support AVX2 since x and y can be output successfully.
Can somebody tell me how to fix this, please?

Trying to use std::aligned_storage with SSE and new

I wanted to try getting a square root of some floats using SSE instrincs in C++. But I get a exception when I try to store the result. Can I use std::aligned_storage like that?
#include <iostream>
#include <type_traits>
#include <xmmintrin.h>
using namespace std;
using float_storage = aligned_storage<4 * sizeof(float), 16>;
int main()
{
int N;
cin >> N;
float_storage * values = new float_storage[ N / 4 ]; // 4 floats in pack
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
for(int i = 0; i < 4; i++)
cin >> fptr[i];
}
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
__m128 x = _mm_loadu_ps(fptr);
x = _mm_sqrt_ps(x);
_mm_store_ps(fptr, x); // im getting an crash here
}
for(int i = 0; i < N / 4; i++)
{
void *vptr = static_cast<void*>(&values[i]);
float *fptr = static_cast<float*>(vptr);
for(int i = 0; i < 4; i++)
cout << fptr[i] << endl;
}
delete[] values;
}
It's aligned_storage<size, align>::type. aligned_storage itself is just a metaprogramming struct.
Also, new is only rated to std::max_align_t, if I recall correctly, even if you new up a type with higher alignment requirements.