I wrote the code according to the algorithm, but the result is incorrect. According to the algorithm, we must indicate the dimension of the matrix and manually fill in the main matrix A and vector B. We need to generate an LU matrix. It is generated, but with the wrong numbers. And in the end we have to get the vector X with solutions. And this is in windowed mode.
https://imgur.com/TSsjMXp
int N = 1; // matrix dimension
double R = 0;
typedef double Matrix [6][6];
typedef double Vec [6];
.
.
.
void Decomp (Matrix A, int N, int &Change)
{
int i, j, k ;
double R, L, U;
Change = 1;
R = Math::Abs(A[1][1]);
for(j=2; j<=N; j++)
if (Math::Abs(A[j][1])>= R)
{
Change = j;
R = Math::Abs(A[j][1]);
}
if (R<= 1E-7)
{
MessageBox::Show("The system is degenerate");
}
if (k!=1)
{
for(i=1; i<=N; i++)
{
R = A[Change][i];
A[Change][i] = A[1][i];
A[1][i] = R;
}
}
for(i=2; i<=N; i++)
A[1][i] = A[1][i]/A[1][1];
for(i=2; i<=N; i++)
{
for(k=i; k<=N; k++);
{
R = 0;
for ( j=1; j<=(i-1); j++)
R = R + A[k][j] * A[j][i];
A[k][i] = A[k][i] - R;
}
if (A[i][i]<= 1E-7)
{
MessageBox::Show("The system is degenerate[enter image description here][1]");
}
for(k = i+1; k<=N; k++)
{
R = 0;
for (j=1; j<=(i-1); j++)
R = R + A[i][j] * A[j][k];
A[i][k] = (A[i][k] - R) / A[i][i];
}
}
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
{
C_matrix_dgv->Rows[i]->Cells[j] -> Value = Convert::ToString(A[i+1][j+1]);
}
}
void Solve (Matrix A, Vec b, Vec x, int Change, int N)
{
int i = 0,j = 0;
double R;
if (Change!=1)
{
R = b[Change];
b[Change] = b[1];
b[1] = R;
}
b[1] = b[1]/A[1][1];
for(i=2; i<=N; i++)
{
R = 0;
for( j=1; j<=(i-1); j++)
R = R + A[i][j] * b[j];
b[i] = (b[i] - R) / A[i][i];
}
x[N] = b[N];
for( i=1; i<=(N-1); i++)
{
R = 0;
for(j = (N+1-i); j<=N; j++)
R = R + A[N - i][j] * x[j];
x[N - i] = b[N - i] - R;
}
}
int N = 1; // matrix dimension
If you use this in the rest of the code you cannot get correct results. The dimension of the matrix is 6x6. Use a std::array or std::vector so that you dont need to keep the size in a seperate variable.
Related
this is a piece of code for a simple iteration method for solving systems of linear algebraic equations:
double* iter(double** a, double* y, int n, int& iter)
{
double* res = new double[n];
int i, j;
for (i = 0; i < n; i++)
{
res[i] = y[i] / a[i][i];
}
double eps = 0.0001;
double* Xn = new double[n];
do {
iter++;
for (i = 0; i < n; i++) {
Xn[i] = y[i] / a[i][i];
for (j = 0; j < n; j++) {
if (i == j)
continue;
else {
Xn[i] -= a[i][j] / a[i][i] * res[j];
}
}
}
bool flag = true;
for (i = 0; i < n - 1; i++) {
if (fabs(Xn[i] - res[i]) > eps) {
flag = false;
break;
}
}
for (i = 0; i < n; i++) {
res[i] = Xn[i];
}
if (flag)
break;
} while (1);
return res;
}
and formula for it:
but I would like to implement the seidel method.and slightly changed the code according to the formula below
for (i = 0; i < n; i++) {
Xn[i] = y[i] / a[i][i];
for (j = 0; j < i-1; j++) {
Xn[i] -= a[i][j] / a[i][i] * Xn[j];
}
for (j = i+1; j < n; j++){
Xn[i] -= a[i][j] / a[i][i] * res[j];
}
}
but I'm not getting exactly what I expected:
I would be grateful if you could tell me where I made a mistake. thank you in advance for your answers.
Your mistake lies in the new implementation.
The first sum of the Seidel method sums up to the element before the diagonal, while your for loop goes up to two elements before the diagonal.
Instead of
for(j = 0; j < i-1; j++)
you should have
for(j = 0; j < i; j++)
Note that Gauss Seidel method is applicable if the elements on the diagonal are non-zero.
I am using gcc compiler. (g++ -o test testfile.cpp)
I want to use Openacc to parallelize my code but I am a bit confused about using #pragma correctly.
Below is the part where I used parallelization.
Even after using Openacc the code is not faster than before.
I guess this is related to 'data-moving' thing.
So I think I need to use #pragma acc data copy here. But I am not sure how to use this properly.
Any help?
Thanks in advance.
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V, Na, Nd, Ny);
float*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
float*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc kernels
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
}
Are you using the flag to enable OpenACC, i.e. "-fopenacc"? If not the OpenACC directives will be ignored.
Note that you'll want to use a newer GNU version, 10.2 preferable, as GNU support for OpenACC has gotten better over the years. I believe their compiler loop dependency analysis is still lacking so will run "kernels" compute regions sequentially on the device. Hence, for now, you'll want to stick to using "parallel" regions. If you do really want to use "kernels", I'd suggest switching to the NVIDIA HPC compilers (full disclosure, I work for NVIDIA)
Now I think the initial problem is just that you're not enabling OpenACC and why it's the same speed. Actually here I'd expect this case to be extremely slow if you tried to offload it. Besides running the "kernels" region sequentially on the device, the data would need to be transferred back and forth between the host and device each timestep.
The optimal strategy is to have a data region outside of the while loop, use an "update" directive when needed to synchronize the device and host copies of the arrays, and then ensure all the compute has been offload to the device.
Since you didn't post a complete reproducer, I can't test this code and hence verify that it's correct. But to give you an idea of this strategy, I modified your code below:
#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny], Ptrans[:Ny]) \
create(Vfuture[:Na][Nd], V[:Na][:Nd][:Ny], maxposition_a[:Na][:Nd][:Ny], maxposition_b[:Na][:Nd][:Ny]) \
create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc parallel loop collapse(3) default(present)
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// I'm unclear why your using a 2D array for temphoward. It's preventing
// parallelzation of the inner loop and could be replaced with a scalar.
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
#pragma acc parallel loop collapse(3) reduction(max:dif)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// Again, why aren't you using a scalar here for tempdiff?
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
#pragma acc update self(Vnew[:Na][:Nd][:Ny])
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
}
#pragma acc exit data delete(Vnew, Ptrans, Vfuture, V, maxposition_a, maxposition_b, Vhoward)
// add others here as needed
I am trying to use GPU to parallelize my code in Visual Studio C++.
Currently, I used OpenMP to use CPU parallelization.
But I am thinking of using GPU parallelization because I think it would be faster if I use a bigger size of arrays in calculations.
Below is the code that I am working on. I only used parallelization once.
I found out that in order to use GPU parallelization, I need to use OpenCL or Cuda.
And OpenCL and Cuda seem like I need to change the whole code. So I was wondering whether there is a way to use GPU parallelization without changing the whole code (maybe just changing '#pragma omp parallel for')
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
#if defined _OPENMP
omp_set_num_threads(8);
#endif
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V, Na, Nd, Ny);
float*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
float*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma omp parallel for private(Val) // USE PARALLELIZATION
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
cout << omp_get_max_threads() << endl;
}
There is no convenient way to add a #pragma and everything magically runs on the GPU.
However your code is well suited for GPU acceleration: In your loops, the elements are independent of each other. You can especially parallelize the Na, Nd, and Ny loops on the GPU. You will need to:
include the OpenCL C++ headers, see here
linearize the triple loop: crerate a linear index n = (i*Nd+j)*Ny+k;, turn the three loops into one
transfer your code to OpenCL C and get rid of the linear loop, a simple example how a kernel looks is here
create Buffers (allocate memory on the GPU)
create Kernel objects (one for each linearized triple loop) in C++ and link the Buffers as Kernel arguments
manually handle CPU<->GPU memory transfer (enqueueReadBuffer/enqueueWriteBuffer)
run the Kernels (enqueueNDRangeKernel)
I'm trying to calculate the Covariance (matrix) of a vector in C++ ...
I have carried out the following:
std::vector<std::vector<double> > data = { {2.5, 2.4}, {0.5, 0.7} };
I have then calculated and subtracted the mean, which gave the following result:
data = { {0.05, -0.05}, {-0.1, 0.1} }
As far as I'm aware, the next step is to transpose the matrix, and multiply the origin together, take the sum and finally divide by the dimensions X - 1..
I have written the following:
void cover(std::vector<std::vector<double> > &d)
{
double cov = 0.0;
for(unsigned i=0; (i < d.size()); i++)
{
for(unsigned j=0; (j < d[i].size()); j++)
{
cov += d[i][j] * d[j][i] / (d[i].size() - 1);
std::cout << cov << " ";
}
std::cout << std::endl;
}
}
Where d is the vector after the mean has been subtracted from each of the points
Which gives me the result:
0.0025, 0.0075
0.0125, 0.0225
Where compared with matlab:
2.0000 1.7000
1.7000 1.4450
Does anyone have any ideas to where I am going wrong?
Thanks
This statement:
As far as I'm aware, the next step is to transpose the matrix, and multiply the origin together, take the sum and finally divide by the dimensions X - 1..
And this implementation:
cov += d[i][j] * d[j][i] / (d[i].size() - 1);
Don't say the same thing. Based on the definition here:
void outer_product(vector<double> row, vector<double> col, vector<vector<double>>& dst) {
for(unsigned i = 0; i < row.size(); i++) {
for(unsigned j = 0; j < col.size(); i++) {
dst[i][j] = row[i] * col[j];
}
}
}
//computes row[i] - val for all i;
void subtract(vector<double> row, double val, vector<double>& dst) {
for(unsigned i = 0; i < row.size(); i++) {
dst[i] = row[i] - val;
}
}
//computes m[i][j] + m2[i][j]
void add(vector<vector<double>> m, vector<vector<double>> m2, vector<vector<double>>& dst) {
for(unsigned i = 0; i < m.size(); i++) {
for(unsigned j = 0; j < m[i].size(); j++) {
dst[i][j] = m[i][j] + m2[i][j];
}
}
}
double mean(std::vector<double> &data) {
double mean = 0.0;
for(unsigned i=0; (i < data.size());i++) {
mean += data[i];
}
mean /= data.size();
return mean;
}
void scale(vector<vector<double>> & d, double alpha) {
for(unsigned i = 0; i < d.size(); i++) {
for(unsigned j = 0; j < d[i].size(); j++) {
d[i][j] *= alpha;
}
}
}
So, given these definitions, we can compute the value for the covariance matrix.
void compute_covariance_matrix(vector<vector<double>> & d, vector<vector<double>> & dst) {
for(unsigned i = 0; i < d.size(); i++) {
double y_bar = mean(d[i]);
vector<double> d_d_bar(d[i].size());
subtract(d[i], y_bar, d_d_bar);
vector<vector<double>> t(d.size());
outer_product(d_d_bar, d_d_bar, t);
add(dst, t, dst);
}
scale(dst, 1/(d.size() - 1));
}
I think maybe For loop in outer_product it's wrong:
void outer_product(vector<double> row, vector<double> col, vector<vector<double>>& dst) {
for(unsigned i = 0; i < row.size(); i++) {
for(unsigned j = 0; j < col.size(); i++) {
dst[i][j] = row[i] * col[j];
}
}
I will change i++ -> j++
I`m trying to do this function that solves linear systems, A*x = b, where A = lower triangular matrix, linear independent matrix and with only one solution.
But the results always show 0 0 0 0 ...
I have printed the sum, s, and it always shows 0 as well...
#include <iostream>
using namespace std;
void solve(int n, float a[][MAX], float b[], float x[]){
int i,j;
float s;
for(i = 0; i < n; i++){
s = 0;
for(j = 0; j < n; j++){
s = s + a[i][j]*x[j];
cout<<s<<endl;
}
x[i] = (b[i] - s)/a[i][i];
}
}
void solve(int n, float a[][MAX], float b[], float x[]){
int i,j;
float s;
for(i = 0; i < n; i++) {
s = 0;
for(j = 0; j < i; j++) {
^
s = s + a[ i][ j] * x[ j];
}
x[ i] = ( b[ i] - s) / a[ i][ i];
}
}
BackSubstitution.pdf
compiled example
This line:
for(j = 0; j < n; j++){
Should be:
for(j = 0; j < i; j++){
Then it works fine - assuming your pivots are always non zero.