I am using the GCC compiler (g++ -o test testfile.cpp).
I want to use OpenACC to parallelize my code, but I am a bit confused about using the #pragma directives correctly.
Below is the part of the code where I used parallelization.
Even after adding OpenACC the code is no faster than before.
I guess this is related to data movement, so I think I need to use #pragma acc data copy here, but I am not sure how to use it properly.
Any help?
Thanks in advance.
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V, Na, Nd, Ny);
float*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
float*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc kernels
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
}
Are you using the flag to enable OpenACC, i.e. "-fopenacc"? If not, the OpenACC directives are simply ignored (e.g. g++ -fopenacc -o test testfile.cpp).
Note that you'll want to use a newer GNU version, 10.2 preferably, as GNU support for OpenACC has gotten better over the years. I believe their compiler's loop dependency analysis is still lacking, so it will run "kernels" compute regions sequentially on the device. Hence, for now, you'll want to stick to using "parallel" regions. If you really do want to use "kernels", I'd suggest switching to the NVIDIA HPC compilers (full disclosure, I work for NVIDIA).
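To make the difference concrete, here is a minimal, illustrative sketch (not taken from your code; n, x, y, a are made-up names) of the two directive styles. With "kernels" the compiler has to prove the iterations are independent; with "parallel loop" you assert it yourself:
// "kernels": the compiler analyzes the region and decides what (if anything) to parallelize
#pragma acc kernels
for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];

// "parallel loop": you assert the iterations are independent, so the compiler
// parallelizes even when its own dependency analysis is inconclusive
#pragma acc parallel loop
for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];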
Now, I think the initial problem is just that you're not enabling OpenACC, which is why the speed is unchanged. Actually, I'd expect this case to be extremely slow if you did offload it: besides the "kernels" region running sequentially on the device, the data would need to be transferred back and forth between the host and device on every iteration of the while loop.
The optimal strategy is to have a data region outside of the while loop, use an "update" directive when needed to synchronize the device and host copies of the arrays, and then ensure all the compute has been offloaded to the device.
Since you didn't post a complete reproducer, I can't test this code and hence verify that it's correct. But to give you an idea of this strategy, I modified your code below:
#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny], Ptrans[:Ny]) \
create(Vfuture[:Na][:Nd], V[:Na][:Nd][:Ny], maxposition_a[:Na][:Nd][:Ny], maxposition_d[:Na][:Nd][:Ny]) \
create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc parallel loop collapse(3) default(present)
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// I'm unclear why you're using a 2D array for temphoward. It's preventing
// parallelization of the inner loop and could be replaced with a scalar.
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
#pragma acc parallel loop collapse(3) reduction(max:dif)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// Again, why aren't you using a scalar here for tempdiff?
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
#pragma acc update self(Vnew[:Na][:Nd][:Ny])
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
}
#pragma acc exit data delete(Vnew, Ptrans, Vfuture, V, maxposition_a, maxposition_d, Vhoward)
// add others here as needed
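Following up on the comment in the Howard-improvement loop above: if temphoward really is just scratch storage, it could be replaced with a local scalar, which removes the shared write and lets all three loops be collapsed. An untested sketch of what that could look like (same caveat as above, since I can't run your code):
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
    for (int j = 0; j < Nd; j++) {
        for (int k = 0; k < Ny; k++) {
            int pa = maxposition_a[i][j][k];
            int pd = maxposition_d[i][j][k];
            // local scalar instead of temphoward[i][j]: each iteration owns its own copy
            float temp = beta * (Vhoward[pa][pd][0] * Ptrans[0]
                                 + Vhoward[pa][pd][1] * Ptrans[1]
                                 + Vhoward[pa][pd][2] * Ptrans[2]);
            Vnew[i][j][k] = temp + Uvec[i][j][k][pa][pd];
        }
    }
}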
[Image: the worker-tour expression referenced below]
I have this expression for a worker's tour and I tried to code it, but I don't know whether it is correct because I get many errors, so can someone help me?
X[w][w][i][j] is a decision variable, p[i][j] is the weight of the processing arc from i to j, and d[i][j] is a distance.
This is my proposed code:
compteur = 0;
IloFloatVarArray2 CW(env, W);
for (w = 0; w < W; w++)
{
CW[w] = IloFloatVarArray(env, W, 0.0, INFINITY);
model.add(CW[w]);
#ifdef DEBUG
for (w = 0; w < W; w++)
{
sprintf(varname, "CW_%d_%d", w, w);
CW[w][w].setName(varname);
compteur++;
}
}
#endif
#ifdef DEBUG
printf("compteur cw =%d\n", compteur);
#endif
IloExpr CW[w][w](env);
for (i = 0; i < A; i++)
for (j = 0; j < A; j++)
CW[w][w] += d[i][j] * xW[w][i][j][w];
for (i = 0; i < A; i++)
for (j = 0; j < A; j++)
CW[w][w] += Parc[i][j] * xW[w][i][j][w];
for (i = 0; i < A; i++)
for (j = 0; j < A; j++)
CW[w][w] += 1 * xW[w][i][j][w];
model.add(env, CW[w][w]);
CW[w][w].end();
You have to create a 2D array of expressions, fill it, and then constrain the expressions with constraints. For example:
#include <ilcp/cp.h>
int main() {
typedef IloArray<IloNumExprArray> IloNumExprArray2;
IloEnv env;
IloInt n = 4;
IloNumExprArray2 c(env, n);
IloIntVarArray var(env, n, 0, n-1);
for (IloInt x = 0; x < n; x++) {
c[x] = IloNumExprArray(env, n);
for (IloInt y = 0; y < n; y++) {
c[x][y] = IloNumExpr(env, 0);
c[x][y] += var[x] * var[y];
}
}
IloModel mdl(env);
for (IloInt x = 0; x < n; x++) {
for (IloInt y = 0; y < n; y++) {
mdl.add(c[x][y] ==1);
}
}
IloCP cp(mdl);
cp.startNewSearch();
while (cp.next()) {
cp.out() << cp.domain(var) << std::endl;
}
cp.end();
env.end();
return 0;
}
I am using Visual Studio now and I have a question about an error message that I got.
It says 'std::bad_alloc at memory location'.
I guess this means that I don't have enough memory. I am using arrays for the computation and I think the problem arises from the size of each array.
At first I used the stack instead of the heap, so I switched to allocating every array dynamically, but I still got the same error message.
If I reduce the size of the arrays I can run this code, but I need to use it with bigger arrays.
If I use a better computer, can I run this code? I am not sure how to reduce the memory required to run it. Should I use fewer arrays?
Below I put the entire code, just in case.
Thanks in advance.
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(double)
void dallo_fn(double**** pMat, int Na, int Nd, int Ny) {
double*** Mat = new double** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new double* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new double[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(double)
void dallo_fn0(double**** pMat, int Na, int Nd, int Ny) {
double*** Mat = new double** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new double* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new double[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
double utility(double a, double a_f, double d, double d_f, double y, double sig, double psi, double delta, double R) {
double C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
double result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
#if defined _OPENMP
omp_set_num_threads(8);
#endif
double duration;
// Iteration Parameters
double tol = 0.000001;
double itmax = 200;
double H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 2 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const double amin = -2;
const double amax = 7;
const double dmin = 0.01;
const double dmax = 7;
const double ymin = 0.5;
const double ymax = 1.5;
const double Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
double ca = (amax - amin) / (Na - 1.0);
double cd = (dmax - dmin) / (Nd - 1.0);
double cy = (ymax - ymin) / (Ny - 1.0);
double* A = new double[Na];
double* Y = new double[Ny];
double* D = new double[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
double*** V;
dallo_fn(&V, Na, Nd, Ny);
double*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
double Val[Na][Nd];
double Vfuture[Na][Nd];
double** temphoward = new double* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new double[Nd];
}
double*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
double*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
double** mg_A_v = new double* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new double[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
double** mg_D_v = new double* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new double[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
double***** Uvec = new double**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new double*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new double** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new double* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new double[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
double dif;
double max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma omp parallel for private(Val)
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<double> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
cout << omp_get_max_threads() << endl;
}
Yes, with enough memory you can run this code. I've tested it just now on my machine with 64GBs of RAM and it ran just fine.
One way to reduce the amount of memory required is to use floats instead of doubles, as they take up half the size.
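To put numbers on that: the dominant structure is Uvec, which holds Na x Nd x Ny x Na x Nd values. With Na = 2*91 = 182, Nd = 71 and Ny = 3 that is roughly 182*71*3*182*71 ≈ 5.0e8 elements, i.e. about 4 GB as doubles but only about 2 GB as floats (ignoring the additional pointer arrays created by the nested allocations).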
Moreover, as others have already suggested in the comments, you could represent the multidimensional structures as 1D arrays and compute the indices into the array instead of actually having nested arrays; a sketch is below. The benefit is that you get rid of a huge number of pointers.
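A minimal sketch of that idea for Uvec (illustrative names only; the index math assumes the same i, j, k, l, m loop order as in your code):
#include <cstddef>
#include <vector>

// one contiguous allocation instead of millions of small ones and four levels of pointers
std::vector<float> Uvec_flat((std::size_t)Na * Nd * Ny * Na * Nd);

// row-major index for the flattened 5D structure
auto idx = [=](int i, int j, int k, int l, int m) -> std::size_t {
    return ((((std::size_t)i * Nd + j) * Ny + k) * Na + l) * Nd + m;
};

// usage: Uvec_flat[idx(i, j, k, l, m)] instead of Uvec[i][j][k][l][m]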
The code is admittedly pretty memory hungry, but works on my 8 GB machine. Please make sure you are targeting the x64 platform, otherwise you hit the inherent 4 GB limit.
I am trying to use the GPU to parallelize my code in Visual Studio C++.
Currently I use OpenMP for CPU parallelization.
But I am thinking of using GPU parallelization, because I think it would be faster when I use bigger arrays in the calculations.
Below is the code that I am working on. I only used parallelization once.
I found out that in order to use GPU parallelization, I need to use OpenCL or CUDA.
OpenCL and CUDA seem to require changing the whole code, so I was wondering whether there is a way to use GPU parallelization without rewriting everything (maybe by just changing '#pragma omp parallel for').
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
#if defined _OPENMP
omp_set_num_threads(8);
#endif
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V, Na, Nd, Ny);
float*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
float*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma omp parallel for private(Val) // USE PARALLELIZATION
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
cout << omp_get_max_threads() << endl;
}
There is no convenient way to add a #pragma and everything magically runs on the GPU.
However, your code is well suited for GPU acceleration: in your loops, the elements are independent of each other. In particular, you can parallelize the Na, Nd, and Ny loops on the GPU. You will need to:
include the OpenCL C++ headers, see here
linearize the triple loop: create a linear index n = (i*Nd+j)*Ny+k; and turn the three loops into one (see the sketch after this list)
port your code to OpenCL C and get rid of the linear loop; a simple example of what a kernel looks like is here
create Buffers (allocate memory on the GPU)
create Kernel objects (one for each linearized triple loop) in C++ and link the Buffers as Kernel arguments
manually handle CPU<->GPU memory transfer (enqueueReadBuffer/enqueueWriteBuffer)
run the Kernels (enqueueNDRangeKernel)
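As a rough illustration of the linearization step (using the V = Vnew copy loop from your code as the example), the triple loop becomes a single loop over n; inside an OpenCL kernel, n would then come from get_global_id(0) instead of a for loop:
// one linear loop over n = (i*Nd + j)*Ny + k, recovering i, j, k from n
for (int n = 0; n < Na * Nd * Ny; n++) {
    int i = n / (Nd * Ny);
    int j = (n / Ny) % Nd;
    int k = n % Ny;
    V[i][j][k] = Vnew[i][j][k];
}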
I wrote the code according to the algorithm, but the result is incorrect. According to the algorithm, we must specify the dimension of the matrix and manually fill in the main matrix A and the vector B. We need to generate an LU matrix; it is generated, but with the wrong numbers. In the end we should get the vector X with the solutions. This is a windowed (GUI) application.
https://imgur.com/TSsjMXp
int N = 1; // matrix dimension
double R = 0;
typedef double Matrix [6][6];
typedef double Vec [6];
.
.
.
void Decomp (Matrix A, int N, int &Change)
{
int i, j, k ;
double R, L, U;
Change = 1;
R = Math::Abs(A[1][1]);
for(j=2; j<=N; j++)
if (Math::Abs(A[j][1])>= R)
{
Change = j;
R = Math::Abs(A[j][1]);
}
if (R<= 1E-7)
{
MessageBox::Show("The system is degenerate");
}
if (k!=1)
{
for(i=1; i<=N; i++)
{
R = A[Change][i];
A[Change][i] = A[1][i];
A[1][i] = R;
}
}
for(i=2; i<=N; i++)
A[1][i] = A[1][i]/A[1][1];
for(i=2; i<=N; i++)
{
for(k=i; k<=N; k++);
{
R = 0;
for ( j=1; j<=(i-1); j++)
R = R + A[k][j] * A[j][i];
A[k][i] = A[k][i] - R;
}
if (A[i][i]<= 1E-7)
{
MessageBox::Show("The system is degenerate[enter image description here][1]");
}
for(k = i+1; k<=N; k++)
{
R = 0;
for (j=1; j<=(i-1); j++)
R = R + A[i][j] * A[j][k];
A[i][k] = (A[i][k] - R) / A[i][i];
}
}
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
{
C_matrix_dgv->Rows[i]->Cells[j] -> Value = Convert::ToString(A[i+1][j+1]);
}
}
void Solve (Matrix A, Vec b, Vec x, int Change, int N)
{
int i = 0,j = 0;
double R;
if (Change!=1)
{
R = b[Change];
b[Change] = b[1];
b[1] = R;
}
b[1] = b[1]/A[1][1];
for(i=2; i<=N; i++)
{
R = 0;
for( j=1; j<=(i-1); j++)
R = R + A[i][j] * b[j];
b[i] = (b[i] - R) / A[i][i];
}
x[N] = b[N];
for( i=1; i<=(N-1); i++)
{
R = 0;
for(j = (N+1-i); j<=N; j++)
R = R + A[N - i][j] * x[j];
x[N - i] = b[N - i] - R;
}
}
int N = 1; // matrix dimension
If you use this in the rest of the code you cannot get correct results. The dimension of the matrix is 6x6. Use a std::array or std::vector so that you don't need to keep the size in a separate variable.
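For example, with std::array the size is part of the type, so the code can query it instead of relying on a separately maintained N (a sketch, assuming the 6x6 system from your typedefs):
#include <array>

using Matrix = std::array<std::array<double, 6>, 6>;
using Vec    = std::array<double, 6>;

void Decomp(Matrix& A, int& Change)
{
    const auto n = A.size();   // always 6, cannot drift out of sync with the data
    // ... elimination over the n x n matrix ...
}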
I have a computation with a matrix (88147 x 2000) and it runs very slowly.
So I want to use OpenMP to speed it up. This is my first time using
OpenMP, so I just use it on the for loops.
This is my code:
#include<iostream>
#include<fstream>
#include<math.h>
#include<omp.h>
using namespace std;
#define LONGTH 88147
int label[LONGTH] ;
float data[LONGTH][2000] ;
float w[2000];
float e[2000];
void Input()
{
ifstream fin;
float a;
fin.open("/home/data.train");
if (!fin)
{
cout << "file error";
return;
}
for (int i = 0; i < LONGTH; i++)
{
fin >> a;
label[i] = int(a);
for (int j = 0; j < 2000; j++)
{
fin>>data[i][j];
}
}
fin.close();
cout<<"input over"<<endl;
return;
}
void Initial()
{
for (int i = 0; i < 2000; i++)
{
w[i] = 1;
e[i] = 1;
}
return;
}
bool End()
{
for (int i = 0; i < 2000; i++)
{
if (fabs(e[i])>pow(0.1, 6))
return 0;
}
return 1;
}
float Tkj(int i, int j, int k,float w[2000])
{
return w[i] * data[k][i] - w[j] * data[k][j];
}
float En(int n)//*computation*
{
float result = 0;
#pragma omp parallel for num_threads(64) reduction(+:result)
for (int k = 0; k < LONGTH; k++)
{
int tnum = omp_get_thread_num();
float tmp = 0;
int i = label[k] - 1;
for (int j = 0; j < 2000; j++)
{
if (j != i)
{
float l = 0;
if (n == i)
{
l = data[k][i];
float e = exp(Tkj(i, j, k,w));
tmp = tmp + (-e*l) / pow(1 + e, 2);
}
else if (n == j)
{
l = -data[k][j];
float e = exp(Tkj(i, j, k,w));
tmp = tmp + (-e*l) / pow(1 + e, 2);
}
else
{
continue;
}
}
}
result = result + tmp;
}
return result;
}
float Ex(float w[2000])
{
float result = 0;
#pragma omp parallel for num_threads(64) reduction(+:result)
for (int k = 0; k < LONGTH; k++)
{
int i = label[k] - 1;
float tmp = 0;
int tnum = omp_get_thread_num();
for (int j = 0; j < 2000; j++)
{
if (j != i)
{
tmp = tmp + 1 / (1 + exp(Tkj(i,j,k,w)));
}
}
result = result+tmp;
}
return result;
}
int main()
{
Input();
Initial();
float w2[2000] = { 0 };
float b = pow(0.1,5);
int times = 0;
while (!End() && times <= 30000)
{
times++;
cout<<times<<endl;
for (int i = 0; i < 2000; i++)
{
e[i] = En(i);
w2[i] = w[i] - b*e[i];
}
if (Ex(w2)<=Ex(w))//better
{
b = b * 2;
for (int i = 0; i < 2000; i++)
w[i] = w2[i];
}
else//worser
{
b = b / 2;
}
}
ofstream fout("/home/w.txt");
for(int i=0;i<2000;i++)
{
fout<<w[i]<<' ';
}
fout.close();
return 0;
}
The function 'En' costs most of the time, so I use "#pragma omp parallel for num_threads(64) reduction(+:result)" to speed it up.
I run it on a CentOS server which has 32 cores and use "-fopenmp" to compile it, but it isn't any quicker.
How can I speed this program up more?