enter image description here
I have this expression for a worker's tour and I tried to code it, but I don't know whether it is correct because I get many errors, so can someone help me?
X[w][w][i][j] is a decision variable, p[i][j] is the weight of processing the arc from i to j, and d[i][j] is a distance.
This is my proposed code:
// Tour cost of each worker w, linked to the decision variables by
//   CW[w][w] == sum over (i, j) of (d[i][j] + Parc[i][j] + 1) * xW[w][i][j][w]
compteur = 0;
IloFloatVarArray2 CW(env, W);
for (w = 0; w < W; w++)
{
    // One row of continuous, non-negative variables per worker.
    CW[w] = IloFloatVarArray(env, W, 0.0, INFINITY);
    model.add(CW[w]);
#ifdef DEBUG
    // Name the diagonal entry of the current row. No inner loop over w here:
    // reusing w as an inner index would clobber the outer loop counter.
    sprintf(varname, "CW_%d_%d", w, w);
    CW[w][w].setName(varname);
    compteur++;
#endif
}
#ifdef DEBUG
printf("compteur cw =%d\n", compteur);
#endif
for (w = 0; w < W; w++)
{
    // Build the cost in a separate expression object; CW[w][w] is a variable
    // and cannot be redeclared as an IloExpr.
    IloExpr tourCost(env);
    for (i = 0; i < A; i++)
    {
        for (j = 0; j < A; j++)
        {
            tourCost += d[i][j] * xW[w][i][j][w];
            tourCost += Parc[i][j] * xW[w][i][j][w];
            tourCost += 1 * xW[w][i][j][w];
        }
    }
    // Only constraints can be added to the model, so tie the variable to the
    // expression with an equality.
    model.add(CW[w][w] == tourCost);
    // The constraint keeps its own copy of the expression; release ours.
    tourCost.end();
}
You have to create a 2-D array of expressions, fill it, and then constrain the expressions with some constraints. For example:
#include <ilcp/cp.h>
int main() {
typedef IloArray<IloNumExprArray> IloNumExprArray2;
IloEnv env;
IloInt n = 4;
IloNumExprArray2 c(env, n);
IloIntVarArray var(env, n, 0, n-1);
for (IloInt x = 0; x < n; x++) {
c[x] = IloNumExprArray(env, n);
for (IloInt y = 0; y < n; y++) {
c[x][y] = IloNumExpr(env, 0);
c[x][y] += var[x] * var[y];
}
}
IloModel mdl(env);
for (IloInt x = 0; x < n; x++) {
for (IloInt y = 0; y < n; y++) {
mdl.add(c[x][y] ==1);
}
}
IloCP cp(mdl);
cp.startNewSearch();
while (cp.next()) {
cp.out() << cp.domain(var) << std::endl;
}
cp.end();
env.end();
return 0;
}
Related
I need to print all possible correct credit card numbers according to the Luhn algorithm, and I did it with 16 for loops nested inside each other.
I was wondering if there is a way to make my code shorter?
Here is my code:
This is for indices 0, 2, 4, 6, 8, 10, 12 and 14: it takes the digit, multiplies it by 2, and then sums the individual digits of the result. For example, if index 0 is 6, it computes 6*2 = 12 and sums 1 + 2.
// Returns the sum of the base-10 digits of x (0 for x == 0).
int dbl(int x) {
    int digit_total = 0;
    // Peel off the least significant digit until nothing is left.
    for (; x != 0; x /= 10) {
        digit_total += x % 10;
    }
    return digit_total;
}
// Adds dbl(2 * y), the digit sum of the doubled digit y, to the running
// total x.
// x is taken by reference: with a by-value parameter the addition only
// changed a local copy and the caller's total never moved.
void bruh(int& x, int y) {
    x += dbl(2 * y);
}
This is my loops:
int main()
{
for (int i = 0; i <= 9; i++)
{
for (int j = 0; j <= 9; j++)
{
for (int k = 0; k <= 9; k++)
{
for (int l = 0; l <= 9; l++)
{
for (int m = 0; m <= 9; m++)
{
for (int n = 0; n <= 9; n++)
{
for (int o = 0; o <= 9; o++)
{
for (int p = 0; p <= 9; p++)
{
for (int q = 0; q <= 9; q++)
{
for (int r = 0; r <=9; r++)
{
for (int s = 0; s <= 9; s++)
{
for (int u = 0; u <= 9; u++)
{
for (int v = 0; v <= 9; v++)
{
for (int x = 0; x <= 9; x++)
{
for (int w = 0; w <= 9; w++)
{
for (int y = 0; y <= 9; y++)
{
int dbles = 0, sngls =0;
bruh(dbles, i);
sngls += j;
bruh(dbles, k);
sngls += l;
bruh(dbles, m);
sngls += n;
bruh(dbles, o);
sngls += p;
bruh(dbles, q);
sngls += r;
bruh(dbles, s);
sngls += u;
bruh(dbles, v);
sngls += x;
bruh(dbles, w);
sngls += y;
if (dbles+sngls%10==0)
{
cout << "Valid Number: " << i << j << k << l << m << n << o << p << q << r << s << u << v << x << w << y << endl;
cout << "---------------" << endl;
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
return 0;
}
The program works fine (I added a break statement to stop after the 100th card number to check that it runs, and it does); I just want to know if there is a way to make my code shorter.
Something like this will be much shorter and easier to read:
#include <string>
// Treats s as a decimal counter and adds one in place: trailing '9's roll
// over to '0' and the first other character from the right is incremented.
// A string made only of '9's (or an empty one) wraps to all '0's.
void increment(std::string& s) {
    for (auto digit = s.rbegin(); digit != s.rend(); ++digit) {
        if (*digit == '9') {
            *digit = '0';   // carry into the next position to the left
            continue;
        }
        ++*digit;
        return;
    }
}
// Walks every 16-digit string from "0000000000000000" upwards.
int main()
{
    std::string s(16, '0');
    // A 16-digit counter has 10^16 states (not 10^15). `unsigned long long`
    // is at least 64 bits wide and, unlike uint64_t, needs no extra header.
    for (unsigned long long i = 0; i < 10'000'000'000'000'000ULL; ++i) {
        // do your check with s[0]..s[15]
        increment(s);
    }
}
NOTE: this does pretty much what you tried to do in your code.
You can take n0rd's suggestion and loop through 15 digit numbers, calculating the checksum and appending it to the end. In that case you'd only need 3,000 years instead of 30,000 :)
I am using gcc compiler. (g++ -o test testfile.cpp)
I want to use Openacc to parallelize my code but I am a bit confused about using #pragma correctly.
Below is the part where I used parallelization.
Even after using Openacc the code is not faster than before.
I guess this is related to 'data-moving' thing.
So I think I need to use #pragma acc data copy here. But I am not sure how to use this properly.
Any help?
Thanks in advance.
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
using namespace std;
using namespace chrono;
// Allocates a jagged Na x Nd x Ny array of float, sets every element to 1
// and hands it back through *pMat.
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
    float*** cube = new float** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        float** rows = new float* [Nd];
        for (int row = 0; row < Nd; ++row) {
            float* cells = new float[Ny];
            for (int cell = 0; cell < Ny; ++cell) {
                cells[cell] = 1;
            }
            rows[row] = cells;
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of float and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
    float*** cube = new float** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        float** rows = new float* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new float[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of int and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** cube = new int** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        int** rows = new int* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new int[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Utility function.
// Computes C = y + a - a_f / R - (d_f - (1 - delta) * d). When C is positive
// the result is 1 / (1 - 1/sig) * (C^psi * d_f^(1 - psi))^(1 - 1/sig);
// otherwise the penalty value -999999 is returned.
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
    const float C = y + a - a_f / R - (d_f - (1 - delta) * d);
    if (!(C > 0)) {
        return -999999;
    }
    const float value = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    return value;
}
// Value function iteration with Howard improvement steps on an
// (A, D, Y) grid. Each pass of the outer loop prints the sup-norm change,
// the iteration number and the elapsed time of that pass.
int main()
{
    // Iteration Parameters
    const double tol = 0.000001;
    const int itmax = 200;
    const int H = 15; // Howard improvement steps per iteration
    // Model Parameters and utility function
    const double sig = 0.75;
    const double beta = 0.95;
    const double psi = 0.5;
    const double delta = 0.1;
    const double R = 1 / beta - 0.00215;
    // =============== 2. Discretizing the state space =========================
    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;
    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2, 0.6, 0.2 };
    // Discretization of state space: evenly spaced grids
    const float ca = (amax - amin) / (Na - 1.0);
    const float cd = (dmax - dmin) / (Nd - 1.0);
    const float cy = (ymax - ymin) / (Ny - 1.0);
    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];
    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }
    // === 3. Initial guesses, Variable initialization and Transition matrix ===
    // Initial guess for value function (all ones)
    float*** V;
    dallo_fn(&V, Na, Nd, Ny);
    float*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);
    float*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++) {
        Vfuture[i] = new float[Nd];
    }
    // Uvec[i][j][k][l][m]: utility of choosing (A[l], D[m]) in state
    // (A[i], D[j], Y[k]). The choice grids are read straight from A and D, so
    // no separate mesh-grid copies are needed.
    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], A[l], D[j], D[m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }
    // ================ 4. Value function iteration ============================
    int it = 0;
    float dif = 1;
    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew, and Vfuture = beta * sum_k Ptrans[k] * Vnew[.][.][k]
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k];
                }
            }
        }
        // Maximisation step. The running maximum and its position are local to
        // each (a, b, c) iteration, so iterations share no scratch state and
        // the loop nest carries no dependence between iterations.
#pragma acc kernels
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    float best = -99999;
                    // Default to position (0, 0) so the Howard step never
                    // indexes with an uninitialised value.
                    int best_a = 0;
                    int best_d = 0;
                    for (int an = 0; an < Na; an++) {
                        for (int dn = 0; dn < Nd; dn++) {
                            const float val = Uvec[a][b][c][an][dn] + Vfuture[an][dn];
                            if (best < val) {
                                best = val;
                                best_a = an;
                                best_d = dn;
                            }
                        }
                    }
                    Vnew[a][b][c] = best;
                    maxposition_a[a][b][c] = best_a;
                    maxposition_d[a][b][c] = best_d;
                }
            }
        }
        // Howard improvement: re-evaluate the fixed policy H times
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        const int pa = maxposition_a[i][j][k];
                        const int pd = maxposition_d[i][j][k];
                        const float continuation = beta * Vhoward[pa][pd][0] * Ptrans[0]
                            + beta * Vhoward[pa][pd][1] * Ptrans[1]
                            + beta * Vhoward[pa][pd][2] * Ptrans[2];
                        Vnew[i][j][k] = continuation + Uvec[i][j][k][pa][pd];
                    }
                }
            }
        }
        // Calculate Diff: largest absolute change between V and Vnew
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    const float gap = std::abs(V[i][j][k] - Vnew[i][j][k]);
                    if (gap > dif) {
                        dif = gap;
                    }
                }
            }
        }
        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;
        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }
}
Are you using the flag to enable OpenACC, i.e. "-fopenacc"? If not the OpenACC directives will be ignored.
Note that you'll want to use a newer GNU version, 10.2 preferable, as GNU support for OpenACC has gotten better over the years. I believe their compiler loop dependency analysis is still lacking so will run "kernels" compute regions sequentially on the device. Hence, for now, you'll want to stick to using "parallel" regions. If you do really want to use "kernels", I'd suggest switching to the NVIDIA HPC compilers (full disclosure, I work for NVIDIA)
Now I think the initial problem is just that you're not enabling OpenACC, which is why it runs at the same speed. Actually, I'd expect this case to be extremely slow if you tried to offload it. Besides running the "kernels" region sequentially on the device, the data would need to be transferred back and forth between the host and device each timestep.
The optimal strategy is to have a data region outside of the while loop, use an "update" directive when needed to synchronize the device and host copies of the arrays, and then ensure all the compute has been offload to the device.
Since you didn't post a complete reproducer, I can't test this code and hence verify that it's correct. But to give you an idea of this strategy, I modified your code below:
// Device data region: Vnew and Ptrans are copied in, the rest is only
// allocated. Every extent needs the leading ':' (Vfuture[:Na][:Nd]) and the
// second argmax array is maxposition_d, the name the loops below use.
#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny], Ptrans[:Ny]) \
create(Vfuture[:Na][:Nd], V[:Na][:Nd][:Ny], maxposition_a[:Na][:Nd][:Ny], maxposition_d[:Na][:Nd][:Ny]) \
create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed
// Value function iteration with every loop nest offloaded. All per-iteration
// scratch values are loop-local scalars, so they are private to each device
// thread and need no entry in the data region.
// NOTE(review): Uvec is read with default(present) below, so it must also be
// present on the device before this loop starts.
while (dif >= tol && it <= itmax) {
    system_clock::time_point start = system_clock::now();
    it = it + 1;
    // V = Vnew;
#pragma acc parallel loop collapse(3) default(present)
    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                V[i][j][k] = Vnew[i][j][k];
            }
        }
    }
#pragma acc parallel loop collapse(2) default(present)
    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            // Accumulate in a private scalar and store once.
            float expected = 0;
            for (int k = 0; k < Ny; k++) {
                expected += beta * Ptrans[k] * Vnew[i][j][k];
            }
            Vfuture[i][j] = expected;
        }
    }
#pragma acc parallel loop collapse(3) default(present)
    for (int a = 0; a < Na; a++) {
        for (int b = 0; b < Nd; b++) {
            for (int c = 0; c < Ny; c++) {
                // A shared `max` / `Val` would be written by every (a, b, c)
                // iteration at once; keep the running maximum and its
                // position local instead.
                float best = -99999;
                int best_a = 0;
                int best_d = 0;
                for (int an = 0; an < Na; an++) {
                    for (int dn = 0; dn < Nd; dn++) {
                        const float val = Uvec[a][b][c][an][dn] + Vfuture[an][dn];
                        if (best < val) {
                            best = val;
                            best_a = an;
                            best_d = dn;
                        }
                    }
                }
                Vnew[a][b][c] = best;
                maxposition_a[a][b][c] = best_a;
                maxposition_d[a][b][c] = best_d;
            }
        }
    }
    // Howard improvement
    for (int h = 0; h < H; h++) {
#pragma acc parallel loop collapse(3) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    Vhoward[i][j][k] = Vnew[i][j][k];
                }
            }
        }
        // With a scalar in place of the 2-D temphoward array all three loops
        // are independent and can be collapsed.
#pragma acc parallel loop collapse(3) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    const int pa = maxposition_a[i][j][k];
                    const int pd = maxposition_d[i][j][k];
                    const float continuation = beta * Vhoward[pa][pd][0] * Ptrans[0]
                        + beta * Vhoward[pa][pd][1] * Ptrans[1]
                        + beta * Vhoward[pa][pd][2] * Ptrans[2];
                    Vnew[i][j][k] = continuation + Uvec[i][j][k][pa][pd];
                }
            }
        }
    }
    // Calculate Diff
    dif = -100000;
    // default(present) keeps V and Vnew on the device here as well.
#pragma acc parallel loop collapse(3) default(present) reduction(max:dif)
    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                const float gap = abs(V[i][j][k] - Vnew[i][j][k]);
                if (gap > dif) {
                    dif = gap;
                }
            }
        }
    }
    system_clock::time_point end = system_clock::now();
    std::chrono::duration<float> sec = end - start;
    cout << dif << endl;
    cout << it << endl;
    cout << sec.count() << endl;
}
// Bring the converged value function back to the host before printing it.
#pragma acc update self(Vnew[:Na][:Nd][:Ny])
for (int k = 0; k < Ny; k++) {
    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            cout << Vnew[i][j][k];
        }
        cout << '\n';
    }
}
// Release the device copies while still inside main: an executable directive
// placed after the closing brace is outside any function. The second argmax
// array is maxposition_d.
#pragma acc exit data delete(Vnew, Ptrans, Vfuture, V, maxposition_a, maxposition_d, Vhoward)
// add others here as needed
}
I am using Visual studio now and I have a question about an error message that I got.
It says 'std::bad_alloc at memory location'.
I guess this means that I don't have enough memory. I am using arrays for computation and I think the problem arises from the size of each array.
First I used stack instead of heap so I allocate every array dynamically. But still, I got the same error message.
If I reduce the size of the array, I could run this code but I need to use this with a bigger size of arrays.
If I use a better computer, can I run this code? I am not sure how to reduce the memory required to run this code.... should I use fewer arrays?
Below, I put the entire code just in case....
Thanks in advance.
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Allocates a jagged Na x Nd x Ny array of double, sets every element to 1
// and hands it back through *pMat.
void dallo_fn(double**** pMat, int Na, int Nd, int Ny) {
    double*** cube = new double** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        double** rows = new double* [Nd];
        for (int row = 0; row < Nd; ++row) {
            double* cells = new double[Ny];
            for (int cell = 0; cell < Ny; ++cell) {
                cells[cell] = 1;
            }
            rows[row] = cells;
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of double and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn0(double**** pMat, int Na, int Nd, int Ny) {
    double*** cube = new double** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        double** rows = new double* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new double[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of int and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** cube = new int** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        int** rows = new int* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new int[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Utility function.
// Computes C = y + a - a_f / R - (d_f - (1 - delta) * d). When C is positive
// the result is 1 / (1 - 1/sig) * (C^psi * d_f^(1 - psi))^(1 - 1/sig);
// otherwise the penalty value -999999 is returned.
double utility(double a, double a_f, double d, double d_f, double y, double sig, double psi, double delta, double R) {
    const double C = y + a - a_f / R - (d_f - (1 - delta) * d);
    if (!(C > 0)) {
        return -999999;
    }
    const double value = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    return value;
}
// Value function iteration with Howard improvement steps on an
// (A, D, Y) grid, with the maximisation step parallelised through OpenMP.
// Each pass prints the sup-norm change, the iteration number and the elapsed
// time; the final value function and the OpenMP thread limit follow.
int main()
{
#if defined _OPENMP
    omp_set_num_threads(8);
#endif
    // Iteration Parameters (the two counts are integers, not doubles)
    const double tol = 0.000001;
    const int itmax = 200;
    const int H = 15; // Howard improvement steps per iteration
    // Model Parameters and utility function
    const double sig = 0.75;
    const double beta = 0.95;
    const double psi = 0.5;
    const double delta = 0.1;
    const double R = 1 / beta - 0.00215;
    // =============== 2. Discretizing the state space =========================
    // Size of arrays
    const int Na = 2 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;
    // Variables for discretization of state space
    const double amin = -2;
    const double amax = 7;
    const double dmin = 0.01;
    const double dmax = 7;
    const double ymin = 0.5;
    const double ymax = 1.5;
    const double Ptrans[3] = { 0.2, 0.6, 0.2 };
    // Discretization of state space: evenly spaced grids
    const double ca = (amax - amin) / (Na - 1.0);
    const double cd = (dmax - dmin) / (Nd - 1.0);
    const double cy = (ymax - ymin) / (Ny - 1.0);
    double* A = new double[Na];
    double* Y = new double[Ny];
    double* D = new double[Nd];
    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }
    // === 3. Initial guesses, Variable initialization and Transition matrix ===
    // Initial guess for value function (all ones)
    double*** V;
    dallo_fn(&V, Na, Nd, Ny);
    double*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);
    double*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);
    double Vfuture[Na][Nd];
    // Uvec[i][j][k][l][m]: utility of choosing (A[l], D[m]) in state
    // (A[i], D[j], Y[k]). The choice grids are read straight from A and D, so
    // no separate mesh-grid copies are allocated.
    double***** Uvec = new double**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new double*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new double** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new double* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new double[Nd];
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], A[l], D[j], D[m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }
    // ================ 4. Value function iteration ============================
    int it = 0;
    double dif = 1;
    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew, and Vfuture = beta * sum_k Ptrans[k] * Vnew[.][.][k]
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k];
                }
            }
        }
        // Maximisation step. Everything a thread mutates besides its own
        // (a, b, c) output cells is declared inside the loop body and is
        // therefore private; a `max` declared outside the loop would be
        // shared between threads and raced on.
#pragma omp parallel for
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    double best = -99999;
                    // Default to position (0, 0) so the Howard step never
                    // indexes with an uninitialised value.
                    int best_a = 0;
                    int best_d = 0;
                    for (int an = 0; an < Na; an++) {
                        for (int dn = 0; dn < Nd; dn++) {
                            const double val = Uvec[a][b][c][an][dn] + Vfuture[an][dn];
                            if (best < val) {
                                best = val;
                                best_a = an;
                                best_d = dn;
                            }
                        }
                    }
                    Vnew[a][b][c] = best;
                    maxposition_a[a][b][c] = best_a;
                    maxposition_d[a][b][c] = best_d;
                }
            }
        }
        // Howard improvement: re-evaluate the fixed policy H times
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        const int pa = maxposition_a[i][j][k];
                        const int pd = maxposition_d[i][j][k];
                        const double continuation = beta * Vhoward[pa][pd][0] * Ptrans[0]
                            + beta * Vhoward[pa][pd][1] * Ptrans[1]
                            + beta * Vhoward[pa][pd][2] * Ptrans[2];
                        Vnew[i][j][k] = continuation + Uvec[i][j][k][pa][pd];
                    }
                }
            }
        }
        // Calculate Diff: largest absolute change between V and Vnew
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    const double gap = std::abs(V[i][j][k] - Vnew[i][j][k]);
                    if (gap > dif) {
                        dif = gap;
                    }
                }
            }
        }
        system_clock::time_point end = system_clock::now();
        std::chrono::duration<double> sec = end - start;
        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }
    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << '\n';
        }
    }
    cout << omp_get_max_threads() << endl;
}
Yes, with enough memory you can run this code. I've tested it just now on my machine with 64GBs of RAM and it ran just fine.
One way to reduce the amount of memory required is to use floats instead of doubles, as they take up half the size.
Moreover, as others suggested in the comments already you could represent the multidimensional structures as 1d arrays and compute the indices into the array instead of actually having nested arrays. The benefit is that you get rid of a huge number of pointers.
The code is admittedly pretty memory hungry, but it works on my 8 GB machine. Please make sure you are targeting the x64 platform; otherwise you hit the inherent 4 GB limit of a 32-bit process.
I am trying to use GPU to parallelize my code in Visual Studio C++.
Currently, I used OpenMP to use CPU parallelization.
But I am thinking of using GPU parallelization because I think it would be faster if I use a bigger size of arrays in calculations.
Below is the code that I am working on. I only used parallelization once.
I found out that in order to use GPU parallelization, I need to use OpenCL or Cuda.
And OpenCL and Cuda seem like I need to change the whole code. So I was wondering whether there is a way to use GPU parallelization without changing the whole code (maybe just changing '#pragma omp parallel for')
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Allocates a jagged Na x Nd x Ny array of float, sets every element to 1
// and hands it back through *pMat.
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
    float*** cube = new float** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        float** rows = new float* [Nd];
        for (int row = 0; row < Nd; ++row) {
            float* cells = new float[Ny];
            for (int cell = 0; cell < Ny; ++cell) {
                cells[cell] = 1;
            }
            rows[row] = cells;
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of float and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
    float*** cube = new float** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        float** rows = new float* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new float[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Allocates a jagged Na x Nd x Ny array of int and hands it back through
// *pMat. The elements are left uninitialised.
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** cube = new int** [Na];
    for (int plane = 0; plane < Na; ++plane) {
        int** rows = new int* [Nd];
        for (int row = 0; row < Nd; ++row) {
            rows[row] = new int[Ny];
        }
        cube[plane] = rows;
    }
    *pMat = cube;
}
// Utility function.
// Computes C = y + a - a_f / R - (d_f - (1 - delta) * d). When C is positive
// the result is 1 / (1 - 1/sig) * (C^psi * d_f^(1 - psi))^(1 - 1/sig);
// otherwise the penalty value -999999 is returned.
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
    const float C = y + a - a_f / R - (d_f - (1 - delta) * d);
    if (!(C > 0)) {
        return -999999;
    }
    const float value = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    return value;
}
// Value function iteration with Howard improvement steps on an
// (A, D, Y) grid, with the maximisation step parallelised through OpenMP.
// Each pass prints the sup-norm change, the iteration number and the elapsed
// time; the final value function and the OpenMP thread limit follow.
int main()
{
#if defined _OPENMP
    omp_set_num_threads(8);
#endif
    // Iteration Parameters
    const double tol = 0.000001;
    const int itmax = 200;
    const int H = 15; // Howard improvement steps per iteration
    // Model Parameters and utility function
    const double sig = 0.75;
    const double beta = 0.95;
    const double psi = 0.5;
    const double delta = 0.1;
    const double R = 1 / beta - 0.00215;
    // =============== 2. Discretizing the state space =========================
    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;
    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2, 0.6, 0.2 };
    // Discretization of state space: evenly spaced grids
    const float ca = (amax - amin) / (Na - 1.0);
    const float cd = (dmax - dmin) / (Nd - 1.0);
    const float cy = (ymax - ymin) / (Ny - 1.0);
    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];
    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }
    // === 3. Initial guesses, Variable initialization and Transition matrix ===
    // Initial guess for value function (all ones)
    float*** V;
    dallo_fn(&V, Na, Nd, Ny);
    float*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);
    float*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++) {
        Vfuture[i] = new float[Nd];
    }
    // Uvec[i][j][k][l][m]: utility of choosing (A[l], D[m]) in state
    // (A[i], D[j], Y[k]). The choice grids are read straight from A and D, so
    // no separate mesh-grid copies are needed.
    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], A[l], D[j], D[m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }
    // ================ 4. Value function iteration ============================
    int it = 0;
    float dif = 1;
    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew, and Vfuture = beta * sum_k Ptrans[k] * Vnew[.][.][k]
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k];
                }
            }
        }
        // Maximisation step. Everything a thread mutates besides its own
        // (a, b, c) output cells is declared inside the loop body and is
        // therefore private; a `max` declared outside the loop would be
        // shared between threads and raced on.
#pragma omp parallel for // USE PARALLELIZATION
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    float best = -99999;
                    // Default to position (0, 0) so the Howard step never
                    // indexes with an uninitialised value.
                    int best_a = 0;
                    int best_d = 0;
                    for (int an = 0; an < Na; an++) {
                        for (int dn = 0; dn < Nd; dn++) {
                            const float val = Uvec[a][b][c][an][dn] + Vfuture[an][dn];
                            if (best < val) {
                                best = val;
                                best_a = an;
                                best_d = dn;
                            }
                        }
                    }
                    Vnew[a][b][c] = best;
                    maxposition_a[a][b][c] = best_a;
                    maxposition_d[a][b][c] = best_d;
                }
            }
        }
        // Howard improvement: re-evaluate the fixed policy H times
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        const int pa = maxposition_a[i][j][k];
                        const int pd = maxposition_d[i][j][k];
                        const float continuation = beta * Vhoward[pa][pd][0] * Ptrans[0]
                            + beta * Vhoward[pa][pd][1] * Ptrans[1]
                            + beta * Vhoward[pa][pd][2] * Ptrans[2];
                        Vnew[i][j][k] = continuation + Uvec[i][j][k][pa][pd];
                    }
                }
            }
        }
        // Calculate Diff: largest absolute change between V and Vnew
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    const float gap = std::abs(V[i][j][k] - Vnew[i][j][k]);
                    if (gap > dif) {
                        dif = gap;
                    }
                }
            }
        }
        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;
        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }
    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << '\n';
        }
    }
    cout << omp_get_max_threads() << endl;
}
There is no convenient way to add a #pragma and everything magically runs on the GPU.
However your code is well suited for GPU acceleration: In your loops, the elements are independent of each other. You can especially parallelize the Na, Nd, and Ny loops on the GPU. You will need to:
include the OpenCL C++ headers, see here
linearize the triple loop: create a linear index n = (i*Nd+j)*Ny+k; and turn the three loops into one
transfer your code to OpenCL C and get rid of the linear loop, a simple example how a kernel looks is here
create Buffers (allocate memory on the GPU)
create Kernel objects (one for each linearized triple loop) in C++ and link the Buffers as Kernel arguments
manually handle CPU<->GPU memory transfer (enqueueReadBuffer/enqueueWriteBuffer)
run the Kernels (enqueueNDRangeKernel)
All the arrays in this code are of complex type, and the running time of this for loop is about 1 minute. Ktemp is an array of size 141*1202*141. Could anyone help me optimize this code and reduce the running time?
complex<double> ***P1;
P1 = new complex<double>**[141];
for (i = 0; i < num_y; i++)
{
P1[i] = new complex<double> *[1202];
for (j = 0; j < tsize; j++)
{
P1[i][j] = new complex<double>[141];
}
}
for (int zz = 1; zz < 20; zz++)//in z direction
{
for (i = 0; i < 141; i++)
{
for (j = 0; j < 1202; j++)
{
for (k = 0; k < 141; k++)
{
if (Ktemp[i][j][k].real() <= 0)
{
P1[i][j][k] = 0;
}
else
{
P1[i][j][k] = excit_pfft[i][j][k] * expn[i][j][k];
}
}
}
}
excit_pfft = P1;
}
My second question is about rewriting the MATLAB function 'fftshift' in C++. I have finished the code, but it does not seem very efficient. Could anyone help me rewrite it? My code is attached below:
// Exchanges the contents of two runs of len elements.
static void fftw_shift_swap_runs(complex<double> *lhs, complex<double> *rhs, int len)
{
    for (int t = 0; t < len; t++)
    {
        const complex<double> held = lhs[t];
        lhs[t] = rhs[t];
        rhs[t] = held;
    }
}

// Rotates n slots left by (n + 1) / 2 positions using three reversals;
// swap_slots(p, q) must exchange the contents of slots p and q.
template <class SwapSlots>
static void fftw_shift_rotate_axis(int n, SwapSlots swap_slots)
{
    const int mid = (n + 1) / 2;
    const int bounds[3][2] = { {0, mid}, {mid, n}, {0, n} };
    for (int pass = 0; pass < 3; pass++)
    {
        int lo = bounds[pass][0];
        int hi = bounds[pass][1] - 1;
        for (; lo < hi; lo++, hi--)
        {
            swap_slots(lo, hi);
        }
    }
}

// In-place fftshift of the jagged a x b x c array te: along every axis of
// length n the element at index (m + (n + 1) / 2) % n moves to index m.
// Returns te.
// Works without a scratch array (so nothing is allocated or leaked), uses
// only local loop indices, and leaves an axis of length 1 untouched.
complex<double> ***fftw_shift(complex<double> ***te, int a, int b, int c)
{
    /*for the row*/
    for (int x = 0; x < a; x++)
    {
        for (int y = 0; y < b; y++)
        {
            complex<double> *row = te[x][y];
            fftw_shift_rotate_axis(c, [row](int p, int q) {
                fftw_shift_swap_runs(row + p, row + q, 1);
            });
        }
    }
    /*for the column*/
    for (int x = 0; x < a; x++)
    {
        complex<double> **rows = te[x];
        fftw_shift_rotate_axis(b, [rows, c](int p, int q) {
            fftw_shift_swap_runs(rows[p], rows[q], c);
        });
    }
    /*for the third dimension*/
    fftw_shift_rotate_axis(a, [te, b, c](int p, int q) {
        for (int y = 0; y < b; y++)
        {
            fftw_shift_swap_runs(te[p][y], te[q][y], c);
        }
    });
    return (te);
}
Since you are repeatedly multiplying by the values in expn (i.e. calculating an exponent) you can do this more efficiently using the pow function and get rid of the zz loop:
// Closed form of the z-direction loop: zz ran from 1 to 19 inclusive, i.e.
// 19 multiplications by expn, so the exponent is 19 (not 20).
for (i = 0; i < 141; i++)
{
    for (j = 0; j < 1202; j++)
    {
        for (k = 0; k < 141; k++)
        {
            if (Ktemp[i][j][k].real() <= 0)
            {
                excit_pfft[i][j][k] = 0;
            }
            else
            {
                excit_pfft[i][j][k] = excit_pfft[i][j][k] * pow(expn[i][j][k], 19);
            }
        }
    }
}
Your code also seems to have a memory leak because you assign P1 to excit_pfft, but never free the previous contents of excit_pfft. You don't need to have the P1 temporary array in any case once you get rid of the outer loop.
I'm not sure of the internals of the complex pow() function, but you can calculate the (scalar) exponent of a complex number geometrically by converting it to polar co-ordinates (angle + distance scalar), then multiplying the angle by the power and raising the distance to the power, then converting back. So it's a lot faster than repeated multiplication.
First (will probably give you a big performance boost), get rid of the pointer arrays if you know beforehand the size of your arrays and simply allocate them in the stack:
// NOTE(review): 141 * 1202 * 141 = 23,896,962 elements; at 16 bytes per
// complex<double> (typical, confirm for the target ABI) that is roughly
// 382 MB — check that the chosen storage can actually hold it.
complex<double> P1[141][1202][141];
Instead of :
// Jagged allocation: one new[] for the top level, one per i and one per
// (i, j) row.
// NOTE(review): the extents are the literals 141 / 1202 while the loops are
// bounded by num_y / tsize — assumes they are equal; confirm.
complex<double> ***P1;
P1 = new complex<double>**[141];
for (i = 0; i < num_y; i++)
{
P1[i] = new complex<double> *[1202];
for (j = 0; j < tsize; j++)
{
P1[i][j] = new complex<double>[141];
}
}
And since I don't know exactly what this does, I'm assuming this:
// Runs 19 passes (zz = 1..19). Each pass writes P1 from excit_pfft and expn,
// then makes excit_pfft point at P1.
for (int zz = 1; zz < 20; zz++)//in z direction
{
for (i = 0; i < 141; i++)
{
for (j = 0; j < 1202; j++)
{
for (k = 0; k < 141; k++)
{
// Elements whose Ktemp real part is not positive are zeroed.
if (Ktemp[i][j][k].real() <= 0)
{
P1[i][j][k] = 0;
}
else
{
P1[i][j][k] = excit_pfft[i][j][k] * expn[i][j][k];
}
}
}
}
// From the second pass on, excit_pfft and P1 are the same array, so the
// update above multiplies P1 in place.
excit_pfft = P1;
}
Could become this:
// In-place variant: P1 accumulates the product over the 19 z steps.
for (int zz = 1; zz < 20; zz++)//in z direction
{
    for (i = 0; i < 141; i++)
    {
        for (j = 0; j < 1202; j++)
        {
            for (k = 0; k < 141; k++)
            {
                if (Ktemp[i][j][k].real() <= 0)
                {
                    P1[i][j][k] = 0;
                }
                else
                {
                    // The first step must start from excit_pfft: P1 holds no
                    // input data yet, so multiplying it by itself would use
                    // values that were never set.
                    P1[i][j][k] = (zz == 1 ? excit_pfft[i][j][k] : P1[i][j][k]) * expn[i][j][k];
                }
            }
        }
    }
}
If this cannot be done, then I'll need a broader chunk of this code to analyze excit_pfft, etc.
A huge performance boost you could have is to use Worker Threads and run this last code multithreaded.
The same goes for your second question: Worker Threads should do it.
EDIT:
On second thought, the stack won't be able to handle that many variables.
I'd recommend using vector<vector<vector<complex<double> > > > instead.