How to get all solutions in CPLEX to a MIP - scheduling

UPDATE: Now I get the count of solutions, but when I try to get them out, it just gives me identical schedules. I have added the code suggested by Alex below. As an example, it gives me that there is 4 different solutions, but when I write the X matrices they are identical. Can anyone help me with this? I want the four different solutions.
I am making a schedule for a Single round robin tournament.
It is modelled as a MIP in CPLEX and in my solution pool there is at the moment four solutions with the same optimal objective value.
I want to get each of these four solutions, so they can be printed and examined indivudally. Is that possible?
// Create Parameters:
{string} G1 = ...; // Set of teams in first group
{string} G2 = ...; // Set of teams in second group
{string} Teams = G1 union G2;
tuple Match {string team1; string team2;}
{Match} Matches_G1 = {<t1,t2>| ordered t1,t2 in G1};
{Match} Matches_G2 = {<t1,t2>| ordered t1,t2 in G2};
{Match} MD1 = ...;
{Match} MD2 = ...;
{Match} MD3 = ...;
{Match} M = Matches_G1 union Matches_G2; //All matches for the two groups
{Match} matchForTeam[t in Teams] = {m| m in M : m.team1 == t || m.team2 == t}; //List of all teams
{string} S =...; //Set of stadiums
{string} T = ...; //Set of kick off times
{string} D = ...; //Set of kick off days
int K[D][S][T] = ...; //Predetermined schedule between stadium and kickoff time
float VT[M][T] = ...; //Value of match if played on Matchday M at Time T according to TV distribution
// Decision Variables:
dvar int X[M][S][T] in 0..1; // if match M is played at time T
dvar int Dist; //Object function for distribution
//////////// OBJECTIVE FUNCTION ///////////////
maximize
Dist;
//////////// CONSTRAINTS ///////////////
subject to{
Dist == sum(m in M, s in S, t in T) (VT[m][t])*X[m][s][t];
//A match can only be played one time
forall(m in M)
sum(s in S, t in T) X[m][s][t] == 1;
//Simultaneous Kickoff on matchday 3
sum(s in S)X[<"A1", "A4">][s]["22.00"] == sum(s in S)X[<"A2", "A3">][s]["22.00"];
//only one match on possible kick off times at matchday 1
forall(t in T : t != "18.00")
sum(s in S, m in MD1) X[m][s][t]==1;
//only one match on possible kick off times at matchday 2
forall(t in T : t != "18.00")
sum(s in S, m in MD2) X[m][s][t]==1;
//two matches per possible kick off times at matchday 3
forall(t in T : t in {"18.00", "22.00"})
sum(s in S, m in MD3) X[m][s][t]==2;
//One match per stadium on matchday 1
forall(s in S)
sum(m in MD1, t in T: t != "18.00") X[m][s][t] == 1;
//One match per stadium on matchday 2
forall(s in S)
sum(m in MD2, t in T: t != "18.00") X[m][s][t] == 1;
//one match per stadium on matchday 3
forall(s in S)
sum(m in MD3, t in T: t in {"18.00", "22.00"}) X[m][s][t] == 1;
//Each team can play at most two matches per stadium
forall(i in Teams, s in S)
sum(t in T, m in matchForTeam[i]) X[m][s][t] <= 2;
//Each team can play at most two matches per kickoff time
forall(i in Teams, t in T)
sum(s in S, m in matchForTeam[i]) X[m][s][t] <= 2;
forall(s in S, t in T, m in MD1)
X[m][s][t] <= K["1"][s][t];
forall(s in S, t in T, m in MD2)
X[m][s][t] <= K["2"][s][t];
forall(s in S, t in T, m in MD3)
X[m][s][t] <= K["3"][s][t];
}
execute{
writeln("schedule: ", X);
var cd = new IloOplOutputFile("resbi2.txt");
for(var m in M)
for(var s in S)
for(var t in T)
cd.writeln(thisOplModel.X[m][s][t]);
cd.close();
}
main{
cplex.solnpoolintensity=4;
cplex.solnpoolagap=0;
thisOplModel.generate();
cplex.solve();
if (cplex.populate()) {
var nsolns = cplex.solnPoolNsolns;
writeln("number of solutions: ", nsolns);
writeln("average object value: ", cplex.getSolnPoolMeanObjValue());
writeln();
for (var s=0; s<nsolns; s++) {
thisOplModel.setPoolSolution(s);
var cd = new IloOplOutputFile("resAB" +s+".txt");
cd.writeln(thisOplModel.X);
cd.close();
thisOplModel.postProcess();
}
}
}

yes in scripting you can loop into all solutions from a solution pool.
See https://github.com/AlexFleischerParis/zooopl/blob/master/zooseveral.mod
int nbKids=300;
float costBus40=500;
float costBus30=400;
dvar int+ nbBus40;
dvar int+ nbBus30;
//minimize
//costBus40*nbBus40 +nbBus30*costBus30;
subject to
{
40*nbBus40+nbBus30*30>=nbKids;
}
execute
{
writeln("nbBus40 = ",nbBus40," and nbBus30 = ",nbBus30," and the cost is ",costBus40*nbBus40 +nbBus30*costBus30);
}
main {
cplex.solnpoolintensity=4;
thisOplModel.generate();
cplex.solve();
if (cplex.populate()) {
var nsolns = cplex.solnPoolNsolns;
writeln("Number of solutions found = ",nsolns);
writeln();
for (var s=0; s<nsolns; s++) {
thisOplModel.setPoolSolution(s);
thisOplModel.postProcess();
}
}
}

Related

My segment tree update function doesn't work properly

The problem:
In this task, you need to write a regular segment tree for the sum.
Input The first line contains two integers n and m (1≤n,m≤100000), the
size of the array and the number of operations. The next line contains
n numbers a_i, the initial state of the array (0≤a_i≤10^9). The following
lines contain the description of the operations. The description of
each operation is as follows:
1 i v: set the element with index i to v (0≤i<n, 0≤v≤10^9).
2 l r:
calculate the sum of elements with indices from l to r−1 (0≤l<r≤n).
Output
For each operation of the second type print the corresponding
sum.
I'm trying to implement segment tree and all my functions works properly except for the update function:
void update(int i, int delta, int v = 0, int tl = 0, int tr = n - 1)
{
if (tl == i && tr == i)
t[v] += delta;
else if (tl <= i && i <= tr)
{
t[v] += delta;
int m = (tl + tr) / 2;
int left = 2 * v + 1;
int right = left + 1;
update(i, delta, left, tl, m);
update(i, delta, right, m + 1, tr);
}
}
I got WA on segment tree problem, meanwhile with this update function I got accepted:
void update(int i, int new_value, int v = 0, int tl = 0, int tr = n - 1)
{
if (tl == i && tr == i)
t[v] = new_value;
else if (tl <= i && i <= tr)
{
int m = (tl + tr) / 2;
int left = 2 * v + 1;
int right = left + 1;
update(i, new_value, left, tl, m);
update(i, new_value, right, m + 1, tr);
t[v] = t[left] + t[right];
}
}
I really don't understand why my first version is not working. I thought maybe I had some kind of overflowing problem and decided to change everything to long longs, but it didn't help, so the problem in the algorithm of updating itself. But it seems ok to me. For every segment that includes i I need to add sum of this segment to some delta (it can be negative, if for example I had number 5 and decided to change it to 3, then delta will be -2). So what's the problem? I really don't see it :(
There are 2 problems with your first solution:
The question expects you to do a point update. The condition (tl == i && tr == i) checks if you are the leaf node of the tree.
At leaf node, you have to actually replace the value instead of adding something into it, which you did for the second solution.
Secondly, you can only update the non-leaf nodes after all its child nodes are updated. Updating t[v] before making recursive call will anyways result into wrong answer.

How to find N points on an infinite axis so that sum of distances from M points to its nearest N is smallest?

Consider there are N houses on a single road. I have M lightpoles. Given that M < N. Distance between all adjacent houses are different. Lightpole can be placed at the house only. And I have to place all lightpoles at house so that sum of distances from each house to its nearest lightpole is smallest. How can I code this problem?
After a little research I came to know that I have to use dynamic programming for this problem. But I don't know how to approach it to this problem.
Here's a naive dynamic program with search space O(n^2 * m). Perhaps others know of another speedup? The recurrence should be clear from the function f in the code.
JavaScript code:
// We can calculate these in O(1)
// by using our prefixes (ps) and
// the formula for a subarray, (j, i),
// reaching for a pole at i:
//
// ps[i] - ps[j-1] - (A[i] - A[j-1]) * j
//
// Examples:
// A: [1,2,5,10]
// ps: [0,1,7,22]
// (2, 3) =>
// 22 - 1 - (10 - 2) * 2
// = 5
// = 10-5
// (1, 3) =>
// 22 - 0 - (10 - 1) * 1
// = 13
// = 10-5 + 10-2
function sumParts(A, j, i, isAssigned){
let result = 0
for (let k=j; k<=i; k++){
if (isAssigned)
result += Math.min(A[k] - A[j], A[i] - A[k])
else
result += A[k] - A[j]
}
return result
}
function f(A, ps, i, m, isAssigned){
if (m == 1 && isAssigned)
return ps[i]
const start = m - (isAssigned ? 2 : 1)
const _m = m - (isAssigned ? 1 : 0)
let result = Infinity
for (let j=start; j<i; j++)
result = Math.min(
result,
sumParts(A, j, i, isAssigned)
+ f(A, ps, j, _m, true)
)
return result
}
var A = [1, 2, 5, 10]
var m = 2
var ps = [0]
for (let i=1; i<A.length; i++)
ps[i] = ps[i-1] + (A[i] - A[i-1]) * i
var result = Math.min(
f(A, ps, A.length - 1, m, true),
f(A, ps, A.length - 1, m, false))
console.log(`A: ${ JSON.stringify(A) }`)
console.log(`ps: ${ JSON.stringify(ps) }`)
console.log(`m: ${ m }`)
console.log(`Result: ${ result }`)
I got you covered bud. I will write to explain the dynamic programming algorithm first and if you are not able to code it, let me know.
A-> array containing points so that A[i]-A[i-1] will be the distance between A[i] and A[i-1]. A[0] is the first point. When you are doing memoization top-down, you will have to handle cases when you would want to place a light pole at the current house or you would want to place it at a lower index. If you place it now, you recurse with one less light pole available and calculate the sum of distances with previous houses. You handle the base case when you are not left with any ligh pole or you are done with all the houses.

Computing Rand error efficiently

I'm trying to compare two image segmentations to one another.
In order to do so, I transform each image into a vector of unsigned short values, and calculate the rand error,
according to the following formula:
where:
Here is my code (the rand error calculation part):
cv::Mat im1,im2;
//code for acquiring data for im1, im2
//code for copying im1(:)->v1, im2(:)->v2
int N = v1.size();
double a = 0;
double b = 0;
for (int i = 0; i <N; i++)
{
for (int j = 0; j < i; j++)
{
unsigned short l1 = v1[i];
unsigned short l2 = v1[j];
unsigned short gt1 = v2[i];
unsigned short gt2 = v2[j];
if (l1 == l2 && gt1 == gt2)
{
a++;
}
else if (l1 != l2 && gt1 != gt2)
{
b++;
}
}
}
double NPairs = (double)(N*N)/2;
double res = (a + b) / NPairs;
My problem is that length of each vector is 307,200.
Therefore the total number of iterations is 47,185,920,000.
It makes the running time of the entire process is very slow (a few minutes to compute).
Do you have any idea how can I improve it?
Thanks!
Let's assume that we have P distinct labels in the first image and Q distinct labels in the second image. The key observation for efficient computation of Rand error, also called Rand index, is that the number of distinct labels is usually much smaller than the number of pixels (i.e. P, Q << n).
Step 1
First, pre-compute the following auxiliary data:
the vector s1, with size P, such that s1[p] is the number of pixel positions i with v1[i] = p.
the vector s2, with size Q, such that s2[q] is the number of pixel positions i with v2[i] = q.
the matrix M, with size P x Q, such that M[p][q] is the number of pixel positions i with v1[i] = p and v2[i] = q.
The vectors s1, s2 and the matrix M can be computed by passing once through the input images, i.e. in O(n).
Step 2
Once s1, s2 and M are available, a and b can be computed efficiently:
This holds because each pair of pixels (i, j) that we are interested in has the property that both its pixels have the same label in image 1, i.e. v1[i] = v1[j] = p; and the same label in image 2, i.e. v2[i] = v2[ j ] = q. Since v1[i] = p and v2[i] = q, the pixel i will contribute to the bin M[p][q], and the same does the pixel j. Therefore, for each combination of labels p and q we need to consider the number of pairs of pixels that fall into the M[p][q] bin, and then to sum them up for all possible labels p and q.
Similarly, for b we have:
Here, we are counting how many pairs are formed with one of the pixels falling into the bin M[p][q]. Such a pixel can form a good pair with each pixel that is falling into a bin M[p'][q'], with the condition that p != p' and q != q'. Summing over all such M[p'][q'] is equivalent to subtracting from the sum over the entire matrix M (this sum is n) the sum on row p (i.e. s1[p]) and the sum on the column q (i.e. s2[q]). However, after subtracting the row and column sums, we have subtracted M[p][q] twice, and this is why it is added at the end of the expression above. Finally, this is divided by 2 because each pair was counted twice (once for each of its two constituent pixels as being part of a bin M[p][q] in the argument above).
The Rand error (Rand index) can now be computed as:
The overall complexity of this method is O(n) + O(PQ), with the first term usually being the dominant one.
After reading your comments, I tried the following approach:
calculate the intersections for each possible pair of values.
use the intersection results to calculate the error.
I performed the calculation straight on the cv::Mat objects, without converting them into std::vector objects. That gave me the ability to use opencv functions and achieve a faster runtime.
Code:
double a = 0, b = 0; //init variables
//unique function finds all the unique value of a matrix, with an optional input mask
std::set<unsigned short> m1Vals = unique(mat1);
for (unsigned short s1 : m1Vals)
{
cv::Mat mask1 = (mat1 == s1);
std::set<unsigned short> m2ValsInRoi = unique(mat2, mat1==s1);
for (unsigned short s2 : m2ValsInRoi)
{
cv::Mat mask2 = mat2 == s2;
cv::Mat andMask = mask1 & mask2;
double andVal = cv::countNonZero(andMask);
a += (andVal*(andVal - 1)) / 2;
b += ((double)cv::countNonZero(andMask) * (double)cv::countNonZero(~mask1 & ~mask2)) / 2;
}
}
double NPairs = (double)(N*(N-1)) / 2;
double res = (a + b) / NPairs;
The runtime is now reasonable (only a few milliseconds vs a few minutes), and the output is the same as the code above.
Example:
I ran the code on the following matrices:
//mat1 = [1 1 2]
cv::Mat mat1 = cv::Mat::ones(cv::Size(3, 1), CV_16U);
mat1.at<ushort>(cv::Point(2, 0)) = 2;
//mat2 = [1 2 1]
cv::Mat mat2 = cv::Mat::ones(cv::Size(3, 1), CV_16U);
mat2.at<ushort>(cv::Point(1, 0)) = 2;
In this case a = 0 (no matching pairs correspondence), and b=1(one matching pair for i=2,j=3). The algorithm result:
a = 0
b = 1
NPairs = 3
result = 0.3333333
Thank you all for your help!

Spark - Not all data processes in JavaRDD complex object

I have the following code that reads in a text file of 5 rows CSV floats:
0.014, 0.035, 0.030, 0.018, 0.023, 0.027, 0.035, 0.036, -0.009, -0.013, 0.026, 0.042
0.032, 0.055, -0.036, 0.052, 0.047, 0.034, 0.063, 0.048, 0.025, 0.040, 0.036, -0.017
0.054, 0.056, 0.048, -0.007, 0.053, 0.036, 0.017, 0.047, 0.019, 0.017, 0.040, 0.032
0.038, 0.062, -0.037, 0.050, 0.065, -0.043, 0.062, 0.034, 0.035, 0.056, 0.057, 0.025
0.049, 0.067, -0.039, 0.051, 0.049, 0.037, 0.055, 0.025, 0.052, 0.020, 0.045, 0.040
The code loads in the data using Spark's JavaSparkContext textFile().
JavaSparkContext sc = new JavaSparkContext(master, "basicportopt", System.getenv("SPARK_HOME"), System.getenv("JARS"));
JavaRDD<String> lines = sc.textFile(".../src/main/Resources/portfolio.txt");
Next the data is loaded into a JavaRDD type as a List of Lists of type Double:
JavaRDD<List<List<Double>>> inputData = lines.map(new Function<String, List<List<Double>>>() {
#Override
public List<List<Double>> call(String s) {
List<List<Double>> dd = new ArrayList<List<Double>>();
double d = 0;
List<Double> myDoubles = new ArrayList<Double>();
for (String value : s.split(",\\s*")) {
d = Double.parseDouble(value);
myDoubles.add(d);
}
dd.add(myDoubles);
return dd;
}
});
Finally, the idea is that the data will be manipulated to produce some calculations to produce some summary results in the following algorithm:
inputData.foreach(new VoidFunction<List<List<Double>>>() {
public void call(List<List<Double>> col) {
System.out.println("Starting with first row...");
ArrayList l = (ArrayList) col.get(0);
for (List<Double> m : col) {
Double sum1 = 0.0;
for (Double d : m) {
sum1 += d;
}
Double avg1 = sum1 / m.size();
System.out.println("The avg of the row \"m\" being worked with: " + avg1);
System.out.println("Crunch the first fow with the other rows including self.");
for (List<Double> n : col) {
Double sum2 = 0.0;
for (Double d : n) {
sum2 += d;
}
Double avg2 = sum1 / m.size();
System.out.println("The avg of the row \"n\" being worked with: " + avg2);
Double xy = 0.0;
for (int index = 0; index < m.size(); index++) {
xy += m.get(index) * n.get(index);
}
xy -= (avg1 * avg2);
System.out.println("Resulting covariant: " + xy);
}
}
}
});
However I would expect to get 25 results I only get 5 results, because in the line:
for (List<Double> m : col) {...}
I would expect "col" to have 5 elements but stepping through the debugger shows only 1 element.
But using the collect() method:
List<List<List<Double>>> cols = inputData.collect();
shows 5 elements.
Why does the foreach() method not contain the 5 elements?

Generating a random DAG

I am solving a problem on directed acyclic graph.
But I am having trouble testing my code on some directed acyclic graphs. The test graphs should be large, and (obviously) acyclic.
I tried a lot to write code for generating acyclic directed graphs. But I failed every time.
Is there some existing method to generate acyclic directed graphs I could use?
I cooked up a C program that does this. The key is to 'rank' the nodes, and only draw edges from lower ranked nodes to higher ranked ones.
The program I wrote prints in the DOT language.
Here is the code itself, with comments explaining what it means:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MIN_PER_RANK 1 /* Nodes/Rank: How 'fat' the DAG should be. */
#define MAX_PER_RANK 5
#define MIN_RANKS 3 /* Ranks: How 'tall' the DAG should be. */
#define MAX_RANKS 5
#define PERCENT 30 /* Chance of having an Edge. */
int main (void)
{
int i, j, k,nodes = 0;
srand (time (NULL));
int ranks = MIN_RANKS
+ (rand () % (MAX_RANKS - MIN_RANKS + 1));
printf ("digraph {\n");
for (i = 0; i < ranks; i++)
{
/* New nodes of 'higher' rank than all nodes generated till now. */
int new_nodes = MIN_PER_RANK
+ (rand () % (MAX_PER_RANK - MIN_PER_RANK + 1));
/* Edges from old nodes ('nodes') to new ones ('new_nodes'). */
for (j = 0; j < nodes; j++)
for (k = 0; k < new_nodes; k++)
if ( (rand () % 100) < PERCENT)
printf (" %d -> %d;\n", j, k + nodes); /* An Edge. */
nodes += new_nodes; /* Accumulate into old node set. */
}
printf ("}\n");
return 0;
}
And here is the graph generated from a test run:
The answer to https://mathematica.stackexchange.com/questions/608/how-to-generate-random-directed-acyclic-graphs applies: if you have a adjacency matrix representation of the edges of your graph, then if the matrix is lower triangular, it's a DAG by necessity.
A similar approach would be to take an arbitrary ordering of your nodes, and then consider edges from node x to y only when x < y. That constraint should also get your DAGness by construction. Memory comparison would be one arbitrary way to order your nodes if you're using structs to represent nodes.
Basically, the pseudocode would be something like:
for(i = 0; i < N; i++) {
for (j = i+1; j < N; j++) {
maybePutAnEdgeBetween(i, j);
}
}
where N is the number of nodes in your graph.
The pseudocode suggests that the number of potential DAGs, given N nodes, is
2^(n*(n-1)/2),
since there are
n*(n-1)/2
ordered pairs ("N choose 2"), and we can choose either to have the edge between them or not.
So, to try to put all these reasonable answers together:
(In the following, I used V for the number of vertices in the generated graph, and E for the number of edges, and we assume that E ≤ V(V-1)/2.)
Personally, I think the most useful answer is in a comment, by Flavius, who points at the code at http://condor.depaul.edu/rjohnson/source/graph_ge.c. That code is really simple, and it's conveniently described by a comment, which I reproduce:
To generate a directed acyclic graph, we first
generate a random permutation dag[0],...,dag[v-1].
(v = number of vertices.)
This random permutation serves as a topological
sort of the graph. We then generate random edges of the
form (dag[i],dag[j]) with i < j.
In fact, what the code does is generate the request number of edges by repeatedly doing the following:
generate two numbers in the range [0, V);
reject them if they're equal;
swap them if the first is larger;
reject them if it has generated them before.
The problem with this solution is that as E gets closes to the maximum number of edges V(V-1)/2, then the algorithm becomes slower and slower, because it has to reject more and more edges. A better solution would be to make a vector of all V(V-1)/2 possible edges; randomly shuffle it; and select the first (requested edges) edges in the shuffled list.
The reservoir sampling algorithm lets us do this in space O(E), since we can deduce the endpoints of the kth edge from the value of k. Consequently, we don't actually have to create the source vector. However, it still requires O(V2) time.
Alternatively, one can do a Fisher-Yates shuffle (or Knuth shuffle, if you prefer), stopping after E iterations. In the version of the FY shuffle presented in Wikipedia, this will produce the trailing entries, but the algorithm works just as well backwards:
// At the end of this snippet, a consists of a random sample of the
// integers in the half-open range [0, V(V-1)/2). (They still need to be
// converted to pairs of endpoints).
vector<int> a;
int N = V * (V - 1) / 2;
for (int i = 0; i < N; ++i) a.push_back(i);
for (int i = 0; i < E; ++i) {
int j = i + rand(N - i);
swap(a[i], a[j]);
a.resize(E);
This requires only O(E) time but it requires O(N2) space. In fact, this can be improved to O(E) space with some trickery, but an SO code snippet is too small to contain the result, so I'll provide a simpler one in O(E) space and O(E log E) time. I assume that there is a class DAG with at least:
class DAG {
// Construct an empty DAG with v vertices
explicit DAG(int v);
// Add the directed edge i->j, where 0 <= i, j < v
void add(int i, int j);
};
Now here goes:
// Return a randomly-constructed DAG with V vertices and and E edges.
// It's required that 0 < E < V(V-1)/2.
template<typename PRNG>
DAG RandomDAG(int V, int E, PRNG& prng) {
using dist = std::uniform_int_distribution<int>;
// Make a random sample of size E
std::vector<int> sample;
sample.reserve(E);
int N = V * (V - 1) / 2;
dist d(0, N - E); // uniform_int_distribution is closed range
// Random vector of integers in [0, N-E]
for (int i = 0; i < E; ++i) sample.push_back(dist(prng));
// Sort them, and make them unique
std::sort(sample.begin(), sample.end());
for (int i = 1; i < E; ++i) sample[i] += i;
// Now it's a unique sorted list of integers in [0, N-E+E-1]
// Randomly shuffle the endpoints, so the topological sort
// is different, too.
std::vector<int> endpoints;
endpoints.reserve(V);
for (i = 0; i < V; ++i) endpoints.push_back(i);
std::shuffle(endpoints.begin(), endpoints.end(), prng);
// Finally, create the dag
DAG rv;
for (auto& v : sample) {
int tail = int(0.5 + sqrt((v + 1) * 2));
int head = v - tail * (tail - 1) / 2;
rv.add(head, tail);
}
return rv;
}
You could generate a random directed graph, and then do a depth-first search for cycles. When you find a cycle, break it by deleting an edge.
I think this is worst case O(VE). Each DFS takes O(V), and each one removes at least one edge (so max E)
If you generate the directed graph by uniformly random selecting all V^2 possible edges, and you DFS in random order and delete a random edge - this would give you a uniform distribution (or at least close to it) over all possible dags.
A very simple approach is:
Randomly assign edges by iterating over the indices of a lower diagonal matrix (as suggested by a link above: https://mathematica.stackexchange.com/questions/608/how-to-generate-random-directed-acyclic-graphs)
This will give you a DAG with possibly more than one component. You can use a Disjoint-set data structure to give you the components that can then be merged by creating edges between the components.
Disjoint-sets are described here: https://en.wikipedia.org/wiki/Disjoint-set_data_structure
Edit: I initially found this post while I was working with a scheduling problem named flexible job shop scheduling problem with sequencing flexibility where jobs (the order in which operations are processed) are defined by directed acyclic graphs. The idea was to use an algorithm to generate multiple random directed graphs (jobs) and create instances of the scheduling problem to test my algorithms. The code at the end of this post is a basic version of the one I used to generate the instances. The instance generator can be found here.
I translated to Python and integrated some functionalities to create a transitive set of the random DAG. In this way, the graph generated has the minimum number of edges with the same reachability.
The transitive graph can be visualized at http://dagitty.net/dags.html by pasting the output in Model code (in the right).
Python version of the algorithm
import random
class Graph:
nodes = []
edges = []
removed_edges = []
def remove_edge(self, x, y):
e = (x,y)
try:
self.edges.remove(e)
# print("Removed edge %s" % str(e))
self.removed_edges.append(e)
except:
return
def Nodes(self):
return self.nodes
# Sample data
def __init__(self):
self.nodes = []
self.edges = []
def get_random_dag():
MIN_PER_RANK = 1 # Nodes/Rank: How 'fat' the DAG should be
MAX_PER_RANK = 2
MIN_RANKS = 6 # Ranks: How 'tall' the DAG should be
MAX_RANKS = 10
PERCENT = 0.3 # Chance of having an Edge
nodes = 0
ranks = random.randint(MIN_RANKS, MAX_RANKS)
adjacency = []
for i in range(ranks):
# New nodes of 'higher' rank than all nodes generated till now
new_nodes = random.randint(MIN_PER_RANK, MAX_PER_RANK)
# Edges from old nodes ('nodes') to new ones ('new_nodes')
for j in range(nodes):
for k in range(new_nodes):
if random.random() < PERCENT:
adjacency.append((j, k+nodes))
nodes += new_nodes
# Compute transitive graph
G = Graph()
# Append nodes
for i in range(nodes):
G.nodes.append(i)
# Append adjacencies
for i in range(len(adjacency)):
G.edges.append(adjacency[i])
N = G.Nodes()
for x in N:
for y in N:
for z in N:
if (x, y) != (y, z) and (x, y) != (x, z):
if (x, y) in G.edges and (y, z) in G.edges:
G.remove_edge(x, z)
# Print graph
for i in range(nodes):
print(i)
print()
for value in G.edges:
print(str(value[0]) + ' ' + str(value[1]))
get_random_dag()
Bellow, you may see in the figure the random DAG with many redundant edges generated by the Python code above.
I adapted the code to generate the same graph (same reachability) but with the least possible number of edges. This is also called transitive reduction.
def get_random_dag():
MIN_PER_RANK = 1 # Nodes/Rank: How 'fat' the DAG should be
MAX_PER_RANK = 3
MIN_RANKS = 15 # Ranks: How 'tall' the DAG should be
MAX_RANKS = 20
PERCENT = 0.3 # Chance of having an Edge
nodes = 0
node_counter = 0
ranks = random.randint(MIN_RANKS, MAX_RANKS)
adjacency = []
rank_list = []
for i in range(ranks):
# New nodes of 'higher' rank than all nodes generated till now
new_nodes = random.randint(MIN_PER_RANK, MAX_PER_RANK)
list = []
for j in range(new_nodes):
list.append(node_counter)
node_counter += 1
rank_list.append(list)
print(rank_list)
# Edges from old nodes ('nodes') to new ones ('new_nodes')
if i > 0:
for j in rank_list[i - 1]:
for k in range(new_nodes):
if random.random() < PERCENT:
adjacency.append((j, k+nodes))
nodes += new_nodes
for i in range(nodes):
print(i)
print()
for edge in adjacency:
print(str(edge[0]) + ' ' + str(edge[1]))
print()
print()
Result:
Create a graph with n nodes and an edge between each pair of node n1 and n2 if n1 != n2 and n2 % n1 == 0.
I recently tried re-implementing the accepted answer and found that it is indeterministic. If you don't enforce the min_per_rank parameter, you could end up with a graph with 0 nodes.
To prevent this, I wrapped the for loops in a function and then checked to make sure that, after each rank, that min_per_rank was satisfied. Here's the JavaScript implementation:
https://github.com/karissa/random-dag
And some pseudo-C code that would replace the accepted answer's main loop.
int pushed = 0
int addRank (void)
{
for (j = 0; j < nodes; j++)
for (k = 0; k < new_nodes; k++)
if ( (rand () % 100) < PERCENT)
printf (" %d -> %d;\n", j, k + nodes); /* An Edge. */
if (pushed < min_per_rank) return addRank()
else pushed = 0
return 0
}
Generating a random DAG which might not be connected
Here's an simple algorithm for generating a random DAG that might not be connected.
const randomDAG = (x, n) => {
const length = n * (n - 1) / 2;
const dag = new Array(length);
for (let i = 0; i < length; i++) {
dag[i] = Math.random() < x ? 1 : 0;
}
return dag;
};
const dagIndex = (n, i, j) => n * i + j - (i + 1) * (i + 2) / 2;
const dagToDot = (n, dag) => {
let dot = "digraph {\n";
for (let i = 0; i < n; i++) {
dot += ` ${i};\n`;
for (let j = i + 1; j < n; j++) {
const k = dagIndex(n, i, j);
if (dag[k]) dot += ` ${i} -> ${j};\n`;
}
}
return dot + "}";
};
const randomDot = (x, n) => dagToDot(n, randomDAG(x, n));
new Viz().renderSVGElement(randomDot(0.3, 10)).then(svg => {
document.body.appendChild(svg);
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/viz.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/full.render.js"></script>
If you run this code snippet a couple of times, you might see a DAG which is not connected.
So, how does this code work?
A directed acyclic graph (DAG) is just a topologically sorted undirected graph. An undirected graph of n vertices can have a maximum of n * (n - 1) / 2 edges, not counting repeated edges or edges from a vertex to itself. Now, you can only have an edge from a lower vertex to a higher vertex. Hence, the direction of all the edges are predetermined.
This means that you can represent the entire DAG using a one dimensional array of n * (n - 1) / 2 edge weights. An edge weight of 0 means that the edge is absent. Hence, we just create a random array of zeros or ones, and that's our random DAG.
An edge from vertex i to vertex j in a DAG of n vertices, where i < j, has an edge weight at index k where k = n * i + j - (i + 1) * (i + 2) / 2.
Generating a connected DAG
Once you generate a random DAG, you can check if it's connected using the following function.
const isConnected = (n, dag) => {
const reached = new Array(n).fill(false);
reached[0] = true;
const queue = [0];
while (queue.length > 0) {
const x = queue.shift();
for (let i = 0; i < n; i++) {
if (i === n || reached[i]) continue;
const j = i < x ? dagIndex(n, i, x) : dagIndex(n, x, i);
if (dag[j] === 0) continue;
reached[i] = true;
queue.push(i);
}
}
return reached.every(x => x); // return true if every vertex was reached
};
If it's not connected then its complement will always be connected.
const complement = dag => dag.map(x => x ? 0 : 1);
const randomConnectedDAG = (x, n) => {
const dag = randomDAG(x, n);
return isConnected(n, dag) ? dag : complement(dag);
};
Note that if we create a random DAG with 30% edges then its complement will have 70% edges. Hence, the only safe value for x is 50%. However, if you care about connectivity more than the percentage of edges then this shouldn't be a deal breaker.
Finally, putting it all together.
const randomDAG = (x, n) => {
const length = n * (n - 1) / 2;
const dag = new Array(length);
for (let i = 0; i < length; i++) {
dag[i] = Math.random() < x ? 1 : 0;
}
return dag;
};
const dagIndex = (n, i, j) => n * i + j - (i + 1) * (i + 2) / 2;
const isConnected = (n, dag) => {
const reached = new Array(n).fill(false);
reached[0] = true;
const queue = [0];
while (queue.length > 0) {
const x = queue.shift();
for (let i = 0; i < n; i++) {
if (i === n || reached[i]) continue;
const j = i < x ? dagIndex(n, i, x) : dagIndex(n, x, i);
if (dag[j] === 0) continue;
reached[i] = true;
queue.push(i);
}
}
return reached.every(x => x); // return true if every vertex was reached
};
const complement = dag => dag.map(x => x ? 0 : 1);
const randomConnectedDAG = (x, n) => {
const dag = randomDAG(x, n);
return isConnected(n, dag) ? dag : complement(dag);
};
const dagToDot = (n, dag) => {
let dot = "digraph {\n";
for (let i = 0; i < n; i++) {
dot += ` ${i};\n`;
for (let j = i + 1; j < n; j++) {
const k = dagIndex(n, i, j);
if (dag[k]) dot += ` ${i} -> ${j};\n`;
}
}
return dot + "}";
};
const randomConnectedDot = (x, n) => dagToDot(n, randomConnectedDAG(x, n));
new Viz().renderSVGElement(randomConnectedDot(0.3, 10)).then(svg => {
document.body.appendChild(svg);
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/viz.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/full.render.js"></script>
If you run this code snippet a couple of times, you may see a DAG with a lot more edges than others.
Generating a connected DAG with a certain percentage of edges
If you care about both connectivity and having a certain percentage of edges then you can use the following algorithm.
Start with a fully connected graph.
Randomly remove edges.
After removing an edge, check if the graph is still connected.
If it's no longer connected then add that edge back.
It should be noted that this algorithm is not as efficient as the previous method.
const randomDAG = (x, n) => {
const length = n * (n - 1) / 2;
const dag = new Array(length).fill(1);
for (let i = 0; i < length; i++) {
if (Math.random() < x) continue;
dag[i] = 0;
if (!isConnected(n, dag)) dag[i] = 1;
}
return dag;
};
const dagIndex = (n, i, j) => n * i + j - (i + 1) * (i + 2) / 2;
const isConnected = (n, dag) => {
const reached = new Array(n).fill(false);
reached[0] = true;
const queue = [0];
while (queue.length > 0) {
const x = queue.shift();
for (let i = 0; i < n; i++) {
if (i === n || reached[i]) continue;
const j = i < x ? dagIndex(n, i, x) : dagIndex(n, x, i);
if (dag[j] === 0) continue;
reached[i] = true;
queue.push(i);
}
}
return reached.every(x => x); // return true if every vertex was reached
};
const dagToDot = (n, dag) => {
let dot = "digraph {\n";
for (let i = 0; i < n; i++) {
dot += ` ${i};\n`;
for (let j = i + 1; j < n; j++) {
const k = dagIndex(n, i, j);
if (dag[k]) dot += ` ${i} -> ${j};\n`;
}
}
return dot + "}";
};
const randomDot = (x, n) => dagToDot(n, randomDAG(x, n));
new Viz().renderSVGElement(randomDot(0.3, 10)).then(svg => {
document.body.appendChild(svg);
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/viz.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/viz.js/2.1.2/full.render.js"></script>
Hope that helps.
To test algorithms I generated random graphs based on node layers. This is the Python script (also print the adjacency list). You can change the nodes connection probability percentages or add layers to have a slightly different or "taller" graphs:
# Weighted DAG generator by forward layers
import argparse
import random
parser = argparse.ArgumentParser("dag_gen2")
parser.add_argument(
"--layers",
help="DAG forward layers. Default=5",
type=int,
default=5,
)
args = parser.parse_args()
layers = [[] for _ in range(args.layers)]
edges = {}
node_index = -1
print(f"Creating {len(layers)} layers graph")
# Random horizontal connections -low probability-
def random_horizontal(layer):
for node1 in layer:
# Avoid cycles
for node2 in filter(
lambda n2: node1 != n2 and node1 not in map(lambda el: el[0], edges[n2]),
layer,
):
if random.randint(0, 100) < 10:
w = random.randint(1, 10)
edges[node1].append((node2, w))
# Connect two layers
def connect(layer1, layer2):
random_horizontal(layer1)
for node1 in layer1:
for node2 in layer2:
if random.randint(0, 100) < 30:
w = random.randint(1, 10)
edges[node1].append((node2, w))
# Start nodes 1 to 3
start_nodes = random.randint(1, 3)
start_layer = []
for sn in range(start_nodes + 1):
node_index += 1
start_layer.append(node_index)
# Gen nodes
for layer in layers:
nodes = random.randint(2, 5)
for n in range(nodes):
node_index += 1
layer.append(node_index)
# Connect all
layers.insert(0, start_layer)
for layer in layers:
for node in layer:
edges[node] = []
for i, layer in enumerate(layers[:-1]):
connect(layer, layers[i + 1])
# Print in DOT language
print("digraph {")
for node_key in [node_key for node_key in edges.keys() if len(edges[node_key]) > 0]:
for node_dst, weight in edges[node_key]:
print(f" {node_key} -> {node_dst} [label={weight}];")
print("}")
print("---- Adjacency list ----")
print(edges)