I have found this code that solves the Knight's Tour problem.
If I, for example, want to solve a board of size 800x800 I get the following error:
Exception thrown at 0x00007FF6345D3778 in test.exe: 0xC00000FD: Stack overflow (parameters: 0x0000000000000001, 0x00000082140C3000).
Unhandled exception at 0x00007FF6345D3778 in test.exe: 0xC00000FD: Stack overflow (parameters: 0x0000000000000001, 0x00000082140C3000).
How can I avoid this error? How should I change the Board class such that it can solve a board this big?
I want to be able to write: Board<800> b6 for example.
PS. This code works for small boards.
Thank you very much.
class Board
{
public:
array<pair<int, int>, 8> moves;
array<array<int, N>, N> data;
Board()
{
moves[0] = make_pair(2, 1);
moves[1] = make_pair(1, 2);
moves[2] = make_pair(-1, 2);
moves[3] = make_pair(-2, 1);
moves[4] = make_pair(-2, -1);
moves[5] = make_pair(-1, -2);
moves[6] = make_pair(1, -2);
moves[7] = make_pair(2, -1);
}
array<int, 8> sortMoves(int x, int y) const
{
array<tuple<int, int>, 8> counts;
for (int i = 0; i < 8; ++i)
{
int dx = get<0>(moves[i]);
int dy = get<1>(moves[i]);
int c = 0;
for (int j = 0; j < 8; ++j)
{
int x2 = x + dx + get<0>(moves[j]);
int y2 = y + dy + get<1>(moves[j]);
if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N)
continue;
if (data[y2][x2] != 0)
continue;
c++;
}
counts[i] = make_tuple(c, i);
}
sort(counts.begin(), counts.end());
array<int, 8> out;
for (int i = 0; i < 8; ++i)
out[i] = get<1>(counts[i]);
return out;
}
void solve(string start)
{
for (int v = 0; v < N; ++v)
for (int u = 0; u < N; ++u)
data[v][u] = 0;
int x0 = start[0] - 'a';
int y0 = N - (start[1] - '0');
data[y0][x0] = 1;
array<tuple<int, int, int, array<int, 8>>, N*N> order;
order[0] = make_tuple(x0, y0, 0, sortMoves(x0, y0));
int n = 0;
while (n < N*N - 1)
{
int x = get<0>(order[n]);
int y = get<1>(order[n]);
bool ok = false;
for (int i = get<2>(order[n]); i < 8; ++i)
{
int dx = moves[get<3>(order[n])[i]].first;
int dy = moves[get<3>(order[n])[i]].second;
if (x + dx < 0 || x + dx >= N || y + dy < 0 || y + dy >= N)
continue;
if (data[y + dy][x + dx] != 0)
continue;
++n;
get<2>(order[n]) = i + 1;
data[y + dy][x + dx] = n + 1;
order[n] = make_tuple(x + dx, y + dy, 0, sortMoves(x + dx, y + dy));
ok = true;
break;
}
if (!ok) // Failed. Backtrack.
{
data[y][x] = 0;
--n;
}
}
}
template<int N>
friend ostream& operator<<(ostream &out, const Board<N> &b);
};
template<int N>
ostream& operator<<(ostream &out, const Board<N> &b)
{
for (int v = 0; v < N; ++v)
{
for (int u = 0; u < N; ++u)
{
if (u != 0) out << ",";
out << setw(3) << b.data[v][u];
}
out << endl;
}
return out;
}
int main{
Board<800> b2;
b2.solve("b5");
cout << b2 << endl;
return 0
}
array<array<int, N>, N> data with N being 800 requires around 2.5 MB of memory.
Board<800> b2 is allocated on the stack.
Depending on the platform the default stack size is around 2-8MB. It looks like you are on windows where the stack size is usually 2MB. As your array is larger than the size of the stack you get a stack overflow.
You need to allocate Board on the heap. e.g.:
int main{
auto b2 = std::make_unique<Board<800>>();
b2->solve("b5");
cout << *b2 << endl;
return 0
}
In the solve function you are also allocating order on the stack. This should be changed to something like this in order to allocate it on the heap:
auto orderPointer = std::make_unique<array<tuple<int, int, int, array<int, 8>>, N*N>>();
// dereference the pointer to make array indexes easier
auto& order = *orderPointer;
Related
I am trying to multithread a raytracer, and am trying to write a function to pass into the thread. The function throws a stack overflow error, and when I try to heap allocate, all of a sudden I can't write to the array. Any tips?
std::array<std::array<std::array<int,3>,400>,225> idk(std::array<std::array<std::array<int,3>,400>,225> pc,const int img_start,const camera &cam,const hittableList &world,const int imageWidth,const int img_end,const int maxDepth,const int samplesPerPixel,bool fml){
auto pic = std::array<std::array<std::array<int,3>,400>,225> {};
for (int i = img_start; i >= img_end; --i) {
color pixelColour(0, 0, 0);
for (int j = 0; j < 400; j++) {
for (int k = 0; k < samplesPerPixel; k++) {
double u = double(j + randomDouble()) / (imageWidth - 1);
double v = double(i + randomDouble()) / (img_start - 1);
ray r = cam.get_ray(u, v);
pixelColour += rayColor(r, world, maxDepth);
}
pic[i][j] = writeColour(pixelColour, maxDepth);
}
}
fml = true;
return pic;
}
std::array<std::array<std::array<int,3>,400>,225>* idk(std::array<std::array<std::array<int,3>,400>,225> pc,const int img_start,const camera &cam,const hittableList &world,const int imageWidth,const int img_end,const int maxDepth,const int samplesPerPixel,bool fml){
auto* pic = new std::array<std::array<std::array<int,3>,400>,225> {};
for (int i = img_start; i >= img_end; --i) {
color pixelColour(0, 0, 0);
for (int j = 0; j < 400; j++) {
for (int k = 0; k < samplesPerPixel; k++) {
double u = double(j + randomDouble()) / (imageWidth - 1);
double v = double(i + randomDouble()) / (img_start - 1);
ray r = cam.get_ray(u, v);
pixelColour += rayColor(r, world, maxDepth);
}
//this line doesnt work the = is underlined and says: no viable overloas '='
pic[i][j] = writeColour(pixelColour, maxDepth);
}
}
fml = true;
return pic;
}
The C++ standard library requires that the payload of std::array has the same storage duration as the std::array itself.
In other words, you are attempting to place 400 * 255 * 3 ints on your stack, and that is above a run-time limit.
A fix is to use a multidimensional array from a third party library like Boost.MultiArray (part of www.boost.org).
I'm trying to solve https://open.kattis.com/problems/rootedsubtrees and part of the solution requires finding the minimum distance between any 2 nodes on the tree. To do this, I'm using Lowest Common Ancestor as a subroutine. Part of my LCA code uses a DFS to traverse the tree. Somehow, running this code on a line graph of size 200000 leads to a segmentation fault during the DFS section of the code.
#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
typedef vector<int> vi;
#define fast_cin() \
ios_base::sync_with_stdio(false); \
cin.tie(NULL); \
cout.tie(NULL);
int n, q, idx;
vector<int> adjlist[200009];
vector<int> L, E,
H; // depth at traversal index, node at traversal index, first traversal index of node
void dfs(int cur, int depth) {
cout << "dfs " << cur << " " << idx << endl;
H[cur] = idx;
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
dfs(nxt, depth + 1);
E[idx] = cur; // backtrack to current node
L[idx++] = depth;
}
}
class SparseTable { // OOP style
private:
vi A, P2, L2;
vector<vi> SpT; // the Sparse Table
public:
SparseTable() {} // default constructor
SparseTable(vi &initialA) { // pre-processing routine
A = initialA;
int n = (int)A.size();
int L2_n = (int)log2(n) + 1;
P2.assign(L2_n, 0);
L2.assign(1 << L2_n, 0);
for (int i = 0; i <= L2_n; ++i) {
P2[i] = (1 << i); // to speed up 2^i
L2[(1 << i)] = i; // to speed up log_2(i)
}
for (int i = 2; i < P2[L2_n]; ++i)
if (L2[i] == 0) L2[i] = L2[i - 1]; // to fill in the blanks
// the initialization phase
SpT = vector<vi>(L2[n] + 1, vi(n));
for (int j = 0; j < n; ++j) SpT[0][j] = j; // RMQ of sub array [j..j]
// the two nested loops below have overall time complexity = O(n log n)
for (int i = 1; P2[i] <= n; ++i) // for all i s.t. 2^i <= n
for (int j = 0; j + P2[i] - 1 < n; ++j) { // for all valid j
int x = SpT[i - 1][j]; // [j..j+2^(i-1)-1]
int y = SpT[i - 1][j + P2[i - 1]]; // [j+2^(i-1)..j+2^i-1]
SpT[i][j] = A[x] <= A[y] ? x : y;
}
}
int RMQ(int i, int j) {
int k = L2[j - i + 1]; // 2^k <= (j-i+1)
int x = SpT[k][i]; // covers [i..i+2^k-1]
int y = SpT[k][j - P2[k] + 1]; // covers [j-2^k+1..j]
return A[x] <= A[y] ? x : y;
}
};
int LCA(int u, int v, SparseTable &SpT) {
if (H[u] > H[v]) swap(u, v);
return E[SpT.RMQ(H[u], H[v])];
}
int APSP(int u, int v, SparseTable &SpT) {
int ancestor = LCA(u, v, SpT);
return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}
int main() {
fast_cin();
cin >> n >> q;
L.assign(2 * (n + 9), 0);
E.assign(2 * (n + 9), 0);
H.assign(n + 9, -1);
idx = 0;
int u, v;
for (int i = 0; i < n - 1; i++) {
cin >> u >> v;
u--;
v--;
adjlist[u].emplace_back(v);
adjlist[v].emplace_back(u);
}
dfs(0, 0);
SparseTable SpT(L);
ll d;
while (q--) {
cin >> u >> v;
u--;
v--;
d = (ll) APSP(u, v, SpT) + 1;
cout << (ll) n - d + (d) * (d + 1) / 2 << endl;
}
return 0;
}
Using the following Python Code to generate the input of a large line graph
n = 200000
q = 1
print(n, q)
for i in range(1, n):
print(i, i+1)
print(1, 200000)
I get the following last few lines of output before my program crashes.
.
.
.
dfs 174494 174494
dfs 174495 174495
dfs 174496 174496
dfs 174497 174497
dfs 174498 174498
Segmentation fault (core dumped)
Is the problem an issue of exhausting stack space with the recursion or something else?
You posted a lot of code, but here is one obvious error in the SparseMatrix class:
std::vector<int> P2;
//...
P2.assign(L2_n, 0);
for (int i = 0; i <= L2_n; ++i)
{
P2[i] = (1 << i); // <-- Out of bounds access when i == L2_n
To show you the error, change that line of code to this:
P2.at(i) = (1 << i); // <-- Out of bounds access when i == L2_n
You will now get a std::out_of_range exception thrown.
If you write a loop using <=, that loop will be considered suspicious, since a lot of off-by-one and buffer overrun errors occur with loop conditions written this way.
I believe stack exhaustion was the main problem in running the code on my machine. I re-implemented the DFS in an iterative fashion.
stack<tuple<int, int, bool>> st; // cur, depth, first_time
st.push ({0, 0, 1});
while (!st.empty()) {
auto [cur, depth, first_time] = st.top();
st.pop();
if (first_time){
H[cur] = idx;
}
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
st.push({cur, depth, 0});
st.push({nxt, depth+1, 1});
break;
}
}
and my code was able to run the large testcase on my machine.
I'm not sure is this is relevant to the original question, but after this change, the code still flagged a run-time error on the online judge and I eventually realized that the issue was that the sparse table was using too much memory, so I fixed that by avoiding wasted declared but not used memory spaces in rows of the sparse table. Then the online judge deemed it as being too slow. So I reverted the DFS code back to the recursive version, and it was accepted. Note that the accepted solution actually crashes on my machine when running the large testcase... I guess my machine has a more limited stack space than the online grader.
The accepted solution is here
#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
typedef vector<int> vi;
#define fast_cin() \
ios_base::sync_with_stdio(false); \
cin.tie(NULL); \
cout.tie(NULL);
int n, q, idx;
vector<int> adjlist[(int)2e5 + 9];
vector<int> L, E,
H; // depth at traversal index, node at traversal index, first traversal index of node
void dfs(int cur, int depth) {
H[cur] = idx;
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
dfs(nxt, depth + 1);
E[idx] = cur; // backtrack to current node
L[idx++] = depth;
}
}
class SparseTable { // OOP style
private:
vi A, P2, L2;
vector<vi> SpT; // the Sparse Table
public:
SparseTable() {} // default constructor
SparseTable(vi &initialA) { // pre-processing routine
A = initialA;
int n = (int)A.size();
int L2_n = (int)log2(n) + 1;
P2.assign(L2_n + 1, 0);
L2.assign((1 << L2_n) + 1, 0);
for (int i = 0; i <= L2_n; ++i) {
P2[i] = (1 << i); // to speed up 2^i
L2[(1 << i)] = i; // to speed up log_2(i)
}
for (int i = 2; i < P2[L2_n]; ++i)
if (L2[i] == 0) L2[i] = L2[i - 1]; // to fill in the blanks
// the initialization phase
SpT = vector<vi>(L2[n] + 1, vi());
SpT[0] = vi(n, 0);
for (int j = 0; j < n; ++j) SpT[0][j] = j; // RMQ of sub array [j..j]
// the two nested loops below have overall time complexity = O(n log n)
for (int i = 1; P2[i] <= n; ++i) { // for all i s.t. 2^i <= n
SpT[i] = vi(n + 1 - P2[i]); // initialize SpT[i]
for (int j = 0; j + P2[i] - 1 < n; ++j) { // for all valid j
int x = SpT[i - 1][j]; // [j..j+2^(i-1)-1]
int y = SpT[i - 1][j + P2[i - 1]]; // [j+2^(i-1)..j+2^i-1]
SpT[i][j] = A[x] <= A[y] ? x : y;
}
}
}
int RMQ(int i, int j) {
int k = L2[j - i + 1]; // 2^k <= (j-i+1)
int x = SpT[k][i]; // covers [i..i+2^k-1]
int y = SpT[k][j - P2[k] + 1]; // covers [j-2^k+1..j]
return A[x] <= A[y] ? x : y;
}
};
int LCA(int u, int v, SparseTable &SpT) {
if (H[u] > H[v]) swap(u, v);
return E[SpT.RMQ(H[u], H[v])];
}
int APSP(int u, int v, SparseTable &SpT) {
int ancestor = LCA(u, v, SpT);
return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}
int main() {
fast_cin();
cin >> n >> q;
L.assign(2 * (n), 0);
E.assign(2 * (n), 0);
H.assign(n, -1);
idx = 0;
int u, v;
for (int i = 0; i < n - 1; i++) {
cin >> u >> v;
u--;
v--;
adjlist[u].emplace_back(v);
adjlist[v].emplace_back(u);
}
dfs(n - 1, 0);
SparseTable SpT(L);
ll d;
while (q--) {
cin >> u >> v;
u--;
v--;
d = (ll)APSP(u, v, SpT) + 1LL;
cout << (ll)n - d + (d) * (d + 1) / (ll)2 << endl;
}
return 0;
}
Much thanks for the help in additionally.
I'm trying to build a BVH Tree with Surface Area Heuristic, but everytime I compile my code it gives me random errors like:
"Access violation reading location"
"Run-Time Check Failure #2 - Stack around the variable 'x' was
corrupted."
"Stack overflow "
The errors happen in the BVH::buildSAH() function.
And I have tried to find a solution for the whole day, meaningless. Could it be something from the std::partition function or from sending variables with pointers to a recursion?
I'm reading from the book "Physically Based Rendering: From Theory to Implementation
By Matt Pharr, Greg Humphreys"
It works for 2 primitives in the area, but thats trivial...
If you would like to clone: https://github.com/vkaytsanov/MortonCode-BVH-KD
My BVH.hpp:
#include <vector>
#include <cassert>
#include <algorithm>
#include "memory.hpp"
#include "Screen.hpp"
#include "Point3D.hpp"
#include "BoundBox.hpp"
#pragma once
enum Axis{
X, Y, Z
};
struct MortonPrimitive{
int primitiveIndex;
uint32_t mortonCode;
};
struct BVHPrimitiveInfo {
BVHPrimitiveInfo() {}
BVHPrimitiveInfo(int primitiveNumber, const BoundBox& box) : primitiveNumber(primitiveNumber), box(box),
centroid(Point3D(box.pMin.x* 0.5f + box.pMax.x * 0.5f, box.pMin.y* 0.5f + box.pMax.y * 0.5f, box.pMin.z* 0.5f + box.pMax.z * 0.5f)) {}
int primitiveNumber;
BoundBox box;
Point3D centroid;
};
struct BVHNode {
void InitLeaf(int first, int n, const BoundBox& b) {
firstPrimOffset = first;
nPrimitives = n;
box = b;
children[0] = children[1] = nullptr;
}
void InitInterior(int axis, BVHNode* c0, BVHNode* c1) {
assert(c0 != NULL || c1 != NULL);
children[0] = c0;
children[1] = c1;
this->box = Union(c0->box, c1->box);
splitAxis = axis;
nPrimitives = 0;
}
BoundBox box;
BVHNode* children[2];
int splitAxis, firstPrimOffset, nPrimitives;
};
struct LinearBVHNode {
BoundBox bounds;
union {
int primitivesOffset; // leaf
int secondChildOffset; // interior
};
uint16_t nPrimitives; // 0 -> interior node
uint8_t axis; // interior node: xyz
uint8_t pad[1]; // ensure 32 byte total size
};
struct BVHLittleTree {
int startIndex;
int numPrimitives;
BVHNode* nodes;
};
struct BVH {
BVH(std::vector<std::shared_ptr<Primitive>> p) : primitives(std::move(p)) {
std::vector<BVHPrimitiveInfo> BVHPrimitives;
BVHPrimitives.reserve(primitives.size());
for (int i = 0; i < primitives.size(); i++) {
BVHPrimitives.push_back({ i, primitives[i]->box });
}
MemoryArena arena(1024 * 1024);
int totalNodes = 0;
std::vector<std::shared_ptr<Primitive>> orderedPrimitives;
orderedPrimitives.reserve(primitives.size());
BVHNode* root;
root = HLBVHBuild(arena, BVHPrimitives, &totalNodes, orderedPrimitives);
primitives.swap(orderedPrimitives);
BVHPrimitives.resize(0);
printf("BVH created with %d nodes for %d "
"primitives (%.4f MB), arena allocated %.2f MB\n",
(int)totalNodes, (int)primitives.size(),
float(totalNodes * sizeof(LinearBVHNode)) /
(1024.f * 1024.f),
float(arena.TotalAllocated()) /
(1024.f * 1024.f));
assert(root != NULL);
nodes = AllocAligned<LinearBVHNode>(totalNodes);
int offset = 0;
flattenBVHTree(root, &offset);
}
~BVH() { FreeAligned(nodes); }
BVHNode* build(std::vector<MortonPrimitive>&, std::vector<Primitive>&);
BVHNode* HLBVHBuild(MemoryArena& arena, const std::vector<BVHPrimitiveInfo>& BVHPrimitives, int* totalNodes, std::vector<std::shared_ptr<Primitive>>& orderedPrims);
BVHNode* emit(BVHNode*& nodes, const std::vector<BVHPrimitiveInfo>& BVHPrimitives, MortonPrimitive* mortonPrimitives, std::vector<std::shared_ptr<Primitive>>&, int, int*, int*, int);
BVHNode* buildSAH(MemoryArena& arena, std::vector<BVHNode*>& treeRoots, int start, int end, int* total) const;
int flattenBVHTree(BVHNode*, int*);
std::vector<std::shared_ptr<Primitive>> primitives;
LinearBVHNode* nodes = nullptr;
int maxPrimsInNode = 1;
};
inline uint32_t LeftShift3(uint32_t x) {
if (x == (1 << 10)) --x;
x = (x | (x << 16)) & 0b00000011000000000000000011111111;
x = (x | (x << 8)) & 0b00000011000000001111000000001111;
x = (x | (x << 4)) & 0b00000011000011000011000011000011;
x = (x | (x << 2)) & 0b00001001001001001001001001001001;
return x;
}
uint32_t EncodeMorton3(const Point3D& p) {
return (LeftShift3(p.z) << 2) |
(LeftShift3(p.y) << 1) |
(LeftShift3(p.x) << 0);
}
short bitValue(uint32_t& number, uint32_t& mask) {
return number & mask ? 1 : 0;
}
static void radixSort(std::vector<MortonPrimitive>* v)
{
std::vector<MortonPrimitive> tempVector(v->size());
const int bitsPerPass = 6;
const int nBits = 30;
static_assert((nBits % bitsPerPass) == 0,
"Radix sort bitsPerPass must evenly divide nBits");
const int nPasses = nBits / bitsPerPass;
for (int pass = 0; pass < nPasses; ++pass) {
// Perform one pass of radix sort, sorting _bitsPerPass_ bits
int lowBit = pass * bitsPerPass;
// Set in and out vector pointers for radix sort pass
std::vector<MortonPrimitive>& in = (pass & 1) ? tempVector : *v;
std::vector<MortonPrimitive>& out = (pass & 1) ? *v : tempVector;
// Count number of zero bits in array for current radix sort bit
const int nBuckets = 1 << bitsPerPass;
int bucketCount[nBuckets] = { 0 };
const int bitMask = (1 << bitsPerPass) - 1;
for (const MortonPrimitive& mp : in) {
int bucket = (mp.mortonCode >> lowBit) & bitMask;
++bucketCount[bucket];
}
// Compute starting index in output array for each bucket
int outIndex[nBuckets];
outIndex[0] = 0;
for (int i = 1; i < nBuckets; ++i)
outIndex[i] = outIndex[i - 1] + bucketCount[i - 1];
// Store sorted values in output array
for (const MortonPrimitive& mp : in) {
int bucket = (mp.mortonCode >> lowBit) & bitMask;
out[outIndex[bucket]++] = mp;
}
}
// Copy final result from _tempVector_, if needed
if (nPasses & 1) std::swap(*v, tempVector);
}
//BVHNode* BVH::build(std::vector<MortonPrimitive>& mortonPrimitives, std::vector<Primitive>& prims) {
//
//
//}
struct BucketInfo {
int count = 0;
BoundBox bounds;
};
BVHNode* BVH::HLBVHBuild(MemoryArena& arena, const std::vector<BVHPrimitiveInfo>& BVHPrimitives, int* totalNodes, std::vector<std::shared_ptr<Primitive>>& orderedPrims) {
BoundBox box;
for (const BVHPrimitiveInfo& pi : BVHPrimitives) {
box = box.Union(box, pi.centroid); // maybe it should be UNION #TODO
}
std::vector<MortonPrimitive> mortonPrims(BVHPrimitives.size());
for (int i = 0; i < BVHPrimitives.size(); i++) {
const int mortonBits = 10;
const int mortonScale = 1 << mortonBits;
mortonPrims[i].primitiveIndex = BVHPrimitives[i].primitiveNumber;
Point3D p = box.offset(BVHPrimitives[i].centroid);
p.x = p.x * mortonScale;
p.y = p.y * mortonScale;
p.z = p.z * mortonScale;
mortonPrims[i].mortonCode = EncodeMorton3(p);
}
radixSort(&mortonPrims);
//for (MortonPrimitive mp : mortonPrims) {
// std::cout << mp.primitiveIndex << " " << mp.mortonCode << std::endl;
//}
std::vector<BVHLittleTree> treesToBuild;
uint32_t mask = 0b00111111111111000000000000000000; // first 12 bits describe the position of the primitive
for (int start = 0, end = 1; end <= (int)mortonPrims.size(); end++) {
if (end == mortonPrims.size() || ((mortonPrims[start].mortonCode & mask) != (mortonPrims[end].mortonCode & mask))) {
int n = end - start;
int maxNodes = 2 * n;
BVHNode* nodes = arena.Alloc<BVHNode>(maxNodes, false);
treesToBuild.push_back({ start, n, nodes });
start = end;
}
}
int orderedPrimsOffset = 0;
orderedPrims.resize(primitives.size());
int nodesCreated = 0;
int firstBitIndex = 29 - 12;
for (int i = 0; i < treesToBuild.size(); i++) {
treesToBuild[i].nodes = BVH::emit(treesToBuild[i].nodes, BVHPrimitives, &mortonPrims[treesToBuild[i].startIndex], orderedPrims, treesToBuild[i].numPrimitives, &nodesCreated, &orderedPrimsOffset, firstBitIndex);
*totalNodes += nodesCreated;
}
totalNodes += nodesCreated;
std::vector<BVHNode*> finishedTrees;
finishedTrees.reserve(treesToBuild.size());
for (BVHLittleTree& tr : treesToBuild) {
finishedTrees.emplace_back(tr.nodes);
}
return buildSAH(arena, finishedTrees, 0, finishedTrees.size(), totalNodes);
}
BVHNode* BVH::emit(BVHNode*& nodes, const std::vector<BVHPrimitiveInfo>& BVHPrimitive, MortonPrimitive* mortonPrimitives, std::vector<std::shared_ptr<Primitive>>& orderedPrimitives, int primitivesCount, int* totalNodes, int* orderedPrimsOffset, int bitIndex) {
if (bitIndex == -1 || primitivesCount < maxPrimsInNode) {
(*totalNodes)++;
BVHNode* tmp = nodes++;
BoundBox box;
int firstPrimOffset = *orderedPrimsOffset;
for (int i = 0; i < primitivesCount; i++) {
int index = mortonPrimitives[i].primitiveIndex;
orderedPrimitives[firstPrimOffset + i] = primitives[index];
box = box.Union(box, BVHPrimitive[index].box);
}
tmp->InitLeaf(0, primitivesCount, box);
return tmp;
}
else {
int mask = 1 << bitIndex;
if ((mortonPrimitives[0].mortonCode & mask) == (mortonPrimitives[primitivesCount - 1].mortonCode & mask)){ // Next tree if nothing to split for this bit
return emit(nodes, BVHPrimitive, mortonPrimitives, orderedPrimitives, primitivesCount, totalNodes, orderedPrimsOffset, bitIndex - 1);
}
int start = 0;
int end = primitivesCount - 1;
while (start + 1 != end) {
int mid = (end - start) / 2 + start; // (start-end)/2
if ((mortonPrimitives[start].mortonCode & mask) == (mortonPrimitives[mid].mortonCode & mask)) {
start = mid;
}
else {
end = mid;
}
}
int split = end;
(*totalNodes)++;
BVHNode* tmp = nodes++;
BVHNode* lbvh[2];
lbvh[0] = emit(nodes, BVHPrimitive, mortonPrimitives, orderedPrimitives, split, totalNodes, orderedPrimsOffset, bitIndex-1);
lbvh[1] = emit(nodes, BVHPrimitive, &mortonPrimitives[split], orderedPrimitives, primitivesCount - split, totalNodes, orderedPrimsOffset, bitIndex - 1);
int axis = bitIndex % 3;
tmp->InitInterior(axis, lbvh[0], lbvh[1]);
return tmp;
}
}
BVHNode* BVH::buildSAH(MemoryArena& arena, std::vector<BVHNode*>& treeRoots, int start, int end, int* total) const {
int nodesCount = end - start;
if (nodesCount == 1) {
return treeRoots[start];
}
assert(nodesCount > 1);
(*total)++;
BVHNode* node = arena.Alloc<BVHNode>();
BoundBox box;
for (int i = start; i < end; i++) {
box = Union(box, treeRoots[i]->box);
}
BoundBox centroidBox;
for (int i = start; i < end; i++) {
Point3D centroid = Point3D((treeRoots[i]->box.pMin.x + treeRoots[i]->box.pMax.x) * 0.5f, (treeRoots[i]->box.pMin.y + treeRoots[i]->box.pMax.y) * 0.5f, (treeRoots[i]->box.pMin.z + treeRoots[i]->box.pMax.z) * 0.5f);
centroidBox = Union(centroidBox, centroid);
}
const int dimension = centroidBox.MaximumExtent();
const int nBuckets = 12;
struct Buckets {
int count = 0;
BoundBox box;
};
Buckets buckets[nBuckets];
for (int i = start; i < end; i++) {
float centroid = (treeRoots[i]->box.pMin[dimension] * 0.5f + treeRoots[i]->box.pMax[dimension] * 0.5f) ;
int b = nBuckets * ((centroid - centroidBox.pMin[dimension]) / (centroidBox.pMax[dimension] - centroidBox.pMin[dimension]));
if (b == nBuckets) b = nBuckets - 1;
//assert(b < nBuckets);
buckets[b].count++;
buckets[b].box = Union(buckets[b].box, treeRoots[i]->box);
}
float cost[nBuckets - 1];
for (int i = 0; i < nBuckets - 1; i++) {
BoundBox b0, b1;
int count0 = 0, count1 = 0;
for (int j = 0; j <= i; j++) {
b0 = Union(b0, buckets[j].box);
count0 += buckets[j].count;
}
for (int j = i+1; j < nBuckets; j++) {
b1 = Union(b1, buckets[j].box);
count1 += buckets[j].count;
}
cost[i] = (.125f + (count0 * b0.surfaceArea() + count1 * b1.surfaceArea())) / box.surfaceArea();
}
double minCost = cost[0];
int minCostSplitBucket = 0;
for (int i = 1; i < nBuckets - 1; ++i) {
if (cost[i] < minCost) {
minCost = cost[i];
minCostSplitBucket = i;
}
}
BVHNode** pmid = std::partition(&treeRoots[start], &treeRoots[end - 1] + 1, [=](const BVHNode* node) {
double centroid = (node->box.pMin[dimension]*0.5f + node->box.pMax[dimension] * 0.5f) ;
int b = nBuckets * ((centroid - centroidBox.pMin[dimension]) / (centroidBox.pMax[dimension] - centroidBox.pMin[dimension]));
if (b == nBuckets) b = nBuckets - 1;
return b <= minCostSplitBucket;
});
assert(pmid != NULL);
//std::cout << pmid << " " << &treeRoots[0];
int mid = pmid - &treeRoots[0];
//std::cout << start << " " << mid << std::endl;
//std::cout << mid << " " << end << std::endl;
std::cout << dimension << std::endl;
//assert(dimension < 3);
node->InitInterior(dimension, this->buildSAH(arena, treeRoots, start, mid, total), this->buildSAH(arena, treeRoots, mid, end, total));
return node;
}
int BVH::flattenBVHTree(BVHNode* node, int* offset) {
LinearBVHNode* linearNode = &nodes[*offset];
linearNode->bounds = node->box;
int myOffset = (*offset)++;
if (node->nPrimitives > 0) {
linearNode->primitivesOffset = node->firstPrimOffset;
linearNode->nPrimitives = node->nPrimitives;
}
else {
// Create interior flattened BVH node
linearNode->axis = node->splitAxis;
linearNode->nPrimitives = 0;
flattenBVHTree(node->children[0], offset);
linearNode->secondChildOffset = flattenBVHTree(node->children[1], offset);
}
return myOffset;
}
My Point3D.hpp
#include <cstdint>
#pragma once
struct Point3D {
float x;
float y;
float z;
Point3D(uint32_t, uint32_t, uint32_t);
Point3D();
int operator[](int);
int operator[](int) const;
Point3D operator+(int);
Point3D operator-(int);
Point3D operator-(Point3D&);
};
Point3D::Point3D() {
x = 0;
y = 0;
z = 0;
}
Point3D::Point3D(uint32_t x, uint32_t y, uint32_t z) {
this->x = x;
this->y = y;
this->z = z;
}
bool operator<(Point3D a, Point3D b) {
uint32_t xSquare = a.x * a.x;
uint32_t ySquare = a.y * a.y;
uint32_t zSquare = a.z * a.z;
uint32_t x2Square = b.x * b.x;
uint32_t y2Square = b.y * b.y;
uint32_t z2Square = b.z * b.z;
int64_t sum = std::sqrt(xSquare + ySquare + z2Square) - std::sqrt(x2Square + y2Square + z2Square);
return sum < 0 ||
sum == 0 && xSquare < x2Square ||
sum == 0 && xSquare == x2Square && ySquare < y2Square ||
sum == 0 && xSquare == x2Square && ySquare == y2Square && zSquare < z2Square;
}
bool operator>(Point3D a, Point3D b) {
uint32_t xSquare = a.x * a.x;
uint32_t ySquare = a.y * a.y;
uint32_t zSquare = a.z * a.z;
uint32_t x2Square = b.x * b.x;
uint32_t y2Square = b.y * b.y;
uint32_t z2Square = b.z * b.z;
int32_t sum = std::sqrt(xSquare + ySquare + z2Square) - std::sqrt(x2Square + y2Square + z2Square);
return sum > 0 ||
sum == 0 && xSquare > x2Square ||
sum == 0 && xSquare == x2Square && ySquare > y2Square ||
sum == 0 && xSquare == x2Square && ySquare == y2Square && zSquare > z2Square;
}
int Point3D::operator[](int i) {
if (i == 0) return x;
if (i == 1) return y;
return z;
}
Point3D Point3D::operator+(int i) {
this->x += i;
this->y += i;
this->z += i;
return *this;
}
Point3D Point3D::operator-(const int i) {
this->x -= i;
this->y -= i;
this->z -= i;
return *this;
}
Point3D Point3D::operator-(Point3D& p) {
this->x -= p.x;
this->y -= p.y;
this->z -= p.z;
return *this;
}
int Point3D::operator[](const int i) const {
if (i == 0) return x;
if (i == 1) return y;
return z;
}
My BoundBox.hpp
#include "Point3D.hpp"
#include "Vector3D.hpp"
#pragma once
struct BoundBox {
Point3D pMin;
Point3D pMax;
BoundBox(Point3D);
BoundBox(Point3D, Point3D);
BoundBox();
void setBounds(BoundBox);
void Union(BoundBox);
BoundBox Union(BoundBox&, Point3D&);
BoundBox Union(BoundBox, BoundBox);
BoundBox unite(BoundBox, BoundBox);
BoundBox unite(BoundBox);
const Point3D offset(const Point3D&);
Point3D diagonal();
const int MaximumExtent();
float surfaceArea();
};
BoundBox::BoundBox() {
float minNum = 0;
pMin = Point3D(800, 600, 300);
pMax = Point3D(minNum, minNum, minNum);
}
BoundBox::BoundBox(Point3D p){
pMin = p;
pMax = p;
}
BoundBox::BoundBox(Point3D p1, Point3D p2) {
pMin = Point3D(std::min(p1.x, p2.x), std::min(p1.y, p2.y), std::min(p1.z, p2.z));
pMax = Point3D(std::max(p1.x, p2.x), std::max(p1.y, p2.y), std::max(p1.z, p2.z));
}
BoundBox BoundBox::Union(BoundBox& box, Point3D& p) {
BoundBox newBox;
newBox.pMin = Point3D(std::min(box.pMin.x, p.x), std::min(box.pMin.y, p.y), std::min(box.pMin.z, p.z));
newBox.pMax = Point3D(std::max(box.pMax.x, p.x), std::max(box.pMax.y, p.y), std::max(box.pMax.z, p.z));
return newBox;
}
BoundBox BoundBox::Union(BoundBox box1, BoundBox box2) {
BoundBox newBox;
newBox.pMin = std::min(box1.pMin, box2.pMin);
newBox.pMax = std::max(box1.pMax, box2.pMax);
return newBox;
}
BoundBox Union(BoundBox box1, BoundBox box2) {
BoundBox newBox;
newBox.pMin = std::min(box1.pMin, box2.pMin);
newBox.pMax = std::max(box1.pMax, box2.pMax);
return newBox;
}
BoundBox BoundBox::unite(BoundBox b1, BoundBox b2) {
bool x = (b1.pMax.x >= b2.pMin.x) && (b1.pMin.x <= b2.pMax.x);
bool y = (b1.pMax.y >= b2.pMin.y) && (b1.pMin.y <= b2.pMax.y);
bool z = (b1.pMax.z >= b2.pMin.z) && (b1.pMin.z <= b2.pMax.z);
if (x && y && z) {
return Union(b1, b2);
}
}
BoundBox BoundBox::unite(BoundBox b2) {
bool x = (this->pMax.x >= b2.pMin.x) && (this->pMin.x <= b2.pMax.x);
bool y = (this->pMax.y >= b2.pMin.y) && (this->pMin.y <= b2.pMax.y);
bool z = (this->pMax.z >= b2.pMin.z) && (this->pMin.z <= b2.pMax.z);
if (x && y && z) {
return Union(*this, b2);
}
else return *this;
}
const int BoundBox::MaximumExtent() {
Point3D d = Point3D(this->pMax.x - this->pMin.x, this->pMax.y - this->pMin.y, this->pMax.z - this->pMin.z); // diagonal
if (d.x > d.y && d.x > d.z) {
return 0;
}
else if (d.y > d.z) {
return 1;
}
else {
return 2;
}
}
float BoundBox::surfaceArea() {
Point3D d = Point3D(this->pMax.x - this->pMin.x, this->pMax.y - this->pMin.y, this->pMax.z - this->pMin.z); // diagonal
return 2 * (d.x * d.y + d.x * d.z + d.y * d.z);
}
const Point3D BoundBox::offset(const Point3D& p) {
Point3D o = Point3D(p.x - pMin.x, p.y - pMin.y, p.z - pMin.z);
if (pMax.x > pMin.x) o.x /= pMax.x - pMin.x;
if (pMax.y > pMin.y) o.y /= pMax.y - pMin.y;
if (pMax.z > pMin.z) o.z /= pMax.z - pMin.z;
return o;
}
My memory.hpp
#include <list>
#include <cstddef>
#include <algorithm>
#include <malloc.h>
#include <stdlib.h>
#pragma once
#define ARENA_ALLOC(arena, Type) new ((arena).Alloc(sizeof(Type))) Type
void* AllocAligned(size_t size);
template <typename T>
T* AllocAligned(size_t count) {
return (T*)AllocAligned(count * sizeof(T));
}
void FreeAligned(void*);
class
#ifdef PBRT_HAVE_ALIGNAS
alignas(PBRT_L1_CACHE_LINE_SIZE)
#endif // PBRT_HAVE_ALIGNAS
MemoryArena {
public:
// MemoryArena Public Methods
MemoryArena(size_t blockSize = 262144) : blockSize(blockSize) {}
~MemoryArena() {
FreeAligned(currentBlock);
for (auto& block : usedBlocks) FreeAligned(block.second);
for (auto& block : availableBlocks) FreeAligned(block.second);
}
void* Alloc(size_t nBytes) {
// Round up _nBytes_ to minimum machine alignment
#if __GNUC__ == 4 && __GNUC_MINOR__ < 9
// gcc bug: max_align_t wasn't in std:: until 4.9.0
const int align = alignof(::max_align_t);
#elif !defined(PBRT_HAVE_ALIGNOF)
const int align = 16;
#else
const int align = alignof(std::max_align_t);
#endif
#ifdef PBRT_HAVE_CONSTEXPR
static_assert(IsPowerOf2(align), "Minimum alignment not a power of two");
#endif
nBytes = (nBytes + align - 1) & ~(align - 1);
if (currentBlockPos + nBytes > currentAllocSize) {
// Add current block to _usedBlocks_ list
if (currentBlock) {
usedBlocks.push_back(
std::make_pair(currentAllocSize, currentBlock));
currentBlock = nullptr;
currentAllocSize = 0;
}
// Get new block of memory for _MemoryArena_
// Try to get memory block from _availableBlocks_
for (auto iter = availableBlocks.begin();
iter != availableBlocks.end(); ++iter) {
if (iter->first >= nBytes) {
currentAllocSize = iter->first;
currentBlock = iter->second;
availableBlocks.erase(iter);
break;
}
}
if (!currentBlock) {
currentAllocSize = std::max(nBytes, blockSize);
currentBlock = AllocAligned<uint8_t>(currentAllocSize);
}
currentBlockPos = 0;
}
void* ret = currentBlock + currentBlockPos;
currentBlockPos += nBytes;
return ret;
}
template <typename T>
T* Alloc(size_t n = 1, bool runConstructor = true) {
T* ret = (T*)Alloc(n * sizeof(T));
if (runConstructor)
for (size_t i = 0; i < n; ++i) new (&ret[i]) T();
return ret;
}
void Reset() {
currentBlockPos = 0;
availableBlocks.splice(availableBlocks.begin(), usedBlocks);
}
size_t TotalAllocated() const {
size_t total = currentAllocSize;
for (const auto& alloc : usedBlocks) total += alloc.first;
for (const auto& alloc : availableBlocks) total += alloc.first;
return total;
}
private:
MemoryArena(const MemoryArena&) = delete;
MemoryArena & operator=(const MemoryArena&) = delete;
// MemoryArena Private Data
const size_t blockSize;
size_t currentBlockPos = 0, currentAllocSize = 0;
uint8_t * currentBlock = nullptr;
std::list<std::pair<size_t, uint8_t*>> usedBlocks, availableBlocks;
};
template <typename T, int logBlockSize>
class BlockedArray {
public:
// BlockedArray Public Methods
BlockedArray(int uRes, int vRes, const T* d = nullptr)
: uRes(uRes), vRes(vRes), uBlocks(RoundUp(uRes) >> logBlockSize) {
int nAlloc = RoundUp(uRes) * RoundUp(vRes);
data = AllocAligned<T>(nAlloc);
for (int i = 0; i < nAlloc; ++i) new (&data[i]) T();
if (d)
for (int v = 0; v < vRes; ++v)
for (int u = 0; u < uRes; ++u) (*this)(u, v) = d[v * uRes + u];
}
const int BlockSize() const { return 1 << logBlockSize; }
int RoundUp(int x) const {
return (x + BlockSize() - 1) & ~(BlockSize() - 1);
}
int uSize() const { return uRes; }
int vSize() const { return vRes; }
~BlockedArray() {
for (int i = 0; i < uRes * vRes; ++i) data[i].~T();
FreeAligned(data);
}
int Block(int a) const { return a >> logBlockSize; }
int Offset(int a) const { return (a & (BlockSize() - 1)); }
T& operator()(int u, int v) {
int bu = Block(u), bv = Block(v);
int ou = Offset(u), ov = Offset(v);
int offset = BlockSize() * BlockSize() * (uBlocks * bv + bu);
offset += BlockSize() * ov + ou;
return data[offset];
}
const T & operator()(int u, int v) const {
int bu = Block(u), bv = Block(v);
int ou = Offset(u), ov = Offset(v);
int offset = BlockSize() * BlockSize() * (uBlocks * bv + bu);
offset += BlockSize() * ov + ou;
return data[offset];
}
void GetLinearArray(T * a) const {
for (int v = 0; v < vRes; ++v)
for (int u = 0; u < uRes; ++u) * a++ = (*this)(u, v);
}
private:
// BlockedArray Private Data
T * data;
const int uRes, vRes, uBlocks;
};
void* AllocAligned(size_t size) {
return _aligned_malloc(size, 32);
}
void FreeAligned(void* ptr) {
if (!ptr) return;
_aligned_free(ptr);
}
and My Source.cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "Point3D.hpp"
#include "Screen.hpp"
#include "BVH.hpp"
#define N 150
int main(){
auto startTime = std::chrono::high_resolution_clock::now();
Screen* screen = new Screen(800, 600, 300);
screen->generatePoints(N);
//for (MortonPrimitive m : mortonPrims) {
// std::cout << m.mortonCode << std::endl;
//}
std::vector<std::shared_ptr<Primitive>> primitives;
primitives.reserve(N);
for (int i = 0; i < N; i++) {
primitives.emplace_back(screen->castPointToPrimitive(i));
}
BVH test(primitives);
auto endTime = std::chrono::high_resolution_clock::now();
std::cout << "Time spent: " << std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count() << "ms\n";
getchar();
delete screen;
}
Probably it would be wise to first cleanup your github. This mean update stuff to the recent c++ standard. It seems that you can use c++17 so use it. Also please look at some names. For example 'nodes' is used as member variable as well as parameter name, this is confusion. Please also initialize relevant (all) member variables.
Now it seems that the code in buildSAH override memory. It seems that it it can write over the end of buckets array.
I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far :
#include <iostream>
double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46 };
double m, p;
int n = sizeof(X)/sizeof(X[0]);
int main(void) {
double alpha = 0.00004; // 0.00007;
m = (Y[1] - Y[0]) / (X[1] - X[0]);
p = Y[0] - m * X[0];
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
}
return 0;
}
double Loss_function(void) {
double res = 0;
double tmp;
for (int i = 0; i < n; i++) {
tmp = Y[i] - m * X[i] - p;
res += tmp * tmp;
}
return res / 2.0 / (double)n;
}
void gradientStep(double alpha) {
double pg = 0, mg = 0;
for (int i = 0; i < n; i++) {
pg += Y[i] - m * X[i] - p;
mg += X[i] * (Y[i] - m * X[i] - p);
}
p += alpha * pg / n;
m += alpha * mg / n;
}
This code converges towards m = 2.79822, p = -382.666, and an error of 102.88. But if I use my calculator to find out the correct linear regression model, I find that the correct values of m and p should respectively be 1.601 and -191.1.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm
This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach.
The following code does not work properly (and I have no plans to work on it further), but should put on track anyone who's confronted to the same problem as me :
#include <vector>
#include <iostream>
typedef std::vector<double> vect;
std::vector<double> y, omega(2, 0), omega2(2, 0);;
std::vector<std::vector<double>> X;
int n = 10;
int main(void) {
/* Initialize x so that each members contains (1, x_i) */
/* Initialize x so that each members contains y_i */
double alpha = 0.00001;
display();
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
display();
}
return 0;
}
double f_function(const std::vector<double> &x) {
double c;
for (unsigned int i = 0; i < omega.size(); i++) {
c += omega[i] * x[i];
}
return c;
}
void gradientStep(double alpha) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < X[0].size(); j++) {
omega2[j] -= alpha/(double)n * (f_function(X[i]) - y[i]) * X[i][j];
}
}
omega = omega2;
}
void display(void) {
double res = 0, tmp = 0;
for (int i = 0; i < n; i++) {
tmp = y[i] - f_function(X[i]);
res += tmp * tmp; // Loss functionn
}
std::cout << "omega = ";
for (unsigned int i = 0; i < omega.size(); i++) {
std::cout << "[" << omega[i] << "] ";
}
std::cout << "\tError : " << res * .5/(double)n << std::endl;
}
The problem is, in a table of (h+1)*(w+1),the first row contains w values: a[1] ... a[w] which fill in the 2rd ... (w+1)th column. The first column contains h values: b[1] ... b[h] which fill in the 2rd ... (h+1)th row. sum(a[i]) is equal to sum(b[i]).
The question is to give one possible solution: result, so that sum(result[i][K]) for a certain K, is equal to a[i] with result[i][K] != result[j][K] (i != j and 0 < i < h+1). And the same rule for rows. PS: All the integers are positive.
For example:
a[] = {10, 3, 3}, b[] = {9, 7}
// 10 3 3
// 9 6 2 1
// 7 4 1 2
result = {6, 2, 1;
4, 1, 2}
It is like Kakuro but not the same. I cannot figure out which algorithm to apply, if anyone knows how to solve it, please give me some help. Thanks a lot.
You can always solve your problem with backtracking. Basic idea here: from top-to-bottom and left-to-right try a valid value in the partially filled table, backtrack when this value doesn't lead to a solution.
Minimal example in C++ with annotated solve:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
class Problem {
public:
template<class AIter, class BIter>
Problem(AIter abegin, AIter aend, BIter bbegin, BIter bend)
: m_width(std::distance(abegin, aend))
, m_height(std::distance(bbegin, bend))
, m_table(new int[(m_width + 1) * (m_height + 1)])
{
std::fill(m_table.get(), m_table.get() + (m_width + 1) * (m_height + 1), 0);
for(size_t i = 0; i < m_width; ++i)
m_table[i + 1] = *abegin++;
for(size_t j = 0; j < m_height; ++j)
m_table[(j + 1) * (m_width + 1)] = *bbegin++;
}
bool Solve() { return solve(0, 0); }
int operator()(size_t i, size_t j) const;
private:
int a(size_t i) const { return m_table[i + 1]; }
int b(size_t j) const { return m_table[(j + 1) * (m_width + 1)]; }
int get(size_t i, size_t j) const { return m_table[(j + 1) * (m_width + 1) + i + 1]; }
void set(size_t i, size_t j, int value) { m_table[(j + 1) * (m_width + 1) + i + 1] = value; }
int colSum(size_t i) const;
int rowSum(size_t j) const;
bool solve(size_t i, size_t j);
size_t m_width, m_height;
std::unique_ptr<int[]> m_table; // (width + 1) x (height + 1)
};
int Problem::colSum(size_t i) const {
int sum = 0;
for(size_t j = 0; j < m_height; ++j)
sum += get(i, j);
return sum;
}
int Problem::rowSum(size_t j) const {
int sum = 0;
for(size_t i = 0; i < m_width; ++i)
sum += get(i, j);
return sum;
}
// solves column-wise using backtracking
bool Problem::solve(size_t i, size_t j) {
size_t width = m_width, height = m_height;
// past last column?
if(i >= width) {
// found solution
return true;
}
// remainder in column and row
int remColSum = a(i) - colSum(i);
int remRowSum = b(j) - rowSum(j);
// early break
if(remColSum <= 0 || remRowSum <= 0)
return false;
// starting at the minimal required value (1 or remColSum if on last row)
int startValue = j + 1 < height ? 1 : remColSum;
// remaining row sum cannot support the starting value
if(remRowSum < startValue)
return false;
// end value minimum remaining sum
int endValue = remColSum < remRowSum ? remColSum : remRowSum;
// on last element must equal starting value
if(i + 1 == width && j + 1 == height && startValue != endValue)
return false;
// column-wise i.e. next cell is (i, j + 1) wrapped
int nextI = i + (j + 1) / height;
int nextJ = (j + 1) % height;
for(int value = startValue; value <= endValue; ++value) {
bool valid = true;
// check row up to i
for(size_t u = 0; u < i && valid; ++u)
valid = (get(u, j) != value);
// check column up to j
for(size_t v = 0; v < j && valid; ++v)
valid = (get(i, v) != value);
if(!valid) {
// value is invalid in partially filled table
continue;
}
// value produces a valid, partially filled table, now try recursing
set(i, j, value);
// upon first solution break
if(solve(nextI, nextJ))
return true;
}
// upon failure backtrack
set(i, j, 0);
return false;
}
int Problem::operator()(size_t i, size_t j) const {
return get(i, j);
}
int main() {
int a[] = { 10, 3, 3 };
int b[] = { 9, 7 };
size_t width = sizeof(a) / sizeof(*a);
size_t height = sizeof(b) / sizeof(*b);
Problem problem(a, a + width, b, b + height);
if(!problem.Solve()) {
std::cout << "No solution" << std::endl;
}
for(size_t j = 0; j < height; ++j) {
if(j == 0) {
std::cout << " ";
for(size_t i = 0; i < width; ++i)
std::cout << " " << a[i];
std::cout << std::endl;
}
std::cout << b[j];
for(size_t i = 0; i < width; ++i) {
int value = problem(i, j);
if(value == 0)
std::cout << " ";
else
std::cout << " " << value;
}
std::cout << std::endl;
}
return 0;
}