Could not get the expected output - C++

Problem:
I don't get the result that I get from the Matlab implementation. I am not sure whether I have converted pow_pos(norm(x(:, 1) - start'), 2) correctly; here is the converted code:
Variable::t x_0 = Var::vstack(x->index(0, 0), x->index(1, 0), x->index(2, 0));
M->constraint(Expr::vstack(t_tmp->index(0), Expr::sub(x_0, start_)), Domain::inQCone());
M->constraint(Expr::hstack(t->index(0), 1, t_tmp->index(0)), Domain::inPPowerCone(1.0/2));
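(Note: the power cone is not actually needed for this term. pow_pos(norm(v), 2) <= t is exactly the rotated quadratic cone condition 2 * t * (1/2) >= ||v||^2, which is the pattern the working solution below uses. A minimal sketch with the same M, x_0 and start_ as above; t0 is a new scalar variable introduced here only for illustration:)
auto t0 = M->variable(Domain::unbounded()); // t0 will upper-bound ||x_0 - start_||^2
// (t0, 1/2, x_0 - start_) in a rotated quadratic cone encodes 2 * t0 * 0.5 >= ||x_0 - start_||^2
M->constraint(Expr::vstack(t0, 0.5, Expr::sub(x_0, start_)), Domain::inRotatedQCone());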
Output
Here the black dot represents what I get from Matlab and the green dots depict what I get from C++.
Here is the original code I wrote in Matlab, which gives the expected output:
clear all
close all
clc
number_of_steps = 15;
lambda3_val = 1000;
lambda1_val = 1000;
lambda2_val = 0.1;
dim_ = 3;
Ab = [-0.470233 0.882543 0 3.21407
0.470233 -0.882543 -0 0.785929
-0.807883 -0.430453 0.402535 4.81961
0.807883 0.430453 -0.402535 -1.40824
-0.355254 -0.189285 -0.915405 0.878975
0.355254 0.189285 0.915405 1.12103];
A = Ab(:,1:dim_);
b = Ab(:,dim_+1);
start = [-4 , 0, 2];
goal = [-1, -1, 1];
nPolytopes = 1;
free_space = polytope(A, b);
cvx_solver mosek
cvx_begin
variable x(3, number_of_steps)
binary variable c(nPolytopes, number_of_steps);
cost = 0;
for i = 1:(number_of_steps-1)
cost = cost + pow_pos(norm(x(:, i) - x(:, i+1), 1),2)*lambda2_val;
end
cost = cost + pow_pos(norm(x(:, 1) - start'), 2)*lambda1_val;
cost = cost + pow_pos(norm(x(:, number_of_steps) - goal'),2)*lambda3_val;
minimize(cost)
subject to
for i = 1:number_of_steps
A*x(:, i) <= b;
end
cvx_end
Here is the conversion of the above into C++ using Mosek:
#include <iostream>
#include "fusion.h"
using namespace mosek::fusion;
using namespace monty;
int main(int argc, char ** argv)
{
int number_of_steps = 15;
int lambda1_val = 1000;
int lambda2_val = 1000; // note: the Matlab code has lambda2 = 0.1 and lambda3 = 1000, so these two are swapped here
int lambda3_val = 0.1;  // note: as an int, this truncates 0.1 to 0
int dim_space = 3;
auto A = new_array_ptr<double, 2>({{-0.4702, 0.8825, 0},
{0.4702, -0.8825, 0},
{-0.8079, -0.4305, 0.4025},
{0.8079, 0.4305, -0.4025},
{-0.3553, -0.1893, -0.9154},
{0.3553, 0.1893, 0.9154}});
auto b = new_array_ptr<double, 1>({3.2141,
0.7859,
4.8196,
-1.4082,
0.8790,
1.1210});
auto end_ = new_array_ptr<double, 1>({-1, -1, -1}); // note: the Matlab goal is [-1, -1, 1]
auto start_ = new_array_ptr<double, 1>({-4, 0, 2});
Model::t M = new Model();
auto x = M->variable(new_array_ptr<int,1>({dim_space, number_of_steps}), Domain::unbounded()) ;
auto t = M->variable(number_of_steps, Domain::unbounded());
auto t_tmp = M->variable(number_of_steps, Domain::unbounded());
auto lambda_1 = M->parameter("lambda_1");
auto lambda_2 = M->parameter("lambda_2");
auto lambda_3 = M->parameter("lambda_3");
Variable::t x_0 = Var::vstack(x->index(0, 0), x->index(1, 0), x->index(2, 0));
M->constraint(Expr::vstack(t_tmp->index(0), Expr::sub(x_0, start_)), Domain::inQCone());
M->constraint(Expr::hstack(t->index(0), 1, t_tmp->index(0)), Domain::inPPowerCone(1.0/2));
for(int i=1; i<number_of_steps-1; i++){ // note: starting at i=1 skips the (x_0, x_1) transition that the Matlab loop includes
Variable::t x_i = Var::vstack(x->index(0,i), x->index(1,i), x->index(2,i));
Variable::t x_i1 = Var::vstack(x->index(0,i+1), x->index(1,i+1), x->index(2,i+1));
M->constraint(Expr::vstack(t_tmp->index(i), Expr::sub(x_i1, x_i)), Domain::inQCone());
M->constraint(Expr::hstack(t->index(i), 1, t_tmp->index(i)), Domain::inPPowerCone(1.0/2));
}
Variable::t x_n = Var::vstack(x->index(0, number_of_steps-1), x->index(1, number_of_steps-1), x->index(2, number_of_steps-1));
M->constraint(Expr::vstack(t_tmp->index(number_of_steps-1), Expr::sub(x_n, end_)), Domain::inQCone());
M->constraint(Expr::hstack(t->index(number_of_steps-1), 1, t_tmp->index(number_of_steps-1)), Domain::inPPowerCone(1.0/2));
for (int i = 0; i < number_of_steps; i++)
{
auto x_i = Var::vstack(x->index(0,i), x->index(1, i), x->index(2, i));
M->constraint(Expr::mul(A, x_i), Domain::lessThan(b));
}
M->setLogHandler([](const std::string& msg){std::cout<< msg << std::flush;});
auto lambda1 = M->getParameter("lambda_1");
auto lambda2 = M->getParameter("lambda_2");
auto lambda3 = M->getParameter("lambda_3");
lambda1->setValue(lambda1_val);
lambda2->setValue(lambda2_val);
lambda3->setValue(lambda3_val);
auto objs = new_array_ptr<Expression::t, 1>(number_of_steps);
(*objs)[0] = (Expr::mul(lambda1, t->index(0)));
for(int i=1; i<number_of_steps-1; i++){
(*objs)[i] = Expr::mul(lambda2, t->index(i));
}
(*objs)[number_of_steps-1] = Expr::mul(lambda3, t->index(number_of_steps-1));
M->objective(ObjectiveSense::Minimize, Expr::add(objs));
M->solve();
auto sol = *(x->level());
std::cout<< "solution "<< sol << std::endl;
}

With the help of #michal-adamaszek and the answer given in the Mosek Google Group (https://mail.google.com/mail/u/0/#inbox/FMfcgzGkXmgmHqFBVJFSNRWmjQfQSPlg), here is the working solution for the above problem. The key change is that each squared norm is modeled directly as a single rotated quadratic cone (t, 1/2, v) instead of chaining a quadratic cone with a power cone, and all the intermediate transitions are stacked into one conic constraint:
#include <iostream>
#include "fusion.h"
using namespace mosek::fusion;
using namespace monty;
#define nint1(a) new_array_ptr<int>({(a)})
#define nint(a,b) new_array_ptr<int>({(a),(b)})
int main(int argc, char ** argv)
{
int number_of_steps = 15;
double lambda1_val = 1000;
double lambda2_val = 0.1;
double lambda3_val = 1000;
int dim_space = 3;
auto A = new_array_ptr<double, 2>({{-0.470233, 0.882543, 0},
{0.470233, -0.882543, 0},
{-0.807883, -0.430453, 0.402535},
{0.807883, 0.430453, -0.402535},
{-0.355254, -0.189285, -0.915405},
{0.355254, 0.189285, 0.915405}});
auto b = new_array_ptr<double, 1>({3.21407,
0.785929,
4.81961,
-1.40824,
0.878975,
1.12103});
auto end_ = new_array_ptr<double, 1>({-1, -1, 1});
auto start_ = new_array_ptr<double, 1>({-4, 0, 2});
Model::t M = new Model();
auto x = M->variable("x", new_array_ptr<int,1>({dim_space, number_of_steps}), Domain::unbounded()) ;
auto t = M->variable("t", number_of_steps-1, Domain::unbounded());
auto tstart = M->variable("ts",Domain::unbounded());
auto tend = M->variable("te",Domain::unbounded());
M->constraint(Expr::vstack(tstart, 0.5, Expr::sub(x->slice(nint(0,0), nint(3,1))->reshape(nint1(3)),
start_)),
Domain::inRotatedQCone());
M->constraint(Expr::hstack(t,
Expr::constTerm(number_of_steps-1, 0.5),
Expr::transpose(Expr::sub(x->slice(nint(0,0), nint(3,number_of_steps-1)),
x->slice(nint(0,1), nint(3,number_of_steps))))),
Domain::inRotatedQCone());
M->constraint(Expr::vstack(tend, 0.5, Expr::sub(x->slice(nint(0,number_of_steps-1), nint(3,number_of_steps))->reshape(nint1(3)),
end_)),
Domain::inRotatedQCone());
for (int i = 0; i < number_of_steps; i++)
M->constraint(Expr::mul(A, x->slice(nint(0,i), nint(3,i+1))->reshape(nint1(3))), Domain::lessThan(b));
M->setLogHandler([](const std::string& msg){std::cout<< msg << std::flush;});
auto lambda1 = M->parameter("lambda_1");
auto lambda2 = M->parameter("lambda_2");
auto lambda3 = M->parameter("lambda_3");
lambda1->setValue(lambda1_val);
lambda2->setValue(lambda2_val);
lambda3->setValue(lambda3_val);
M->objective(ObjectiveSense::Minimize,
Expr::add( Expr::sum(Expr::mul(lambda2, t)),
Expr::add(Expr::mul(lambda1, tstart), Expr::mul(lambda3, tend))));
M->writeTask("a.ptf");
M->solve();
auto sol = *(x->level());
std::cout<< "solution "<< sol << std::endl;
}

Related

Half-precision PyTorch float tensors have same performance as single precision float tensors?

Background: I've implemented the antiobject/"field AI" pattern (https://home.cs.colorado.edu/~ralex/papers/PDF/OOPSLA06antiobjects.pdf) for single diffusion using LibTorch/PyTorch.
This works fine, but in the process of running it on the GPU and optimizing it, I've run into a problem. I have a Titan V, which I believe excels at half-precision float math. However, when I make the tensors torch::kHalf, the performance is the same. (I've also tried torch::kFloat16). Any ideas?
The code that I timed is in update():
#define SDL_MAIN_HANDLED
#include <simple2d.h>
#include <torch/torch.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAEvent.h>
#include <math.h>
#include <chrono>
#define DEBUG_NO_DRAW
torch::Device gpu(torch::kCUDA);
torch::Device cpu(torch::kCPU);
torch::Device device = gpu;
const int windowLength = 1000;
const int64_t length = 500;
const float diffusionRate = 0.25;
const int obstacleCount = 4000;
const int entityCount = 1000;
float cellLength = windowLength / length;
torch::Tensor scent = torch::zeros({ length, length }, device).to(torch::kHalf);
torch::Tensor up, down, left, right;
torch::Tensor topWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor bottomWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor leftWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor rightWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor obstaclesMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor entities = torch::zeros({ length, length }, device).to(torch::kHalf);
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
std::time_t *lastFpsUpdate = NULL;
std::time_t *currentTime = new std::time_t();
int frameAccumulator = 0;
std::vector<long> updateDurations;
void update() {
torch::NoGradGuard no_grad;
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto startTime = std::chrono::high_resolution_clock::now();
down = scent.roll(1, 0) * obstaclesMask * topWallMask;
up = scent.roll(-1, 0) * obstaclesMask * bottomWallMask;
right = scent.roll(1, 1) * obstaclesMask * leftWallMask;
left = scent.roll(-1, 1) * obstaclesMask * rightWallMask;
scent = scent + ((down - scent) + (up - scent) + (right - scent) + (left - scent)) * diffusionRate;
scent = torch::max(scent, entities);
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
updateDurations.push_back(duration.count());
}
void render() {
if (lastFpsUpdate == NULL) {
lastFpsUpdate = new std::time_t();
std::time(lastFpsUpdate);
}
torch::Tensor sqrtedScent = scent.sqrt().to(torch::kFloat).to(cpu); // just to make darker scents a little brighter for display
auto obstaclesMaskCPU = obstaclesMask.to(torch::kFloat).to(cpu);
auto sqrtedScentAccessor = sqrtedScent.accessor<float, 2>();
auto obstaclesMaskAccessor = obstaclesMaskCPU.accessor<float, 2>();
float r = 0, g = 0, b = 0, a = 0;
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(
0, 0, 0, 0, 0, 1,
windowLength, 0, 0, 0, 0, 1,
windowLength, windowLength, 0, 0, 0, 1,
0, windowLength, 0, 0, 0, 1);
#endif
for (int i = 0; i < length; i++) {
for(int j = 0; j < length; j++) {
if (obstaclesMaskAccessor[i][j] == 0) {
r = 1; g = 1; b = 1; a = 1;
}
else {
r = 1; g = 0; b = 0; a = sqrtedScentAccessor[i][j];
}
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(cellLength * j, cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * (i + 1), r, g, b, a,
cellLength * j, cellLength * (i + 1), r, g, b, a);
#endif
}
}
frameAccumulator++;
std::time(currentTime);
if (std::difftime(*currentTime, *lastFpsUpdate) > 1.0) {
std::cout << "FPS: " << frameAccumulator << std::endl;
frameAccumulator = 0;
*lastFpsUpdate = *currentTime;
int updateCount = updateDurations.size();
long totalUpdateTime = 0;
for (int i = 0; i < updateCount; i++) {
totalUpdateTime += updateDurations[i];
}
long averageUpdateTime = totalUpdateTime / updateCount;
std::cout << "AverageUpdateTime: " << averageUpdateTime << "us" << std::endl;
updateDurations.clear();
}
}
int main() {
if (torch::cuda::is_available()) {
std::cout << "CUDA is available!" << std::endl;
}
std::cout << "Using " << (device == cpu ? "CPU" : "GPU") << std::endl;
for (int i = 0; i < length; i++) {
topWallMask[0][i] = 0;
bottomWallMask[length - 1][i] = 0;
leftWallMask[i][0] = 0;
rightWallMask[i][length - 1] = 0;
}
for (int i = 0; i < obstacleCount; i++) {
int x = rand() % length;
int y = rand() % length;
obstaclesMask[x][y] = 0;
}
//std::cout << obstaclesMask << std::endl;
for (int i = 0; i < entityCount; i++) {
int x = rand() % length;
int y = rand() % length;
if (obstaclesMask[x][y].item() == 0)
continue;
entities[x][y] = 1;
}
S2D_Window* window = S2D_CreateWindow(
"Collab Diffuse", windowLength, windowLength, update, render, 0
);
S2D_Show(window);
return 0;
}
In both single precision and half precision versions of the code, update() takes about 2700 microseconds.
I'm using PyTorch/LibTorch 1.7.1.
Any other performance tips would be appreciated. (I'm aware drawing pixel by pixel is very slow, so I plan to switch from Simple2D to something else that can draw bitmaps from memory).
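(Note: torch::kHalf and torch::kFloat16 name the same ScalarType, so trying both is expected to behave identically. Also, the ops in update() are elementwise and therefore memory-bandwidth-bound; Titan V Tensor Cores only accelerate matmul/convolution-style kernels, so the best case for fp16 here is roughly halved memory traffic, and at 500x500 (~250k elements) per-kernel launch overhead can easily dominate, which would make both dtypes time the same. Below is a minimal isolation benchmark sketch that reuses the stream-synchronization pattern from the code above; benchUpdate is a new helper, not part of the original program.)
#include <torch/torch.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAEvent.h>
#include <chrono>
#include <iostream>
// Times one diffusion step (the same op mix as update()) for a given dtype.
double benchUpdate(torch::ScalarType dtype, int iters) {
    torch::NoGradGuard no_grad;
    auto opts = torch::TensorOptions().device(torch::kCUDA).dtype(dtype);
    torch::Tensor scent = torch::ones({500, 500}, opts);
    torch::Tensor mask = torch::ones({500, 500}, opts);
    auto stream = c10::cuda::getCurrentCUDAStream();
    AT_CUDA_CHECK(cudaStreamSynchronize(stream)); // drain pending work before timing
    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iters; i++) {
        auto down = scent.roll(1, 0) * mask;
        auto up = scent.roll(-1, 0) * mask;
        auto right = scent.roll(1, 1) * mask;
        auto left = scent.roll(-1, 1) * mask;
        scent = scent + ((down - scent) + (up - scent) + (right - scent) + (left - scent)) * 0.25;
    }
    AT_CUDA_CHECK(cudaStreamSynchronize(stream)); // wait for the GPU before stopping the clock
    auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::micro>(t1 - t0).count() / iters;
}
int main() {
    std::cout << "float32: " << benchUpdate(torch::kFloat, 200) << " us/iter" << std::endl;
    std::cout << "float16: " << benchUpdate(torch::kHalf, 200) << " us/iter" << std::endl;
    return 0;
}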

Size of numbers in float array has significant impact on performance - Why?

UPDATE 2:
I've switched to initializing all arrays with fixed numbers. Why is performance faster when using modeDampingTermsExp:
12625671
12285846
12819392
11179072
12272587
11722863
12648955
vs. using modeDampingTermsExp2?
1593620
1668170
1614495
1785965
1814576
1851797
1808568
1801945
It's about 10x faster.
Full code:
#include <iostream>
#include <chrono>
using namespace std;
int bufferWriteIndex = 0;
float curSample = 0;
float tIncr = 0.1f;
float modeGainsTimesModeShapes[25] = { -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0, -0.25,
-1.49012e-08, 4.77374e-16, -7.45058e-09, 0, -0.288675, 0, 4.3016e-09, 3.55271e-15, -0, -0.25,
1.49012e-08, -1.4512e-15, 7.45058e-09, 0, -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0 };
float modeDampingTermsString[5] = { -8.03847, -30, -60, -90, -111.962 };
float damping[5] = { 1, 1, 1, 1, 1 };
float modeFrequenciesArr[5] = { 71419.1, 266564, 533137, 799710, 994855 };
float modeDampingTermsExp[5] = { 0.447604, 0.0497871, 0.00247875, 0.00012341, 1.37263e-05 };
float modeDampingTermsExp2[5] = { -0.803847, -3, -6, -9, -11.1962 };
int main(int argc, char** argv) {
float subt = 0;
int subWriteIndex = 0;
auto now = std::chrono::high_resolution_clock::now();
while (true) {
curSample = 0;
for (int i = 0; i < 5; i++) {
//Slow version
//damping[i] = damping[i] * modeDampingTermsExp2[i];
//Fast version
damping[i] = damping[i] * modeDampingTermsExp[i];
float cosT = 2 * damping[i];
for (int m = 0; m < 5; m++) {
curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
}
}
//t += tIncr;
bufferWriteIndex++;
//measure calculations per second
auto elapsed = std::chrono::high_resolution_clock::now() - now;
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
}
}
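(One self-contained way to probe the subnormal hypothesis from UPDATE 1 below: repeatedly damp a float by a factor in (0, 1), the way damping[] is damped, and report when std::fpclassify says it has entered the subnormal range, where x86 floating-point math is typically much slower than on normal values. The 0.0497871f factor below is modeDampingTermsExp[1] from above.)
#include <cmath>
#include <iostream>
int main() {
    float d = 1.0f;
    for (int n = 1; n <= 300; n++) {
        d *= 0.0497871f; // e.g. modeDampingTermsExp[1]
        if (std::fpclassify(d) == FP_SUBNORMAL) {
            std::cout << "subnormal after " << n << " multiplies: " << d << std::endl;
            break;
        }
    }
    return 0;
}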
UPDATE 1:
I changed tIncr to 0.1f to avoid a possible subnormal number, as mentioned by #JaMit. I've also removed t += tIncr; from the calculation.
Full code:
#include <iostream>
#include <chrono>
#include <cmath> // for exp() and cos()
using namespace std;
int bufferWriteIndex = 0;
float curSample = 0;
float t = 0;
float tIncr = 0.1f;
float modeGainsTimesModeShapes[25] = { -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0, -0.25,
-1.49012e-08, 4.77374e-16, -7.45058e-09, 0, -0.288675, 0, 4.3016e-09, 3.55271e-15, -0, -0.25,
1.49012e-08, -1.4512e-15, 7.45058e-09, 0, -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0 };
float modeDampingTermsString[5] = { -8.03847, -30, -60, -90, -111.962 };
float damping[5] = { 1, 1, 1, 1, 1 };
float modeFrequenciesArr[5] = { 71419.1, 266564, 533137, 799710, 994855 };
float modeDampingTermsExp[5];
int main(int argc, char** argv) {
/*
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
}*/
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
}
//std::thread t1(audioStringSimCos);
//t1.detach();
float subt = 0;
int subWriteIndex = 0;
auto now = std::chrono::high_resolution_clock::now();
while (true) {
curSample = 0;
for (int i = 0; i < 5; i++) {
damping[i] = damping[i] * modeDampingTermsExp[i];
float cosT = 2 * damping[i] * cos(t * modeFrequenciesArr[i]);
for (int m = 0; m < 5; m++) {
curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
}
}
//t += tIncr;
bufferWriteIndex++;
//measure calculations per second
auto elapsed = std::chrono::high_resolution_clock::now() - now;
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
}
}
Now it runs faster WITH the exp in the initialization?
Output with exp:
7632423
7516857
6855266
6251330
7040232
6784555
7169865
7638150
7403717
7626824
7408493
7722998
around 7 million/s, and without exp:
1229743
1193849
1069924
1426083
1472080
1484318
1503082
1462985
1433372
1357586
1483370
1491731
1526445
1516673
1517916
1522941
1523948
1506818
around 1.5 million. This is confusing to me.
ORIGINAL POST:
It seems like the way I initialize modeDampingTermsExp in my small example has a huge impact on the performance of the calculations that access it. Here is my minimal, reproducible example:
#include <iostream>
#include <chrono>
#include <cmath> // for exp() and cos()
using namespace std;
int bufferWriteIndex = 0;
float curSample = 0;
float t = 0;
float tIncr = 1.0f / 48000;
float modeGainsTimesModeShapes[25] = { -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0, -0.25,
-1.49012e-08, 4.77374e-16, -7.45058e-09, 0, -0.288675, 0, 4.3016e-09, 3.55271e-15, -0, -0.25,
1.49012e-08, -1.4512e-15, 7.45058e-09, 0, -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0 };
float modeDampingTermsString[5] = { -8.03847, -30, -60, -90, -111.962 };
float damping[5] = { 1, 1, 1, 1, 1 };
float modeFrequenciesArr[5] = { 71419.1, 266564, 533137, 799710, 994855 };
float modeDampingTermsExp[5];
int main(int argc, char** argv) {
/*
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
}*/
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
}
//std::thread t1(audioStringSimCos);
//t1.detach();
float subt = 0;
int subWriteIndex = 0;
auto now = std::chrono::high_resolution_clock::now();
while (true) {
curSample = 0;
for (int i = 0; i < 5; i++) {
damping[i] = damping[i] * modeDampingTermsExp[i];
float cosT = 2 * damping[i] * cos(t * modeFrequenciesArr[i]);
for (int m = 0; m < 5; m++) {
curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
}
}
t += tIncr;
bufferWriteIndex++;
//measure calculations per second
auto elapsed = std::chrono::high_resolution_clock::now() - now;
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
}
}
When I initialize it like this
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
}
using the exp function, performance is about 10 times slower than when I initialize it like this:
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
}
I measure the calculations per second using chrono right below the two nested for loops in the endless while(true) loop (snippet of the full example above):
//measure calculations per second
auto elapsed = std::chrono::high_resolution_clock::now() - now;
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
Using the exp function, my program gives the following output for example:
538108
356659
356227
383885
389902
390405
391748
391375
388910
383791
391691
It stays at around 390k.
Using the other initialization without it, I get the following output:
3145299
3049618
2519474
2755627
2846730
2824666
2893591
3119401
3492762
3366317
3470675
3505168
3492805
3523005
3432182
3561458
3580840
3576725
around 3 - 3.5 million "samples" per second.
Why does the way I initialize the modeDampingTermsExp array impact performance later in the code where I access it? What am I missing here?
I am using Visual Studio 2019 with the following flags:
/O2 /Oi /Ot /fp:fast
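(If subnormals are the cause, enabling flush-to-zero and denormals-are-zero on the SSE control register should erase the difference between the two initializations. A minimal, x86-specific sketch; these intrinsics change the floating-point state of the calling thread:)
#include <xmmintrin.h> // _MM_SET_FLUSH_ZERO_MODE
#include <pmmintrin.h> // _MM_SET_DENORMALS_ZERO_MODE
int main() {
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);         // results that would be subnormal become 0
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // subnormal inputs are treated as 0
    // ... then run the while(true) benchmark loop from the code above and compare both outputs ...
    return 0;
}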
Thank you very much!

My C++ code returns 3 (after throwing an instance of bad_alloc)

My code returns 3 even though everything seems to be right. It previously returned 255; after I rearranged it a bit, it now returns 3. It also says "terminate called after throwing an instance of 'std::bad_alloc'".
I looked it up on Google but I can't find any solution. Any idea why this happens?
#include <iostream>
#include <vector>
using namespace std;
std::vector<int> calculeazaPunctul(double pct[]) {
double pctfinal[2];
double xv = pct[0];
double yv = pct[1];
double zv = pct[2];
double res = .02;
double xk = 1;
double yk = 1;
double zk = -2;
cout << "h" << endl;
double zp = 0;
double xp = xv - ((xv - xk) * (zv - zp));
xp = xp / (((zv - zk != 0) ? (zv - zk) : 0.0001));
double yp = yv - ((yv - yk) * (zv - zp));
yp = yp / (((zv - zk != 0) ? (zv - zk) : 0.0001));
return {(int)((res * 2 * (xp * 50 + 100))), (int)((res * (yp * 50 + 100)))};
}
int main()
{
double puncte[8][3] = {{1, 0, -1},
{1, 0, 1},
{-1, 0, 1},
{-1, 0, -1},
{1, 1, -1},
{1, 1, 1},
{-1, 1, 1},
{-1, 1, -1}};
std::vector<std::vector<int> > pcteFinal;
pcteFinal.resize(8);
for (int i = 0; i < 8; i++) {
pcteFinal[i] = calculeazaPunctul(puncte[i]);
}
std::vector<std::vector<char> > image;
image.resize(10);
for (int y = 0; y < 10; y++) {
std::vector<char> row;
image[y] = row;
for (int x = 0; x < 20; x++) {
image[y].push_back('.');
}
}
for (int i = 0; i < 8; i++) {
if (pcteFinal[i][0] < 20 && pcteFinal[i][0] >= 0)
{
if (pcteFinal[i][1] < 10 && pcteFinal[i][0] >= 0) // note: this re-checks [0]; presumably pcteFinal[i][1] >= 0 was meant
{
image[pcteFinal[i][0]][pcteFinal[i][1]] = '#'; // note: image has 10 rows and 20 columns, so the checks (x < 20, y < 10) do not match this index order
}
}
}
for (int y = 0; y < 10; y++) {
for (int x = 0; x < 20; x++) {
cout << image[y][x];
}
cout << endl;
}
return 0;
}
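(For reference: the bounds checks above test pcteFinal[i][0] < 20 and pcteFinal[i][1] < 10, but then index image[pcteFinal[i][0]][pcteFinal[i][1]] even though image has 10 rows and 20 columns, and the second check repeats pcteFinal[i][0] >= 0 where pcteFinal[i][1] >= 0 was presumably meant. A small sketch of how bounds-checked vector::at() turns such an out-of-range access into a catchable std::out_of_range instead of undefined behavior:)
#include <iostream>
#include <stdexcept>
#include <vector>
int main() {
    std::vector<std::vector<char> > image(10, std::vector<char>(20, '.')); // 10 rows x 20 columns
    std::vector<int> pt = {15, 3}; // x = 15 (< 20), y = 3 (< 10): a valid point
    try {
        image.at(pt[1]).at(pt[0]) = '#'; // row (y) first, then column (x): fine
        image.at(pt[0]).at(pt[1]) = '#'; // the original index order: row 15 of 10 -> throws
    } catch (const std::out_of_range& e) {
        std::cerr << "out of range: " << e.what() << std::endl;
    }
    return 0;
}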

Tensorflow C++: use array for feed_dict

I have C++ Tensorflow code, shown below, which multiplies matrices using placeholders:
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cmath> // for pow() used below
#include <iostream>
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/tensor.h"
int main(int argc, char const *argv[]){
using namespace tensorflow;
using namespace tensorflow::ops;
Scope root = Scope::NewRootScope();
auto alpha = Const(root, 2.0, {1, 1});
auto beta = Const(root, 3.0, {1, 1});
auto A = Placeholder(root, DT_FLOAT);
auto B = Placeholder(root, DT_FLOAT);
auto C = Placeholder(root, DT_FLOAT);
auto temp1 = MatMul(root, A, B);
auto temp2 = Mul(root, alpha, temp1);
auto temp3 = Mul(root, beta, C);
auto D = Add(root.WithOpName("D"), temp1, temp3);
std::vector<Tensor> outputs;
ClientSession session(root);
int num_size = 2;
for(int step = 1; step < num_size; step++){
/*Allocating arrays*/
int array_size = pow(10, step);
float **a, **b, **c;
a = (float **)malloc(sizeof(float *)*array_size); // note: sizeof(float *), not sizeof(float); the original under-allocated the pointer arrays
b = (float **)malloc(sizeof(float *)*array_size);
c = (float **)malloc(sizeof(float *)*array_size);
for(int i = 0; i < array_size; i++){
a[i] = (float *)malloc(sizeof(float)*array_size);
b[i] = (float *)malloc(sizeof(float)*array_size);
c[i] = (float *)malloc(sizeof(float)*array_size);
}
srand((unsigned)time(0));
for(int i = 0; i < array_size; i++){
for(int j = 0; j < array_size; j++){
a[i][j] = (rand()%100)+1;
b[i][j] = (rand()%200)+1;
c[i][j] = (rand()%300)+1;
}
}
for(int num = 0; num < 10; num++){
Status s = session.Run({{A, a}, {B, b}, {C, c}}, {D}, &outputs); // this is the line in question: float** is not a valid feed type
if(s.ok())
c = outputs[0];
else
printf("Error\n");
}
}
return 0;
}
However, the format for sending values to placeholders in C++ is shown in this link, and the feed type used in C++ is given here.
I'm confused as to how I can modify my 2D arrays to the feed_dict format so as to supply them to session.Run().
Thank you.
Edit 1
A minimal representation of the question is as follows.
Consider the following snippet of code:
Scope root = Scope::NewRootScope();
auto a = Placeholder(root, DT_INT32);
// [3 3; 3 3]
auto b = Const(root, 3, {2, 2});
auto c = Add(root, a, b);
ClientSession session(root);
std::vector<Tensor> outputs;
// Feed a <- [1 2; 3 4]
int feed_a[2][2] = {{1, 2}, {3, 4}};
session.Run({ {a, feed_a} }, {c}, &outputs);
// The working code is - session.Run({ {a, { {1, 2}, {3, 4} } } }, {c}, &outputs);
// outputs[0] == [4 5; 6 7]
How can I make this code work in the case shown, where the feed_a array is received from a separate function and needs to be used to set the value of placeholder a?
You need to create a C-style array and place the data there instead of using a jagged array.
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/tensor.h"
int main() {
using namespace tensorflow;
using namespace tensorflow::ops;
Scope root = Scope::NewRootScope();
// [3 3; 3 3]
auto b = Const(root, {{3.f, 3.f}, {3.f, 3.f}});
ClientSession session(root);
std::vector<Tensor> outputs;
// just print b
TF_CHECK_OK(session.Run({}, {b}, &outputs));
LOG(INFO) << "b = ";
LOG(INFO) << outputs[0].matrix<float>();
// just print c = a + b
float *a_data = new float[4];
for (int i = 0; i < 4; ++i)
a_data[i] = 1.f;
auto a_shape = TensorShape({2, 2});
auto a_init = Input::Initializer(*a_data, a_shape);
auto a_plhdr = Placeholder(root, DT_FLOAT);
auto c = Add(root, a_plhdr, b);
TF_CHECK_OK(session.Run({{a_plhdr, a_init}}, {c}, &outputs));
LOG(INFO) << "a + b";
LOG(INFO) << outputs[0].matrix<float>();
return 0;
}
gives me
2018-02-14 22:45:47.469766: I tensorflow/cc/example/example.cc:20] b =
2018-02-14 22:45:47.469800: I tensorflow/cc/example/example.cc:21] 3 3
3 3
2018-02-14 22:45:47.473519: I tensorflow/cc/example/example.cc:36] a + b
2018-02-14 22:45:47.473543: I tensorflow/cc/example/example.cc:37] 4 4
4 4
Note, for some reason
int32 *a_data = new int32[4];
for (int i = 0; i < 4; ++i)
a_data[i] = 1;
auto a_shape = TensorShape({2, 2});
auto a_init = Input::Initializer(*a_data, a_shape);
auto a_plhdr = Placeholder(root, DT_INT32);
produces a failure (no output):
Check failed: dtype() == expected_dtype (1 vs. 3)
which could not be solved by a
auto a_casted = Cast(root, a_plhdr, DT_FLOAT);
auto c = Add(root, a_casted, b);
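(An alternative sketch that may help with the original 2D-array question: copy the flat runtime array into a tensorflow::Tensor and feed the Tensor directly. This assumes Input::Initializer can be constructed from a Tensor, which the cc framework headers appear to provide; not verified against every TF version.)
#include <algorithm>
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/tensor.h"
int main() {
    using namespace tensorflow;
    using namespace tensorflow::ops;
    Scope root = Scope::NewRootScope();
    auto a = Placeholder(root, DT_FLOAT);
    auto b = Const(root, {{3.f, 3.f}, {3.f, 3.f}}); // [3 3; 3 3]
    auto c = Add(root, a, b);
    // Pretend this flat, row-major buffer arrived from another function.
    float feed_a[4] = {1.f, 2.f, 3.f, 4.f};
    Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
    std::copy_n(feed_a, 4, a_tensor.flat<float>().data()); // copy into the tensor's own buffer
    ClientSession session(root);
    std::vector<Tensor> outputs;
    TF_CHECK_OK(session.Run({{a, a_tensor}}, {c}, &outputs));
    LOG(INFO) << outputs[0].matrix<float>(); // expect [4 5; 6 7]
    return 0;
}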

Seeded region growing OpenCV

I am using Win 7 (64-bit), MS2012 (C++) and OpenCV 2.4.11.
I wrote a seeded region growing algorithm, but I don't know why the results are not as desired.
The goal is to extract the pectoral muscle (left corner) of the image with the SRG algorithm.
Input image:
Here is my code (main function):
img = imread("C:/Users/Zimabi/Downloads/region_growing/jhgray.jpg",0);
Mat J1 = Mat::zeros(img.rows,img.cols,CV_8UC1);
int x=15,y=15;
int reg_mean = img.at<uchar>(x,y);
int reg_size = 1;
int neg_pos = 0;
Mat neg_list = Mat::zeros(img.cols*img.rows,3,CV_8UC1); // note: CV_8UC1 clamps stored values to 0..255; coordinates need CV_32SC1 (and at<int> accesses below)
double pixdist = 0, reg_maxdist = 0.1;
int xn = 0,yn = 0;
Mat neigb = Mat::zeros(4,2,CV_8UC1);
int nei1 [4] = {-1,1,0,0};
int nei2 [4] = {0,0,-1,1};
while ((pixdist < reg_maxdist) && (reg_size < img.cols*img.rows))
{
int inx,min1=1000;
for (int r = 0; r < 4; r++)
{
xn = x + nei1[r] ;
yn = y + nei2[r];
bool ins=(xn>=0)&&(yn>=0)&&(xn<img.rows)&&(yn<img.cols); // 0-based bounds; the original 1-based test also wrongly allowed xn == img.rows
if (ins && (J1.at<uchar>(xn,yn) == 0))
{
neg_pos = neg_pos+1;
neg_list.at<uchar>(neg_pos,0) = xn;
neg_list.at<uchar>(neg_pos,1) = yn;
neg_list.at<uchar>(neg_pos,2) = img.at<uchar>(xn,yn);
J1.at<uchar>(xn,yn)=255;
}
}
Mat x1 = Mat::zeros(neg_pos,1,CV_8UC1);
for (int i3 = 0; i3 <neg_pos ; i3++)
{
x1.at<uchar>(i3,0) = abs(neg_list.at<uchar>(i3,2) - reg_mean);
if (x1.at<uchar>(i3,0)<min1)
{
min1 = x1.at<uchar>(i3,0);
inx = i3;
}
}
pixdist = min1;
J1.at<uchar>(x,y)=255;
reg_mean = (reg_mean*reg_size + neg_list.at<uchar>(inx,2))/(reg_size+1); // update the running mean with the old region size...
reg_size = reg_size+1; // ...then grow the region
x = neg_list.at<uchar>(inx,0);
y = neg_list.at<uchar>(inx,1);
neg_list.at<uchar>(inx,0) = neg_list.at<uchar>(neg_pos,0);
neg_list.at<uchar>(inx,1) = neg_list.at<uchar>(neg_pos,1);
neg_list.at<uchar>(inx,2) = neg_list.at<uchar>(neg_pos,2);
neg_pos=neg_pos-1;
}
imshow("J",J1);
waitKey(0);
return 0;
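(As a sanity baseline for the seed point and tolerance, OpenCV's built-in cv::floodFill grows a connected region from the same seed with a gray-level tolerance, which is close in spirit to SRG with a fixed threshold. A sketch, assuming img and the (15, 15) seed from above; the +/-10 tolerance is an arbitrary illustration value, and floodFill takes the seed as (x, y) = (col, row):)
Mat region = img.clone();
Mat fmask = Mat::zeros(img.rows + 2, img.cols + 2, CV_8UC1); // floodFill requires a mask padded by 2 pixels
// grow from the seed, accepting neighbors within +/-10 gray levels; only the mask is written
floodFill(region, fmask, Point(15, 15), Scalar(255), 0, Scalar(10), Scalar(10), 4 | FLOODFILL_MASK_ONLY);
imshow("floodFill region", fmask);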
Regards