CNTK Evaluate model has two inputs C++ - c++

I have a project based on CNTK 2.3. I used the code from the integration tests to train an MNIST classifier like this:
// Trains a feed-forward classifier with one hidden layer using the CNTK C++ API.
// Names not declared in this snippet (sizeBlob, numberOfClasses, trainingSet,
// FullyConnectedDNNLayer, TextFormatMinibatchSource, CreateTrainer,
// PrintTrainingProgress, _1) must be provided by the surrounding code.
auto device = DeviceDescriptor::GPUDevice(0);
const size_t inputDim = sizeBlob * sizeBlob;
const size_t numOutputClasses = numberOfClasses;
const size_t hiddenLayerDim = 200;

// Network: "features" -> scale by 1/256 -> sigmoid hidden layer -> linear output layer.
auto input = InputVariable({ inputDim }, CNTK::DataType::Float, L"features");
auto scaledInput = ElementTimes(Constant::Scalar(0.00390625f, device), input);
auto classifierOutput = FullyConnectedDNNLayer(scaledInput, hiddenLayerDim, device, std::bind(Sigmoid, _1, L""));
auto outputTimesParam = Parameter(NDArrayView::RandomUniform<float>({ numOutputClasses, hiddenLayerDim }, -0.05, 0.05, 1, device));
auto outputBiasParam = Parameter(NDArrayView::RandomUniform<float>({ numOutputClasses }, -0.05, 0.05, 1, device));
classifierOutput = Plus(outputBiasParam, Times(outputTimesParam, classifierOutput), L"classifierOutput");

// Loss and error nodes; both take "labels" as an additional input.
auto labels = InputVariable({ numOutputClasses }, CNTK::DataType::Float, L"labels");
auto trainingLoss = CNTK::CrossEntropyWithSoftmax(classifierOutput, labels, L"lossFunction");
auto prediction = CNTK::ClassificationError(classifierOutput, labels, L"classificationError");

// Test save and reload of model
Variable classifierOutputVar = classifierOutput;
Variable trainingLossVar = trainingLoss;
Variable predictionVar = prediction;
// The combined function contains the loss and error nodes, so "labels" is one of its inputs.
auto combinedNet = Combine({ trainingLoss, prediction, classifierOutput }, L"MNISTClassifier");
//SaveAndReloadModel<float>(combinedNet, { &input, &labels, &trainingLossVar, &predictionVar, &classifierOutputVar }, device);
classifierOutput = classifierOutputVar;
trainingLoss = trainingLossVar;
prediction = predictionVar;

// Training schedule: two sweeps over 60000 samples in minibatches of 64.
const size_t minibatchSize = 64;
const size_t numSamplesPerSweep = 60000;
const size_t numSweepsToTrainWith = 2;
const size_t numMinibatchesToTrain = (numSamplesPerSweep * numSweepsToTrainWith) / minibatchSize;

auto featureStreamName = L"features";
auto labelsStreamName = L"labels";
auto minibatchSource = TextFormatMinibatchSource(trainingSet, { { featureStreamName, inputDim },{ labelsStreamName, numOutputClasses } });
auto featureStreamInfo = minibatchSource->StreamInfo(featureStreamName);
auto labelStreamInfo = minibatchSource->StreamInfo(labelsStreamName);

LearningRateSchedule learningRatePerSample = TrainingParameterPerSampleSchedule<double>(0.003125);
auto trainer = CreateTrainer(classifierOutput, trainingLoss, prediction, { SGDLearner(classifierOutput->Parameters(), learningRatePerSample) });

size_t outputFrequencyInMinibatches = 20;
for (size_t i = 0; i < numMinibatchesToTrain; ++i)
{
    auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSize, device);
    trainer->TrainMinibatch({ { input, minibatchData[featureStreamInfo] },{ labels, minibatchData[labelStreamInfo] } }, device);
    PrintTrainingProgress(trainer, i, outputFrequencyInMinibatches);

    // Every 100th minibatch is a checkpoint step.
    size_t trainingCheckpointFrequency = 100;
    if ((i % trainingCheckpointFrequency) == (trainingCheckpointFrequency - 1))
    {
        // The checkpoint name is empty in this listing and the call that uses it is not shown.
        const wchar_t* ckpName = L"";
    }
}
That part works fine: I train the model and then save it to a model file. But when I try to evaluate a single image to test the model, it looks like something is wrong with the model.
// Loads a saved CNTK model and evaluates it on the pixels of one image.
// modelFile, device, PrintOutput and the OpenCV names (Mat, imread) come from the surrounding code.

// Load the model.
// The model is trained by <CNTK>/Examples/Image/Classification/ResNet/Python/
// Please see in <CNTK>/Examples/Image/Classification/ResNet about how to train the model.
FunctionPtr modelFunc = Function::Load(modelFile, device);

// Get input variable. The model has only one single input.
// NOTE(review): this picks the first argument by position; the question below reports
// that the loaded model actually exposes two arguments (features and labels).
std::vector<Variable> inputs = modelFunc->Arguments();
Variable inputVar = modelFunc->Arguments()[0];

// The model has only one output.
// If the model has more than one output, use modelFunc->Outputs to get the list of output variables.
std::vector<Variable> outputs = modelFunc->Outputs();
Variable outputVar = outputs[0];

// Prepare input data.
// For evaluating an image, you first need to perform some image preprocessing to make sure that the input image has the correct size and layout
// that match the model inputs.
// Please note that the model used by this example expects the CHW image layout.
// inputVar.Shape[0] is image width, inputVar.Shape[1] is image height, and inputVar.Shape[2] is channels.
// For simplicity and avoiding external dependencies, we skip the preprocessing step here, and just use some artificially created data as input.
Mat image = imread(".....");
uint8_t* imagePtr = (uint8_t*)(image).data;
auto width = image.cols;
auto heigth = image.rows;

// Copy the raw bytes of the image into a float buffer sized to the model input.
// The number of bytes read is taken from the input shape, not from the image.
std::vector<float> inputData(inputVar.Shape().TotalSize());
for (size_t i = 0; i < inputData.size(); ++i)
{
    auto curChVal = imagePtr[(i)];
    inputData[i] = curChVal;
}

// Create input value and input data map
ValuePtr inputVal = Value::CreateBatch(inputVar.Shape(), inputData, device);
std::unordered_map<Variable, ValuePtr> inputDataMap = { { inputVar, inputVal } };

// Create output data map. Using null as Value to indicate using system allocated memory.
// Alternatively, create a Value object and add it to the data map.
std::unordered_map<Variable, ValuePtr> outputDataMap = { { outputVar, nullptr } };

// Start evaluation on the device
modelFunc->Evaluate(inputDataMap, outputDataMap, device);

// Get evaluate result as dense output
ValuePtr outputVal = outputDataMap[outputVar];
std::vector<std::vector<float>> outputData;
outputVal->CopyVariableValueTo(outputVar, outputData);
PrintOutput<float>(outputVar.Shape().TotalSize(), outputData);
I ran the same code in C# and it works fine. The difference I found is that modelFunc->Arguments() should contain one argument but contains two: it reports both features and labels as inputs, whereas I need only features as an input. It throws the following error:

Find input and output variables by name, instead of modelFunc->Arguments()[0].
// Select the input and output by the names assigned when the network was built
// (L"features" and L"classifierOutput") instead of relying on their position.
Variable inputVar;
GetInputVariableByName(modelFunc, L"features", inputVar);
Variable outputVar;
GetOutputVaraiableByName(modelFunc, L"classifierOutput", outputVar);
GetInputVariableByName and GetOutputVaraiableByName() come from the CNTK C++ evaluation examples (CNTKLibraryCPPEvalExamples.cpp).


What should you store or append a batch of tensors to in C++ when using LibTorch?

In C++, when using LibTorch (The C++ version of PyTorch), what should you store a batch of tensors in? I'm running into the problem of not being able to reset the batch on the next step because C++ doesn't allow storing a new variable over an existing variable.
In my attempt my batch of tensors is one single 385x385 tensor. The batch size is 385. In a for loop I use torch::cat to concatenate 385 smaller 1D tensors, each 385 numbers long. (Maybe 'stack' or 'append' are better terms for what I'm doing, since they are stacked together picket-fence style rather than 'concatenated', but that's what I'm using.) Anyway, there is no problem with this shape. It seems to work fine for one forward and backward pass, but then the tensor becomes 770x385 on the next pass instead of a 385x385 tensor holding the next 385 arrays of length 385. I hope I am painting a picture and not being too verbose.
The code.
Near the bottom I have the line all_step_obs = torch::tensor({}); to try to wipe out the contents of the tensor, AKA, the batch, but this gives me a Segmentation fault (core dumped). I guess for trying to access the tensor outside of the loop(?)
If I don't have this line I get a 770x385 tensor after the next step.
The model
#include "mujoco/mujoco.h"
struct Net : torch::nn::Module {
torch::Tensor action_high, action_low;
Net(torch::Tensor action_high, torch::Tensor action_low) : action_high(action_high), action_low(action_low){
// Construct and register two Linear submodules.
fc1 = torch::nn::Linear(385, 385);
fc2 = torch::nn::Linear(385, 385);
fc3 = torch::nn::Linear(385, 42);
// cholesky_layer = torch::nn::Linear(385, (42 * (42 + 1)) / 2);
cholesky_layer = torch::nn::Linear(385, 385);
// Implement the Net's algorithm.
torch::Tensor forward(torch::Tensor x) {
// Use one of many tensor manipulation functions.
x = torch::relu(fc1->forward(x));
x = torch::dropout(x, /*p=*/0.2, /*train=*/is_training());
x = torch::relu(fc2->forward(x));
auto mean_layer = fc3->forward(x);
auto mean = action_low + (action_high - action_low) * mean_layer;
auto chol_l = cholesky_layer->forward(x);
// auto chol = torch::rand({385, 385});
auto chol = torch::matmul(chol_l, chol_l.transpose(0, 1));
chol = torch::nan_to_num(chol, 0, 2.0);
chol = chol.add(torch::eye(385));
auto cholesky = torch::linalg::cholesky(chol);
// return torch::cat({mean, cholesky}, 0);
return mean_layer;
// Use one of many "standard library" modules.
torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}, cholesky_layer{nullptr};
The training
auto high = torch::ones({385, 42}) * 0.4;
auto low = torch::ones({385, 42}) * -0.4;
auto actor = Net(low, high);
int max_steps = 385;
int steps = 2000;
auto l1_loss = torch::smooth_l1_loss;
auto optimizer = torch::optim::Adam(actor.parameters(), 3e-4);
torch::Tensor train() {
torch::Tensor all_step_obs;
for (int i = 0; i<steps; ++i)
for (int i = 0; i<max_steps; ++i)
all_step_obs = torch::cat({torch::rand({385}).unsqueeze(0), all_step_obs});
auto mean = actor.forward(all_step_obs);
auto loss = l1_loss(mean, torch::rand({385, 42}), 1, 0);
all_step_obs = torch::tensor({});
if (steps == 1999) {
return loss;
int main (int argc, const char** argv) {
std::cout << train();

Clip Raster with Polygon with GDAL C++

I am trying to clip a raster using a polygon and GDAL. At the moment I get a read access violation when initializing the WarpOperation. I can access my Shapefile and check the number of features, so I think the access is fine. I can also access my raster data (GetProjectionRef). All files are in the same CRS. Is there a way to use GdalWarp with a cutline?
// Attempt to clip input.tif into output.tif through the GDAL warp API, using a
// layer from Polygon.shp as the cutline.
const char* inputPath = "input.tif";
const char* outputPath = "output.tif";
//clipper Polygon
// NOTE(review): w_read_filenamePoly is declared from a narrow string literal here but is
// used like a wide string object (.length(), .c_str(), wcstombs) — its original type is unclear.
auto w_read_filenamePoly = "Polygon.shp";
char* read_filenamePoly = new char[w_read_filenamePoly.length() + 1];
wcstombs(read_filenamePoly, w_read_filenamePoly.c_str(), w_read_filenamePoly.length() + 1);
// Both rasters are opened in update mode; output.tif must therefore already exist.
GDALDataset* hSrcDS;
GDALDataset* hDstDS;
hSrcDS =(GDALDataset *) GDALOpen(inputPath, GA_Update);
hDstDS = (GDALDataset*)GDALOpen(outputPath, GA_Update);
const char* proj = hSrcDS->GetProjectionRef();
const char* proj2 = hDstDS->GetProjectionRef();
//clipper Layer
GDALDataset* poDSClipper;
poDSClipper = (GDALDataset*)GDALOpenEx(read_filenamePoly, GDAL_OF_UPDATE, NULL, NULL, NULL);
OGRLayer* poLayerClipper;
poLayerClipper = poDSClipper->GetLayerByName("Polygon");
int numClip = poLayerClipper->GetFeatureCount();
//setup warp options
// Warp band 1 of the source onto band 1 of the destination.
GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
psWarpOptions->hSrcDS = hSrcDS;
psWarpOptions->hDstDS = hDstDS;
psWarpOptions->nBandCount = 1;
psWarpOptions->panSrcBands = (int *) CPLMalloc(sizeof(int) * psWarpOptions->nBandCount);
psWarpOptions->panSrcBands[0] = 1;
psWarpOptions->panDstBands = (int*)CPLMalloc(sizeof(int) * psWarpOptions->nBandCount);
psWarpOptions->panDstBands[0] = 1;
psWarpOptions->pfnProgress = GDALTermProgress;
// The cutline is set to the OGR layer itself (the point the answer below addresses).
psWarpOptions->hCutline = poLayerClipper;
// Establish reprojection transformer.
psWarpOptions->pTransformerArg = GDALCreateGenImgProjTransformer(hSrcDS,proj, hDstDS, proj2, FALSE, 0.0, 1);
psWarpOptions->pfnTransformer = GDALGenImgProjTransform;
// No oOperation.Initialize(psWarpOptions) call appears in this listing before the warp is run.
GDALWarpOperation oOperation;
oOperation.ChunkAndWarpImage(0, 0, GDALGetRasterXSize(hDstDS), GDALGetRasterYSize(hDstDS));
Your psWarpOptions->hCutline should be a polygon, not a layer.
Also the cutline should be in source pixel/line coordinates.
Check TransformCutlineToSource from gdalwarp_lib.cpp, you can probably simply get the code from there.
This particular GDAL operation, when called from C++, is so full of pitfalls - and there are so many open questions about it here - that I am reproducing a full working example:
Warping (reprojecting) a raster image with a polygon mask (cutline):
#include <gdal/gdal.h>
#include <gdal/gdal_priv.h>
#include <gdal/gdalwarper.h>
#include <gdal/ogrsf_frmts.h>
int main() {
const char *inputPath = "input.tif";
const char *outputPath = "output.tif";
// clipper Polygon
// copy the function gdalwarp_lib.cpp:TransformCutlineToSource()
// from GDAL's sources
// It is expected that it contains a single polygon feature
const char *read_filenamePoly = "cutline.json";
GDALDataset *hSrcDS;
GDALDataset *hDstDS;
auto poDriver = GetGDALDriverManager()->GetDriverByName("GTiff");
hSrcDS = (GDALDataset *)GDALOpen(inputPath, GA_ReadOnly);
hDstDS = (GDALDataset *)poDriver->CreateCopy(
outputPath, hSrcDS, 0, nullptr, nullptr, nullptr);
// Without this step the cutline is useless - because the background
// will be carried over from the original image
CPLErr e = hDstDS->GetRasterBand(1)->Fill(0);
const char *src_srs = hSrcDS->GetProjectionRef();
const char *dst_srs = hDstDS->GetProjectionRef();
// clipper Layer
GDALDataset *poDSClipper;
poDSClipper = (GDALDataset *)GDALOpenEx(
read_filenamePoly, GDAL_OF_UPDATE, NULL, NULL, NULL);
auto poLayerClipper = poDSClipper->GetLayer(0);
auto geom = poLayerClipper->GetNextFeature()->GetGeometryRef();
// setup warp options
GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
psWarpOptions->hSrcDS = hSrcDS;
psWarpOptions->hDstDS = hDstDS;
psWarpOptions->nBandCount = 1;
psWarpOptions->panSrcBands =
(int *)CPLMalloc(sizeof(int) * psWarpOptions->nBandCount);
psWarpOptions->panSrcBands[0] = 1;
psWarpOptions->panDstBands =
(int *)CPLMalloc(sizeof(int) * psWarpOptions->nBandCount);
psWarpOptions->panDstBands[0] = 1;
psWarpOptions->pfnProgress = GDALTermProgress;
psWarpOptions->hCutline = geom;
// Establish reprojection transformer.
psWarpOptions->pTransformerArg = GDALCreateGenImgProjTransformer(
hSrcDS, src_srs, hDstDS, dst_srs, TRUE, 1000, 1);
psWarpOptions->pfnTransformer = GDALGenImgProjTransform;
GDALWarpOperation oOperation;
0, 0, GDALGetRasterXSize(hDstDS), GDALGetRasterYSize(hDstDS));

Use multiple images for batch inference cppflow C++

I'm trying to use cppflow library in windows 10 x64 machine in VS2019 C++. I want to inference my model for batch of images (vector <cv::Mat> ). I write a simple code as below for single image and it works correctly:
// Runs the OCR model on one grayscale image with cppflow.
string im_path{ "..." };
string model_path{ "...\\ocr_model" };

// Load as grayscale, resize to 127x25 and transpose.
cv::Mat tmp, im;
cv::resize(cv::imread(im_path, cv::IMREAD_GRAYSCALE), tmp, cv::Size(127, 25), 0, 0, cv::INTER_CUBIC);
cv::transpose(tmp, im);
int rows = im.rows; int cols = im.cols; int channels = im.channels();

// Put image in tensor
// Append the image's raw bytes (total() elements times channels) to the buffer.
std::vector<uint8_t> img_data;
auto e = std::end(img_data);
img_data.insert(e, im.data, im.data + im.total() * channels);
auto input = cppflow::tensor(img_data, {rows, cols, channels});

// Convert to float, scale by 1/255 and add a leading batch dimension.
input = cppflow::cast(input, TF_UINT8, TF_FLOAT);
auto t = input.get_data<float>();
input = input / 255.f;
input = cppflow::expand_dims(input, 0);

cppflow::model model{ model_path };
auto output = model({ {"serving_default_input:0", input}}, { "StatefulPartitionedCall:0"});
I want to load multiple images (in code below I use a cloned image as second image). here is what I really want to do:
// Attempt to run the OCR model on a batch of two images (the second is a clone of the first).
string im_path{ "..." };
string model_path{ "...\\ocr_model" };

cv::Mat tmp, im;
cv::resize(cv::imread(im_path, cv::IMREAD_GRAYSCALE), tmp, cv::Size(127, 25), 0, 0, cv::INTER_CUBIC);
cv::transpose(tmp, im);
int rows = im.rows; int cols = im.cols; int channels = im.channels();

// Put image in tensor
// Append the bytes of the first image, then those of the clone, back to back.
std::vector<uint8_t> img_data;
auto im_clone = im.clone();
auto e = std::end(img_data);
img_data.insert(e, im.data, im.data + im.total() * channels);
e = std::end(img_data);
img_data.insert(e, im_clone.data, im_clone.data + im_clone.total() * channels);
auto input = cppflow::tensor(img_data, {2, rows, cols, channels});

input = cppflow::cast(input, TF_UINT8, TF_FLOAT);
input = input / 255.f;
// NOTE(review): the tensor already has shape {2, rows, cols, channels}; this adds a
// fifth leading dimension — confirm the model accepts that.
input = cppflow::expand_dims(input, 0);

cppflow::model model{ model_path };
auto output = model({ {"serving_default_input:0", input}}, { "StatefulPartitionedCall:0"});
As you can see, the only differences between the two snippets are the img_data preparation and the tensor definition, but unfortunately I get this error:
Unhandled exception at 0x00007FFFF4514ED9 in cppflow_Test.exe:
Microsoft C++ exception: std::runtime_error at memory location
How can I load multiple images (vector< cv::Mat >) into a tensor and use the corresponding outputs? In other words, I need an example of batch inference using the cppflow library.
Try using std::copy to insert multiple copies of the image (or multiple images) into the img_data vector.
// Fill one contiguous buffer with batch_size copies of the image and wrap it in a tensor.
// batch_size, rows, cols, channels and im come from the surrounding code.
int data_size = rows * cols * channels;
// Size the buffer up front: std::copy writes through the destination iterator and
// does not grow the vector.
std::vector<uint8_t> img_data(batch_size * data_size);
for (size_t i = 0; i < batch_size; i++)
{
    std::copy(im.data, im.data + data_size, std::begin(img_data) + i * data_size);
}
auto input = cppflow::tensor(img_data, {batch_size, rows, cols, channels});
Use cppflow::concat to concatenate the n tensors you want to send to the model:
// Decode two images, give each a leading dimension of size 1, and concatenate
// them along dimension 0 before calling the model.
auto input1 = cppflow::decode_jpeg(file1);
input1 = cppflow::expand_dims(input1, 0);
auto input2 = cppflow::decode_jpeg(file2);
input2 = cppflow::expand_dims(input2, 0);
// Axis tensor for the concatenation.
cppflow::tensor dim({0});
// NOTE(review): nothing is added to `values` in this listing; presumably input1 and
// input2 are meant to be stored in it before the concat — confirm.
std::vector<cppflow::tensor> values;
auto inputs = cppflow::concat(dim, values);
auto output = model({ {"serving_default_input:0", inputs}}, { "StatefulPartitionedCall:0"});

Keras custom layer with CNTK backend (CRF as RNN)

I am attempting to duplicate the CRF as RNN, which has been implemented in Keras but uses TensorFlow as a backend. The Keras front-end is fine, but some of the backend code is written as a TensorFlow custom op. I am trying to duplicate this in CNTK, but I have a few questions.
import cntk as C
from cntk import ops
import copy
import numpy as np
ops.register_native_user_function('HighDimFilterOp', 'Cntk.HighDimFilter-' + C.__version__.rstrip('+'), 'CreateHighDimFilter')
def high_dim_filter(image=None, rgb=None, **kwargs):
    """Instantiate the native 'HighDimFilterOp' user function.

    ``image`` and ``rgb`` are each wrapped with ``list()`` and passed as the
    operands; a deep copy of the keyword arguments is passed as the attributes.
    """
    inputs = [list(image), list(rgb)];
    layer_config = copy.deepcopy(kwargs)
    # NOTE(review): the created function is not returned, so callers receive None — confirm.
    ops.native_user_function('HighDimFilterOp', inputs, layer_config, 'high_dim_filter')
This code is the Python call to my user C++ function. The C++ interface is as follows:
#include "HighDimFilter.h"
using namespace CNTK;
extern "C"
#ifdef _WIN32
Function* CreateHighDimFilter(const Variable* operands, size_t /*numOperands*/, const Dictionary* attributes, const wchar_t* name)
printf("Creating HighDimFilter\n");
return new HighDimFilter({operands[0], operands[1]}, *attributes, name);
and the custom function itself is defined as:
#pragma once
#include "CNTKLibrary.h"
#include "modified_permutohedral.h"
using namespace CNTK;
// User-defined CNTK Function wrapping a permutohedral high-dimensional filter.
// NOTE(review): the braces, access specifiers and enumerator list of this class are
// not present in this listing.
class HighDimFilter final : public Function
// Filter configuration, read from the attribute dictionary in the constructor.
bool _bilateral;
float _theta_alpha;
float _theta_beta;
float _theta_gamma;
// Indices into the operand list; Input::SCORES and Input::IM_INFO are used below,
// but the enumerators themselves are not visible here.
enum Input : uint32_t
// Reads "bilateral" and the theta_* attributes when they are present in `attributes`.
// NOTE(review): members whose attribute is absent are left unassigned; the branch
// structure around `_bilateral == false` is not recoverable from this listing.
HighDimFilter(const std::vector<Variable>& inputs, const Dictionary& attributes, const std::wstring& name = L"HighDimFilter")
: Function(inputs, attributes, name)
if (attributes.Contains(L"bilateral"))
_bilateral = attributes[L"bilateral"].Value<bool>();
if (_bilateral == false)
if (attributes.Contains(L"theta_gamma"))
_theta_gamma = static_cast<float>(attributes[L"theta_gamma"].Value<double>());
if (attributes.Contains(L"theta_alpha"))
_theta_alpha = static_cast<float>(attributes[L"theta_alpha"].Value<double>());
if (attributes.Contains(L"theta_beta"))
_theta_beta = static_cast<float>(attributes[L"theta_beta"].Value<double>());
// Fills Tensor with two spatial features per pixel: column index and row index,
// each divided by theta_gamma. Height and width are read from shape dimensions 1 and 2.
void _compute_spatial_kernel(NDArrayViewPtr& Tensor, const float theta_gamma)
{
    auto output_kernel = Tensor->WritableDataBuffer<float>();
    auto outputShape = Tensor->Shape();
    //auto channels = outputShape[0];
    auto height = outputShape[1];
    auto width = outputShape[2];
    const auto num_pixels = width * height;
    // size_t index matches the unsigned type of num_pixels.
    for (size_t p = 0; p < num_pixels; ++p)
    {
        output_kernel[2 * p] = static_cast<float>(p % width) / theta_gamma;
        output_kernel[2 * p + 1] = static_cast<float>(p / width) / theta_gamma;
    }
}
// Fills Tensor with five features per pixel: column and row index divided by
// theta_alpha, followed by three values read from Image at offsets p,
// num_pixels + p and 2 * num_pixels + p, each divided by theta_beta.
void _compute_bilateral_kernel(NDArrayViewPtr& Tensor, const NDArrayViewPtr& Image,
                               const float theta_alpha, const float theta_beta)
{
    auto output_kernel = Tensor->WritableDataBuffer<float>();
    auto rgb = Image->DataBuffer<float>();
    auto outputShape = Tensor->Shape();
    //auto channels = outputShape[0];
    auto height = outputShape[1];
    auto width = outputShape[2];
    const auto num_pixels = height * width;
    // size_t index matches the unsigned type of num_pixels.
    for (size_t p = 0; p < num_pixels; ++p)
    {
        // Spatial terms
        output_kernel[5 * p] = static_cast<float>(p % width) / theta_alpha;
        output_kernel[5 * p + 1] = static_cast<float>(p / width) / theta_alpha;
        // Color terms
        output_kernel[5 * p + 2] = static_cast<float>(rgb[p] / theta_beta);
        output_kernel[5 * p + 3] = static_cast<float>(rgb[num_pixels + p] / theta_beta);
        output_kernel[5 * p + 4] = static_cast<float>(rgb[2 * num_pixels + p] / theta_beta);
    }
}
// Forward pass. The filtering implementation is compiled out with `#if 0`, so the
// function currently just returns nullptr.
// The disabled code allocates the output if needed, rejects non-CPU devices, builds a
// bilateral or spatial kernel depending on _bilateral, runs the permutohedral filter on
// the SCORES input and keeps the IM_INFO input in the returned back-prop state.
// The branch and preprocessor structure below is reconstructed from the visible statements.
BackPropStatePtr Forward(const std::vector<ValuePtr>& inputValues,
                         std::unordered_map<Variable, ValuePtr>& outputs,
                         const DeviceDescriptor& computeDevice,
                         const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor */) override
{
#if 0
    auto scoresShape = inputValues[Input::SCORES]->Shape();
    auto channels = scoresShape[0];
    auto height = scoresShape[1];
    auto width = scoresShape[2];
    const auto num_pixels = width * height;

    auto &outputValue = outputs[this->Output()];
    if (outputValue == nullptr)
    {
        outputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, scoresShape, computeDevice));
    }

    if (computeDevice.Type() != DeviceKind::CPU)
        throw std::runtime_error("HighDimFilter: only CPU evaluation is supported at the moment.");

    ModifiedPermutohedral mp;
    if (_bilateral)
    {
        auto &kernel_vals = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, NDShape({5, height, width}), computeDevice));
        //float* kernel_vals = new float[5 * num_pixels];
        _compute_bilateral_kernel(kernel_vals->Data(), inputValues[Input::IM_INFO]->Data(),
                                  _theta_alpha, _theta_beta);
        mp.init(kernel_vals->Data(), 5, num_pixels);
        mp.compute(outputValue->Data(), inputValues[Input::SCORES]->Data(), false);
    }
    else
    {
        auto &kernel_vals = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, NDShape({2, height, width}), computeDevice));
        _compute_spatial_kernel(kernel_vals->Data(), _theta_gamma);
        mp.init(kernel_vals->Data(), 2, num_pixels);
        mp.compute(outputValue->Data(), inputValues[Input::SCORES]->Data(), channels, false);
    }

    return MakeSharedObject<BackPropState>(this->shared_from_this(), computeDevice, std::unordered_map<Variable, ValuePtr>({ {Inputs()[Input::IM_INFO], inputValues[Input::IM_INFO]} }));
#else
    return nullptr;
#endif
}
void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override
#if 0
auto gradOutputVariable = Inputs()[Input::SCORES];
auto inputVariable = Inputs()[Input::IM_INFO];
auto &gradValue = backPropagatedGradientValuesForInputs[gradOutputVariable];
if (gradValue == nullptr)
gradValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, gradOutputVariable.Shape(), state->Device()));
auto imageData = state->SavedForwardPropValues().at(inputVariable)->Data();
auto imageShape = imageData->Shape();
auto channels = imageShape[0];
auto height = imageShape[1];
auto width = imageShape[2];
const auto num_pixels = width * height;
if (state->Device().Type() != DeviceKind::CPU)
throw std::runtime_error("HighDimFilter: only CPU evaluation is supported at the moment.");
auto rootGradientData =>Output())->Data();
ModifiedPermutohedral mp;
if (_bilateral)
auto &kernel_vals = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, NDShape({5, height, width}), state->Device()));
//float* kernel_vals = new float[5 * num_pixels];
_compute_bilateral_kernel(kernel_vals->Data(), imageData,
_theta_alpha, _theta_beta);
mp.init(kernel_vals->Data(), 5, num_pixels);
mp.compute(gradValue->Data(), rootGradientData, true);
auto &kernel_vals = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, NDShape({2, height, width}), state->Device()));
_compute_spatial_kernel(kernel_vals->Data(), _theta_gamma);
mp.init(kernel_vals->Data(), 2, num_pixels);
mp.compute(gradValue->Data(), rootGradientData, channels, true);
// Remaining Function overrides. NOTE(review): braces are missing from this listing, and
// the bodies of CurrentVersion() and InferOutputs() are not shown at all.

// Returns the operation name, L"HighDimFilterOp", held in a function-local static.
const std::wstring& OpName() const override
static const std::wstring opName = L"HighDimFilterOp";
return opName;
// Version of this user function (body not shown).
size_t CurrentVersion() const override
// Output-variable inference (body not shown).
void InferOutputs(std::vector<Variable>& /*outputs */) override
// Cloning is not implemented: returns nullptr.
FunctionPtr Clone(const std::vector<Variable>& /*clonedInputs */) override
return nullptr;
My python call looks like:
# Two calls to the high_dim_filter wrapper; the remaining keyword arguments of
# both calls are cut off in this listing.
bilateral_high_dim_filter = custom_module.high_dim_filter(image=all_ones_flat,
high_dim_filter = custom_module.high_dim_filter(image=all_ones_flat,
The questions are as follows: 1) What are the "operands" passed in to the native_user_function on initialization? Are these only passed on initialization (are they intended to be weight and bias initialization)? How are the input operands used in the "Function" construction initializer? If I set these to "None" in Python, the code crashes.
2) How do you forward propagate the filter? Just call "forward()"? What about the required arguments to forward propagate?
3) Is there a numerical gradient calculation in CNTK similar to TensorFlow to check the gradient?

How to replace an instance with another instance via pointer?

I'm doing online destructive clustering (clusters replace clustered objects) on a list of class instances (std::list).
My list of current percepUnits is std::list<percepUnit> units; and for each iteration I get a new list of input percepUnits, std::list<percepUnit> scratch;, that need to be clustered with the units.
I want to maintain a fixed number of percepUnits (so units.size() is constant), so for each new scratch percepUnit I need to merge it with the nearest percepUnit in units. Following is a code snippet that builds a list (dists) of structures (percepUnitDist) that contain pointers to each pair of items in scratch and units percepDist.scratchUnit = &(*scratchUnit); and percepDist.unit = &(*unit); and their distance. Additionally, for each item in scratch I keep track of which item in units has the least distance minDists.
// Caches the distance of every (scratch, unit) pair in `dists` and records, per
// scratch unit, the smallest distance found in `minDists`.
// For every scratch percepUnit:
for (scratchUnit = scratch.begin(); scratchUnit != scratch.end(); scratchUnit++) {
    float minDist=2025.1172; // This is the max possible distance in unnormalized CIELuv, and much larger than the normalized dist.

    // For every percepUnit:
    for (unit = units.begin(); unit != units.end(); unit++) {

        // compare pairs
        float dist = featureDist(*scratchUnit, *unit, FGBG);
        //cout << "distance: " << dist << endl;

        // Put pairs in a structure that caches their distances
        percepUnitDist percepDist;
        percepDist.scratchUnit = &(*scratchUnit); // address of where scratchUnit points to.
        percepDist.unit = &(*unit);
        percepDist.dist = dist;

        // Figure out the percepUnit that is closest to this scratchUnit.
        if (dist < minDist)
            minDist = dist;

        dists.push_back(percepDist); // append dist struct
    }
    minDists.push_back(minDist); // append the min distance to the nearest percepUnit for this particular scratchUnit.
}
So now I just need to loop through the percepUnitDist items in dists and match the distances with the minimum distances to figure out which percepUnit in scratch should be merged with which percepUnit in units. The merging process mergePerceps() creates a new percepUnit which is a weighted average of the "parent" percepUnits in scratch and units.
I want to replace the instance in the units list with the new percepUnit constructed by mergePerceps(), but I would like to do so in the context of looping through the percepUnitDists. This is my current code:
// Loop through dists and merge all the closest pairs.
// A pair is merged when its distance equals one of the recorded minimum distances and
// its scratch unit has not been flagged for removal yet.
// Loop through all dists
for (distIter = dists.begin(); distIter != dists.end(); distIter++) {

    // Loop through all minDists for each scratchUnit.
    for (minDistsIter = minDists.begin(); minDistsIter != minDists.end(); minDistsIter++) {

        // if this is the closest cluster, and the closest cluster has not already been merged, and the scratch has not already been merged.
        if (*minDistsIter == distIter->dist and not distIter->scratchUnit->remove) {

            percepUnit newUnit;
            mergePerceps(*(distIter->scratchUnit), *(distIter->unit), newUnit, FGBG);
            *(distIter->unit) = newUnit; // replace the cluster with the new merged version.

            // Flag the scratch unit so it is not merged a second time.
            distIter->scratchUnit->remove = true;
        }
    }
}
I thought that I could replace the instance in units via the percepUnitDist pointer with the new percepUnit instance using *(distIter->unit) = newUnit;, but that does not seem to be working as I'm seeing a memory leak, implying the instances in the units are not getting replaced.
How do I delete the percepUnit in the units list and replace it with a new percepUnit instance such that the new unit is located in the same location?
Here is the percepUnit class. Note the cv::Mat members. Following is the mergePerceps() function and the mergeImages() function on which it depends:
// Function to construct an accumulation.
// Produces in `merged` the sum of two zero-padded images sized to the larger width and
// height of the pair. `maskOrImage` selects a single-channel ("mask") or three-channel
// ("image") result. Both inputs are scaled in place by their weights.
void clustering::mergeImages(Mat &scratch, Mat &unit, cv::Mat &merged, const string maskOrImage, const string FGBG, const float scratchWeight, const float unitWeight) {
    int width, height, type=CV_8UC3;
    Mat scratchImagePad, unitImagePad, scratchImage, unitImage;

    // use the resolution and aspect of the largest of the pair.
    if (unit.cols > scratch.cols)
        width = unit.cols;
    else
        width = scratch.cols;

    if (unit.rows > scratch.rows)
        height = unit.rows;
    else
        height = scratch.rows;

    if (maskOrImage == "mask")
        type = CV_8UC1; // single channel mask
    else if (maskOrImage == "image")
        type = CV_8UC3; // three channel image
    else
        cout << "maskOrImage is not 'mask' or 'image'\n";

    merged = Mat(height, width, type, Scalar::all(0));
    scratchImagePad = Mat(height, width, type, Scalar::all(0));
    unitImagePad = Mat(height, width, type, Scalar::all(0));

    // weight images before summation.
    // because these pass by reference, they mess up the images in memory!
    scratch *= scratchWeight;
    unit *= unitWeight;

    // copy images into padded images.
    // (the statements that copy scratch/unit into the padded images are not part of this listing)

    merged = scratchImagePad+unitImagePad;
}
// Merge two perceps and return a new percept to replace them.
// Every merged quantity is the weighted sum scratch * scratchWeight + unit * unitWeight;
// the weights depend on FGBG. The result is written to `mergedUnit`.
void clustering::mergePerceps(percepUnit scratch, percepUnit unit, percepUnit &mergedUnit, const string FGBG) {

    Mat accumulation;
    Mat accumulationMask;
    Mat meanColour;
    int x, y, w, h, area;
    float l,u,v;
    int numMerges=0;
    std::vector<float> featuresVar; // Normalized, Sum, Variance.
    //float featuresVarMin, featuresVarMax; // min and max variance accross all features.
    float scratchWeight, unitWeight;

    if (FGBG == "FG") {
        // foreground percepts don't get merged as much.
        scratchWeight = 0.65;
        unitWeight = 1-scratchWeight;
    } else {
        scratchWeight = 0.85;
        unitWeight = 1-scratchWeight;
    }

    // Images TODO remove the meanColour if needbe.
    mergeImages(scratch.image, unit.image, accumulation, "image", FGBG, scratchWeight, unitWeight);
    mergeImages(scratch.mask, unit.mask, accumulationMask, "mask", FGBG, scratchWeight, unitWeight);
    mergeImages(scratch.meanColour, unit.meanColour, meanColour, "image", "FG", scratchWeight, unitWeight); // merge images

    // Position and size.
    x = (scratch.x1*scratchWeight) + (unit.x1*unitWeight);
    y = (scratch.y1*scratchWeight) + (unit.y1*unitWeight);
    w = (scratch.w*scratchWeight) + (unit.w*unitWeight);
    h = (scratch.h*scratchWeight) + (unit.h*unitWeight);

    // area
    area = (scratch.area*scratchWeight) + (unit.area*unitWeight);

    // colour
    l = (scratch.l*scratchWeight) + (unit.l*unitWeight);
    u = (scratch.u*scratchWeight) + (unit.u*unitWeight);
    v = (scratch.v*scratchWeight) + (unit.v*unitWeight);

    // Number of merges
    if (scratch.numMerges < 1 and unit.numMerges < 1) { // both units are patches
        numMerges = 1;
    } else if (scratch.numMerges < 1 and unit.numMerges >= 1) { // unit A is a patch, B a percept
        numMerges = unit.numMerges + 1;
    } else if (scratch.numMerges >= 1 and unit.numMerges < 1) { // unit A is a percept, B a patch.
        numMerges = scratch.numMerges + 1;
        cout << "merged scratch??" <<endl;
        // TODO this may be an impossible case.
    } else { // both units are percepts
        numMerges = scratch.numMerges + unit.numMerges;
        cout << "Merging two already merged Percepts" <<endl;
        // TODO this may be an impossible case.
    }

    // Create unit.
    mergedUnit = percepUnit(accumulation, accumulationMask, x, y, w, h, area); // time is the earliest value in times?
    mergedUnit.l = l; // members not in the constructor.
    mergedUnit.u = u;
    mergedUnit.v = v;
    mergedUnit.numMerges = numMerges;
    mergedUnit.meanColour = meanColour;
    mergedUnit.pActivated = unit.pActivated; // new clusters retain parent's history of activation.
    mergedUnit.scratch = false;
    mergedUnit.habituation = unit.habituation; // we inherit the habituation of the cluster we merged with.
}
Changing the copy and assignment operators had performance side-effects and did not seem to resolve the problem. So I've added a custom function to do the replacement, which, just like the copy operator, makes copies of each member and makes sure those copies are deep. The problem is that I still end up with a leak.
So I've changed this line: *(distIter->unit) = newUnit;
to this: (*(distIter->unit)).clone(newUnit)
Where the clone method is as follows:
// Deep Copy of members
void percepUnit::clone(const percepUnit &source) {
// Deep copy of Mats
this->image = source.image.clone();
this->mask = source.mask.clone();
this->alphaImage = source.alphaImage.clone();
this->meanColour = source.meanColour.clone();
// shallow copies of everything else
this->alpha = source.alpha;
this->fadingIn = source.fadingIn;
this->fadingHold = source.fadingHold;
this->fadingOut = source.fadingOut;
this->l = source.l;
this->u = source.u;
this->v = source.v;
this->x1 = source.x1;
this->y1 = source.y1;
this->w = source.w;
this->h = source.h;
this->x2 = source.x2;
this->y2 = source.y2;
this->cx =;
this->cy =;
this->numMerges = source.numMerges;
this->id =;
this->area = source.area;
this->features = source.features;
this->featuresNorm = source.featuresNorm;
this->remove = source.remove;
this->fgKnockout = source.fgKnockout;
this->colourCalculated = source.colourCalculated;
this->normalized = source.normalized;
this->activation = source.activation;
this->activated = source.activated;
this->pActivated = source.pActivated;
this->habituation = source.habituation;
this->scratch = source.scratch;
this->FGBG = source.FGBG;
And yet, I still see a memory increase. The increase does not happen if I comment out that single replacement line. So I'm still stuck.
I can prevent memory from increasing if I disable the cv::Mat cloning code in the function above:
// Deep Copy of members
void percepUnit::clone(const percepUnit &source) {
/* try releasing Mats first?
// No effect on memory increase, but the refCount is decremented.
/* Deep copy of Mats
this->image = source.image.clone();
this->mask = source.mask.clone();
this->alphaImage = source.alphaImage.clone();
this->meanColour = source.meanColour.clone();*/
// shallow copies of everything else
this->alpha = source.alpha;
this->fadingIn = source.fadingIn;
this->fadingHold = source.fadingHold;
this->fadingOut = source.fadingOut;
this->l = source.l;
this->u = source.u;
this->v = source.v;
this->x1 = source.x1;
this->y1 = source.y1;
this->w = source.w;
this->h = source.h;
this->x2 = source.x2;
this->y2 = source.y2;
this->cx =;
this->cy =;
this->numMerges = source.numMerges;
this->id =;
this->area = source.area;
this->features = source.features;
this->featuresNorm = source.featuresNorm;
this->remove = source.remove;
this->fgKnockout = source.fgKnockout;
this->colourCalculated = source.colourCalculated;
this->normalized = source.normalized;
this->activation = source.activation;
this->activated = source.activated;
this->pActivated = source.pActivated;
this->habituation = source.habituation;
this->scratch = source.scratch;
this->FGBG = source.FGBG;
While I still can't explain this issue, I did notice another hint. I realized that this leak can also be stopped if I don't normalize those features I use to cluster via featureDist() (but continue to clone cv::Mats). The really odd thing is that I rewrote that code entirely and still the problem persists.
Here is the featureDist function:
/**
 * Distance between two units in feature space.
 *
 * Accumulates sqrt(|a_i - b_i|) over the selected features and returns the
 * square of that sum.
 *
 * @param unitA  First unit.
 * @param unitB  Second unit; presumably has the same feature layout as
 *               unitA - TODO confirm, only unitA's row count bounds the loop.
 * @param FGBG   "BG" compares every row of featuresNorm; "FG" compares rows
 *               4 and up of features.
 * @return The distance, or 0 (after printing a message) when FGBG is neither
 *         "FG" nor "BG".
 */
float clustering::featureDist(percepUnit unitA, percepUnit unitB, const string FGBG) {
    float distance = 0;

    if (FGBG == "BG") {
        for (int i = 0; i < unitA.featuresNorm.rows; i++) {
            distance += pow(std::abs(unitA.featuresNorm.at<float>(i) - unitB.featuresNorm.at<float>(i)), 0.5);
            //cout << "unitA.featuresNorm[" << i << "]: " << unitA.featuresNorm[i] << endl;
            //cout << "unitB.featuresNorm[" << i << "]: " << unitB.featuresNorm[i] << endl;
        }
    // for FG, don't use normalized colour features.
    // TODO To include the area use i=4
    } else if (FGBG == "FG") {
        // NOTE(review): reading from `features` here matches the loop bound
        // (features.rows) - confirm this is the intended matrix.
        for (int i = 4; i < unitA.features.rows; i++) {
            distance += pow(std::abs(unitA.features.at<float>(i) - unitB.features.at<float>(i)), 0.5);
        }
    } else {
        cout << "FGBG argument was not FG or BG, returning 0." <<endl;
        return 0;
    }

    return pow(distance, 2);
}
Features used to be a vector of floats, and thus the normalization code was as follows:
/**
 * Min/max-normalize every feature across all units of both lists
 * (vector<float> feature version).
 *
 * For each feature index the minimum and maximum over all units are found,
 * then each unit's featuresNorm[i] is set to (features[i] - min) / (max - min)
 * and its `normalized` flag is set.
 *
 * @param scratch  Units to normalize; modified in place.
 * @param units    Units to normalize; modified in place.
 *
 * Does nothing when both lists are empty. A feature with the same value in
 * every unit is normalized to 0 rather than dividing by zero.
 * Assumes featuresNorm already holds at least as many elements as features -
 * TODO confirm against percepUnit's constructor.
 */
void clustering::normalize(list<percepUnit> &scratch, list<percepUnit> &units) {
    list<percepUnit>::iterator unit;
    list<percepUnit*>::iterator unitPtr;
    vector<float> min, max;
    list<percepUnit*> masterList; // list of pointers.

    // generate pointers
    for (unit = scratch.begin(); unit != scratch.end(); unit++)
        masterList.push_back(&(*unit)); // add pointer to where unit points to.
    for (unit = units.begin(); unit != units.end(); unit++)
        masterList.push_back(&(*unit)); // add pointer to where unit points to.

    // front() on an empty list is undefined behaviour.
    if (masterList.empty())
        return;

    int numFeatures = masterList.front()->features.size(); // all percepts have the same number of features.
    // allocate for the number of features we have; both vectors are indexed below.
    min.resize(numFeatures);
    max.resize(numFeatures);

    // Loop through all units to get feature values
    for (int i=0; i<numFeatures; i++) {
        min[i] = masterList.front()->features[i]; // starting point.
        max[i] = min[i];

        // calculate min and max for each feature.
        for (unitPtr = masterList.begin(); unitPtr != masterList.end(); unitPtr++) {
            if ((*unitPtr)->features[i] < min[i])
                min[i] = (*unitPtr)->features[i];
            if ((*unitPtr)->features[i] > max[i])
                max[i] = (*unitPtr)->features[i];
        }
    }

    // Normalize features according to min/max.
    for (int i=0; i<numFeatures; i++) {
        const float range = max[i] - min[i];
        for (unitPtr = masterList.begin(); unitPtr != masterList.end(); unitPtr++) {
            // A constant feature has range 0; map it to 0 instead of producing NaN/inf.
            (*unitPtr)->featuresNorm[i] = (range != 0) ? ((*unitPtr)->features[i] - min[i]) / range : 0;
            (*unitPtr)->normalized = true;
        }
    }
}
I changed the features type to a cv::Mat so I could use the opencv normalization function, so I rewrote the normalization function as follows:
/**
 * Min/max-normalize every feature across all units of both lists
 * (cv::Mat feature version).
 *
 * For each feature row i, the values of all units are gathered into a 1xN
 * row matrix (units first, then scratch), normalized in place to [0, 1] with
 * cv::normalize(NORM_MINMAX), and written back to each unit's featuresNorm
 * in the same order.
 *
 * @param scratch  Units to normalize; modified in place.
 * @param units    Units to normalize; modified in place.
 *
 * Does nothing when both lists are empty. Assumes every unit's features and
 * featuresNorm are single-column CV_32F matrices with the same number of
 * rows - TODO confirm against percepUnit's constructor.
 */
void clustering::normalize(list<percepUnit> &scratch, list<percepUnit> &units) {
    // Dereferencing begin() of an empty list is undefined behaviour.
    if (units.empty() && scratch.empty())
        return;

    Mat featureMat = Mat(1, static_cast<int>(units.size() + scratch.size()), CV_32FC1, Scalar(0));
    list<percepUnit>::iterator unit;

    // Take the feature count from whichever list has a first element.
    const int numFeatures = units.empty() ? scratch.begin()->features.rows
                                          : units.begin()->features.rows;

    // For each feature
    for (int i=0; i<numFeatures; i++) {
        int j=0;
        float value;

        // for each unit in units
        for (unit = units.begin(); unit != units.end(); unit++) {
            // Populate featureMat j is the unit index, i is the feature index.
            value = unit->features.at<float>(i);
            featureMat.at<float>(j) = value;
            j++;
        }

        // for each unit in scratch
        for (unit = scratch.begin(); unit != scratch.end(); unit++) {
            // Populate featureMat j is the unit index, i is the feature index.
            value = unit->features.at<float>(i);
            featureMat.at<float>(j) = value;
            j++;
        }

        // Normalize this featureMat in place
        cv::normalize(featureMat, featureMat, 0, 1, NORM_MINMAX);

        // set normalized values in percepUnits from featureMat
        j = 0; // restart at the first column so values go back in the order they were gathered.

        // for each unit in units
        for (unit = units.begin(); unit != units.end(); unit++) {
            // Populate percepUnit featuresNorm, j is the unit index, i is the feature index.
            value = featureMat.at<float>(j);
            unit->featuresNorm.at<float>(i) = value;
            j++;
        }

        // for each unit in scratch
        for (unit = scratch.begin(); unit != scratch.end(); unit++) {
            // Populate percepUnit featuresNorm, j is the unit index, i is the feature index.
            value = featureMat.at<float>(j);
            unit->featuresNorm.at<float>(i) = value;
            j++;
        }
    }
}
I can't understand what the interaction between mergePerceps and normalization is, especially since normalization is an entirely rewritten function.
Massif and my /proc memory reporting don't agree. Massif says there is no effect of normalization on memory usage, only commenting out the percepUnit::clone() operation bypasses the leak.
Here is all the code, in case the interaction is somewhere else I am missing.
Here is another version of the same code with the dependence on OpenCV GPU removed, to facilitate testing...
It was recommended by Nghia (on the opencv forum) that I try and make the percepts a constant size. Sure enough, if I fix the dimensions and type of the cv::Mat members of percepUnit, then the leak disappears.
So it seems to me this is a bug in OpenCV that affects calling clone() and copyTo() on Mats of different sizes that are class members. So far I have been unable to reproduce it in a simple program. The leak does seem small enough that it may be the headers leaking, rather than the underlying image data.