I have the following neural network code. I'm just trying to work my way up from basic problems, such as the XOR problem, while building up a codebase. This is a hobby project.
#include <iostream>
#include <array>
#include <random>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <algorithm>
#include <cmath>
#include <limits>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
static DataType learningRate = 0.02;
static std::size_t numberEpochs = 1000000;
DataType sigmoid(const DataType& x)
{
return DataType(1) / (DataType(1) + std::exp(-x));
}
template<typename T>
class Random
{
public:
T operator()()
{
return m_dis(m_mt);
}
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<T> m_dis;
};
template<typename T> std::mt19937 Random<T>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template<typename T> std::uniform_real_distribution<T> Random<T>::m_dis(0,1);
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction)
:
m_activationFunction(activationFunction)
{
Random<DataType> r;
std::generate(m_weights.begin(),m_weights.end(),[&]()
{
return r();
});
m_biasWeight = r();
}
void FeedForward(const std::array<DataType,NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for(std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
DataType GetNetInput() const
{
return m_netInput;
}
std::array<DataType,NumInputs> Backpropagate(const DataType& error,
const std::array<DataType,NumInputs>& inputValues,
std::array<DataType,NumInputs+1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_output * (DataType(1) - m_output); // sigmoid derivative
std::array<DataType,NumInputs> netInputOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
netInputOverWeight[i] = inputValues[i];
}
DataType netInputOverBias = DataType(1);
std::array<DataType,NumInputs> errorOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
}
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for(std::size_t i = 0; i < NumInputs; ++i)
{
weightAdjustments[i] = errorOverWeight[i];
}
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType,NumInputs> errorWeights;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorWeights[i] = errorOverNetInput * m_weights[i];
}
return errorWeights;
}
void AdjustWeights(const std::array<DataType,NumInputs+1>& adjustments)
{
for(std::size_t i = 0; i < NumInputs; ++i)
m_weights[i] = m_weights[i] - learningRate * adjustments[i];
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs];
}
const std::array<DataType,NumInputs> GetWeights() const {return m_weights;}
const DataType& GetBiasWeight() const { return m_biasWeight; }
protected:
std::array<DataType,NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
DataType m_output;
DataType m_netInput;
};
int main()
{
std::array<std::array<DataType,2>,4> inputData = {{{0,0},{0,1},{1,0},{1,1}}};
std::array<std::array<DataType,1>,4> desiredOutputs = {{{0},{1},{1},{0}}};
std::array<Neuron<2>*,2> hiddenLayer1 = {{ new Neuron<2>(sigmoid), new Neuron<2>(sigmoid) }};
std::array<Neuron<2>*,1> outputLayer = {{ new Neuron<2>(sigmoid) }};
std::cout << std::fixed << std::setprecision(80);
DataType minError = std::numeric_limits<DataType>::max();
bool minErrorFound = false;
std::size_t epochNumber = 0;
while(epochNumber < numberEpochs && !minErrorFound)
{
DataType epochMSE = 0;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0,2.f);
epochMSE += totalError * totalError;
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType,3> weightAdjustmentsOutput;
std::array<DataType,2> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0,output1},
weightAdjustmentsOutput);
std::array<DataType,3> weightAdjustmentsHidden1;
hiddenLayer1[0]->Backpropagate(outputError[0],dataRow,weightAdjustmentsHidden1);
std::array<DataType,3> weightAdjustmentsHidden2;
hiddenLayer1[1]->Backpropagate(outputError[1],dataRow,weightAdjustmentsHidden2);
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden2);
}
epochMSE *= DataType(1) / inputData.size();
if(epochMSE >= minError + 0.00000001)
{
minErrorFound = true;
}
else
minError = epochMSE;
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Minimum error: " << minError << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
std::cout << finalOutput0 << std::endl;
}
return 0;
}
Most of the time, the output looks like this, and I think "great! success!"
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.00000000106923325748908837340422905981540679931640625000000000000000000000000000
Number epochs: 1000000/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.01
0.00,1.00 (1.00) : 0.99
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.01
Process returned 0 (0x0) execution time : 0.992 s
Press any key to continue.
But then occasionally I get the following output, which I want to understand: is this overfitting, or underfitting, or have I done something wrong somewhere? How can I prevent this?
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.00787912402302026748657226562500000000000000000000000000000000000000000000000000
Number epochs: 1000000/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.01
0.00,1.00 (1.00) : 0.50
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.50
Process returned 0 (0x0) execution time : 1.024 s
Press any key to continue.
I have tried using more or fewer epochs along with a higher or lower learning rate, but I still occasionally get a result like the above (not always exactly the same, but similar). For example, with a learning rate of 0.002 and 1000000 epochs, I occasionally get the following:
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.01417684461921453475952148437500000000000000000000000000000000000000000000000000
Number epochs: 176477/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.29
0.00,1.00 (1.00) : 0.59
1.00,0.00 (1.00) : 0.59
1.00,1.00 (0.00) : 0.63
Process returned 0 (0x0) execution time : 0.225 s
Press any key to continue.
I see how it exited early because the error grew rather than shrank, but is that because I exited early when I shouldn't have?
You have done nothing wrong. Notice that you get different results even after training your network with the same number of epochs and the same training data. Overfitting would be the cause if the badly behaving network had been trained with more epochs and/or more training data; underfitting is the opposite of that. You have neither underfitting nor overfitting. You could try lowering your learning rate by an order of magnitude or at least by half, increasing it, changing the training function, or adding momentum. It is important to know that training neural networks is a very empirical process: if your trained network passes validation then it's OK; if not, tweak it a bit and retrain, or just retrain. There is no closed-form formula, solution, or recipe for their design.
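To illustrate the momentum suggestion concretely, here is a minimal sketch (my illustration, not your code; momentum and m_previousAdjustments are hypothetical additions to your Neuron class):

static DataType momentum = 0.9; // assumed value, tune as needed

// Hypothetical momentum variant of AdjustWeights: each update also adds a
// fraction of the previous step, which helps the weights roll out of the
// shallow plateaus where runs like your 0.50/0.50 outputs get stuck.
void AdjustWeights(const std::array<DataType,NumInputs+1>& adjustments)
{
    // m_previousAdjustments is a new, zero-initialised member:
    // std::array<DataType,NumInputs+1> m_previousAdjustments{};
    for(std::size_t i = 0; i <= NumInputs; ++i)
    {
        DataType step = learningRate * adjustments[i] + momentum * m_previousAdjustments[i];
        if(i < NumInputs)
            m_weights[i] -= step;
        else
            m_biasWeight -= step;
        m_previousAdjustments[i] = step;
    }
}

Whether this helps on any particular run still depends on the random initialisation; momentum only makes the bad runs less likely, not impossible.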
As I cited in previous questions:
Is it possible to generate multiple custom vertices using the Bundle Properties from Boost Graph Library?
Boost Maximum Weighted Matching in undirected bipartite random graphs hangs in an infinite loop
I'm working on an application benchmark that compares the performance of the Boost maximum weighted matching algorithm and of the auction algorithm for the transportation problem, when solving the assignment problem on bipartite graphs.
Currently I've implemented a version of the auction algorithm using the bundle properties of the Boost Graph Library; this implementation is inspired by a vector version from GitHub. I've done this in order to put both algorithms on the same level and make a fair benchmark. Here it is:
#include "../include/Auction.h"
#include "../include/BipartiteGraph.h"
void auction_algorithm(Graph& graph, const int& n, duration& elapsed) {
const Weight eps = 1;
int unassigned_bidders = n;
GraphProp& gp = graph[boost::graph_bundle];
EdgeFilter any_interconnect = boost::keep_all{};
VertexFilter bidders = [graph](V v) -> bool { return boost::get<Bidder>(&(graph)[v]); };
VertexFilter items = [graph](V v) -> bool { return boost::get<Item>(&(graph)[v]); };
FMap map_bidders = FMap(graph, any_interconnect, bidders);
FMap map_items = FMap(graph, any_interconnect, items);
auto iterator_bidder = boost::make_iterator_range(boost::vertices(map_bidders));
auto iterator_item = boost::make_iterator_range(boost::vertices(map_items));
auto t_start = now();
while (unassigned_bidders > 0) {
for (auto uncasted_bidder : iterator_bidder) {
if (gp.bidder2item[static_cast<int>(uncasted_bidder)] != -1) continue;
Bidder* bidder = boost::get<Bidder>(&graph[uncasted_bidder]);
// 1 Bid
int id_item1 = -1;
Weight val_item1 = -1;
Weight val_item2 = -1;
for (auto uncasted_item : iterator_item) {
Item* item = boost::get<Item>(&graph[static_cast<int>(uncasted_item)]);
Weight val = boost::get(boost::edge_weight_t(), graph, (boost::edge(uncasted_bidder, uncasted_item, graph)).first) - item->cost;
if (val > val_item1) {
val_item2 = val_item1;
val_item1 = val;
id_item1 = item->id;
}
else if (val > val_item2) {
val_item2 = val;
}
}
bidder->best_item = id_item1 + n;
bidder->val_first_best_item = val_item1;
bidder->val_second_best_item = val_item2;
// 2 Compete
Weight bid = bidder->val_first_best_item - bidder->val_second_best_item + eps;
auto best_item = boost::get<Item>(&graph[bidder->best_item]);
if (bid > best_item->high_bid) {
best_item->high_bid = bid;
best_item->high_bidder = bidder->id;
}
}
// 3 Assign
for (auto uncasted_item : iterator_item) {
Item* item = boost::get<Item>(&graph[uncasted_item]);
if (item->high_bid == -1) continue;
item->cost += item->high_bid;
if (gp.item2bidder[item->id] != -1) {
gp.bidder2item[gp.item2bidder[item->id]] = -1;
unassigned_bidders++;
}
gp.item2bidder[item->id] = item->high_bidder;
gp.bidder2item[gp.item2bidder[item->id]] = item->id;
unassigned_bidders--;
}
}
elapsed = now() - t_start;
}
Weight perform_au(Graph& graph, duration& elapsed) {
int n = int(boost::num_vertices(graph) / 2);
Weight total_cost_auction = 0;
auction_algorithm(graph, n, elapsed);
std::cout << "\nThe matching is: ";
for (int bidder = 0; bidder < n; ++bidder) {
std::cout << "(" << bidder << "," << graph[boost::graph_bundle].bidder2item[bidder] << ")";
int item = graph[boost::graph_bundle].bidder2item[bidder];
total_cost_auction += boost::get(boost::edge_weight_t(), graph, (boost::edge(bidder, item + n, graph)).first);
}
std::cout << "\n";
return total_cost_auction;
}
I have compared this to the vector implementation and noticed that the latter is much faster than mine (however, they return the same total cost). Is it due to the complexity of boost::get? If so, why is it so heavy?
I'm using the g++ compiler on an Ubuntu machine, and to compile the application I run the following line in my console:
g++ -std=c++2a -o ../bin/app BipartiteGraph.cpp MaximumWeightedMatching.cpp Auction.cpp AuctionArray.cpp Main.cpp
I'm sharing the link to my GitHub repository so you can have a look at the whole project.
PS: If you have any suggestions for speeding up the algorithm, that would be great!
UPDATE: 09/08/2022
Requirement: make the auction algorithm generic, in the style of the Boost Graph Library. This is the latest implementation I've made.
UPDATE: 10/08/2022
I've made a class that keeps everything the way it was before with the bundle properties:
UPDATE: 14/08/2022
Actual version
Weight perform_au(const Graph& graph, Duration& elapsed, int& n_iteration_au, bool verbose)
{
int n = int(boost::num_vertices(graph) / 2);
std::vector<int> assignments(n);
Auction<Graph, Weight> auction_problem(n);
auto t_start = now();
auction_problem.auction_algorithm(graph, assignments);
elapsed = now() - t_start;
std::cout << " Finished \nThe matching is: ";
for (int bidder = 0; bidder < n; ++bidder)
std::cout << "(" << bidder << "," << assignments[bidder] << ")";
std::cout << "\n";
if (verbose) auction_problem.printProprieties();
n_iteration_au = auction_problem.getNIterationAu();
return auction_problem.getTotalCost(graph);
}
#ifndef _AA_H
#define _AA_H
#include <vector>
#include <unordered_map>
#include <boost/graph/adjacency_list.hpp>
template<typename T>
using AdjacencyIterator = boost::graph_traits<T>::adjacency_iterator;
template<typename Graph, typename Type>
class Auction
{
private:
struct Bidder {
int best_item = -1;
double val_first_best_item = -1;
double val_second_best_item = -1;
};
struct Item {
double cost = 0;
int high_bidder = -1;
double high_bid = -1;
};
int n_iteration_au = 0;
int vertices = 0;
std::unordered_map<int, Bidder> unassigned_bidder;
std::unordered_map<int, Bidder> assigned_bidder;
std::unordered_map<int, Item> item_map;
bool is_assignment_problem(const Graph& graph);
void auctionRound(const Graph& graph, const double& eps, const auto& vertex_idMap);
public:
void auction_algorithm(const Graph& graph, std::vector<int>& ass);
int getNIterationAu();
Type getTotalCost(const Graph& graph);
void printProprieties();
Type getMaximumEdge(const Graph& graph);
void reset();
Auction(int vertices)
{
this->vertices = vertices;
for (int i : boost::irange(0, vertices))
{
this->unassigned_bidder.insert(std::make_pair(i, Bidder{}));
this->item_map.insert(std::make_pair(i, Item{}));
}
}
};
template<typename Graph, typename Type>
inline int Auction<Graph, Type>::getNIterationAu() { return n_iteration_au; }
template<typename Graph, typename Type>
Type Auction<Graph, Type>::getMaximumEdge(const Graph& graph)
{
Type max = 0;
typedef boost::graph_traits<Graph>::edge_iterator edge_iterator;
std::pair<edge_iterator, edge_iterator> ei = boost::edges(graph);
for (edge_iterator edge_iter = ei.first; edge_iter != ei.second; ++edge_iter)
if (boost::get(boost::edge_weight_t(), graph, *edge_iter) > max)
max = boost::get(boost::edge_weight_t(), graph, *edge_iter);
return max;
}
template<typename Graph, typename Type>
inline Type Auction<Graph, Type>::getTotalCost(const Graph& graph)
{
Type total_cost_auction = 0;
for (int bidder = 0; bidder < vertices; ++bidder)
total_cost_auction += boost::get(boost::edge_weight_t(), graph, (boost::edge(bidder, assigned_bidder[bidder].best_item + vertices, graph)).first);
return total_cost_auction;
}
template<typename Graph, typename Type>
bool Auction<Graph, Type>::is_assignment_problem(const Graph& graph)
{
for (auto v1 : boost::make_iterator_range(boost::vertices(graph)))
{
AdjacencyIterator<Graph> ai, a_end;
boost::tie(ai, a_end) = boost::adjacent_vertices(v1, graph);
if (ai == a_end) return false;
else
for (auto v2 : boost::make_iterator_range(ai, a_end))
if ((v1 < vertices && v2 < vertices) || (v1 > vertices && v2 > vertices))
return false;
}
return true;
}
template<typename Graph, typename Type>
inline void Auction<Graph, Type>::printProprieties()
{
for (auto& bidder : assigned_bidder)
std::cout << "|Bidder:" << bidder.first << "|Best item:" << bidder.second.best_item << "|Value first best item:" << bidder.second.val_first_best_item << "|Value second best item:" << bidder.second.val_second_best_item << "|\n";
for (auto& item : item_map)
std::cout << "|Item:" << item.first << "|Cost:" << item.second.cost << "|Higher bidder:" << item.second.high_bidder << "|Higher bid:" << item.second.high_bid << "|\n";
}
template<typename Graph, typename Type>
void Auction<Graph, Type>::auctionRound(const Graph& graph, const double& eps, const auto& vertex_idMap)
{
for (auto& bidder : unassigned_bidder)
{
int id_item1 = -1;
double val_item1 = -1;
double val_item2 = -1;
AdjacencyIterator<Graph> ai, a_end;
boost::tie(ai, a_end) = boost::adjacent_vertices(vertex_idMap[bidder.first], graph);
for (auto item : boost::make_iterator_range(ai, a_end)) // iterate starting from the ones that have fewer vertices?
{
double val = (boost::get(boost::edge_weight_t(), graph, (boost::edge(bidder.first, static_cast<int>(item), graph)).first)) // * (vertices))
- item_map[static_cast<int>(item) - vertices].cost;
if (val > val_item1)
{
val_item2 = val_item1;
val_item1 = val;
id_item1 = static_cast<int>(item) - vertices;
}
else if (val > val_item2) val_item2 = val;
}
bidder.second.best_item = id_item1;
bidder.second.val_second_best_item = val_item2;
bidder.second.val_first_best_item = val_item1;
double bid = bidder.second.val_first_best_item - bidder.second.val_second_best_item + eps;
if (item_map.find(bidder.second.best_item) != item_map.end())
{
if (bid > item_map[bidder.second.best_item].high_bid)
{
item_map[bidder.second.best_item].high_bid = bid;
item_map[bidder.second.best_item].high_bidder = bidder.first;
}
}
}
for (auto& item : item_map)
{
if (item.second.high_bid == -1) continue;
item.second.cost += item.second.high_bid;
int id_to_remove = -1;
for (auto& ass_bidr : assigned_bidder)
{
if (ass_bidr.second.best_item == item.first)
{
id_to_remove = ass_bidr.first;
break;
}
}
if (id_to_remove != -1)
{
unassigned_bidder.insert(std::make_pair(id_to_remove, assigned_bidder[id_to_remove]));
assigned_bidder.erase(id_to_remove);
}
assigned_bidder.insert(std::make_pair(item.second.high_bidder, unassigned_bidder[item.second.high_bidder]));
unassigned_bidder.erase(item.second.high_bidder);
}
}
template<typename Graph, typename Type>
void Auction<Graph, Type>::auction_algorithm(const Graph& graph, std::vector<int>& ass)
{
if (!is_assignment_problem(graph)) throw("Not an assignment problem");
auto vertex_idMap = boost::get(boost::vertex_index, graph);
double eps = static_cast<double>(1.0 / (vertices + 1));
while (unassigned_bidder.size() > 0)
{
auctionRound(graph, eps, vertex_idMap);
n_iteration_au += 1;
}
for (auto& a : assigned_bidder) ass[a.first] = a.second.best_item;
}
#endif
Why would it not be heavy?
Again,
FMap map_bidders = FMap(graph, any_interconnect, bidders);
FMap map_items = FMap(graph, any_interconnect, items);
Just "wishing" things to be a property map doesn't make them so.
Also, your filter predicates:
EdgeFilter any_interconnect = boost::keep_all{};
VertexFilter bidders = [graph](V v) -> bool { return boost::get<Bidder>(&(graph)[v]); };
VertexFilter items = [graph](V v) -> bool { return boost::get<Item>(&(graph)[v]); };
FMap map_bidders = FMap(graph, any_interconnect, bidders);
FMap map_items = FMap(graph, any_interconnect, items);
They...
copy the entire graph(!), twice
uselessly get<> a variant element, just to discard it and return bool
Slightly better:
VertexFilter bidders = [&graph](V v) -> bool {
return graph[v].which() == 0;
};
VertexFilter items = [&graph](V v) -> bool {
return graph[v].which() == 1;
};
FMap map_bidders = FMap(graph, {}, bidders);
FMap map_items = FMap(graph, {}, items);
But it's all kind of useless. I'm not surprised this stuff takes time, because you know your graph is structured as (N bidders)(N items), so
auto iterator_bidder = boost::make_iterator_range(vertices(map_bidders));
auto iterator_item = boost::make_iterator_range(vertices(map_items));
Could/should just be:
auto [b,e] = vertices(graph);
auto iterator_bidder = boost::make_iterator_range(b, b + n);
auto iterator_item = boost::make_iterator_range(b + n, e);
And even those are overkill, since your vertex descriptor is integral anyways:
auto const bidders = boost::irange(0, n);
auto const items = boost::irange(n, 2 * n);
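For illustration only (a sketch of mine, not part of the code above), the main loops could then run over those plain index ranges with no filtered graphs at all:

    // Assumes, as noted above, that vertices 0..n-1 are bidders and n..2n-1 are items.
    for (auto bidder_v : boost::irange(0, n)) {
        if (gp.bidder2item[bidder_v] != -1) continue;
        for (auto item_v : boost::irange(n, 2 * n)) {
            // same bidding logic as before, using graph[bidder_v] / graph[item_v] directly
        }
    }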
I'll read some more later (family time first), because I'm already noticing more (e.g. why is listS used as the edge container selector?).
Will post here when done.
I'm trying to write a 2D cross-correlation in SYCL and oneAPI.
The idea is to write a kind of Map skeleton which wraps the oneAPI calls, hiding hardware-targeting issues behind a parameter specifying the kind of target (CPU or GPU/accelerator).
This is my Map class:
//Definition of Map Skeleton
template<class Tin, class Tout, class Function>
class Map {
private:
Function fun;
public:
Map() {
}
Map(Function f) :
fun(f) {
}
//Overriding () operator
std::vector<std::vector<Tout>> operator()(bool use_tbb,
std::vector<std::vector<Tin>> &img,
std::vector<std::vector<Tin>> &ker) {
int img_row = img.size();
int img_col = img[0].size();
int filt_row = ker.size();
int filt_col = ker[0].size();
int out_row = img_row - filt_row;
int out_col = img_col - filt_col;
std::vector<std::vector<Tout>> out;
if (use_tbb) {
uTimer *timer = new uTimer("Executing Code On CPU");
tbb::parallel_for(
tbb::blocked_range2d<int, int>(0, out_row, 0, out_col),
[&](tbb::blocked_range2d<int, int> &t) {
for (int n = t.rows().begin(); n < t.rows().end();
++n) {
for (int m = t.cols().begin(); m < t.cols().end();
++m) {
out[n][m] = fun(
slice_matrix(img, n, m, filt_row,
filt_col), ker);
}
}
});
timer->~uTimer();
return out;
} else {
/*change 2D Matrices to the 1D linear arrays,
*
*and operate on them as contiguous blocks */
size_t M = img_row + img_col;
size_t N = filt_row + filt_col;
//size_t O = out_row + out_col;
size_t O_row = out_row;
size_t O_col = out_col;
std::vector<Tin> img_host;
std::vector<Tin> ker_host;
std::vector<Tout> out_gpu;
/* A 2D std::vector<std::vector<T>>
* does not have elements stored contiguously in the memory.
* Thus I define a vector<T> and operate on them as contiguous blocks.*/
//Define Buffer for
sycl::buffer<Tin, 1> img_buffer(img_host.data(), M);
sycl::buffer<Tin, 1> ker_buffer(ker_host.data(), N);
sycl::buffer<Tin, 2> out_buffer(out_gpu.data(), sycl::range<2> {
O_row, O_col });
//Profiling GPU
// Initialize property list with profiling information
sycl::property_list propList {
sycl::property::queue::enable_profiling() };
// Build the command queue (constructed to handle event profling)
sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
propList);
// print out the device information used for the kernel code
std::cout << "Device: "
<< gpuQueue.get_device().get_info<sycl::info::device::name>()
<< std::endl;
std::cout << "Compute Units: "
<< gpuQueue.get_device().get_info<
sycl::info::device::max_compute_units>()
<< std::endl;
auto start_overall = std::chrono::system_clock::now();
auto event = gpuQueue.submit(
[&](sycl::handler &h) {
//local copy of fun
auto f = fun;
sycl::accessor img_accessor(img_buffer, h,
sycl::read_only);
sycl::accessor ker_accessor(ker_buffer, h,
sycl::read_only);
sycl::accessor out_accessor(out_buffer, h,
sycl::write_only);
h.parallel_for(sycl::range<2> { O_row, O_col },
[=](sycl::id<2> index) {
int row = index[0];
int col = index[1];
out_accessor[row][col] = f(
slice_matrix(img_accessor, O_row,
O_col, filt_row, filt_col),
ker_accessor);
});
});
event.wait();
auto end_overall = std::chrono::system_clock::now();
cl_ulong submit_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_submit>();
cl_ulong start_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_start>();
cl_ulong end_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_end>();
auto submission_time = (start_time - submit_time) / 1000000.0f;
std::cout << "Submit Time: " << submission_time << " ms"
<< std::endl;
auto execution_time = (end_time - start_time) / 1000000.0f;
std::cout << "Execution Time: " << execution_time << " ms"
<< std::endl;
auto execution_overall = std::chrono::duration_cast<
std::chrono::milliseconds>(end_overall - start_overall);
std::cout << "Overall Execution Time: " << execution_overall.count()
<< " ms" << std::endl;
}
;
return out;
}
};
And this is my slice_matrix:
//Function which slices a specific part of my matrix
template<class T>
std::vector<std::vector<T>> slice_matrix(std::vector<std::vector<T>> mat, int i,
int j, int r, int c) {
std::vector<std::vector<T>> out(r, std::vector<T>(c, 0));
for (int k = 0; k < r; k++) {
std::vector<T> temp(mat[i + k].begin() + j, mat[i + k].begin() + j + c);
out[k] = temp;
}
return out;
}
;
The problem is that, in the SYCL part inside the parallel_for,
out_accessor[row][col] = f(
slice_matrix(img_accessor, O_row,
O_col, filt_row, filt_col),
ker_accessor);
});
the compiler gives me an error, which is:
no matching function for call to 'slice_matrix'
I tried to put my slice_matrix inside the Map class, but nothing changed. I also thought about the SYCL limitation that "SYCL device code, as defined by this specification, does not support virtual function calls", so I defined a local copy of slice_matrix, but again I got an error.
I cannot understand how to resolve this error.
I cannot understand how to resolve this error.
You are passing a sycl::accessor type to slice_matrix, but the signature of slice_matrix is:
//Function which slices a specific part of my matrix
template<class T>
std::vector<std::vector<T>> slice_matrix(std::vector<std::vector<T>> mat, int i, int j, int r, int c)
So the signature does not match...
You would need a version of slice_matrix that takes an accessor object instead of your vector.
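For example, here is a minimal sketch (an assumption on my part, not code from the question) that avoids the slice entirely and works on the linearised 1D accessors. std::vector cannot be used inside device code anyway (no heap allocation there), so computing the element directly is the simpler route; it also assumes fun is the usual multiply-accumulate of a cross-correlation:

    // Hypothetical helper: computes one output element of the cross-correlation
    // straight from the flattened image/kernel accessors, so no temporary
    // std::vector is needed inside the kernel.
    template<class Tout, class ImgAcc, class KerAcc>
    Tout correlate_at(const ImgAcc& img, const KerAcc& ker,
                      int row, int col, int filt_row, int filt_col, int img_cols)
    {
        Tout sum = 0;
        for (int r = 0; r < filt_row; ++r)
            for (int c = 0; c < filt_col; ++c)
                sum += img[(row + r) * img_cols + (col + c)] * ker[r * filt_col + c];
        return sum;
    }

Inside the parallel_for this would replace the slice_matrix call, e.g. out_accessor[row][col] = correlate_at<Tout>(img_accessor, ker_accessor, row, col, filt_row, filt_col, img_col); (assuming the host code fills img_host and ker_host row by row).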
For any experts out there: can you please let me know the most efficient way to get the desired output?
Want to fetch the Index/SubIndex values only for the highest subindex records
Want to get only the index/Subindex for any given input.
Want to fetch the Indexes for a given Range
I have around 2 million Indexes, and for each Index there are around 100 SubIndexes. All of them are inserted sequentially (both Indexes and SubIndexes). I'm trying to find an efficient way to fetch the desired data. I am also open to using a different data structure as needed, as well as C++20 ranges.
Thank you in advance. I'll post the final solution if I find one.
#include <iostream>
#include <vector>
class testclass {
public:
int Index;
int SubIndex;
int Data;
// There are few more elements
};
class DataObject {
testclass Rec;
};
std::vector<testclass> v;
int main()
{
testclass Rec;
Rec.Index = 1;
Rec.SubIndex = 0;
Rec.Data = 1;
v.emplace_back(Rec);
Rec.Index = 2;
Rec.SubIndex = 0;
Rec.Data = 1;
v.emplace_back(Rec);
Rec.Index = 2;
Rec.SubIndex = 1;
Rec.Data = 2;
v.emplace_back(Rec);
Rec.Index = 3;
Rec.SubIndex = 1;
Rec.Data = 4;
v.emplace_back(Rec);
// Q1 - How to print only the records with highest SubIndex - Desired output - 1 0 1, 2 1 2 & 3 1 4
// Q2 - How to fetch the Record for a given Index (to fetch the record of the highest subIndex value for a given Index - example Fetch Index 2 --- Output -- 2 1 2 (Highest SubIndex value)
// Q3 - How to fetch the records for a given Index Range with Highest SubIndex - Example - Fetch the Indexes from 2 to 3 - Output - 2 1 2 & 3 1 4
for(auto &i : v) {
std::cout << i.Index << " " << i.SubIndex << " " << i.Data << "\n";
}
}
As I understand it, you need fast lookups while also keeping the insertion order of the objects.
For this, you can use additional associative containers.
Sample code below:
#include <iostream>
#include <vector>
#include <map>
class testclass {
public:
int Index;
int SubIndex;
};
class RecQuery
{
public:
/**
* @brief Pushes a record.
*
* @param[in] Rec The record.
*/
void PushRec(const testclass& Rec)
{
m_mapIndexToSubIndexToRecordIndex[Rec.Index][Rec.SubIndex] = m_vecRecords.size();
m_vecRecords.emplace_back(Rec);
}
/**
* @brief Finds a record.
*
* @param[in] Index The index.
*
* @return The reference to the record.
*/
testclass& FindRecord(const int Index)
{
const std::map<int, size_t>& subIndexToRecordMap = m_mapIndexToSubIndexToRecordIndex.at(Index);
return m_vecRecords[std::rbegin(subIndexToRecordMap)->second];
}
/**
* @brief Visits the Index/SubIndex values only for the highest subindex records.
*
* @param visitor The visitor.
*
* @tparam FVisitor The type of visitor.
*/
template<typename FVisitor>
void VisitHighestSubIndexes(FVisitor&& visitor)
{
for (const auto& [Index, SubIndexToRecIndex] : m_mapIndexToSubIndexToRecordIndex)
{
auto maxIndexIt = std::rbegin(SubIndexToRecIndex);
visitor(m_vecRecords[maxIndexIt->second]);
}
}
/**
* @brief Visits all records in the Index range [from, to].
*
* @param[in] from The beginning of the index range.
* @param[in] to The end of the index range.
* @param visitor The visitor.
*
* @tparam FVisitor The type of visitor.
*/
template<typename FVisitor>
void VisitRangeByIndex(const int from, const int to, FVisitor&& visitor)
{
const auto fromIt = m_mapIndexToSubIndexToRecordIndex.find(from);
if (fromIt == std::end(m_mapIndexToSubIndexToRecordIndex))
{
throw std::out_of_range{ "Index not found." };
}
auto toIt = m_mapIndexToSubIndexToRecordIndex.find(to);
if (toIt == std::end(m_mapIndexToSubIndexToRecordIndex))
{
throw std::out_of_range{ "Index not found." };
}
// If you do not need to include the record for the 'to' Index itself, delete this line.
++toIt;
for (auto it = fromIt; it != toIt; ++it)
{
visitor(m_vecRecords[std::rbegin(it->second)->second]);
}
}
/**
* @brief Returns an iterator pointing to the first record.
*
* @return Iterator to the element.
*/
auto begin() const noexcept
{
return std::begin(m_vecRecords);
}
/**
* @brief Returns an iterator pointing past the last record.
*
* @return Iterator to the element.
*/
auto end() const noexcept
{
return std::end(m_vecRecords);
}
private:
std::vector<testclass> m_vecRecords;
// for more performance can you use boost::container::flat_map.
// boost::container::flat_map<int, boost::container::flat_map<int, size_t>> m_mapIndexToSubIndexToRecordIndex;
std::map<int, std::map<int, size_t>> m_mapIndexToSubIndexToRecordIndex;
};
int main()
{
RecQuery recQuery;
testclass Rec;
Rec.Index = 1;
Rec.SubIndex = 0;
recQuery.PushRec(Rec);
Rec.Index = 2;
Rec.SubIndex = 0;
recQuery.PushRec(Rec);
Rec.Index = 2;
Rec.SubIndex = 1;
recQuery.PushRec(Rec);
// Q1 - How to print only the records with highest SubIndex(Highest) - Desired output - 1 0 & 2 1
std::cout << "Q1 ===========================================" << std::endl;
recQuery.VisitHighestSubIndexes([](testclass& rec)
{
std::cout << "Index: " << rec.Index << " Subindex: " << rec.SubIndex << std::endl;
});
std::cout << "Q2 ===========================================" << std::endl;
// Q2 - How to get only a specific Index/Subindex(highest) for a given input value - 2 1
testclass rec2 = recQuery.FindRecord(2);
std::cout << "Index: " << rec2.Index << " Subindex: " << rec2.SubIndex << std::endl;
// Q3
std::cout << "Q3 ===========================================" << std::endl;
recQuery.VisitRangeByIndex(1, 2, [](testclass& rec)
{
std::cout << "Index: " << rec.Index << " Subindex: " << rec.SubIndex << std::endl;
});
std::cout << "===========================================" << std::endl;
for (auto& i : recQuery)
{
std::cout << i.Index << " " << i.SubIndex << "\n";
}
return 0;
}
I used std::map, but it would be better to use boost::container::flat_map. The flat_map alternative is shown commented out in the code above.
We can use a transparent comparator to find the range of testclass instances that share an Index; the last element of that range has the highest SubIndex.
It would actually be easier if your data were ordered in the opposite direction, so that we could simply take the first element of the range, but we can use reverse iterators to get the same effect.
struct compare_index {
using is_transparent = void;
bool operator()(testclass lhs, testclass rhs) const
{
return lhs.Index > rhs.Index;
}
bool operator()(int lhs, testclass rhs) const
{
return lhs > rhs.Index;
}
bool operator()(testclass lhs, int rhs) const
{
return lhs.Index > rhs;
}
};
// can swap std::vector for another range here
std::vector<testclass> highest_subindices()
{
if (v.empty()) { return {}; }
std::vector<testclass> result;
auto range = std::equal_range(v.rbegin(), v.rend(), v.back(), compare_index{});
while (range.first != range.second)
{
result.push_back(*range.first);
if (range.second == v.rend()) break; // don't dereference rend() in the next call
range = std::equal_range(range.second, v.rend(), *range.second, compare_index{});
}
return result;
}
testclass * highest_subindex(int Index)
{
auto range = std::equal_range(v.rbegin(), v.rend(), Index, compare_index{});
return range.first != range.second ? &*range.first : nullptr;
}
// can swap either std::vector for another range here
std::vector<testclass *> highest_subindices(std::vector<int> Indices)
{
std::vector<testclass *> result(Indices.size());
std::transform(Indices.begin(), Indices.end(), result.begin(), highest_subindex);
return result;
}
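A quick usage sketch (my addition, assuming v has been filled exactly as in the question, i.e. sorted by Index and then SubIndex):

    // Record with the highest SubIndex for Index 2 (Q2).
    if (testclass* rec = highest_subindex(2))
        std::cout << rec->Index << " " << rec->SubIndex << " " << rec->Data << "\n"; // 2 1 2

    // One record per Index, each with its highest SubIndex (Q1).
    for (const testclass& rec : highest_subindices())
        std::cout << rec.Index << " " << rec.SubIndex << " " << rec.Data << "\n";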
I am trying to use Ceres Solver to optimize a point cloud transformation process.
By following the samples from the Ceres Solver tutorial, I got a simple working version of the optimization process. However, when I try to further modify the function in operator() (in the MyCostFunctor class), the results are totally wrong (the solver converges, but gives wrong results). I found that the problem is caused by the two lines of code where I was trying to convert the parameters from the template type T to an Eigen matrix type.
Here is the code:
template<typename T> inline
void DataTransfer(const T* input, Eigen::Matrix<T, Eigen::Dynamic, 1>& output) {
for (int i = 0; i < 12; ++i) {
output[i] = input[i];
}
}
template<typename T, typename PtT> inline
T* GetCorrespondingPoint(const T* rot, const PtT pt) {
//**!!!!!!!!!!! Error !!!!!!!!!!!**
//Eigen::Matrix<T, Eigen::Dynamic, 1> param_vecs = Eigen::Matrix<T, Eigen::Dynamic, 1>::Zero(12);
//DataTransfer<T>(rot, param_vecs);
// **!!!!!!!!!! Error !!!!!!!!!!!**
T result[3];
result[0] = rot[0] * T(pt(0)) + rot[1] * T(pt(1)) + rot[2] * T(pt(2)) + rot[9];
result[1] = rot[3] * T(pt(0)) + rot[4] * T(pt(1)) + rot[5] * T(pt(2)) + rot[10];
result[2] = rot[6] * T(pt(0)) + rot[7] * T(pt(1)) + rot[8] * T(pt(2)) + rot[11];
return result;
}
// A cost functor that implements the residual r = x - y.
// where x = R*x' + T or add more operations such as x = C*inverse((R*x')*A + T*B), A, B, C are related vectors or matrices
template<typename PtT>
class MyCostFunctor {
public:
MyCostFunctor(PtT& x, PtT& y, int pt_id)
:x_(x), y_(y), idx_(pt_id) {
}
template<typename T>
bool operator()(const T* const params, T* residual) const {
// Data transformation
T* rslt;
rslt = GetCorrespondingPoint<T, PtT>(params, x_);
residual[0] = T(rslt[0] - y_(0));
residual[1] = T(rslt[1] - y_(1));
residual[2] = T(rslt[2] - y_(2));
return true;
}
private:
PtT x_; // source point
PtT y_; // target point
int idx_; // source point idx
};
The two lines of code are commented out in the function GetCorrespondingPoint.
The code of the main function is as follows:
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <Eigen/Dense>
#include "ceres/ceres.h"
#include "glog/logging.h"
#include "ceres/dynamic_autodiff_cost_function.h"
using ceres::NumericDiffCostFunction;
using ceres::AutoDiffCostFunction;
using ceres::SizedCostFunction;
using ceres::CENTRAL;
using ceres::CostFunction;
using ceres::Problem;
using ceres::Solver;
using ceres::Solve;
int main(int argc, char** argv){
google::InitGoogleLogging(argv[0]);
// 1. Sample Data Set Up
std::vector<Eigen::Vector3d> model_pts;
model_pts.clear();
std::vector<Eigen::Vector3d> target_pts;
target_pts.clear();
model_pts.push_back(Eigen::Vector3d(10.0, 10.0, 10.0));
model_pts.push_back(Eigen::Vector3d(20.0, 10.0, 10.0));
model_pts.push_back(Eigen::Vector3d(10.0, 20.0, 10.0));
model_pts.push_back(Eigen::Vector3d(10.0, 10.0, 20.0));
target_pts.push_back(Eigen::Vector3d(40.0, 40.0, 40.0));
target_pts.push_back(Eigen::Vector3d(40.0, 30.0, 40.0));
target_pts.push_back(Eigen::Vector3d(30.0, 40.0, 40.0));
target_pts.push_back(Eigen::Vector3d(40.0, 40.0, 30.0));
/// Set up the index for pairing the model and target points
std::vector<int> pt_idx;
pt_idx.push_back(0);
pt_idx.push_back(1);
pt_idx.push_back(2);
pt_idx.push_back(3);
// print pts
std::cout << "Model pts\t\tTarget pts\n";
for (int i = 0; i < model_pts.size(); ++i) {
std::cout << model_pts[i](0) << " " << model_pts[i](1) << " " << model_pts[i](2) << "\t\t\t"
<< target_pts[i](0) << " " << target_pts[i](1) << " " << target_pts[i](2) << "\n";
}
// Parameter Set up
double params[12];
for (int i = 0; i < 12; ++i) {
params[i] = 1.0;
}
// Set up the problem
int num_pts = target_pts.size();
Problem problem;
for (int i = 0; i < num_pts; ++i) {
problem.AddResidualBlock(
new AutoDiffCostFunction<MyCostFunctor<Eigen::Vector3d>, 3, 12>(new MyCostFunctor<Eigen::Vector3d>(model_pts[i], target_pts[i], pt_idx[i])), NULL, &params[0]);
}
// Set the solver options
ceres::Solver::Options options;
options.minimizer_progress_to_stdout = true;
// Run the solver!
ceres::Solver::Summary summary;
Solve(options, &problem, &summary);
std::cout << summary.FullReport() << "\n\n";
// print results
std::cout << "test results: \n";
for (int i = 0; i < model_pts.size(); ++i) {
Eigen::Vector3d pt;
pt(0) = params[0]*model_pts[i](0) + params[1]*model_pts[i](1) + params[2]*model_pts[i](2) + params[9];
pt(1) = params[3]*model_pts[i](0) + params[4]*model_pts[i](1) + params[5]*model_pts[i](2) + params[10];
pt(2) = params[6]*model_pts[i](0) + params[7]*model_pts[i](1) + params[8]*model_pts[i](2) + params[11];
std::cout << pt(0) << " " << pt(1) << " " << pt(2) << "\n";
}
return 0;
}
If I leave those two lines commented out, I get the right results:
[screenshot: results before data transfer]
However, when I try to transfer the parameters into Eigen format with those two lines of code (they are NOT USED anywhere else in the function; they only copy the data), I get the wrong results:
[screenshot: results after data transfer]
Can anyone help me figure out what the problem is, and what I should do if I want to perform some operations on the parameters and still get the right corresponding points? Thanks!
Your code for the residual uses the matrix rot in row-major form, while Eigen defaults to column-major form:
https://eigen.tuxfamily.org/dox/group__TopicStorageOrders.html
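For illustration, a minimal sketch (my addition, not the poster's code) of how the parameter block could be mapped so that the Eigen version matches the hand-written indexing, i.e. rot[0..8] as a row-major 3x3 matrix and rot[9..11] as the translation:

    // Hypothetical Eigen-based equivalent of GetCorrespondingPoint; the explicit
    // Eigen::RowMajor layout is what keeps it consistent with rot[0]..rot[8].
    template<typename T, typename PtT>
    void GetCorrespondingPointEigen(const T* params, const PtT& pt, T* result)
    {
        Eigen::Map<const Eigen::Matrix<T, 3, 3, Eigen::RowMajor>> R(params);
        Eigen::Map<const Eigen::Matrix<T, 3, 1>> t(params + 9);
        Eigen::Matrix<T, 3, 1> p;
        p << T(pt(0)), T(pt(1)), T(pt(2));
        Eigen::Matrix<T, 3, 1> q = R * p + t;
        result[0] = q(0);
        result[1] = q(1);
        result[2] = q(2);
    }

Writing into a caller-provided result also sidesteps the fact that returning a pointer to a local array, as the original GetCorrespondingPoint does, is undefined behaviour.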
I've written code that performs a reduction on a large set of data, and while the code appears to be logically correct, it's proving to be slower than a simple std::accumulate or std::max_element call on the same data. I'm looking for any insight into how I might have botched the performance of this code.
These are the results I'm getting. Note that even the raw time to execute the kernel is slower than a simple CPU reduction of my data.
Select which Device to use:
0: Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
1: Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
2: Intel(R) ... (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
3: Intel(R) ... (Experimental Open... - OpenCL 2.0 (Build...)
Device: Cedar
Platform: AMD Accelerated Parallel Processing
Num of compute units: 8
Work Group Size: 128
i = 9419918
Internal Duration: 95609555ns //Time to run just the kernel, no setup
Num of Work Groups to sum up: 78125
Reduced Value was detected to be: -5.06886
(Index): 1008460
Value at index is: -5.06886
Kernel Duration: 153748214ns //Includes copying of data, excludes building of kernel
Counting manually, Reduced Value is: -5.06886
(Index of): 1008460
Value at index is: -5.06886
Manual Duration: 48173322ns //CPU runtime using std::max_element.
Press any key to continue . . .
The kernel code is constructed by concatenating all four of these files:
expand.cl
R"D(
#define EXPAND(type) \
typedef type Scalar;\
typedef type ## 2 Vector2;\
typedef type ## 4 Vector4;\
typedef type ## 8 Vector8;\
typedef type ## 16 Vector16;
)D"
float.cl
R"D(
EXPAND(float);
#define SCALAR_MAXIMUM INFINITY;
#define SCALAR_MINIMUM -INFINITY;
#define SCALAR_ZERO 0;
)D"
max.cl
R"D(
constant Scalar IDENTITY = SCALAR_MINIMUM;
#define REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index) \
if(a > b) {\
reduced_value = a;\
reduced_index = indexa;\
} else {\
reduced_value = b;\
reduced_index = indexb;\
}
)D"
Reduction Main.cl
R"D(
kernel void reduce(global Scalar * a, global Scalar * output, local Scalar * scratch, global long * index_output, local long * index_scratch, long size) {
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
size_t wid = get_group_id(0);
size_t gsize = get_global_size(0);
size_t lsize = get_local_size(0);
size_t wsize = get_num_groups(0);
if(gid < size) {
scratch[lid] = a[gid];
index_scratch[lid] = gid;
} else {
scratch[lid] = IDENTITY;
index_scratch[lid] = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(size_t offset = lsize / 2; offset > 0; offset >>= 1) {
if(lid < offset) {
size_t indexa = index_scratch[lid];
size_t indexb = index_scratch[lid + offset];
Scalar a = scratch[lid];
Scalar b = scratch[lid + offset];
Scalar reduced_value;
size_t reduced_index;
REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index);
scratch[lid] = reduced_value;
index_scratch[lid] = reduced_index;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[wid] = scratch[0];
index_output[wid] = index_scratch[0];
}
}
)D"
CL Reduction.h perform_reduction:
std::future<result> perform_reduction(std::vector<T> const& values) {
cl_long size = values.size();
uint64_t num_of_work_groups = size / work_group_size;
int64_t global_size = work_group_size * num_of_work_groups;
if (global_size < size) {
num_of_work_groups++;
global_size = work_group_size * num_of_work_groups;
}
cl::Buffer input_buffer(context, CL_MEM_READ_ONLY, global_size * sizeof(T), nullptr);
std::vector<cl::Event> write_events(1);
queue.enqueueWriteBuffer(input_buffer, false, 0, size * sizeof(T), values.data(), nullptr, &write_events.back());
if (global_size != size) {
write_events.emplace_back();
queue.enqueueFillBuffer(input_buffer, reduction::identity<T>(), size * sizeof(T), (global_size - size) * sizeof(T), nullptr, &write_events.back());
}
return std::async([size, num_of_work_groups, global_size, input_buffer, write_events, this] {
cl::Buffer output_buffer( context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(T) );
cl::Buffer output_index_buffer(context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(cl_long));
kernel.setArg(0, input_buffer);
kernel.setArg(1, output_buffer);
kernel.setArg(2, sizeof(T) * work_group_size, nullptr);
kernel.setArg(3, output_index_buffer);
kernel.setArg(4, sizeof(cl_long) * work_group_size, nullptr);
kernel.setArg(5, size);
std::vector<cl::Event> kernel_event;
kernel_event.emplace_back();
queue.enqueueNDRangeKernel(kernel, {}, { uint64_t(global_size) }, { work_group_size }, &write_events, &kernel_event.back());
std::vector<T> results;
std::vector<int64_t> indexes;
results.resize(num_of_work_groups);
indexes.resize(num_of_work_groups);
queue.enqueueReadBuffer(output_buffer, false, 0, num_of_work_groups * sizeof(T), results.data(), &kernel_event);
queue.enqueueReadBuffer(output_index_buffer, false, 0, num_of_work_groups * sizeof(cl_long), indexes.data(), &kernel_event);
queue.finish();
std::cout << "Internal Duration: " << std::setw(11) << (kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_START>()) << "ns" << std::endl;
std::cout << "Num of Work Groups to sum up: " << num_of_work_groups << std::endl;
result t{ reduction::identity<T>(), 0 };
for (size_t i = 0; i < results.size(); i++) {
T const& val = results[i];
size_t const& index = indexes[i];
t = reduction::reduce(t.reduced_value, val, t.reduced_index, index);
}
return t;
});
}
Reduction Main.cpp:
#define _HAS_AUTO_PTR_ETC 1
#include <vector>
#include <list>
#include <memory>
#include <utility>
#include<fstream>
#include<chrono>
#include<numeric>
#include<random>
#include<iomanip>
#include "CL Reduction.h"
std::string limit(std::string string, size_t limit) {
if (string.size() >= limit) return string.substr(0, limit - 3) + "...";
else return std::move(string);
}
cl::Device choose_device() {
std::vector<cl::Device> all_devices;
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
for (cl::Platform const& platform : platforms) {
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
all_devices.insert(all_devices.end(), devices.begin(), devices.end());
}
std::cout << "Select which Device to use: " << std::endl;
for (size_t i = 0; i < all_devices.size(); i++) {
cl::Device const& device = all_devices[i];
std::cout << i;
std::cout << ": ";
std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_NAME>(), 20);
std::cout << " (";
std::cout << std::setw(20) << limit(cl::Platform{ device.getInfo<CL_DEVICE_PLATFORM>() }.getInfo<CL_PLATFORM_NAME>(), 20);
std::cout << " - ";
std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_VERSION>(), 20);
std::cout << ")";
std::cout << std::endl;
}
size_t chosen;
std::cin >> chosen;
return all_devices[chosen];
}
int main() {
using type = float;
using reduction_type = cl_reduction_type::reduction_type<cl_reduction_type::type::maximum>;
using datatype = cl_datatype::datatype<type>;
using context_t = cl_reduction::reduction_context<datatype, reduction_type>;
std::ofstream err_log{ "err.txt" };
cl::Device device = choose_device();
try {
cl_reduction::reduction_context<datatype, reduction_type> context{ { device }, err_log };
std::vector<type> values;
auto last_ping = std::chrono::steady_clock::now();
std::default_random_engine engine{ std::random_device{}() };
std::uniform_real_distribution<type> distribution{ -100.f, 100.f };
//std::uniform_int_distribution<type> distribution(1, 500);
values.resize(10'000'000ull);
//values.resize(10'000);
type start = distribution(engine);
for (size_t i = 0; i < values.size(); i++) {
values[i] = start;
start = std::nextafter(start, std::numeric_limits<type>::infinity());
if (std::chrono::steady_clock::now() - last_ping > std::chrono::seconds(1)) {
std::cout << "i = " << i << '\r';
last_ping += std::chrono::seconds(1);
}
}
std::shuffle(values.begin(), values.end(), engine);
auto begin = std::chrono::steady_clock::now();
auto future = context.perform_reduction(values);
context_t::result t;
try {
t = future.get();
}
catch (cl::Error const& e) {
err_log << e.what() << std::endl;
err_log << e.err() << std::endl;
}
auto end = std::chrono::steady_clock::now();
std::cout << "Reduced Value was detected to be: " << t.reduced_value << std::endl;
std::cout << "(Index): " << t.reduced_index << std::endl;
std::cout << "Value at index is: " << values[t.reduced_index] << std::endl;
std::cout << "Kernel Duration: " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
begin = std::chrono::steady_clock::now();
//auto value = std::accumulate(values.begin(), values.end(), type(0));
auto it = std::max_element(values.begin(), values.end());
auto index = std::distance(values.begin(), it);
auto value = values[index];
end = std::chrono::steady_clock::now();
std::cout << "Counting manually, Reduced Value is: " << value << std::endl;
std::cout << "(Index of): " << index << std::endl;
std::cout << "Value at index is: " << values[index] << std::endl;
std::cout << "Manual Duration: " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
}
catch (cl::Error const& e) {
std::cerr << e.what() << ':' << e.err() << std::endl;
if (e.err() == CL_INVALID_BUFFER_SIZE)
std::cerr << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << std::endl;
}
system("pause");
return 0;
}
I've included the entire codebase here, which includes the three headers used and the main function ("CL Datatype.h", "Cl Reduction Type.h", "CL Reduction.h", "Reduction Main.cpp"). For this post I've only included the code that I think is relevant, but if you think the problem is in something else, you can point to that in the GitHub repo.
Read your input with Vector4 a = vload4(...) and use .xyzw. You might also try vectorizing by 8 with vload8.
Instead of a > b, use isgreater(a, b) along with any, all and select.
Do more than one reduction per loop to keep it in registers and reduce the bandwidth to the local memory. For a workgroup size of 128 and vector size of 4, the first thread would reduce 0-3 with 512-515, then with 1024-1027, etc. before writing to local memory with vstore4. Try different inner loop sizes.
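To make the vload4/isgreater/select points concrete, here is a rough sketch (my illustration, not a drop-in replacement for your kernel: it drops the index tracking and assumes the input buffer has already been padded with the identity to a multiple of 4, as your host code does with enqueueFillBuffer):

    kernel void reduce_vec4(global const float* a, global float* output,
                            local float* scratch)
    {
        size_t gid = get_global_id(0);
        size_t lid = get_local_id(0);
        // Each work-item loads four values at once and reduces them in registers.
        float4 v = vload4(gid, a);
        // select(y, x, isgreater(x, y)) picks x where x > y, i.e. a component-wise max.
        float2 m2 = select(v.zw, v.xy, isgreater(v.xy, v.zw));
        float  m  = select(m2.y, m2.x, isgreater(m2.x, m2.y));
        scratch[lid] = m;
        barrier(CLK_LOCAL_MEM_FENCE);
        for (size_t offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {
            if (lid < offset)
                scratch[lid] = fmax(scratch[lid], scratch[lid + offset]);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        if (lid == 0)
            output[get_group_id(0)] = scratch[0];
    }

It would be launched with a global size of padded_size / 4, and the one-value-per-workgroup output is then reduced on the host, as described above.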
As much as possible, you don't want threads sitting around doing nothing. The kernel should just be reducing from global memory into registers once, storing to local memory and then synchronizing the threads before one thread reduces from local to a single value for the kernel and store that in global memory. Finally, you can do the last, relatively small, level of reduction on the CPU. This level will only contain one value from each workgroup: total_size / (work_group_size = 128) / (vector_size = 4) / (inner_loop_size = 16)