I'm trying to imitate CUDA/OpenCL workflow using vectorized functions like this:
#include <omp.h>
#include <iostream>
#include <string>
#include <functional>
#include<cmath>
template<typename Type, int Simd>
struct KernelData
{
alignas(32)
Type data[Simd];
inline void readFrom(const Type * const __restrict__ ptr) noexcept
{
for(int i=0;i<Simd;i++)
{
data[i] = ptr[i];
}
}
inline void writeTo(Type * const __restrict__ ptr) const noexcept
{
for(int i=0;i<Simd;i++)
{
ptr[i] = data[i];
}
}
inline const KernelData<Type,Simd> sqrt() const noexcept
{
KernelData<Type,Simd> result;
for(int i=0;i<Simd;i++)
{
result.data[i] = std::sqrt(data[i]);
}
return result;
}
};
template<int mask>
struct KernelDataFactory
{
KernelDataFactory()
{
}
template<typename Type>
inline
KernelData<Type,mask> generate() const
{
return KernelData<Type,mask>();
}
};
template<int SimdWidth, typename... Args>
class Kernel
{
public:
Kernel(std::function<void(int,int, Args...)> kernelPrm)
{
kernel = kernelPrm;
}
void run(int n, Args... args)
{
const int nLoop = (n/SimdWidth);
for(int i=0;i<nLoop;i++)
{
kernel(i*SimdWidth,SimdWidth, args...);
}
if((n/SimdWidth)*SimdWidth != n)
{
const int m = n%SimdWidth;
for(int i=0;i<m;i++)
{
kernel(nLoop*SimdWidth+i,1, args...);
}
}
}
private:
std::function<void(int,int, Args...)> kernel;
};
// cpu cycles from stackoverflow
#include <stdint.h> // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
inline
uint64_t readTSC() {
// _mm_lfence(); // optionally wait for earlier insns to retire before reading the clock
uint64_t tsc = __rdtsc();
// _mm_lfence(); // optionally block later instructions until rdtsc retires
return tsc;
}
int main(int argC, char** argV)
{
constexpr int simd = 16;
constexpr int n = 1003;
Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
const int id = simdGroupId;
if(simdWidth == simd)
{
const KernelDataFactory<simd> factory;
auto a = factory.generate<float>();
a.readFrom(input+id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);
}
else
{
const KernelDataFactory<1> factory;
auto a = factory.generate<float>();
a.readFrom(input+id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);
}
});
alignas(32)
float i[n],o[n];
for(int j=0;j<n;j++)
i[j]=j;
auto t1 = readTSC();
for(int k=0;k<10000;k++)
kernel.run(n,i,o);
auto t2 = readTSC();
for(int i=n-10;i<n;i++)
{
std::cout<<"i="<<i<<" value="<<o[i]<<std::endl;
}
std::cout<<0.0001f*(t2-t1)/(float)(15*n)<<" cycles per sqrt"<<std::endl;
return 0;
}
but the part that is given by user has to be duplicated like this:
Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
const int id = simdGroupId;
if(simdWidth == simd)
{
const KernelDataFactory<simd> factory;
auto a = factory.generate<float>();
a.readFrom(input+id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);
}
else
{
const KernelDataFactory<1> factory;
auto a = factory.generate<float>();
a.readFrom(input+id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);
}
});
the only difference is the compile-time known two templates to produce:
KernelDataFactory<1> and KernelDataFactory<simd>
With define macros, it is easy to duplicate just the function body of the lambda. I'm trying to do this without using any define macro. Is there a simple way to do it such that user only gives this:
auto a = factory.generate<float>();
a.readFrom(input+id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);
and it is automatically duplicated by the implementation?
What the current implementation does is:
takes n
breaks it into two parts
run vectorized code until n - (n%simd) point is reached
run scalar code from n - (n%simd) to n
KernelDataFactory template argument (has to be compile-time known) (1 and simd) is used to let compiler generate vectorized code. (On godbolt.org (avx512), it runs at "0.9 cycles per sqrt" speed and on my system (avx1) it is 3.8 cycles per sqrt.)
You could use a generic lambda (C++14) to achieve something like this. Note that this requires you to change the type of Kernel::kernel and change the creation of the kernel a bit to allow for automatic type deduction:
Kernel
template<int SimdWidth, typename F, typename... Args>
class Kernel
{
public:
Kernel(F&& kernelPrm)
: kernel(std::move(kernelPrm))
{
}
void run(int n, Args... args)
{
const int nLoop = (n / SimdWidth);
for (int i = 0; i < nLoop; i++)
{
CallKernel(i * SimdWidth, SimdWidth, args...);
}
if ((n / SimdWidth) * SimdWidth != n)
{
const int m = n % SimdWidth;
for (int i = 0; i < m; i++)
{
CallKernel(nLoop * SimdWidth + i, 1, args...);
}
}
}
private:
// helper function creating the factory and passing it to kernel
void CallKernel(int simdGroupId, int simdWidth, Args... args)
{
const int id = simdGroupId;
if (simdWidth == SimdWidth)
{
const KernelDataFactory<SimdWidth> factory;
kernel(factory, id, args...);
}
else
{
const KernelDataFactory<1> factory;
kernel(factory, id, args...);
}
}
F kernel;
};
Helpers
These helpers are necessary to deduce the second template argument of Kernel.
// helper for specifying the parameter pack
template<class...Args>
struct KernelArgs
{};
template<int SimdWidth, typename F, class...Args>
auto CreateKernel(F&& kernelPrm, KernelArgs<Args...> const&)
{
return Kernel<SimdWidth, F, Args...>(std::forward<F>(kernelPrm));
}
main
...
auto kernel = CreateKernel<simd>([](auto& factory, int const id, float* input, float* output)
{
auto a = factory.template generate<float>();
a.readFrom(input + id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output + id);
}, KernelArgs<float*, float*>{});
...
Related
I wrote an expression template to sum up to three vectors together. However, as you can see in my code, this doesn't scale very well because for every additional sum operand I have to add another nested template expression. Is there a way to refactor this code to handle a (theoretically) infinite amount of additions?
template<class A>
struct Expr {
operator const A&() const {
return *static_cast<const A*>(this);
}
};
template<class A, class B>
class Add : public Expr<Add<A,B>> {
private:
const A &a_;
const B &b_;
public:
Add(const A &a, const B &b) : a_(a), b_(b) { }
double operator[] (int i) const {
return a_[i] + b_[i];
}
};
class Vector : public Expr<Vector> {
private:
double *data_;
int n_;
public:
Vector(int n, double w = 0.0) : n_(n) {
data_ = new double[n];
for(int i = 0; i < n; ++i) {
data_[i] = w;
}
}
double operator[] (int i) const {
return data_[i];
}
friend Expr<Add<Vector, Vector>> operator+(Vector &a, Vector &b) {
return Add<Vector, Vector>(a, b);
}
friend Expr<Add<Add<Vector, Vector>, Vector>> operator+(const Add<Vector, Vector> &add, const Vector &b) {
return Add<Add<Vector, Vector>, Vector>(add, b);
}
template<class A>
void operator= (const Expr<A> &a) {
const A &a_(a);
for(int i = 0; i < n_; ++i) {
data_[i] = a_[i];
}
}
};
int main() {
constexpr int size = 5;
Vector a(size, 1.0), b(size, 2.0), c(size);
c = a + b + a;
return 0;
}
This was working for me:
class Vector : public Expr<Vector> {
private:
double *data_;
int n_;
public:
Vector(int n, double w = 0.0) : n_(n) {
data_ = new double[n];
for(int i = 0; i < n; ++i) {
data_[i] = w;
}
}
double operator[] (int i) const {
return data_[i];
}
template<class A, class B>
friend Add<A, B> operator+(const Expr<A> &a, const Expr<B> &b) {
return Add<A, B>(a, b);
}
template<class A>
void operator= (const Expr<A> &a) {
const A &a_(a);
for(int i = 0; i < n_; ++i) {
data_[i] = a_[i];
}
}
};
I'm no template wizard (and I'm not up-to-date with the latest possibilities), but you can at least make a function that added a variadic amount of vectors, using something like described in the code below.
You could then buildup you expressiontree like you did before and call this function in you evaluation (operator=) function.
edit: updated the code, based on this solution (credits there)
#include <vector>
#include <algorithm>
template<typename T>
using Vec = std::vector<T>;
template<typename T, typename...Args>
auto AddVector_impl(Vec<Args> const & ... vecs){
auto its = std::tuple(cbegin(vecs)...);
auto add_inc = [](auto&... iters){
return ((*iters++) + ... );
};
auto end_check = [&](auto&...iters){
return ((iters != cend(vecs)) && ...);
};
Vec<T> res;
for(auto it = back_inserter(res); apply(end_check,its);){
*it++ = apply(add_inc,its);
}
return res;
}
template<typename T, typename... Args>
Vec<T> AddVector(Vec<T> const& vt, Vec<Args> const&... vargs){
return AddVector_impl<T>(vt,vargs...);
}
#include <iostream>
int main() {
constexpr auto size = 5;
Vec<double> a(size, 1.0), b(size, 2.0);
auto c = AddVector(a, b, a);
for(auto const& el : c){
std::cout << el << " ";
}
}
outputs:
4 4 4 4 4
I have a class with an int template parameter. Under some circumstances I want it to output an error message. This message should be a concatenated string from some fixed text and the template parameters. For performance reasons I'd like to avoid building up this string at runtime each time the error occurs and theoretically both, the string literal and the template parameter are known at compiletime. So I'm looking for a possibility to declare it as a constexpr.
Code example:
template<int size>
class MyClass
{
void onError()
{
// obviously won't work but expressing the concatenation like
// it would be done with a std::string for clarification
constexpr char errMsg[] = "Error in MyClass of size " + std::to_string (size) + ": Detailed error description\n";
outputErrorMessage (errMsg);
}
}
Using static const would allow to compute it only once (but at runtime):
template<int size>
class MyClass
{
void onError()
{
static const std::string = "Error in MyClass of size "
+ std::to_string(size)
+ ": Detailed error description\n";
outputErrorMessage(errMsg);
}
};
If you really want to have that string at compile time, you might use std::array, something like:
template <std::size_t N>
constexpr std::size_t count_digit() {
if (N == 0) {
return 1;
}
std::size_t res = 0;
for (int i = N; i; i /= 10) {
++res;
}
return res;
}
template <std::size_t N>
constexpr auto to_char_array()
{
constexpr auto digit_count = count_digit<N>();
std::array<char, digit_count> res{};
auto n = N;
for (std::size_t i = 0; i != digit_count; ++i) {
res[digit_count - 1 - i] = static_cast<char>('0' + n % 10);
n /= 10;
}
return res;
}
template <std::size_t N>
constexpr std::array<char, N - 1> to_array(const char (&a)[N])
{
std::array<char, N - 1> res{};
for (std::size_t i = 0; i != N - 1; ++i) {
res[i] = a[i];
}
return res;
}
template <std::size_t ...Ns>
constexpr std::array<char, (Ns + ...)> concat(const std::array<char, Ns>&... as)
{
std::array<char, (Ns + ...)> res{};
std::size_t i = 0;
auto l = [&](const auto& a) { for (auto c : a) {res[i++] = c;} };
(l(as), ...);
return res;
}
And finally:
template<int size>
class MyClass
{
public:
void onError()
{
constexpr auto errMsg = concat(to_array("Error in MyClass of size "),
to_char_array<size>(),
to_array(": Detailed error description\n"),
std::array<char, 1>{{0}});
std::cout << errMsg.data();
}
};
Demo
Here's my solution. Tested on godbolt:
#include <string_view>
#include <array>
#include <algorithm>
void outputErrorMessage(std::string_view s);
template<int N> struct cint
{
constexpr int value() const { return N; }
};
struct concat_op {};
template<std::size_t N>
struct fixed_string
{
constexpr static std::size_t length() { return N; }
constexpr static std::size_t capacity() { return N + 1; }
template<std::size_t L, std::size_t R>
constexpr fixed_string(concat_op, fixed_string<L> l, fixed_string<R> r)
: fixed_string()
{
static_assert(L + R == N);
overwrite(0, l.data(), L);
overwrite(L, r.data(), R);
}
constexpr fixed_string()
: buffer_ { 0 }
{
}
constexpr fixed_string(const char (&source)[N + 1])
: fixed_string()
{
do_copy(source, buffer_.data());
}
static constexpr void do_copy(const char (&source)[N + 1], char* dest)
{
for(std::size_t i = 0 ; i < capacity() ; ++i)
dest[i] = source[i];
}
constexpr const char* data() const
{
return buffer_.data();
}
constexpr const char* data()
{
return buffer_.data();
}
constexpr void overwrite(std::size_t where, const char* source, std::size_t len)
{
auto dest = buffer_.data() + where;
while(len--)
*dest++ = *source++;
}
operator std::string_view() const
{
return { buffer_.data(), N };
}
std::array<char, capacity()> buffer_;
};
template<std::size_t N> fixed_string(const char (&)[N]) -> fixed_string<N - 1>;
template<std::size_t L, std::size_t R>
constexpr auto operator+(fixed_string<L> l, fixed_string<R> r) -> fixed_string<L + R>
{
auto result = fixed_string<L + R>(concat_op(), l , r);
return result;
};
template<int N>
constexpr auto to_string()
{
auto log10 = []
{
if constexpr (N < 10)
return 1;
else if constexpr(N < 100)
return 2;
else if constexpr(N < 1000)
return 3;
else
return 4;
// etc
};
constexpr auto len = log10();
auto result = fixed_string<len>();
auto pow10 = [](int n, int x)
{
if (x == 0)
return 1;
else while(x--)
n *= 10;
return n;
};
auto to_char = [](int n)
{
return '0' + char(n);
};
int n = N;
for (int i = 0 ; i < len ; ++i)
{
auto pow = pow10(10, i);
auto digit = to_char(n % 10);
if (n == 0 && i != 0) digit = ' ';
result.buffer_[len - i - 1] = digit;
n /= 10;
}
return result;
}
template<int size>
struct MyClass
{
void onError()
{
// obviously won't work but expressing the concatenation like
// it would be done with a std::string for clarification
static const auto errMsg = fixed_string("Error in MyClass of size ") + to_string<size>() + fixed_string(": Detailed error description\n");
outputErrorMessage (errMsg);
}
};
int main()
{
auto x = MyClass<10>();
x.onError();
}
Results in the following code:
main:
sub rsp, 8
mov edi, 56
mov esi, OFFSET FLAT:MyClass<10>::onError()::errMsg
call outputErrorMessage(std::basic_string_view<char, std::char_traits<char> >)
xor eax, eax
add rsp, 8
ret
https://godbolt.org/z/LTgn4F
Update:
The call to pow10 is not necessary. It's dead code which can be removed.
Unfortunately your options are limited. C++ doesn't permit string literals to be used for template arguments and, even if it did, literal concatenation happens in the preprocessor, before templates come into it. You would need some ghastly character-by-character array definition and some manual int-to-char conversion. Horrible enough that I can't bring myself to make an attempt, and horrible enough that to be honest I'd recommend not bothering. I'd generate it at runtime, albeit only once (you can make errMsg a function-static std::string at least).
I'm looking for a small function that is able to transform a std::array by adding increasing values. The function must be a compile time function.
I was able to write a small constexpr function which does so for an array of length 3, but I was unable to generalize it to std::arrays of arbitrary lengths. I also failed to generalize it to contain something different than chars.
Does anyone knows how to do it?
#include <array>
#include <iostream>
#include <valarray>
constexpr std::array<char,3> obfuscate(const std::array<char,3>& x) {
return std::array<char, 3>{x.at(0)+1, x.at(1) + 2, x.at(2) + 3 };
}
/* Won't compile
template<typename T,typename S, template<typename, typename> L=std::array<T, U>>
constexpr L<T,U> obfuscate(const L<T, U>& x) {
return {x.at(0) + 1, x.at(0) + 2, x.at(0) + 3 };
}
*/
std::ostream& operator<<(std::ostream& str, const std::array<char, 3>& x) {
for (auto i = 0; i < 3; i++) {
str << x.at(i);
}
return str;
}
int main(int argc, char** args) {
std::array<char, 3> x{ 'a','b','c' };
std::cout << x << std::endl;
std::cout << obfuscate(x) << std::endl;
// std::cout << obfuscate<3>(x) << std::endl;
}
You can use std::index_sequence:
template<class T, std::size_t N, std::size_t... Is>
constexpr std::array<T, N> helper (const std::array<T, N> &x, std::index_sequence<Is...>) {
return std::array<T, N>{static_cast<T>(x.at(Is)+Is+1)...};
}
template<class T, std::size_t N>
constexpr std::array<T, N> obfuscate(const std::array<T, N> &x) {
return helper(x, std::make_index_sequence<N>{});
}
There are a few methods that use tuple packs, these are great except that MSVC has a performance problem compiling large strings.
I've found this compromise works well in MSVC.
template<typename I>
struct encrypted_string;
template<size_t... I>
struct encrypted_string<std::index_sequence<I...>>
{
std::array<char, sizeof...(I)+1> buf;
constexpr static char encrypt(char c) { return c ^ 0x41; }
constexpr static char decrypt(char c) { return encrypt(c); }
constexpr explicit __forceinline encrypted_string(const char* str)
: buf{ encrypt(str[I])... } { }
inline const char* decrypt()
{
for (size_t i = 0; i < sizeof...(I); ++i)
{
buf[i] = decrypt(buf[i]);
}
buf[sizeof...(I)] = 0;
return buf.data();
}
};
#define enc(str) encrypted_string<std::make_index_sequence<sizeof(str)>>(str)
And somewhere later
auto stringo = enc(R"(
kernel void prg_PassThru_src(const global unsigned short * restrict A, int srcstepA, int srcoffsetA,
global float * restrict Beta, int srcstepBeta, int srcoffsetBeta,
int rows, int cols) {
int x = get_global_id(0);
int y0 = get_global_id(1);
if (x < cols) {
int srcA_index = mad24(y0, srcstepA / 2, x + srcoffsetA / 2);
int srcBeta_index = mad24(y0, srcstepBeta / 4, x + srcoffsetBeta / 4);
Beta[srcBeta_index] = A[srcA_index];
}
}
//somewhere later
cv::ocl::ProgramSource programSource(stringo.decrypt());
You can see this guy's talk for more sophisticated methods:
https://www.blackhat.com/docs/eu-14/materials/eu-14-Andrivet-C-plus-plus11-Metaprogramming-Applied-To-software-Obfuscation.pdf
I have an integer N which I know at compile time. I also have an std::array holding integers describing the shape of an N-dimensional array. I want to generate nested loops, as described bellow, at compile time, using metaprogramming techniques.
constexpr int N {4};
constexpr std::array<int, N> shape {{1,3,5,2}};
auto f = [/* accept object which uses coords */] (auto... coords) {
// do sth with coords
};
// This is what I want to generate.
for(int i = 0; i < shape[0]; i++) {
for(int j = 0; j < shape[1]; j++) {
for(int k = 0; k < shape[2]; k++) {
for(int l = 0; l < shape[3]; l++) {
f(i,j,k,l) // object is modified via the lambda function.
}
}
}
}
Note the parameter N is known at compile time but might change unpredictably between compilations, hence I can't hard code the loops as above. Ideally the loop generation mechanism will provide an interface which accepts the lambda function, generates the loops and calls the function producing the equivalent code as above. I am aware that one can write an equivalent loop at runtime with a single while loop and an array of indices, and there are answers to this question already. I am, however, not interested in this solution. I am also not interested in solutions involving preprocessor magic.
Something like this (NOTE: I take the "shape" as a variadic template argument set..)
#include <iostream>
template <int I, int ...N>
struct Looper{
template <typename F, typename ...X>
constexpr void operator()(F& f, X... x) {
for (int i = 0; i < I; ++i) {
Looper<N...>()(f, x..., i);
}
}
};
template <int I>
struct Looper<I>{
template <typename F, typename ...X>
constexpr void operator()(F& f, X... x) {
for (int i = 0; i < I; ++i) {
f(x..., i);
}
}
};
int main()
{
int v = 0;
auto f = [&](int i, int j, int k, int l) {
v += i + j + k + l;
};
Looper<1, 3, 5, 2>()(f);
auto g = [&](int i) {
v += i;
};
Looper<5>()(g);
std::cout << v << std::endl;
}
Assuming you don't want total loop unrolling, just generation of i, j, k etc. argument tuples for f:
#include <stdio.h>
#include <utility> // std::integer_sequence
template< int dim >
constexpr auto item_size_at()
-> int
{ return ::shape[dim + 1]*item_size_at<dim + 1>(); }
template<> constexpr auto item_size_at<::N-1>() -> int { return 1; }
template< size_t... dim >
void call_f( int i, std::index_sequence<dim...> )
{
f( (i/item_size_at<dim>() % ::shape[dim])... );
}
auto main()
-> int
{
int const n_items = ::shape[0]*item_size_at<0>();
for( int i = 0; i < n_items; ++i )
{
call_f( i, std::make_index_sequence<::N>() );
}
}
I suppose this is exactly what you asked for:
#include <array>
#include <iostream>
constexpr int N{4};
constexpr std::array<int, N> shape {{1,3,5,2}};
// Diagnositcs
template<typename V, typename ...Vals>
struct TPrintf {
constexpr static void call(V v, Vals ...vals) {
std::cout << v << " ";
TPrintf<Vals...>::call(vals...);
}
};
template<typename V>
struct TPrintf<V> {
constexpr static void call(V v) {
std::cout << v << std::endl;
}
};
template<typename ...Vals>
constexpr void t_printf(Vals ...vals) {
TPrintf<Vals...>::call(vals...);
}
// Unroll
template<int CtIdx, typename F>
struct NestedLoops {
template<typename ...RtIdx>
constexpr static void call(const F& f, RtIdx ...idx) {
for(int i = 0; i < shape[CtIdx]; ++i) {
NestedLoops<CtIdx + 1, F>::call(f, idx..., i);
}
}
};
template<typename F>
struct NestedLoops<N-1, F> {
template<typename ...RtIdx>
constexpr static void call(const F& f, RtIdx ...idx) {
for(int i = 0; i < shape[N-1]; ++i) {
f(idx..., i);
}
}
};
template<typename F>
void nested_loops(const F& f) {
NestedLoops<0, F>::call(f);
}
int main()
{
auto lf = [](int i, int j, int k, int l) {
t_printf(i,j,k,l);
};
nested_loops(lf);
return 0;
}
Another variant of the same thing:
template <size_t shape_index, size_t shape_size>
struct Looper
{
template <typename Functor>
void operator()(const std::array<int, shape_size>& shape, Functor functor)
{
for (int index = 0; index < shape[shape_index]; ++index)
{
Looper<shape_index + 1, shape_size>()
(
shape,
[index, &functor](auto... tail){ functor(index, tail...); }
);
}
}
};
template <size_t shape_size>
struct Looper<shape_size, shape_size>
{
template <typename Functor>
void operator()(const std::array<int, shape_size>&, Functor functor)
{
functor();
}
};
template <size_t shape_size, typename Functor>
void loop(const std::array<int, shape_size>& shape, Functor functor)
{
Looper<0, shape_size>()(shape, functor);
}
Example of use:
constexpr size_t N {4};
constexpr std::array<int, N> shape {{1,3,5,2}};
void f(int i, int j, int k, int l)
{
std::cout
<< std::setw(5) << i
<< std::setw(5) << j
<< std::setw(5) << k
<< std::setw(5) << l
<< std::endl;
}
// ...
loop(shape, f);
Live demo
I was trying my hand with allocators this time & felt that there were many chances of leaking the resources. So I thought what if I used std::unique_ptr to handle them. I tried my hand with a std::vector's allocator. My code goes like this :-
// allocator
#include <iostream>
#include <vector>
#include <memory>
using namespace std;
class X
{
int x,ID;
static int i;
public:
X()
{
cout<<"constructing ";
ID=++i;
cout<<"ID="<<ID<<'\n';
}
X(int a)
{
x=a;
cout<<"constructing ";
ID=++i;
cout<<"ID="<<ID<<'\n';
}
void get()
{
cout<<"enter x: ";
cin>>x;
}
void disp()
{
cout<<"x="<<x<<'\t';
}
~X()
{
cout<<"destroying ID="<<ID<<'\n';
}
};
int X:: i=0;
int main()
{
ios::sync_with_stdio(false);
vector<X> v;
auto alloc = v.get_allocator();
unsigned int i=0;
X *p(alloc.allocate(5));
for (i=0;i<5;++i)
alloc.construct (&p[i], i+1);
unique_ptr<X[]> ptr(p);
cout<<"\nthe elements are:-\n";
for (i=0;i<5;++i)
{
ptr[i].disp();
cout << '\t' << (long long)alloc.address(ptr[i]) << '\n';
}
cout<<"\n";
/*for (i=0;i<5;++i)
alloc.destroy(&p[i]);
deallocate(p,16)*/
return 0;
}
Unfortunately this code crashes showing UB. So what should I do ? How should I manipulate my code so as to make it suited for std::unique_ptr ?
template<typename T>
std::unique_ptr<T[], std::function<void(T *)>> make_T(X *ptr, std::allocator<T> alloc, std::size_t size) {
auto deleter = [](T *p, std::allocator<T> alloc, std::size_t size) {
for (int i = 0; i < size; ++i) {
alloc.destroy(&p[i]);
}
alloc.deallocate(p, sizeof(T) * size);
};
return {ptr, std::bind(deleter, std::placeholders::_1, alloc, size)};
}
int main(int argc, const char * argv[]) {
std::allocator<X> alloc = std::allocator<X>();
X *p = alloc.allocate(5);
for (int i = 0; i < 5; ++i) {
alloc.construct(&p[i], i + 1);
}
auto ptr = make_T(p, alloc, 5);
return 0;
}
Can also write one to construct the objects for you:
template<typename T, typename... Args>
std::unique_ptr<T[], std::function<void(T *)>> make_T_Construct(std::allocator<T> alloc, std::size_t size, Args... args) {
X *ptr = alloc.allocate(size);
for (std::size_t i = 0; i < size; ++i) {
alloc.construct(&ptr[i], std::forward<Args>(args)...);
}
auto deleter = [](T *p, std::allocator<T> alloc, std::size_t size) {
for (std::size_t i = 0; i < size; ++i) {
alloc.destroy(&p[i]);
}
alloc.deallocate(p, sizeof(T) * size);
};
return {ptr, std::bind(deleter, std::placeholders::_1, alloc, size)};
}
int main(int argc, const char * argv[]) {
std::allocator<X> alloc = std::allocator<X>();
auto ptr = make_T_Construct(alloc, 5, 100);
return 0;
}
Edit: To do what you want (tracking allocations), you have to track the memory allocations yourself using a custom allocator..
template<typename T>
struct Allocator
{
typedef T value_type;
Allocator() noexcept {};
template<typename U>
Allocator(const Allocator<U>& other) throw() {};
T* allocate(std::size_t n, const void* hint = 0)
{
T* memory = static_cast<T*>(::operator new(n * (sizeof(T) + sizeof(bool))));
for (std::size_t i = 0; i < n * (sizeof(T) + sizeof(bool)); ++i)
{
*reinterpret_cast<bool*>(reinterpret_cast<char*>(memory) + sizeof(bool)) = false;
}
return memory;
}
void deallocate(T* ptr, std::size_t n)
{
::operator delete(ptr);
}
void construct(T* p, const T& arg)
{
destroy(p);
new(p) T(arg);
*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool)) = true;
}
template<class U, class... Args>
void construct(U* p, Args&&... args)
{
destroy(p);
::new(p) U(std::forward<Args>(args)...);
*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool)) = true;
}
void destroy(T* p)
{
if (*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool))) {
p->~T();
*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool)) = false;
}
}
template<class U>
void destroy(U* p)
{
if (*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool))) {
p->~U();
*reinterpret_cast<bool*>(reinterpret_cast<char*>(p) + sizeof(bool)) = false;
}
}
};
template <typename T, typename U>
inline bool operator == (const Allocator<T>&, const Allocator<U>&)
{
return true;
}
template <typename T, typename U>
inline bool operator != (const Allocator<T>& a, const Allocator<U>& b)
{
return !(a == b);
}