Problem with cudaMalloc and cudaMemcpy in separate template function [duplicate] - c++

This question already has an answer here:
Cuda allocation and return array from gpu to cpu
(1 answer)
Closed 2 years ago.
I am working on a basic CUDA program that only calculates squares and cubes. I do not want to write all the code in main, so I have split it into functions, some of which are templates. There is no special reason for using templates; I just wanted to try them. The problem is this: if I call a CUDA API function such as cudaMalloc directly, it works, but if I call it through my own wrapper function, it fails. Let me show:
kernel.cuh
#ifndef KERNEL_CUH_
#define KERNEL_CUH_
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <exception>
#include <iostream>
// Immutable pair of sizes (block, thread) fixed at construction and exposed
// through read-only accessors.
struct GPUVars
{
public:
    GPUVars(size_t block, size_t thread)
        : block_sz{ block }
        , thread_sz{ thread }
    {
    }

    // Value passed as `block` to the constructor.
    size_t GetBlockSize() const
    {
        return block_sz;
    }

    // Value passed as `thread` to the constructor.
    size_t GetThreadSize() const
    {
        return thread_sz;
    }

private:
    size_t block_sz;
    size_t thread_sz;
};
inline bool check_device()
{
auto cuda_device_count{ 0 };
cudaGetDeviceCount(&cuda_device_count);
return cuda_device_count > 0;
}
// Calls cudaMalloc for SIZE_BYTE bytes and throws std::bad_alloc when the call
// does not return cudaSuccess.
// NOTE(review): `arr` is a pointer passed BY VALUE, so cudaMalloc stores the
// device address in this function's local copy only. The caller's pointer is
// left unchanged and the allocation is unreachable after return. This is the
// defect the question is about; the corrected variant further down takes T**.
template <typename T>
void AllocateMem(T* arr, size_t SIZE_BYTE)
{
if (cudaMalloc(&arr, SIZE_BYTE) != cudaSuccess)
{
throw std::bad_alloc();
}
}
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
{
throw std::bad_alloc();
}
}
#endif
main.cpp
#include <iostream>
#include <random>
#include <iomanip>
#include <cassert>
#include "timer.h"
#include "cpu_calc.h"
#include "kernel.cuh"
// Fills arr[0 .. SIZE_ARR) with values drawn from a
// std::uniform_real_distribution<> constructed from (lower, upper).
// The generator is a std::mt19937 seeded from std::random_device.
template <typename T>
void RandNumberGen(T lower, T upper, T* arr, size_t SIZE_ARR)
{
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_real_distribution<> uniform(lower, upper);
    T* const last = arr + SIZE_ARR;
    for (T* cursor = arr; cursor != last; ++cursor)
    {
        *cursor = uniform(engine);
    }
}
// Driver from the question: fills a host array with random doubles, runs the
// CPU square routine through CPUTimer and prints the value it returns, then
// requests two device buffers and copies the input array to the device.
int main()
{
// Only effective in builds where assert is enabled.
assert(check_device() == true);
constexpr size_t SIZE_ARR{ 1024 };
double input_arr[SIZE_ARR]{ 0 };
RandNumberGen(1.0, 10000.0, input_arr, SIZE_ARR);
constexpr size_t SIZE_BYTE = SIZE_ARR * sizeof(double);
std::cout << std::setprecision(9) << std::fixed;
double cpu_output[SIZE_ARR]{ 0 };
// SQUARE
auto time = CPUTimer(&cpu_output[0], &input_arr[0], SIZE_ARR, &CPUSquare);
std::cout << "CPU square opeartion with " << SIZE_ARR << " size array takes " << std::setw(18) << time << " ns\n";
// NOTE(review): gpu_vars and gpu_output are not used in the visible excerpt.
GPUVars gpu_vars{ 0, 1024 };
double* pgpu_input = nullptr;
double gpu_output[SIZE_ARR];
double* pgpu_output = nullptr;
// NOTE(review): AllocateMem takes its pointer argument by value, so
// pgpu_input and pgpu_output are still nullptr after these two calls.
AllocateMem(pgpu_input, SIZE_BYTE);
AllocateMem(pgpu_output, SIZE_BYTE);
// Receives the still-null pgpu_input; the question reports
// cudaErrorInvalidValue from cudaMemcpy at this point.
CopyMemToDevice(input_arr, pgpu_input, SIZE_BYTE);
// NOTE(review): no cudaFree appears in this excerpt.
}
When I call the CopyMemToDevice function, it throws because cudaMemcpy returns cudaErrorInvalidValue.
Also, if I change CopyMemToDevice function to this still same;
// Variant from the question: allocate through AllocateMem, then copy.
// NOTE(review): AllocateMem takes its pointer by value, so device_arr is not
// changed by the call below. cudaMemcpy then receives whatever pointer the
// caller passed in, and the question reports cudaErrorInvalidValue here.
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
AllocateMem(device_arr, SIZE_BYTE);
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess) // return 1 which is equal to cudaErrorInvalidValue
{
throw std::bad_alloc();
}
}
When I write this function as below, it works perfectly;
// Variant from the question that calls cudaMalloc directly on the parameter.
// NOTE(review): &device_arr is the address of this function's own copy of the
// pointer, so the cudaMemcpy below sees the fresh allocation and succeeds.
// The caller's pointer is never updated, so the caller cannot use or free the
// buffer. The status returned by cudaMalloc is also ignored.
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
cudaMalloc(&device_arr, SIZE_BYTE);
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
{
throw std::bad_alloc();
}
}
Also, I know that AllocateMem function works, cudaMalloc return 0 which is cudaSuccess.
My question is what is the difference between calling a cudaMalloc and cudaMemcpy in the same function and different function? Why it gives cudaErrorInvalidValue : This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. error when I call in the separated function? Thanks in advance.
I am using Visual Studio 2019 16.7.1 and CUDA 10.1

As Igor Tandetnik mentioned in the comments, the problem is caused solely by passing the pointer by value. I updated the AllocateMem function like this:
// Allocates SIZE_BYTE bytes of device memory and stores the device address in
// *arr. Taking T** (the address of the caller's pointer) lets cudaMalloc
// update the caller's pointer itself instead of a by-value copy.
// Throws std::bad_alloc when cudaMalloc does not return cudaSuccess.
template <typename T>
void AllocateMem(T** arr, size_t SIZE_BYTE)
{
    // The stray ';' that sat inside the condition made this a syntax error.
    if (cudaMalloc(arr, SIZE_BYTE) != cudaSuccess)
    {
        throw std::bad_alloc();
    }
}
And call like this,
AllocateMem(&pgpu_output, SIZE_BYTE);
It works.

Related

How to make template function work for char[32] and string type?

I want to design a template function that creates a shared-memory region of size sizeof(T) * n.
It returns a pointer of the template type, and I pass a value that every element should be initialised with.
function def looks like:
#ifndef SHMHELP_HPP_
#define SHMHELP_HPP_
#include <bits/stdc++.h>
#include <sys/shm.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
namespace cppbox {
namespace shm {
// Selects how func() opens the shared-memory object.
enum EmShmOpenMode:int {
    MODE_CREATE, // create the object (or truncate an existing one), size it and initialise every element
    MODE_RD,     // open an existing object without resizing or initialising it
};

// Maps the POSIX shared-memory object `filename` as an array of n elements of T.
//
// filename: name handed to shm_open.
// n:        number of T elements to map (n * sizeof(T) bytes).
// rdflag:   an EmShmOpenMode value; anything other than MODE_CREATE opens an
//           existing object read/write.
// v:        value each element is constructed from in MODE_CREATE; unused otherwise.
//
// Returns the mapped address, or nullptr (after writing the reason to
// std::cerr) when shm_open, ftruncate or mmap fails. The mapping is not
// released here; the caller owns it.
//
// Elements are created with std::uninitialized_fill rather than std::fill,
// because freshly mapped memory holds no constructed objects and assigning
// into it is undefined for non-trivial T such as std::string.
// NOTE(review): array element types (e.g. char[32]) still do not compile,
// since `T v` decays to a pointer.
template<typename T> //, T v>
T* func(const char* filename, size_t n, int rdflag, T v) { // mode: 0666
    const bool create = (rdflag == EmShmOpenMode::MODE_CREATE);
    // Plain conditional; the original initialiser assigned to offlag inside
    // its own initialising expression.
    int offlag = create ? (O_CREAT | O_EXCL | O_RDWR) : O_RDWR;
    int shm_fd = shm_open(filename, offlag, 0666);
    if (-1 == shm_fd) {
        if (!create) {
            std::cerr << "shm_open open failed: " << strerror(errno) << std::endl;
            return nullptr;
        }
        // Exclusive creation failed (presumably because the object already
        // exists): reopen it and truncate instead.
        offlag = O_RDWR| O_TRUNC;
        shm_fd = shm_open(filename, offlag, 0666);
        if (-1 == shm_fd) {
            std::cerr << "shm_open create failed: " << strerror(errno) << std::endl;
            return nullptr;
        }
    }
    if (create && ftruncate(shm_fd, n*sizeof(T))) {
        std::cerr << "ftruncate failed: " << strerror(errno) << std::endl;
        close(shm_fd);
        return nullptr;
    }
    void* addr = mmap(0, n*sizeof(T), PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
    if (addr == MAP_FAILED) {
        // Report before close() so errno still describes the mmap failure.
        std::cerr << "mmap failed: " << strerror(errno) << std::endl;
        close(shm_fd);
        return nullptr;
    }
    close(shm_fd);
    T* ret = static_cast<T*>(addr);
    if (create) std::uninitialized_fill(ret, ret + n, v);
    return ret;
}
}
}
#endif // SHMHELP_HPP_
it's ok when i call func<int>("a", 100, 0, 0) or func<double>("a", 100, 0, 0.)
but it crashed when i call func<std::string>("a", 100, 0, "")
int main() {
std::string*p = cppbox::shm::MapShm<std::string>("b", 100, cppbox::shm::MODE_CREATE, "huang");
for (int i = 0; i < 100; ++i) {
cout << (*p)[i] << " ";
}
}
and complier will reject to complie when i call func<char[32]>("a", 100, 0, "") like this:
int main() {
char[32]*p = cppbox::shm::MapShm<char[32]>("b", 100, cppbox::shm::MODE_CREATE, "huang"); // compiler will reject in this line
for (int i = 0; i < 100; ++i) {
cout << (*p)[i] << " ";
}//*/
}
how can i make func<char[32]>("a",100, 0, "") and func<std::string>("a", 100, 0, "") work?
Change
if (rdflag == EmShmOpenMode::MODE_CREATE) std::fill((T*)ret, ((T*)ret) + n, v);
to
if (rdflag == EmShmOpenMode::MODE_CREATE) std::uninitialized_fill((T*)ret, ((T*)ret) + n, v);
That should help with the std::string case.
std::fill can only be used on memory that already contains objects. In your case you have uninitialised (raw) memory containing no constructed objects, so std::uninitialized_fill should be used instead.
First of all it is not a safe practice to allocate memory inside a function and return its pointer. Because one may forget to free the memory!
Second since there is no way to assign a default value for arrays and the desired default value is somewhat empty anyway, you could do it by having 2 function overloads, one for assigning a default value and another for just allocating memory like:
#include <stdio.h>
#include <stdlib.h>
#include <memory>
#include <type_traits>
// Catch-all parameter type: implicitly constructible from a value of any
// type. The value itself is discarded.
class all {
public:
    template <typename Any>
    all(Any) {}
};
//First overload
// Allocates storage for n objects of type T with calloc and copy-constructs
// every element from defaultvalue.
//
// The original assigned into the calloc'ed bytes (`pointer[i] = defaultvalue`),
// which is undefined for non-trivial T because no object has been constructed
// there yet; std::uninitialized_fill_n constructs the elements in place.
// Returns nullptr when calloc fails. The caller is responsible for destroying
// the elements (for non-trivial T) and releasing the storage with free().
template <typename T>
T* pre_func(size_t n, T defaultvalue) {
    T* pointer = static_cast<T*>(calloc(n, sizeof(T)));
    if (pointer == nullptr) {
        return nullptr;
    }
    try {
        std::uninitialized_fill_n(pointer, n, defaultvalue);
    } catch (...) {
        // uninitialized_fill_n has already destroyed what it constructed;
        // only the raw storage is left to release.
        free(pointer);
        throw;
    }
    return pointer;
}
//second overload
// Returns calloc'ed (zero-filled) storage for n objects of type T without
// constructing anything in it; the result is whatever calloc returned.
template <typename T>
T* pre_func(size_t n) {
    void* raw = calloc(n, sizeof(T));
    return (T*) raw;
}
//overload selector
// Dispatches to one of the pre_func overloads at compile time: array element
// types get the allocation-only overload (the default value is swallowed by
// `all`), every other type gets the overload that also applies defaultvalue.
template <typename T>
T* func(size_t n, typename std::conditional<std::is_array<T>::value, all, T>::type defaultvalue){
    if constexpr(!std::is_array<T>::value){
        return pre_func<T>(n, defaultvalue);
    }else{
        return pre_func<T>(n);
    }
}
then in your program call the function overloads any way you want for normal types and array types like:
func<int>(100, 0);
func<char[32]>(100, "");

Memory error when capturing variable in CUDA extended lambda

I am creating an extended (i.e. __device__) lambda in CUDA (see e.g. here) and it is supposed to capture a variable (here, a simple double value = 3;).
It compiles, but running it, I get an invalid memory access error and I don't understand why.
Changing the variable to static const double value = 3 fixes the problem, as it is no longer captured (though I don't understand how it is still available inside the lambda).
Question1: how can I correctly capture host variables in a CUDA extended lambda?
Question2: why is this code not working?
I tried this on Ubuntu 16, both with CUDA 8 and 10.
MWE Code
Compiled with nvcc mwe_lambda.cu -o mwe_lambda --std=c++11 -lineinfo -arch=sm_60 --expt-relaxed-constexpr --expt-extended-lambda
Note in particular the lambda, which should capture by copy.
The managed_allocator etc. are just in order to use managed memory and print the CUDA error.
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <iostream>
#include <string>
// Writes "<error string> <file> <line> <function>" to std::cerr when err is
// not cudaSuccess. Nothing is printed on success, and execution continues in
// both cases.
static void CudaHandleError( cudaError_t err, const char *file, int line, const std::string & function)
{
    if (err == cudaSuccess)
    {
        return;
    }
    const std::string message(cudaGetErrorString( err ));
    std::cerr << message << " " << file << " " << line << " " << function << std::endl;
}
// Forward a CUDA status code / the last recorded CUDA error to
// CudaHandleError together with the call site (file, line, function).
#define CU_HANDLE_ERROR( err ) (CudaHandleError( err, __FILE__, __LINE__, __func__ ))
#define CU_CHECK_ERROR( ) (CudaHandleError( cudaGetLastError(), __FILE__, __LINE__, __func__ ))
// Checks the last error, then synchronizes the device and checks that status.
// Wrapped in do { } while (0) so both checks form a single statement; the
// original two-statement expansion ran the synchronize unconditionally when
// the macro was used under an unbraced `if`.
#define CU_CHECK_AND_SYNC( ) do { CU_CHECK_ERROR(); CU_HANDLE_ERROR( cudaDeviceSynchronize() ); } while (0)
// Allocator derived from std::allocator<T> whose storage comes from
// cudaMallocManaged and is released with cudaFree. The status of both CUDA
// calls is passed to CU_HANDLE_ERROR.
template<class T>
class managed_allocator : public std::allocator<T>
{
public:
    typedef T value_type;

    // Names the same allocator for a different element type.
    template<typename Other>
    struct rebind
    {
        using other = managed_allocator<Other>;
    };

    managed_allocator() throw() : std::allocator<T>() { }
    managed_allocator(const managed_allocator &a) throw() : std::allocator<T>(a) { }
    template <class U>
    managed_allocator(const managed_allocator<U> &a) throw() : std::allocator<T>(a) { }
    ~managed_allocator() throw() { }

    // Requests n * sizeof(value_type) bytes of managed memory and returns the
    // pointer cudaMallocManaged stored (initialised to nullptr beforehand).
    value_type* allocate(size_t n)
    {
        value_type* storage = nullptr;
        const size_t bytes = n*sizeof(value_type);
        CU_HANDLE_ERROR( cudaMallocManaged(&storage, bytes) );
        return storage;
    }

    // Releases ptr with cudaFree; the element count is not needed.
    void deallocate(value_type* ptr, size_t)
    {
        CU_HANDLE_ERROR( cudaFree(ptr) );
    }
};
template<typename T>
using field = std::vector<T, managed_allocator<T>>;
// vf[i] = f()
// Kernel: every thread whose global index idx (computed from blockIdx.x,
// blockDim.x and threadIdx.x) is below N stores the result of f() in vf[idx].
// Threads with idx >= N do nothing.
// NOTE(review): `f` is taken by const reference. This is the defect the
// question is about: per the answer below, values captured by the lambda are
// not available on the device when the lambda is passed by reference; passing
// it by value (`const F f`) copies the object and its captures to the kernel.
template<typename A, typename F>
__global__ void cu_set_lambda(A * vf, const F & f, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
{
vf[idx] = f();
}
}
// Driver from the question: creates a 10-element managed-memory field, launches
// cu_set_lambda with a lambda that returns a captured double, then prints the
// first two elements to std::cerr.
int main()
{
std::cerr << "started" << std::endl;
{
field<double> vf(10, 0);
double value = 3;
// Extended __device__ lambda; `value` is captured by copy ([=]).
auto lambda = [=] __device__ ()
{
return value;
};
auto n = vf.size();
// 1024 threads per block; the block count is n / 1024 rounded up.
cu_set_lambda<<<(n+1023)/1024, 1024>>>(vf.data(), lambda, n);
// Reports launch errors and errors surfaced by the device synchronize.
CU_CHECK_AND_SYNC();
std::cerr << vf[0] << " " << vf[1] << std::endl;
}
std::cerr << "finished" << std::endl;
}
You need to pass the lambda by value, as the variables captured by value in the lambda will not be available in device when you pass the lambda by reference.
__global__ void cu_set_lambda(A * vf, const F f, int N)
^^^^^^^
If you pass the lambda by value, the object (and its internals) will be copied to the kernel.

std::num_put issue with nan-boxing due to auto-cast from float to double

I'm using this post to extend nan values with some extra info and this post to modify std::cout behaviour and display this extra info.
Here is the code defining the functions and NumPut class:
#include <iostream>
#include <assert.h>
#include <limits>
#include <bitset>
#include <cmath>
#include <locale>
#include <ostream>
#include <sstream>
// Prints `val`, the label `what` and the bit pattern of `val` to std::cout as
// "<val> (<what>): <bits>", with sizeof(T) * 8 binary digits in std::bitset
// order (highest bit first).
//
// The bytes of `val` are copied into a zero-initialised integer. The original
// wrote one union member and read the other, which is undefined behaviour in
// C++ and left the upper bytes indeterminate whenever T is narrower than
// unsigned long long. On a little-endian host the printed digits are the same
// as before.
template <typename T>
void showValue( T val, const std::string& what )
{
    unsigned long long bits = 0;
    // Never copy more bytes than the integer can hold.
    const std::size_t count = sizeof(T) < sizeof(bits) ? sizeof(T) : sizeof(bits);
    const unsigned char* src = reinterpret_cast<const unsigned char*>( &val );
    unsigned char* dst = reinterpret_cast<unsigned char*>( &bits );
    for ( std::size_t i = 0; i < count; ++i )
    {
        dst[i] = src[i];
    }
    std::bitset<sizeof(T) * 8> b(bits);
    std::cout << val << " (" << what << "): " << b.to_string() << std::endl;
}
// Returns a copy of `value` whose first byte (lowest address) has the bits of
// `mask` OR-ed into it. Asserts that this byte is zero beforehand.
template <typename T>
T customizeNaN( T value, char mask )
{
    T tagged = value;
    char* firstByte = reinterpret_cast<char*>( &tagged );
    assert( *firstByte == 0 );
    *firstByte |= mask;
    return tagged;
}
// True when the first byte (lowest address) of `value` equals `mask`.
template <typename T>
bool isCustomNaN( T value, char mask )
{
    const char* bytes = reinterpret_cast<const char*>( &value );
    return *bytes == mask;
}
// Returns the first byte (lowest address) of `value`.
template <typename T>
char getCustomNaNMask( T value )
{
    const char* bytes = reinterpret_cast<const char*>( &value );
    return *bytes;
}
// std::num_put facet that replaces the text written for NaN doubles.
//
// For a NaN whose mask byte (from getCustomNaNMask) is zero, the text
// "Not a Number" is written. Otherwise "Custom Not a Number" is written,
// followed by the mask in hexadecimal as "(0x..)". Non-NaN values are
// forwarded to the base facet unchanged. The replacement text is copied
// verbatim, so `fill` and the formatting state of `str` are not applied to it.
template <typename Iterator = std::ostreambuf_iterator<char> >
class NumPut : public std::num_put<char, Iterator>
{
private:
    using base_type = std::num_put<char, Iterator>;
public:
    using char_type = typename base_type::char_type;
    using iter_type = typename base_type::iter_type;
    NumPut(std::size_t refs = 0)
        : base_type(refs)
    {}
protected:
    virtual iter_type do_put(iter_type out, std::ios_base& str, char_type fill, double v) const override {
        if(!std::isnan(v))
        {
            return base_type::do_put(out, str, fill, v);
        }
        const char mask = getCustomNaNMask(v);
        if ( mask == 0x00 )
        {
            return std::copy(std::begin(NotANumber), std::end(NotANumber), out);
        }
        std::stringstream maskStr;
        // Convert through unsigned char first: where char is signed, a mask of
        // 0x80 or above would otherwise be sign-extended and print as ffffff80.
        maskStr << "(0x" << std::hex << static_cast<unsigned>(static_cast<unsigned char>(mask)) << ")";
        const std::string temp = maskStr.str();
        out = std::copy(std::begin(CustomNotANumber), std::end(CustomNotANumber), out);
        return std::copy(std::begin(temp), std::end(temp), out);
    }
private:
    static const std::string NotANumber;
    static const std::string CustomNotANumber;
};
template<typename Iterator> const std::string NumPut<Iterator>::NotANumber = "Not a Number";
template<typename Iterator> const std::string NumPut<Iterator>::CustomNotANumber = "Custom Not a Number";
inline void fixNaNToStream( std::ostream& str )
{
str.imbue( std::locale(str.getloc(), new NumPut<std::ostreambuf_iterator<char>>() ) );
}
A simple test function:
template<typename T>
void doTest()
{
T regular_nan = std::numeric_limits<T>::quiet_NaN();
T myNaN1 = customizeNaN( regular_nan, 0x01 );
T myNaN2 = customizeNaN( regular_nan, 0x02 );
showValue( regular_nan, "regular" );
showValue( myNaN1, "custom 1" );
showValue( myNaN2, "custom 2" );
}
My main program:
int main(int argc, char *argv[])
{
fixNaNToStream( std::cout );
doTest<double>();
doTest<float>();
return 0;
}
doTest<double> outputs:
Not a Number (regular): 0111111111111000000000000000000000000000000000000000000000000000
Custom Not a Number(0x1) (custom 1): 0111111111111000000000000000000000000000000000000000000000000001
Custom Not a Number(0x2) (custom 2): 0111111111111000000000000000000000000000000000000000000000000010
doTest<float> outputs:
Not a Number (regular): 01111111110000000000000000000000
Not a Number (custom 1): 01111111110000000000000000000001
Not a Number (custom 2): 01111111110000000000000000000010
While I would expect for float:
Not a Number (regular): 01111111110000000000000000000000
Custom Not a Number(0x1) (custom 1): 01111111110000000000000000000001
Custom Not a Number(0x2) (custom 2): 01111111110000000000000000000010
The problem is that num_put only has a virtual do_put for double, not for float. So my float is silently casted to a double, losing my extended information.
I know there are some alternatives, like using FloatFormat from the second post, or simply writing a smart float2double function and calling it prior to sending my NaN value to the output stream, but they require the developer to take care of this situation...and he may forget to.
Is there no way to implement that within NumPut class or anything else that would simply make things work when a float is send to the imbued stream as nicely as it works for a double?
My requirement is to be able to simply call a function like fixNaNToStream for any output stream (std::cout, local std::stringstream, ...) and then send float and double to it and get them identified as my custom NaNs and displayed accordingly.
The problem is that num_put only has a virtual do_put for double, not for float. So my float is silently casted to a double, losing my extended information.
The information is lost because the positions of the bits carrying it are different when the number is converted from float to double:
// Assuming an IEE-754 floating-point representation of float and double
0 11111111 10000000000000000000010
0 11111111111 1000000000000000000001000000000000000000000000000000
Note that the mantissa bits are "shifted" by 3 positions, because the exponent requires 3 more bits.
Also, it's worth noting what it's stated in this page: https://en.cppreference.com/w/cpp/numeric/math/isnan
Copying a NaN is not required, by IEEE-754, to preserve its bit representation (sign and payload), though most implementation do.
I assume the same holds for casting such values, so that, even ignoring other causes of undefined behavior in OP's code, whether a method of NaN-boxing could work or not is actually implementation defined.
In my former attempts of answering this question, I used some explicit bit shifting by different offset to achive the result, but as jpo38 also found out, the easiest way is to always generate a float NaN and then cast correctly.
The Standard Library function std::nanf could be used to generate a "customized" float NaN, but in the following demo snippet I won't use it.
#include <cstdint>
#include <limits>
#include <cstring>
#include <cassert>
#include <type_traits>
#include <iostream>
#include <bitset>
#include <array>
#include <climits>
namespace my {
// Waiting for C++20 std::bit_cast
// source: https://en.cppreference.com/w/cpp/numeric/bit_cast
// Copies the object representation of `src` into a fresh To and returns it.
// Participates in overload resolution only when both types have the same size,
// From is trivially copyable and To is trivial.
template <class To, class From>
typename std::enable_if<
    (sizeof(To) == sizeof(From)) &&
    std::is_trivially_copyable<From>::value &&
    std::is_trivial<To>::value,
    // this implementation requires that To is trivially default constructible
    To>::type
// constexpr support needs compiler magic
bit_cast(const From &src) noexcept
{
    To result;
    std::memcpy(&result, &src, sizeof(To));
    return result;
}

// Writes the first Size bytes of x to std::cout as binary digits, starting
// with the byte at the highest address, followed by a newline.
template <typename T, std::size_t Size = sizeof(T)>
void print_bits(T x)
{
    std::array<unsigned char, Size> raw;
    std::memcpy(raw.data(), &x, Size);
    for (std::size_t remaining = Size; remaining > 0; --remaining)
    {
        const std::bitset<CHAR_BIT> byteBits{raw[remaining - 1]};
        std::cout << byteBits.to_string();
    }
    std::cout << '\n';
}

// The following assumes that both floats and doubles store the mantissa
// in the lower bits and that while casting a NaN (float->double or double->float)
// the most significant of those aren't changed
// Builds a float quiet NaN with `data` OR-ed into its bit pattern and converts
// it to T on return.
template <typename T>
auto boxed_nan(uint8_t data = 0) -> typename std::enable_if<std::numeric_limits<T>::has_quiet_NaN, T>::type
{
    const uint32_t quietBits = bit_cast<uint32_t>(std::numeric_limits<float>::quiet_NaN());
    const uint32_t taggedBits = quietBits | static_cast<uint32_t>(data);
    return bit_cast<float>(taggedBits);
}

// Converts num to float and returns the lowest 8 bits of that float's bit
// pattern.
template <typename T>
uint8_t unbox_nan(T num)
{
    const float narrowed = static_cast<float>(num);
    return static_cast<uint8_t>(bit_cast<uint32_t>(narrowed));
}
} // End of namespace 'my'
int main()
{
auto my_nan = my::boxed_nan<float>(42);
my::print_bits(my_nan);
my::print_bits(static_cast<double>(my_nan));
assert(my::unbox_nan(my_nan) == 42);
assert(my::unbox_nan(static_cast<double>(my_nan)) == 42);
auto my_d_nan = my::boxed_nan<double>(17);
my::print_bits(my_d_nan);
my::print_bits(static_cast<float>(my_d_nan));
assert(my::unbox_nan(my_d_nan) == 17);
assert(my::unbox_nan(static_cast<float>(my_d_nan)) == 17);
auto my_ld_nan = my::boxed_nan<long double>(9);
assert(my::unbox_nan(my_ld_nan) == 9);
assert(my::unbox_nan(static_cast<double>(my_ld_nan)) == 9);
}
As Bob pointed, the double extended bit should be at the same relative position to biased exponent than it is for float if you want cast to work in both ways (from float to double and from double to float).
Considering that, a very simple approach is to use the rightmost bits of the float. For double, instead of trying to determine manually which bits should be used, simply use cast operations and let the system work out the right place...
Then code becomes:
#include <iostream>
#include <assert.h>
#include <limits>
#include <bitset>
#include <cmath>
#include <locale>
#include <ostream>
#include <sstream>
// Prints `val`, the label `what` and the bit pattern of `val` to std::cout as
// "<val> (<what>): <bits>", with sizeof(T) * 8 binary digits in std::bitset
// order (highest bit first).
//
// The bytes of `val` are copied into a zero-initialised integer. The original
// wrote one union member and read the other, which is undefined behaviour in
// C++ and left the upper bytes indeterminate whenever T is narrower than
// unsigned long long. On a little-endian host the printed digits are the same
// as before.
template <typename T>
void showValue( T val, const std::string& what )
{
    unsigned long long bits = 0;
    // Never copy more bytes than the integer can hold.
    const std::size_t count = sizeof(T) < sizeof(bits) ? sizeof(T) : sizeof(bits);
    const unsigned char* src = reinterpret_cast<const unsigned char*>( &val );
    unsigned char* dst = reinterpret_cast<unsigned char*>( &bits );
    for ( std::size_t i = 0; i < count; ++i )
    {
        dst[i] = src[i];
    }
    std::bitset<sizeof(T) * 8> b(bits);
    std::cout << val << " (" << what << "): " << b.to_string() << std::endl;
}
// Returns a reference to the first byte (lowest address) of `value`, so the
// caller can both read and modify it.
char& getCustomNaNMask( float& value )
{
    char* firstByte = reinterpret_cast<char*>( &value );
    return *firstByte;
}
/** temp parameter is mainly used because we can't have two functions with same prototype even if they return different values */
// float overload: a quiet NaN with `mask` OR-ed into its first byte.
float getCustomizedNaN( char mask, float temp )
{
    // let's reuse temp argument as we need a local float variable
    temp = std::numeric_limits<float>::quiet_NaN();
    char& tag = getCustomNaNMask(temp);
    tag |= mask;
    return temp;
}
/** temp parameter is mainly used because we can't have two functions with same prototype even if they return different values */
// double overload: builds the float version and converts it to double.
double getCustomizedNaN( char mask, double temp )
{
    const float tagged = getCustomizedNaN( mask, float() );
    // Let the system correctly cast from float to double, that's it!
    return static_cast<double>( tagged );
}
template <typename T>
bool isCustomNaN( T value, char mask )
{
return getCustomNaNMask(value) == mask;
}
// std::num_put facet that replaces the text written for NaN doubles.
//
// The NaN is converted to float and its mask byte read with getCustomNaNMask.
// A zero mask writes "Not a Number"; any other mask writes
// "Custom Not a Number" followed by the mask in hexadecimal as "(0x..)".
// Non-NaN values are forwarded to the base facet unchanged. The replacement
// text is copied verbatim, so `fill` and the formatting state of `str` are not
// applied to it.
template <typename Iterator = std::ostreambuf_iterator<char> >
class NumPut : public std::num_put<char, Iterator>
{
private:
    using base_type = std::num_put<char, Iterator>;
public:
    using char_type = typename base_type::char_type;
    using iter_type = typename base_type::iter_type;
    NumPut(std::size_t refs = 0)
        : base_type(refs)
    {}
protected:
    virtual iter_type do_put(iter_type out, std::ios_base& str, char_type fill, double v) const override {
        if(!std::isnan(v))
        {
            return base_type::do_put(out, str, fill, v);
        }
        float asFloat = static_cast<float>( v );
        const char mask = getCustomNaNMask(asFloat);
        if ( mask == 0x00 )
        {
            return std::copy(std::begin(NotANumber), std::end(NotANumber), out);
        }
        std::stringstream maskStr;
        // Convert through unsigned char first: where char is signed, a mask of
        // 0x80 or above would otherwise be sign-extended and print as ffffff80.
        maskStr << "(0x" << std::hex << static_cast<unsigned>(static_cast<unsigned char>(mask)) << ")";
        const std::string temp = maskStr.str();
        out = std::copy(std::begin(CustomNotANumber), std::end(CustomNotANumber), out);
        return std::copy(std::begin(temp), std::end(temp), out);
    }
private:
    static const std::string NotANumber;
    static const std::string CustomNotANumber;
};
template<typename Iterator> const std::string NumPut<Iterator>::NotANumber = "Not a Number";
template<typename Iterator> const std::string NumPut<Iterator>::CustomNotANumber = "Custom Not a Number";
inline void fixNaNToStream( std::ostream& str )
{
str.imbue( std::locale(str.getloc(), new NumPut<std::ostreambuf_iterator<char>>() ) );
}
And test program:
template<typename T>
void doTest()
{
T regular_nan = std::numeric_limits<T>::quiet_NaN();
T myNaN1 = getCustomizedNaN( 0x01, T() );
T myNaN2 = getCustomizedNaN( 0x02, T() );
showValue( regular_nan, "regular" );
showValue( myNaN1, "custom 1" );
showValue( myNaN2, "custom 2" );
}
int main(int argc, char *argv[])
{
fixNaNToStream( std::cout );
doTest<double>();
doTest<float>();
return 0;
}
Outputs:
Not a Number (regular): 0111111111111000000000000000000000000000000000000000000000000000
Custom Not a Number(0x1) (custom 1): 0111111111111000000000000000000000100000000000000000000000000000
Custom Not a Number(0x2) (custom 2): 0111111111111000000000000000000001000000000000000000000000000000
Not a Number (regular): 01111111110000000000000000000000
Custom Not a Number(0x1) (custom 1): 01111111110000000000000000000001
Custom Not a Number(0x2) (custom 2): 01111111110000000000000000000010
Thanks Bob!

How do I store the intermediate results of a recursive function using C++ templates at compile time?

I asked How do I capture the results of a recursive function at compile-time?, but I think my approach was wrong.
I have a program like so:
#include <iostream>
#include <list>
// Strips decimal digits off the end of i one at a time, pushing (digit + 1) to
// the front of `result`, until i reaches 0. At least one digit is always
// processed. Both `result` and `i` are modified in place; a copy of the final
// list is returned.
std::list<unsigned int> recursive_case(std::list<unsigned int>& result, unsigned int& i) {
    do {
        const unsigned int digit = i % 10;
        result.push_front(1 + digit);
        i /= 10;
    } while (i != 0);
    return result;
}
// Returns the decimal digits of i, most significant first, with every digit
// except the last one increased by 1.
std::list<unsigned int> initial_case(unsigned int i) {
    std::list<unsigned int> result;
    result.push_back(i % 10);
    i /= 10;
    if (i == 0) {
        return result;
    }
    return recursive_case(result, i);
}
// Prints the list produced by initial_case(123) as comma-separated values on
// one line.
int main() {
    const auto digits = initial_case(123);
    const char* separator = "";
    for (auto value : digits) {
        std::cout << separator << value;
        separator = ", ";
    }
    std::cout << std::endl;
}
The output is 2, 3, 3.
I want to perform the above computation and get the same output but in compile-time (the loop iteration and output-printing would be at runtime i.e. everything starting from the for loop). Templates seem like a possibility (that's why I tagged this ask as such), but I am open to anything that gets the job done in compile-time.
You can use constexpr to calculate the list at compile time. I converted the recursion to iteration and used the indices trick to call calculate as often as necessary.
#include <iostream>
#include <array>
#include <iterator>
#include <utility>
// Number of decimal digits in N. Count is the running total used by the
// recursion; callers normally leave it at its default of 0.
//
// Zero counts as one digit. The original returned 0 for N == 0, which made
// build_list<0>() produce an empty array where the runtime version of the
// algorithm yields the single element 0.
constexpr std::size_t count_digits(std::size_t N, std::size_t Count = 0)
{
    return (N >= 10) ? count_digits(N/10, Count+1) : Count+1;
}
// Returns Base raised to the power N (note the argument order: exponent first).
constexpr std::size_t ipow(std::size_t N, std::size_t Base)
{
    std::size_t result = 1;
    for (std::size_t k = 0; k < N; ++k)
    {
        result *= Base;
    }
    return result;
}
// Returns the decimal digit of n at position i (0 = least significant),
// increased by 1 for every position except 0.
constexpr std::size_t calculate(std::size_t n, std::size_t i)
{
    const std::size_t digit = (n / ipow(i, 10)) % 10;
    return (i == 0) ? digit : digit + 1;
}
// Expands the index pack into one calculate() call per digit position; index 0
// of the pack maps to position C-1, so the most significant digit comes first.
template<std::size_t Num, std::size_t C, std::size_t... Is>
constexpr std::array<std::size_t, C> build_list(std::index_sequence<Is...>)
{
    return std::array<std::size_t, C>{{ calculate(Num, (C - 1) - Is)... }};
}
// Entry point: C defaults to the digit count of Num and selects the size of
// the returned array.
template <std::size_t Num, std::size_t C = count_digits(Num)>
constexpr auto build_list()
{
    return build_list<Num, C>(std::make_index_sequence<C>());
}
int main()
{
constexpr auto list = build_list<123>();
for(auto e : list)
{
std::cout << e << " ";
}
return 0;
}
output:
2 3 3
live example
Here's one solution.
#include <iostream>
// Print one digit.
// Primary template (used when Initial is true): prints the last decimal digit
// of N unchanged and without a separator.
template <unsigned int N, bool Initial> struct AtomicPrinter
{
    static void print()
    {
        const unsigned int lastDigit = N % 10;
        std::cout << lastDigit;
    }
};
// Initial == false: prints (last digit + 1) followed by ", ".
template <unsigned int N> struct AtomicPrinter<N, false>
{
    static void print()
    {
        const unsigned int shifted = 1 + N % 10;
        std::cout << shifted << ", ";
    }
};
// Recursive printer for a number
// Prints the higher-order digits first (always as non-initial), then this
// level's own digit.
template <unsigned int N, bool Initial> struct Printer
{
    static void print()
    {
        using Higher = Printer<N / 10, false>;
        using Own = AtomicPrinter<N, Initial>;
        Higher::print();
        Own::print();
    }
};
// Specialization to end recursion.
template <bool TF> struct Printer<0, TF>
{
    static void print() {}
};
void printList()
{
Printer<123, true>::print();
std::cout << std::endl;
}
int main() {
printList();
}
If there is a need to separate printing of the digits from constructing the list of digits, you can use:
#include <iostream>
#include <list>
// Primary template (used when Initial is true): appends the last decimal digit
// of N unchanged.
template <unsigned int N, bool Initial> struct Digit
{
    static void get(std::list<int>& l)
    {
        const unsigned int lastDigit = N % 10;
        l.push_back(lastDigit);
    }
};
// Initial == false: appends (last digit + 1).
template <unsigned int N> struct Digit<N, false>
{
    static void get(std::list<int>& l)
    {
        const unsigned int shifted = 1 + N % 10;
        l.push_back(shifted);
    }
};
// Appends the higher-order digits first (always as non-initial), then this
// level's own digit, so the list ends up most significant digit first.
template <unsigned int N, bool Initial> struct Digits
{
    static void get(std::list<int>& l)
    {
        using Higher = Digits<N / 10, false>;
        using Own = Digit<N, Initial>;
        Higher::get(l);
        Own::get(l);
    }
};
// Ends the recursion: nothing is appended for 0.
template <bool TF> struct Digits<0, TF>
{
    static void get(std::list<int>&) {}
};
void printList()
{
std::list<int> l;
Digits<123, true>::get(l);
bool first = true;
for (auto i: l) {
if (first) {
first = false;
} else {
std::cout << ", ";
}
std::cout << i;
}
std::cout << std::endl;
}
int main() {
printList();
}
You may use something like the following to split number at compile time:
#include <utility>
#include <iostream>
// Literal operator template: turns the characters of a numeric literal such as
// 123_seq into std::integer_sequence<char, '1', '2', '3'>.
template <char... Cs>
std::integer_sequence<char, Cs...> operator "" _seq() { return {}; }
// Prints the characters of the sequence to std::cout, separated by ", ".
template <char...Cs>
void print(std::integer_sequence<char, Cs...>)
{
    const char chars[] = {Cs...};
    bool first = true;
    for (const char c : chars) {
        if (!first) {
            std::cout << ", ";
        }
        first = false;
        std::cout << c;
    }
}
int main() {
auto seq = 123_seq;
print(seq);
}
Demo

(C++) Reversing a string using stacks?

I'm trying to reverse a string using stacks. It correctly reverses the string, but the for loop crashes when i reaches 0. I get a "string subscript out of range" error. Currently the for loop only decrements to 1. How can I get it to push and display s1[0]?
This is the main code:
#include <cstdlib> // Provides EXIT_SUCCESS
#include <iostream> // Provides cin, cout
#include <stack> // Provides stack
#include <string> // Provides string
using namespace std;
. . .
// Pushes the characters of s1 onto a stack from the last index downwards,
// echoing each pushed character to cout, and returns a fixed status message
// (not the reversed string).
// NOTE(review): the loop condition `i > 0` stops before index 0, so s1[0] is
// never pushed or printed. This is the behaviour the question asks about.
// `i` is the unsigned string::size_type, so for an empty string
// s1.size() - 1 wraps around to a very large value and s1[i] is out of range.
string reverse(string & s1)
{
stack<char> stk1;
string::size_type i;
// this for loop sets the rest of the characters
for (i = s1.size() - 1; i > 0; i--)
{
stk1.push(s1[i]);
cout << stk1.top();
}
return "The function was a success. Now that's what I call reverse psychology.";
}
This is the header file:
#ifndef MAIN_SAVITCH_STACK1_H
#define MAIN_SAVITCH_STACK1_H
#include <cstdlib> // Provides size_t
namespace main_savitch_7A
{
    // Fixed-capacity stack stored in an in-object array of CAPACITY items.
    // push, pop and top are only declared here; their definitions come from
    // the implementation file included after this namespace.
    template <class Item>
    class stack
    {
    public:
        // TYPEDEFS AND MEMBER CONSTANT -- See Appendix E if this fails to compile.
        typedef std::size_t size_type;
        typedef Item value_type;
        static const size_type CAPACITY = 30;

        // CONSTRUCTOR
        // Starts with no items in use.
        stack( ) : used(0) { }

        // MODIFICATION MEMBER FUNCTIONS
        void push(const Item& entry);
        void pop( );

        // CONSTANT MEMBER FUNCTIONS
        bool empty( ) const
        {
            return used == 0;
        }
        size_type size( ) const
        {
            return used;
        }
        Item top( ) const;

    private:
        Item data[CAPACITY]; // Partially filled array
        size_type used;      // How much of array is being used
    };
}
#include "stack1.template" // Include the implementation.
#endif
And this is the stack implementation (a template file):
#include <cassert> // Provides assert
namespace main_savitch_7A
{
    // Out-of-class definition of the static member constant.
    template <class Item>
    const typename stack<Item>::size_type stack<Item>::CAPACITY;

    // Stores entry in the next free slot; asserts that the stack is not full.
    template <class Item>
    void stack<Item>::push(const Item& entry)
    // Library facilities used: cassert
    {
        assert(size( ) < CAPACITY);
        data[used++] = entry;
    }

    // Discards the most recently pushed item; asserts that the stack is not empty.
    template <class Item>
    void stack<Item>::pop( )
    // Library facilities used: cassert
    {
        assert(!empty( ));
        used -= 1;
    }

    // Returns a copy of the most recently pushed item; asserts that the stack
    // is not empty.
    template <class Item>
    Item stack<Item>::top( ) const
    // Library facilities used: cassert
    {
        assert(!empty( ));
        const size_type last = used - 1;
        return data[last];
    }
}
I want to change the for loop to this, but it doesn't work:
// this for loop sets the rest of the characters
for (i = s1.size() - 1; i >= 0; i--) // i > -1 doesn't work either
{
stk1.push(s1[i]);
cout << stk1.top();
}
cout << s1[0] << "\n\n";
return "The function was a success. Now that's what I call reverse psychology.";
}
I can think of the following couple of options.
Using the string::size_type for the loop counter:
string::size_type i;
for (i = s1.size(); i > 0; i--)
{
stk1.push(s1[i-1]);
cout << stk1.top();
}
or
Using an int for the loop counter:
int i = 0;
for (i = s1.size()-1; i >= 0; i--)
{
stk1.push(s1[i]);
cout << stk1.top();
}
i is unsigned so it wraps around when it is decremented if it is equal to 0. You need to use a signed type for it or to check the boundary condition without involving negative numbers(that is, do not compare it with -1 and do not decrement it if it is 0).