Is there a generalization of std::bitset for two-bit values? - c++

Suppose I am a genome scientist trying to store extremely long strings of characters, each of which represents two bits of information (i.e. each element is either G, A, T, or C). Because the strings are incredibly long, I need to be able to store a string of length N in precisely 2N bits (or rather, N/4 bytes).
With that motivation in mind, I am looking for a generalization of std::bitset (or boost::dynamic_bitset<>) that works on two-bit values instead of single-bit values. I want to store N such two-bit values, each of which can be 0, 1, 2, or 3. I need the data packed as closely as possible in memory, so vector<char> will not work (as it wastes a factor of 4 of memory).
What is the best way to achieve my goal? One option is to wrap the existing bitset templates with customized operator[], iterators, etc., but I'd prefer to use an existing library if at all possible.

std::bitset<> is fixed length and you probably do not want that.
I think you should go ahead and wrap std::vector<bool>.
Note that std::vector<bool> is optimised for space, but has the benefit that it is dynamic in size.
Presumably you need to read the genome of arbitrary length on from somewhere.
Have a think about whether you need much of an API to access it; you might only need a couple of methods.
#Jefffrey's answer already covers the relevant code, if for bitset<>.
[ I am not familiar with boost::dynamic_bitset<> and what it might give over vector.]
One further thought is whether it might be convenient for you to work with quads of letters, a quad nicely filling a char in space.
class Genome
{
public:
enum class Letter {A,C,G,T};
Genome(const std::string& source)
{
code_.resize(source.size() * 2);
for (unsigned index = 0; index != source.size(); ++index)
{
char text = source[index];
Letter letter = textToLetter(text);
set(index, letter);
}
}
static Letter textToLetter(char text)
{
// Or search through the array `letterText`.
// Or come up with a neat but unintelligible one liner ...
Letter letter = Letter::A;
switch (text)
{
case 'A':
letter = Letter::A;
break;
case 'C':
letter = Letter::C;
break;
case 'G':
letter = Letter::G;
break;
case 'T':
letter = Letter::T;
break;
default:
// Invalid - handle error.
break;
}
return letter;
}
static char letterToText(Letter l)
{
return letterText[(unsigned)l];
}
// Add bounds checking
Letter get(unsigned index) const
{
unsigned distance = index * 2;
char numeric = code_[distance] + code_[distance + 1] * 2;
return Letter(numeric);
}
// Add bounds checking
void set(unsigned index, Letter value)
{
unsigned distance = index * 2;
bool low = (unsigned)value & 1;
bool high = (bool)((unsigned)value & 2);
code_[distance] = low;
code_[distance + 1] = high;
}
unsigned size()
{
return code_.size() / 2;
}
// Extend by numLetters, initially set to 'A'
void extend(unsigned numLetters)
{
code_.resize(code_.size() + numLetters * 2);
}
private:
static char letterText[4];
std::vector<bool> code_;
};
char Genome::letterText [4] = { 'A', 'C', 'G', 'T' };
int main()
{
Genome g("GATT");
g.extend(3);
g.set(5, Genome::Letter::C);
for (unsigned i = 0; i != g.size(); ++i)
std::cout << Genome::letterToText(g.get(i));
std::cout << std::endl;
return 0;
}

You have two choices.
Given:
enum class nucleobase { a, c, g, t };
You have two choices. You can:
use a single std::bitset and play with indexing
use std::bitset in combination with another container
For the first, you can just define a couple of functions that target the correct number of bits per set/get:
template<std::size_t N>
void set(std::bitset<N>& bits, std::size_t i, nucleobase x) {
switch (x) {
case nucleobase::a: bits.set(i * 2, 0); bits.set(i * 2 + 1, 0); break;
case nucleobase::c: bits.set(i * 2, 0); bits.set(i * 2 + 1, 1); break;
case nucleobase::g: bits.set(i * 2, 1); bits.set(i * 2 + 1, 0); break;
case nucleobase::t: bits.set(i * 2, 1); bits.set(i * 2 + 1, 1); break;
}
}
template<std::size_t N>
nucleobase get(const std::bitset<N>& bits, std::size_t i) {
if (!bits[i * 2])
if (!bits[i * 2 + 1]) return nucleobase::a;
else return nucleobase::c;
else
if (!bits[i * 2 + 1]) return nucleobase::g;
else return nucleobase::t;
}
Live demo
The above is just an example and a terrible one (it's almost 4AM here and I really need to sleep).
For the second you just need to map alleles and bits:
bit_pair bits_for(nucleobase x) {
switch (x) {
case nucleobase::a: return bit_pair("00"); break;
case nucleobase::c: return bit_pair("10"); break;
case nucleobase::g: return bit_pair("01"); break;
case nucleobase::t: return bit_pair("11"); break;
}
}
nucleobase nucleobase_for(bit_pair x) {
switch (x.to_ulong()) {
case 0: return nucleobase::a; break;
case 1: return nucleobase::c; break;
case 2: return nucleobase::g; break;
case 3: return nucleobase::t; break;
default: return nucleobase::a; break; // just for the warning
}
}
Live demo
Of course if you need runtime length you can just use boost::dynamic_bitset and std::vector.

Here's what I use for fixed-length k-mers.
#include <cstdint>
#include <cstdlib>
#include <ostream>
enum class nucleotide { A, C, G, T };
inline std::ostream&
operator<<(std::ostream& pOut, nucleotide pNt)
{
switch (pNt) {
case nucleotide::A: pOut << 'A'; break;
case nucleotide::C: pOut << 'C'; break;
case nucleotide::G: pOut << 'G'; break;
case nucleotide::T: pOut << 'T'; break;
}
return pOut;
}
class kmer_base;
class nucleotide_proxy {
public:
operator nucleotide() const {
return nucleotide((*mWord >> (mPosition * 2)) & 3);
};
nucleotide_proxy& operator=(nucleotide pNt) {
uint64_t word = *mWord;
word &= ~(uint64_t(3) << (mPosition*2));
word |= uint64_t(pNt) << (mPosition*2);
*mWord = word;
return *this;
};
private:
friend class kmer_base;
nucleotide_proxy(uint64_t* pWord, uint8_t pPosition)
: mWord(pWord), mPosition(pPosition)
{
}
uint64_t* mWord;
uint8_t mPosition;
};
class kmer_base {
protected:
nucleotide_proxy access(uint64_t* pWord, size_t pPosition)
{
return nucleotide_proxy(pWord + (pPosition / 32), (pPosition & 31));
}
const nucleotide_proxy access(uint64_t* pWord, size_t pPosition) const
{
return nucleotide_proxy(pWord + (pPosition / 32), (pPosition & 31));
}
};
template<int K>
class kmer : public kmer_base
{
enum { Words = (K + 31) / 32 };
public:
nucleotide_proxy operator[](size_t pOutdex) {
return access(mWords, pOutdex);
}
const nucleotide_proxy operator[](size_t pOutdex) const {
return access(mWords, pOutdex);
}
private:
uint64_t mWords[Words];
};
Extending this to dynamic-length k-mere is left as an exercise; it's pretty easy once you have nucleotide_proxy at your disposal. Implementing the reverse complement operator efficiently is also left as an exercise.

Related

Is there a way I can use a 2-bit size type instead of an int, by just plugging in the new type name instead of int?

I have an application where I need to save as much of memory as possible. I need to store a large amount of data that can take exactly three possible values. So, I have been trying to use a 2 bit sized type.
One possibility is using bit fields. I could do
struct myType {
uint8_t twoBits : 2;
}
This is a suggestion from this thread.
However, everywhere where I have used int variables prior to this, I would need to change their usage by appending a .twoBits. I checked if I can create a bit field outside of a struct, such as
uint8_t twoBits : 2;
but this thread says it is not possible. However,that thread is specific to C, so I am not sure if it applied to C++.
Is there a clean way I can define a 2-bit type, so that by simply replacing int with my type, I can run the program correctly? Or is using bit fields the only possible way?
CPU, and thus the memory, the bus, and the compiler too, uses only bytes or groups of bytes. There's no way to store a 2-bits type without storing also the other 6 remaining bits.
What you can so is define a struct that only uses some bits. But we aware that it will not save memory.
You can pack several x-bits types in a struct, as you already know. Or you can do bits operations to pack/unpack them into a integer type.
Is there a clean way I can define a 2-bit type, so that by simply
replacing int with my type, I can run the program correctly? Or is
using bit fields the only possible way?
You can try to make the struct as transparent as possible by providing implicit conversion operators and constructors:
#include <cstdint>
#include <iostream>
template <std::size_t N, typename T = unsigned>
struct bit_field {
T rep : N;
operator T() { return rep; }
bit_field(T i) : rep{ i } { }
bit_field() = default;
};
using myType = bit_field<2, std::uint8_t>;
int main() {
myType mt;
mt = 3;
std::cout << mt << "\n";
}
So objects of type my_type somewhat behave like real 3-bit unsigned integers, despite having more than 3 bits.
Of course, the residual bits are unused, but as single bits are not addressable on most systems, this is the best way to go.
I'm not convinced that you will save anything with your existing structure, as the surrounding structure still gets rounded up to a whole number of bytes.
You can write the following to squeeze 4 2-bit counters into 1 byte, but as you say, you have to name them myInst.f0:
struct MyStruct
{
ubyte_t f0:2,
f1:2,
f2:2,
f3:2;
} myInst;
In c and c++98, you can declare this anonymous, but this usage is deprecated. You can now access the 4 values directly by name:
struct
{ // deprecated!
ubyte_t f0:2,
f1:2,
f2:2,
f3:2;
};
You could declare some sort of template that wraps a single instance with an operator int and operator =(int), and then define a union to put the 4 instances at the same location, but again anonymous unions are deprecated. However you could then declare references to your 4 values, but then you are paying for the references, which are bigger than the bytes you were trying to save!
template <class Size,int offset,int bits>
struct Bitz
{
Size ignore : offset,
value : bits;
operator Size()const { return value; }
Size operator = (Size val) { return (value = val); }
};
template <class Size,int bits>
struct Bitz0
{ // I know this can be done better
Size value : bits;
operator Size()const { return value; }
Size operator = (Size val) { return (value = val); }
};
static union
{ // Still deprecated!
Bitz0<char, 2> F0;
Bitz<char, 2, 2> F1;
Bitz<char, 4, 2> F2;
Bitz<char, 6, 2> F3;
};
union
{
Bitz0<char, 2> F0;
Bitz<char, 2, 2> F1;
Bitz<char, 4, 2> F2;
Bitz<char, 6, 2> F3;
} bitz;
Bitz0<char, 2>& F0 = bitz.F0; /// etc...
Alternatively, you could simply declare macros to replace the the dotted name with a simple name (how 1970s):
#define myF0 myInst.f0
Note that you can't pass bitfields by reference or pointer, as they don't have a byte address, only by value and assignment.
A very minimal example of a bit array with a proxy class that looks (for the most part) like you were dealing with an array of very small integers.
#include <cstdint>
#include <iostream>
#include <vector>
class proxy
{
uint8_t & byte;
unsigned int shift;
public:
proxy(uint8_t & byte,
unsigned int shift):
byte(byte),
shift(shift)
{
}
proxy(const proxy & src):
byte(src.byte),
shift(src.shift)
{
}
proxy & operator=(const proxy &) = delete;
proxy & operator=(unsigned int val)
{
if (val <=3)
{
uint8_t wipe = 3 << shift;
byte &= ~wipe;
byte |= val << shift;
}
// might want to throw std::out_of_range here
return *this;
}
operator int() const
{
return (byte >> shift) &0x03;
}
};
Proxy holds a reference to a byte and knows how to extract two specific bits and look like an int to anyone who uses it.
If we wrap an array of bits packed into bytes with a class that returns this proxy object wrapped around the appropriate byte, we now have something that looks a lot like an array of very small ints.
class bitarray
{
size_t size;
std::vector<uint8_t> data;
public:
bitarray(size_t size):
size(size),
data((size + 3) / 4)
{
}
proxy operator[](size_t index)
{
return proxy(data[index/4], (index % 4) * 2);
}
};
If you want to extend this and go the distance, Writing your own STL Container should help you make a fully armed and operational bit-packed array.
There's room for abuse here. The caller can hold onto a proxy and get up to whatever manner of evil this allows.
Use of this primitive example:
int main()
{
bitarray arr(10);
arr[0] = 1;
arr[1] = 2;
arr[2] = 3;
arr[3] = 1;
arr[4] = 2;
arr[5] = 3;
arr[6] = 1;
arr[7] = 2;
arr[8] = 3;
arr[9] = 1;
std::cout << arr[0] << std::endl;
std::cout << arr[1] << std::endl;
std::cout << arr[2] << std::endl;
std::cout << arr[3] << std::endl;
std::cout << arr[4] << std::endl;
std::cout << arr[5] << std::endl;
std::cout << arr[6] << std::endl;
std::cout << arr[7] << std::endl;
std::cout << arr[8] << std::endl;
std::cout << arr[9] << std::endl;
}
Simply, build on top of bitset, something like:
#include<bitset>
#include<iostream>
using namespace std;
template<int N>
class mydoublebitset
{
public:
uint_least8_t operator[](size_t index)
{
return 2 * b[index * 2 + 1] + b[index * 2 ];
}
void set(size_t index, uint_least8_t store)
{
switch (store)
{
case 3:
b[index * 2] = 1;
b[index * 2 + 1] = 1;
break;
case 2:
b[index * 2] = 0;
b[index * 2 + 1] = 1;
break;
case 1:
b[index * 2] = 0;
b[index * 2 + 1] = 1;
break;
case 0:
b[index * 2] = 0;
b[index * 2 + 1] = 0;
break;
default:
throw exception();
}
}
private:
bitset<N * 2> b;
};
int main()
{
mydoublebitset<12> mydata;
mydata.set(0, 0);
mydata.set(1, 2);
mydata.set(2, 2);
cout << (unsigned int)mydata[0] << (unsigned int)mydata[1] << (unsigned int)mydata[2] << endl;
system("pause");
return 0;
}
Basically use a bitset with twice the size and index it accordingly. its simpler and memory efficient as is required by you.

Is it possible to get hash values as compile-time constants?

I thought I'd try selecting different options as strings by hashing them, but this doesn't work:
#include <type_traits>
#include <string>
inline void selectMenuOptionString(const std::string& str)
{
switch (std::hash<std::string>()(str))
{
case std::hash<std::string>()(std::string("Selection one")) : break;
// Expression must have a constant value
}
}
inline void selectMenuOptionString2(const std::string& str)
{
size_t selectionOneHash = std::hash<std::string>()(std::string("Selection one"));
switch (std::hash<std::string>()(str))
{
case selectionOneHash: // Expression must have a constant value
// The variable of selectionOneHash cannot be used as a constant
}
constexpr size_t hash = std::hash<int>()(6); // Expression must have a constant value
}
It seems I can't get hash values at compile time. From what I've read each different input should yield the same unique output every time, with a very low chance of collision. Given these properties couldn't the hash value be calculated at compile time? I don't know much at all about hashing, I usually use an unordered_map, but I wanted to try something new for learning's sake.
std::hash::operator() isn't constexpr, so you can't just use it. Instead, you'd have to write your own constexpr hash function. For example, the following is the FNV-1a hash algorithm (untested):
template <typename Str>
constexpr size_t hashString(const Str& toHash)
{
// For this example, I'm requiring size_t to be 64-bit, but you could
// easily change the offset and prime used to the appropriate ones
// based on sizeof(size_t).
static_assert(sizeof(size_t) == 8);
// FNV-1a 64 bit algorithm
size_t result = 0xcbf29ce484222325; // FNV offset basis
for (char c : toHash) {
result ^= c;
result *= 1099511628211; // FNV prime
}
return result;
}
And then you can use it:
int selectMenuOptionString(const std::string& str)
{
switch (hashString(str))
{
case hashString(std::string_view("Selection one")): return 42;
default: return 0;
}
}
Note that if you wrote hashString("Selection one"), it would actually hash the null terminator as well, so you might want to have an overload to catch string literals, such as:
template <size_t N>
constexpr size_t hashString(char const (&toHash)[N])
{
return hashString(std::string_view(toHash));
}
Demo
You'll need to implement your own hash function, because there's no suitable instantiation of std::hash that's constexpr. Here's a cheap-and-dirty...
EDIT: In order not to be humiliated too badly by Justin's answer, I added a 32 bit branch.
constexpr size_t hash(const char *str) {
static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4);
size_t h = 0;
if constexpr(sizeof(size_t) == 8) {
h = 1125899906842597L; // prime
} else {
h = 4294967291L;
}
int i = 0;
while (str[i] != 0) {
h = 31 * h + str[i++];
}
return h;
}
I just wanted to add this because I think it's cool. The constexpr strlen I got from a question here: constexpr strlen
#include <iostream>
#include <string>
int constexpr strlength(const char* str)
{
return *str ? 1 + strlength(str + 1) : 0;
}
size_t constexpr Hash(const char *first)
{ // FNV-1a hash function
const size_t FNVoffsetBasis = 14695981039346656037ULL;
const size_t FNVprime = 1099511628211ULL;
const size_t count = strlength(first);
size_t val = FNVoffsetBasis;
for (size_t next = 0; next < count; ++next)
{
val ^= (size_t)first[next];
val *= FNVprime;
}
return val;
}
inline void selectMenuOptionString(const std::string& str)
{
switch (Hash(str.c_str()))
{
case Hash("Selection one"): /*Do something*/ break;
case Hash("Selection two"): /*Do something*/ break;
}
}
int main()
{
static_assert(strlength("Hello") == 5, "String length not equal");
}
You can't get the hash of a runtime value at compile-time, no.
Even if you passed std::hash a constant expression, it is not defined to be able to do its hashing work at compile-time.
As far as I know (which isn't far), you'd have to come up with some monstrous template metahackery (or, worse, macros!) to do this. Personally, if your text input is known at build, I'd just pregenerate a hash outside of the code, perhaps in some Python-driven pre-build step.

C++ - Class for list with 3 elements

I'm attempting to make a data type that is basically an associative array/map, but it would have 3 elements instead of 2. It would be implemented like this:
myTable rouletteBoard;
rouletteBoard.push.back(0, "Green", "Neither");
rouletteBoard.push.back(00, "Green", "Neither");
rouletteBoard.push.back(1, "Red", "Odd");
So really just a map or list with 3 elements, the first one being the unique key.
Yes this is a Roulette game. And I understand how to basically have a class for each number and make a separate instance for each number with the appropriate properties, but I feel that would be rather inefficient, since I could just have a list of each number with it's associated properties.
I've gotten pretty much nowhere on creating the class for this. I keep wondering if there is a better way to do it and trying that, then getting frustrated and quitting.
First let's talk about the data. Note that 0 must be distinguished from 00 so we cannot store them both naively as integers. Second, note that the color and parity (odd/even) can be derived instantly from the number. There is no need to store them as separate data. So we can do this:
struct Pocket {
enum class Color { GREEN, RED, BLACK };
enum class Parity { NONE, ODD, EVEN };
Pocket(int8_t num) : number(num) {}
int8_t number; // -1 for "00" on American style wheel
Parity parity() const {
if (number < 1) return Parity::NONE;
if (number % 2) return Parity::ODD;
return Parity::EVEN;
}
Color color() const {
if (number < 1) return Color::GREEN;
if (number % 2) return Color::RED;
return Color::BLACK;
}
};
Then you can make a simple container:
std::vector<Pocket> wheel;
for (int8_t ii = is_american ? -1 : 0; ii <= 36; ++ii) {
wheel.emplace_back(ii);
}
Finally, you can add code for printing:
std::ostream& operator <<(std::ostream& out, Pocket pocket) {
if (pocket.number == -1) return out << "00";
return out << pocket.number;
}
const char* to_string(Pocket::Color color) {
switch (color) {
case Pocket::Color::GREEN: return "Green";
case Pocket::Color::RED: return "Red";
case Pocket::Color::BLACK: return "Black";
default: return "?";
}
}
If you want an associative array with multiple data, you create a map between a key and a data-structure.
For example here, if you only wanted to store strings, I'd suggest using a map between a key and a vector. Then you can add as many or as few strings as needed to each key, so it's a flexible system.
std::map<int,std::vector<std::string>> rouletteBoard;
Or, have that structure inside your "rouletteBoard" class.
As for the key, if you use literal ints, then you have a problem, as 0 and 00 would be the same int, you either need string keys, or to specify "00" interally with a special value such as -1. You can then create an enum relating to the different fields of the vector, a working prototype could look like:
#include<iostream>
#include<map>
#include<string>
#include<vector>
std::map<int, std::vector<std::string>> rouletteBoard;
enum
{
name,
color,
oddeven,
property_count
};
std::string colors[] = { "Green", "Black", "Red"};
std::string roulette_color(int i)
{
if (i < 1) return colors[0]; // Green
if (i < 11) return colors[1 + (i & 1)]; // Black=Even Red=Odd
if (i < 19) return colors[2 - (i & 1)]; // Black=Odd Red=Even
if (i < 29) return colors[1 + (i & 1)]; // Black=Even Red=Odd
return colors[2 - (i & 1)]; // Black=Even Red=Odd
}
int main()
{
rouletteBoard[-1] = {"00", roulette_color(-1), "Neither"};
rouletteBoard[ 0] = { "0", roulette_color(0), "Neither" };
for(int i = 1; i <=36; ++i)
{
rouletteBoard[i] = { std::to_string(i), roulette_color(i), (i & 1) ? "Odd" : "Even" };
}
for (int i = -1; i <= 36; ++i)
{
std::cout << rouletteBoard[i][name] << ": " << rouletteBoard[i][color] << ", " << rouletteBoard[i][oddeven] << "\n";
}
std::cin.get();
return 0;
}

C++ unordered_map especify types on runtime

Is there any way that I could define an unordered_map<*,*> var and depending a case or other redefine it with the appropriate types?
I'm reading some binaries files and the format is different for each so depending the formats it can by <int, string>, <short, string>, <int, int>, etc..
The only way I can think of is to define it <char *, char *> but I would have to define the hashing and other thing to work like that.
Is there any other option?
EDIT. ADD MORE CONTEXT FOR THE PROBLEM:
I will iterate another lists and get the values from the ordered_maps, I will know what type of data I'm using for the key and use that to generate a JSON string as result.
For more context the format of the files are like these:
INT number of fields to use. Example: 3
-- now there is a for from 1 to 3 as we have 3 fields
CHAR type of data (1 = int8, 2 = int16, 3 = int32, 4=string)
STRING name of the field
STRING alias of the field
-- end for
-- now I do a while not EOF
-- for each field
read value from file (int8, int16, int32, string) depending the type of field
first item of the for will be the KEY
if item != first add the value to an unoredered_map using the first as key
-- end for
-- end while
What are you going to store inside the map and how are you going to choose it?
There are two practical solution to your problem:
Parametric Polymorphism
This is how you should try to solve your problem in the first place. By keeping the arguments of your unordered_map generic.
This is mostly done by having a structure like
class Reader {
virtual void readFile(const std::string& name) = 0;
};
template<typename K, typename V>
class RealReader {
private:
std::unordered_map<K,V> data;
public:
void readFile(const std::string& name) override {
K key = // read key;
V value = // read value
data[key] = value;
}
};
Subtype Polymorphism
Define your own Key and/or Value classes so that you can define a std::unordered_map<Key*,Value*> and then subtype these custom types with your required types.
Without knowing how these are going to be used it's difficult to tell what's best.
I ended up using a self defined type and void * for the data.
So I set the type of the var in the struct and the data for it.
Here is the result:
struct fieldVariant {
char type;
void * data;
fieldVariant(char _type, void * _data) {
type = _type;
data = _data;
}
};
struct fieldHash {
inline size_t operator()(const fieldVariant * val) const
{
unsigned long h = 0;
unsigned long varSize = 0;
switch (val->type) {
case INT8:
varSize = 1;
break;
case INT16:
varSize = 2;
break;
case INT32:
varSize = 4;
break;
case INT64:
varSize = 8;
break;
case INT128:
varSize = 16;
break;
case CHAR2:
varSize = ((string *)val->data)->length();
break;
}
for (int i=0; i < varSize; i++)
h = 5 * h + *(char *)(val->data + i);
return size_t(h);
}
};
struct fieldEql {
inline bool operator()(const fieldVariant *s1,const fieldVariant *s2) const {
unsigned long varSize = 0;
switch (s1->type) {
case INT8:
varSize = 1;
break;
case INT16:
varSize = 2;
break;
case INT32:
varSize = 4;
break;
case INT64:
varSize = 8;
break;
case INT128:
varSize = 16;
break;
case CHAR2:
return *((string *)s1->data) == *((string *)s2->data);
}
return memcmp(s1->data, s2->data, varSize) == 0;
}
};
unordered_map<fieldVariant *, fieldVariant *, fieldHash, fieldEql> data;
void add(fieldVariant * key, fieldVariant * value) {data[key] = value;};

Infix Calculator Expression Parser

How do I parse and evaluate expressions in an infix calculator grammar? I thought of two ways.
The 1st involves using two stacks. One is for numbers and the other is for operators, and I would assess the operator precedence and association in order to figure out how to evaluate an expression.
The second method involves converting the infix expression to postfix which I have no idea how I'd go about doing. It was just an idea. Currently I set up my program with the intention to use the 1st method.
#include <iostream>
#include <string>
#include <sstream>
using namespace std;
bool die(const string &msg);
//stack class
class Stack{
public:
Stack();
void push(const double &val);
void push(const string &oper);
double popnum();
string popop();
double getopele();
double getnumele();
private:
static const unsigned MAX=30;
string opstack[MAX];
double numstack[MAX];
unsigned opele;
unsigned numele;
};
//operator type
struct OP{
string name;
void * func;
unsigned arity;
unsigned prec;
bool lass;
string descrip;
};
//operator table
OP op[]={{"+", add, 2, 4, true, "2+3 is 5"},
{"-", subtract, 2, 4, true, "2-3 is -1"},
{"*", multiply, 2, 6, true, "2*3 is 6"},
{"/", divide, 2, 6, true, "2/3 is 0.666666..., div by 0 illegal"}};
unsigned OPELE =sizeof(op)/sizeof(op[0]);
//operators
bool add(double &r, double &x, double &y);
bool subtract(double &r, double &x, double &y);
bool multiply(double &r, double &x, double &y);
bool divide(double &r, double &x, double &y);
//Manip
unsigned findindex(string token, OP op[], unsigned OPELE);
bool parse(double &t, const string &token);
bool evaluate(double &result, string line);
bool weird(double x);
int main(){
for(string line; getline(cin, line);){
if(line=="QUIT") break;
if(line.empty()) continue;
if(line=="DOC")
for(unsigned i=0; i<OPELE; i++)
cout<<op[i].name<<" | "<<op[i].descrip<<'\n';
double result;
if(evaluate(result, line)){
cout<<result<<'\n';
}else{
cout<<"Could not understand input\n\n";
}
}
}
Stack::Stack(){
opele=0;
numele=0;
}
void Stack::push(const double &val){
if(MAX) die("Stack Overflow");
numstack[numele++]=val;
}
void Stack::push(const string &oper){
if(MAX) die("Stack Overflow");
opstack[opele++]=oper;
}
double Stack::popnum(){
if(!numele) die("Stack Underflow");
return numstack[--numele];
}
string Stack::popop(){
if(!opele) die("Stack Underflow");
return opstack[--opele];
}
double Stack::getopele(){
return opele;
}
double Stack::getnumele(){
return numele;
}
bool add(double &r, double &x, double &y){
double t = x + y;
if( weird(t) ) return false;
r = t;
return true;
}
bool subtract(double &r, double &x, double &y){
double t = x - y;
if( weird(t) ) return false;
result = t;
return true;
}
bool multiply( double & r, double& x, double &y ){
double t = x * y;
if( weird(t) ) return false;
result = t;
return true;
}
bool divide( double & result, double &x, double &y ){
double t = x / y;
if( weird(t) ) return false;
result = t;
return true;
}
unsigned findindex(string token, OP op[], unsigned OPELE){
for(unsigned i=0l i<OPELE; i++)
if(op[i].name==token)
return i;
return UINT_MAX;
}
bool parse(double &t, const string &token){
istringstream sin( token );
double t;
if( !(sin >>t) ) return false;
char junk;
if( sin >>junk ) return false;
value = t;
return true;
}
bool evaluate(double &result, string line){
istringstream sin(line);
Stack s;
for(string token; sin>>token;){
double t;
if(parse(t, token)){
s.push(t);
}else if(
}
}
bool weird( double x ){
return x != x || x != 0 && x == 2*x;
}
This will be a long read, but anyway, I will share with you the algorithm I use to parse an infix expression and store it as a binary tree. Not Stack, but binary tree. Parsing that will give the postfix order easily. I don't say this is the best algorithm out there, but this works for my scripting language.
The algorithm:
We have a method which operates on a "current node" of a binary tree and a "current expression". The nodes contain a "data" field and a "type" field.
Stage 1: Simple things, such as "4" go directly into the node, and we specify the type to be as "DATA", ie. use this information as it is.
Stage 2: Now, Let's consider the following expression:
a) 2 + 3
this will be transformed into the following binary tree:
+
/ \
2 3
So, the operators go into the nodes and the operands go into the leafs. Transofrming the expression a) into the tree is pretty simple: find the operator, put in the "current" node of the tree, specify the type of the node to be operator "PLUS", and what is left of it goes into the tree to the left part of the node, what is right of it goes into the right tree. Nice and simple, using the information from Stage 1 the two leafs will be "DATA" leafs with value 2 and 3.
Stage 3: But for a more complex expression:
b) 2 * 3 + 4
The tree will be:
+
/ \ 4
*
/ \
2 3
So we need to modify the algorithm above to the following: Find the first operator which has the highest precedence (considering c++ guidelines... precedence of + (plus) and - (minus) is 6, while precedence of * (multiply), / (divide) and % (modulo) is 5) in the expression, divide the expression into two parts (before operand with highest precedence and after operand with highest precedence) and call recursively the method for the two parts, while placing the operator with the highest precedence into the current node. So, we do create a tree wit hdata like:
+
/ \
/ call method with "4"
call method with "2*3"
and at this stage we fall back to "Stage 2" for the call ("2*3") and "Stage 1" for the call "4".
Stage 4: What if there are paranthesis in the expression? Such as
c) 2 * (3 + 4)
This will give us the tree:
*
/ \
2 +
/ \
3 4
We modify the algorithm to be like:
while the current expression is enclosed in a paranthesis remove the paranthesis from it and restart the algorithm. Be careful. (2 + 3 * 4 + 5) is considered to be enclosed in a parnethesis while (2+3)*(4+5) is NOT. So, it's not just the starting and ending characters of the expression, but you effectively need to count the parantheses. (this is a recursive method, don't be afraid of the first step...)
now find the first operator with the highest precedence outside the parantheses of the expression. Again, take the left and right sides of the expression and call the method again and again till you end up at "Stage 1" ie. with a single data element.
Now this is an algorithm for an expression which consists of plain numbers and operators. For more complex information you might need to refine it to suit your needs. If you consider it worth, take a look at https://sourceforge.net/p/nap-script/mercurial/ci/default/tree/compiler/interpreter.cpp . This contains a full implementation (in C) of the algorithm above with regard to more complex notions (variables, method calls, postfix/prefix operators, etc...) The method is build_expr_tree, starts at line 1327.
The method of recursive descent is the simplest way to implement a correct expression parser by hand. Here the programming language stack does the same thing as the explicit stack you're trying to use. There are many RD examples to be found with google, and any good compiler book will have some.
The linked Wikipedia page shows a parser, but not how to add evaluation. So below is a complete rudimentary expression evaluator in C. It could be easily wrapped in a C++ class with the globals becoming instance variables. It's missing features you'd need in a production system. For example, when it finds an error, it just exits. C++ exceptions will easily allow you to unwind the recursion and continue. It also needs protections against numerical overflow, divide-by-zero, etc., which you obviously know how to do.
The idea of recursive descent is to transform the grammar of the desired language into a form called LL(1). When that's done, there are fixed rules - guarenteed to work every time - for transforming the grammar rules into procedures. I've done this below by hand. There are tools to do it automatically.
So this evaluator is very easy to extend. Just add the necessary grammar rule, then implement the needed enhancements to scanner, parser, and evaluation code. For example, a built-in function rule would be unsigned_factor -> FUNCTION_NAME ( expr ), where the scanner recognizes all function names as the same token and the unsigned_factor C function is augmented to parse and compute values.
I had to include a small scanner to get a working program. Note more than half the code is the scanner. Basic RD parsers are simple.
They get more complex if you add error recovery: the intelligent ability to skip just past an error and continue parsing, while emitting only one precisely worded error message. But then again, this adds lots of complexity to any parser.
// Bare bones scanner and parser for the following LL(1) grammar:
// expr -> term { [+-] term } ; An expression is terms separated by add ops.
// term -> factor { [*/] factor } ; A term is factors separated by mul ops.
// factor -> unsigned_factor ; A signed factor is a factor,
// | - unsigned_factor ; possibly with leading minus sign
// unsigned_factor -> ( expr ) ; An unsigned factor is a parenthesized expression
// | NUMBER ; or a number
//
// The parser returns the floating point value of the expression.
#include <stdio.h>
#include <stdlib.h>
// The token buffer. We never check for overflow! Do so in production code.
char buf[1024];
int n = 0;
// The current character.
int ch;
// The look-ahead token. This is the 1 in LL(1).
enum { ADD_OP, MUL_OP, LEFT_PAREN, RIGHT_PAREN, NUMBER, END_INPUT } look_ahead;
// Forward declarations.
void init(void);
void advance(void);
double expr(void);
void error(char *msg);
// Parse expressions, one per line.
int main(void)
{
init();
while (1) {
double val = expr();
printf("val: %f\n", val);
if (look_ahead != END_INPUT) error("junk after expression");
advance(); // past end of input mark
}
return 0;
}
// Just die on any error.
void error(char *msg)
{
fprintf(stderr, "Error: %s. I quit.\n", msg);
exit(1);
}
// Buffer the current character and read a new one.
void read()
{
buf[n++] = ch;
buf[n] = '\0'; // Terminate the string.
ch = getchar();
}
// Ignore the current character.
void ignore()
{
ch = getchar();
}
// Reset the token buffer.
void reset()
{
n = 0;
buf[0] = '\0';
}
// The scanner. A tiny deterministic finite automaton.
int scan()
{
reset();
START:
switch (ch) {
case ' ': case '\t': case '\r':
ignore();
goto START;
case '-': case '+':
read();
return ADD_OP;
case '*': case '/':
read();
return MUL_OP;
case '(':
read();
return LEFT_PAREN;
case ')':
read();
return RIGHT_PAREN;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
read();
goto IN_LEADING_DIGITS;
case '\n':
ch = ' '; // delayed ignore()
return END_INPUT;
default:
error("bad character");
}
IN_LEADING_DIGITS:
switch (ch) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
read();
goto IN_LEADING_DIGITS;
case '.':
read();
goto IN_TRAILING_DIGITS;
default:
return NUMBER;
}
IN_TRAILING_DIGITS:
switch (ch) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
read();
goto IN_TRAILING_DIGITS;
default:
return NUMBER;
}
}
// To advance is just to replace the look-ahead.
void advance()
{
look_ahead = scan();
}
// Clear the token buffer and read the first look-ahead.
void init()
{
reset();
ignore(); // junk current character
advance();
}
double unsigned_factor()
{
double rtn = 0;
switch (look_ahead) {
case NUMBER:
sscanf(buf, "%lf", &rtn);
advance();
break;
case LEFT_PAREN:
advance();
rtn = expr();
if (look_ahead != RIGHT_PAREN) error("missing ')'");
advance();
break;
default:
error("unexpected token");
}
return rtn;
}
double factor()
{
double rtn = 0;
// If there is a leading minus...
if (look_ahead == ADD_OP && buf[0] == '-') {
advance();
rtn = -unsigned_factor();
}
else
rtn = unsigned_factor();
return rtn;
}
double term()
{
double rtn = factor();
while (look_ahead == MUL_OP) {
switch(buf[0]) {
case '*':
advance();
rtn *= factor();
break;
case '/':
advance();
rtn /= factor();
break;
}
}
return rtn;
}
double expr()
{
double rtn = term();
while (look_ahead == ADD_OP) {
switch(buf[0]) {
case '+':
advance();
rtn += term();
break;
case '-':
advance();
rtn -= term();
break;
}
}
return rtn;
}
And running the program:
1 + 2 * 3
val: 7.000000
(1 + 2) * 3
val: 9.000000