I used Huffman encoding that we wrote to compress a file.
The function takes String and its output is String.
The problem is I want to save it as binary to get lower size than the original size, but when I take it back (0's and 1's ) as a string its size is larger than the main file. How can I convert that string of (0's and 1's) to a binary so that every character is saved in 1 bit? I am using Qt to achieve this:
string Huffman_encoding(string text)
{
buildHuffmanTree(text);
string encoded = "";
unordered_map<char, string> StringEncoded;
encoding(main_root, "", StringEncoded);
for (char ch : text) {
encoded += StringEncoded[ch];
}
return encoded;
}
The canonical solution uses a "bit packer" that accepts bitstrings and emits packed bytes. As a first start, replace encoded by an instance of the following:
class BitPacker {
QByteArray res;
quint8 bitsLeft = 8;
quint8 buf = 0;
public:
void operator+=(const std::string& s) {
for (auto c : s) {
buf = buf << 1 | c - '0';
if (--bitsLeft == 0) {
res.append(buf);
buf = 0;
bitsLeft = 8;
}
}
}
QByteArray finish() {
if (bitsLeft < 8) {
res.append(buf << bitsLeft);
buf = 0;
bitsLeft = 8;
}
return res;
}
}
operator+= will add additional bits to buf and flush complete bytes to res. At the end of the process you may be left with, say, 3 bits. finish uses a simple algorithm: it pads the buffer with zeroes to produce a final byte and hands you back the fully encoded buffer.
A more sophisticated solution might be to introduce an explicit "end of stream" token that is not present in the source character set.
Seems what you're searching for is a way to convert a string containing a sequence of 0s and 1s like "0000010010000000" to an actual binary representation (numbers 4 and 128 in this example).
This could be achieved with a function like this:
#include <iostream>
#include <string>
#include <cstdint>
#include <vector>
std::vector<uint8_t> toBinary(std::string const& binStr)
{
std::vector<uint8_t> result;
result.reserve(binStr.size() / 8);
size_t pos = 0;
size_t len = binStr.length();
while (pos < len)
{
size_t curLen = std::min(static_cast<size_t>(8), len-pos);
auto curStr = binStr.substr(pos, curLen) + std::string(8-curLen, '0');
std::cout << "curLen: " << curLen << ", curStr: " << curStr << "\n";
result.push_back(std::stoi(curStr, 0, 2));
pos += 8;
}
return result;
}
// test:
int main()
{
std::string binStr("000001001000000001");
auto bin = toBinary(binStr);
for (auto i: bin)
{
std::cout << static_cast<int>(i) << " ";
}
return 0;
}
Output:
4 128 64
You can then do whatever you want with these numbers, e.g. write them into a binary file.
Note that toBinary as above, pads the last byte, if incomplete, with zeros.
You can create a bitstream using bitwise logic like this :
#include <cassert>
#include <string>
#include <stdexcept>
#include <vector>
auto to_bit_stream(const std::string& bytes)
{
std::vector<std::uint8_t> stream;
std::uint8_t shift{ 0 };
std::uint8_t out{ 0 };
// allocate enough bytes to hold the bits
// speeds up the code a bit
stream.reserve((bytes.size() + 7) / 8);
// loop over all bytes
for (const auto c : bytes)
{
// check input
if (!((c == '0') || (c == '1'))) throw std::invalid_argument("invalid character in input");
// shift output by one to accept next bit
out <<= 1;
// keep track of number of shifts
// after 8 shifts a byte has been filled
shift++;
// or the output with a 1 if needed
out |= (c == '1');
// complete an output byte
if (shift == 8)
{
stream.push_back(out);
out = 0;
shift = 0;
}
}
return stream;
}
int main()
{
// stream is 8 bits per value, values 0,1,2,3
auto stream = to_bit_stream("00000000000000010000001000000011");
assert(stream.size() == 4ul);
assert(stream[0] == 0);
assert(stream[1] == 1);
assert(stream[2] == 2);
assert(stream[3] == 3);
return 0;
}
Use std::stoi()
int n = std::stoi("01000100", nullptr, 2);
Related
I'm working with an old program and need help swapping the order of a Hex String.
Yes, a string...as in:
string hexString = "F07D0079"
string hexString2= "F07F"
I need each string to look like:
79007DF0 &
7FF0 respectively.
For the love of god i don't know why they're stored in strings, but they are.
This is a little endian/big endian issue but since it's in a string i can't use standard functions to reverse the order can I?
Is there any easy way to do this?
std::string swapValues(string originalHex)
{
string swappedHex;
//what to do here.
return swappedHex;
}
First check that the length is even (if it hasn't already been sanitised):
assert(hex.length() % 2 == 0);
Then reverse the string:
std::reverse(hex.begin(), hex.end());
Now the bytes are in the correct order, but the digits within each are wrong, so we need to swap them back:
for (auto it = hex.begin(); it != hex.end(); it += 2) {
std::swap(it[0], it[1]);
}
I might use the append member function.
std::string reverse_pairs(std::string const & src)
{
assert(src.size() % 2 == 0);
std::string result;
result.reserve(src.size());
for (std::size_t i = src.size(); i != 0; i -= 2)
{
result.append(src, i - 2, 2);
}
return result;
}
(As an exercise in extensibility, you can make the "2" a parameter, too.)
If you want to do it in-place, you can use std::rotate in a loop.
I wouldn't bother with something overly clever for this:
std::string swapValues(const std::string& o)
{
std::string s(o.length());
if (s.length() == 4) {
s[0] = o[2];
s[1] = o[3];
s[2] = o[0];
s[3] = o[1];
return s;
}
if (s.length() == 8) {
// left as an exercise
}
throw std::logic_error("You got to be kidding me...");
}
There should be library functions available (a naive string manipulation might be no good):
#include <iostream>
#include <arpa/inet.h>
int main() {
std::string hex32 = "F07D0079";
std::string hex16 = "F07F";
std::uint32_t u32 = std::strtoul(hex32.c_str(), 0, 16);
std::uint16_t u16 = std::strtoul(hex16.c_str(), 0, 16);
// Here we would need to know the endian of the sources.
u32 = ntohl(u32);
u16 = ntohs(u16);
std::cout << std::hex << u32 << ", " << u16 << '\n';
}
Linux/Little Endian
Any function operating on the strings must know the target platform (hence there is no general solution)
Phase 1
example 1: I have string text = "01100001" then I want write to file "a"
example 2: I have string text = "0110000101100010" So I want write to file "ab"
NOTE:I solved phase 1 and result of writing is true.
Phase 2
for example 1:
I want read the file and put it to temp.
So temp = "a" and i convert it to "01100001"
for example 2:
I want read the file and put it to temp.
So temp = "ab" and i convert it to "0110000101100010"
Question
in my code i have below input
string text ="00000110101011100010001011111110011011110101100101110101101111010111111110101011"
"00111011000011100011100000100010111110111110111001100001110001110000101001111010"
"00000101";
I did "phase 1" and I opened the file in a hex editor the writing is true.
But after doing "phase 2" temp != text. Why?
My code
#include <iostream>
#include <sstream>
#include <vector>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;
class bitChar{
public:
unsigned char* c;
int shift_count;
string BITS;
bitChar()
{
shift_count = 0;
c = (unsigned char*)calloc(1, sizeof(char));
}
string readByBits(ifstream& inf)
{
string s ="";
while (inf)
{
string strInput;
getline(inf, strInput );
for (int i =0 ; i < strInput.size() ; i++)
{
s += getBits(strInput[i]);
}
}
return s;
}
void setBITS(string X)
{
BITS = X;
}
int insertBits(ofstream& outf)
{
int total = 0 ;
while(BITS.length())
{
if(BITS[0] == '1')
*c |= 1;
*c <<= 1;
++shift_count;
++total;
BITS.erase(0, 1);
if(shift_count == 7 )
{
if(BITS.size()>0)
{
if(BITS[0] == '1')
*c |= 1;
++total;
BITS.erase(0, 1);
}
writeBits(outf);
shift_count = 0;
free(c);
c = (unsigned char*)calloc(1, sizeof(char));
}
}
if(shift_count > 0)
{
*c <<= (7 - shift_count);
writeBits(outf);
free(c);
c = (unsigned char*)calloc(1, sizeof(char));
}
outf.close();
return total;
}
string getBits(unsigned char X)
{
stringstream itoa;
for(unsigned s = 7; s > 0 ; s--)
{
itoa << ((X >> s) & 1);
}
itoa << (X&1) ;
return itoa.str();
}
void writeBits(ofstream& outf)
{
outf << *c;
}
~bitChar()
{
if(c)
free(c);
}
};
int main()
{
ofstream outf("ssSample.dat",ios::binary);
string text ="00000110101011100010001011111110011011110101100101110101101111010111111110101011"
"00111011000011100011100000100010111110111110111001100001110001110000101001111010"
"00000101";
cout<< text<<endl;
//write to file
bitChar bchar;
bchar.setBITS(text);
bchar.insertBits(outf);
outf.close();
ifstream inf("ssSample.dat" ,ios::binary);
//READ FROM FILE
string temp=bchar.readByBits(inf);
cout << endl;
cout << temp << endl;
return 0;
}
You have a LF Line Feed character. This is the character that is getting omitted.
0000 1010
This may be unrelated, but Windows requires a CR and LF for a new line. This code may act differently in Windows vs. Unix.
Read one byte at a time.
string readByBits(ifstream& inf)
{
string s ="";
char buffer[1];
while (inf.read (buffer, 1))
{
// string strInput;
//getline(inf, strInput );
//for (int i =0 ; i < strInput.size() ; i++)
//{
s += getBits(*buffer);
//}
}
return s;
}
Program output:
000001101010111000100010111111100110111101011001011101011011110101111111101010110011101100001110001110000010001011111011111011100110000111000111000010100111101000000101
000001101010111000100010111111100110111101011001011101011011110101111111101010110011101100001110001110000010001011111011111011100110000111000111000010100111101000000101
One problem with your approach is that your text must be a multiple of 8 bits to work. Otherwise, even if everything is correct, that last character will be read from the file and converted to 8 binary digits in the string adding trailing zeros.
Two problems I quickly identified (but I assume there are more)
Your input is not a multiple of 8-bits
By using getLine you're reading until you meet a delimiting character and thus spoiling your result since you're not dealing with a text-based file
I'm reading a string from a file so it's in the form of a char array. I need to tokenize the string and save each char array token as a uint8_t hex value in an array.
char* starting = "001122AABBCC";
// ...
uint8_t[] ending = {0x00,0x11,0x22,0xAA,0xBB,0xCC}
How can I convert from starting to ending? Thanks.
Here is a complete working program. It is based on Rob I's solution, but fixes several problems has been tested to work.
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
const char* starting = "001122AABBCC";
int main()
{
std::string starting_str = starting;
std::vector<unsigned char> ending;
ending.reserve( starting_str.size());
for (int i = 0 ; i < starting_str.length() ; i+=2) {
std::string pair = starting_str.substr( i, 2 );
ending.push_back(::strtol( pair.c_str(), 0, 16 ));
}
for(int i=0; i<ending.size(); ++i) {
printf("0x%X\n", ending[i]);
}
}
strtoul will convert text in any base you choose into bytes. You have to do a little work to chop the input string into individual digits, or you can convert 32 or 64bits at a time.
ps uint8_t[] ending = {0x00,0x11,0x22,0xAA,0xBB,0xCC}
Doesn't mean anything, you aren't storing the data in a uint8 as 'hex', you are storing bytes, it's upto how you (or your debugger) interpretes the binary data
With C++11, you may use std::stoi for that :
std::vector<uint8_t> convert(const std::string& s)
{
if (s.size() % 2 != 0) {
throw std::runtime_error("Bad size argument");
}
std::vector<uint8_t> res;
res.reserve(s.size() / 2);
for (std::size_t i = 0, size = s.size(); i != size; i += 2) {
std::size_t pos = 0;
res.push_back(std::stoi(s.substr(i, 2), &pos, 16));
if (pos != 2) {
throw std::runtime_error("bad character in argument");
}
}
return res;
}
Live example.
I think any canonical answer (w.r.t. the bounty notes) would involve some distinct phases in the solution:
Error checking for valid input
Length check and
Data content check
Element conversion
Output creation
Given the usefulness of such conversions, the solution should probably include some flexibility w.r.t. the types being used and the locale required.
From the outset, given the date of the request for a "more canonical answer" (circa August 2014) liberal use of C++11 will be applied.
An annotated version of the code, with types corresponding to the OP:
std::vector<std::uint8_t> convert(std::string const& src)
{
// error check on the length
if ((src.length() % 2) != 0) {
throw std::invalid_argument("conversion error: input is not even length");
}
auto ishex = [] (decltype(*src.begin()) c) {
return std::isxdigit(c, std::locale()); };
// error check on the data contents
if (!std::all_of(std::begin(src), std::end(src), ishex)) {
throw std::invalid_argument("conversion error: input values are not not all xdigits");
}
// allocate the result, initialised to 0 and size it to the correct length
std::vector<std::uint8_t> result(src.length() / 2, 0);
// run the actual conversion
auto str = src.begin(); // track the location in the string
std::for_each(result.begin(), result.end(), [&str](decltype(*result.begin())& element) {
element = static_cast<std::uint8_t>(std::stoul(std::string(str, str + 2), nullptr, 16));
std::advance(str, 2); // next two elements
});
return result;
}
The template version of the code adds flexibility;
template <typename Int /*= std::uint8_t*/,
typename Char = char,
typename Traits = std::char_traits<Char>,
typename Allocate = std::allocator<Char>,
typename Locale = std::locale>
std::vector<Int> basic_convert(std::basic_string<Char, Traits, Allocate> const& src, Locale locale = Locale())
{
using string_type = std::basic_string<Char, Traits, Allocate>;
auto ishex = [&locale] (decltype(*src.begin()) c) {
return std::isxdigit(c, locale); };
if ((src.length() % 2) != 0) {
throw std::invalid_argument("conversion error: input is not even length");
}
if (!std::all_of(std::begin(src), std::end(src), ishex)) {
throw std::invalid_argument("conversion error: input values are not not all xdigits");
}
std::vector<Int> result(src.length() / 2, 0);
auto str = std::begin(src);
std::for_each(std::begin(result), std::end(result), [&str](decltype(*std::begin(result))& element) {
element = static_cast<Int>(std::stoul(string_type(str, str + 2), nullptr, 16));
std::advance(str, 2);
});
return result;
}
The convert() function can then be based on the basic_convert() as follows:
std::vector<std::uint8_t> convert(std::string const& src)
{
return basic_convert<std::uint8_t>(src, std::locale());
}
Live sample.
uint8_t is typically no more than a typedef of an unsigned char. If you're reading characters from a file, you should be able to read them into an unsigned char array just as easily as a signed char array, and an unsigned char array is a uint8_t array.
I'd try something like this:
std::string starting_str = starting;
uint8_t[] ending = new uint8_t[starting_str.length()/2];
for (int i = 0 ; i < starting_str.length() ; i+=2) {
std::string pair = starting_str.substr( i, i+2 );
ending[i/2] = ::strtol( pair.c_str(), 0, 16 );
}
Didn't test it but it looks good to me...
You may add your own conversion from set of char { '0','1',...'E','F' } to uint8_t:
uint8_t ctoa(char c)
{
if( c >= '0' && c <= '9' ) return c - '0';
else if( c >= 'a' && c <= 'f' ) return 0xA + c - 'a';
else if( c >= 'A' && c <= 'F' ) return 0xA + c - 'A';
else return 0;
}
Then it will be easy to convert a string in to array:
uint32_t endingSize = strlen(starting)/2;
uint8_t* ending = new uint8_t[endingSize];
for( uint32_t i=0; i<endingSize; i++ )
{
ending[i] = ( ctoa( starting[i*2] ) << 4 ) + ctoa( starting[i*2+1] );
}
This simple solution should work for your problem
char* starting = "001122AABBCC";
uint8_t ending[12];
// This algo will work for any size of starting
// However, you have to make sure that the ending have enough space.
int i=0;
while (i<strlen(starting))
{
// convert the character to string
char str[2] = "\0";
str[0] = starting[i];
// convert string to int base 16
ending[i]= (uint8_t)atoi(str,16);
i++;
}
uint8_t* ending = static_cast<uint8_t*>(starting);
I have written a function to take in the data from a Sirit IDentity MaX AVI reader and parse out the facility code and keycard number. How I am currently doing it works, but is there a better way? Seems little hackish... buff & buf are size 264
buf and buff are char
Data received from reader:
2009/12/30 14:56:18 epc0 LN:001
C80507A0008A19FA 0000232F Xlat'd
char TAccessReader::HexCharToInt(char n)
{
if (n >= '0' && n <= '9')
return (n-'0');
else
if (n >= 'A' && n <= 'F')
return (n-'A'+10);
else
return 0;
}
bool TAccessReader::CheckSirit(char *buf, long *key_num, unsigned char *fac) {
unsigned short i, j, k;
*key_num = 0; // Default is zero
memset(buff, 0, sizeof(buff));
i = sscanf(buf, "%s %s %s %s %s %s %s", &buff[0], &buff[20], &buff[40],
&buff[60], &buff[80], &buff[140], &buff[160]);
if (i == 7 && buff[147] && !buff[148]) {
// UUGGNNNN UU=spare, GG=Facility Code, NNNN=Keycard Number (all HEX)
// get facility code
*fac = HexCharToInt(buff[142]) * 16 + HexCharToInt(buff[143]);
*key_num = (unsigned short)HexCharToInt(buff[144]) * 4096 +
(unsigned short)HexCharToInt(buff[145]) * 256 +
(unsigned short)HexCharToInt(buff[146]) * 16 +
HexCharToInt(buff[147]);
}
// do some basic checks.. return true or false
}
Just use std::stringstream:
#include <sstream>
#include <iostream>
using namespace std;
int main() {
unsigned int x;
stringstream ss;
ss << hex << "ff";
ss >> x;
// output it as a signed type
cout << static_cast<int>(x) << endl;
}
You can also use strtol from straight-up C:
#include <cstdlib>
#include <iostream>
using namespace std;
int main() {
string s = "ff";
char *p;
long n = strtol(s.c_str(), &p, 16);
if (*p != 0) {
cout << "fail" << endl;
}
else {
cout << n << endl;
}
}
Here's an easy way to get at the data you want. I do work in the access control business so this was something that interested me...
template<typename TRet, typename Iterator>
TRet ConvertHex(Iterator begin) {
unsigned long result;
Iterator end = begin + (sizeof(TRet) * 2);
std::stringstream ss(std::string(begin, end));
ss >> std::hex >> result;
return result;
}
bool TAccessReader::CheckSirit(char *buf, long *key_num, unsigned char *fac) {
*key_num = 0; // Default is zero
std::istringstream sbuf(std::string(buf, buf+264));
// Stuff all of the string elements into a vector
std::vector<std::string> elements;
std::copy (std::istream_iterator<std::string>(sbuf), std::istream_iterator<std::string>(), std::back_inserter (elements));
// We're interested in the 6th element
std::string read = elements[5];
if (read.length() == 8) {
// UUGGNNNN UU=spare, GG=Facility Code, NNNN=Keycard Number (all HEX)
// get facility and card code
std::string::const_iterator iter = read.begin();
*fac = ConvertHex<unsigned char>(iter + 2);
*key_num = ConvertHex<unsigned short>(iter + 4);
}
// do some basic checks.. return true or false
}
Since you are already using sscanf, why not have it parse the hex numbers for you:
sscanf(buff, "%x %x", &val1, &val2);
I found the code to convert a hexadecimal string into a signed int using strtol, but I can't find something for a short int (2 bytes). Here' my piece of code :
while (!sCurrentFile.eof() )
{
getline (sCurrentFile,currentString);
sOutputFile<<strtol(currentString.c_str(),NULL,16)<<endl;
}
My idea is to read a file with 2 bytes wide values (like 0xFFEE), convert it to signed int and write the result in an output file. Execution speed is not an issue.
I could find some ways to avoid the problem, but I'd like to use a "one line" solution, so maybe you can help for this :)
Edit : The files look like this :
0x0400
0x03fe
0x03fe
...
Edit : I already tried with the hex operator, but I still have to convert the string to an integer before doing so.
// This won't work as currentString is not an integer
myInt << std::hex << currentString.c_str();
This should be simple:
std::ifstream file("DataFile");
int value;
while(file >> std::hex >> value) // Reads a hex string and converts it to an int.
{
std::cout << "Value: " << std::hex << value << "\n";
}
While we are talking about files:
You should NOT do this:
while (!sCurrentFile.eof() )
{
getline (sCurrentFile,currentString);
... STUFF ...
}
This is because when you read the last line it does NOT set the EOF. So when you loop around and then read the line after the last line, getline() will fail and you will be doing STUFF on what was in currentString from the last time it was set up. So in-effect you will processes the last line twice.
The correct way to loop over a file is:
while (getline(sCurrentFile,currentString))
{
// If the get fails then you have read past EOF and loop is not entered.
... STUFF ...
}
You can probably use stringtream class's >> operator with hex manipulator.
Have you considered sscanf with the "%hx" conversion qualifier?
// convert unsigned-integer to it's hexadecimal string represention
// 0x12345678 -> '12345678'
// N is BYTE/WORD/UINT/ULONGLONG
// T is char or wchar_t
template <class N, class T> inline T* UnsignedToHexStr(N n , // [i ]
T* pcStr , // [i/o] filled with string
UINT nDigits , // [i ] number of digits in output string / 0 (auto)
bool bNullTerminate ) // [i ] whether to add NULL termination
{
if ((N)-1 < (N)1) // if type of N is floating-point / signed-integer
if (::IsDebuggerPresent())
{
::OutputDebugString(_T("UnsignedToHexStr: Incorrect type passed\n"));
::DebugBreak();
}
if (!nDigits)
nDigits= GetUnsignedHexDigits(n);
if (1 == sizeof(T))
{
const char _czIntHexConv[]= "0123456789ABCDEF";
for (int i= nDigits-1; i>= 0; i--)
{
char* pLoc= (char*)&pcStr[i];
*pLoc= _czIntHexConv[n & 0x0F];
n >>= 4;
}
}
else
{
const wchar_t _czIntHexConv[]= L"0123456789ABCDEF";
for (int i= nDigits-1; i>= 0; i--)
{
wchar_t* pLoc= (wchar_t*)&pcStr[i];
*pLoc= _czIntHexConv[n & 0x0F];
n >>= 4;
}
}
if (bNullTerminate)
pcStr[nDigits]= 0;
return pcStr;
}
// --------------------------------------------------------------------------
// convert unsigned-integer in HEX string represention to it's numerical value
// '1234' -> 0x1234
// N is BYTE/WORD/UINT/ULONGLONG
// T is char or wchar_t
template <class N, class T> inline bool HexStrToUnsigned(const T* pczSrc ,
N& n ,
bool bSpecificTerminator= false, // whether string should terminate with specific terminating char
T cTerminator = 0 ) // specific terminating char
{
n= 0;
if (!pczSrc)
return false;
while ((32 == *pczSrc) || (9 == *pczSrc))
pczSrc++;
bool bLeadZeros= *pczSrc == _T('0');
while (*pczSrc == _T('0')) // skip leading zeros
pczSrc++;
BYTE nMaxDigits= 2*sizeof(N);
BYTE nDigits = 0 ;
while (true)
{
if ( (*pczSrc >= _T('0')) && (*pczSrc <= _T('9')))
{ if (nDigits==nMaxDigits) return false; n= (n<<4) + (*pczSrc-_T('0') ); pczSrc++; nDigits++; continue; }
if ( (*pczSrc >= _T('A')) && (*pczSrc <= _T('F')))
{ if (nDigits==nMaxDigits) return false; n= (n<<4) + (*pczSrc-_T('A')+10); pczSrc++; nDigits++; continue; }
if ( (*pczSrc >= _T('a')) && (*pczSrc <= _T('f')))
{ if (nDigits==nMaxDigits) return false; n= (n<<4) + (*pczSrc-_T('a')+10); pczSrc++; nDigits++; continue; }
if (bSpecificTerminator)
if (*pczSrc != cTerminator)
return false;
break;
}
return (nDigits>0) || bLeadZeros; // at least one digit
}
If you're sure the data can be trusted from currentString.c_str(), then you could also easily do
myInt << std::hex << atoi(currentString.c_str());
If you know the data is always going to be in that format, couldn't you just do something like:
myInt << std::hex << currentString.c_str() +2; // skip the leading "0x"