Output UTF-8 strings as UTF-16 to std::cout (C++)

I have a lot of code written based on UTF-8 using C++03, STL and Boost 1.54.
All the code outputs data to the console via std::cout or std::cerr.
I do not want to introduce a new library to my code base or switch to C++11,
but I want to port the code to Windows.
Rewriting all the code to use std::wcout or std::wcerr instead of
std::cout or std::cerr is not what I intend, but I still want to display
everything on the console as UTF-16.
Is there a way to change std::cout and std::cerr to transform all char based data (which is UTF-8 encoded) to wchar_t based data (which would be UTF-16 encoded) before writing it to the console?
It would be great to see a solution for this using just C++03, STL and Boost 1.54.
I found that Boost Locale has conversion functions for single strings, and there is a UTF-8 to UTF-32 iterator available in Boost Spirit, but I could not find any codecvt facet that transforms UTF-8 to UTF-16 without using an additional library or switching to C++11.
Thanks in advance.
PS: I know it is doable with something like this, but I hope to find a better solution here.

I did not come up with a better solution than the one already hinted at.
So I will just share the streambuf-based solution here for anyone who is interested in it.
Hopefully, someone will come up with a better approach and share it here.
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <string>
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) || defined(_MSC_VER))
#define TEST_ARG_TYPE wchar_t
#else /* not windows, unicode */
#define TEST_ARG_TYPE char
#endif /* windows, unicode */
#ifndef _O_U16TEXT
#define _O_U16TEXT 0x20000
#endif
static size_t countValidUtf8Bytes(const unsigned char * buf, const size_t size) {
    size_t i, charSize;
    const unsigned char * src = buf;
    for (i = 0; i < size && (*src) != 0; i += charSize, src += charSize) {
        charSize = 0;
        if ((*src) >= 0xFC) {
            charSize = 6;
        } else if ((*src) >= 0xF8) {
            charSize = 5;
        } else if ((*src) >= 0xF0) {
            charSize = 4;
        } else if ((*src) >= 0xE0) {
            charSize = 3;
        } else if ((*src) >= 0xC0) {
            charSize = 2;
        } else if ((*src) >= 0x80) {
            /* Skip continuation bytes of a UTF-8 character (should never happen here). */
            while ((i + charSize) < size && src[charSize] != 0 && src[charSize] >= 0x80) {
                charSize++;
            }
        } else {
            /* ASCII character. */
            charSize = 1;
        }
        if ((i + charSize) > size) break;
    }
    return i;
}
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) || defined(_MSC_VER))
#include <locale>
#include <streambuf>
#include <boost/locale.hpp>
extern "C" {
#include <fcntl.h>
#include <io.h>
#include <windows.h>
int _CRT_glob;
extern void __wgetmainargs(int *, wchar_t ***, wchar_t ***, int, int *);
}
class Utf8ToUtf16Buffer : public std::basic_streambuf< char, std::char_traits<char> > {
private:
    char * outBuf;
    FILE * outFd;
public:
    static const size_t BUFFER_SIZE = 1024;
    typedef std::char_traits<char> traits_type;
    typedef traits_type::int_type int_type;
    typedef traits_type::pos_type pos_type;
    typedef traits_type::off_type off_type;
    explicit Utf8ToUtf16Buffer(FILE * o) : outBuf(new char[BUFFER_SIZE]), outFd(o) {
        /* Initialize the put pointer. Overflow won't get called until this
         * buffer is filled up, so we need to use valid pointers.
         */
        this->setp(outBuf, outBuf + BUFFER_SIZE - 1);
    }
    ~Utf8ToUtf16Buffer() {
        delete[] outBuf;
    }
protected:
    virtual int_type overflow(int_type c);
    virtual int_type sync();
};
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::overflow(Utf8ToUtf16Buffer::int_type c) {
    char * iBegin = this->outBuf;
    char * iEnd = this->pptr();
    int_type result = traits_type::not_eof(c);
    /* If c is not the end-of-file marker, append it to the buffer.
     * This is why the pointers passed to setp are off by 1
     * (to reserve room for this character).
     */
    if ( ! traits_type::eq_int_type(c, traits_type::eof()) ) {
        *iEnd = traits_type::to_char_type(c);
        iEnd++;
    }
    /* Calculate output data length. */
    int_type iLen = static_cast<int_type>(iEnd - iBegin);
    int_type iLenU8 = static_cast<int_type>(
        countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(iBegin), static_cast<size_t>(iLen))
    );
    /* Convert the complete UTF-8 characters to UTF-16 and write them to the given file. */
    if (fwprintf(this->outFd, L"%ls", boost::locale::conv::utf_to_utf<wchar_t>(std::string(outBuf, outBuf + iLenU8)).c_str()) < 0) {
        /* Failed to write data to output file descriptor. */
        result = traits_type::eof();
    }
    /* Reset the put pointers to indicate that the buffer is free. */
    if (iLenU8 == iLen) {
        this->setp(outBuf, outBuf + BUFFER_SIZE - 1);
    } else {
        /* Move incomplete UTF-8 characters remaining in buffer. */
        const size_t overhead = static_cast<size_t>(iLen - iLenU8);
        memmove(outBuf, outBuf + iLenU8, overhead);
        this->setp(outBuf + overhead, outBuf + BUFFER_SIZE - 1);
    }
    return result;
}
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::sync() {
    return traits_type::eq_int_type(this->overflow(traits_type::eof()), traits_type::eof()) ? -1 : 0;
}
#endif /* windows, unicode */
int test_main(int argc, TEST_ARG_TYPE ** argv);
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) || defined(_MSC_VER))
int main(/*int argc, char ** argv*/) {
    wchar_t ** wenpv, ** wargv;
    int wargc, si = 0;
    /* this also creates the global variable __wargv */
    __wgetmainargs(&wargc, &wargv, &wenpv, _CRT_glob, &si);
    /* enable UTF-16 output to standard output console */
    _setmode(_fileno(stdout), _O_U16TEXT);
    std::locale::global(boost::locale::generator().generate("UTF-8"));
    Utf8ToUtf16Buffer u8cout(stdout);
    std::streambuf * out = std::cout.rdbuf();
    std::cout.rdbuf(&u8cout);
    /* process user defined main function */
    const int result = test_main(wargc, wargv);
    /* revert stream buffers to let cout clean up remaining memory correctly */
    std::cout.rdbuf(out);
    return result;
#else /* not windows or unicode */
int main(int argc, char ** argv) {
    return test_main(argc, argv);
#endif /* windows, unicode */
}
int test_main(int /*argc*/, TEST_ARG_TYPE ** /*argv*/) {
    const std::string str("\x61\x62\x63\xC3\xA4\xC3\xB6\xC3\xBC\xE3\x81\x82\xE3\x81\x88\xE3\x81\x84\xE3\x82\xA2\xE3\x82\xA8\xE3\x82\xA4\xE4\xBA\x9C\xE6\xB1\x9F\xE6\x84\x8F");
    for (size_t i = 1; i <= str.size(); i++) {
        const std::string part(str.begin(), str.begin() + i);
        const size_t validByteCount = countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(part.c_str()), part.size());
        wprintf(L"i = %u, v = %u\n", static_cast<unsigned int>(i), static_cast<unsigned int>(validByteCount));
        const std::string valid(str.begin(), str.begin() + validByteCount);
        std::cout << valid << std::endl;
        std::cout.flush();
        for (size_t j = 0; j < part.size(); j++) {
            wprintf(L"%02X", static_cast<int>(part[j]) & 0xFF);
        }
        wprintf(L"\n");
    }
    return EXIT_SUCCESS;
}
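One addition of mine, not part of the original answer: the question also mentions std::cerr. Presumably a second buffer instance attached to stderr inside main() would handle it the same way, roughly like this:
    _setmode(_fileno(stderr), _O_U16TEXT);
    Utf8ToUtf16Buffer u8cerr(stderr);
    std::streambuf * err = std::cerr.rdbuf();
    std::cerr.rdbuf(&u8cerr);
    /* ... run test_main() ... */
    std::cerr.rdbuf(err);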

I feel like this is probably a bad idea, but I think it should still be seen, as it does work provided that the console has the right font.
#include <iostream>
#include <windows.h>
//#include <io.h>
//#include <fcntl.h>
std::wstring UTF8ToUTF16(const char* utf8)
{
    std::wstring utf16;
    // The length returned for -1 input length includes the terminating NUL.
    int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (len > 1)
    {
        utf16.resize(len);
        MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &utf16[0], len);
        utf16.resize(len - 1); // drop the terminating NUL copied into the string
    }
    return utf16;
}
std::ostream& operator << (std::ostream& os, const char* data)
{
    //_setmode(_fileno(stdout), _O_U16TEXT);
    SetConsoleCP(1200);
    std::wstring str = UTF8ToUTF16(data);
    DWORD slen = static_cast<DWORD>(str.size());
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);
    MessageBoxW(NULL, str.c_str(), L"", 0); // debug check: also show the converted text
    return os;
}
std::ostream& operator << (std::ostream& os, const std::string& data)
{
    //_setmode(_fileno(stdout), _O_U16TEXT);
    SetConsoleCP(1200);
    std::wstring str = UTF8ToUTF16(data.c_str());
    DWORD slen = static_cast<DWORD>(str.size());
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);
    return os;
}
std::wostream& operator <<(std::wostream& os, const wchar_t* data)
{
    DWORD slen = static_cast<DWORD>(wcslen(data));
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data, slen, &slen, nullptr);
    return os;
}
std::wostream& operator <<(std::wostream& os, const std::wstring& data)
{
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data.c_str(), static_cast<DWORD>(data.size()), nullptr, nullptr);
    return os;
}
int main()
{
    std::cout << "Россия";
}
Now std::cout and std::wcout both go through the WriteConsoleW function. You would have to overload the operator for const char*, char*, std::string, char, and so on, for whatever you need. Or maybe template it.
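A hedged sketch of the "maybe template it" idea (this is my code, not the answer's, and the function name is made up): format any value into a UTF-8 std::string with a std::ostringstream, convert it with the UTF8ToUTF16 helper from the answer, and write it via WriteConsoleW. This would replace the per-type operator<< overloads rather than supplement them, so that the ostringstream below does not pick them up again.
    #include <sstream>
    template <typename T>
    void writeConsoleUtf16(const T& value)
    {
        std::ostringstream oss;
        oss << value;                              // works for any type with a stream inserter
        std::wstring wide = UTF8ToUTF16(oss.str().c_str());
        DWORD written = 0;
        WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), wide.c_str(),
                      static_cast<DWORD>(wide.size()), &written, nullptr);
    }
With that in place, the example in main() would presumably become writeConsoleUtf16("Россия");.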

How can I convert wstring to u16string?

I want to convert a wstring to a u16string in C++.
I can convert a wstring to a string, and back, but I don't know how to convert to u16string.
u16string CTextConverter::convertWstring2U16(wstring str)
{
int iSize;
u16string szDest[256] = {};
memset(szDest, 0, 256);
iSize = WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, NULL, 0,0,0);
WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize,0,0);
u16string s16 = szDest;
return s16;
}
The error is in the szDest argument of WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize, 0, 0); because a u16string cannot be used as an LPSTR.
How can I fix this code?
For a platform-independent solution see this answer.
If you need a solution only for the Windows platform, the following code will be sufficient:
std::wstring wstr( L"foo" );
std::u16string u16str( wstr.begin(), wstr.end() );
On the Windows platform, a std::wstring is interchangeable with std::u16string because sizeof(wstring::value_type) == sizeof(u16string::value_type) and both are UTF-16 (little endian) encoded.
wstring::value_type = wchar_t
u16string::value_type = char16_t
In practice the only difference is that wchar_t and char16_t are distinct types, so the character values have to be converted. This can be performed by using the u16string constructor that takes an iterator pair as arguments; it implicitly converts each wchar_t element to char16_t.
Full example console application:
#include <windows.h>
#include <string>
int main()
{
    static_assert( sizeof(std::wstring::value_type) == sizeof(std::u16string::value_type),
        "std::wstring and std::u16string are expected to have the same character size" );
    std::wstring wstr( L"foo" );
    std::u16string u16str( wstr.begin(), wstr.end() );
    // The u16string constructor performs an implicit conversion like:
    wchar_t wch = L'A';
    char16_t ch16 = wch;
    // Need to reinterpret_cast because char16_t const* is not implicitly convertible
    // to LPCWSTR (aka wchar_t const*).
    ::MessageBoxW( 0, reinterpret_cast<LPCWSTR>( u16str.c_str() ), L"test", 0 );
    return 0;
}
Update
I had thought the standard version did not work, but in fact this was simply due to bugs in the Visual C++ and libstdc++ 3.4.21 runtime libraries. It does work with clang++ -std=c++14 -stdlib=libc++. Here is a version that tests whether the standard method works on your compiler:
#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <clocale>
#include <vector>
using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;
using std::wcout;
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
int main(void)
{
constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600"; // Shouldn't assume endianness.
constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";
init_locale();
const std::codecvt_utf16<wchar_t, 0x1FFFF, std::little_endian> converter_w;
const size_t max_len = sizeof(msg_utf16);
std::vector<char> out(max_len);
std::mbstate_t state{}; // value-initialize: codecvt expects a valid initial conversion state
const wchar_t* from_w = nullptr;
char* to_next = nullptr;
converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w, out.data(), out.data() + out.size(), to_next );
if (memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << endl;
} else if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << endl;
}
out.clear();
out.resize(max_len);
const std::codecvt_utf16<char32_t, 0x1FFFF, std::little_endian> converter_u32;
const char32_t* from_u32 = nullptr;
converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32, out.data(), out.data() + out.size(), to_next );
if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << endl;
}
wcout << msg_w << endl;
return EXIT_SUCCESS;
}
Previous
A bit late to the game, but here’s a version that additionally checks whether wchar_t is 32-bits (as it is on Linux), and if so, performs surrogate-pair conversion. I recommend saving this source as UTF-8 with a BOM. Here is a link to it on ideone.
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
std::u16string make_u16string( const std::wstring& ws )
/* Creates a UTF-16 string from a wide-character string. Any wide characters
* outside the allowed range of UTF-16 are mapped to the sentinel value U+FFFD,
* per the Unicode documentation. (http://www.unicode.org/faq/private_use.html
* retrieved 12 March 2017.) Unpaired surrogates in ws are also converted to
* sentinel values. Noncharacters, however, are left intact. As a fallback,
* if wide characters are the same size as char16_t, this does a more trivial
* construction using that implicit conversion.
*/
{
/* We assume that, if this test passes, a wide-character string is already
* UTF-16, or at least converts to it implicitly without needing surrogate
* pairs.
*/
if ( sizeof(wchar_t) == sizeof(char16_t) ) {
return std::u16string( ws.begin(), ws.end() );
} else {
/* The conversion from UTF-32 to UTF-16 might possibly require surrogates.
* A surrogate pair suffices to represent all wide characters, because all
* characters outside the range will be mapped to the sentinel value
* U+FFFD. Add one character for the terminating NUL.
*/
const size_t max_len = 2 * ws.length() + 1;
// Our temporary UTF-16 string.
std::u16string result;
result.reserve(max_len);
for ( const wchar_t& wc : ws ) {
const std::wint_t chr = wc;
if ( chr < 0 || chr > 0x10FFFF || (chr >= 0xD800 && chr <= 0xDFFF) ) {
// Invalid code point. Replace with sentinel, per Unicode standard:
constexpr char16_t sentinel = u'\uFFFD';
result.push_back(sentinel);
} else if ( chr < 0x10000UL ) { // In the BMP.
result.push_back(static_cast<char16_t>(wc));
} else {
const char16_t leading = static_cast<char16_t>(
((chr-0x10000UL) / 0x400U) + 0xD800U );
const char16_t trailing = static_cast<char16_t>(
((chr-0x10000UL) % 0x400U) + 0xDC00U );
result.append({leading, trailing});
} // end if
} // end for
/* The returned string is shrunken to fit, which might not be the Right
* Thing if there is more to be added to the string.
*/
result.shrink_to_fit();
// We depend here on the compiler to optimize the move constructor.
return result;
} // end if
// Not reached.
}
int main(void)
{
static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
const std::u16string converted = make_u16string(wtest);
init_locale();
std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";
for( size_t i = 0; i <= u16test.length(); ++i ) {
if ( u16test[i] != converted[i] ) {
std::wcout << std::hex << std::showbase
<< std::right << std::setfill(L'0')
<< std::setw(4) << (unsigned)converted[i] << L" ≠ "
<< std::setw(4) << (unsigned)u16test[i] << L" at "
<< i << L'.' << std::endl;
return EXIT_FAILURE;
} // end if
} // end for
std::wcout << wtest << std::endl;
return EXIT_SUCCESS;
}
Footnote
Since someone asked: The reason I suggest UTF-8 with BOM is that some compilers, including MSVC 2015, will assume a source file is encoded according to the current code page unless there is a BOM or you specify an encoding on the command line. No encoding works on all toolchains, unfortunately, but every tool I’ve used that’s modern enough to support C++14 also understands the BOM.
- To convert a CString to std::wstring and std::string
string CString2string(CString str)
{
    int bufLen = WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, NULL, 0, NULL, NULL);
    char *buf = new char[bufLen];
    WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, buf, bufLen, NULL, NULL);
    string sRet(buf);
    delete[] buf;
    return sRet;
}
CString strFileName = "test.txt";
wstring wFileName(strFileName.GetBuffer());
strFileName.ReleaseBuffer();
string sFileName = CString2string(strFileName);
- To convert string to CString
CString string2CString(string s)
{
    int bufLen = MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, NULL, 0);
    WCHAR *buf = new WCHAR[bufLen];
    MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, buf, bufLen);
    CString strRet(buf);
    delete[] buf;
    return strRet;
}
string sFileName = "test.txt";
CString strFileName = string2CString(sFileName);

AES128 in libgcrypt not encrypting

I've been trying to use libgcrypt for a small cryptography project of mine, but I can't seem to implement the encryption/decryption correctly. The following is the class and its usage.
#include <iostream>
#include <string>
#include <cstdlib>
#include <gcrypt.h>
#include "aes.h"
#define GCRY_CIPHER GCRY_CIPHER_AES128
#define GCRY_MODE GCRY_CIPHER_MODE_ECB
using namespace std;
aes::aes(string a) {
key = a;
keyLength = gcry_cipher_get_algo_keylen(GCRY_CIPHER);
gcry_control (GCRYCTL_DISABLE_SECMEM, 0);
gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_MODE, 0);
gcry_cipher_setkey(handle, key.c_str(), keyLength);
}
string aes::encrypt(string text) {
size_t textLength = text.size() + 1;
char * encBuffer = (char *)malloc(textLength);
gcry_cipher_encrypt(handle, encBuffer, textLength, text.c_str(), textLength);
string ret (encBuffer);
return ret;
}
string aes::decrypt(string text) {
size_t textLength = text.size() + 1;
char * decBuffer = (char * )malloc(textLength);
gcry_cipher_decrypt(handle, decBuffer, textLength, text.c_str(), textLength);
string ret (decBuffer);
return ret;
}
And I use it in a main function like so:
...
aes bb = aes("one test AES key");
string test = "Some Message";
string enc = bb.encrypt(test);
string dec = bb.decrypt(enc);
for (size_t index = 0; index<enc.size(); index++)
printf("%c", enc[index]);
printf("\n");
cout << dec << endl;
...
And the output is
BBBBBBBBBBBBB
�n�S[
The odd thing is, I made a test program with almost exactly the same statements, and it works perfectly. It started falling apart when I tried to package it into a class. The following is the code for that program, in case anyone wants to see it.
#include <cstdlib>
#include <iostream>
#include <string>
#include <gcrypt.h>
using namespace std;
#define GCRY_CIPHER GCRY_CIPHER_AES128 // Pick the cipher here
#define GCRY_MODE GCRY_CIPHER_MODE_ECB // Pick the cipher mode here
void aesTest(void)
{
gcry_cipher_hd_t handle;
size_t keyLength = gcry_cipher_get_algo_keylen(GCRY_CIPHER);
string txtBuffer ("123456789 abcdefghijklmnopqrstuvwzyz ABCDEFGHIJKLMNOPQRSTUVWZYZ");
size_t txtLength = txtBuffer.size() +1; // string plus termination
char * encBuffer = (char *)malloc(txtLength);
char * outBuffer = (char *)malloc(txtLength);
char * key = "one test AES key"; // 16 bytes
gcry_control (GCRYCTL_DISABLE_SECMEM, 0);
gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_MODE, 0);
gcry_cipher_setkey(handle, key, keyLength);
gcry_cipher_encrypt(handle, encBuffer, txtLength, txtBuffer.c_str(), txtLength);
gcry_cipher_decrypt(handle, outBuffer, txtLength, encBuffer, txtLength);
size_t index;
printf("encBuffer = ");
for (index = 0; index<txtLength; index++)
printf("%c", encBuffer[index]);
printf("\n");
printf("outBuffer = %s\n", outBuffer);
gcry_cipher_close(handle);
free(encBuffer);
free(outBuffer);
}
int main() {
aesTest();
return 0;
}
When you use a std::string to capture the encrypted data, you are possibly losing some data due to the presence of '\0' in the encrypted string.
Try using std::vector<char> instead.
void aes::encrypt(string text, std::vector<char>& ret) {
    size_t textLength = text.size() + 1;
    ret.resize(textLength);
    gcry_cipher_encrypt(handle, ret.data(), textLength, text.c_str(), textLength);
}
string aes::decrypt(std::vector<char> const& text) {
    size_t textLength = text.size(); // the vector already holds the full ciphertext
    // Since you are in C++ land, use new and delete
    // instead of malloc and free.
    char * decBuffer = new char[textLength];
    gcry_cipher_decrypt(handle, decBuffer, textLength, text.data(), textLength);
    string ret (decBuffer);
    delete [] decBuffer;
    return ret;
}
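A possible usage sketch for this vector-based interface (the surrounding main() code is my assumption, not part of the answer). Note that AES in ECB mode works on 16-byte blocks, so the buffer length handed to libgcrypt normally has to be a multiple of the block size.
    aes bb = aes("one test AES key");
    std::string test = "Some Message";
    std::vector<char> enc;
    bb.encrypt(test, enc);               // ciphertext, may contain '\0' bytes
    std::string dec = bb.decrypt(enc);   // back to the original text
    std::cout << dec << std::endl;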

How to make my `std::string url_encode_wstring(const std::wstring &input)` work on Linux?

So we have this function:
std::string url_encode_wstring(const std::wstring &input)
{
std::string output;
int cbNeeded = WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, NULL, 0, NULL, NULL);
if (cbNeeded > 0) {
char *utf8 = new char[cbNeeded];
if (WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, utf8, cbNeeded, NULL, NULL) != 0) {
for (char *p = utf8; *p; *p++) {
char onehex[5];
_snprintf(onehex, sizeof(onehex), "%%%02.2X", (unsigned char)*p);
output.append(onehex);
}
}
delete[] utf8;
}
return output;
}
It's great for Windows, but I wonder how (and whether it is possible) to make it work under Linux?
IMHO you should use a portable character codec library.
Here's an example of minimal portable code using iconv, which should be more than enough.
It's supposed to work on Windows (if it does, you can get rid of your windows-specific code altogether).
I follow the GNU guidelines not to use the wcstombs & co functions ( https://www.gnu.org/s/hello/manual/libc/iconv-Examples.html )
Depending on the use case, handle errors appropriately... and to enhance performance, you can create a class out of it.
#include <iostream>
#include <iconv.h>
#include <cerrno>
#include <cstring>
#include <stdexcept>
std::string wstring_to_utf8_string(const std::wstring &input)
{
    size_t in_size = input.length() * sizeof(wchar_t);
    char * in_buf = (char*)input.data();
    size_t buf_size = input.length() * 6; // pessimistic: max UTF-8 char size
    char * buf = new char[buf_size];
    memset(buf, 0, buf_size);
    char * out_buf(buf);
    size_t out_size(buf_size);
    iconv_t conv_desc = iconv_open("UTF-8", "wchar_t");
    if (conv_desc == iconv_t(-1))
        throw std::runtime_error(std::string("Could not open iconv: ") + strerror(errno));
    size_t iconv_value = iconv(conv_desc, &in_buf, &in_size, &out_buf, &out_size);
    if (iconv_value == size_t(-1))
        throw std::runtime_error(std::string("When converting: ") + strerror(errno));
    int ret = iconv_close(conv_desc);
    if (ret != 0)
        throw std::runtime_error(std::string("Could not close iconv: ") + strerror(errno));
    std::string s(buf);
    delete [] buf;
    return s;
}
int main() {
    std::wstring in(L"hello world");
    std::wcout << L"input: [" << in << L"]" << std::endl;
    std::string out(wstring_to_utf8_string(in));
    std::cerr << "output: [" << out << "]" << std::endl;
    return 0;
}
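Following the answer's suggestion to "create a class out of it", a minimal RAII sketch might look like the following. This is my code, not the answer's: the class name is made up, and it assumes the converter object is reused for many strings so that iconv_open/iconv_close are not paid on every call.
    #include <iconv.h>
    #include <cerrno>
    #include <cstring>
    #include <stdexcept>
    #include <string>
    #include <vector>
    class WideToUtf8Converter
    {
    public:
        WideToUtf8Converter() : conv_desc(iconv_open("UTF-8", "wchar_t"))
        {
            if (conv_desc == iconv_t(-1))
                throw std::runtime_error(std::string("Could not open iconv: ") + strerror(errno));
        }
        ~WideToUtf8Converter() { iconv_close(conv_desc); }
        std::string operator()(const std::wstring &input)
        {
            iconv(conv_desc, NULL, NULL, NULL, NULL);            // reset the conversion state
            std::vector<char> out(input.length() * 6 + 1, '\0'); // pessimistic output size
            char * in_buf = (char*)input.data();
            size_t in_size = input.length() * sizeof(wchar_t);
            char * out_buf = &out[0];
            size_t out_size = out.size() - 1;
            if (iconv(conv_desc, &in_buf, &in_size, &out_buf, &out_size) == size_t(-1))
                throw std::runtime_error(std::string("When converting: ") + strerror(errno));
            return std::string(&out[0]);
        }
    private:
        iconv_t conv_desc;
        WideToUtf8Converter(const WideToUtf8Converter&);            // non-copyable: copying
        WideToUtf8Converter& operator=(const WideToUtf8Converter&); // would double-close the descriptor
    };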

Espeak SAPI/dll usage on Windows?

Question: I am trying to use the espeak text-to-speech engine.
So far I have got it working wonderfully on Linux (code below).
Now I wanted to port this basic program to Windows, too, but it's nearly impossible...
Part of the problem is that the Windows DLL only allows AUDIO_OUTPUT_SYNCHRONOUS, which means it requires a callback, but I can't figure out how to play the audio from the callback... First it crashed; then I realized I need a callback function; now I get the data in the callback function, but I don't know how to play it, as it is neither a WAV file nor plays automatically as on Linux.
The SourceForge site is rather useless, because it basically says to use the SAPI version, but then there is no example of how to use the SAPI espeak DLL...
Anyway, here's my code. Can anybody help?
#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#include <assert.h>
#include <ctype.h>
//#include "speak_lib.h"
#include "espeak/speak_lib.h"
// libespeak-dev: /usr/include/espeak/speak_lib.h
// apt-get install libespeak-dev
// apt-get install libportaudio-dev
// g++ -o mine mine.cpp -lespeak
// g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak
// gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak
char voicename[40];
int samplerate;
int quiet = 0;
static char genders[4] = {' ','M','F',' '};
//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/
const char *data_path = NULL; // use default path for espeak-data
int strrcmp(const char *s, const char *sub)
{
int slen = strlen(s);
int sublen = strlen(sub);
return memcmp(s + slen - sublen, sub, sublen);
}
char * strrcpy(char *dest, const char *source)
{
// Pre assertions
assert(dest != NULL);
assert(source != NULL);
assert(dest != source);
// tk: parentheses
while((*dest++ = *source++))
;
return(--dest);
}
const char* GetLanguageVoiceName(const char* pszShortSign)
{
#define LANGUAGE_LENGTH 30
static char szReturnValue[LANGUAGE_LENGTH] ;
memset(szReturnValue, 0, LANGUAGE_LENGTH);
for (int i = 0; pszShortSign[i] != '\0'; ++i)
szReturnValue[i] = (char) tolower(pszShortSign[i]);
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
if( !strrcmp( v->languages, szReturnValue) )
{
strcpy(szReturnValue, v->name);
return szReturnValue;
}
} // End for
strcpy(szReturnValue, "default");
return szReturnValue;
} // End function getvoicename
void ListVoices()
{
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
printf("Shortsign: %s\n", v->languages);
printf("age: %d\n", v->age);
printf("gender: %c\n", genders[v->gender]);
printf("name: %s\n", v->name);
printf("\n\n");
} // End for
} // End function getvoicename
int main()
{
printf("Hello World!\n");
const char* szVersionInfo = espeak_Info(NULL);
printf("Espeak version: %s\n", szVersionInfo);
samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);
strcpy(voicename, "default");
// espeak --voices
strcpy(voicename, "german");
strcpy(voicename, GetLanguageVoiceName("DE"));
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
printf("Espeak setvoice error...\n");
}
static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100 0=silence
int pitch = 50; // base pitch, range 0-100. 50=normal
// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE: pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION: which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some),
espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;
espeak_SetVoiceByProperties(voice_spec);
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
espeak_Terminate();
printf("Espeak terminated\n");
return EXIT_SUCCESS;
}
/*
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
memset(&voice_select,0,sizeof(voice_select));
voice_select.languages = voicename;
if(espeak_SetVoiceByProperties(&voice_select) != EE_OK)
{
fprintf(stderr,"%svoice '%s'\n",err_load,voicename);
exit(2);
}
}
*/
The above code is for Linux.
The code below is about as far as I got on Vista x64 (32-bit emulation):
#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#include <assert.h>
#include <ctype.h>
#include "speak_lib.h"
//#include "espeak/speak_lib.h"
// libespeak-dev: /usr/include/espeak/speak_lib.h
// apt-get install libespeak-dev
// apt-get install libportaudio-dev
// g++ -o mine mine.cpp -lespeak
// g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak
// gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak
char voicename[40];
int iSampleRate;
int quiet = 0;
static char genders[4] = {' ','M','F',' '};
//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/
//const char *data_path = NULL; // use default path for espeak-data
const char *data_path = "C:\\Users\\Username\\Desktop\\espeak-1.43-source\\espeak-1.43-source\\";
int strrcmp(const char *s, const char *sub)
{
int slen = strlen(s);
int sublen = strlen(sub);
return memcmp(s + slen - sublen, sub, sublen);
}
char * strrcpy(char *dest, const char *source)
{
// Pre assertions
assert(dest != NULL);
assert(source != NULL);
assert(dest != source);
// tk: parentheses
while((*dest++ = *source++))
;
return(--dest);
}
const char* GetLanguageVoiceName(const char* pszShortSign)
{
#define LANGUAGE_LENGTH 30
static char szReturnValue[LANGUAGE_LENGTH] ;
memset(szReturnValue, 0, LANGUAGE_LENGTH);
for (int i = 0; pszShortSign[i] != '\0'; ++i)
szReturnValue[i] = (char) tolower(pszShortSign[i]);
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
if( !strrcmp( v->languages, szReturnValue) )
{
strcpy(szReturnValue, v->name);
return szReturnValue;
}
} // End for
strcpy(szReturnValue, "default");
return szReturnValue;
} // End function getvoicename
void ListVoices()
{
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
printf("Shortsign: %s\n", v->languages);
printf("age: %d\n", v->age);
printf("gender: %c\n", genders[v->gender]);
printf("name: %s\n", v->name);
printf("\n\n");
} // End for
} // End function getvoicename
/* Callback from espeak. Directly speaks using AudioTrack. */
#define LOGI(x) printf("%s\n", x)
static int AndroidEspeakDirectSpeechCallback(short *wav, int numsamples, espeak_EVENT *events)
{
char buf[100];
sprintf(buf, "AndroidEspeakDirectSpeechCallback: %d samples", numsamples);
LOGI(buf);
if (wav == NULL)
{
LOGI("Null: speech has completed");
}
if (numsamples > 0)
{
//audout->write(wav, sizeof(short) * numsamples);
sprintf(buf, "AudioTrack wrote: %d bytes", sizeof(short) * numsamples);
LOGI(buf);
}
return 0; // continue synthesis (1 is to abort)
}
static int AndroidEspeakSynthToFileCallback(short *wav, int numsamples,espeak_EVENT *events)
{
char buf[100];
sprintf(buf, "AndroidEspeakSynthToFileCallback: %d samples", numsamples);
LOGI(buf);
if (wav == NULL)
{
LOGI("Null: speech has completed");
}
// The user data should contain the file pointer of the file to write to
//void* user_data = events->user_data;
FILE* user_data = fopen ( "myfile1.wav" , "ab" );
FILE* fp = static_cast<FILE *>(user_data);
// Write all of the samples
fwrite(wav, sizeof(short), numsamples, fp);
return 0; // continue synthesis (1 is to abort)
}
int main()
{
printf("Hello World!\n");
const char* szVersionInfo = espeak_Info(NULL);
printf("Espeak version: %s\n", szVersionInfo);
iSampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, data_path, 0);
if (iSampleRate <= 0)
{
printf("Unable to initialize espeak");
return EXIT_FAILURE;
}
//samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);
//ListVoices();
strcpy(voicename, "default");
// espeak --voices
//strcpy(voicename, "german");
//strcpy(voicename, GetLanguageVoiceName("DE"));
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
printf("Espeak setvoice error...\n");
}
static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100 0=silence
int pitch = 50; // base pitch, range 0-100. 50=normal
// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE: pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION: which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some),
//espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
//voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;
//espeak_SetVoiceByProperties(voice_spec);
//espeak_SetSynthCallback(AndroidEspeakDirectSpeechCallback);
espeak_SetSynthCallback(AndroidEspeakSynthToFileCallback);
unsigned int unique_identifier;
espeak_ERROR err = espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, &unique_identifier, NULL);
err = espeak_Synchronize();
/*
strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
*/
// espeak_Cancel();
espeak_Terminate();
printf("Espeak terminated\n");
system("pause");
return EXIT_SUCCESS;
}
Have you tried passing the buffer you obtain in your callback to sndPlaySound()?
Declare Function sndPlaySound Lib "winmm.dll" Alias "sndPlaySoundA" (ByVal lpszSoundName As String, ByVal uFlags As Long) As Long
Its standard winAPI is as follows:
sndPlaySound(buffer[0], SND_ASYNC | SND_MEMORY)
Alternately, if you have a wav-file that has the audio to play:
sndPlaySound(filename, SND_ASYNC)
sndPlaySound has an ASYNC mode that doesn't block your program's execution while the audio is being played.
NOTE: I have used it in VB and the above snippets are for use in VB. If you are coding in VC++, you might have to modify them accordingly. But the basic intention remains the same: pass the buffer to sndPlaySound with the ASYNC flag set.
Good LUCK!!
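A sketch of how the same idea might look in C++ (my code, not the answer's): the espeak callback delivers raw 16-bit mono PCM samples, while sndPlaySound/PlaySound with SND_MEMORY expect a complete WAV image in memory, so a RIFF/WAVE header has to be prepended first. The function name and the global sample buffer are assumptions; sampleRate is assumed to be the value returned by espeak_Initialize, and you would link against winmm.lib.
    #include <windows.h>
    #include <mmsystem.h>
    #include <cstring>
    #include <vector>
    static std::vector<short> g_samples; // filled in the espeak synth callback
    static void playCollectedSamples(int sampleRate)
    {
        const DWORD dataBytes = static_cast<DWORD>(g_samples.size() * sizeof(short));
        std::vector<unsigned char> wav(44 + dataBytes);
        unsigned char * p = &wav[0];
        const DWORD riffSize   = 36 + dataBytes;            // total size minus the first 8 bytes
        const DWORD fmtSize    = 16;                        // size of the PCM format chunk
        const WORD  fmtTag     = 1;                         // WAVE_FORMAT_PCM
        const WORD  channels   = 1;
        const WORD  bits       = 16;
        const DWORD rate       = static_cast<DWORD>(sampleRate);
        const DWORD byteRate   = rate * channels * bits / 8;
        const WORD  blockAlign = channels * bits / 8;
        std::memcpy(p,      "RIFF", 4);      std::memcpy(p + 4,  &riffSize, 4);
        std::memcpy(p + 8,  "WAVE", 4);      std::memcpy(p + 12, "fmt ", 4);
        std::memcpy(p + 16, &fmtSize, 4);
        std::memcpy(p + 20, &fmtTag, 2);     std::memcpy(p + 22, &channels, 2);
        std::memcpy(p + 24, &rate, 4);       std::memcpy(p + 28, &byteRate, 4);
        std::memcpy(p + 32, &blockAlign, 2); std::memcpy(p + 34, &bits, 2);
        std::memcpy(p + 36, "data", 4);      std::memcpy(p + 40, &dataBytes, 4);
        if (dataBytes > 0)
            std::memcpy(p + 44, &g_samples[0], dataBytes);
        // SND_MEMORY: the first argument points at an in-memory WAV image;
        // SND_SYNC keeps the buffer alive until playback has finished.
        PlaySound(reinterpret_cast<LPCTSTR>(&wav[0]), NULL, SND_MEMORY | SND_SYNC);
    }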
Several changes in the source code are needed to make the Windows library have the same functionality as the one on Linux. I listed the changes here. The ready-to-use binary is also available.
All the patches and the description were also sent to espeak maintainer (publicly, through the mailing list and patches tracker), so maybe in future it will be available directly.

Generate SHA hash in C++ using OpenSSL library

How can I generate SHA1 or SHA2 hashes using the OpenSSL library?
I searched Google and could not find any function or example code.
From the command line, it's simply:
printf "compute sha1" | openssl sha1
You can invoke the library like this:
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>
int main()
{
    unsigned char ibuf[] = "compute sha1";
    unsigned char obuf[20];
    SHA1(ibuf, strlen((char *)ibuf), obuf);
    int i;
    for (i = 0; i < 20; i++) {
        printf("%02x ", obuf[i]);
    }
    printf("\n");
    return 0;
}
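To build this against OpenSSL, the usual link flag is -lcrypto, for example (the file name is just an example):
gcc sha1_example.c -o sha1_example -lcrypto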
OpenSSL has horrible documentation with no code examples, but here you go:
#include <openssl/sha.h>
bool simpleSHA256(void* input, unsigned long length, unsigned char* md)
{
    SHA256_CTX context;
    if(!SHA256_Init(&context))
        return false;
    if(!SHA256_Update(&context, (unsigned char*)input, length))
        return false;
    if(!SHA256_Final(md, &context))
        return false;
    return true;
}
Usage:
unsigned char md[SHA256_DIGEST_LENGTH]; // 32 bytes
if(!simpleSHA256(<data buffer>, <data length>, md))
{
// handle error
}
Afterwards, md will contain the binary SHA-256 message digest. Similar code can be used for the other SHA family members, just replace "256" in the code.
If you have larger data, you of course should feed data chunks as they arrive (multiple SHA256_Update calls).
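If you want the digest as printable hex instead of raw bytes, a minimal sketch continuing the usage snippet above (the hex buffer name is mine):
    #include <cstdio>
    char hex[2 * SHA256_DIGEST_LENGTH + 1];
    for (int i = 0; i < SHA256_DIGEST_LENGTH; ++i)
        std::snprintf(hex + 2 * i, 3, "%02x", md[i]);
    // hex now holds 64 lowercase hex characters plus a terminating NUL.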
Adaptation of @AndiDog's version for big files:
#include <fstream>
#include <iomanip>
#include <optional>
#include <sstream>
#include <string>
#include <openssl/sha.h>
static const int K_READ_BUF_SIZE{ 1024 * 16 };
std::optional<std::string> CalcSha256(std::string filename)
{
    // Initialize openssl
    SHA256_CTX context;
    if(!SHA256_Init(&context))
    {
        return std::nullopt;
    }
    // Read file and update calculated SHA
    char buf[K_READ_BUF_SIZE];
    std::ifstream file(filename, std::ifstream::binary);
    while (file.good())
    {
        file.read(buf, sizeof(buf));
        if(!SHA256_Update(&context, buf, file.gcount()))
        {
            return std::nullopt;
        }
    }
    // Get Final SHA
    unsigned char result[SHA256_DIGEST_LENGTH];
    if(!SHA256_Final(result, &context))
    {
        return std::nullopt;
    }
    // Transform byte-array to string
    std::stringstream shastr;
    shastr << std::hex << std::setfill('0');
    for (const auto &byte: result)
    {
        shastr << std::setw(2) << (int)byte;
    }
    return shastr.str();
}
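A possible usage sketch (the file name is just an example, and <iostream> is assumed to be included):
    if (const auto digest = CalcSha256("some_file.bin"))
        std::cout << *digest << std::endl;   // 64 hex characters
    else
        std::cerr << "hashing failed" << std::endl;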
The correct syntax at the command line should be
echo -n "compute sha1" | openssl sha1
otherwise you'll hash the trailing newline character as well.
Here is OpenSSL example of calculating sha-1 digest using BIO:
#include <openssl/bio.h>
#include <openssl/evp.h>
#include <new>
#include <stdexcept>
#include <string>
#include <vector>
std::string sha1(const std::string &input)
{
    BIO * p_bio_md = nullptr;
    BIO * p_bio_mem = nullptr;
    try
    {
        // make chain: p_bio_md <-> p_bio_mem
        p_bio_md = BIO_new(BIO_f_md());
        if (!p_bio_md) throw std::bad_alloc();
        BIO_set_md(p_bio_md, EVP_sha1());
        p_bio_mem = BIO_new_mem_buf((void*)input.c_str(), input.length());
        if (!p_bio_mem) throw std::bad_alloc();
        BIO_push(p_bio_md, p_bio_mem);
        // read through p_bio_md
        // read sequence: buf <<-- p_bio_md <<-- p_bio_mem
        std::vector<char> buf(input.size());
        for (;;)
        {
            auto nread = BIO_read(p_bio_md, buf.data(), buf.size());
            if (nread < 0) { throw std::runtime_error("BIO_read failed"); }
            if (nread == 0) { break; } // eof
        }
        // get result
        char md_buf[EVP_MAX_MD_SIZE];
        auto md_len = BIO_gets(p_bio_md, md_buf, sizeof(md_buf));
        if (md_len <= 0) { throw std::runtime_error("BIO_gets failed"); }
        std::string result(md_buf, md_len);
        // clean
        BIO_free_all(p_bio_md);
        return result;
    }
    catch (...)
    {
        if (p_bio_md) { BIO_free_all(p_bio_md); }
        throw;
    }
}
Though it's longer than just calling the SHA1 function from OpenSSL, it's more universal and can be reworked to work with file streams (thus processing data of any length).
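For illustration, the same BIO chain reworked to read from a file instead of a memory buffer might look roughly like this. This is my sketch, not part of the answer; it assumes OpenSSL's BIO_new_file, and error handling is kept minimal.
    #include <openssl/bio.h>
    #include <openssl/evp.h>
    #include <stdexcept>
    #include <string>
    std::string sha1_file(const std::string &path)
    {
        BIO * p_bio_md   = BIO_new(BIO_f_md());
        BIO * p_bio_file = BIO_new_file(path.c_str(), "rb");
        if (!p_bio_md || !p_bio_file) {
            if (p_bio_md)   BIO_free(p_bio_md);
            if (p_bio_file) BIO_free(p_bio_file);
            throw std::runtime_error("BIO allocation failed");
        }
        BIO_set_md(p_bio_md, EVP_sha1());
        BIO_push(p_bio_md, p_bio_file); // chain: p_bio_md <-> p_bio_file
        char buf[4096];
        for (;;) { // pull the whole file through the digest filter
            const int nread = BIO_read(p_bio_md, buf, sizeof(buf));
            if (nread < 0) { BIO_free_all(p_bio_md); throw std::runtime_error("BIO_read failed"); }
            if (nread == 0) { break; } // eof
        }
        char md_buf[EVP_MAX_MD_SIZE];
        const int md_len = BIO_gets(p_bio_md, md_buf, sizeof(md_buf));
        if (md_len <= 0) { BIO_free_all(p_bio_md); throw std::runtime_error("BIO_gets failed"); }
        std::string result(md_buf, md_len);
        BIO_free_all(p_bio_md);
        return result;
    }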
C version of @Nayfe's code, generating a SHA1 hash from a file:
#include <stdio.h>
#include <stdlib.h>
#include <openssl/sha.h>
static const int K_READ_BUF_SIZE = { 1024 * 16 };
unsigned char* calculateSHA1(char *filename)
{
    if (!filename) {
        return NULL;
    }
    FILE *fp = fopen(filename, "rb");
    if (fp == NULL) {
        return NULL;
    }
    unsigned char* sha1_digest = malloc(sizeof(char) * SHA_DIGEST_LENGTH);
    SHA_CTX context;
    if(!SHA1_Init(&context))
        return NULL;
    unsigned char buf[K_READ_BUF_SIZE];
    while (!feof(fp))
    {
        size_t total_read = fread(buf, 1, sizeof(buf), fp);
        if(!SHA1_Update(&context, buf, total_read))
        {
            return NULL;
        }
    }
    fclose(fp);
    if(!SHA1_Final(sha1_digest, &context))
        return NULL;
    return sha1_digest;
}
It can be used as follows:
unsigned char *sha1digest = calculateSHA1("/tmp/file1");
The sha1digest variable contains the SHA-1 hash.
You can print it on the screen using the following for-loop:
char *sha1hash = (char *)malloc(sizeof(char) * 41);
sha1hash[40] = '\0';
int i;
for (i = 0; i < SHA_DIGEST_LENGTH; i++)
{
    sprintf(&sha1hash[i*2], "%02x", sha1digest[i]);
}
printf("SHA1 HASH: %s\n", sha1hash);
printf("SHA1 HASH: %s\n", sha1hash);
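Since calculateSHA1 allocates the digest with malloc, and the hex string above is also allocated with malloc, the caller should release both buffers once they are no longer needed:
    free(sha1digest);
    free(sha1hash);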