I have yet another question. If I had a std::wstring looking like this:
ドイツ語で検索していてこちらのサイトにたどり着きました。
How could I possibly get it to be URL-Encoded (%nn, n = 0-9, a-f) to:
%E3%83%89%E3%82%A4%E3%83%84%E8%AA%9E%E3%81%A7%E6%A4%9C%E7%B4%A2%E3%81%97%E3%81%A6%E3%81%84%E3%81%A6%E3%81%93%E3%81%A1%E3%82%89%E3%81%AE%E3%82%B5%E3%82%A4%E3%83%88%E3%81%AB%E3%81%9F%E3%81%A9%E3%82%8A%E7%9D%80%E3%81%8D%E3%81%BE%E3%81%97%E3%81%9F%E3%80%82
... and also HTML-Encoded (&#nnn(nn);, n = 0-9(?)) to:
&#12489;&#12452;&#12484;&#35486;&#12391;&#26908;&#32034;&#12375;&#12390;&#12356;&#12390;&#12371;&#12385;&#12425;&#12398;&#12469;&#12452;&#12488;&#12395;&#12383;&#12393;&#12426;&#30528;&#12365;&#12414;&#12375;&#12383;&#12290;
Please help me, as I am totally lost right now and don't even know where to start. By the way, performance isn't very important to me right now.
Thanks in advance!
Here is an example which shows two methods, one based on the Qt library and one based on the ICU library. Both should be fairly platform-independent:
#include <iostream>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <boost/scoped_array.hpp>
#include <QtCore/QString>
#include <QtCore/QUrl>
#include <QtCore/QVector>
#include <unicode/utypes.h>
#include <unicode/ustring.h>
#include <unicode/unistr.h>
#include <unicode/schriter.h>
void encodeQt() {
    const QString str = QString::fromWCharArray(L"ドイツ語で検索していてこちらのサイトにたどり着きました。");

    // QUrl::toEncoded() yields the percent-encoded (UTF-8 based) form.
    const QUrl url = str;
    std::cout << "URL encoded: " << url.toEncoded().constData() << std::endl;

    // For the HTML numeric entities, iterate over the UCS-4 code points.
    typedef QVector<uint> CodePointVector;
    const CodePointVector codePoints = str.toUcs4();
    std::stringstream htmlEncoded;
    for (CodePointVector::const_iterator it = codePoints.constBegin(); it != codePoints.constEnd(); ++it) {
        htmlEncoded << "&#" << *it << ';';
    }
    std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
}
void encodeICU() {
    const std::wstring cppString = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";

    // Convert the wchar_t string to ICU's UTF-16 representation.
    // Each wchar_t expands to at most two UTF-16 code units.
    int bufSize = cppString.length() * 2;
    boost::scoped_array<UChar> strBuffer(new UChar[bufSize]);
    int size = 0;
    UErrorCode error = U_ZERO_ERROR;
    u_strFromWCS(strBuffer.get(), bufSize, &size, cppString.data(), cppString.length(), &error);
    if (U_FAILURE(error)) return;
    const UnicodeString str(strBuffer.get(), size);

    // Convert to UTF-8 (length * 4 bytes is a safe upper bound).
    bufSize = str.length() * 4;
    boost::scoped_array<char> buffer(new char[bufSize]);
    u_strToUTF8(buffer.get(), bufSize, &size, str.getBuffer(), str.length(), &error);
    if (U_FAILURE(error)) return;
    const std::string urlUtf8(buffer.get(), size);

    // Percent-encode each UTF-8 byte.
    std::stringstream urlEncoded;
    urlEncoded << std::hex << std::setfill('0');
    for (std::string::const_iterator it = urlUtf8.begin(); it != urlUtf8.end(); ++it) {
        urlEncoded << '%' << std::setw(2) << static_cast<unsigned int>(static_cast<unsigned char>(*it));
    }
    std::cout << "URL encoded: " << urlEncoded.str() << std::endl;

    // Emit an HTML numeric entity per code point.
    std::stringstream htmlEncoded;
    StringCharacterIterator it = str;
    while (it.hasNext()) {
        const UChar32 pt = it.next32PostInc();
        htmlEncoded << "&#" << pt << ';';
    }
    std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
}
int main() {
    encodeQt();
    encodeICU();
}
You see, before you can convert a char to a URL escape sequence, you have to convert your wstring* into an ISO-Latin charset, which is what is used for URLs. ICU could be a good place to start: you can pass your wstring to it and get an ISO-Latin sequence back. Then, simply iterate through the resulting chars and convert them to escape sequences:
std::stringstream URL;
URL << std::hex;
for(auto it = myWString.begin(); it != myWString.end(); ++it)
    URL << '%' << std::setfill('0') << std::setw(2) << (int)*it;
Take a look here for more info on how to format the string.
* I'm assuming that your wstring is UTF-16, which is usually the case, although you didn't specify.
This might also help.
Here's a version that converts from UTF-16 (wchar) to hex-encoded UTF-8 using the Win32-specific WideCharToMultiByte() function.
#include <string>
#include <iostream>
#include <stdio.h>
#include <windows.h>
std::string wstring_to_utf8_hex(const std::wstring &input)
{
    std::string output;
    // First call computes the required buffer size, second call converts.
    int cbNeeded = WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, NULL, 0, NULL, NULL);
    if (cbNeeded > 0) {
        char *utf8 = new char[cbNeeded];
        if (WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1, utf8, cbNeeded, NULL, NULL) != 0) {
            for (char *p = utf8; *p; p++) {
                char onehex[5];
                _snprintf(onehex, sizeof(onehex), "%%%02.2X", (unsigned char)*p);
                output.append(onehex);
            }
        }
        delete[] utf8;
    }
    return output;
}
int main(int, char*[])
{
    std::wstring ja = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
    std::cout << "result=" << wstring_to_utf8_hex(ja) << std::endl;
    return 0;
}
To go the other way, you'll need to use some parsing to decode the hex values into a UTF-8 buffer, and then call the complementary MultiByteToWideChar() to get it back into a wchar array.
#include <string>
#include <iostream>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
std::string unhexlify(const std::string &input)
{
    std::string output;
    for (const char *p = input.c_str(); *p; ) {
        if (p[0] == '%' && isxdigit(p[1]) && isxdigit(p[2])) {
            int ch = (isdigit(p[1]) ? p[1] - '0' : toupper(p[1]) - 'A' + 10) * 16 +
                     (isdigit(p[2]) ? p[2] - '0' : toupper(p[2]) - 'A' + 10);
            output.push_back((char)ch);
            p += 3;
        } else if (p[0] == '%' && p[1] == '#' && isdigit(p[2])) {
            int ch = atoi(p + 2);
            output.push_back((char)ch);
            p += 2;
            while (*p && isdigit(*p)) p++;
            if (*p == ';') p++;
        } else {
            output.push_back(*p++);
        }
    }
    return output;
}
std::wstring utf8_hex_to_wstring(const std::string &input)
{
    std::wstring output;
    std::string utf8 = unhexlify(input);
    int cchNeeded = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, NULL, 0);
    if (cchNeeded > 0) {
        wchar_t *widebuf = new wchar_t[cchNeeded];
        if (MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, widebuf, cchNeeded) != 0) {
            output = widebuf;
        }
        delete[] widebuf;
    }
    return output;
}
int main(int, char*[])
{
    std::wstring ja = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
    std::string hex = "%E3%83%89%E3%82%A4%E3%83%84%E8%AA%9E%E3%81%A7%E6%A4%9C%E7%B4%A2%E3%81%97%E3%81%A6%E3%81%84%E3%81%A6%E3%81%93%E3%81%A1%E3%82%89%E3%81%AE%E3%82%B5%E3%82%A4%E3%83%88%E3%81%AB%E3%81%9F%E3%81%A9%E3%82%8A%E7%9D%80%E3%81%8D%E3%81%BE%E3%81%97%E3%81%9F%E3%80%82";
    std::wstring newja = utf8_hex_to_wstring(hex);
    std::cout << "match?=" << (newja == ja ? "yes" : "no") << std::endl;
    return 0;
}
First, convert to UTF-8.
Then, normal URL/HTML encode would do the right thing.
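For illustration, here is a minimal sketch of that two-step approach using the C++11 std::wstring_convert facility (the function name url_encode_wstring is made up for this example). It percent-encodes every byte; a production encoder would pass unreserved ASCII characters through unchanged:

#include <codecvt> // C++11 (deprecated in C++17, but fine for a sketch)
#include <iomanip>
#include <locale>
#include <sstream>
#include <string>

std::string url_encode_wstring(const std::wstring &input)
{
    // Step 1: convert the wide string to UTF-8.
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    const std::string utf8 = converter.to_bytes(input);

    // Step 2: percent-encode each UTF-8 byte as %XX.
    std::ostringstream escaped;
    escaped << std::hex << std::uppercase << std::setfill('0');
    for (unsigned char c : utf8)
        escaped << '%' << std::setw(2) << static_cast<unsigned int>(c);
    return escaped.str();
}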
I find this is simple in C#, so I use C++/CLI as a wrapper around the C# code:
string encodedStr = System.Web.HttpUtility.UrlEncode(inputstr);
In C++/CLI, mark the method with __declspec(dllexport) so that it can be called from C++; the C++/CLI syntax is:
String^ encodedStr = System::Web::HttpUtility::UrlEncode(inputStr);
Here is a tutorial about how to call C++/CLI from C++: How to call a C# library from Native C++ (using C++/CLI and IJW)
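As a rough sketch (not taken from the tutorial; the wrapper name and marshalling choices are my own assumptions), the exported C++/CLI method might look like this, compiled with /clr and a reference to the System.Web assembly:

// Wrapper.cpp -- C++/CLI, compiled with /clr; names are illustrative.
#include <string>
#include <msclr/marshal_cppstd.h>

extern "C" __declspec(dllexport) void UrlEncodeWrapper(const std::wstring &input, std::wstring &output)
{
    // Marshal the native string to a managed one, call the .NET encoder,
    // then marshal the result back for the native caller.
    System::String^ managed = msclr::interop::marshal_as<System::String^>(input);
    System::String^ encoded = System::Web::HttpUtility::UrlEncode(managed);
    output = msclr::interop::marshal_as<std::wstring>(encoded);
}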
Intro
I have some input that I need to convert to the correct Chinese characters, but I think I'm stuck at the final number-to-string conversion. Using an online hex-to-text converter tool, I have checked that e6b9af corresponds to the text 湯.
MWE
Here is a minimal example that I made to illustrate the problem. The input is "%e6%b9%af" (obtained from a URL somewhere else).
#include <iostream>
#include <string>
std::string attempt(std::string path)
{
    std::size_t i = path.find("%");
    while (i != std::string::npos)
    {
        std::string sub = path.substr(i, 9);
        sub.erase(i + 6, 1);
        sub.erase(i + 3, 1);
        sub.erase(i, 1);
        std::size_t s = std::stoul(sub, nullptr, 16);
        path.replace(i, 9, std::to_string(s));
        i = path.find("%");
    }
    return path;
}
int main()
{
    std::string input = "%E6%B9%AF";
    std::string goal = "湯";
    // convert input to goal
    input = attempt(input);
    std::cout << goal << " and " << input << (input == goal ? " are the same" : " are not the same") << std::endl;
    return 0;
}
Output
湯 and 15120815 are not the same
Expected output
湯 and 湯 are the same
Additional question
Are all characters in foreign languages represented in 3 bytes or is that just for Chinese? Since my attempt assumes blocks of 3 bytes, is that a good assumption?
Based on your suggestions, and adapting an example from this other post, this is what I came up with.
#include <iostream>
#include <string>
#include <sstream>
#include <cstdio> // for sscanf
std::string decode_url(const std::string& path)
{
    std::stringstream decoded;
    for (std::size_t i = 0; i < path.size(); i++)
    {
        if (path[i] != '%')
        {
            if (path[i] == '+')
                decoded << ' ';
            else
                decoded << path[i];
        }
        else
        {
            // Parse the two hex digits after '%' into a single byte.
            unsigned int j;
            std::sscanf(path.substr(i + 1, 2).c_str(), "%x", &j);
            decoded << static_cast<char>(j);
            i += 2;
        }
    }
    return decoded.str();
}
int main()
{
    std::string input = "%E6%B9%AF";
    std::string goal = "湯";
    // convert input to goal
    input = decode_url(input);
    std::cout << goal << " and " << input << (input == goal ? " are the same" : " are not the same") << std::endl;
    return 0;
}
Output
湯 and 湯 are the same
Here is the code I have right now:
#include <iostream>
#include <string>
#include <sstream>
std::string string_to_hex(const std::string& input)
{
    static const char* const lut = "0123456789ABCDEF";
    size_t len = input.length();
    std::string output;
    output.reserve(2 * len);
    for (size_t i = 0; i < len; ++i)
    {
        const unsigned char c = input[i];
        output.push_back(lut[c >> 4]);
        output.push_back(lut[c & 15]);
    }
    return output;
}
std::string encrypt(std::string msg, std::string key)
{
    // Make sure the key is at least as long as the message
    std::string tmp(key);
    while (key.size() < msg.size())
        key += tmp;
    // And now for the encryption part
    for (std::string::size_type i = 0; i < msg.size(); ++i)
        msg[i] ^= key[i];
    return msg;
}
std::string decrypt(std::string msg, std::string key)
{
    return encrypt(msg, key); // lol
}
int main()
{
    std::cout << string_to_hex(encrypt("Hello World!", "monkey")) << std::endl;
    std::cout << decrypt("\x25\x0A\x02\x07\x0A\x59\x3A\x00\x1C\x07\x01\x58", "monkey") << std::endl;
    std::cout << string_to_hex(encrypt("Hello. This is a test of encrypting strings in C++.", "monkey")) << std::endl;
    std::cout << decrypt("\x25\x0A\x02\x07\x0A\x57\x4D\x3B\x06\x02\x16\x59\x04\x1C\x4E\x0A\x45\x0D\x08\x1C\x1A\x4B\x0A\x1F\x4D\x0A\x00\x08\x17\x00\x1D\x1B\x07\x05\x02\x59\x1E\x1B\x1C\x02\x0B\x1E\x1E\x4F\x07\x05\x45\x3A\x46\x44\x40", "monkey") << std::endl;
}
The output is the following:
250A02070A593A001C070158
Hello W
250A02070A574D3B06021659041C4E0A450D081C1A4B0A1F4D0A000817001D1B070502591E1B1C020B1E1E4F0705453A464440
Hello. This is a test of e
The decryption seems to stop when reaching a \x00. Does anyone have any ideas on how to fix or get around that?
Thanks!
The std::string constructor that takes in a char* assumes that the input is a null-terminated string, so even though your string literal has lots of data in it past the null byte, when you pass it into your function the std::string constructor will stop reading as soon as it hits that null byte.
You have a couple of options to fix this. As one option, the std::string type has a two-argument constructor where you can give a pointer to the first byte of the string and a pointer one past its last byte. The std::string will then initialize itself to the text in that range, ignoring intermediary null terminators.
char s1[] = "\x25\x0A\x02\x07\x0A\x59\x3A\x00\x1C\x07\x01\x58";
char s2[] = "\x25\x0A\x02\x07\x0A\x57\x4D\x3B\x06\x02\x16\x59\x04\x1C\x4E\x0A\x45\x0D\x08\x1C\x1A\x4B\x0A\x1F\x4D\x0A\x00\x08\x17\x00\x1D\x1B\x07\x05\x02\x59\x1E\x1B\x1C\x02\x0B\x1E\x1E\x4F\x07\x05\x45\x3A\x46\x44\x40";
std::cout << string_to_hex(encrypt("Hello World!", "monkey")) << std::endl;
std::cout << decrypt(std::string(std::begin(s1), std::end(s1)-1), "monkey") << std::endl;
std::cout << string_to_hex(encrypt("Hello. This is a test of encrypting strings in C++.", "monkey")) << std::endl;
std::cout << decrypt(std::string(std::begin(s2), std::end(s2)-1), "monkey") << std::endl;
I've wrapped up the following code to get the computer's MAC address (Linux Ubuntu) and to print it as a custom hexadecimal string:
#include <iostream>
#include <string>
#include <cstring>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
std::string convertToHex(std::string str)
{
    std::stringstream outText;
    for(unsigned int i = 0; i < str.size(); i++ )
    {
        outText << std::hex << std::setw(2) << std::setfill('0') << (0xFF & static_cast<char>(str[i]));
        if (i != (str.size() - 1))
            outText << ":";
    }
    return outText.str();
}
std::string getMacId()
{
    struct ifreq ifr;
    struct ifreq *IFR;
    struct ifconf ifc;
    char buf[1024];
    int s, i;
    std::string macAddr("");

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
    {
        return "";
    }

    ifc.ifc_len = sizeof(buf);
    ifc.ifc_buf = buf;
    ioctl(s, SIOCGIFCONF, &ifc);

    IFR = ifc.ifc_req;
    int ok = 0;
    for (i = ifc.ifc_len / sizeof(struct ifreq); --i >= 0; IFR++)
    {
        strcpy(ifr.ifr_name, IFR->ifr_name);
        if (ioctl(s, SIOCGIFFLAGS, &ifr) == 0)
        {
            if (! (ifr.ifr_flags & IFF_LOOPBACK))
            {
                if (ioctl(s, SIOCGIFHWADDR, &ifr) == 0)
                {
                    ok = 1;
                    break;
                }
            }
        }
    }
    close(s);

    std::stringstream data;
    for (auto &c : ifr.ifr_addr.sa_data) // note: sa_data is declared as char[14]
        data << c;
    std::string ret = data.str();
    std::cout << ret << std::endl;
    return ret;
}
void showMacId()
{
    std::string mac = getMacId();
    std::string hexmac = convertToHex(mac);
    std::cout << hexmac << std::endl;
}
int main()
{
    std::cout << "Pass1: " << std::endl;
    showMacId();
    std::cout << "Pass2: " << std::endl;
    showMacId();
    std::cout << "Pass3: " << std::endl;
    showMacId();
    std::cout << "Pass4: " << std::endl;
    showMacId();
}
The problem is that I'm getting different output on every read. This is the result of running the program:
Pass1:
.)� W��
2e:02:00:00:04:29:80:20:57:82:42:08:80:20
Pass2:
p�ЕĿ�wq���Ŀ
70:b7:d0:95:c4:bf:aa:77:71:b7:80:95:c4:bf
Pass3:
p�ЕĿ�wq���Ŀ
70:b7:d0:95:c4:bf:aa:77:71:b7:80:95:c4:bf
Pass4:
p�ЕĿ�wq���Ŀ
70:b7:d0:95:c4:bf:aa:77:71:b7:80:95:c4:bf
And if I run the program again, I get different results:
Pass1:
.)� W��
2e:02:00:00:04:29:80:20:57:82:42:08:80:20
Pass2:
q��
���wr�P
��
71:b7:a0:0a:93:bf:aa:77:72:b7:50:0a:93:bf
Pass3:
q��
���wr�P
��
71:b7:a0:0a:93:bf:aa:77:72:b7:50:0a:93:bf
Pass4:
q��
���wr�P
��
71:b7:a0:0a:93:bf:aa:77:72:b7:50:0a:93:bf
So, what am I doing wrong here?
sizeof(buf) looks dangerous. Use 1024 instead; I'm not absolutely certain you don't ask for sizeof(char*) this way.
You must check that
ioctl(s, SIOCGIFCONF, &ifc);
doesn't return -1!
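For instance, a minimal sketch of that check, following the error convention getMacId() already uses:

// Bail out if the interface list cannot be retrieved.
if (ioctl(s, SIOCGIFCONF, &ifc) == -1)
{
    close(s);
    return "";
}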
All in all, this seems a very old-school method of getting the MAC address.
Why don't you just read /sys/class/net/<devicename>/address? I can pretty much rely on sysfs being there on any Linux system I've encountered, and it's the failsafe, clear, portable thing to do. To find your devices, just list the /sys/class/net/ directory.
EDIT: I was asked to give an example; I really don't know what there is to explain, it seems so straightforward:
#include <iostream>
#include <fstream>

int main (int argc, char** argv)
{
    std::ifstream inp ("/sys/class/net/enp0s25/address", std::ifstream::in);
    for(int bytecounter = 0; bytecounter < 6; bytecounter++)
    {
        unsigned int byte;
        inp >> std::hex >> byte;
        inp.get(); // drop the ':'
        std::cout << std::hex << byte;
        if(bytecounter < 5)
            std::cout << ":";
    }
    inp.close();
    return 0;
}
I am not sure why the "MAC" addresses you are printing are that long (usually they are 48 bits long), but I guess your problem is that you do not initialize your structs and arrays with zeros.
This should help (given that you use C++11 anyway):
struct ifreq ifr = {0};
struct ifconf ifc = {0};
char buf[1024] = {0};
The declaration of the method is the following:
//some.h
void TDES_Decryption(BYTE *Data, BYTE *Key, BYTE *InitalVector, int Length);
I am calling this method from the following code:
//some.c
extern "C" __declspec(dllexport) bool _cdecl OnDecryption(LPCTSTR stringKSN, LPCTSTR BDK){
    TDES_Decryption(m_Track1Buffer, m_cryptoKey, init_vector, len);
    return m_Track1Buffer;
}
The data type of m_Track1Buffer is BYTE m_Track1Buffer[1000].
Now I want to make some changes to the above method, i.e. I want to return the string in hex instead of bytes. How should I convert this m_Track1Buffer to a hex string?
As you have mentioned C++, here is an answer. <iomanip> is used to write ints in hex form into a stringstream.
#include <iomanip>
#include <sstream>
#include <string>
#include <cstdint> // for uint8_t

std::string hexStr(const uint8_t *data, int len)
{
    std::stringstream ss;
    ss << std::hex;
    for( int i(0) ; i < len; ++i )
        ss << std::setw(2) << std::setfill('0') << (int)data[i];
    return ss.str();
}
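For example, a quick usage sketch (the byte values are arbitrary):

#include <iostream>

int main()
{
    const uint8_t data[] = { 0xDE, 0xAD, 0xBE, 0xEF };
    std::cout << hexStr(data, (int)sizeof(data)) << std::endl; // prints "deadbeef"
}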
This code will convert a byte array of fixed size 100 into a hex string:
BYTE array[100];
char hexstr[201];
int i;
for (i = 0; i < (int)sizeof(array); i++) {
    sprintf(hexstr + i*2, "%02x", array[i]);
}
hexstr[i*2] = 0;
Here is a somewhat more flexible version (Use uppercase characters? Insert spaces between bytes?) that can be used with plain arrays and various standard containers:
#include <string>
#include <sstream>
#include <iomanip>
#include <cstdint> // for uint8_t in the examples below
#include <cassert> // for assert in the examples below
#include <vector>  // for the std::vector example below

template<typename TInputIter>
std::string make_hex_string(TInputIter first, TInputIter last, bool use_uppercase = true, bool insert_spaces = false)
{
    std::ostringstream ss;
    ss << std::hex << std::setfill('0');
    if (use_uppercase)
        ss << std::uppercase;
    while (first != last)
    {
        ss << std::setw(2) << static_cast<int>(*first++);
        if (insert_spaces && first != last)
            ss << " ";
    }
    return ss.str();
}
Example usage (plain array):
uint8_t byte_array[] = { 0xDE, 0xAD, 0xC0, 0xDE, 0x00, 0xFF };
auto from_array = make_hex_string(std::begin(byte_array), std::end(byte_array), true, true);
assert(from_array == "DE AD C0 DE 00 FF");
Example usage (std::vector):
// fill with values from the array above
std::vector<uint8_t> byte_vector(std::begin(byte_array), std::end(byte_array));
auto from_vector = make_hex_string(byte_vector.begin(), byte_vector.end(), false);
assert(from_vector == "deadc0de00ff");
Using stringstream, sprintf, and other such functions in a loop is simply not C++. It's horrible for performance, and functions like this usually get called a lot (unless you're just writing a few things into a log).
Here's one way of doing it.
Writing directly into the std::string's buffer is discouraged because a specific std::string implementation might behave differently, in which case this will not work, but we avoid one copy of the whole buffer this way:
#include <iostream>
#include <string>
#include <vector>
#include <cstdint> // for uint8_t
std::string bytes_to_hex_string(const std::vector<uint8_t> &input)
{
    static const char characters[] = "0123456789ABCDEF";

    // Zeroes out the buffer unnecessarily, can't be avoided for std::string.
    std::string ret(input.size() * 2, 0);

    // Hack... Against the rules but avoids copying the whole buffer.
    auto buf = const_cast<char *>(ret.data());

    for (const auto &oneInputByte : input)
    {
        *buf++ = characters[oneInputByte >> 4];
        *buf++ = characters[oneInputByte & 0x0F];
    }
    return ret;
}

int main()
{
    std::vector<uint8_t> bytes = { 34, 123, 252, 0, 11, 52 };
    std::cout << "Bytes to hex string: " << bytes_to_hex_string(bytes) << std::endl;
}
How about using the Boost library, like this (snippet taken from http://theboostcpplibraries.com/boost.algorithm):
#include <boost/algorithm/hex.hpp>
#include <vector>
#include <string>
#include <iterator>
#include <iostream>
using namespace boost::algorithm;
int main()
{
    std::vector<char> v{'C', '+', '+'};
    hex(v, std::ostream_iterator<char>{std::cout, ""});
    std::cout << '\n';

    std::string s = "C++";
    std::cout << hex(s) << '\n';

    std::vector<char> w{'4', '3', '2', 'b', '2', 'b'};
    unhex(w, std::ostream_iterator<char>{std::cout, ""});
    std::cout << '\n';

    std::string t = "432b2b";
    std::cout << unhex(t) << '\n';
}
I'm building an application that receives strings containing encoded Unicode at runtime via TCP; an example string would be "\u7cfb\u8eca\u4e21\uff1a\u6771\u5317 ...". I have the following, but unfortunately I can only benefit from it at compile time, due to "incomplete universal character name \u": the compiler expects 4 hexadecimal characters at compile time.
QString restoreUnicode(QString strText)
{
    QRegExp rx("\\\\u([0-9a-z]){4}");
    return strText.replace(rx, QString::fromUtf8("\u\\1"));
}
I'm seeking a solution that works at runtime. I could conceivably break these strings up and do some manipulation to convert the hexadecimal values after the "\u" delimiters into base 10 and then pass them into the constructor of a QChar, but I'm looking for a better way, if one exists, as I am very concerned about the time complexity incurred by such a method, and I am not an expert.
Does anyone have any solutions or tips?
You should decode the string yourself. Just find the Unicode entry (rx.indexIn(strText)), parse it (int result; std::istringstream iss(s); if (!(iss >> std::hex >> result).fail()) ...), and replace the original \\uXXXX substring with (wchar_t)result.
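Put together, a sketch of that approach might look like this (it handles BMP code points only, and details such as the hex-digit pattern and the skip-on-failure policy are my own assumptions, not part of the answer above):

#include <sstream>
#include <QtCore/QRegExp>
#include <QtCore/QString>

QString restoreUnicode(QString strText)
{
    QRegExp rx("\\\\u([0-9a-fA-F]{4})");
    int pos = 0;
    while ((pos = rx.indexIn(strText, pos)) != -1)
    {
        // Parse the four captured hex digits into a UTF-16 code unit.
        int result = 0;
        std::istringstream iss(rx.cap(1).toStdString());
        if (!(iss >> std::hex >> result).fail())
            strText.replace(pos, 6, QChar(static_cast<ushort>(result))); // 6 == strlen("\\uXXXX")
        else
            ++pos; // skip an unparsable match instead of looping forever
    }
    return strText;
}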
For closure, and for anyone who comes across this thread in the future, here is my initial solution (before optimising the scope of these variables). I'm not a fan of it, but it works given the unpredictable mix of Unicode and/or ASCII in the stream, which I have no control over (client only); although the Unicode presence is low, it is good to handle it instead of showing ugly \u1234 etc.
QString restoreUnicode(QString strText)
{
    QRegExp rxUnicode("\\\\u([0-9a-z]){4}");

    bool bSuccessFlag;
    int iSafetyOffset = 0;
    int iNeedle = strText.indexOf(rxUnicode, iSafetyOffset);

    while (iNeedle != -1)
    {
        QChar cCodePoint(strText.mid(iNeedle + 2, 4).toInt(&bSuccessFlag, 16));
        if ( bSuccessFlag )
            strText = strText.replace(strText.mid(iNeedle, 6), QString(cCodePoint));
        else
            iSafetyOffset = iNeedle + 1; // hop over a non-code-point to avoid locking up
        iNeedle = strText.indexOf(rxUnicode, iSafetyOffset);
    }
    return strText;
}
#include <assert.h>
#include <iostream>
#include <string>
#include <sstream>
#include <locale>
#include <codecvt>      // C++11
using namespace std;

int main()
{
    char const data[] = "\\u7cfb\\u8eca\\u4e21\\uff1a\\u6771\\u5317";
    istringstream stream( data );

    wstring ws;
    int code;
    char slashCh, uCh;
    while( stream >> slashCh >> uCh >> hex >> code )
    {
        assert( slashCh == '\\' && uCh == 'u' );
        ws += wchar_t( code );
    }

    cout << "Unicode code points:" << endl;
    for( auto it = ws.begin(); it != ws.end(); ++it )
    {
        cout << hex << 0 + *it << endl;
    }
    cout << endl;

    // The following is C++11 specific.
    cout << "UTF-8 encoding:" << endl;
    wstring_convert< codecvt_utf8< wchar_t > > converter;
    string const bytes = converter.to_bytes( ws );
    for( auto it = bytes.begin(); it != bytes.end(); ++it )
    {
        cout << hex << 0 + (unsigned char)*it << ' ';
    }
    cout << endl;
}