How can I decode HTML entities in C++? [duplicate]

How can I decode HTML entities in C++? [duplicate] - c++

I'm interested in unescaping text for example: \ maps to \ in C. Does anyone know of a good library?
As reference the Wikipedia List of XML and HTML Character Entity References.

For another open source reference in C to decoding these HTML entities you can check out the command line utility uni2ascii/ascii2uni. The relevant files are enttbl.{c,h} for entity lookup and putu8.c which down converts from UTF32 to UTF8.
uni2ascii

I wrote my own unescape code; very simplified, but does the job: pn_util.c

Function Description: Convert special HTML entities back to characters.
Need to do some modifications to fit your requirement.
char* HtmlSpecialChars_Decode(char* encodedHtmlSpecialEntities)
{
int encodedLen = 0;
int escapeArrayLen = 0;
static char decodedHtmlSpecialChars[TITLE_SIZE];
char innerHtmlSpecialEntities[MAX_CONFIG_ITEM_SIZE];
/* This mapping table can be extended if necessary. */
static const struct {
const char* encodedEntity;
const char decodedChar;
} entityToChars[] = {
{"<", '<'},
{">", '>'},
{"&", '&'},
{""", '"'},
{"'", '\''},
};
if(strchr(encodedHtmlSpecialEntities, '&') == NULL)
return encodedHtmlSpecialEntities;
memset(decodedHtmlSpecialChars, '\0', TITLE_SIZE);
memset(innerHtmlSpecialEntities, '\0', MAX_CONFIG_ITEM_SIZE);
escapeArrayLen = sizeof(entityToChars) / sizeof(entityToChars[0]);
strcpy(innerHtmlSpecialEntities, encodedHtmlSpecialEntities);
encodedLen = strlen(innerHtmlSpecialEntities);
for(int i = 0; i < encodedLen; i++)
{
if(innerHtmlSpecialEntities[i] == '&')
{
/* Potential encode char. */
char * tempEntities = innerHtmlSpecialEntities + i;
for(int j = 0; j < escapeArrayLen; j++)
{
if(strncmp(tempEntities, entityToChars[j].encodedEntity, strlen(entityToChars[j].encodedEntity)) == 0)
{
int index = 0;
strncat(decodedHtmlSpecialChars, innerHtmlSpecialEntities, i);
index = strlen(decodedHtmlSpecialChars);
decodedHtmlSpecialChars[index] = entityToChars[j].decodedChar;
if(strlen(tempEntities) > strlen(entityToChars[j].encodedEntity))
{
/* Not to the end, continue */
char temp[MAX_CONFIG_ITEM_SIZE] = {'\0'};
strcpy(temp, tempEntities + strlen(entityToChars[j].encodedEntity));
memset(innerHtmlSpecialEntities, '\0', MAX_CONFIG_ITEM_SIZE);
strcpy(innerHtmlSpecialEntities, temp);
encodedLen = strlen(innerHtmlSpecialEntities);
i = -1;
}
else
encodedLen = 0;
break;
}
}
}
}
if(encodedLen != 0)
strcat(decodedHtmlSpecialChars, innerHtmlSpecialEntities);
return decodedHtmlSpecialChars;
}

QString UNESC(const QString &txt) {
QStringList bld;
static QChar AMP = '&', SCL = ';';
static QMap<QString, QString> dec = {
{"<", "<"}, {">", ">"}
, {"&", "&"}, {""", R"(")"}, {"'", "'"} };
if(!txt.contains(AMP)) { return txt; }
int bgn = 0, pos = 0;
while((pos = txt.indexOf(AMP, pos)) != -1) {
int end = txt.indexOf(SCL, pos)+1;
QString val = dec[txt.mid(pos, end - pos)];
bld << txt.mid(bgn, pos - bgn);
if(val.isEmpty()) {
end = txt.indexOf(AMP, pos+1);
bld << txt.mid(pos, end - pos);
} else {
bld << val;
}// else // if(val.isEmpty())
bgn = end; pos = end;
}// while((pos = txt.indexOf(AMP, pos)) != -1)
return bld.join(QString());
}// UNESC

Related

Get a substring using two char* positionned in a C string

Let long_text, keyword1 and keyword2 be three char* pointers. _keyword1_ and _keyword2_ being two substrings of long_text. Using strstr(long_text, keyword1) I can get a char* which points to the first occurrence of keyword1 in long_text, and using strstr(long_text, keyword2) I can get a char* which points to the first occurrence of keyword2 in long_text. keyword1 and keyword2 do not overlap.
Is there a way to extract a substring from long_text representing the string between keyword1 and keyword2 using the two char* obtained from strstr()?
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
int main(){
char* long_text = "this is keyword1 and this is keyword2 in long_text";
char* keyword1 = "keyword1";
char* keyword2 = "keyword2";
char* k1_start = strstr(long_text, keyword1);
char* k2_start = strstr(long_text, keyword2);
// TODO Be able to print " and this is "
return 0;
}

This is the part you are missing
// Move k1_start to end of keyword1
k1_start += strlen(keyword1);
// Copy text from k1_start to k2_start
char sub_string[32];
int len = k2_start - k1_start;
strncpy(sub_string, k1_start, len);
// Be sure to terminate the string
sub_string[len] = '\0';

Yeah..
This is C-like and uses char * and supporting char array.
#include <stdio.h>
#include <string.h>
int main(void) {
char* long_text = "key1(text)key2";
char* keyword1 = "key1";
char* keyword2 = "key2";
char* k1 = strstr(long_text, keyword1);
char* k2 = strstr(long_text, keyword2);
// from first char of match up to first char of second match
char text[strlen(k1) - strlen(k2)];
int len = (int)strlen(k1);
for (int i = 0;; i++) {
text[i] = *k1; k1++;
if (i == (len - strlen(k2))) {
text[len - strlen(k2)] = '\0';
break;
}
}
char* res;
//We have now only keyword1 + middle part, compare until diff.,
//then remember position and just iterate from to it later.
int j = 0;
for (int i = 0;; i++) {
if (*keyword1 == text[i]) {
j = i;
keyword1++;
} else {
res = &text[++j];
break;
}
}
printf("%s\n", res);
return 0;
}

void look_for_middle(const char *haystack,
const char *needle1, const char *needle2) {
const char *start; /* start of region between keywords */
const char *end; /* end of region between keywords */
const char *pos1; /* match needle1 within haystack */
const char *pos2; /* match needle2 within haystack */
int length; /* length of region between needles */
/* Look for needles in haystack. */
pos1 = strstr(haystack, needle1);
pos2 = strstr(haystack, needle2);
if (pos1 != NULL && pos2 != NULL) {
/* Both needles were found. */
if (pos1 < pos2) {
/* needle1 appears before needle2 */
start = pos1 + strlen(needle1);
end = pos2;
} else {
/* needle2 appears before needle1 */
start = pos2 + strlen(needle2);
end = pos1;
}
length = end - start;
} else {
/* One or more needles were not found. */
start = NULL;
end = NULL;
length = 0;
}
/* Report result. */
if (start != NULL) {
/* Both needles were found. */
if (length < 0) {
printf("Needles overlap\n");
} else {
/*
* In this printf, the "precision" of the string
* is set so that it only prints the portion between
* the needles.
*/
printf("Middle pos %d len %d: %.*s\n",
(int)(start - haystack), length, length, start);
}
} else {
if (pos1 == NULL) {
printf("Needle 1 not found\n");
}
if (pos2 == NULL) {
printf("Needle 2 not found\n");
}
}
}

This function can do your job . .
char* strInner(char *long_text , char *keyword1 , char* keyword2)
{
char * a = strstr(long_text,keyword1);
a+= strlen(keyword1);
if(a==NULL)
{
cout<<"Keyword1 didn't found ! ";
return a ;
}
char * b = strstr(a,keyword2);
if(b==NULL)
{
cout<<"Keyword2 didn't found Or, found before Keyword1 ! ";
return b ;
}
char *inner = (char*)malloc(strlen(a)-strlen(b));
memcpy(inner,a,strlen(a)-strlen(b) );
return inner ;
}

Using a loop with std::strcmp to load lots of settings

In my game I keep track of unlocked levels with a vector std::vector<bool> lvlUnlocked_;.
The simple function to save the progress is this:
void save() {
std::stringstream ss;
std::string stringToSave = "";
std::ofstream ofile("./progress.txt");
if (ofile.good()) {
ofile.clear();
for (std::size_t i = 0; i < levelUnlocked_.size(); ++i) {
ss << "lvl" << i << "=" << (lvlUnlocked_.at(i) ? "1" : "0") << std::endl;
}
stringToSave = ss.str();
ofile << stringToSave;
ofile.close();
}
}
This works and is nice since I can just use a loop to dump the info.
Now to the part where I am stuck, the lower part of my load function (see comment in code below):
void load() {
std::ifstream ifile("./progress.txt");
if (ifile.good()) {
int begin;
int end;
std::string line;
std::string stringKey = "";
std::string stringValue = "";
unsigned int result;
while (std::getline(ifile, line)) {
stringKey = "";
stringValue = "";
for (unsigned int i = 0; i < line.length(); i++) {
if (line.at(i) == '=') {
begin = i + 1;
end = line.length();
break;
}
}
for (int i = 0; i < begin - 1; i++) {
stringKey += line.at(i);
}
for (int i = begin; i < end; i++) {
stringValue += line.at(i);
}
result = static_cast<unsigned int>(std::stoi(stringValue));
// usually I now compare the value and act accordingly, like so:
if (std::strcmp(stringKey.c_str(), "lvl0") == 0) {
lvlUnlocked_.at(0) = true;
} else if (std::strcmp(stringKey.c_str(), "lvl1") == 0) {
lvlUnlocked_.at(1) = true;
} else if (std::strcmp(stringKey.c_str(), "lvl2") == 0) {
lvlUnlocked_.at(2) = true;
}
// etc....
}
}
}
This works fine, but...
the problem is that I have 100+ levels and I want it to be dynamic based on the size of my lvlUnlocked_ vector instead of having to type it all like in the code above.
Is there a way to somehow make use of a loop like in my save function to check all levels?

If you parse your key to extract a suitable integer value, you can just index into the bit-vector with that:
while (std::getline(ifile, line)) {
const size_t eq = line.find('=');
if (eq == std::string::npos)
// no equals sign
continue;
auto stringKey = line.substr(0, eq);
auto stringValue = line.substr(eq+1);
if (stringKey.substr(0,3) != "lvl")
// doesn't begin with lvl
continue;
// strip off "lvl"
stringKey = stringKey.substr(3);
size_t end;
std::vector<bool>::size_type index = std::stoi(stringKey, &end);
if (end == 0 || end != stringKey.length())
// not a valid level number
continue;
if (index >= lvlUnlocked_.size())
// out of range
continue;
// Set it :-)
lvlUnlocked_[index] = stringValue=="1";
}
(I've also updated your parsing for "key=value" strings to more idiomatic C++.)

How do I remove diacritics from a string in dynamics ax

I'm trying to convert some strings, I'd like to be able to remove diacritics from strinf. (Exemple : éùèà would become euea)
i have try this :
static str AALRemoveDiacritics( System.String input )
{
int i;
System.Text.NormalizationForm FormD;
str normalizedString = input.Normalize(FormD);
System.Text.StringBuilder stringBuilder = new System.Text.StringBuilder();
for (i = 0; i < strLen(normalizedString); i++)
{
System.Char c = normalizedString[i];
if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
{
stringBuilder.Append(c);
}
}
return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}

It looks like you tried making this post work in X++ and were very close.
Here's a working job I just wrote you can use:
static void AlexRemoveDiacritics(Args _args)
{
str strInput = 'ÁÂÃÄÅÇÈÉàáâãäåèéêëìíîïòóôõ£ALEX';
System.String input = strInput;
str retVal;
int i;
System.Char c;
System.Text.NormalizationForm FormD = System.Text.NormalizationForm::FormD;
str normalizedString = input.Normalize(FormD);
System.Text.StringBuilder stringBuilder = new System.Text.StringBuilder();
for (i = 0; i <= strLen(normalizedString); i++)
{
c = System.Char::Parse(subStr(normalizedString, i, 1));
if (System.Globalization.CharUnicodeInfo::GetUnicodeCategory(c) != System.Globalization.UnicodeCategory::NonSpacingMark)
{
stringBuilder.Append(c);
}
}
input = stringBuilder.ToString();
input = input.Normalize();
retVal = input;
info(strFmt("Before: '%1'", strInput));
info(strFmt("After: '%1'", retVal));
}

How to get string from xml in COM

I have one array like this:
static WCHAR FilesToShow[][100] = { { L"start.cmd" },{ L"image.xml" }, { L"xyz" }};
as you see that there is "xyz" which I have to replace with some unique name. For this I have to read image.xml file.
Please can you tell me how can I do this.
I wrote a method like this:
PRIVATE WCHAR GetSystemName(WCHAR *pName)
{
WCHAR line;
wfstream in("image.xml");
WCHAR tmp;
bool begin_tag = false;
while (getline(in,line))
{
// strip whitespaces from the beginning
for (int i = 0; i < line.length(); i++)
{
if (line[i] == ' ' && tmp.size() == 0)
{
}
else
{
tmp += line[i];
}
}
if (wcswcs(tmp,"<SystemPath>") != NULL)
{
???????? how to get "vikash" from here <SystemPath>C:\Users\rs_user\Documents\RobotStudio\Systems\vikash</SystemPath>
}
else
{
continue;
}
}
return tmp;
}
I'm getting exception for wfstream, getline and line.length() method.
I have included fstream.h header file but I think It's not supported in COM.
Please help me how to solve this issue without parsing xml file.

If your xml-file is simple enough so that there is only a single tag with given name, you could do it like this:
#include <string>
#include <sstream>
#include <iostream>
std::wstring get_value(std::wistream & in, std::wstring const & tagname)
{
std::wstring text = std::wstring(std::istreambuf_iterator<std::wstring::value_type>(in),
std::istreambuf_iterator<std::wstring::value_type>());
std::wstring start_tag = L"<" + tagname + L">";
std::wstring end_tag = L"</" + tagname + L">";
std::wstring::size_type start = text.find(start_tag);
if (start == std::wstring::npos)
{
throw 123;
}
start += start_tag.length();
std::wstring::size_type end = text.find(end_tag);
if (end == std::wstring::npos)
{
throw 123;
}
return text.substr(start, end - start);
}
std::wstring get_substr_after(std::wstring const & str, wchar_t delim)
{
std::wstring::size_type pos = str.rfind(delim);
if (pos == std::wstring::npos)
{
throw 123;
}
return str.substr(pos + 1);
}
void stackoverflow()
{
std::wstring text(L"<foo>\n<bar>abc/def/ghi</bar>\n<baz>123/456/789</baz>\n</foo>\n");
std::wistringstream wiss(text);
std::wcout << text << std::endl;
std::wcout << get_substr_after(get_value(wiss, std::wstring(L"bar")), L'/') << std::endl;
}
The output of this program is:
<foo>
<bar>abc/def/ghi</bar>
<baz>123/456/789</baz>
</foo>
ghi
I hope that answered your question.

you have several issues here.
what you are getting are compiler errors and not exceptions
the header file to include is 'fstream' not 'fstream.h'.
make sure you have a line saying using namespace std;
You are declaring line as a variable of type WCHAR, so it is a single wide character, which surely is not a wstring object. Therefore line.length() is incorrect.
Why are you mixing C (wcswcs()) and C++ (STL) ? maybe you should re-design your function signature.
However, try the below function. I have modified the signature to return a pointer to WCHAR, and place the requested string in the buffer space provided by pName. I added a check to verify that the buffer is large enough to fit the name and the terminating NULL character.
WCHAR* GetSystemName(WCHAR *pName, size_t buflen)
{
wstring line;
wifstream in("image.xml");
WCHAR* tmp = NULL;
while (getline(in,line))
{
// strip whitespaces from the beginning
size_t beg_non_whitespace = line.find_first_not_of(L" \t");
if (beg_non_whitespace != wstring::npos)
{
line = line.substr( beg_non_whitespace );
}
size_t beg_system_path = line.find( L"<SystemPath>" );
if ( beg_system_path != wstring::npos )
{
// strip the tags (assuming closing tag is present)
size_t beg_data = beg_system_path + wstring( L"<SystemPath>" ).length();
size_t range = line.find( L"</SystemPath>" ) - beg_data;
line = line.substr( beg_data, range );
// get file name
size_t pos_last_backslash = line.find_last_of( L'\\' );
if ( pos_last_backslash != wstring::npos )
{
line = line.substr( pos_last_backslash + 1 );
if ( buflen <= line.length() )
{
// ERROR: pName buffer is not large enough to fit the string + terminating NULL character.
return NULL;
}
wcscpy( pName, line.c_str() );
tmp = pName;
break;
}
}
}
return tmp;
}
EDIT: Moreover, if you are using and/or parsing XML in other areas of your program, I strongly suggest using an XML parsing library such as Xerces-C or libXml2.

Thank you all for your answer. Here I got solution of my question.
PRIVATE WCHAR* GetNewSystemName()
{
WCHAR line[756];
WCHAR tempBuffer[100];
CComBSTR path = CurrentFolder.Path();
CComBSTR imagePath1 = L"rimageinfo.xml";
path.AppendBSTR(imagePath1);
std::wfstream in(path);
WCHAR tmp[756];
in.getline(line, 756);
WCHAR* buffer;
buffer = wcswcs(line, L"<SystemPath>");
WCHAR *dest = wcsstr(buffer, L"</SystemPath>");
int pos;
pos = dest - buffer;
unsigned int i = 0;
if (wcswcs(buffer,L"<SystemPath>") != NULL && wcswcs(buffer,L"</SystemPath>") != NULL)
{
for (; i < pos; i++)
{
if (buffer[i] == ' ' && sizeof(tmp) == 0)
{
}
else
{
tmp[i] = buffer[i];
}
}
tmp[i] = NULL;
//break;
}
int j = i;
for (; j > 0; j--)
{
if (tmp[j] == '\\')
{
break;
}
}
j++;
int k = 0;
for (; j < i ; j++)
{
System_Name[k] = tmp[j];
k++;
}
System_Name[k] = NULL;
return System_Name;

Strings with whitespace in a list?

I have this function sentanceParse with a string input which returns a list. The input might be something like "Hello my name is Anton. What's your name?" and then the return value would be a list containing "Hello my name is Anton" and "What's your name?". However, this is not what happens. It seems as if the whitespaces in the sentences are treated like a separator and therefore the return is rather "Hello", "my", "name" etc instead of what I expected.
How would you propose I solve this?
As I am not a 100% sure the problem does not lie within my code, I will add that to the post as well:
Main:
list<string> mylist = sentanceParse(textCipher);
list<string>::iterator it;
for(it = mylist.begin(); it != mylist.end(); it++){
textCipher = *it;
cout << textCipher << endl; //This prints out the words separately instead of the entire sentances.
sentanceParse:
list<string> sentanceParse(string strParse){
list<string> strList;
int len = strParse.length();
int pos = 0;
int count = 0;
for(int i = 0; i < len; i++){
if(strParse.at(i) == '.' || strParse.at(i) == '!' || strParse.at(i) == '?'){
if(i < strParse.length() - 1){
while(i < strParse.length() - 1 && (strParse.at(i+1) == '.' || strParse.at(i+1) == '!' || strParse.at(i+1) == '?')){
if(strParse.at(i+1) == '?'){
strParse.replace(i, 1, "?");
}
strParse.erase(i+1, 1);
len -= 1;
}
}
char strTemp[2000];
int lenTemp = strParse.copy(strTemp, i - pos + 1, pos);
strTemp[lenTemp] = '\0';
std::string strAdd(strTemp);
strList.push_back(strAdd);
pos = i + 1;
count ++;
}
}
if(count == 0){
strList.push_back(strParse);
}
return strList;
}

Your implementation of sentence parse is wrong, here is a simpler correct solution.
std::list<std::string> sentence_parse(const std::string &str){
std::string temp;
std::list<std::string> t;
for(int x=0; x<str.size();++x){
if(str[x]=='.'||str[x]=='!'||str[x]=='?'){
if(temp!="")t.push_back(temp);//Handle special case of input with
//multiple punctuation Ex. Hi!!!!
temp="";
}else temp+=str[x];
}
return t;
}
EDIT:
Here is a full example program using this function. Type some sentences in your console, press enter and it will spit the sentences out with a newline separating them instead of punctuation.
#include <iostream>
#include <string>
#include <list>
std::list<std::string> sentence_parse(const std::string &str){
std::string temp;
std::list<std::string> t;
for(int x=0; x<str.size();++x){
if(str[x]=='.'||str[x]=='!'||str[x]=='?'){
if(temp!="")t.push_back(temp);//Handle special case of input with
//multiple punctuation Ex. Hi!!!!
temp="";
}else temp+=str[x];
}
return t;
}
int main (int argc, const char * argv[])
{
std::string s;
while (std::getline(std::cin,s)) {
std::list<std::string> t= sentence_parse(s);
std::list<std::string>::iterator x=t.begin();
while (x!=t.end()) {
std::cout<<*x<<"\n";
++x;
}
}
return 0;
}

// This function should be easy to adapt to any basic libary
// this is in Windows MFC
// pass in a string, a char and a stringarray
// returns an array of strings using char as the separator
void tokenizeString(CString theString, TCHAR theToken, CStringArray *theParameters)
{
CString temp = "";
int i = 0;
for(i = 0; i < theString.GetLength(); i++ )
{
if (theString.GetAt(i) != theToken)
{
temp += theString.GetAt(i);
}
else
{
theParameters->Add(temp);
temp = "";
}
if(i == theString.GetLength()-1)
theParameters->Add(temp);
}
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

How can I decode HTML entities in C++? [duplicate] - c++

I'm interested in unescaping text for example: \ maps to \ in C. Does anyone know of a good library? As reference the Wikipedia List of XML and HTML Character Entity References.

For another open source reference in C to decoding these HTML entities you can check out the command line utility uni2ascii/ascii2uni. The relevant files are enttbl.{c,h} for entity lookup and putu8.c which down converts from UTF32 to UTF8. uni2ascii

I wrote my own unescape code; very simplified, but does the job: pn_util.c

Related

Get a substring using two char* positionned in a C string

Using a loop with std::strcmp to load lots of settings

How do I remove diacritics from a string in dynamics ax

How to get string from xml in COM

Strings with whitespace in a list?

Categories

Resources