why its going in infinite loop? - c++

I have a class that takes a html file and formats it. Here is my code.
void FormatHtml::Format(const std::string &formattedFile, const std::string &inputFile) const
{
string str;
ifstream inputfileObj(inputFile.c_str());
//ofstream formattedFileObj(formattedFile.c_str());
if(inputfileObj.is_open() /*&& formattedFileObj.is_open()*/)
{
while(inputfileObj.good())
{
getline(inputfileObj,str);
//cout<<str<<endl;
//formattedFileObj<<str;
int pos = str.find(">");
int pos3;
while(pos != string::npos)
{
pos3 = str.find("<",pos);
if(str.length() >= pos3+1)
{
if(str.at(pos3+1) == '/')
{
pos = str.find(">",pos3);
}
}
cout<<str.substr(0,pos+1)<<endl;
//formattedFileObj<<str.substr(0,pos+1)<<endl;
str = str.substr(pos+1,string::npos);
pos = str.find(">");
}
}
inputfileObj.close();
//formattedFileObj.close();
}
else
cout<<"could not open file";
}
}
but if i use this function with small file it works fyn, but for larger html file like google's home page source it goes to infinite loop.
following is call stack.
ntdll.dll!76f99a94()
[Frames below may be incorrect and/or missing, no symbols loaded for ntdll.dll]
ntdll.dll!76f98d94()
ntdll.dll!76fa9522()
kernel32.dll!7588cb6c()
kernel32.dll!7588cbfc()
kernel32.dll!7588c964()
msvcr90d.dll!_write_nolock(int fh=14548992, const void * buf=0x77004cc0, unsigned int cnt=4074376) Line 335 + 0x3c bytes C
ffffffff()
And when i pause the execution it always stops in one file called write.c and at following code:
/* write the lf buf and update total */
if ( WriteFile( (HANDLE)_osfhnd(fh),
lfbuf,
(int)(q - lfbuf),
(LPDWORD)&written,
NULL) )
{
charcount += written;
if (written < q - lfbuf)
break;
}
Any one having clue what could be the reason, why it always happens with large unformatted file.

This line:
pos = str.find(">",pos3);
If pos == string::npos, then you carry on to do this:
str = str.substr(pos+1,string::npos);
pos = str.find(">");
string::npos == -1, so pos+1 == 0, so str.substr returns all of str. You are now in an infinite loop.

Related

How can I speed up parsing of large strings?

So I've made a program that reads in various config files. Some of these config files can be small, some can be semi-large (largest one is 3,844 KB).
The read in file is stored in a string (in the program below it's called sample).
I then have the program extract information from the string based on various formatting rules. This works well, the only issue is that when reading larger files it is very slow....
I was wondering if there was anything I could do to speed up the parsing or if there was an existing library that does what I need (extract string up until a delimiter & extract string string in between 2 delimiters on the same level). Any assistance would be great.
Here's my code & a sample of how it should work...
#include "stdafx.h"
#include <string>
#include <vector>
std::string ExtractStringUntilDelimiter(
std::string& original_string,
const std::string& delimiter,
const int delimiters_to_skip = 1)
{
std::string needle = "";
if (original_string.find(delimiter) != std::string::npos)
{
int total_found = 0;
auto occurance_index = static_cast<size_t>(-1);
while (total_found != delimiters_to_skip)
{
occurance_index = original_string.find(delimiter);
if (occurance_index != std::string::npos)
{
needle = original_string.substr(0, occurance_index);
total_found++;
}
else
{
break;
}
}
// Remove the found string from the original string...
original_string.erase(0, occurance_index + 1);
}
else
{
needle = original_string;
original_string.clear();
}
if (!needle.empty() && needle[0] == '\"')
{
needle = needle.substr(1);
}
if (!needle.empty() && needle[needle.length() - 1] == '\"')
{
needle.pop_back();
}
return needle;
}
void ExtractInitialDelimiter(
std::string& original_string,
const char delimiter)
{
// Remove extra new line characters
while (!original_string.empty() && original_string[0] == delimiter)
{
original_string.erase(0, 1);
}
}
void ExtractInitialAndFinalDelimiters(
std::string& original_string,
const char delimiter)
{
ExtractInitialDelimiter(original_string, delimiter);
while (!original_string.empty() && original_string[original_string.size() - 1] == delimiter)
{
original_string.erase(original_string.size() - 1, 1);
}
}
std::string ExtractStringBetweenDelimiters(
std::string& original_string,
const std::string& opening_delimiter,
const std::string& closing_delimiter)
{
const size_t first_delimiter = original_string.find(opening_delimiter);
if (first_delimiter != std::string::npos)
{
int total_open = 1;
const size_t opening_index = first_delimiter + opening_delimiter.size();
for (size_t i = opening_index; i < original_string.size(); i++)
{
// Check if we have room for opening_delimiter...
if (i + opening_delimiter.size() <= original_string.size())
{
for (size_t j = 0; j < opening_delimiter.size(); j++)
{
if (original_string[i + j] != opening_delimiter[j])
{
break;
}
else if (j == opening_delimiter.size() - 1)
{
total_open++;
}
}
}
// Check if we have room for closing_delimiter...
if (i + closing_delimiter.size() <= original_string.size())
{
for (size_t j = 0; j < closing_delimiter.size(); j++)
{
if (original_string[i + j] != closing_delimiter[j])
{
break;
}
else if (j == closing_delimiter.size() - 1)
{
total_open--;
}
}
}
if (total_open == 0)
{
// Extract result, and return it...
std::string needle = original_string.substr(opening_index, i - opening_index);
original_string.erase(first_delimiter, i + closing_delimiter.size());
// Remove new line symbols
ExtractInitialAndFinalDelimiters(needle, '\n');
ExtractInitialAndFinalDelimiters(original_string, '\n');
return needle;
}
}
}
return "";
}
int main()
{
std::string sample = "{\n"
"Line1\n"
"Line2\n"
"{\n"
"SubLine1\n"
"SubLine2\n"
"}\n"
"}";
std::string result = ExtractStringBetweenDelimiters(sample, "{", "}");
std::string LineOne = ExtractStringUntilDelimiter(result, "\n");
std::string LineTwo = ExtractStringUntilDelimiter(result, "\n");
std::string SerializedVector = ExtractStringBetweenDelimiters(result, "{", "}");
std::string SubLineOne = ExtractStringUntilDelimiter(SerializedVector, "\n");
std::string SubLineTwo = ExtractStringUntilDelimiter(SerializedVector, "\n");
// Just for testing...
printf("LineOne: %s\n", LineOne.c_str());
printf("LineTwo: %s\n", LineTwo.c_str());
printf("\tSubLineOne: %s\n", SubLineOne.c_str());
printf("\tSubLineTwo: %s\n", SubLineTwo.c_str());
system("pause");
}
Use string_view or a hand rolled one.
Don't modify the string loaded.
original_string.erase(0, occurance_index + 1);
is code smell and going to be expensive with a large original string.
If you are going to modify something, do it in one pass. Don't repeatedly delete from the front of it -- that is O(n^2). Instead, procceed along it and shove "finished" stuff into an output accumulator.
This will involve changing how your code works.
You're reading your data into a string. "Length of string" should not be a problem. So far, so good...
You're using "string.find().". That's not necessarily a bad choice.
You're using "string.erase()". That's probably the main source of your problem.
SUGGESTIONS:
Treat the original string as "read-only". Don't call erase(), don't modify it.
Personally, I'd consider reading your text into a C string (a text buffer), then parsing the text buffer, using strstr().
Here is a more efficient version of ExtractStringBetweenDelimiters. Note that this version does not mutate the original buffer. You would perform subsequent queries on the returned string.
std::string trim(std::string buffer, char what)
{
auto not_what = [&what](char ch)
{
return ch != what;
};
auto first = std::find_if(buffer.begin(), buffer.end(), not_what);
auto last = std::find_if(buffer.rbegin(), std::make_reverse_iterator(first), not_what).base();
return std::string(first, last);
}
std::string ExtractStringBetweenDelimiters(
std::string const& buffer,
const char opening_delimiter,
const char closing_delimiter)
{
std::string result;
auto first = std::find(buffer.begin(), buffer.end(), opening_delimiter);
if (first != buffer.end())
{
auto last = std::find(buffer.rbegin(), std::make_reverse_iterator(first),
closing_delimiter).base();
if(last > first)
{
result.assign(first + 1, last);
result = trim(std::move(result), '\n');
}
}
return result;
}
If you have access to string_view (c++17 for std::string_view or boost::string_view) you could return one of these from both functions for extra efficiency.
It's worth mentioning that this method of parsing a structured file is going to cause you problems down the line if any of the serialised strings contains a delimiter, such as a '{'.
In the end you'll want to write or use someone else's parser.
The boost::spirit library is a little complicated to learn, but creates very efficient parsers for this kind of thing.

C++ fseek doesn't set to same position

I need to return to a previous position in a file that I'm reading after a certain event. The file is opened in text read mode. So I get the read position of the file before reading the needed segment and after that I'm calling ftell to return me to that position. The problem is that it doesn't go to the position that I specified, it instead jumps ahead by two places.
To get the current position I'm using this simplified function.
I'm not sure if the fflush is necessary but it seems to make no difference in the result.
long GetCurPos(void)
{
return ftell(file);
}
To open the file I'm simply using
void Open(void)
{
file = fopen(path, "r");
}
To return to a previous position I'm using this simplified function.
It return a zero value.
void SetPos(long pos) const
{
if (fseek(file, pos, SEEK_SET)) LOG("ferror %d occured!", ferror(file));
}
And finally for function that causes the problems
bool TryPeekWord(char * value, size_t maxLen) const
{
bool result = false;
long start = GetCurPos();
int c;
size_t len;
for (len = 0; len < maxLen; len++)
{
c = fgetc(file);
if (c == EOF)
{
*(value + len) = '\0';
result = len > 0;
break;
}
if ((c == ' ' || c == '\n') && len > 0)
{
*(value + len) = '\0';
result = true;
break;
}
*(value + len) = static_cast<char>(c);
}
SetPos(start);
return result;
}
If I call fseek again after the SetPos call it will be the same value as start but if I then read from the file it will not return the expected characters. What am I doing wrong?
Over the course of improving my question to a proper standard I found that the file I was working on got corrupt. Maybe from moving to different operating system or something?
I copied over the contents of the old file into a new file and it worked fine again.

C++ Access violation writing location 0x000A000B

I'm building this webcrawler here. This error occurs to me when I start debugging and sends me to memcpy.asm or xstring or dbgdel.cpp files showing me different lines of these files every time.
I was wondering if the code is wrong somehow. I started thinking I am accessing memory blocks that I shouldn't. Here is some code. I hope you can help.
The idea is to iterate through httpContent and get all the URLs from the <a> tags. I am looking for href=" in the beginning and then for the next ". What is in between I am trying to put in temp, then pass the content of temp to an array of strings.
struct Url
{
string host;
string path;
};
int main(){
struct Url website;
string href[100];
website.host = "crawlertest.cs.tu-varna.bg";
website.path = "";
string httpContent = downloadHTTP(website);
for(unsigned int i = 0; i <= httpContent.length()-7; i++){
char c = httpContent[i];
if(c == 'h'){
c = httpContent[i+1];
if(c == 'r'){
c = httpContent[i+2];
if(c == 'e'){
c = httpContent[i+3];
if(c == 'f'){
c = httpContent[i+4];
if(c == '='){
c = httpContent[i+5];
if(c == '\"'){
i+=6;
c = httpContent[i];
string temp = "";
while(c!='\"'){
i++;
c = httpContent[i];
temp+= c;
}
href[i] = temp;
temp = "";
cout<<href[i]<<endl;
}}}}}}
}
system("pause");
return 0;
}
UPDATE
I edited the =, now ==
I am also stopping the iterations 7 positions earlier so the 'if's should not be problem.
I am getting the same errors though.
Use std::vector< std::string > href; to store your result.
With string::find you can find sequence in strings and with string::substr you can extract them from string.
#include <vetor>
#include <string>
struct Url
{
string host;
string path;
};
int main(){
struct Url website;
website.host = "crawlertest.cs.tu-varna.bg";
website.path = "";
std::string httpContent = downloadHTTP(website);
std::vector< std::string > href;
std::size_t pos = httpContent.find("href="); // serach for first "href="
while ( pos != string::npos )
{
pos = httpContent.find( '"', pos+5 ); // serch for '"' at start
if ( pos != string::npos )
{
std::size_t posSt = pos + 1;
pos = httpContent.find( '"', posSt ); // search for '"' at end
if ( pos != string::npos )
{
href.push_back( httpContent.substr( posSt, pos - posSt ) ); // extract ref and append to result
pos = httpContent.find( "href=", pos+1 ); // search for next "href="
}
}
}
system("pause");
return 0;
}

How to get string from xml in COM

I have one array like this:
static WCHAR FilesToShow[][100] = { { L"start.cmd" },{ L"image.xml" }, { L"xyz" }};
as you see that there is "xyz" which I have to replace with some unique name. For this I have to read image.xml file.
Please can you tell me how can I do this.
I wrote a method like this:
PRIVATE WCHAR GetSystemName(WCHAR *pName)
{
WCHAR line;
wfstream in("image.xml");
WCHAR tmp;
bool begin_tag = false;
while (getline(in,line))
{
// strip whitespaces from the beginning
for (int i = 0; i < line.length(); i++)
{
if (line[i] == ' ' && tmp.size() == 0)
{
}
else
{
tmp += line[i];
}
}
if (wcswcs(tmp,"<SystemPath>") != NULL)
{
???????? how to get "vikash" from here <SystemPath>C:\Users\rs_user\Documents\RobotStudio\Systems\vikash</SystemPath>
}
else
{
continue;
}
}
return tmp;
}
I'm getting exception for wfstream, getline and line.length() method.
I have included fstream.h header file but I think It's not supported in COM.
Please help me how to solve this issue without parsing xml file.
If your xml-file is simple enough so that there is only a single tag with given name, you could do it like this:
#include <string>
#include <sstream>
#include <iostream>
std::wstring get_value(std::wistream & in, std::wstring const & tagname)
{
std::wstring text = std::wstring(std::istreambuf_iterator<std::wstring::value_type>(in),
std::istreambuf_iterator<std::wstring::value_type>());
std::wstring start_tag = L"<" + tagname + L">";
std::wstring end_tag = L"</" + tagname + L">";
std::wstring::size_type start = text.find(start_tag);
if (start == std::wstring::npos)
{
throw 123;
}
start += start_tag.length();
std::wstring::size_type end = text.find(end_tag);
if (end == std::wstring::npos)
{
throw 123;
}
return text.substr(start, end - start);
}
std::wstring get_substr_after(std::wstring const & str, wchar_t delim)
{
std::wstring::size_type pos = str.rfind(delim);
if (pos == std::wstring::npos)
{
throw 123;
}
return str.substr(pos + 1);
}
void stackoverflow()
{
std::wstring text(L"<foo>\n<bar>abc/def/ghi</bar>\n<baz>123/456/789</baz>\n</foo>\n");
std::wistringstream wiss(text);
std::wcout << text << std::endl;
std::wcout << get_substr_after(get_value(wiss, std::wstring(L"bar")), L'/') << std::endl;
}
The output of this program is:
<foo>
<bar>abc/def/ghi</bar>
<baz>123/456/789</baz>
</foo>
ghi
I hope that answered your question.
you have several issues here.
what you are getting are compiler errors and not exceptions
the header file to include is 'fstream' not 'fstream.h'.
make sure you have a line saying using namespace std;
You are declaring line as a variable of type WCHAR, so it is a single wide character, which surely is not a wstring object. Therefore line.length() is incorrect.
Why are you mixing C (wcswcs()) and C++ (STL) ? maybe you should re-design your function signature.
However, try the below function. I have modified the signature to return a pointer to WCHAR, and place the requested string in the buffer space provided by pName. I added a check to verify that the buffer is large enough to fit the name and the terminating NULL character.
WCHAR* GetSystemName(WCHAR *pName, size_t buflen)
{
wstring line;
wifstream in("image.xml");
WCHAR* tmp = NULL;
while (getline(in,line))
{
// strip whitespaces from the beginning
size_t beg_non_whitespace = line.find_first_not_of(L" \t");
if (beg_non_whitespace != wstring::npos)
{
line = line.substr( beg_non_whitespace );
}
size_t beg_system_path = line.find( L"<SystemPath>" );
if ( beg_system_path != wstring::npos )
{
// strip the tags (assuming closing tag is present)
size_t beg_data = beg_system_path + wstring( L"<SystemPath>" ).length();
size_t range = line.find( L"</SystemPath>" ) - beg_data;
line = line.substr( beg_data, range );
// get file name
size_t pos_last_backslash = line.find_last_of( L'\\' );
if ( pos_last_backslash != wstring::npos )
{
line = line.substr( pos_last_backslash + 1 );
if ( buflen <= line.length() )
{
// ERROR: pName buffer is not large enough to fit the string + terminating NULL character.
return NULL;
}
wcscpy( pName, line.c_str() );
tmp = pName;
break;
}
}
}
return tmp;
}
EDIT: Moreover, if you are using and/or parsing XML in other areas of your program, I strongly suggest using an XML parsing library such as Xerces-C or libXml2.
Thank you all for your answer. Here I got solution of my question.
PRIVATE WCHAR* GetNewSystemName()
{
WCHAR line[756];
WCHAR tempBuffer[100];
CComBSTR path = CurrentFolder.Path();
CComBSTR imagePath1 = L"rimageinfo.xml";
path.AppendBSTR(imagePath1);
std::wfstream in(path);
WCHAR tmp[756];
in.getline(line, 756);
WCHAR* buffer;
buffer = wcswcs(line, L"<SystemPath>");
WCHAR *dest = wcsstr(buffer, L"</SystemPath>");
int pos;
pos = dest - buffer;
unsigned int i = 0;
if (wcswcs(buffer,L"<SystemPath>") != NULL && wcswcs(buffer,L"</SystemPath>") != NULL)
{
for (; i < pos; i++)
{
if (buffer[i] == ' ' && sizeof(tmp) == 0)
{
}
else
{
tmp[i] = buffer[i];
}
}
tmp[i] = NULL;
//break;
}
int j = i;
for (; j > 0; j--)
{
if (tmp[j] == '\\')
{
break;
}
}
j++;
int k = 0;
for (; j < i ; j++)
{
System_Name[k] = tmp[j];
k++;
}
System_Name[k] = NULL;
return System_Name;

read parameters from txt in Linux

I need to read parameters from a txt file for my program in Linux. But the result is that some of the parameters read from the txt file have the correct value, but some of them have a wrong value. Somebody has met this problem? I have translated the format of the txt in windows into Linux with the command dos2unix. I need your help, Thanks.
The read function is as follows:
template <class T>int ReadFileVar(ifstream *inClientFile, const char var_name[], T *var)
{
//inClientFile - pointer to the previously opened File stream
//var_name - contains the name of the variable
//var - pointer to a long, the function will return the value of the variable in this
int length_var_name = (int) strlen(var_name);
char line[512];
int i, j;
while (inClientFile->getline(line,512))
{
if (line[0] != '/' && line[1] != '/')
{
i = 0;
while (line[i] != '\0')
{
if (!strncmp(&line[i],var_name,length_var_name))
{
j = i + length_var_name;
while (line[j] != '\0')
{
if ( line[j] >= '0' && line[j] <= '9')
{
*var = (T) atof(&line[j]);
inClientFile->seekg( 0, ios_base::beg ); //back to the beginning of the file
return 0;
}
j++;
}
}
i++;
}
}
}
cerr << var_name << " - cannot be found" << endl;
throw "error reading input data from: ";
return 1; //the specified variable was not found in the file
}
For example:
the parameters in the txt are as follows:,the type of them are long,
nx=100;
ny=100;
nz=100;
ipro=1;
jpro=1;
kpro=1;
but after reading the txt in my program I get these,
nx=100;
ny=100;
nz=15;
ipro=1;
jpro=1;
kpro=100;
I have tested the program under Windows, there it works!
Your code works for me, you must have an error somewhere else or an undefined behavior I didn't spot.
May I suggest a more C++ way to do exactly the same thing :
template <class T>
T ReadFileVar(ifstream& inClientFile, string var_name)
{
string line;
while (getline(inClientFile, line))
{
if (line[0] != '/' && line[1] != '/')
{
size_t pos = line.find(var_name);
if( pos != string::npos) {
pos = line.find('=', pos + 1);
if(pos == string::npos) {
throw std::exception();
}
istringstream iss(line.substr(pos + 1));
T result;
iss >> result;
inClientFile.seekg( 0, ios_base::beg );
return result;
}
}
}
throw std::exception();
}
You could also parse the whole file and store the result in a map instead of searching the whole file for each variable :
map<string, string> ParseFile(ifstream& inClientFile) {
map<string, string> result;
string line;
while (getline(inClientFile, line))
{
if (line[0] != '/' && line[1] != '/')
{
size_t pos = line.find('=');
if(pos == string::npos) {
throw std::exception();
}
string var_name = line.substr(0, pos);
string var_value = line.substr(pos + 1);
result[var_name] = var_value;
}
}
return result;
}
template <class T>
T ReadVar(map<string, string> data, string var_name)
{
map<string, string>::iterator it = data.find(var_name);
if(it == data.end()) {
throw exception();
}
string value = it->second;
istringstream iss(value);
T result;
iss >> result;
return result;
}