Using C++ regex for multi match - c++

I want to parse relatively simple registry file format, let's assume it's plain ascii, saved in old REGEDIT4 format. I want to parse it using standard c++ regex class or function (preferably no boost). As an input data it could take for example sample file like this:
REGEDIT4
[HKEY_LOCAL_MACHINE\SOFTWARE\MyCompany\ConfigurationData\v1.0]
[HKEY_LOCAL_MACHINE\SOFTWARE\MyCompany\ConfigurationData\v1.0\General]
"SettingDword"=dword:00000009
"Setting1"="Some string 1"
"SettingString2"="my String"
[HKEY_LOCAL_MACHINE\SOFTWARE\MyCompany\ConfigurationData\v1.0\Networking]
"SettingDword2"=dword:00000002
"Setting2"="Some string 2"
"SettingString3"="my String2"
What I have briefly analyzed - scanning multiple [] sections can be done using, for example, the cregex_token_iterator class, but the main problem is that it works the opposite way from how I want to use it. I want to start by matching a pattern like this: regex re("(\\[.*?\\])"), but the token iterator returns all the strings which were not matched, which sounds kind of silly to me.
Basically I would like to match first whole section (\\[.*?\\])(.*?\n\n), and then pick up registry path first, and key-values next - then split using regex key-value pairs.
It's really incredible that in C# it's relatively easy to write regex matcher like this, but I would prefer go with C++, as it's native, does not have performance and assembly unload problems.

After further analysis: it is possible to use regex_search, but the search has to be repeated, each time continuing from the char* just past the previously found match.
Below is almost complete example to load .reg file at run-time, I'm using MFC's CString, because it's slightly easier to use than std::string and portability is not needed currently.
#include "stdafx.h"
#include <afx.h> //CFile
#include "TestRegex.h"
#include <fstream>
#include <string>
#include <regex>
#include <map>
CWinApp theApp;
using namespace std;
// Kinds of registry value this parser stores. Each enumerator takes the
// numeric value of the REG_* constant of the same name.
typedef enum
{
eREG_DWORD = REG_DWORD,
eREG_QWORD = REG_QWORD,
eREG_BINARY = REG_BINARY,
eREG_SZ = REG_SZ
}eRegType;
// One parsed registry value: a type tag plus the storage for its payload.
class RegVariant
{
public:
// Tells which of the members below holds the value.
eRegType type;
// Numeric payloads share storage; only one of them is meaningful at a time.
union
{
// Payload when type is eREG_DWORD.
DWORD dw;
// Payload when type is eREG_QWORD.
__int64 qw;
};
// Payload when type is eREG_SZ (text) or eREG_BINARY (raw bytes).
CStringA str;
};
// One key in the in-memory registry tree: its sub-keys and its own values.
class RegKeyNode
{
public:
// Paths to next nodes
map<CStringA, RegKeyNode> keyToNode;
// Values of current key
map<CStringA, RegVariant> keyValues;
};
// Root of the parsed tree, one node per registry root handle.
map<HKEY, RegKeyNode> g_registry;
// Translates a single hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f') into its
// numeric value 0..15. Every other character maps to 0.
int char2int(char input)
{
    const bool isDecimal = ('0' <= input) && (input <= '9');
    const bool isUpperHex = ('A' <= input) && (input <= 'F');
    const bool isLowerHex = ('a' <= input) && (input <= 'f');

    int digit = 0;
    if (isDecimal)
        digit = input - '0';
    else if (isUpperHex)
        digit = 10 + (input - 'A');
    else if (isLowerHex)
        digit = 10 + (input - 'a');
    return digit;
}
// Decodes text made of two-hex-digit bytes, each followed by one separator
// character except the last ("xx?xx?xx"), into raw bytes stored in `bin`.
//
//   hex     - NUL-terminated input text.
//   bin     - receives the decoded bytes; its length becomes the byte count.
//   maxSize - upper bound on the number of bytes decoded, or -1 for no limit.
void hexToBin( const char* hex, CStringA& bin, int maxSize = -1 )
{
    // Every byte occupies three characters except the last one, which has no
    // trailing separator - hence the "+ 1" before dividing.
    int size = (int)((strlen(hex) + 1) / 3);
    if (maxSize != -1 && size > maxSize)
        size = maxSize;

    unsigned char* buf = (unsigned char*)bin.GetBuffer(size);
    for (int i = 0; i < size; i++)
        buf[i] = (unsigned char)(char2int(hex[i * 3]) * 16 + char2int(hex[i * 3 + 1]));

    // Give the length explicitly. ReleaseBuffer() without an argument
    // re-measures the content as a NUL-terminated string, which would cut
    // binary data at its first zero byte (and scans a buffer that was never
    // terminated here).
    bin.ReleaseBuffer(size);
}
// Loads "test1.reg" into memory, splits it into "[path]" sections with
// regular expressions and stores every "name"=value pair found under the
// matching node of g_registry.
int main()
{
HMODULE hModule = ::GetModuleHandle(nullptr);
AfxWinInit(hModule, nullptr, ::GetCommandLine(), 0);
//
// Load .reg file.
//
CString fileName = L"test1.reg";
CStringA file;
CFile cfile;
if (cfile.Open(fileName, CFile::modeRead | CFile::shareDenyNone))
{
// The whole file is read in one call into the string's buffer.
int len = (int)cfile.GetLength();
cfile.Read(file.GetBuffer(len), len);
// NOTE(review): no explicit length is passed here, so the string length
// is presumably derived from a terminator in the buffer - confirm.
file.ReleaseBuffer();
}
// NOTE(review): Close() is reached even when Open() failed - confirm this is safe.
cfile.Close();
// Normalise line endings so the patterns below only have to deal with "\n".
file.Replace("\r\n", "\n");
const char* pbuf = file.GetBuffer();
// Group 1: text between '[' and ']'. Group 2: everything (lazily) up to the
// first "\n\n". A section that is not followed by "\n\n" does not match.
regex reSection("\\[(.*?)\\]([^]*?)\n\n");
// Group 1: quoted value name. Group 2: the text after '='.
regex reLine("^\\s*\"(.*?)\"\\s*=\\s*(.*)$");
// Group 1: type prefix (hex, dword or hex(b)). Group 2: the text after ':'.
regex reTypedValue("^(hex|dword|hex\\(b\\)):(.*)$");
// Group 1: the contents of a fully quoted string value.
regex reStringValue("^\"(.*)\"$" );
cmatch cmSection, cmLine;
//
// For each section:
//
// [registry path]
// "value1"="value 1"
// "value2"="value 1"
//
// regex_search only reports the first match, so the search is repeated from
// pbuf, which is advanced past each processed section at the end of the loop.
while( regex_search(pbuf, pbuf + strlen(pbuf), cmSection, reSection) )
{
CStringA path = cmSection[1].str().c_str();
string key_values = cmSection[2].str();
const char* pkv = key_values.c_str();
int iPath = 0;
// The first path component selects the registry root (compared in upper case).
CStringA hkeyName = path.Tokenize("\\", iPath).MakeUpper();
RegKeyNode* rnode;
if( hkeyName.Compare("HKEY_LOCAL_MACHINE") == 0 )
rnode = &g_registry[HKEY_LOCAL_MACHINE];
else
rnode = &g_registry[HKEY_CURRENT_USER]; // Don't support other HKEY roots.
//
// Locate path where to place values.
//
// operator[] on the map creates missing intermediate nodes on the way down.
for( ; hkeyName = path.Tokenize("\\", iPath); )
{
if( hkeyName.IsEmpty() )
break;
rnode = &rnode->keyToNode[hkeyName];
}
//
// Scan "key"="value" pairs.
//
while( regex_search(pkv, pkv+strlen(pkv), cmLine, reLine ))
{
CStringA key = cmLine[1].str().c_str();
string valueType = cmLine[2].str();
smatch cmTypeValue;
// Creates the entry if it does not exist yet; an existing one is overwritten below.
RegVariant* rvValue = &rnode->keyValues[key];
//
// Extract type and value.
//
if(regex_search(valueType, cmTypeValue, reTypedValue))
{
string type = cmTypeValue[1].str();
string value = cmTypeValue[2].str();
if( type == "dword")
{
rvValue->type = eREG_DWORD;
// The dword text is parsed as a base-16 number.
rvValue->dw = (DWORD)strtoul(value.c_str(), 0, 16);
}
else if (type == "hex(b)")
{
rvValue->type = eREG_QWORD;
rvValue->qw = 0;
// Only accepted when the text is exactly 8 two-digit bytes plus 7
// separators; any other length leaves the value at 0.
if( value.size() == 8 * 2 + 7 )
{
CStringA v;
hexToBin(value.c_str(), v, sizeof(__int64));
// Reinterprets the 8 decoded bytes as one 64-bit integer.
rvValue->qw = *((__int64*)v.GetBuffer());
}
} else //if (type == "hex")
{
rvValue->type = eREG_BINARY;
hexToBin(value.c_str(), rvValue->str);
}
} else if( regex_search(valueType, cmTypeValue, reStringValue))
{
rvValue->type = eREG_SZ;
rvValue->str = cmTypeValue[1].str().c_str();
}
// Values matching neither pattern leave the (possibly new) entry untouched.
// Continue scanning right after the value that was just handled.
pkv = cmLine[2].second;
} //while
// Continue with the text following this section's body.
pbuf = cmSection[2].second;
} //while
return 0;
}

Related

I get a writing access violation of 0x00000... when trying to assign a value into my struct

for part of a school lab I need to read in unique words and their corresponding count with a struct. I am new to structs so please bear with me. I am getting an access violation when I try to write the adress of the current word to the character pointer inside of the current instance of my struct. I have read that this is due to dereferencing a nullptr. I have tried to understand this, but I just don't get it. I have resized arrays just like this on regular char** arrays for accepting new words. I am at a loss, any help would be greatly appreciated. The input file used here is just random words separated by non letter characters but not - or , Here is my code:
#define _CRT_SECURE_NO_WARNINGS
#define _CRTDBG_MAP_ALLOC
#include <iostream>
#include <iomanip>
#include <fstream>
#include <limits>
using std::cin;
using std::cout;
using std::endl;
using std::setw;
using std::right;
using std::left;
using std::ifstream;
using std::ofstream;
const int BUFFER = 100; //I figure this buffer is big enough for any given word
// Pairs one word with the number of times it has been counted.
struct Word_Count_STRUCT
{
// Pointer to the word's characters; null until a word is assigned.
char* WORD = nullptr;
// Occurrence counter, starting at zero.
int COUNT = 0;
};
// Reads the input file character by character, assembles words and keeps a
// growing array of unique words with their counts.
// NOTE(review): the snippet ends before main's closing brace.
int main()
{
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
//Input for phrase
ifstream iphrase;
//Output to CSV (word count)
ofstream o_count;
//Word Exceptions
ifstream xinWord;
char wordbuffer[BUFFER] = { '\0' };
char ch = 0;
// Array of pointers to entries; note that only the pointer array is ever
// allocated below, never the Word_Count_STRUCT objects themselves.
Word_Count_STRUCT** uniquewords = nullptr;
Word_Count_STRUCT** temp = nullptr;
int k = 0;
int wordcount = 0;
char* cword = nullptr; //Current Word
bool NextWord_flag = false;
bool interwordpunct = false;
bool NewWord_flag = true;
iphrase.open("C:\\Users\\me\\Desktop\\henroE.txt");
if (iphrase.is_open())
{
while (!iphrase.eof())
{
iphrase.get(ch);
// Letters, apostrophes and hyphens extend the current word.
// NOTE(review): k is not checked against BUFFER before the write.
if (isalpha(ch) || ch == '\'' || ch == '-')
{
wordbuffer[k] = ch;
++k;
NextWord_flag = true;
if (ch == '\'' || ch == '-')
interwordpunct = true;
}
// A non-letter after at least one word character ends the word.
if ( (NextWord_flag == true) && (!isalpha(ch)) && (interwordpunct == false) )
{
k = 0;
// Copy the finished word to the heap and clear the buffer for the next one.
cword = new char[strlen(wordbuffer) + 1];
strcpy(cword, wordbuffer);
memset(wordbuffer, '\0', sizeof(wordbuffer));
// Case-insensitive search among the words seen so far.
for (int i = 0; (i < wordcount) && (NewWord_flag == true); ++i)
{
int cmp = _stricmp(uniquewords[i]->WORD, cword);
if (cmp == 0)
{
NewWord_flag = false;
uniquewords[i]->COUNT++;
delete[] cword;
}
}
if (NewWord_flag == true)
{
// The trailing () value-initialises the array, so every slot - including
// the new last one - is a null pointer.
temp = new Word_Count_STRUCT * [wordcount + 1]();
for (int i = 0; i < wordcount; ++i)
{
temp[i] = uniquewords[i];
}
delete[] uniquewords;
// temp[wordcount] is still null here: no Word_Count_STRUCT was allocated
// for it, so these two lines dereference a null pointer.
temp[wordcount]->WORD = cword;
temp[wordcount]->COUNT++;
uniquewords = temp;
++wordcount;
NextWord_flag = false;
}
interwordpunct = false;
NewWord_flag = true;
}
}
}
I get an error on this line:
temp[wordcount]->WORD = cword;
I also get an error on the int value COUNT as well if I comment the line above it out. So I am guessing it is something with how I initialized the struct.
Worth noting that if I do not initialize this call:
temp = new Word_Count_STRUCT * [wordcount + 1]();
and instead just leave it as
temp = new Word_Count_STRUCT * [wordcount + 1];
I get another access violation but for reading instead of writing at 0xFFFFF...
At a loss, thank you for any help :)
You've got a number of things wrong. First, using fixed-length character buffers instead of C++ strings is about 20 years out of date and WILL cause buffer overflow errors unless you are exceedingly careful.
But this is an issue:
temp = new Word_Count_STRUCT * [wordcount + 1]();
for (int i = 0; i < wordcount; ++i)
{
temp[i] = uniquewords[i];
}
delete[] uniquewords;
But where did you allocate uniquewords? You declared it.
You also allocate cword outside a loop but then delete it inside a loop -- which seems really fishy, too.
But note that all you've allocated are pointers. I don't see you actually allocating the structure you're trying to put data in.

C++ Access violation writing location 0x000A000B

I'm building this webcrawler here. This error occurs to me when I start debugging and sends me to memcpy.asm or xstring or dbgdel.cpp files showing me different lines of these files every time.
I was wondering if the code is wrong somehow. I started thinking I am accessing memory blocks that I shouldn't. Here is some code. I hope you can help.
The idea is to iterate through httpContent and get all the URLs from the <a> tags. I am looking for href=" in the beginning and then for the next ". What is in between I am trying to put in temp, then pass the content of temp to an array of strings.
// Location of a page, split into a host name and a path.
struct Url
{
string host;
string path;
};
// Downloads one page and scans it character by character for href="..."
// attributes, printing what it collects.
int main(){
struct Url website;
// Fixed-size result storage: only indices 0..99 are valid.
string href[100];
website.host = "crawlertest.cs.tu-varna.bg";
website.path = "";
string httpContent = downloadHTTP(website);
// length() is unsigned, so length()-7 wraps around to a huge value when the
// content is shorter than 7 characters.
for(unsigned int i = 0; i <= httpContent.length()-7; i++){
char c = httpContent[i];
if(c == 'h'){
c = httpContent[i+1];
if(c == 'r'){
c = httpContent[i+2];
if(c == 'e'){
c = httpContent[i+3];
if(c == 'f'){
c = httpContent[i+4];
if(c == '='){
c = httpContent[i+5];
if(c == '\"'){
// i now points at the first character after the opening quote.
i+=6;
c = httpContent[i];
string temp = "";
// No end-of-string check: a missing closing quote makes i run past the
// content. As written, the first character is never appended and the
// closing quote is.
while(c!='\"'){
i++;
c = httpContent[i];
temp+= c;
}
// i is a position inside httpContent, not a result counter, so this indexes
// outside href[100] as soon as i reaches 100.
href[i] = temp;
temp = "";
cout<<href[i]<<endl;
}}}}}}
}
system("pause");
return 0;
}
UPDATE
I edited the =, now ==
I am also stopping the iterations 7 positions earlier so the 'if's should not be problem.
I am getting the same errors though.
Use std::vector< std::string > href; to store your result.
With string::find you can find sequence in strings and with string::substr you can extract them from string.
#include <vetor>
#include <string>
struct Url
{
string host;
string path;
};
int main(){
struct Url website;
website.host = "crawlertest.cs.tu-varna.bg";
website.path = "";
std::string httpContent = downloadHTTP(website);
std::vector< std::string > href;
std::size_t pos = httpContent.find("href="); // serach for first "href="
while ( pos != string::npos )
{
pos = httpContent.find( '"', pos+5 ); // serch for '"' at start
if ( pos != string::npos )
{
std::size_t posSt = pos + 1;
pos = httpContent.find( '"', posSt ); // search for '"' at end
if ( pos != string::npos )
{
href.push_back( httpContent.substr( posSt, pos - posSt ) ); // extract ref and append to result
pos = httpContent.find( "href=", pos+1 ); // search for next "href="
}
}
}
system("pause");
return 0;
}

C++ efficient parse

I am programming some automated test equipment (ATE) and I'm trying to extract the following values out of an example response from the ATE:
DCRE? 1,
DCRE P, 10.3, (pin1)
DCRE F, 200.1, (pin2)
DCRE P, 20.4, (pin3)
From each line, I only care about the pin and the measured result value. So for the case above, I want to store the following pieces of information in a map<std::string, double> results;
results["pin1"] = 50.3;
results["pin2"] = 30.8;
results["pin3"] = 70.3;
I made the following code to parse the response:
// Splits the response on spaces and newlines. Whenever a token contains '(',
// the token before it (minus its first character) is parsed as the measured
// value and the digits starting at offset 4 of the current token select the
// pin through pinlookupmap.
// Note: strtok writes terminators into the buffer behind datatoparse.
void parseResultData(map<Pin*, double> &pinnametoresult, string &datatoparse) {
    string previousToken;
    string token;
    for (char *cursor = strtok((char*) datatoparse.c_str(), " \n");
         cursor;
         cursor = strtok(NULL, " \n")) {
        token = cursor;
        if (string::npos != token.find('(')) {
            const string valueText = previousToken.substr(1);
            const double value = strtod(valueText.c_str(), NULL);
            const string pinText = token.substr(4, token.size()-2);
            const unsigned short number = atoi(pinText.c_str());
            pinnametoresult[&pinlookupmap[number]] = value;
        }
        previousToken = cursor;
    }
}
It works, but it's not very efficient. Is there a way to make the function more efficient for this specific case? I don't care about the DCRE or P/F value on each line. I thought about using Boost regex library, but not sure if that would be more efficient.
In order to make this a bit more efficient, try to avoid copying. In particular, calls to substring, assignments etc can cause havoc on the performance. If you look at your code, you will see that the content of datatoparse are repeatedly assigned to lastread and current, each time with one line less at the beginning. So, on average you copy half of the original string times the number of lines, making just that part an O(n^2) algorithm. This isn't relevant if you have three or four line (not even on 100 lines!) but if you have a few more, performance degrades rapidly.
Try this approach instead:
// Walks `input` one '\n'-terminated line at a time: p0 is the start of the
// current line and p1 the position of its terminating newline.
// Text after the last '\n' (a final line without a newline) is not visited.
string::size_type p0 = 0;
string::size_type p1 = input.find('\n', p0);
while (p1 != string::npos) {
// extract the line
string line = input.substr(p0, p1 - p0);
// move to the next line
p0 = p1 + 1;
p1 = input.find('\n', p0);
}
Notes:
Note that the algorithm still copies all input once, but each line only once, making it O(n).
Since you have a copy of the line, you can insert '\0' as artificial separator in order to give a substring to e.g. atoi() or strtod().
I'm not 100% sure of the order of parameters for string::find() and too lazy to look it up, but the idea is to start searching at a certain position. Look at the various overloads of find-like functions.
When handling a line, search the indices of the parts you need and then extract and parse them.
If you have line fragments (i.e. a partial line without a newline) at the end, you will have to modify the loop slightly. Create tests!
This is what I did:
#include <cstdlib>
#include <string>
#include <vector>
#include <unordered_map>
#include <sstream>
#include <iostream>
using namespace std;
// Placeholder pin type used by this example.
struct Pin {
string something;
Pin() {}
};
// Three pins allocated up front; parseResultData indexes this vector with
// the pin number from the response minus one.
vector<Pin*> pins = { new Pin(), new Pin(), new Pin() };
// Result container: maps a pin to its measured value.
typedef unordered_map<Pin*, double> CONT_T;
// True when the line contains an opening parenthesis, i.e. carries a pin name.
inline bool OfInterest(const string& line) {
    const string::size_type parenPos = line.find("(");
    const bool missing = (parenPos == string::npos);
    return !missing;
}
// Reads the response line by line. For every line that contains '(' the text
// between the first and second comma is parsed as the measured value, and the
// number following "(pin" selects the entry of `pins` (1-based in the text)
// under which the value is stored.
void parseResultData(CONT_T& pinnametoresult, const string& datatoparse)
{
    istringstream response(datatoparse);
    for (string row; getline(response, row); ) {
        if (!OfInterest(row))
            continue;

        // Measured value: starts two characters after the first comma
        // (comma plus space) and runs up to the second comma.
        const size_t valueBegin = row.find(",") + 2;
        const size_t valueEnd = row.find(",", valueBegin);
        double measured = 0.0;
        istringstream valueText(row.substr(valueBegin, valueEnd - valueBegin));
        valueText >> measured;

        // Pin number: starts four characters after '(' (skipping "(pin") and
        // excludes the last character of the line.
        const size_t numberBegin = row.find("(") + 4;
        unsigned int pinNumber;
        istringstream numberText(row.substr(numberBegin, (row.length() - numberBegin) - 1));
        numberText >> pinNumber;

        Pin* const target = pins[pinNumber - 1];
        pinnametoresult[target] = measured;
    }
}
/*
*
*/
int main(int argc, char** argv) {
string datatoparse = "DCRE? 1, \n"
"DCRE P, 10.3, (pin1)\n"
"DCRE F, 200.1, (pin2)\n"
"DCRE P, 20.4, (pin3)\n";
CONT_T results;
parseResultData(results, datatoparse);
return 0;
}
Here's my final result. Does not involve any copying, but it will destroy the string.
// Parses an ATE response in place, without copying any part of it.
//
// For every "(name)" - or "(name1,name2,...)" - group, the number that
// precedes the '(' is parsed and stored in pinnametoresult under each name.
//
//   pinnametoresult - receives one entry per pin name found.
//   datatoparse     - the response text. It is modified: terminators are
//                     written into it, so its content is destroyed.
//
// If no number precedes a '(' the previously parsed value (initially 0.0) is
// reused.
void parseResultData3(map<std::string, double> &pinnametoresult, std::string &datatoparse) {
    if (datatoparse.empty())
        return;

    // Work on the string's own writable storage rather than casting away the
    // constness of c_str().
    char* const begin = &datatoparse[0];
    char* const end = begin + datatoparse.size();

    double lastdouble = 0.0;
    char* startmarker = NULL; //beginning of next pin to parse

    for (char* str = begin; str != end; ++str) {
        if (str[0] == '(') {
            startmarker = str + 1;

            //get previous value
            bool triggered = false;
            char* lookback = str;
            // Walk backwards, but never past the start of the buffer.
            while (lookback != begin) {
                --lookback;
                const bool numeric =
                    isdigit((unsigned char) lookback[0]) || lookback[0] == '.';
                if (!triggered && numeric) {
                    // Last character of the number: terminate the text after it.
                    triggered = true;
                    *(lookback + 1) = '\0';
                }
                else if (triggered && !numeric) {
                    // Reached the character in front of the number.
                    break;
                }
            }
            // lookback is either the character just before the number (strtod
            // skips leading blanks and accepts a sign) or the buffer start.
            if (triggered)
                lastdouble = strtod(lookback, NULL);
        }
        else if (startmarker != NULL) {
            if (str[0] == ')') {
                // End of the group: store the last name and stop collecting.
                str[0] = '\0';
                pinnametoresult[startmarker] = lastdouble;
                startmarker = NULL;
            }
            else if (str[0] == ',') {
                // More names follow: store this one and start the next.
                str[0] = '\0';
                pinnametoresult[startmarker] = lastdouble;
                startmarker = str + 1;
            }
        }
    }
}

How to get string from xml in COM

I have one array like this:
static WCHAR FilesToShow[][100] = { { L"start.cmd" },{ L"image.xml" }, { L"xyz" }};
as you see that there is "xyz" which I have to replace with some unique name. For this I have to read image.xml file.
Please can you tell me how can I do this.
I wrote a method like this:
// Attempt to read image.xml line by line and pull out the <SystemPath> value.
// This is the question's non-compiling draft.
PRIVATE WCHAR GetSystemName(WCHAR *pName)
{
// line and tmp are single WCHAR characters here, yet they are used below as
// if they were string objects (length(), size(), +=, operator[]).
WCHAR line;
wfstream in("image.xml");
WCHAR tmp;
bool begin_tag = false;
while (getline(in,line))
{
// strip whitespaces from the beginning
for (int i = 0; i < line.length(); i++)
{
if (line[i] == ' ' && tmp.size() == 0)
{
}
else
{
tmp += line[i];
}
}
// The search pattern is a narrow string literal passed to a wide-character function.
if (wcswcs(tmp,"<SystemPath>") != NULL)
{
???????? how to get "vikash" from here <SystemPath>C:\Users\rs_user\Documents\RobotStudio\Systems\vikash</SystemPath>
}
else
{
continue;
}
}
return tmp;
}
I'm getting exception for wfstream, getline and line.length() method.
I have included fstream.h header file but I think It's not supported in COM.
Please help me how to solve this issue without parsing xml file.
If your xml-file is simple enough so that there is only a single tag with given name, you could do it like this:
#include <string>
#include <sstream>
#include <iostream>
// Reads the whole stream and returns the text enclosed by the first
// <tagname> and the first </tagname> that follows it.
//
//   in      - stream to read; it is consumed to its end.
//   tagname - element name without angle brackets.
//
// Throws the int 123 when the start tag, or an end tag after it, is missing.
std::wstring get_value(std::wistream & in, std::wstring const & tagname)
{
    typedef std::istreambuf_iterator<std::wstring::value_type> StreamIter;
    const std::wstring text((StreamIter(in)), StreamIter());

    const std::wstring start_tag = L"<" + tagname + L">";
    const std::wstring end_tag = L"</" + tagname + L">";

    std::wstring::size_type start = text.find(start_tag);
    if (start == std::wstring::npos)
    {
        throw 123;
    }
    start += start_tag.length();

    // Look for the end tag only after the start tag. Searching from the
    // beginning could find an earlier closing tag and make `end - start`
    // wrap around to a huge length.
    const std::wstring::size_type end = text.find(end_tag, start);
    if (end == std::wstring::npos)
    {
        throw 123;
    }
    return text.substr(start, end - start);
}
// Returns everything that follows the last occurrence of `delim` in `str`.
// Throws the int 123 when `delim` does not occur at all.
std::wstring get_substr_after(std::wstring const & str, wchar_t delim)
{
    const std::wstring::size_type lastDelim = str.rfind(delim);
    if (std::wstring::npos != lastDelim)
    {
        return std::wstring(str, lastDelim + 1);
    }
    throw 123;
}
// Demo: prints a small XML document, then the part of <bar>'s content that
// follows its last '/'.
void stackoverflow()
{
    const std::wstring document = L"<foo>\n<bar>abc/def/ghi</bar>\n<baz>123/456/789</baz>\n</foo>\n";
    std::wistringstream source(document);
    std::wcout << document << std::endl;

    const std::wstring barContent = get_value(source, std::wstring(L"bar"));
    const std::wstring lastSegment = get_substr_after(barContent, L'/');
    std::wcout << lastSegment << std::endl;
}
The output of this program is:
<foo>
<bar>abc/def/ghi</bar>
<baz>123/456/789</baz>
</foo>
ghi
I hope that answered your question.
you have several issues here.
what you are getting are compiler errors and not exceptions
the header file to include is 'fstream' not 'fstream.h'.
make sure you have a line saying using namespace std;
You are declaring line as a variable of type WCHAR, so it is a single wide character, which surely is not a wstring object. Therefore line.length() is incorrect.
Why are you mixing C (wcswcs()) and C++ (STL) ? maybe you should re-design your function signature.
However, try the below function. I have modified the signature to return a pointer to WCHAR, and place the requested string in the buffer space provided by pName. I added a check to verify that the buffer is large enough to fit the name and the terminating NULL character.
// Scans image.xml for the first line containing <SystemPath> whose content
// has a backslash, and copies the text after the last backslash into pName.
//
//   pName  - destination buffer.
//   buflen - capacity of pName in characters, terminator included.
//
// Returns pName on success; NULL when no such line exists or when the name
// (plus terminator) does not fit into buflen characters.
WCHAR* GetSystemName(WCHAR *pName, size_t buflen)
{
    const wstring openTag( L"<SystemPath>" );
    wifstream in("image.xml");

    for ( wstring line; getline(in,line); )
    {
        // strip whitespaces from the beginning
        const size_t firstText = line.find_first_not_of(L" \t");
        if (firstText != wstring::npos)
        {
            line.erase( 0, firstText );
        }

        const size_t tagPos = line.find( openTag );
        if ( tagPos == wstring::npos )
        {
            continue;
        }

        // strip the tags (assuming closing tag is present)
        const size_t dataPos = tagPos + openTag.length();
        const size_t dataLen = line.find( L"</SystemPath>" ) - dataPos;
        line = line.substr( dataPos, dataLen );

        // get file name
        const size_t lastBackslash = line.find_last_of( L'\\' );
        if ( lastBackslash == wstring::npos )
        {
            continue;
        }
        line.erase( 0, lastBackslash + 1 );

        if ( buflen <= line.length() )
        {
            // ERROR: pName buffer is not large enough to fit the string + terminating NULL character.
            return NULL;
        }
        wcscpy( pName, line.c_str() );
        return pName;
    }
    return NULL;
}
EDIT: Moreover, if you are using and/or parsing XML in other areas of your program, I strongly suggest using an XML parsing library such as Xerces-C or libXml2.
Thank you all for your answer. Here I got solution of my question.
// Reads the first line of rimageinfo.xml (located under CurrentFolder's
// path), takes the text from <SystemPath> up to </SystemPath> and copies the
// part after the last backslash into System_Name, which is returned.
// NOTE(review): System_Name and CurrentFolder are not declared in this
// snippet, and the snippet ends before the function's closing brace.
PRIVATE WCHAR* GetNewSystemName()
{
WCHAR line[756];
WCHAR tempBuffer[100];
CComBSTR path = CurrentFolder.Path();
CComBSTR imagePath1 = L"rimageinfo.xml";
path.AppendBSTR(imagePath1);
std::wfstream in(path);
WCHAR tmp[756];
// Only one line of the file is ever read.
in.getline(line, 756);
WCHAR* buffer;
buffer = wcswcs(line, L"<SystemPath>");
// buffer is passed on before it is checked for NULL (the check follows below).
WCHAR *dest = wcsstr(buffer, L"</SystemPath>");
int pos;
pos = dest - buffer;
unsigned int i = 0;
if (wcswcs(buffer,L"<SystemPath>") != NULL && wcswcs(buffer,L"</SystemPath>") != NULL)
{
// Copies the opening tag and the path (everything before the closing tag) into tmp.
for (; i < pos; i++)
{
// sizeof(tmp) is the size of the whole array and never 0, so this branch
// is never taken and every character is copied.
if (buffer[i] == ' ' && sizeof(tmp) == 0)
{
}
else
{
tmp[i] = buffer[i];
}
}
tmp[i] = NULL;
//break;
}
// Search backwards from the end of the copied text for the last backslash.
int j = i;
for (; j > 0; j--)
{
if (tmp[j] == '\\')
{
break;
}
}
// Step past the backslash, then copy the remaining characters.
j++;
int k = 0;
for (; j < i ; j++)
{
System_Name[k] = tmp[j];
k++;
}
System_Name[k] = NULL;
return System_Name;

String Formatting using C / C++

Recently I was asked in an interview to convert the string "aabbbccccddddd" to "a2b3c4d5". The goal is to replace each repeated character with a single occurrence and a repeat count. Here 'a' is repeated twice in the input, so we have to write it as 'a2' in the output. Also I need to write a function to reverse the format back to the original one (e.g. from the string "a2b3c4d5" to "aabbbccccddddd"). I was free to use either C or C++. I wrote the below code, but the interviewer seemed to be not very happy with this. He asked me to try a smarter way than this.
In the below code, I used formatstring() to eliminate repeated chars by just adding the repeated count and used reverseformatstring() to convert back to the original string.
// Run-length encodes `source` into `target`: each run of equal characters is
// written as the character followed by the run length, the length being
// omitted for runs of one ("aabbbc" -> "a2b3c").
//
//   target - output buffer; the caller must make it large enough (the result
//            is never longer than the source) and it receives a terminator.
//   source - NUL-terminated input text.
void formatstring(char* target, const char* source) {
    while (*source != '\0') {
        // Measure the run that starts here; this also handles the first
        // character of the string without a special case.
        const char current = *source;
        int charRepeatCount = 0;
        while (*source == current) {
            charRepeatCount++;
            source++;
        }

        *target = current;
        target++;

        if (charRepeatCount > 1) {
            // 12 bytes hold any int plus the terminator; snprintf (unlike
            // _snprintf) always terminates and returns the digit count.
            char repeatStr[12];
            const int digits = snprintf(repeatStr, sizeof(repeatStr), "%i", charRepeatCount);
            memcpy(target, repeatStr, (size_t)digits);
            target += digits;
        }
    }
    *target = '\0';
}
// Expands text produced by formatstring(): a character followed by a decimal
// count is written out that many times ("a2b3c" -> "aabbbc").
//
//   target - output buffer; the caller must make it large enough for the
//            expanded text plus terminator.
//   source - NUL-terminated encoded text.
//
// The first character is always copied as-is. Any later non-digit character
// is copied literally; counts of 0 or 1 add no extra copies.
void reverseformatstring(char* target, const char* source) {
    bool havePrevious = false;
    char previous = '\0';

    while (*source != '\0') {
        if (!havePrevious || !isdigit((unsigned char)*source)) {
            // A literal character. Treating every non-digit this way (not
            // only letters) guarantees that the loop always advances.
            previous = *source;
            *target = previous;
            target++;
            source++;
            havePrevious = true;
            continue;
        }

        // Read the repeat count of the previous character. The counter is
        // local to each run, so one odd count cannot leak into the next.
        int charRepeatCount = 0;
        while (isdigit((unsigned char)*source)) {
            charRepeatCount = charRepeatCount * 10 + (*source - '0');
            source++;
        }

        // One copy has already been written, so add count - 1 more.
        for (int written = 1; written < charRepeatCount; written++) {
            *target = previous;
            target++;
        }
    }
    *target = '\0';
}
I didn't find any issues with above code. Is there any other better way of doing this?
The approach/algorithm is fine, perhaps you could refine and shrink the code a bit (by doing something simpler, there's no need to solve this in an overly complex way). And choose an indentation style that actually makes sense.
A C solution:
/* Prints the run-length encoding of `input` to stdout, followed by a newline:
 * each run is printed as the character and, for runs longer than one, its
 * length. */
void print_transform(const char *input)
{
    const char *cursor = input;
    while (*cursor != '\0') {
        const char run_char = *cursor++;
        size_t run_len = 1;
        for (; *cursor == run_char; ++cursor) {
            ++run_len;
        }
        if (run_len == 1) {
            putc(run_char, stdout);
        } else {
            printf("%c%zu", run_char, run_len);
        }
    }
    putc('\n', stdout);
}
(This can be easily modified so that it returns the transformed string instead, or writes it to a long enough buffer.)
A C++ solution:
// Returns the run-length encoding of `input`: every run of equal characters
// becomes the character followed by the run length, which is left out for
// runs of one.
std::string transform(const std::string &input)
{
    std::stringstream encoded;
    const std::string::size_type length = input.size();

    std::string::size_type runBegin = 0;
    while (runBegin < length) {
        const char symbol = input[runBegin];

        std::string::size_type runEnd = runBegin + 1;
        while (runEnd < length && input[runEnd] == symbol) {
            ++runEnd;
        }

        const std::size_t runLength = runEnd - runBegin;
        encoded << symbol;
        if (runLength > 1) {
            encoded << runLength;
        }
        runBegin = runEnd;
    }
    return encoded.str();
}
Since several others have suggested very reasonable alternatives, I'd like to offer some opinions on what I think is your underlying question: "He asked me to try a smarter way than this.... Is there any other better way of doing this?"
When I interview a developer, I'm looking for signals that tell me how she approaches a problem:
Most important, as H2CO3 noted, is correctness: will the code work? I'm usually happy to overlook small syntax errors (forgotten semicolons, mismatched parens or braces, and so on) if the algorithm is sensible.
Proper use of the language, especially if the candidate claims expertise or has had extensive experience. Does he understand and use idioms appropriately to write straightforward, uncomplicated code?
Can she explain her train of thought as she formulates her solution? Is it logical and coherent, or is it a shotgun approach? Is she able and willing to communicate well?
Does he account for edge cases? And if so, does the intrinsic algorithm handle them, or is everything a special case? Although I'm happiest if the initial algorithm "just works" for all cases, I think it's perfectly acceptable to start with a verbose approach that covers all cases (or simply to add a "TODO" comment, noting that more work needs to be done), and then simplifying later, when it may be easier to notice patterns or duplicated code.
Does she consider error-handling? Usually, if a candidate starts by asking whether she can assume the input is valid, or with a comment like, "If this were production code, I'd check for x, y, and z problems," I'll ask what she would do, then suggest she focus on a working algorithm for now and (maybe) come back to that later. But I'm disappointed if a candidate doesn't mention it.
Testing, testing, testing! How will the candidate verify his code works? Does he walk through the code and suggest test cases, or do I need to remind him? Are the test cases sensible? Will they cover the edge cases?
Optimization: as a final step, after everything works and has been validated, I'll sometimes ask the candidate if she can improve her code. Bonus points if she suggests it without my prodding; negative points if she spends a lot of effort worrying about it before the code even works.
Applying these ideas to the code you wrote, I'd make these observations:
Using const appropriately is a plus, as it shows familiarity with the language. During an interview I'd probably ask a question or two about why/when to use it.
The proper use of char pointers throughout the code is a good sign. I tend to be pedantic about making the data types explicit within comparisons, particularly during interviews, so I'm happy to see, e.g.
while (*source != '\0') rather than the (common, correct, but IMO less careful) while(*source).
isFirstChar is a bit of a red flag, based on my "edge cases" point. When you declare a boolean to keep track of the code's state, there's often a way of re-framing the problem to handle the condition intrinsically. In this case, you can use charRepeatCount to decide if this is the first character in a possible series, so you won't need to test explicitly for the first character in the string.
By the same token, repeated code can also be a sign that an algorithm can be simplified. One improvement would be to move the conversion of charRepeatCount to a separate function. See below for an even better solution.
It's funny, but I've found that candidates rarely add comments to their code during interviews. Kudos for helpful ones, negative points for those of the ilk "Increment the counter" that add verbosity without information. It's generally accepted that, unless you're doing something weird (in which case you should reconsider what you've written), you should assume the person who reads your code is familiar with the programming language. So comments should explain your thought process, not translate the code back to English.
Excessive levels of nested conditionals or loops can also be a warning. You can eliminate one level of nesting by comparing each character to the next one instead of the previous one. This works even for the last character in the string, because it will be compared to the terminating null character, which won't match and can be treated like any other character.
There are simpler ways to convert charRepeatCount from an int to a string. For example, _snprintf() returns the number of bytes it "prints" to the string, so you can use
target += _snprintf(target, 10, "%i", charRepeatCount);
In the reversing function, you've used the ternary operator perfectly ... but it's not necessary to special-case the zero value: the math is the same regardless of its value. Again, there are also standard utility functions like atoi() that will convert the leading digits of a string into an integer for you.
Experienced developers will often include the increment or decrement operation as part of the condition in a loop, rather than as a separate statement at the bottom: while(charRepeatCount-- > 0). I'd raise an eyebrow but give you a point or two for humor and personality if you wrote this using the slide operator: while (charRepeatCount --> 0). But only if you'd promise not to use it in production.
Good luck with your interviewing!
I think your code is too complex for the task. Here's my approach (using C):
#include <ctype.h>
#include <stdio.h>
/* Run-length encodes `source` into `target`: each run is written as its
 * character followed by the run length (omitted for runs of one). `target`
 * must be large enough and receives a terminator. */
void format_str(char *target, char *source) {
    while (*source != '\0') {
        const char run_char = *source;
        int run_len = 0;

        /* Consume the whole run, counting as we go. */
        do {
            ++source;
            ++run_len;
        } while (*source == run_char);

        *target++ = run_char;
        if (run_len > 1) {
            target += sprintf(target, "%d", run_len);
        }
    }
    *target = '\0';
}
/* Expands text produced by format_str(): a character followed by a decimal
 * count is written out that many times ("a2b3" -> "aabbb").
 *
 *   target - output buffer, large enough for the expanded text + terminator.
 *   source - NUL-terminated encoded text.
 *
 * Counts of 0 or 1 add no extra copies; a count that is not preceded by any
 * character is skipped. */
void convert_back(char *target, char *source) {
    char last = '\0';
    int have_last = 0;   /* set once a character has been copied */
    int val;

    while (*source != '\0') {
        if (!isdigit((unsigned char) *source)) {
            last = *source;
            *target = last;
            target++;
            source++;
            have_last = 1;
        }
        else {
            for (val = 0; isdigit((unsigned char) *source); source++)
                val = val*10 + *source - '0';
            /* One copy is already in place. Comparing against 1 (instead of
             * testing --val for non-zero) keeps a count of 0 from running
             * away through negative values. */
            if (have_last) {
                for (; val > 1; val--) {
                    *target = last;
                    target++;
                }
            }
        }
    }
    *target = '\0';
}
format_str compresses the string, and convert_back uncompresses it.
Your code "works", but it doesn't adhere to some common patterns used in C++. You should have:
used std::string instead of plain char* array(s)
pass that string as const reference to avoid modification, since you write the result somewhere else;
use C++11 features such as ranged based for loops and lambdas as well.
I think the interviewer's purpose was to test your ability to deal with the C++11 standard, since the algorithm itself was pretty trivial.
Perhaps the interviewer wanted to test your knowledge of existing standard library tools. Here's how my take could look in C++:
#include <string>
#include <sstream>
#include <algorithm>
#include <iostream>
using Iter = std::string::const_iterator;

/// Run-length encodes the characters in [first, last).
/// Every run is written as its character followed by the run length, including
/// runs of length one ("aab" becomes "a2b1"). An empty range yields an empty string.
std::string foo(Iter first, Iter last)
{
    std::ostringstream encoded;
    for (Iter run_begin = first; run_begin != last; ) {
        const char symbol = *run_begin;
        // The run ends at the first character that differs from its first one.
        const Iter run_end = std::find_if(run_begin, last,
                                          [symbol](char c) { return c != symbol; });
        encoded << symbol << (run_end - run_begin);
        run_begin = run_end;
    }
    return encoded.str();
}
// Encodes a fixed sample string with foo() and writes the result to standard output.
int main()
{
    const std::string sample = "aaabbbbbbccddde";
    const std::string encoded = foo(sample.begin(), sample.end());
    std::cout << encoded;
}
An extra check is needed for empty input.
Try this:
// Prints every distinct character of str followed by the number of times it
// occurs in the whole string (the sample below prints "a2b3c4d5").
// Occurrences are totalled per character value, so two separate runs of the
// same character are reported as a single combined count.
std::string str="aabbbccccddddd";
// Try each character code from 0 to 254 in ascending order.
for(int i=0;i<255;i++)
{
int c=0; // how many positions of str hold character code i
for(int j=0;j<str.length();j++)
{
if(str[j] == i)
c++;
}
// Only characters that actually occur are printed.
if(c>0)
printf("%c%d",i,c);
}
My naive approach:
/*
 * Run-length encodes the NUL-terminated string SrcStr into DstBuf.
 * A run is written as its character, followed by the decimal run length when
 * the run is longer than one character ("aabbb" becomes "a2b3").
 * DstBuf must be large enough for the result; no bounds are checked.
 */
void pack( char const * SrcStr, char * DstBuf ) {
    char const * in = SrcStr;
    char * out = DstBuf;

    while( *in != '\0' ) {
        char const symbol = *in;
        int run = 1;

        /* Write the character, then count how many more copies follow it. */
        *out++ = symbol;
        ++in;
        while( *in == symbol ) {
            ++in;
            ++run;
        }

        if( run > 1 ) {
            out += sprintf( out, "%i", run );
        }
    }
    *out = '\0';
}
/*
 * Expands the run-length encoded, NUL-terminated string SrcStr into DstBuf.
 * A non-digit character is copied as-is; a decimal number N that follows it
 * means the character occurs N times in total, so N - 1 further copies are
 * appended ("a2b3" becomes "aabbb").
 *
 * Counts of 0 or 1 add nothing, and a number that appears before any
 * character has been seen is skipped, because there is nothing to repeat.
 * DstBuf must be large enough for the result; no bounds are checked.
 */
void unpack( char const * SrcStr, char * DstBuf ) {
    char const * Src_Ptr = SrcStr;
    char * Dst_Ptr = DstBuf;
    char c = 0;
    int have_char = 0;   /* becomes 1 once c holds a real character */

    while( '\0' != *Src_Ptr ) {
        /* isdigit() requires a value representable as unsigned char. */
        if( !isdigit( (unsigned char)*Src_Ptr ) ) {
            c = *Dst_Ptr = *Src_Ptr;
            ++Src_Ptr; ++Dst_Ptr;
            have_char = 1;
        } else {
            /* Src_Ptr points at a digit here, so strtol consumes at least one
               character and the loop always makes progress. */
            char * num_end = NULL;
            long const repeat_count = strtol( Src_Ptr, &num_end, 10 );
            Src_Ptr = num_end;

            /* One copy was already written with the character itself.
               Guarding on > 1 keeps a count of 0 from turning into a huge
               unsigned length for memset. */
            if( have_char && repeat_count > 1 ) {
                memset( Dst_Ptr, c, (size_t)( repeat_count - 1 ) );
                Dst_Ptr += repeat_count - 1;
            }
        }
    }
    *Dst_Ptr = '\0';
}
But if the interviewer asks for error handling, then the solution turns out to be much more complex (and ugly). My portable approach:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
// for MSVC
#ifdef _WIN32
#define snprintf sprintf_s
#endif
/*
 * Run-length encodes the NUL-terminated string SrcStr into DstBuf, which
 * holds DstBuf_Size bytes. A run is written as its character, followed by
 * the decimal run length when the run is longer than one character.
 *
 * Returns 0 on success and 1 on failure. Failure cases:
 *   - SrcStr or DstBuf is NULL, or DstBuf_Size is 0;
 *   - the source (including its terminator) overlaps the destination;
 *   - the source contains a digit, which could not be told apart from a count;
 *   - the encoded text plus its terminator does not fit into DstBuf.
 * When encoding has started, DstBuf is NUL-terminated even on failure and
 * holds whatever was encoded up to that point. Nothing is written when the
 * arguments themselves are rejected.
 */
int pack( char const * SrcStr, char * DstBuf, size_t DstBuf_Size ) {
    int Err = 0;
    char const * Src_Ptr = SrcStr;
    char * Dst_Ptr = DstBuf;
    char const * SrcBuf_End;
    char const * DstBuf_End;
    char c = 0;
    int RepeatCount = 1;

    /* Validate the pointers before anything dereferences them. */
    if( !SrcStr || !DstBuf || 0 == DstBuf_Size ) {
        return 1;
    }
    SrcBuf_End = SrcStr + strlen( SrcStr ) + 1;
    DstBuf_End = DstBuf + DstBuf_Size;
    /* Reject overlapping buffers: the input would be overwritten mid-read. */
    if( DstBuf < SrcBuf_End && DstBuf_End > SrcStr ) {
        return 1;
    }

    while( '\0' != *Src_Ptr && !Err ) {
        /* Dst_Ptr never passes DstBuf_End - 1, so one byte always stays
           reserved for the terminator. */
        if( isdigit( (unsigned char)*Src_Ptr ) || Dst_Ptr >= DstBuf_End - 1 ) {
            Err = 1;
        } else {
            c = *Dst_Ptr = *Src_Ptr;
            ++Src_Ptr; ++Dst_Ptr;
            for( RepeatCount = 1; *Src_Ptr == c; ++RepeatCount ) {
                ++Src_Ptr;
            }
            if( RepeatCount > 1 ) {
                /* Format into a scratch buffer that is always big enough,
                   then copy only if the digits fit. This avoids depending on
                   how the formatting function reports truncation. */
                char Digits[ 3 * sizeof( int ) + 2 ];
                int const Len = snprintf( Digits, sizeof( Digits ), "%i", RepeatCount );
                size_t const Room = (size_t)( DstBuf_End - Dst_Ptr ) - 1;
                if( Len < 0 || (size_t)Len >= sizeof( Digits ) || (size_t)Len > Room ) {
                    Err = 1;
                } else {
                    memcpy( Dst_Ptr, Digits, (size_t)Len );
                    Dst_Ptr += Len;
                }
            }
        }
    }
    *Dst_Ptr = '\0';
    return Err;
}
/*
 * Expands the run-length encoded, NUL-terminated string SrcStr into DstBuf,
 * which holds DstBuf_Size bytes. A non-digit character is copied as-is; a
 * decimal number N that follows it means the character occurs N times in
 * total, so N - 1 further copies are appended.
 *
 * Returns 0 on success and 1 on failure. Failure cases:
 *   - SrcStr or DstBuf is NULL, or DstBuf_Size is 0;
 *   - the source (including its terminator) overlaps the destination;
 *   - the source starts with a digit, or contains a count of 0;
 *   - the expanded text plus its terminator does not fit into DstBuf.
 * When decoding has started, DstBuf is NUL-terminated even on failure and
 * holds whatever was decoded up to that point. Nothing is written when the
 * arguments themselves are rejected.
 */
int unpack( char const * SrcStr, char * DstBuf, size_t DstBuf_Size ) {
    int Err = 0;
    char const * Src_Ptr = SrcStr;
    char * Dst_Ptr = DstBuf;
    char const * SrcBuf_End;
    char const * DstBuf_End;
    char c = 0;

    /* Validate the pointers before anything dereferences them. */
    if( !SrcStr || !DstBuf || 0 == DstBuf_Size ) {
        return 1;
    }
    SrcBuf_End = SrcStr + strlen( SrcStr ) + 1;
    DstBuf_End = DstBuf + DstBuf_Size;
    /* Reject overlapping buffers, and input with no character to repeat. */
    if( (DstBuf < SrcBuf_End && DstBuf_End > SrcStr)
        || isdigit( (unsigned char)SrcStr[0] ) ) {
        return 1;
    }

    while( '\0' != *Src_Ptr && !Err ) {
        /* Bytes still free for characters, keeping one for the terminator. */
        size_t const Room = (size_t)( DstBuf_End - Dst_Ptr ) - 1;

        if( !isdigit( (unsigned char)*Src_Ptr ) ) {
            if( Room < 1 ) {
                Err = 1;
            } else {
                c = *Dst_Ptr = *Src_Ptr;
                ++Src_Ptr; ++Dst_Ptr;
            }
        } else {
            /* Keep the full long result: narrowing it could turn an oversized
               count into a small or negative one that slips past the check. */
            char * Num_End = NULL;
            long const repeat_count = strtol( Src_Ptr, &Num_End, 10 );
            Src_Ptr = Num_End;
            if( repeat_count < 1 || (size_t)( repeat_count - 1 ) > Room ) {
                Err = 1;
            } else {
                /* One copy was already written with the character itself. */
                memset( Dst_Ptr, c, (size_t)( repeat_count - 1 ) );
                Dst_Ptr += repeat_count - 1;
            }
        }
    }
    *Dst_Ptr = '\0';
    return Err;
}
int main() {
char str[] = "aabbbccccddddd";
char buf1[128] = {0};
char buf2[128] = {0};
pack( str, buf1, 128 );
printf( "pack: %s -> %s\n", str, buf1 );
unpack( buf1, buf2, 128 );
printf( "unpack: %s -> %s\n", buf1, buf2 );
return 0;
}
Test: http://ideone.com/Y7FNE3. Also works in MSVC.
Try to make do with less boilerplate:
#include <iostream>
#include <iterator>
#include <sstream>
using namespace std;
// Run-length encodes the characters read through `i` and writes them to `o`.
// Reading stops at the first zero character, so the input sequence must be
// zero-terminated. Every run is written as its character followed by the run
// length, including runs of length one ("aab" becomes "a2b1").
template<typename in_iter, class out_stream>
void torle(in_iter i, out_stream &&o)
{
    for (char current = *i++; current; current = *i++) {
        size_t run = 1;
        // `i` already points past the first character of the run.
        for (; *i == current; ++i)
            ++run;
        o << current << run;
    }
}
// Reads (character, count) pairs from stream `i` with formatted extraction and
// writes each character `count` times through the output iterator `o`.
// Reading stops at the first pair that cannot be extracted.
template<class in_stream, typename out_iter>
void fromrle(in_stream &&i, out_iter o)
{
    char symbol;
    size_t repeats;
    while (i >> symbol >> repeats) {
        for (; repeats != 0; --repeats) {
            *o = symbol;
            ++o;
        }
    }
}
// Reads standard input line by line. For each line it prints the run-length
// encoded form, then decodes that form again and prints the result.
int main()
{
    std::string line;
    while (std::getline(std::cin, line)) {
        // Use a fresh buffer for every line. A stream shared across lines
        // would keep the previous line's text and the failure state left
        // behind by decoding, so later lines would not be converted.
        std::stringstream converted;
        // c_str() gives torle the zero-terminated sequence it reads up to,
        // instead of making it read through the string's end iterator.
        torle(line.c_str(), converted);
        std::cout << converted.str() << '\n';
        fromrle(converted, std::ostream_iterator<char>(std::cout));
        std::cout << '\n';
    }
}