Concurrent file reading / writing far slower than single-threaded - c++

I am writing a program which reads, parses, and writes large (~15Gb) text files in parallel. On a single thread, the program works very quickly, but there is a massive performance drop when I run it concurrently on several jobs (much more than to be expected from multithreading overhead).
The main file-reading/writing function takes an "SRA" object, which has two corresponding files associated with it, and the amount of RAM to dedicate, in bytes, as input. The function then reads the files chunk-buffer-wise and writes two new files that omit "error" information. The two files correspond to one another, with the information therein order-dependent. Their structure is not super important, but they look like this:
#SRR1000000.1 cor
ATATTTACCGGGTTATTCGGATTAGTTTTGGGCCCCCC
+
FFFFF:::F:::F:::,FFFF:::::::FFFFF,,,,F
#SRR1000000.2 error
ATTTTTTTTTCCCCCCGGCGGCGCGGCTTCTCGGTTAA
+
FF:FF:::::F::F::,,,,,:FFFFF,,F,F:,FF:F
#SRR1000000.3
ATTTTTTTTTCCCCCCGGCGGCGCGGCTTCTCGGTTAA
+
FF:FF:::::F::F::,,,,,:FFFFF,,F,F:,FF:F
.
.
.
The "header", beginning with "#", is associated with and identifies the next 3 lines of data, and the number after the "." denotes the order. Both files have these headers.
Below is the file reader, parser, writer function mentioned above:
// File read/parse/write function.
// Takes an SRA object and the amount of RAM (in bytes) to dedicate as input.
// Reads the object's two associated files in chunks sized from that budget
// (half of it per file), and writes two new "fix.fq" files from which every
// record flagged "error" in either input has been dropped from both outputs.
//
// Returns without doing anything if a stream cannot be opened or the
// per-file budget is not positive: in either case the main loop below could
// never reach end-of-file and would spin forever.
// The two read buffers are released before returning.
void rem_unfix(SRA sra, long long int ram_b) {
    // Get in-file / out-file paths
    std::string inFile1Str(sra.get_sra_path_corr().first.c_str());
    std::string inFile2Str(sra.get_sra_path_corr().second.c_str());
    std::string outFile1Str(std::string(sra.get_sra_path_corr().first.replace_extension("fix.fq").c_str()));
    std::string outFile2Str(std::string(sra.get_sra_path_corr().second.replace_extension("fix.fq").c_str()));
    // Open streams for file processing
    std::ifstream inFile1(inFile1Str);
    std::ifstream inFile2(inFile2Str);
    std::ofstream outFile1(outFile1Str);
    std::ofstream outFile2(outFile2Str);
    // Get amount of RAM to dedicate to each file (bytes)
    long long int ram_b_per_file = ram_b / 2;
    // A stream that failed to open never reports eof(), and a zero-byte read
    // never advances, so either condition would hang the loop below.
    if (!inFile1 || !inFile2 || !outFile1 || !outFile2 || ram_b_per_file <= 0) {
        return;
    }
    // Allocate space on heap for the file buffers (freed at the end)
    char * inFile1Data = new char[ram_b_per_file];
    char * inFile2Data = new char[ram_b_per_file];
    // Algorithmic pointer/size variables to aid in parsing
    std::streamsize s1;
    std::streamsize s2;
    char * currPos1;
    char * currPos2;
    char * nlPos1;
    char * nlPos2;
    char * nlPos1Prev;
    char * nlPos2Prev;
    char * writeStart1;
    char * writeStart2;
    char * writeEnd1;
    char * writeEnd2;
    char * inFile1L;
    char * inFile2L;
    // Strings to ensure ends of buffers are lined up before parsing
    std::string readMatch;
    std::string read;
    // Main loop processes files in RAM-determined chunks
    while (!inFile1.eof() || !inFile2.eof()) {
        // Fill buffers, count number of bytes read to account for
        // end-of-file condition
        inFile1.read(inFile1Data, ram_b_per_file);
        inFile2.read(inFile2Data, ram_b_per_file);
        s1 = inFile1.gcount();
        s2 = inFile2.gcount();
        currPos1 = inFile1Data;
        currPos2 = inFile2Data;
        nlPos1 = inFile1Data;
        nlPos2 = inFile2Data;
        writeStart1 = inFile1Data;
        writeStart2 = inFile2Data;
        inFile1L = inFile1Data + s1;
        inFile2L = inFile2Data + s2;
        // Line up the ends of the buffers / ifstreams so that all
        // information is "paired", or that there is no piece of
        // data in one buffer without its partner data in the other
        if (!inFile1.eof() && !inFile2.eof()) {
            // Unget/unstream character-wise until one of the buffers/ifstreams
            // ends just before a header
            while ((inFile1.peek() != '#' && inFile1.peek() != '>') &&
                   (inFile2.peek() != '#' && inFile2.peek() != '>')) {
                if (inFile1.peek() != '#' && inFile1.peek() != '>') {
                    inFile1.unget();
                    inFile1Data[s1 - 1] = '\0';
                    s1--;
                }
                if (inFile2.peek() != '#' && inFile2.peek() != '>') {
                    inFile2.unget();
                    inFile2Data[s2 - 1] = '\0';
                    s2--;
                }
            }
            // If inFile1's buffer was first to get into position:
            // get its header number, get inFile2's last header number,
            // then unstream accordingly until the two buffers end at the
            // same data point in their files.
            if (inFile1.peek() == '#' || inFile1.peek() == '>') {
                inFile1.get();
                inFile1 >> readMatch;
                while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                    inFile1.unget();
                }
                while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                    inFile2.unget();
                    inFile2Data[s2 - 1] = '\0';
                    s2--;
                }
                inFile2.get();
                inFile2 >> read;
                while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                    inFile2.unget();
                }
                if (read > readMatch) {
                    inFile2.unget();
                    while (read.compare(readMatch) != 0) {
                        while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                            inFile2.unget();
                            inFile2Data[s2 - 1] = '\0';
                            s2--;
                        }
                        inFile2.get();
                        inFile2 >> read;
                        while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                            inFile2.unget();
                        }
                        inFile2.unget();
                        inFile2Data[s2 - 1] = '\0';
                        s2--;
                    }
                    inFile2.get();
                }
                else if (read < readMatch) {
                    inFile1.unget();
                    while (read.compare(readMatch) != 0) {
                        while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                            inFile1.unget();
                            inFile1Data[s1 - 1] = '\0';
                            s1--;
                        }
                        inFile1.get();
                        inFile1 >> readMatch;
                        while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                            inFile1.unget();
                        }
                        inFile1.unget();
                        inFile1Data[s1 - 1] = '\0';
                        s1--;
                    }
                    inFile1.get();
                }
                else {
                    // Buffer in position -- do nothing
                }
            }
            // If inFile2's buffer was first to get into position:
            // get its header number, get inFile1's last header number,
            // then unstream accordingly until the two buffers end at the
            // same data point in their files.
            else {
                inFile2.get();
                inFile2 >> readMatch;
                while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                    inFile2.unget();
                }
                while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                    inFile1.unget();
                    inFile1Data[s1 - 1] = '\0';
                    s1--;
                }
                inFile1.get();
                inFile1 >> read;
                while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                    inFile1.unget();
                }
                if (read > readMatch) {
                    inFile1.unget();
                    while (read.compare(readMatch) != 0) {
                        while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                            inFile1.unget();
                            inFile1Data[s1 - 1] = '\0';
                            s1--;
                        }
                        inFile1.get();
                        inFile1 >> read;
                        while (inFile1.peek() != '#' && inFile1.peek() != '>') {
                            inFile1.unget();
                        }
                        inFile1.unget();
                        inFile1Data[s1 - 1] = '\0';
                        s1--;
                    }
                    inFile1.get();
                }
                else if (read < readMatch) {
                    inFile2.unget();
                    while (read.compare(readMatch) != 0) {
                        while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                            inFile2.unget();
                            inFile2Data[s2 - 1] = '\0';
                            s2--;
                        }
                        inFile2.get();
                        inFile2 >> readMatch;
                        while (inFile2.peek() != '#' && inFile2.peek() != '>') {
                            inFile2.unget();
                        }
                        inFile2.unget();
                        inFile2Data[s2 - 1] = '\0';
                        s2--;
                    }
                    inFile2.get();
                }
                else {
                    // Buffer in position -- do nothing
                }
            }
        }
        // With buffers now aligned, parse them and write non-error
        // information into the new files
        while (nlPos1 != inFile1L && nlPos2 != inFile2L) {
            nlPos1Prev = nlPos1;
            nlPos2Prev = nlPos2;
            nlPos1 = std::find(nlPos1 + 1, inFile1L, '\n');
            nlPos2 = std::find(nlPos2 + 1, inFile2L, '\n');
            // A line ending in "error" in either buffer marks a record to drop
            // NOTE(review): nlPos - 5 can point before the buffer start when the
            // first line is shorter than five characters -- confirm inputs.
            if (strncmp(nlPos1 - 5, "error", 5) == 0 ||
                strncmp(nlPos2 - 5, "error", 5) == 0) {
                // Flush everything up to the start of the flagged line...
                writeEnd1 = nlPos1Prev;
                outFile1.write(writeStart1, writeEnd1 - writeStart1);
                writeEnd2 = nlPos2Prev;
                outFile2.write(writeStart2, writeEnd2 - writeStart2);
                // ...then skip the three lines that follow it in each buffer
                for (int i = 0; i < 3; i++) {
                    nlPos1 = std::find(nlPos1 + 1, inFile1L, '\n');
                    nlPos2 = std::find(nlPos2 + 1, inFile2L, '\n');
                }
                writeStart1 = nlPos1;
                writeStart2 = nlPos2;
            }
        }
        // Write whatever remains of this chunk after the last dropped record
        outFile1.write(writeStart1, inFile1Data + s1 - writeStart1);
        outFile2.write(writeStart2, inFile2Data + s2 - writeStart2);
    }
    inFile1.close();
    inFile2.close();
    outFile1.close();
    outFile2.close();
    // Release the chunk buffers; they were previously leaked on every call
    delete[] inFile1Data;
    delete[] inFile2Data;
}
The function, albeit a bit spaghetti-ish, works fine. I run it concurrently using the function below, which relies on a "threadPool" object whose header and implementation files are at the bottom of the question:
// Concurrency function which starts parallel file-pair-processing
// jobs for each **"SRA"** object in its input vector.
void rem_unfix_bulk(std::vector<SRA> sras, int numThreads, int ram_gb) {
long long int ram_b = stoi(ram_gb) * 1000000000;
long long int ram_b_per_thread = ram_b / numThreads;
// Instantiate thread pool with numThreads threads to handle jobs
threadPool fileCorrPool;
fileCorrPool.start(numThreads);
// Add queue jobs to thread pool
for (auto sra : sras) {
if (fs::exists(sra.get_sra_path_corr_fix().first.c_str())) {
continue;
}
fileCorrPool.queueJob([sra, ram_b_per_thread {rem_unfix_pe(sra, ram_b_per_thread);});
}
fileCorrPool.stop();
}
threadPool object header file (thread_pool.h):
#include <thread>
#include <mutex>
#include <functional>
#include <vector>
#include <queue>
#include <condition_variable>
// Fixed-size pool of worker threads that run queued std::function<void()> jobs.
class threadPool {
public:
// Launches threadNum worker threads, each running threadLoop().
void start(int threadNum);
// Appends a job to the queue and wakes one waiting worker.
void queueJob(const std::function<void()>& job);
// Sets the shutdown flag, wakes every worker and joins them all.
void stop();
// Inspects the pending-job queue while holding the queue lock.
bool busy();
private:
// Worker body: waits for a job or the shutdown flag, then runs the job.
void threadLoop();
// Shutdown flag; read and written under queue_mutex.
bool endProc = false;
// Guards jobs and endProc.
std::mutex queue_mutex;
// Signalled when a job is queued or shutdown is requested.
std::condition_variable mutex_condition;
// The worker threads created by start().
std::vector<std::thread> threads;
// Jobs waiting to be picked up by a worker, in FIFO order.
std::queue<std::function<void()>> jobs;
};
threadPool object implementation file (thread_pool.cpp):
#include "thread_pool.h"
// Creates threadNum workers, each executing threadLoop() on this pool.
void threadPool::start(int threadNum) {
    // Size the container first, then move a running thread into every slot.
    threads.resize(threadNum);
    for (std::thread& worker : threads) {
        worker = std::thread(&threadPool::threadLoop, this);
    }
}
void threadPool::threadLoop() {
while (true) {
std::function<void()> job;
{
std::unique_lock<std::mutex> lock(queue_mutex);
mutex_condition.wait(lock, [this] {
return (!jobs.empty() || endProc);
});
if (jobs.empty() && endProc) {
return;
}
job = jobs.front();
jobs.pop();
}
job();
}
}
// Adds a job to the back of the queue and wakes one waiting worker.
void threadPool::queueJob(const std::function<void()>& job) {
    std::unique_lock<std::mutex> guard(queue_mutex);
    jobs.push(job);
    // Release the lock before notifying so the woken worker can take it
    guard.unlock();
    mutex_condition.notify_one();
}
// Returns true while jobs are still waiting in the queue.
// (The previous version returned jobs.empty(), i.e. it reported "busy"
// exactly when the queue was empty.)
// Only queued jobs are considered; a job a worker has already dequeued and
// is still executing is not counted.
bool threadPool::busy() {
    std::unique_lock<std::mutex> lock(queue_mutex);
    return !jobs.empty();
}
// Requests shutdown, wakes all workers and waits for each of them to exit.
void threadPool::stop() {
    std::unique_lock<std::mutex> guard(queue_mutex);
    endProc = true;
    guard.unlock();
    mutex_condition.notify_all();
    // Workers exit once the queue is empty; wait for every one of them
    for (auto it = threads.begin(); it != threads.end(); ++it) {
        it->join();
    }
    threads.clear();
}

Related

C++ peek giving value 'ÿ' (ifstream)

My code first of all:
// Opens "<name>.txt" and scans it for '#' markers in order to pick up
// per-level scores; returns highScoreLvl1.
// NOTE(review): highScoreLvl1/2/3 are not declared in this function, so they
// must come from an enclosing scope that is not visible here.
int GetHighScore(string name)
{
int highScore = 0;
ifstream fin;
// NOTE(review): the trailing "1" after the semicolon looks like a stray character.
char textInFile[50];1
fin.open(name + ".txt", ios::in);
if (fin.fail())
{
// Old piece of code
highScore = 0;
}
else
{
while (fin.good())
{
// Reads the next whitespace-delimited token from the file into the buffer
fin >> textInFile;
// NOTE(review): "for each (... in ...)" is not standard C++ -- presumably a
// compiler extension; it walks all 50 elements of the buffer.
for each (char var in textInFile)
{
if (var == '#')
{
// peek() looks at the next unread character of the *stream*, not at the
// character that follows '#' inside textInFile.
char c = fin.peek();
if (c == '1')
{
// peek() does not consume, so this returns the same character as c
char score = fin.peek();
highScoreLvl1 = (int)score;
}
else if (c == '2')
{
char score = fin.peek();
highScoreLvl2 = (int)score;
}
else if (c == '3')
{
char score = fin.peek();
highScoreLvl3 = (int)score;
}
}
}
}
//fin >> highScore;
}
// Return the high score found in the file
return highScoreLvl1;
}
It detects the '#', but then c gets assigned the value 'ÿ' when it performs the peek operation. What it should give is the number '1', '2' or '3' (in char form); but it doesn't for some reason, and I can't see why... :/
Here's what the file looks like:
level#12level#22level#32
The first number represents the level, and the second number is the score achieved on that level.
If your file contains only the string 'level#12level#22level#32', the whole string is read into textInFile by the fin >> textInFile operation. When you then meet a '#' character in that string and try to peek at the next character of the file stream, there is nothing left to peek, which is why -1 (end of file) is returned; stored in a char, -1 is displayed as 'ÿ'.
To fix this you need to take next character from textInFile string, not from the file. Here is example code:
// Reads "<name>.txt" and returns the score recorded for level 1.
//
// The file is expected to contain "#<level><score>" markers, e.g.
// "level#12level#22level#32", where <level> is '1'..'3' and <score> is a
// single digit. Returns 0 when the file cannot be opened or holds no
// level-1 marker.
//
// Compared with the earlier version: the per-level scores start at 0 instead
// of being left uninitialized, tokens are read into a std::string instead of
// an unbounded write into a 50-byte array, and the score is stored as the
// digit's numeric value rather than its character code.
int GetHighScore(std::string name)
{
    int scores[3] = {0, 0, 0};   // scores for levels 1..3

    std::ifstream fin(name + ".txt", std::ios::in);
    if (fin.fail())
    {
        return 0;
    }

    std::string token;
    // Stops cleanly at end of file, with no extra pass over stale data
    while (fin >> token)
    {
        for (std::size_t i = 0; i + 2 < token.size(); ++i)
        {
            if (token[i] != '#')
            {
                continue;
            }
            const char level = token[i + 1];
            const char score = token[i + 2];
            if (level < '1' || level > '3' || score < '0' || score > '9')
            {
                continue;   // not a well-formed "#<level><score>" marker
            }
            scores[level - '1'] = score - '0';
        }
    }

    // Only the level-1 score is returned, as before.
    return scores[0];
}
And there are several other issues with your code:
You return the value of highScoreLvl1, which may be left uninitialized because the string might not contain any '#'. You probably meant to return the maximum of highScoreLvl1, highScoreLvl2 and highScoreLvl3.
You're assigning the value of a char converted to int. That does not give you the values 1, 2, etc.; you get the character's ASCII code, e.g. 0x31 (49) for '1', 0x32 (50) for '2', etc. If you need the digit's value, you can use the following trick: highScoreLvl1 = textInFile[i + 2] - '0';

Using a loop with std::strcmp to load lots of settings

In my game I keep track of unlocked levels with a vector std::vector<bool> lvlUnlocked_;.
The simple function to save the progress is this:
// Writes one "lvl<i>=<0|1>" line per entry of lvlUnlocked_ to ./progress.txt,
// replacing the file's previous contents. Does nothing if the file cannot be
// opened for writing.
void save() {
    std::ofstream ofile("./progress.txt");
    if (!ofile.good()) {
        return;
    }
    // The loop bound previously used a differently spelled name
    // (levelUnlocked_) than the vector actually being read.
    for (std::size_t i = 0; i < lvlUnlocked_.size(); ++i) {
        ofile << "lvl" << i << "=" << (lvlUnlocked_.at(i) ? "1" : "0") << '\n';
    }
    // ofile is flushed and closed by its destructor
}
This works and is nice since I can just use a loop to dump the info.
Now to the part where I am stuck, the lower part of my load function (see comment in code below):
void load() {
std::ifstream ifile("./progress.txt");
if (ifile.good()) {
int begin;
int end;
std::string line;
std::string stringKey = "";
std::string stringValue = "";
unsigned int result;
while (std::getline(ifile, line)) {
stringKey = "";
stringValue = "";
for (unsigned int i = 0; i < line.length(); i++) {
if (line.at(i) == '=') {
begin = i + 1;
end = line.length();
break;
}
}
for (int i = 0; i < begin - 1; i++) {
stringKey += line.at(i);
}
for (int i = begin; i < end; i++) {
stringValue += line.at(i);
}
result = static_cast<unsigned int>(std::stoi(stringValue));
// usually I now compare the value and act accordingly, like so:
if (std::strcmp(stringKey.c_str(), "lvl0") == 0) {
lvlUnlocked_.at(0) = true;
} else if (std::strcmp(stringKey.c_str(), "lvl1") == 0) {
lvlUnlocked_.at(1) = true;
} else if (std::strcmp(stringKey.c_str(), "lvl2") == 0) {
lvlUnlocked_.at(2) = true;
}
// etc....
}
}
}
This works fine, but...
the problem is that I have 100+ levels and I want it to be dynamic based on the size of my lvlUnlocked_ vector instead of having to type it all like in the code above.
Is there a way to somehow make use of a loop like in my save function to check all levels?
If you parse your key to extract a suitable integer value, you can just index into the bit-vector with that:
// For every "lvl<N>=<value>" line, set lvlUnlocked_[N] to (value == "1").
// Lines that do not have that shape, or whose N is out of range, are ignored.
while (std::getline(ifile, line)) {
    const size_t eq = line.find('=');
    if (eq == std::string::npos)
        // no equals sign
        continue;
    auto stringKey = line.substr(0, eq);
    auto stringValue = line.substr(eq+1);
    if (stringKey.compare(0, 3, "lvl") != 0)
        // doesn't begin with lvl
        continue;
    // strip off "lvl"
    stringKey = stringKey.substr(3);
    // std::stoi throws on a non-numeric or oversized key instead of reporting
    // it through its position argument, so validate and convert by hand.
    if (stringKey.empty() || stringKey.find_first_not_of("0123456789") != std::string::npos)
        // not a valid level number
        continue;
    std::vector<bool>::size_type index = 0;
    bool inRange = true;
    for (const char digit : stringKey) {
        index = index * 10 + static_cast<std::vector<bool>::size_type>(digit - '0');
        if (index >= lvlUnlocked_.size()) {
            // out of range (stopping here also prevents overflow on long keys)
            inRange = false;
            break;
        }
    }
    if (!inRange)
        continue;
    // Set it :-)
    lvlUnlocked_[index] = stringValue=="1";
}
(I've also updated your parsing for "key=value" strings to more idiomatic C++.)

Interview: Machine coding / regex (Better alternative to my solution)

The following is the interview question:
Machine coding round: (Time 1hr)
Expression is given and a string testCase, need to evaluate the testCase is valid or not for expression
Expression may contain:
letters [a-z]
'.' ('.' represents any char in [a-z])
'*' ('*' has same property as in normal RegExp)
'^' ('^' represents start of the String)
'$' ('$' represents end of String)
Sample cases:
Expression Test Case Valid
ab ab true
a*b aaaaaab true
a*b*c* abc true
a*b*c aaabccc false
^abc*b abccccb true
^abc*b abbccccb false
^abcd$ abcd true
^abc*abc$ abcabc true
^abc.abc$ abczabc true
^ab..*abc$ abyxxxxabc true
My approach:
Convert the given regular expression into concatenation (ab), alternation (a|b) and Kleene star (a*).
And add + for concatenation.
For example:
abc$ => .*+a+b+c
^ab..*abc$ => a+b+.+.*+a+b+c
Convert into postfix notation based on precedence.
(parentheses > Kleene star > concatenation > ...)
(a|b)*+c => ab|*c+
Build NFA based on Thompson construction
Backtracking / traversing through NFA by maintaining a set of states.
When I started implementing it, it took me a lot more than 1 hour. I felt that the step 3 was very time consuming. I built the NFA by using postfix notation +stack and by adding new states and transitions as needed.
So, I was wondering whether there is a faster alternative solution to this question, or maybe a faster way to implement step 3. I found this CareerCup link where someone mentioned in a comment that it was from a programming contest. So if someone has solved this before or has a better solution to this question, I'd be happy to know where I went wrong.
Some derivation of Levenshtein distance comes to mind - possibly not the fastest algorithm, but it should be quick to implement.
We can ignore ^ at the start and $ at the end - anywhere else is invalid.
Then we construct a 2D grid where each row represents a unit [1] in the expression and each column represents a character in the test string.
[1]: A "unit" here refers to a single character, with the exception that * shall be attached to the previous character
So for a*b*c and aaabccc, we get something like:
a a a b c c c
a*
b*
c
Each cell can have a boolean value indicating validity.
Now, for each cell, set it to valid if either of these hold:
The value in the left neighbour is valid and the row is x* or .* and the column is x (x being any character a-z)
This corresponds to a * matching one additional character.
The value in the upper-left neighbour is valid and the row is x or . and the column is x (x being any character a-z)
This corresponds to a single-character match.
The value in the top neighbour is valid and the row is x* or .*.
This corresponds to the * matching nothing.
Then check if the bottom-right-most cell is valid.
So, for the above example, we get: (V indicating valid)
a a a b c c c
a* V V V - - - -
b* - - - V - - -
c - - - - V - -
Since the bottom-right cell isn't valid, we return invalid.
Running time: O(stringLength*expressionLength).
You should notice that we're mostly exploring a fairly small part of the grid.
This solution can be improved by making it a recursive solution making use of memoization (and just calling the recursive solution for the bottom-right cell).
This will give us a best-case performance of O(1), but still a worst-case performance of O(stringLength*expressionLength).
My solution assumes the expression must match the entire string, as inferred from the result of the above example being invalid (as per the question).
If it can instead match a substring, we can modify this slightly so, if the cell is in the top row it's valid if:
The row is x* or .*.
The row is x or . and the column is x.
Given only 1 hour we can use simple way.
Split pattern into tokens: a*b.c => { a* b . c }.
If pattern doesn't start with ^ then add .* in the beginning, else remove ^.
If pattern doesn't end with $ then add .* in the end, else remove $.
Then we use recursion: going 3 way in case if we have recurring pattern (increase pattern index by 1, increase word index by 1, increase both indices by 1), going one way if it is not recurring pattern (increase both indices by 1).
Sample code in C#
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
namespace ReTest
{
    /// <summary>
    /// Tiny regular-expression matcher supporting letters a-z, '.', '*',
    /// a leading '^' and a trailing '$'. Without '^' / '$' the pattern may
    /// match anywhere in the input (an implicit ".*" is added on that side).
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Debug.Assert(IsMatch("ab", "ab") == true);
            Debug.Assert(IsMatch("aaaaaab", "a*b") == true);
            Debug.Assert(IsMatch("abc", "a*b*c*") == true);
            Debug.Assert(IsMatch("aaabccc", "a*b*c") == true); /* original false, but it should be true */
            Debug.Assert(IsMatch("abccccb", "^abc*b") == true);
            Debug.Assert(IsMatch("abbccccb", "^abc*b") == false);
            Debug.Assert(IsMatch("abcd", "^abcd$") == true);
            Debug.Assert(IsMatch("abcabc", "^abc*abc$") == true);
            Debug.Assert(IsMatch("abczabc", "^abc.abc$") == true);
            Debug.Assert(IsMatch("abyxxxxabc", "^ab..*abc$") == true);
            // A starred token may match zero characters, also at the very end
            Debug.Assert(IsMatch("b", "^a*b$") == true);
            Debug.Assert(IsMatch("", "^a*b*$") == true);
            Debug.Assert(IsMatch("ab", "^a$") == false);
        }

        /// <summary>
        /// Returns true when <paramref name="input"/> matches <paramref name="pattern"/>.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// The pattern contains a character outside the supported set, a '^'
        /// that is not first, a '$' that is not last, or a '*' with nothing
        /// to repeat.
        /// </exception>
        static bool IsMatch(string input, string pattern)
        {
            List<PatternToken> patternTokens = new List<PatternToken>();
            for (int i = 0; i < pattern.Length; i++)
            {
                char token = pattern[i];
                if (token == '^')
                {
                    if (i == 0)
                        patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
                    else
                        throw new ArgumentException("input");
                }
                else if (char.IsLower(token) || token == '.')
                {
                    // A following '*' belongs to this token
                    if (i < pattern.Length - 1 && pattern[i + 1] == '*')
                    {
                        patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Multiple });
                        i++;
                    }
                    else
                        patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
                }
                else if (token == '$')
                {
                    if (i == pattern.Length - 1)
                        patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
                    else
                        throw new ArgumentException("input");
                }
                else
                    throw new ArgumentException("input");
            }

            // Replace the anchors: a present anchor is dropped, a missing one
            // becomes ".*". Count is checked first because the list may be
            // empty (pattern "" or "^"), where First()/Last() would throw.
            if (patternTokens.Count > 0 && patternTokens[0].Token == '^')
                patternTokens.RemoveAt(0);
            else
                patternTokens.Insert(0, new PatternToken { Token = '.', Occurence = Occurence.Multiple });

            if (patternTokens.Count > 0 && patternTokens[patternTokens.Count - 1].Token == '$')
                patternTokens.RemoveAt(patternTokens.Count - 1);
            else
                patternTokens.Add(new PatternToken { Token = '.', Occurence = Occurence.Multiple });

            return IsMatch(input, 0, patternTokens, 0);
        }

        /// <summary>
        /// Backtracking matcher: true when pattern[patternIndex..] matches
        /// exactly input[inputIndex..].
        /// </summary>
        static bool IsMatch(string input, int inputIndex, IList<PatternToken> pattern, int patternIndex)
        {
            // Pattern exhausted: success only if the input is exhausted too
            if (patternIndex == pattern.Count)
                return inputIndex == input.Length;

            PatternToken patternToken = pattern[patternIndex];
            bool headMatches = inputIndex < input.Length
                && (patternToken.Token == '.' || patternToken.Token == input[inputIndex]);

            if (patternToken.Occurence == Occurence.Multiple)
            {
                // Either the starred token matches nothing (skip it), or it
                // consumes one character and stays available for more.
                return IsMatch(input, inputIndex, pattern, patternIndex + 1)
                    || (headMatches && IsMatch(input, inputIndex + 1, pattern, patternIndex));
            }

            return headMatches && IsMatch(input, inputIndex + 1, pattern, patternIndex + 1);
        }

        /// <summary>One pattern unit: a character plus its repetition kind.</summary>
        class PatternToken
        {
            public char Token { get; set; }
            public Occurence Occurence { get; set; }

            public override string ToString()
            {
                if (Occurence == Occurence.Single)
                    return Token.ToString();
                else
                    return Token.ToString() + "*";
            }
        }

        /// <summary>Whether a token matches exactly once or zero-or-more times.</summary>
        enum Occurence
        {
            Single,
            Multiple
        }
    }
}
Here is a solution in Java. Space and Time is O(n). Inline comments are provided for more clarity:
/**
 * Checks whether a test string matches an expression built from letters
 * [a-z], '.' (any letter a-z), '*' (zero or more of the preceding unit), an
 * optional leading '^' and an optional trailing '$'.
 *
 * The expression always has to match the whole test string, so the anchors
 * are accepted but do not change the result.
 *
 * @author Santhosh Kumar
 */
public class ExpressionProblemSolution {
    public static void main(String[] args) {
        System.out.println("---------- ExpressionProblemSolution - start ---------- \n");
        ExpressionProblemSolution evs = new ExpressionProblemSolution();
        evs.runMatchTests();
        System.out.println("\n---------- ExpressionProblemSolution - end ---------- ");
    }

    /** One unit of the expression: a letter or '.', optionally followed by '*'. */
    private static final class Term {
        final char ch;
        final boolean starred;

        Term(char ch, boolean starred) {
            this.ch = ch;
            this.starred = starred;
        }

        @Override
        public String toString() {
            return "[ch=" + ch + ", starred=" + starred + "]";
        }
    }

    private boolean letters(char ch) {
        return (ch >= 'a' && ch <= 'z');
    }

    private boolean specialChars(char ch) {
        return (ch == '.' || ch == '^' || ch == '*' || ch == '$');
    }

    private void validate(String expression) {
        // if expression has invalid chars throw runtime exception
        if (expression == null) {
            throw new RuntimeException(
                    "Expression can't be null, but it can be empty");
        }
        char[] expr = expression.toCharArray();
        for (int i = 0; i < expr.length; i++) {
            if (!letters(expr[i]) && !specialChars(expr[i])) {
                throw new RuntimeException(
                        "Expression contains invalid char at position=" + i
                                + ", invalid_char=" + expr[i]
                                + " (allowed chars are 'a-z', *, . ^, * and $)");
            }
        }
    }

    // Parse the expression into its terms.
    //
    // expression = a*b*c   has 3 terms -> [a*] [b*] [c]
    // expression = ^ab.*c$ has 4 terms -> [a] [b] [.*] [c]
    //
    // Timing : O(n) n -> expression length
    // Space  : O(n) n -> expression length
    //
    // Throws RuntimeException for characters outside the allowed set, a '^'
    // that is not first, a '$' that is not last, a '*' with nothing to
    // repeat, or an expression consisting of '^' alone.
    private java.util.List<Term> preprocess(String expression) {
        debug("preprocess - start [" + expression + "]");
        validate(expression);
        char[] expr = expression.toCharArray();
        if (expr.length == 1 && expr[0] == '^') {
            throw new RuntimeException(
                    "Expression missing after ^ (position=0, char=^)");
        }
        // Strip the anchors; everything in between must be plain terms
        int start = 0;
        int end = expr.length;
        if (start < end && expr[start] == '^') {
            start++;
        }
        if (end > start && expr[end - 1] == '$') {
            end--;
        }
        java.util.List<Term> terms = new java.util.ArrayList<>();
        int i = start;
        while (i < end) {
            debug("i=" + i);
            char c = expr[i];
            if (c == '^') {
                throw new RuntimeException(
                        "Special char ^ should be present only at the first position of the expression (position="
                                + i + ", char=" + c + ")");
            }
            if (c == '$') {
                throw new RuntimeException(
                        "Special char $ should be present only at the last position of the expression (position="
                                + i + ", char=" + c + ")");
            }
            if (!letters(c) && c != '.') { // a '*' with nothing in front of it
                throw new RuntimeException("Invalid char - (position=" + (i)
                        + ", char=" + c + ")");
            }
            boolean starred = i + 1 < end && expr[i + 1] == '*';
            terms.add(new Term(c, starred));
            i += starred ? 2 : 1;
        }
        debug("preprocess - end");
        return terms;
    }

    // True when the term accepts the given input character.
    private boolean accepts(Term term, char c) {
        return term.ch == '.' ? letters(c) : term.ch == c;
    }

    // Match the test string against the expression.
    //
    // Uses dynamic programming over (terms consumed, characters consumed):
    // reachable[j] is true when the terms handled so far can match exactly
    // the first j characters of the input. This tries every split a '*' could
    // make, so expressions such as "a*ab" or "a.*" are handled, and it never
    // reads past the end of the input.
    //
    // Timing : O(m * n) m -> number of terms, n -> input length
    // Space  : O(n)
    //
    // An expression without any term matches nothing (returns false).
    public boolean process(String expression, String testString) {
        java.util.List<Term> terms = preprocess(expression);
        print(terms);
        if (terms.isEmpty())
            return false;
        char[] test = testString.toCharArray();
        int n = test.length;
        debug("input-string-length=" + n);
        boolean[] reachable = new boolean[n + 1];
        reachable[0] = true; // no terms match the empty prefix
        for (Term term : terms) {
            boolean[] next = new boolean[n + 1];
            if (term.starred) {
                for (int j = 0; j <= n; j++) {
                    // zero occurrences, or one more occurrence after a match
                    // that already ended at j - 1
                    next[j] = reachable[j]
                            || (j > 0 && next[j - 1] && accepts(term, test[j - 1]));
                }
            } else {
                for (int j = 1; j <= n; j++) {
                    next[j] = reachable[j - 1] && accepts(term, test[j - 1]);
                }
            }
            reachable = next;
        }
        // All terms used and the whole input consumed
        return reachable[n];
    }

    public void debug(Object str) {
        boolean debug = false;
        if (debug) {
            System.out.println("[debug] " + str);
        }
    }

    private void print(java.util.List<Term> terms) {
        StringBuilder sb = new StringBuilder();
        for (Term term : terms) {
            sb.append(term + " ");
        }
        sb.append("\n");
        debug(sb.toString());
    }

    public boolean match(String expr, String input) {
        boolean result = process(expr, input);
        System.out.printf("\n%-20s %-20s %-20s\n", expr, input, result);
        return result;
    }

    public void runMatchTests() {
        match("ab", "ab");
        match("a*b", "aaaaaab");
        match("a*b*c*", "abc");
        match("a*b*c", "aaabccc");
        match("^abc*b", "abccccb");
        match("^abc*b", "abccccbb");
        match("^abcd$", "abcd");
        match("^abc*abc$", "abcabc");
        match("^abc.abc$", "abczabc");
        match("^ab..*abc$", "abyxxxxabc");
        match("a*b*", ""); // handles empty input string
        match("xyza*b*", "xyz");
    }
}
/*
 * Helper: returns 1 when the expression `re` matches the WHOLE of `text`,
 * 0 otherwise. A '$' as the very last expression character is accepted as an
 * end anchor. A "x*" unit is tried with zero occurrences first and then with
 * one more occurrence at a time (backtracking).
 */
static int regex_match_here(const char *re, const char *text)
{
    /* End of expression (optionally a trailing '$'): text must be finished */
    if (re[0] == '\0' || (re[0] == '$' && re[1] == '\0'))
        return *text == '\0';

    if (re[1] == '*') {
        for (;;) {
            if (regex_match_here(re + 2, text))
                return 1;
            /* The starred unit cannot consume another character */
            if (*text == '\0' || (re[0] != '.' && re[0] != *text))
                return 0;
            text++;
        }
    }

    /* Single unit: must consume exactly one matching character */
    if (*text != '\0' && (re[0] == '.' || re[0] == *text))
        return regex_match_here(re + 1, text + 1);
    return 0;
}

/*
 * Returns 1 if `test` matches the expression `reg`, 0 otherwise.
 *
 * Supported syntax: literal characters, '.' (any single character), '*'
 * (zero or more of the preceding unit), an optional leading '^' and an
 * optional trailing '$'. The expression has to match the entire test
 * string. A null pointer for either argument yields 0.
 *
 * Replaces the earlier state-machine version, which read one element before
 * the start of both strings, treated leftover expression text as a match and
 * printed debugging output.
 */
int regex_validate(char *reg, char *test) {
    if (!reg || !test)
        return 0;
    if (*reg == '^')
        reg++;
    return regex_match_here(reg, test);
}
/* Runs regex_validate over a fixed list of (expression, test string) pairs
 * and prints one "regex=<result>" line per pair. */
int main () {
    char *cases[][2] = {
        { "ab",        "ab"       },
        { "a*b",       "aaaaaab"  },
        { "^abc.abc$", "abcdabc"  },
        { "^abc*abc$", "abcabc"   },
        { "^abc*b",    "abccccb"  },
        { "^abc*b",    "abbccccb" }
    };
    int i;
    for (i = 0; i < (int)(sizeof cases / sizeof cases[0]); i++) {
        printf("regex=%d\n", regex_validate(cases[i][0], cases[i][1]));
    }
    return 0;
}

read parameters from txt in Linux

I need to read parameters for my program from a txt file on Linux. The problem is that some of the parameters read from the file have the correct value, but others have a wrong value. Has anybody run into this problem? I have already converted the file from Windows to Linux format with the dos2unix command. I need your help, thanks.
The read function is as follows:
// True for characters that can be part of a variable name (letters, digits, '_').
static bool ReadFileVarIsNameChar(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9') || c == '_';
}

// Looks up a numeric variable in a previously opened parameter file.
//
// inClientFile - pointer to the previously opened file stream
// var_name     - name of the variable to look for
// var          - receives the value, converted with atof() and cast to T
//
// Returns 0 on success and leaves the stream rewound to the beginning.
// If the variable is not found, a message is written to cerr and a
// const char* is thrown.
//
// Fixes over the earlier version:
//  - the stream state is cleared before seeking, so a lookup that reached
//    end of file (or failed) no longer breaks the following lookups;
//  - only lines that actually start with "//" are treated as comments;
//  - the name must match as a whole word, so looking up "nz" no longer picks
//    up the value of e.g. "xnz";
//  - a '-' directly in front of the number is honoured.
// Lines longer than 511 characters still end the scan (fixed line buffer).
template <class T>int ReadFileVar(std::ifstream *inClientFile, const char var_name[], T *var)
{
    const int length_var_name = (int) strlen(var_name);
    char line[512];

    // Always scan from the top; seekg() does nothing while failbit is set
    inClientFile->clear();
    inClientFile->seekg(0, std::ios_base::beg);

    while (inClientFile->getline(line, 512))
    {
        if (line[0] == '/' && line[1] == '/')
            continue;   // comment line

        for (int i = 0; line[i] != '\0'; i++)
        {
            if (strncmp(&line[i], var_name, length_var_name) != 0)
                continue;
            // Reject matches inside a longer identifier
            if (i > 0 && ReadFileVarIsNameChar(line[i - 1]))
                continue;
            if (ReadFileVarIsNameChar(line[i + length_var_name]))
                continue;

            // The value starts at the first digit after the name
            for (int j = i + length_var_name; line[j] != '\0'; j++)
            {
                if (line[j] >= '0' && line[j] <= '9')
                {
                    const int start = (j > i + length_var_name && line[j - 1] == '-') ? j - 1 : j;
                    *var = (T) atof(&line[start]);
                    inClientFile->clear();
                    inClientFile->seekg(0, std::ios_base::beg); //back to the beginning of the file
                    return 0;
                }
            }
        }
    }
    std::cerr << var_name << " - cannot be found" << std::endl;
    throw "error reading input data from: ";
}
For example:
the parameters in the txt file are as follows (they are all of type long):
nx=100;
ny=100;
nz=100;
ipro=1;
jpro=1;
kpro=1;
but after reading the txt in my program I get these,
nx=100;
ny=100;
nz=15;
ipro=1;
jpro=1;
kpro=100;
I have tested the program under Windows, there it works!
Your code works for me, you must have an error somewhere else or an undefined behavior I didn't spot.
May I suggest a more C++ way to do exactly the same thing :
// Returns the value of var_name from a "name=value" parameter file.
//
// Every lookup scans the file from the top and, on success, leaves the
// stream rewound to the beginning. The stream state is cleared before each
// seek, because seekg() has no effect on a stream whose failbit is set (for
// example after a previous lookup ran off the end of the file).
// Only lines starting with "//" are skipped as comments.
// NOTE: var_name is located with a plain substring search, so it should not
// be a substring of another name that appears earlier in the file.
//
// Throws std::exception if the variable is missing, has no '=' after it, or
// its value cannot be read as a T.
template <class T>
T ReadFileVar(std::ifstream& inClientFile, std::string var_name)
{
    inClientFile.clear();
    inClientFile.seekg(0, std::ios_base::beg);

    std::string line;
    while (std::getline(inClientFile, line))
    {
        if (line.compare(0, 2, "//") == 0)
            continue;   // comment line

        std::size_t pos = line.find(var_name);
        if (pos == std::string::npos)
            continue;

        pos = line.find('=', pos + 1);
        if (pos == std::string::npos) {
            throw std::exception();
        }
        std::istringstream iss(line.substr(pos + 1));
        T result;
        if (!(iss >> result)) {
            throw std::exception();   // value is not a valid T
        }
        inClientFile.clear();
        inClientFile.seekg(0, std::ios_base::beg);
        return result;
    }
    throw std::exception();
}
You could also parse the whole file and store the result in a map instead of searching the whole file for each variable :
// Reads every "name=value" line of the stream into a map (name -> value text).
//
// Blank lines and lines starting with "//" are skipped. Any other line
// without '=' makes the function throw std::exception. The value is stored
// exactly as written after the '=' (including e.g. a trailing ';').
std::map<std::string, std::string> ParseFile(std::ifstream& inClientFile) {
    std::map<std::string, std::string> result;
    std::string line;
    while (std::getline(inClientFile, line))
    {
        if (line.find_first_not_of(" \t\r") == std::string::npos)
            continue;   // blank line
        if (line.compare(0, 2, "//") == 0)
            continue;   // comment line

        const std::size_t pos = line.find('=');
        if (pos == std::string::npos) {
            throw std::exception();
        }
        result[line.substr(0, pos)] = line.substr(pos + 1);
    }
    return result;
}
// Converts the stored text of var_name to a T.
//
// data     - map of variable name -> value text
// var_name - name to look up
//
// Throws std::exception if the name is not in the map or its text cannot be
// read as a T (previously an unreadable value returned an uninitialized T).
// The map is taken by const reference so it is not copied on every call.
template <class T>
T ReadVar(const std::map<std::string, std::string>& data, const std::string& var_name)
{
    const std::map<std::string, std::string>::const_iterator it = data.find(var_name);
    if (it == data.end()) {
        throw std::exception();
    }
    std::istringstream iss(it->second);
    T result;
    if (!(iss >> result)) {
        throw std::exception();
    }
    return result;
}

How do I store a stream of characters in an array?

I have a stream of characters coming over the serial port like this;
FILE1,FILE2,FILE3,
I'm trying to read them in like this;
// Storage for up to 20 comma-separated names of at most 99 characters each.
// The previous declaration (char* myFiles[20]) only provided 20 uninitialized
// pointers, so every write below went to an arbitrary address.
// Zero-initialized so every entry is always a valid (possibly empty) string.
char myFiles[20][100] = {{0}};
boolean done = false;
int fileNum = 0;
int charPos = 0;
char character;
// Read characters until end of line, splitting names at ','
while (!done)
{
    if (Serial.available())
    {
        character = Serial.read();
        if ((character == '\n') || (character == '\r'))
        {
            done = true;
        }
        else if (character == ',')
        {
            myFiles[fileNum][charPos] = '\0';
            fileNum++;
            charPos = 0;
            // Stop once the table is full instead of writing past it
            if (fileNum >= (int)(sizeof(myFiles) / sizeof(myFiles[0])))
            {
                done = true;
            }
        }
        else if (charPos < (int)sizeof(myFiles[0]) - 1)
        {
            // Characters beyond the per-name capacity are dropped
            myFiles[fileNum][charPos] = character;
            charPos++;
        }
    }
}
when I try to print the first value like this;
Serial.println(myFiles[0]);
i get a continuous stream of characters.
What am i doing wrong?
What you are doing wrong is not allocating any memory for your strings.
Here's one way to do this
#include <vector>
#include <string>
std::vector<std::string> myFiles;
std::string file;
bool done = false;
char character;
while (!done)
{
if (Serial.available())
{
character = Serial.read();
if ((character == '\n') || (character == '\r'))
{
done = true;
}
else if (character == ',')
{
myfiles.push_back(file);
file = "";
}
else
{
file += character;
}
}
}
Serial.println(myFiles[0].c_str());
Since you are programming in C++ you should learn how to use std::vector and std::string, they will save you a lot of grief.
If std::vector and std::string are not available to you (apparently so on Arduino) then the quick hack would be to preallocate a fixed amount of memory for your strings by replacing
char* myFiles[20];
with
char myFiles[20][100];