How do I test this WML parser? - regex

So I had an assignment in which I was supposed to make a WML parser that obeys the following grammar rules:
Here are the grammar rules
The professor posted the solution and I'm trying to test it inside IntelliJ, but it keeps giving me an error. Here's the code:
import WML.args
import scala.util.parsing.combinator._
import scala.io.Source
// -------------------------------------------------------------------------
// These classes form our AST
/** Base type of every AST node; `kind` is the label used when the node is printed. */
abstract class ASTNode(val kind: String)

// Outermost program structure: prints its label followed by its children.
case class ASTProgram(outer: List[ASTNode]) extends ASTNode("PROGRAM") {
  override def toString: String = s"$kind ( ${outer.mkString(" ")} )"
}

// The other non-terminals

/** A template invocation: the name text followed by its arguments. */
case class ASTInvoke(name: ASTItext, targs: ASTTargs) extends ASTNode("INVOKE") {
  override def toString: String = s"$kind ( $name $targs )"
}

/** The argument list of an invocation. */
case class ASTTargs(targs: List[ASTItext]) extends ASTNode("TARGS") {
  override def toString: String = s"$kind ( ${targs.mkString(" ")} )"
}

/** Text appearing inside an invocation. */
case class ASTItext(itext: List[ASTNode]) extends ASTNode("ITEXT") {
  override def toString: String = s"$kind ( ${itext.mkString(" ")} )"
}

/**
 * A template variable. `opt` is the optional default text and is `null` when absent.
 * The printed form shows the fixed token label VNAME rather than the variable's name.
 */
case class ASTTvar(name: String, opt: ASTItext) extends ASTNode("TVAR") {
  override def toString: String = {
    val defaultPart = Option(opt).fold("")(o => " " + o.toString)
    s"$kind ( VNAME$defaultPart )"
  }
}

/** A template definition: name, parameter list and body. */
case class ASTTdef(name: ASTDtext, dparams: ASTDparams, body: ASTDtext) extends ASTNode("DEFINE") {
  override def toString: String = s"$kind ( $name $dparams $body )"
}

/** The parameter list of a definition. */
case class ASTDparams(dparams: List[ASTDtext]) extends ASTNode("DPARAMS") {
  override def toString: String = s"$kind ( ${dparams.mkString(" ")} )"
}

/** Text inside a definition; `k` is the label supplied by the caller and becomes `kind`. */
case class ASTDtext(k: String, dtext: List[ASTNode]) extends ASTNode(k) {
  override def toString: String = s"$kind ( ${dtext.mkString(" ")} )"
}

// Our various baseline forms of plain text: Outertext, Inneritext, Innerdtext, Bodytext.
// These are really just tokens, but nodes are used so the matched contents are kept.
case class ASTText(k: String, s: String) extends ASTNode(k) {
  // Only the label is printed; the matched text `s` is retained but not shown.
  override def toString: String = kind
}
// -------------------------------------------------------------------------
// Now the actual parser
class WMLParser extends RegexParsers {
  // ---- Tokens: fixed character sequences, kept as plain strings ----
  val TSTART = "{{"
  val TEND = "}}"
  val DSTART = "{'"
  val DEND = "'}"
  val VSTART = "{{{"
  val VEND = "}}}"
  val PIPE = "|"
  val PIPES = "||"

  // ---- Tokens: free text in the various contexts ----
  // Outermost text: anything but TSTART or DSTART
  val OUTERTEXT = "^([^{]|\\{(?!([{'])))+".r
  // Invocation text: anything but TSTART, DSTART, VSTART, PIPE(s), TEND
  val INNERITEXT = "^([^{|}]|\\{(?!([{|']))|\\}(?!\\}))+".r
  // Definition name/parameter text: anything but TSTART, DSTART, VSTART, PIPE(s), DEND
  val INNERDTEXT = "^([^{|']|\\{(?!([{|']))|'(?!\\}))+".r
  // Definition body text: anything but TSTART, DSTART, VSTART, DEND
  val BODYTEXT = "^([^{']|\\{(?!([{']))|'(?!\\}))+".r
  // Variable name: anything but PIPE or VEND
  val VNAME = "^([^\\}|]|\\}(?!\\})|\\}\\}(?!\\}))+".r

  // ---- Helpers ----
  // Builds the argument list from (PIPE, optional itext) pairs. A missing optional
  // part is represented as an empty INNERITEXT; that is an over-assumption, but
  // it is ok for now.
  def convertStringItextToTargs(x: List[String ~ Option[ASTItext]]): ASTTargs =
    ASTTargs(x.map {
      case _ ~ maybeText =>
        maybeText.getOrElse(ASTItext(List(ASTText("INNERITEXT", ""))))
    })

  // Drops the PIPE half of each (PIPE, dtext) pair, keeping only the nodes.
  def convertStringItextToDparams(x: List[String ~ ASTDtext]): ASTDparams =
    ASTDparams(x.map(_._2))

  // ---- Grammar rules ----
  // Trivial rules first: each wraps a regular-expression match in an ASTText node.
  // Separate rules keep the type conversion simple.
  def textOuter: Parser[ASTNode] = OUTERTEXT ^^ (matched => ASTText("OUTERTEXT", matched))
  def textInner: Parser[ASTNode] = INNERITEXT ^^ (matched => ASTText("INNERITEXT", matched))
  def textInnerd: Parser[ASTNode] = INNERDTEXT ^^ (matched => ASTText("INNERDTEXT", matched))
  def textBody: Parser[ASTNode] = BODYTEXT ^^ (matched => ASTText("BODYTEXT", matched))

  // Main entry point.
  // <program> ::= (OUTERTEXT |<invoke>|<define>)*
  def program: Parser[ASTNode] =
    rep(textOuter | invoke | define) ^^ (nodes => ASTProgram(nodes))

  // <invoke> ::= TSTART <itext> <targs> TEND
  def invoke: Parser[ASTInvoke] =
    (TSTART ~> itext) ~ (targs <~ TEND) ^^ {
      case name ~ arguments => ASTInvoke(name, arguments)
    }

  // <targs> ::= (PIPE <itext>?)*
  def targs: Parser[ASTTargs] =
    rep(PIPE ~ opt(itext)) ^^ (pairs => convertStringItextToTargs(pairs))

  // tvar is listed before invoke so that it is preferred when parsing.
  // <itext> ::= (INNERITEXT|<tvar>|<invoke>|<define>)*
  def itext: Parser[ASTItext] =
    rep(textInner | tvar | invoke | define) ^^ (nodes => ASTItext(nodes))

  // <tvar> ::= VSTART VNAME (PIPE itext)? VEND
  // An absent default is stored as null, as ASTTvar expects.
  def tvar: Parser[ASTTvar] =
    VSTART ~> VNAME ~ opt(PIPE ~> itext) <~ VEND ^^ {
      case varName ~ default => ASTTvar(varName, default.orNull)
    }

  // <define> ::= DSTART <dtextn> <dparams> PIPES <dtextb> DEND
  def define: Parser[ASTTdef] =
    DSTART ~> dtextn ~ dparams ~ (PIPES ~> dtextb) <~ DEND ^^ {
      case defName ~ params ~ body => ASTTdef(defName, params, body)
    }

  // <dparams> ::= (PIPE <dtextp>)*
  def dparams: Parser[ASTDparams] =
    rep(PIPE ~ dtextp) ^^ (pairs => convertStringItextToDparams(pairs))

  // Three forms of dtext follow; each allows inner invokes, definitions and variables.
  // The template name can be empty, hence rep (*).
  // <dtextn> ::= (INNERDTEXT|<tvar>|<invoke>|<define>)*
  def dtextn: Parser[ASTDtext] =
    rep(textInnerd | tvar | invoke | define) ^^ (nodes => ASTDtext("DTEXTN", nodes))

  // Parameters cannot be empty, hence rep1 (+).
  // <dtextp> ::= (INNERDTEXT|<tvar>|<invoke>|<define>)+
  def dtextp: Parser[ASTDtext] =
    rep1(textInnerd | tvar | invoke | define) ^^ (nodes => ASTDtext("DTEXTP", nodes))

  // The body can be empty, hence rep (*); unlike a name, its text may contain pipes.
  // <dtextb> ::= (BODYTEXT|<tvar>|<invoke>|<define>)*
  def dtextb: Parser[ASTDtext] =
    rep(textBody | tvar | invoke | define) ^^ (nodes => ASTDtext("DTEXTB", nodes))

  // We do not want whitespace discarded, we will do that ourselves
  override val whiteSpace = "".r
}
// And finally a program to invoke the parser on a file or input string
object WML extends App {
  /**
   * Parses WML given either inline (`-s string`) or as a file name and prints the
   * resulting AST, or the parse failure.
   *
   * `main` must be marked `override` because `App` already defines it. Dropping
   * `extends App` altogether would be cleaner, but the file's top-level
   * `import WML.args` relies on the member `App` provides, so it is kept here.
   *
   * @param args command-line arguments: `-s <string>` or `<filename>`, each optionally
   *             preceded by one extra leading argument
   */
  override def main(args: Array[String]): Unit = {
    // Prints usage and terminates; typed Nothing so it can end any match branch.
    def help(): Nothing = {
      println("Specify (-s string|filename)")
      sys.exit(1)
    }

    // Reads the whole file as UTF-8, always releasing the file handle.
    def readFile(fn: String): String = {
      val in = Source.fromFile(fn, "UTF-8")
      try in.mkString finally in.close()
    }

    // Taking a string input was not required, but it is convenient for testing,
    // so allow it with a -s specifier.
    val source = args match {
      case Array(_, "-s", s) => s
      case Array("-s", s) => s
      case Array(_, fn) => readFile(fn)
      case Array(fn) => readFile(fn)
      // No arguments, or any shape not listed above (for example an unquoted -s
      // string that the launcher split into several arguments): show usage
      // instead of failing with a MatchError.
      case _ => help()
    }

    val p = new WMLParser
    val result = p.parseAll(p.program, source)
    if (result.successful) {
      println(result.get)
    } else {
      println("Parse failure: " + result)
    }
  }
}
It asks me to override the main method and when I do, I get this error:
objc[12857]: Class JavaLaunchHelper is implemented in both /Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home/bin/java (0x104e314c0) and /Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home/jre/lib/libinstrument.dylib (0x105e9c4e0). One of the two will be used. Which one is undefined.
Exception in thread "main" scala.MatchError: [Ljava.lang.String;#3551a94 (of class [Ljava.lang.String;)
at WML$.main(solution2.scala:176)
at WML.main(solution2.scala)
Here is what I have on line 176:
case Array(fn) => Source.fromFile(fn, "UTF-8").mkString
Thanks.

Can you please try upgrading your JDK version to update 161?
http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html
Looks like this is a problem with IntelliJ!

Related

Regex to match block of text inside square brackets that can be nested

I'm writing a parser in scala that reads a string composed by repetitions of '+', '-', '<', '>' and '.' characters. The string may also have '[' and ']' characters and inside them there is a repetition of the first group of characters.
I need a Regex that matches everything inside square brackets, the problem is that the brackets can be nested.
I've already tried with this regex: \[.*\] and many others that I've found on SO but none seems to be working.
The regex I'm looking for should work like this:
"[+++.]" matches "+++."
"[++[-]]" should match "++[-]"
edit (added a use case):
"[+++.] [++[-]]" should NOT match "+++.] [++[-]" but 2 matches of "+++." and "++[-]"
That would be pretty tough with a single regex, but with some post-processing you might get a bit closer.
// Finds the span from the first '[' to the last ']' on a line and cuts its contents
// at every "]...[" separator that contains no brackets of its own.
def parse(s: String): Array[String] = {
  val outermost = "\\[(.*)\\]".r
  val separator = raw"][^\[\]]+\["
  outermost
    .findAllMatchIn(s)
    .flatMap(found => found.group(1).split(separator).iterator)
    .toArray
}
usage:
parse("+++.]") //res0: Array[String] = Array()
parse("[+++.]") //res1: Array[String] = Array("+++.")
parse("[++[-]]") //res2: Array[String] = Array("++[-]")
parse("[+++.] [++[-]]") //res3: Array[String] = Array("+++.", "++[-]")
parse("[++[-]--] [+]") //res4: Array[String] = Array(++[-]--, +)
After some research I think I may have found the solution, however it is not usable in Scala. What is needed is a recursive regex that matches balanced constructs, in my case:
\[(?:[+-\[\]]|(?R))*\]
and as far as I know these kind are not supported in scala, so I'll just leave this here if someone needs it for other languages.
However I solved my problem by implementing the parser in another way, I just thought that having a regex like that would have been a simpler and smoother solution.
What I was implementing was a brainfuck language interpreter and here is my parser class:
// Interpreter that executes each instruction as a side effect of parsing it.
// `pointer` is the current index into `array`; both are mutated by the parse actions.
class brainfuck(var pointer: Int, var array: Array[Int]) extends JavaTokenParsers {
// A program is any number of statements; the parse result itself is discarded.
def Program = rep(Statement) ^^ { _ => () }
// One instruction. Each alternative's action updates `array`/`pointer`, prints the
// current cell as a character, or stores a character read via readChar().
// The bracket alternative collects the bracketed text and re-parses it with
// parseAll(Program, ...) for as long as the current cell is non-zero.
def Statement: Parser[Unit] =
"+" ^^ { _ => array(pointer) = array(pointer) + 1 } |
"-" ^^ { _ => array(pointer) = array(pointer) - 1 } |
"." ^^ { _ => println("elem: " + array(pointer).toChar) } |
"," ^^ { _ => array(pointer) = readChar().toInt } |
">" ^^ { _ => pointer = pointer + 1 } |
"<" ^^ { _ => pointer = pointer - 1 } |
"[" ~> rep(block|squares) <~ "]" ^^ { items => while(array(pointer)!=0) { parseAll(Program,items.mkString) } }
// A single non-bracket instruction character, returned as text without executing it.
def block =
"""[-+.,<>]""".r ^^ { b => b.toString() }
// A nested bracketed group, returned as text with its surrounding brackets restored.
def squares: Parser[String] = "[" ~> rep(block|squares) <~ "]" ^^ { b => var res = "[" + b.mkString + "]"; res }
}

Scala string pattern match regex a star multiple findallin

I want to parse this string: "er1r2r3" with: """(e|w|n|s)(r[1-3])*""".r
// Matches a single side letter: e, w, n or s.
val SideR = """(e|w|n|s)""".r
// Matches one piece: the letter r followed by a digit from 1 to 3 (two capture groups).
val PieceR = """(r)([1-3])""".r
// Splits `str` into its leading side character and the pieces parsed from the rest.
// str(0) is evaluated before any regex is applied, and the match has no case for a
// first character that SideR rejects.
def parseSidedPieces(str: String): (Char, List[Char]) = {
val side = str(0) match {
case SideR(s) => s
}
val pieces = parsePieces(str.tail)
(side, pieces)
}
// Collects the digit of every piece found anywhere in `str`.
// NOTE(review): `n` is bound to a regex capture group (a String) while the declared
// element type is Char — confirm this type-checks.
def parsePieces(str: String): List[Char] = {
PieceR.findAllIn(str).toList map {
case PieceR(c, n) => n
}
}
But this throws on the empty string "" because of the call to str(0).
Fix this, regex only.
I don't think this can be fixed 'regexes only' (whatever that is supposed to mean), because the code fails before the first regex is used.
It fails because you call apply(index: Int) on an empty String. So, either you do an isEmpty check before calling str(0) or even parseSidedPieces, or you change the code and match the whole String:
// Matches one piece: the letter r followed by a digit from 1 to 3 (two capture groups).
val PieceR = """(r)([1-3])""".r
// Matches a whole input: one side letter (group 1) followed by zero or more pieces,
// with the entire run of pieces captured as group 2.
val CombinedR = "(e|w|n|s)((?:r[1-3])*)".r
// Parses a whole "side + pieces" string. Returns the side letter and the piece digits.
// Any input CombinedR does not match in full, including the empty string, is
// rejected with an IllegalArgumentException.
def parseSidedPieces(str: String): (Char, List[Char]) =
  CombinedR.unapplySeq(str) match {
    case Some(List(side, pieces)) =>
      (side.charAt(0), parsePieces(pieces))
    case _ =>
      throw new IllegalArgumentException(s"Unexpected input: $str")
  }
// Returns the digit character of every piece found in `str`, in order of appearance.
def parsePieces(str: String): List[Char] =
  PieceR
    .findAllMatchIn(str)
    .map(piece => piece.group(2).charAt(0)) // group 2 is the single digit
    .toList
parseSidedPieces("er1r2r3") |-> res0: (Char, List[Char]) = (e,List(1, 2, 3))

Scala CSV parser with comments

First of all: credits. This code is based on the solution from here: Use Scala parser combinator to parse CSV files
The CSV files I want to parse can have comments, lines starting with #. And to avoid confusion: The CSV files are tabulator-separated. There are more constraints which would make the parser a lot easier, but since I am completely new to Scala I thought it would be best to stay as close to the (working) original as possible.
The problem I have is that I get a type mismatch. Obviously the regex for a comment does not yield a list. I was hoping that Scala would interpret a comment as a 1-element-list, but this is not the case.
So how would I need to modify my code so that I can handle these comment lines? And closely related: Is there an elegant way to query the parser result so I can write in myfunc something like
if (isComment(a)) continue
So here is the actual code:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import scala.util.parsing.combinator._
// Parser for tab-separated files that may contain comment lines starting with '#',
// plus a Spark driver that applies it to every file under a path.
object MyParser extends RegexParsers {
override val skipWhitespace = false // meaningful spaces in CSV
// Literal tokens.
def COMMA = ","
def TAB = "\t"
def DQUOTE = "\""
def HASHTAG = "#"
def DQUOTE2 = "\"\"" ^^ { case _ => "\"" } // combine 2 dquotes into 1
def CRLF = "\r\n" | "\n"
// Exactly one character that is not a double quote, comma, CR or LF (no quantifier).
def TXT = "[^\",\r\n]".r
def SPACES = "[ ]+".r
// A file is a sequence of comment or record lines separated by line breaks,
// optionally followed by one trailing line break.
def file: Parser[List[List[String]]] = repsep((comment|record), CRLF) <~ (CRLF?)
// NOTE(review): `HASHTAG<~TXT` keeps only the HASHTAG result, a String, so it does
// not have the declared type Parser[List[String]].
def comment: Parser[List[String]] = HASHTAG<~TXT
// NOTE(review): `<~` keeps only the left-hand regex result, a String, and discards
// the fields, so this does not have the declared type either.
def record: Parser[List[String]] = "[^#]".r<~repsep(field, TAB)
// A field is either quoted (escaped) or plain (nonescaped).
def field: Parser[String] = escaped|nonescaped
// A double-quoted field, optionally surrounded by spaces; its pieces are concatenated.
def escaped: Parser[String] = {
((SPACES?)~>DQUOTE~>((TXT|COMMA|CRLF|DQUOTE2)*)<~DQUOTE<~(SPACES?)) ^^ {
case ls => ls.mkString("")
}
}
// An unquoted field: zero or more TXT characters concatenated.
def nonescaped: Parser[String] = (TXT*) ^^ { case ls => ls.mkString("") }
// Parses a whole input; returns the rows on success, otherwise throws an Exception
// carrying the parse result's text.
def applyParser(s: String) = parseAll(file, s) match {
case Success(res, _) => res
case e => throw new Exception(e.toString)
}
// Parses the second element of the pair and prints every field of every row;
// the first element is only used in the log line.
def myfunc( a: (String, String)) = {
val parserResult = applyParser(a._2)
println("APPLY PARSER FOR " + a._1)
for( a <- parserResult ){
a.foreach { println }
}
}
// Loads every file matching filesPath with wholeTextFiles and runs myfunc on each pair.
def main(args: Array[String]) {
val filesPath = "/home/user/test/*.txt"
val conf = new SparkConf().setAppName("Simple Application")
val sc = new SparkContext(conf)
val logData = sc.wholeTextFiles(filesPath).cache()
logData.foreach( x => myfunc(x))
}
}
Since the parser for comment and the parser for record are "or-ed" together they must be of the same type.
You need to make the following changes:
// `<~` binds tighter than `^^^`, so this reads (HASHTAG <~ TXT) ^^^ List(): the match
// is replaced by an empty list. TXT matches a single character.
def comment: Parser[List[String]] = HASHTAG<~TXT ^^^ {List()}
By using ^^^ we are converting the result of the parser (which is the result returned by HASHTAG parser) to an empty List.
Also change:
// A record is a list of fields separated by TAB.
def record: Parser[List[String]] = repsep(field, TAB)
Note that because comment and record parser are or-ed and because comment comes first, if the row begins with a "#" it will be parsed by the comment parser.
Edit:
In order to keep the comments text as an output of the parser (say if you want to print them later), and because you are using | you can do the following:
Define the following classes:
/** One parsed line of the input: either a [[Comment]] or a [[Record]]. */
// Sealed so that pattern matches over Line are checked for exhaustiveness.
sealed trait Line
/** A comment line; `text` is what the parser captured after the '#'. */
final case class Comment(text: String) extends Line
/** A data line; `elements` holds its fields in order. */
final case class Record(elements: List[String]) extends Line
Now define comment, record & file parsers as follows:
// A comment line: '#' followed by TXT, wrapped in a Comment.
val comment: Parser[Comment] = ("#" ~> TXT) ^^ (text => Comment(text))
// A record line: TAB-separated fields wrapped in a Record.
val record: Parser[Line] = repsep(field, TAB) ^^ (fields => Record(fields))
// The whole input: lines separated by CRLF, with an optional trailing CRLF.
// comment is tried before record on every line.
val file: Parser[List[Line]] = repsep(comment | record, CRLF) <~ opt(CRLF)
Now you can define the printing function myFunc:
// Parses the second element of the pair and, on success, prints one line per parsed
// Line. The mapped parse result is returned.
def myfunc( a: (String, String)) = {
  val (_, contents) = a
  parseAll(file, contents).map { lines =>
    for (line <- lines) line match {
      case Comment(t) => println(s"This is a comment: $t")
      case Record(elems) => println(s"This is a record: ${elems.mkString(",")}")
    }
  }
}

Javacc regular expression that matches all string except certain ones

What is the regular expression that'd match anything , except these strings:
=>, | ?
If you're familiar with javacc, I'm trying to define my ANYTHING token as shown below:
// Token definitions from the question.
TOKEN :
{
// The two-character arrow.
<ARROW: "=>" >
|
// One or more characters in the range U+0001 to U+FFDC; this range also contains
// '=', '>' and '|'.
<ANYTHING: (["\u0001" - "\uffdc"])+>
|
// A single pipe character.
<PIPE: "|">
|
// A single upper-case ASCII letter.
<UPPER_CHAR: (["A"-"Z"])>
}
Thanks
Try something like this:
Test.jj
// Demo grammar: splits its input into ARROW, PIPE and ANYTHING tokens and prints each one.
options {
STATIC = false ;
}
PARSER_BEGIN(Test)
public class Test {
// Runs the parser over a fixed sample string.
public static void main(String[] args) throws Exception {
Test parser = new Test(new java.io.StringReader("foo=>bar=baz|done"));
parser.Parse();
}
}
PARSER_END(Test)
TOKEN :
{
< ARROW : "=>" >
| < PIPE : "|" >
// One or more of: any character other than '=' and '|', or an '=' followed by a
// character other than '>'.
| < ANYTHING : (~["=", "|"] | "=" ~[">"])+ >
}
// Entry production: any number of tokens followed by end of input.
void Parse() :
{}
{
(Any())* <EOF>
}
// Consumes one token of any kind and prints its kind and image.
void Any() :
{Token t;}
{
( t=<ARROW> {System.out.println("ARROW = '" + t.image + "'");}
| t=<PIPE> {System.out.println("PIPE = '" + t.image + "'");}
| t=<ANYTHING> {System.out.println("ANYTHING = '" + t.image + "'");}
)
}
Generate the parser classes:
javacc Test.jj
and run the main method:
java Test
will print the following for the input "foo=>bar=baz|done":
ANYTHING = 'foo'
ARROW = '=>'
ANYTHING = 'bar=baz'
PIPE = '|'
ANYTHING = 'done'
You are probably best off to use lexical states. Here is an RE solution that might work
("=")+ | ( ~["=","|"] | ("=")* ~["=","|",">"] )+ ("=")*
By the way I changed the question a little so that the empty string is not an "anything", so this re should match any string that is not empty and does not contain any "|"s nor any "=>"s.

non-greedy matching in Scala RegexParsers

Suppose I'm writing a rudimentary SQL parser in Scala. I have the following:
// Rudimentary SELECT ... FROM ... parser from the question.
class Arith extends RegexParsers {
// A statement is a SELECT clause followed by a FROM clause.
def selectstatement: Parser[Any] = selectclause ~ fromclause
// Each clause is a case-insensitive keyword followed by a run of tokens.
def selectclause: Parser[Any] = "(?i)SELECT".r ~ tokens
def fromclause: Parser[Any] = "(?i)FROM".r ~ tokens
def tokens: Parser[Any] = rep(token) //how to make this non-greedy?
// A word with optional whitespace on either side; nothing here excludes keywords.
def token: Parser[Any] = "(\\s*)\\w+(\\s*)".r
}
When trying to match selectstatement against SELECT foo FROM bar, how do I prevent the selectclause from gobbling up the entire phrase due to the rep(token) in ~ tokens?
In other words, how do I specify non-greedy matching in Scala?
To clarify, I'm fully aware that I can use standard non-greedy syntax (*?) or (+?) within the String pattern itself, but I wondered if there's a way to specify it at a higher level inside def tokens. For example, if I had defined token like this:
def token: Parser[Any] = stringliteral | numericliteral | columnname
Then how can I specify non-greedy matching for the rep(token) inside def tokens?
Not easily, because a successful match is not retried. Consider, for example:
// Minimal parser used to show that a successful alternative is not retried.
object X extends RegexParsers {
// One of the listed runs of 'a' followed by the literal "ab".
def p = ("a" | "aa" | "aaa" | "aaaa") ~ "ab"
}
scala> X.parseAll(X.p, "aaaab")
res1: X.ParseResult[X.~[String,String]] =
[1.2] failure: `ab' expected but `a' found
aaaab
^
The first match was successful, in parser inside parenthesis, so it proceeded to the next one. That one failed, so p failed. If p was part of alternative matches, the alternative would be tried, so the trick is to produce something that can handle that sort of thing.
Let's say we have this:
// Repeats `rep` only until `terminal` matches. At every position `terminal` is tried
// first; on success the collected elements are returned together with the terminal's
// result and the terminal's input is consumed. If neither parser matches, the
// failure of `rep` is returned.
def nonGreedy[T](rep: => Parser[T], terminal: => Parser[T]) = Parser { in =>
  def recurse(in: Input, elems: List[T]): ParseResult[List[T] ~ T] = {
    val stop = terminal(in)
    if (stop.successful) {
      // elems was built by prepending, so restore parse order.
      Success(new ~(elems.reverse, stop.get), stop.next)
    } else {
      rep(in) match {
        case Success(item, remaining) => recurse(remaining, item :: elems)
        case failure: NoSuccess => failure
      }
    }
  }
  recurse(in, Nil)
}
You can then use it like this:
// Example use: repeat "a" until "ab" matches.
def p = nonGreedy("a", "ab")
By the way, I always found that looking at how other things are defined is helpful in trying to come up with stuff like nonGreedy above. In particular, look at how rep1 is defined, and how it was changed to avoid re-evaluating its repetition parameter -- the same thing would probably be useful on nonGreedy.
Here's a full solution, with a little change to avoid consuming the "terminal".
/** Adds a repetition combinator that stops as soon as a terminal parser would match. */
trait NonGreedy extends Parsers {
  /**
   * Repeats `rep` until `terminal` matches at the current position.
   *
   * `terminal` is only used as a lookahead: when it succeeds, the elements collected
   * so far are returned and the input is left positioned before the terminal.
   *
   * @param rep      parser for one repeated element
   * @param terminal parser whose success ends the repetition; its input is not consumed
   * @return a parser yielding the elements matched by `rep`, in order; it fails with
   *         the failure of `rep` if neither `terminal` nor `rep` matches at some point
   */
  def nonGreedy[T, U](rep: => Parser[T], terminal: => Parser[U]): Parser[List[T]] = Parser { in =>
    // Force each by-name argument at most once per parse instead of once per
    // recursion step. Both stay lazy so `rep` is only built if it is needed.
    lazy val stop = terminal
    lazy val element = rep

    @scala.annotation.tailrec
    def recurse(in: Input, elems: List[T]): ParseResult[List[T]] =
      stop(in) match {
        // Return the original `in`, not the terminal's remainder.
        case _: Success[_] => Success(elems.reverse, in)
        case _ =>
          element(in) match {
            case Success(x, rest) => recurse(rest, x :: elems)
            case ns: NoSuccess => ns
          }
      }
    recurse(in, Nil)
  }
}
// SELECT ... FROM ... parser whose token runs stop at a caller-supplied terminal.
class Arith extends RegexParsers with NonGreedy {
// Just to avoid recompiling the pattern each time
val select: Parser[String] = "(?i)SELECT".r
val from: Parser[String] = "(?i)FROM".r
val token: Parser[String] = "(\\s*)\\w+(\\s*)".r
// Matches only at the very end of the input.
val eof: Parser[String] = """\z""".r
// The SELECT clause's tokens stop where FROM would match; the FROM clause's tokens
// stop at end of input.
def selectstatement: Parser[Any] = selectclause(from) ~ fromclause(eof)
def selectclause(terminal: Parser[Any]): Parser[Any] =
select ~ tokens(terminal)
def fromclause(terminal: Parser[Any]): Parser[Any] =
from ~ tokens(terminal)
// Tokens repeated until `terminal` would match, via nonGreedy.
def tokens(terminal: Parser[Any]): Parser[Any] =
nonGreedy(token, terminal)
}