Haskell Alex - regex matches wrong string? - regex

I'm trying to write lexer for an indentation-based grammar and I'm having trouble matching the indentation.
Here's my code:
{
module Lexer ( main ) where
import System.IO.Unsafe
}
%wrapper "monadUserState"
$whitespace = [\ \t\b]
$digit = 0-9 -- digits
$alpha = [A-Za-z]
$letter = [a-zA-Z] -- alphabetic characters
$ident = [$letter $digit _] -- identifier character
$indent = [\ \t]
#number = [$digit]+
#identifier = $alpha($alpha|_|$digit)*
error:-
#identifier { mkL LVarId }
\n $whitespace* \n { skip }
\n $whitespace* { setIndent }
$whitespace+ { skip }
{
data Lexeme = Lexeme AlexPosn LexemeClass (Maybe String)
instance Show Lexeme where
show (Lexeme _ LEOF _) = " Lexeme EOF"
show (Lexeme p cl mbs) = " Lexeme class=" ++ show cl ++ showap p ++ showst mbs
where
showap pp = " posn=" ++ showPosn pp
showst Nothing = ""
showst (Just s) = " string=" ++ show s
instance Eq Lexeme where
(Lexeme _ cls1 _) == (Lexeme _ cls2 _) = cls1 == cls2
showPosn :: AlexPosn -> String
showPosn (AlexPn _ line col) = show line ++ ':': show col
tokPosn :: Lexeme -> AlexPosn
tokPosn (Lexeme p _ _) = p
data LexemeClass
= LVarId
| LTIndent Int
| LTDedent Int
| LIndent
| LDedent
| LEOF
deriving (Show, Eq)
mkL :: LexemeClass -> AlexInput -> Int -> Alex Lexeme
mkL c (p, _, _, str) len = return (Lexeme p c (Just (take len str)))
data AlexUserState = AlexUserState { indent :: Int }
alexInitUserState :: AlexUserState
alexInitUserState = AlexUserState 0
type Action = AlexInput -> Int -> Alex Lexeme
getLexerIndentLevel :: Alex Int
getLexerIndentLevel = Alex $ \s#AlexState{alex_ust=ust} -> Right (s, indent ust)
setLexerIndentLevel :: Int -> Alex ()
setLexerIndentLevel i = Alex $ \s#AlexState{alex_ust=ust} -> Right (s{alex_ust=(AlexUserState i)}, ())
setIndent :: Action
setIndent input#(p, _, _, str) i = do
--let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"
lastIndent <- getLexerIndentLevel
currIndent <- countIndent (drop 1 str) 0 -- first char is always \n
if (lastIndent < currIndent) then
do setLexerIndentLevel currIndent
mkL (LTIndent (currIndent - lastIndent)) input i
else if (lastIndent > currIndent) then
do setLexerIndentLevel currIndent
mkL (LTDedent (lastIndent - currIndent)) input i
else alexMonadScan
where
countIndent str total
| take 1 str == "\t" = do skip input 1
countIndent (drop 1 str) (total+1)
| take 4 str == " " = do skip input 4
countIndent (drop 4 str) (total+1)
| otherwise = return total
alexEOF :: Alex Lexeme
alexEOF = return (Lexeme undefined LEOF Nothing)
scanner :: String -> Either String [Lexeme]
scanner str =
let loop = do
tok#(Lexeme _ cl _) <- alexMonadScan
if (cl == LEOF)
then return [tok]
else do toks <- loop
return (tok:toks)
in runAlex str loop
addIndentations :: [Lexeme] -> [Lexeme]
addIndentations (lex#(Lexeme pos (LTIndent c) _):ls) =
concat [iter lex c, addIndentations ls]
where iter lex c = if c == 0 then []
else (Lexeme pos LIndent Nothing):(iter lex (c-1))
addIndentations (lex#(Lexeme pos (LTDedent c) _):ls) =
concat [iter lex c, addIndentations ls]
where iter lex c = if c == 0 then []
else (Lexeme pos LDedent Nothing):(iter lex (c-1))
addIndentations (l:ls) = l:(addIndentations ls)
addIndentations [] = []
main = do
s <- getContents
return ()
print $ fmap addIndentations (scanner s)
}
Problem is that in line \n $whitespace* { setIndent }, regex matches wrong string and calls setIndent with this wrong string. For debugging purposes, I added unsafePerformIO in setIndent function, here's an example run of the program:
begin
first indent
|matched string:
first indent
second indent
second indent
dedent
dedent
|
|matched string:
second indent
dedent
|
|matched string:
dedent
|
|matched string:
|
Right [ Lexeme class=LVarId posn=1:1 string="begin", Lexeme class=LIndent posn=1:6, Lexeme class=LVarId posn=2:15 string="indent", Lexeme class=LIndent posn=2:21, Lexeme class=LDedent posn=3:30, Lexeme class=LDedent posn=3:30, Lexeme class=LVarId posn=4:1 string="dedent", Lexeme EOF]
So setIndent is called with more than just whitespaces. And after it returns the lexeme for indentation, other part of the string is omitted.
Is this a bug in Alex? Or what am I doing wrong?

So I haven't analysed your code in detail, but I did notice this:
setIndent :: Action
setIndent input#(p, _, _, str) i = do
--let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"
Note that str is the rest of the input, not just the current token. To get the current token, you want take i str. Perhaps this is giving you the impression that the token is matching more of the input than it really is.
We handle indentation in GHC's own lexer of course, so you might want to look there for ideas (although as you might expect it's rather large and complicated).

Related

Parser for recursive expressions hangs in ghci

I am trying to make a parser for the following recursive datatype:
data Expr = Val Int
| Var Char
| App Op Expr Expr
deriving Show
data Op = Add | Sub | Mul | Div
deriving Show
It should, for example, parse "(1 + (a / -2))" as App Add (Val 1) (App Div (Var 'a') (Val (-2))). I've managed to write parsers for the Val and Var constructors as well as for Op's constructors like so:
import Text.Regex.Applicative
import Data.Char
rNonnegativeIntegral :: (Read a, Integral a) => RE Char a
rNonnegativeIntegral = read <$> some (psym isDigit)
rNegativeIntegral :: (Read a, Integral a) => RE Char a
rNegativeIntegral = negate <$> (sym '-' *> rNonnegativeIntegral)
rIntegral :: (Read a, Integral a) => RE Char a
rIntegral = rNonnegativeIntegral <|> rNegativeIntegral
rVal :: RE Char Expr
rVal = Val <$> rIntegral
rVar :: RE Char Expr
rVar = Var <$> psym isAlpha
rOp = aux <$> (foldr1 (<|>) $ map sym "+-*/")
where
aux '+' = Add
aux '-' = Sub
aux '*' = Mul
aux '/' = Div
When this is loaded into ghci it can produce the following output:
ghci> findLongestPrefix rVal "-271"
Just (Val (-271), "")
ghci> findLongestPrefix rVar "a"
Just (Var 'a', "")
ghci> findLongestPrefix rOp "-"
Just (Sub, "")
The trouble comes when I introduce this recursive definition for the App constructor:
whiteSpace :: RE Char String
whiteSpace = many $ psym isSpace
strictWhiteSpace :: RE Char String
strictWhiteSpace = some $ psym isSpace
rApp :: RE Char Expr
-- flip App :: Expr -> Op -> Expr
-- strictWhiteSpace after rOp to avoid conflict with rNegativeInteger
rApp = flip App <$> (sym '(' *> whiteSpace *> rExpr)
<*> (whiteSpace *> rOp <* strictWhiteSpace)
<*> (rExpr <* whiteSpace <* sym ')')
rExpr :: RE Char Expr
rExpr = rVal <|> rVar <|> rApp
This loads into ghci just fine, and all previous constructors still work. But findLongestPrefix rApp "(1 + a)" and many similar expressions cause ghci to hang and produce no output.
Through experimentation I've found that the issue happens in general when rExpr is passed in as the first argument to <*. For example, findLongestPrefix (rExpr <* whiteSpace) "a)" also causes ghci to hang.
Also, when the definition for rExpr is replaced by
rExpr = rVal <|> rVar
all of these hanging issues go away. Simple expressions like "(1 + a)" are able to be parsed, but support for recursive expressions is not available.
How can I implement a recursive parser here without hanging issues?
The language of expressions that you describe isn't regular. So you'll have to use a different library.
Luckily, essentially the same parser structure should work fine with most other parser combinator libraries. It should be as simple as substituting your new library's name for a few basic parsers in place of their regex-applicative analogs.

Process a string using foldr where '#' means deleting the previous character

I need to process a string using foldr where '#' means deleting the previous character. For example:
>backspace "abc#d##c"
"ac"
>backspace "#####"
""
It needs to be done using foldr through one pass of the list, without using reverse and/or (++).
Here what I have got so far:
backspace :: String -> String
backspace xs = foldr func [] xs where
func c cs | c /= '#' = c:cs
| otherwise = cs
But it just filter the '#' from the string. I thought about deleting the last element of current answer every time c == '#' and got something like that
backspace :: String -> String
backspace xs = foldr func [] xs where
func c cs | c /= '#' = c:cs
| cs /= [] = init cs
| otherwise = cs
but it is not working properly,
ghci> backspace "abc#d##c"
"abc"
You can use (Int, String) as state for your foldr where the first Int is the number of backspaces, and the String is the current string constructed.
This thus means that you can work with:
backspace :: String -> String
backspace = snd . foldr func (0, [])
where func '#' (n, cs) = (n+1, cs)
func c (n, cs)
| n > 0 = … -- (1)
| otherwise = … -- (2)
In case we have a character that is not a #, but n > 0 it means we need to remove that character, and thus ignore c and decrement n. In case n == 0 we can add c to the String.
I leave filling in the … parts as an exercise.

OCaml Reading from file and perform some validation

can you help me out, i made this program to get an output from some .txt file like this :
john:3:uk
paul:18:us
#load "str.cma"
let f_test = "/home/test.txt" ;;
(*
Recursive Reading function
*)
let read_lines f_test : string list =
if Sys.file_exists (f_test) then
begin
let ic = open_in f_test in
try
let try_read () =
try Some (input_line ic) with End_of_file -> None in
let rec loop acc = match try_read () with
| Some s -> loop (s :: acc)
| None -> close_in_noerr ic; List.rev acc in
loop []
with e ->
close_in_noerr ic;
[]
end
else
[]
;;
(*Using Records*)
type user =
{
name : string;
age : int;
country : string;
};;
(*
Function to separated info in list
*)
let rec splitinfo ?(sep=":") l = match l with
| [] -> []
| x::xs -> (Str.split (Str.regexp ":") x)::splitinfo xs;;
(*
Function to get users position
*)
let get_user l:user =
let age = int_of_string (List.nth l 1) in
let user_name = List.nth l 0 in
{
name = user_name;
age = age ;
country = List.nth l 2;
};;
(*
Function to check some parameter is valid
*)
let par1 u: int =
if (u.age = 3) then
1
else
0;;
(*
Reporting function
*)
let report_statistics list_users =
let child = ref 0 in
let teenager = ref 0 in
let adult = ref 0 in print_string (" ----- -- Stats -- ----- \n" ) ;
List.iter (
fun user_l -> (
match user_l with
| [] -> print_string("> no user <\n")
| _ ->
let user = get_user user_l in
if (par1 user = 1) then (
print_string (" "^ user.name ^" --> Child \n" ) ;
child := !child + 1;
)
else
print_string (" "^ user.name ^" --> Other \n" );
)
) list_users;
print_string ("------- List ---- ");
print_newline();
print_string ("Child " );
print_int(!child);
print_newline();
print_string ("Teenager ") ;
print_int(!teenager);
print_newline();
print_string ("Adult ");
print_int(!adult);
print_newline();
;;
The program compile but doesn't output any result ...
What am i missing ?
I kept the function to check parameters simple so i can understand it better but can't figure it out why it isn't outputing any result
Can you help me out here ?
Thanks in advance :)
The code as given defines some functions such as read_lines and report_statistics. But there are no calls to these functions.
If there is no other OCaml source involved, this is probably your problem. You need to call the functions.
It is fairly customary to have a "main" function that does the work of an OCaml program, and then (this is key) you have to actually call the main function:
let main () =
(* Call the functions that do the work of the program *)
let () = main ()
I have many times forgotten this last line and then nothing happens when I run the program.

How do you split a string into lists unless it is inside quotation marks ("") in Ocaml?

I'm reading an input file of several lines. Each line has the following format:
Greeting "hello"
Greeting " Good morning"
Sit
Smile
Question "How are you?"
My current can read each line into a string list. Then I process it using this function which is supposed to break it into a string list list:
let rec process (l : string list) (acc : string list list) : string list list =
match l with
| [] -> acc
| hd :: tl -> String.split_on_char ' ' hd :: (process tl acc)
Which, unfortunately, does not work, since it also splits spaces inside quotation marks. Anyone think of a the right way to do this, possibly using map or fold_left, etc? This would be my expected output:
[["Greeting"; "/"hello/""];[Greeting; "/" Good morning"];["Sit"]]
and so on. Thank you!
You want a real (but very simple) lexical analysis. IMHO this is beyond what you can do with simple string splitting.
A scanner takes a stream of characters and returns the next token it sees. You can make a string into a stream by having an index that traverses the string.
Here is a scanner that is roughly what you would want:
let rec scan s offset =
let slen = String.length s in
if offset >= slen then
None
else if s.[offset] = ' ' then
scan s (offset + 1)
else if s.[offset] = '"' then
let rec qlook loff =
if loff >= slen then
(* Unterminated quotation *)
let tok = String.sub s offset (slen - offset) in
Some (tok, slen)
else if s.[loff] = '"' then
let tok = String.sub s offset (loff - offset + 1) in
Some (tok, loff + 1)
else qlook (loff + 1)
in
qlook (offset + 1)
else
let rec wlook loff =
if loff >= slen then
let tok = String.sub s offset (slen - offset) in
Some (tok, slen)
else if s.[loff] = ' ' || s.[loff] = '"' then
let tok = String.sub s offset (loff - offset) in
Some (tok, loff)
else
wlook (loff + 1)
in
wlook (offset + 1)
It handles a few cases that you didn't specify: what to do if there is an unclosed quotation. What to do with something like abc"def ghi".
The scanner returns None at the end of the string, or Some (token, offset), i.e., the next token and the offset to continue scanning.
A recursive function to break up a string would look something like this:
let split s =
let rec isplit accum offset =
match scan s offset with
| None -> List.rev accum
| Some (tok, offset') -> isplit (tok :: accum) offset'
in
isplit [] 0
This can be visualized with a state machine. You have 2 main states: looking for ' ' and looking for '"'. Processing strings is ugly and you can't pattern match it. So first thing I did is turn the string into a char list. Implementing the two states then becomes simple:
let split s =
let rec split_space acc word = function
| [] -> List.rev (List.rev word::acc)
| ' '::xs -> split_space (List.rev word::acc) [] xs
| '"'::xs -> find_quote acc ('"'::word) xs
| x::xs -> split_space acc (x::word) xs
and find_quote acc word = function
| [] -> List.rev (List.rev word::acc)
| '"'::xs -> split_space acc ('"'::word) xs
| x::xs -> find_quote acc (x::word) xs
in
split_space [] [] s
;;
# split ['a';'b';' ';'"';'c';' ';'d';'"';' ';'e'];;
- : char list list = [['a'; 'b']; ['"'; 'c'; ' '; 'd'; '"']; ['e']]
Now if you want to do it with strings that's left to you. The Idea would be the same. Or you can just turn the char list list into a string list at the end.

FParsec styles; demonstrate differences between combinator and monadic style?

I am new to F#, about two months, and I recently finished the FParsec tutorial and started looking for more examples. The more I read the more confused I became, and then I started to see references to styles. I looked for more styles and came up with this list.
Combinator style
Monadic style
Arrow style
Direct style
Can someone list all of the styles and explain and demonstrate how each one works with a common problem, e.g. parse
“(abc
(b CDEF
(de 1 E)
(f 234)
)
(h 3)
(jkl H)
)”
into
[Lower "abc";
Group[Lower "b"; Upper "CDEF";
Group [Lower "de"; Number "1"; Upper "E"];
Group [Lower "f"; Number "234"]];
Group [Lower "h"; Number "3"];
Group [Lower "jkl"; Upper "H"]
]
Using
Type out =
| Lower of string
| Upper of string
| Number of string
| Group of out list
EDIT
I picked up combinator and monadic style from a comment in FParsec and a delimiter based syntax
Direct style is always appearing as Direct Style Monadic Parser
Arrow style appears in Parsec: Direct Style Monadic Parser Combinators For The Real World I haven’t read all of this.
EDIT
Per suggestion
Combinator style
type out =
| Lower of string
| Upper of string
| Number of string
| Group of out list
type Parser = Parser<out, unit>
let isUpper = fun c -> isAsciiUpper c
let upper : Parser =
many1Satisfy isUpper .>> ws
|>> fun x -> Upper(x)
let isLower = fun c -> isAsciiLower c
let lower : Parser=
many1Satisfy isLower .>> ws
|>> fun x -> Lower(x)
let isNumber = fun c -> isDigit c
let number : Parser =
many1Satisfy isNumber .>> ws
|>> fun x -> Number(x)
let groupRef, groupImpl = createParserForwardedToRef()
let item : Parser =
lower <|> upper <|> number <|> groupRef
let items =
many item .>> ws
|>> fun x -> Group(x)
do groupImpl := between (pchar '(') (pchar ')') items .>> ws
let test () =
match run groupRef "(abc (b CDEF (de 1 E) (f 234)) (h 3) (jkl H) )" with
| Success(result, _, _) -> printf "Success: %A" result
| Failure(errorMsg, _, _) -> printf "Failure: %s" errorMsg
Monadic style
type out =
| Lower of string
| Upper of string
| Number of string
| Group of out list
type Parser = Parser<out, unit>
let isUpper = fun c -> isAsciiUpper c
let upper : Parser = parse {
let! x = many1Satisfy isUpper
do! ws
return Upper(x)
}
let isLower = fun c -> isAsciiLower c
let lower = parse {
let! x = many1Satisfy isLower
do! ws
return Lower(x)
}
let isNumber = fun c -> isDigit c
let number = parse {
let! x = many1Satisfy isNumber
do! ws
return Number(x)
}
let groupRef, groupImpl = createParserForwardedToRef()
let group = parse {
let! x = groupRef
do! ws
return x
}
let item =
lower <|> upper <|> number <|> group
let items = parse {
let! x = many item
do! ws
return Group(x)
}
do groupImpl := between (pchar '(') (pchar ')') items
let test () =
match run group "(abc (b CDEF (de 1 E) (f 234)) (h 3) (jkl H) )" with
| Success(result, _, _) -> printf "Success: %A" result
| Failure(errorMsg, _, _) -> printf "Failure: %s" errorMsg