ocamllex regex syntax error

I have some basic ocamllex code, which was written by my professor, and seems to be fine:
{ type token = EOF | Word of string }
rule token = parse
| eof { EOF }
| [’a’-’z’ ’A’-’Z’]+ as word { Word(word) }
| _ { token lexbuf }
{
(*module StringMap = BatMap.Make(String) in *)
let lexbuf = Lexing.from_channel stdin in
let wordlist =
let rec next l = match token lexbuf with
EOF -> l
| Word(s) -> next (s :: l)
in next []
in
List.iter print_endline wordlist
}
However, running ocamllex wordcount.mll produces
File "wordcount.mll", line 4, character 3: syntax error.
This indicates that there is an error at the first [ in the regex in the fourth line here. What is going on?

You seem to have curly quotes (also called "smart quotes" -- ugh) in your text. You need regular old single quotes.
curly quote: ’
old fashioned single quote: '
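For reference, the offending rule rewritten with plain ASCII quotes:
| ['a'-'z' 'A'-'Z']+ as word { Word(word) }
ocamllex should accept the file once the curly quotes are replaced.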

What is this OCaml function returning?

As I understand it, OCaml doesn't require explicit return statements to yield a value from a function. The last line of the function is what returns something.
In that case, could someone please let me know what the following function foo is returning? It seems that it's returning a stream of data. Is it returning the lexer?
and foo ?(input = false) =
lexer
| 'x' _
-> let y = get_func lexbuf
get_text y
| ',' -> get_func lexbuf
| _ -> get_text lexbuf
I'm trying to edit the following function, bar, to return a data stream as well, so that I can replace foo with bar in another function. However, bar seems to contain multiple lexers, which prevents it from returning one. How can I rewrite bar so that it returns a data stream in the same way foo appears to?
let bar cmd lexbuf =
let buff = Buffer.create 0 in
let quot plus =
lexer
| "<" -> if plus then Buffer.add_string b "<" quot plus lexbuf
and unquot plus =
lexer
| ">" -> if plus then Buffer.add_string b ">" unquot plus lexbuf
in
match unquot true lexbuf with
| e -> force_text cmd e
First, your code is probably using one of the old camlp4 syntax extensions; you should specify which one.
Second, foo returns the same type of value as get_text and get_func. Without the code for those functions, it is not really possible to say more than that.
Third,
Buffer.add_string b ">" unquot plus lexbuf
is ill-typed. Are you missing parentheses:
Buffer.add_string b ">" (unquot plus lexbuf)
?
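One extra hedged note, since the full camlp4 lexer syntax is not shown here: Buffer.add_string has type Buffer.t -> string -> unit, so it cannot take the recursive call as a further argument; the intended shape is most likely a sequence, and the buffer in bar is bound as buff, not b. Something like:
(* hypothetical reading of the intent: record the delimiter, then keep scanning *)
Buffer.add_string buff ">";
unquot plus lexbuf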

how to match only part of the expression to string in ocamllex

I have a simple ocamllex program where the rules section looks somewhat like this:
let digits= ['0'-'9']
let variables= 'X'|'Z'
rule addinlist = parse
|['\n'] {addinlist lexbuf;}
| "Inc" '(' variables+ '(' digits+ ')' ')' as ine { !inputstringarray.(!inputstringarrayi) <-ine;
inputstringarrayi := !inputstringarrayi +1;
addinlist lexbuf}
|_ as c
{ printf "Unrecognized character: %c\n" c;
addinlist lexbuf
}
| eof { () }
My question is: suppose I want to match Inc(X(7)) so that I can convert it to my abstract syntax, which is "Inc of var of int". I want my lexer to give me the separate strings while reading Inc(X(7)), so that I get "Inc" as one string (say inb), followed by "X" as another string (say inc), followed by "7" as a third string (say ind). That way I can play around with the strings inb, inc, and ind instead of being stuck with the whole string ine, as my current program gives me. How do I go about this? I hope my question is clear.
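One way to get at the pieces separately, sketched here rather than guaranteed to fit the rest of the program: ocamllex lets you put as bindings on sub-patterns of the same regular expression, so each interesting part can be named on its own. Assuming the same digits and variables definitions, and that Printf is opened in the header as in the question:
rule addinlist = parse
| ['\n'] { addinlist lexbuf }
| "Inc" '(' (variables+ as inc) '(' (digits+ as ind) ')' ')'
    { (* the keyword is fixed by the pattern, so the first piece is always "Inc";
         inc is the variable part, e.g. "X", and ind is the digits, e.g. "7" *)
      printf "keyword=Inc var=%s num=%s\n" inc ind;
      addinlist lexbuf }
| _ as c { printf "Unrecognized character: %c\n" c; addinlist lexbuf }
| eof { () }
From there, int_of_string ind gives the integer needed for the Inc of var of int constructor.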

Extracting a field from a type in OCaml

This seems like a dumb question to ask, as when I prototype it inside of a terminal I am able to make this work. But when I use the following specific module:
http://caml.inria.fr/pub/docs/manual-ocaml/libref/Lexing.html
and this code:
(*Identifiers*)
let ws = [' ' '\t']*
let id = ['A'-'Z' 'a'-'z'] +
let map = id ws ':' ws id
let feed = '{' ws map+ ws '}'
let feeds = '[' ws feed + ws ']'
(*Entry Points *)
rule token = parse
[' ' '\t'] { token lexbuf } (* skip blanks *)
| ['\n' ] { EOL }
| feeds as expr { Feeds( expr ) }
| id as expr { Id(expr) }
| feed as expr {
let pos = Lexing.lexeme_start_p lexbuf in
let pos_bol = pos.pos_bol in
print_string (string_of_int pos_bol);
print_string "\n";
Feed(expr) }
I am getting the following error:
Error: Unbound record field label pos_bol
and I am kind of perplexed as to why this is happening. In the documentation I linked above it says that pos_bol is a field of the type Lexing.position.
Sorry, I feel like this is going to have a rather obvious answer when it is answered, but thanks anyway!
In OCaml, sum constructors and record field names are scoped inside modules, like identifiers. The position record is defined inside Lexing, which isn't opened in the current scope, so instead of pos.pos_bol you should use pos.Lexing.pos_bol.
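A minimal sketch of the two usual spellings (the function names here are made up for illustration):
(* qualify the field with the module that defines the record type *)
let line_start_offset (pos : Lexing.position) : int =
  pos.Lexing.pos_bol

(* or open Lexing locally so pos_bol is in scope unqualified *)
let line_start_offset' (pos : Lexing.position) : int =
  let open Lexing in
  pos.pos_bol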

syntax error in ocaml because of String.concat

Let's say I have a list of type integer [blah; blah; blah; ...], I don't know the size of the list, and I want to pattern match and not print the first element of the list. Is there any way to do this without using an if/else case or running into a syntax error?
All I'm trying to do is parse a file that looks like a/path/to/blah/blah/../file.c
and only print the path/to/blah/blah.
for example, can it be done like this?
let out x = Printf.printf " %s \n" x
let _ = try
while true do
let line = input_line stdin in
...
let rec f (xpath: string list) : ( string list ) =
begin match Str.split (Str.regexp "/") xpath with
| _::rest -> out (String.concat "/" _::xpath);
| _ -> ()
end
But if I do this, I get a syntax error at the String.concat line!
String.concat "/" _::xpath doesn't mean anything because _ is pattern but not a value. _ can be used in the left part of a pattern matching but not in the right part.
What you want to do is String.concat "/" rest.
Even if _::xpath were correct, String.concat "/" _::xpath would be interpreted as (String.concat "/" _)::xpath whereas you want it to be interpreted as String.concat "/" (_::xpath).
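Folding the answer's suggestion back into the question's shape, a minimal sketch; note that Str.split takes a string (not a string list), so xpath is typed as string here, and the Str library must be linked:
let out x = Printf.printf " %s \n" x

(* drop the first component and print the remainder joined with "/" *)
let f (xpath : string) : unit =
  match Str.split (Str.regexp "/") xpath with
  | _ :: rest -> out (String.concat "/" rest)
  | [] -> ()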

token meaning dependent on context

I have a weird string syntax where the meaning of a delimiter depends on context. In the following sample input:
( (foo) (bar) )
the result is a list of two strings ["foo"; "bar"]. The outer pair of parentheses enters list mode. Then, the next pair of parentheses delimits the string. Inside strings, balanced pairs of parentheses are to be treated as part of the string.
Right now the lexer decides what to return depending on a global variable inside.
{
open Sample_parser
exception Error of string
let inside = ref false (* <= to be eliminated *)
}
The delimiters are parentheses. If the lexer hits an opening parenthesis, then: if inside is false, it emits an Enter token and sets inside to true; if inside is true, it switches to a string lexer which treats any properly nested pair of parentheses as part of the string, and when the nesting level returns to zero, the string buffer is passed to the parser. If a closing parenthesis is encountered outside a string, a Leave token is emitted and inside is unset.
My question is: how do I rewrite the lexer without the global variable inside?
Fwiw I use menhir, but afaict the same would be true for ocamlyacc.
(Sorry if this sounds confused, I'm really a newbie to the yacc/lex approach. I can express all the above without thinking as a PEG, but I haven't got used to mentally keeping lexer and parser separated. Feel free to point out other issues with the code!)
Simple example: *sample_lexer.mll*
{
open Sample_parser
exception Error of string
let inside = ref false (* <= to be eliminated *)
}
let lpar = "("
let rpar = ")"
let ws = [' ' '\t' '\n' '\r']
rule tokenize = parse
| ws { tokenize lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
and string_scanner init depth buf = parse
| rpar { if depth = 0 then begin
Buffer.contents buf;
end else begin
Buffer.add_char buf ')';
string_scanner init (depth - 1) buf lexbuf end }
| lpar { Buffer.add_char buf '(';
string_scanner init (depth + 1) buf lexbuf }
| eof { raise (Error (Printf.sprintf
"Unexpected end of file inside string, pos %d--%d]!\n"
init
(Lexing.lexeme_start lexbuf))) }
| _ as chr { Buffer.add_char buf chr;
string_scanner init depth buf lexbuf }
*sample_scanner.mly*:
%token <string> String
%token Enter
%token Leave
%start <string list> process
%%
process:
| Enter lst = string_list Leave { lst }
string_list:
| elm = element lst = string_list { elm :: lst }
| elm = element { [elm] }
element:
| str = String { str }
main.ml:
open Batteries
let sample_input = "( (foo (bar) baz) (xyzzy) )"
(* EibssssssssssssseibssssseiL
* where E := enter inner
* L := leave inner
* i := ignore (whitespace)
* b := begin string
* e := end string
* s := part of string
*
* desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
*)
let main () =
let buf = Lexing.from_string sample_input in
try
List.print
String.print stdout
(Sample_parser.process Sample_lexer.tokenize buf);
print_string "\n";
with
| Sample_lexer.Error msg -> Printf.eprintf "%s%!" msg
| Sample_parser.Error -> Printf.eprintf
"Invalid syntax at pos %d.\n%!"
(Lexing.lexeme_start buf)
let _ = main ()
You can pass the state as an argument to tokenize. It still has to be mutable, but not global.
rule tokenize inside = parse
| ws { tokenize inside lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
And you call the parser as follows:
Sample_parser.process (Sample_lexer.tokenize (ref false)) buf
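With that, the only other change is the call site in main.ml; a sketch of the adjusted main (the ref is allocated once by the partial application, so every token the parser requests shares that one cell, it just is no longer a module-level global):
let main () =
  let buf = Lexing.from_string sample_input in
  (* the lexer state now lives here instead of inside sample_lexer.mll *)
  let tokenizer = Sample_lexer.tokenize (ref false) in
  try
    List.print String.print stdout (Sample_parser.process tokenizer buf);
    print_string "\n"
  with
  | Sample_lexer.Error msg -> Printf.eprintf "%s%!" msg
  | Sample_parser.Error ->
      Printf.eprintf "Invalid syntax at pos %d.\n%!" (Lexing.lexeme_start buf)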