This seems like a dumb question, because when I prototype it in a terminal I can make it work. But when I use the following module:
http://caml.inria.fr/pub/docs/manual-ocaml/libref/Lexing.html
and this code:
(* Identifiers *)
let ws    = [' ' '\t']*
let id    = ['A'-'Z' 'a'-'z']+
let map   = id ws ':' ws id
let feed  = '{' ws map+ ws '}'
let feeds = '[' ws feed+ ws ']'
(* Entry points *)
rule token = parse
    [' ' '\t']      { token lexbuf }   (* skip blanks *)
  | ['\n']          { EOL }
  | feeds as expr   { Feeds(expr) }
  | id as expr      { Id(expr) }
  | feed as expr    {
      let pos = Lexing.lexeme_start_p lexbuf in
      let pos_bol = pos.pos_bol in
      print_string (string_of_int pos_bol);
      print_string "\n";
      Feed(expr) }
I am getting the following error:
Error: Unbound record field label pos_bol
and I am kind of perplexed as to why this is happening. The documentation I linked above says that pos_bol is a field of the type Lexing.position.
Sorry, I feel like this is going to have a rather obvious answer, but thanks anyway!
In OCaml, sum constructors and record field names are scoped inside modules, like identifiers. The position record is defined inside Lexing, which isn't opened in the current scope, so instead of pos.pos_bol you should use pos.Lexing.pos_bol.
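For example, a minimal sketch of the semantic action above with only the field access changed:

let pos = Lexing.lexeme_start_p lexbuf in
let pos_bol = pos.Lexing.pos_bol in   (* qualified field access *)
print_string (string_of_int pos_bol);
print_string "\n";
Feed(expr)

Alternatively, a local open (let open Lexing in ...) or a top-level open Lexing brings the field labels into scope, after which pos.pos_bol works unqualified.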
Related
As I understand it, OCaml doesn't require explicit return statements to yield a value from a function; the value of the last expression in the function body is what gets returned.
In that case, could someone please let me know what the following function foo is returning? It seems that it's returning a stream of data. Is it returning the lexer?
and foo ?(input = false) =
  lexer
  | 'x' _ -> let y = get_func lexbuf in
             get_text y
  | ','   -> get_func lexbuf
  | _     -> get_text lexbuf
I'm trying to edit the following function, bar, to return a data stream as well, so that I can replace foo with bar in another function. However, it seems that bar has multiple lexers, which is preventing this. How can I rewrite bar to return a data stream in the same way that foo appears to?
let bar cmd lexbuf =
  let buff = Buffer.create 0 in
  let quot plus =
    lexer
    | "<" -> if plus then Buffer.add_string b "<" quot plus lexbuf
  and unquot plus =
    lexer
    | ">" -> if plus then Buffer.add_string b ">" unquot plus lexbuf
  in
  match unquot true lexbuf with
  | e -> force_text cmd e
First, your code is probably using one of the old camlp4 syntax extensions (the lexer keyword); you should specify which one.
Second, foo returns the same type of value as either get_text or get_func. Without the code for those functions, it is not really possible to say more than that.
Third,
Buffer.add_string b ">" unquot plus lexbuf
is ill-typed. Are you missing parentheses:
Buffer.add_string b ">" (unquot plus lexbuf)
?
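For what it's worth, if the intent of those arms was to append to the buffer and then keep lexing, a sequence rather than an application may be what was meant, e.g. (a sketch, assuming b is the buffer in scope and guessing at the else branch):

| "<" -> if plus then (Buffer.add_string b "<"; quot plus lexbuf)
         else quot plus lexbuf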
I have some basic ocamllex code, which was written by my professor, and seems to be fine:
{ type token = EOF | Word of string }
rule token = parse
| eof { EOF }
| [’a’-’z’ ’A’-’Z’]+ as word { Word(word) }
| _ { token lexbuf }
{
(*module StringMap = BatMap.Make(String) in *)
let lexbuf = Lexing.from_channel stdin in
let wordlist =
let rec next l = match token lexbuf with
EOF -> l
| Word(s) -> next (s :: l)
in next []
in
List.iter print_endline wordlist
}
However, running ocamllex wordcount.mll produces
File "wordcount.mll", line 4, character 3: syntax error.
This indicates that there is an error at the first [ in the regex in the fourth line here. What is going on?
You seem to have curly quotes (also called "smart quotes" -- ugh) in your text. You need regular old single quotes.
curly quote: ’
old fashioned single quote: '
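For reference, the rule from the question rewritten with plain single quotes:

| ['a'-'z' 'A'-'Z']+ as word { Word(word) }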
I have a simple ocamllex program where the rules section looks somewhat like this:
let digits= ['0'-'9']
let variables= 'X'|'Z'
rule addinlist = parse
  | ['\n']   { addinlist lexbuf }
  | "Inc" '(' variables+ '(' digits+ ')' ')' as ine
      { !inputstringarray.(!inputstringarrayi) <- ine;
        inputstringarrayi := !inputstringarrayi + 1;
        addinlist lexbuf }
  | _ as c   { printf "Unrecognized character: %c\n" c;
               addinlist lexbuf }
  | eof      { () }
My question is: suppose I want to match Inc(X(7)) so that I can convert it to my abstract syntax, which is "Inc of var of int". I want the lexer to give me the separate strings while reading Inc(X(7)): "Inc" as one string (say inb), "X" as another (say inc), and "7" as another (say ind), so that I can work with inb, inc, and ind instead of being stuck with the whole string ine, as my program currently gives me. How do I go about this? I hope my question is clear.
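One possible approach (a sketch, not tested against the rest of your program): ocamllex allows as bindings to be nested inside a pattern, so the sub-matches can be captured directly instead of taking the whole string ine apart afterwards. Since "Inc" is a literal, the inb part is always the string "Inc"; inc and ind below capture the variable name and the digits:

| "Inc" '(' (variables+ as inc) '(' (digits+ as ind) ')' ')' as ine
    { (* for Inc(X(7)): inc = "X", ind = "7", ine is the whole match *)
      !inputstringarray.(!inputstringarrayi) <- ine;
      inputstringarrayi := !inputstringarrayi + 1;
      addinlist lexbuf }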
I have the following line from apache (changing the apache log format is not an option):
... referrer=- user_agent=JSON-RPC Client status=200 size=44 ...
and I am trying to parse it with Ragel. I am extracting all the fields except user_agent. The user guide says that strong difference (--) ensures that the first machine does NOT contain the second. In my case, I want to match anything that does not contain " status=", which signifies the next field. However, my current definition (below) appears to skip user_agent entirely; I still get status and the following fields. Am I using strong difference properly?
...
referrer = ^space+ >mark %{ emit("referrer"); };
user_agent = any* -- ' status=' >mark %{ emit("user_agent"); };
status = digit+ >mark %{ emit("status"); };
...
line = (
...
space "referrer="
referrer
space "user_agent="
user_agent
space "status="
status
space "size="
...
);
I know this question is old, but I'm just getting into Ragel and this may help others.
The problem here is operator precedence. The action operators > and % are evaluated before the strong difference operator --. It's easy to fix with parentheses:
user_agent = (any* -- ' status=') >mark %{ emit("user_agent"); };
I would also use space instead of hard-coding a space, at least for consistency:
user_agent = (any* -- (space 'status=')) >mark %{ emit("user_agent"); };
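Without the parentheses, the original definition parses as any* -- (' status=' >mark %{ ... }): the mark and emit actions get embedded in the subtracted literal rather than attached to the whole difference machine, which is why user_agent never gets emitted.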
You can use state charts; try this state machine:
STATUS := digit+ ' ' #{ fnext SIZE; };
SIZE := digit+;
USER_AGENT =
start: (
's' -> s1 |
[^s] #{ if (!ua_start) ua_start = p; } -> start
),
s1: (
't' -> s2 |
[^t] -> start
),
s2: (
'a' -> s3 |
[^a] -> start
),
s3: (
't' -> s4 |
[^t] -> start
),
s4: (
'u' -> s5 |
[^u] -> start
),
s5: (
's' -> s6 |
[^s] -> start
),
s6: (
'=' #{ fnext STATUS; ua_stop = p - 7; } -> final |
[^=] -> start
)
;
main := "user_agent=" USER_AGENT ;
You can get the user agent between ua_start and ua_stop; for the other variables, I think you already know how to mark and capture the characters.
I have a weird string syntax where the meaning of a delimiter depends on context. In the following sample input:
( (foo) (bar) )
the result is a list of two strings ["foo"; "bar"]. The outer pair of parentheses enters list mode. Then the next pair of parentheses delimits a string. Inside strings, balanced pairs of parentheses are to be treated as part of the string.
Right now the lexer decides what to return depending on a global variable inside.
{
open Sample_parser
exception Error of string
let inside = ref false (* <= to be eliminated *)
}
The delimiters are parentheses. If the lexer hits an opening parenthesis while inside is false, it emits an Enter token and sets inside to true. If inside is true, it switches to a string lexer which treats any properly nested pair of parentheses as part of the string; when the nesting level returns to zero, the string buffer is passed to the parser. If a closing parenthesis is encountered outside a string, a Leave token is emitted and inside is unset.
My question is: how do I rewrite the lexer without the global variable inside?
Fwiw I use menhir, but afaict the same would be true for ocamlyacc.
(Sorry if this sounds confused, I'm really a newbie to the yacc/lex approach. I can express all of the above without thinking as a PEG, but I haven't got used to mentally keeping lexer and parser separated. Feel free to point out other issues with the code!)
Simple example: *sample_lexer.mll*
{
open Sample_parser
exception Error of string
let inside = ref false (* <= to be eliminated *)
}
let lpar = "("
let rpar = ")"
let ws = [' ' '\t' '\n' '\r']
rule tokenize = parse
| ws { tokenize lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
and string_scanner init depth buf = parse
| rpar { if depth = 0 then begin
Buffer.contents buf;
end else begin
Buffer.add_char buf ')';
string_scanner init (depth - 1) buf lexbuf end }
| lpar { Buffer.add_char buf '(';
string_scanner init (depth + 1) buf lexbuf }
| eof { raise (Error (Printf.sprintf
"Unexpected end of file inside string, pos %d--%d]!\n"
init
(Lexing.lexeme_start lexbuf))) }
| _ as chr { Buffer.add_char buf chr;
string_scanner init depth buf lexbuf }
*sample_parser.mly*:
%token <string> String
%token Enter
%token Leave
%start <string list> process
%%
process:
| Enter lst = string_list Leave { lst }
string_list:
| elm = element lst = string_list { elm :: lst }
| elm = element { [elm] }
element:
| str = String { str }
main.ml:
open Batteries
let sample_input = "( (foo (bar) baz) (xyzzy) )"
(* EibssssssssssssseibssssseiL
* where E := enter inner
* L := leave inner
* i := ignore (whitespace)
* b := begin string
* e := end string
* s := part of string
*
* desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
*)
let main () =
let buf = Lexing.from_string sample_input in
try
List.print
String.print stdout
(Sample_parser.process Sample_lexer.tokenize buf);
print_string "\n";
with
| Sample_lexer.Error msg -> Printf.eprintf "%s%!" msg
| Sample_parser.Error -> Printf.eprintf
"Invalid syntax at pos %d.\n%!"
(Lexing.lexeme_start buf)
let _ = main ()
You can pass the state as an argument to tokenize. It still has to be mutable, but not global.
rule tokenize inside = parse
| ws { tokenize inside lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
And you call the parser as follows:
Sample_parser.process (Sample_lexer.tokenize (ref false)) buf
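Note that Sample_lexer.tokenize (ref false) is a partial application: it yields a function of type Lexing.lexbuf -> token, which is the token-supplier type the menhir-generated Sample_parser.process expects. A fresh ref is created for each parse, so no state is shared between runs.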