Ocaml - Convert string to lexeme sequences - regex

I have this code with the following structure:
(* Abstract syntax tree of a regular expression over single characters.
   U and P are binary nodes and S is a unary node, so S (U (C 'a', C 'b'))
   is the tree for the star of the union of 'a' and 'b'. *)
type regexp =
| V (* void *)
| E (* epsilon *)
| C of char (* char *)
| U of regexp * regexp (* a + b *)
| P of regexp * regexp (* a.b *)
| S of regexp (* a* *)
;;
How do I convert a string such as "(a + b)*" into a value of type regexp, in this case S (U (C 'a', C 'b'))?

Related

Ocaml Custom Data types

(* Arithmetic expression tree. Every operator constructor takes exactly two
   sub-expressions, so a chain of three or more operands has to be written
   by nesting constructors. *)
type expr =
| Plus of expr * expr (* a + b *)
| Minus of expr * expr (* a - b *)
| Times of expr * expr (* a * b *)
| Divide of expr * expr (* a / b *)
| Var of string (* "x", "y", etc. *)
Given this type expr, I would like to know how to work with a variable number of operands, depending on my needs. Plus takes exactly two operands (expr * expr), but what if I want to write (a + b + c) or (a * b * c)? Is that possible? I got this example from https://ocaml.org/learn/tutorials/data_types_and_matching.html
Since + is a binary operation, the usual representation of a + b + c is this:
Plus (Plus (Var "a", Var "b"), Var "c")

Recursive match function with cyclic function dependences

I'm not sure the title explains my problem well, and I can improve it later; for now I want to describe the problem itself, because I have been racking my brain over it for several days.
For my class I'm developing a static analysis in OCaml that checks whether a program written in a subset of C is meaningful, and I'm new to all of this (the language, the paradigm, and compiler construction).
The static analysis traverses the abstract syntax tree (AST) and performs some checks on it (the checks themselves are still marked with TODO comments). For the moment I'm developing the data structures, in particular a symbol table, and implementing the code that traverses the AST.
My complete Ast.
(* Abstract syntax tree of the C-like source language.

   The [@@deriving show] attributes ask the ppx_deriving "show" plugin to
   generate pretty-printers for each type; they must be spelled with "@@"
   (item attribute) and "@" (field/type attribute) to be valid OCaml. *)

(* Binary primitive operators. *)
type binop = Add | Sub | Mult | Div | Mod | Equal | Neq | Less | Leq |
             Greater | Geq | And | Or | Comma
[@@deriving show]

(* Unary primitive operators. *)
type uop = Neg | Not [@@deriving show]

(* Names of variables and functions. *)
type identifier = string [@@deriving show]

(* Source span: a pair of lexer positions. *)
type position = Lexing.position * Lexing.position

(* Placeholder span for nodes that have no real source location. *)
let dummy_pos = (Lexing.dummy_pos, Lexing.dummy_pos)

(* A syntax node paired with its source span. The span is marked [@opaque]
   so the derived printer does not need a printer for Lexing.position. *)
type 'a annotated_node = {loc : position[@opaque]; node : 'a}[@@deriving show]

type typ =
  | TypInt (* Type int *)
  | TypBool (* Type bool *)
  | TypChar (* Type char *)
  | TypArray of typ * int option (* Array type *)
  | TypPoint of typ (* Pointer type *)
  | TypVoid (* Type void *)
[@@deriving show]

(* Expressions, accesses, statements and block items are mutually
   recursive, hence the single "type ... and ..." group. *)
and expr = expr_node annotated_node
and expr_node =
  | Access of access (* x or *p or a[e] *)
  | Assign of access * expr (* x=e or *p=e or a[e]=e *)
  | Addr of access (* &x or &*p or &a[e] *)
  | ILiteral of int (* Integer literal *)
  | CLiteral of char (* Char literal *)
  | BLiteral of bool (* Bool literal *)
  | UnaryOp of uop * expr (* Unary primitive operator *)
  | BinaryOp of binop * expr * expr (* Binary primitive operator *)
  | Call of identifier * expr list (* Function call f(...) *)
[@@deriving show]

and access = access_node annotated_node
and access_node =
  | AccVar of identifier (* Variable access x *)
  | AccDeref of expr (* Pointer dereferencing *p *)
  | AccIndex of access * expr (* Array indexing a[e] *)
[@@deriving show]

and stmt = stmt_node annotated_node
and stmt_node =
  | If of expr * stmt * stmt (* Conditional *)
  | While of expr * stmt (* While loop *)
  | For of expr option * expr option * expr option * stmt (* For loop *)
  | Expr of expr (* Expression statement e; *)
  | Return of expr option (* Return statement *)
  | Block of stmtordec list (* Block: grouping and scope *)
[@@deriving show]

and stmtordec = stmtordec_node annotated_node
and stmtordec_node =
  | Dec of typ * identifier (* Local variable declaration *)
  | Stmt of stmt (* A statement *)
[@@deriving show]

(* A function definition: return type, name, formal parameters and body. *)
type fun_decl = {
  typ : typ;
  fname : string;
  formals : (typ*identifier) list;
  body : stmt;
}[@@deriving show]

(* Top-level declarations: either a function or a global variable. *)
type topdecl = topdecl_node annotated_node
and topdecl_node =
  | Fundecl of fun_decl
  | Vardec of typ * identifier
[@@deriving show]

(* A whole program is a list of top-level declarations. *)
type program = Prog of topdecl list [@@deriving show]
My problem is how to traverse a stmt: it can contain a Block of stmtordec list, and a stmtordec can in turn contain a stmt, so I end up with a cyclic dependency that I'm not able to resolve with match functions.
My idea is to have OCaml functions that call each other, check_stm -> check_blk -> check_stm, but how can I express this idea in code?
At the moment my code looks like the following, but it doesn't compile because I'm not able to bring both functions into scope at the same time.
My code is:
open Ast
open Symbol_table
open Easy_logging
(* Logger used by every check below, created under the name "Semant".
   NOTE(review): Logging comes from the opened Easy_logging module; the
   exact meaning of the Debug level and the Cli handler is not visible
   here. *)
let logger = Logging.make_logger "Semant" Debug [Cli Debug]
(* Global Scope: This scope contains all the Global declaration
Global declaration types:
- Int, Bool, Char, Array.
- Struct
- function declaration
*)
(* NOTE(review): this is an ordinary immutable let-binding to the
   empty_table value from the opened modules; whether the table itself can
   be updated in place depends on Symbol_table, which is not shown. *)
let global_scope = empty_table
(* Checks one item of a block (a stmtordec annotated node).
   - Dec: logs and adds the declared identifier and its type to
     global_scope via Symbol_table.add_entry.
   - Stmt: logs and delegates to check_stm.
   NOTE(review): check_stm is defined after this function with a separate
   plain let, so it is not in scope here; the two functions call each
   other and would need a single let rec ... and ... group.
   NOTE(review): Ast.Stmt carries a stmt (an annotated node), while
   check_stm below matches directly on stmt_node constructors, so the call
   would presumably need to pass stm.node, as check_fundec does. *)
let check_blk blkstm =
match blkstm.node with
| Ast.Dec(tipe, id) ->
begin
logger#debug "Variable declaration check";
(* TODO: I'm missing the variable duplication *)
Symbol_table.add_entry id tipe global_scope
end
| Ast.Stmt(stm) ->
begin
logger#debug "Stm check (recursive call)";
check_stm stm
end
(* Checks a statement node (a stmt_node, i.e. already unwrapped from its
   annotation). Every case except Block only logs a TODO message; Block
   applies check_blk to each item of the block in order. *)
let check_stm node =
match node with
| Ast.If(ex, ifs, els) -> logger#debug "TODO: If stm check"
| Ast.While(ex, stm) -> logger#debug "TODO: While stm check"
| Ast.For(ex1, ex2, ex3, stm) -> logger#debug "TODO: For stm check"
| Ast.Expr(ex) -> logger#debug "TODO: Expression check"
| Ast.Return(optex) -> logger#debug "TODO: Return stm check"
| Ast.Block(blkstm) -> List.iter check_blk blkstm
(* Checks a function declaration: logs, then checks the statement node of
   the function body with check_stm.
   NOTE(review): the pattern fun_decl is a lowercase identifier, so it is a
   variable pattern that matches any value; the match has a single,
   always-taken case and f is simply another name for node. *)
let check_fundec node =
match node with
| fun_decl as f ->
begin
logger#debug "Checking function declaration";
(* TODO: how I can managed the parameter of the function?*)
(* NOTE(review): in expression position = is structural comparison, not
   assignment; this line compares global_scope with the result of
   begin_block and discards the boolean. *)
global_scope = Symbol_table.begin_block global_scope;
check_stm f.body.node
end
(* Handles one top-level declaration (a topdecl annotated node).
   - Vardec: logs and adds the variable and its type to global_scope.
   - Fundecl: logs, adds the function name with its return type to
     global_scope, then checks the declaration with check_fundec.
   The function is declared with rec but does not call itself. *)
let rec match_type ast_elem =
match ast_elem.node with
| Vardec(tipe, id) ->
begin
logger#debug "Global variable found";
(* The value returned by add_entry is dropped by the sequence operator and
   the branch evaluates to unit. *)
add_entry id tipe global_scope;
()
end
| Fundecl(fundec) ->
begin
logger#debug "Function analysis found";
Symbol_table.add_entry fundec.fname fundec.typ global_scope;
check_fundec fundec;
end
(* Entry point: runs match_type on every top-level declaration of the
   program, in list order. *)
let check (Ast.Prog(topdecls)) = List.iter match_type topdecls
Maybe this question is foolish, and maybe there is something wrong with my approach, but I would like to discuss the problem so that I can fix it and learn how to use the OCaml language.
P.S.: For the moment the Symbol_table implementation is empty.
If I understand your problem correctly you just have to specify the mutually recursive functions explicitly. You do so using the and keyword, just like with type definitions, but also have to use the rec keyword because function definitions are not recursive by default, unlike type definitions:
let rec check_blk blkstm = ...
and check_stm node = ...

Implement an interpreter in Ocaml

I'm trying to write an interpreter in OCaml. I have already defined the syntax and the semantics for most of the operations.
I'm trying to implement two more operations:
Ntimes: this takes two arguments, an integer and a function, and has to apply the function n times.
Pipe: this is the same as the pipe in the Linux bash shell.
example of Ntimes:
Ntimes(Int(4),f);;
example of Pipe:
Pipe(f1,(f2,(f3,(f4,(f5,Nil)))));;
The interpreter that I have is this:
SYNTAX
(* Identifiers are plain strings. *)
type ide = string;;
(* Binary operators usable in an Op expression. *)
type operator = Plus | Minus | Mul | Div | And | Or | Eq;;
(* Expressions of the interpreted language. tuple is a cons-list of
   expressions terminated by Nil; it is used by both Etup and Pipe. *)
type exp = Int of int
| Bool of bool
| Den of string
| Op of exp * operator * exp
| Let of ide * exp * exp
| Fun of ide * exp
| Apply of exp * exp
| Ifz of exp * exp * exp
| Etup of tuple
| Pipe of tuple
(* NOTE(review): this constructor is named ManyTimes and takes an OCaml
   int, while the surrounding text uses Ntimes(Int(4),f) and the eval
   sketch matches on NTimes; the three spellings need to be reconciled. *)
| ManyTimes of int * exp
and tuple = Nil | Seq of exp * tuple;;
(* Values produced by evaluation. A Funval carries an efun triple; the
   Apply case of eval reads it as (parameter name, body, environment).
   NOTE(review): the env type constructor is not defined in the visible
   code. *)
type dexp = Dint of int | Dbool of bool | Dstring of string | Unbound | Dtuple of dexp list | Funval of efun and efun = ide * exp * dexp env ;;
RUN-TIME SUPPORT
let rec eval ((e: exp), (r:dexp env)) = match e with
Int i -> Dint i
| Bool i -> Dbool i
.
.
.
.
| Etup e1 -> let v = (evalList e1 r) in Dtuple v
| Apply(e1, e2) -> (match eval(e1, r) with
| Funval(i, a, r1) -> eval(a, bind(r1, i, eval(e2, r)))
| _ -> failwith("no funct in apply"))
| NTimes(i,e) -> *I have no idea*
| Pipe(e) -> * I have no idea*
;;
Thanks for any help you can give me!

Generating C code in Ocaml

I'm trying to create a code generating DSL in OCaml, however I can't find many examples on what the code generation looks like. I would just like to see how to create code values in OCaml.
For example if I had a type like this:
(* A single binary arithmetic operation on two integer operands.
   Variant types are introduced with the type keyword, not let. *)
type equation =
  | Add of int * int
  | Sub of int * int
  | Mul of int * int
  | Div of int * int;;
and I want a function like this:
let write_code = function
| Add (x, y) -> // INSERT CODE "x + y" here
etc... how would this look?
I have looked at this example http://okmij.org/ftp/meta-programming/tutorial/power.ml but the characters .< >. are causing syntax errors when I try to compile.
The code generated will not need to be compiled or executed, but saved to a .c file for later use.
I would just like to see the basic structure for this simple example so I can apply it to a more complicated problem.
You can do it like this:
(* Expression tree: integer constants, named variables, and binary sums
   and products of sub-expressions. *)
type equation =
  | Const of int
  | Var of string
  | Add of equation * equation
  | Mul of equation * equation ;;

(* Renders an equation as the text of a C expression. Both operands of
   every binary node are wrapped in parentheses, so the output never
   depends on operator precedence. *)
let rec c_string_of_equation eq =
  (* Joins two rendered operands around the given separator text. *)
  let wrap_pair lhs separator rhs =
    String.concat ""
      [ "("; c_string_of_equation lhs; separator; c_string_of_equation rhs; ")" ]
  in
  match eq with
  | Const n -> string_of_int n
  | Var name -> name
  | Add (lhs, rhs) -> wrap_pair lhs ") + (" rhs
  | Mul (lhs, rhs) -> wrap_pair lhs ") * (" rhs
;;
Here you produce a string, and after that you can write that string wherever you want.
I changed your expression type a bit to make it more general.
The resulting string will contain more parentheses than necessary, but that does not matter because the generated code is targeted at a compiler, not at humans.
You could use a Buffer.
As the documentation of the module says:
This module implements buffers that automatically expand as necessary. It provides accumulative concatenation of strings in quasi-linear time (instead of quadratic time when strings are concatenated pairwise).
For example, you can write :
(* A single binary arithmetic operation on two integer operands.
   Variant types are introduced with the type keyword, not let. *)
type equation =
  | Add of int * int
  | Sub of int * int
  | Mul of int * int
  | Div of int * int;;
(* Output channel for the generated code. filename must be bound by the
   surrounding program before this point. *)
let co = open_out filename

(* Shared accumulation buffer; the initial size is only a hint, because
   the buffer grows automatically. *)
let buff = Buffer.create 11235

(* Appends the C text of one operation to buff. Every constructor is
   handled, so the match is exhaustive. *)
let write_code = function
  | Add (x, y) -> Buffer.add_string buff (Printf.sprintf "%d + %d" x y)
  | Sub (x, y) -> Buffer.add_string buff (Printf.sprintf "%d - %d" x y)
  | Mul (x, y) -> Buffer.add_string buff (Printf.sprintf "%d * %d" x y)
  | Div (x, y) -> Buffer.add_string buff (Printf.sprintf "%d / %d" x y)

(* Generates the code for c and writes it to co. The buffer is emptied
   afterwards, so a later call does not write the earlier text again. *)
let write c =
  write_code c;
  Buffer.output_buffer co buff;
  Buffer.clear buff
With
# Buffer.create;;
- : int -> Buffer.t = <fun>
# Buffer.add_string;;
- : Buffer.t -> string -> unit = <fun>
# Buffer.output_buffer;;
- : out_channel -> Buffer.t -> unit = <fun>
Notice that Buffer.add_string writes the string at the end of the buffer ;-)

Represent regular expression as context free grammar

I am hand-writing a parser for a simple regular expression engine.
The engine supports the characters a .. z, alternation (|), closure (*), concatenation, and parentheses.
Here is the CFG I made:
exp = concat factor1
factor1 = "|" exp | e
concat = term factor2
factor2 = concat | e
term = element factor3
factor3 = * | e
element = (exp) | a .. z
which is equal to
S = T X
X = "|" S | E
T = F Y
Y = T | E
F = U Z
Z = *| E
U = (S) | a .. z
For alternation and closure, I can easily handle them by looking ahead and choosing a production based on the token. However, there is no way to handle concatenation by looking ahead, because it is implicit.
I am wondering how I can handle concatenation, or whether there is anything wrong with my grammar.
And this is my OCaml code for parsing:
(* Abstract syntax tree built by the parser. *)
type regex =
| Closure of regex
| Char of char
| Concatenation of regex * regex
| Alternation of regex * regex
(*| Epsilon*)
(* Raised by the parser on malformed input; the string describes the
   problem. *)
exception IllegalExpression of string
(* Tokens consumed by the parser. Alphabet carries the matched character. *)
type token =
| End
| Alphabet of char
| Star
| LParen
| RParen
| Pipe
(* Recursive-descent parser over a token list. Each function returns the
   parsed regex together with the tokens it did not consume.
   NOTE(review): lookahead is defined elsewhere; from its use here it
   presumably returns the next token paired with the rest of the list. *)

(* S: a T, optionally followed by Pipe and another S (alternation). When
   the next token is not Pipe, the T is returned and nothing further is
   consumed. *)
let rec parse_S (l : token list) : (regex * token list) =
let (a1, l1) = parse_T l in
let (t, rest) = lookahead l1 in
match t with
| Pipe ->
let (a2, l2) = parse_S rest in
(Alternation (a1, a2), l2)
| _ -> (a1, l1)
(* T: an F followed by more input, combined as a Concatenation.
   NOTE(review): every branch builds a Concatenation, so there is no case
   that returns the single F unchanged. When the token after the F is
   neither Alphabet nor LParen, the last branch calls parse_T again on the
   same tokens, which reaches the "Unknown token" case of parse_U unless
   lookahead behaves differently from what is assumed above.
   NOTE(review): the Alphabet branch consumes exactly one character and
   returns, so a Star or further characters after it are not handled
   here. *)
and parse_T (l : token list) : (regex * token list) =
let (a1, l1) = parse_F l in
let (t, rest) = lookahead l1 in
match t with
| Alphabet c -> (Concatenation (a1, Char c), rest)
| LParen ->
(let (a, l1) = parse_S rest in
let (t1, l2) = lookahead l1 in
match t1 with
| RParen -> (Concatenation (a1, a), l2)
| _ -> raise (IllegalExpression "Unbalanced parentheses"))
| _ ->
let (a2, rest) = parse_T l1 in
(Concatenation (a1, a2), rest)
(* F: a U, wrapped in Closure when it is immediately followed by Star. *)
and parse_F (l : token list) : (regex * token list) =
let (a1, l1) = parse_U l in
let (t, rest) = lookahead l1 in
match t with
| Star -> (Closure a1, rest)
| _ -> (a1, l1)
(* U: a single Alphabet character, or an S enclosed in LParen ... RParen.
   Raises IllegalExpression when the closing RParen is missing or when the
   next token can start neither form. *)
and parse_U (l : token list) : (regex * token list) =
let (t, rest) = lookahead l in
match t with
| Alphabet c -> (Char c, rest)
| LParen ->
(let (a, l1) = parse_S rest in
let (t1, l2) = lookahead l1 in
match t1 with
| RParen -> (a, l2)
| _ -> raise (IllegalExpression "Unbalanced parentheses"))
| _ -> raise (IllegalExpression "Unknown token")
For an LL grammar, the FIRST set of a rule is the set of tokens that are allowed as the first token of that rule. You can construct these sets iteratively until you reach a fixed point:
1. A rule starting with a token has that token in its FIRST set.
2. A rule starting with a term has the FIRST set of that term in its FIRST set.
3. A rule T = A | B has the union of FIRST(A) and FIRST(B) as its FIRST set.
Start with step 1 and then repeat steps 2 and 3 until the FIRST sets reach a fixed point (don't change). Now you have the true FIRST sets for your grammar and can decide every rule using the lookahead.
Note: In your code the parse_T function doesn't match the FIRST(T) set. Consider, for example, 'a|b': it enters parse_T and the 'a' is matched by the parse_F call. The lookahead is then '|', which matches epsilon in your grammar but not in your code.