Skip to content

Instantly share code, notes, and snippets.

@lindig
Created September 9, 2015 16:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save lindig/e86111675025819cee43 to your computer and use it in GitHub Desktop.
Save lindig/e86111675025819cee43 to your computer and use it in GitHub Desktop.
Some Scanning Recipes for OCamlLex
{
(* short names for important modules *)
module L = Lexing
module B = Buffer
(* Tokens returned by the [token] rule of this scanner. *)
type token =
| STR of string (* string literal; payload is the text between the quotes *)
| INT of int (* run of decimal digits, converted with int_of_string *)
| ID of string (* identifier: a letter followed by letters or digits *)
| PLUSEQ (* the operator [+=] *)
| MINUSEQ (* the operator [-=] *)
| STAREQ (* the operator [*=] *)
| SLASHEQ (* the operator [/=] *)
| PLUS (* the operator [+] *)
| MINUS (* the operator [-] *)
| STAR (* the operator [*] *)
| SLASH (* the operator [/] *)
| ASSIGN (* the operator [:=] *)
| EOF (* end of input *)
(* [get lexbuf] is the text matched by the most recent lexer clause. *)
let get lexbuf = L.lexeme lexbuf
(* local shorthand for the standard string formatter *)
let sprintf fmt = Printf.sprintf fmt
(* [position lexbuf] renders the current position of [lexbuf] as
   file:line:column, where column is the distance of the current
   character from the beginning of its line. *)
let position lexbuf =
  let { L.pos_fname; pos_lnum; pos_cnum; pos_bol } = lexbuf.L.lex_curr_p in
  let column = pos_cnum - pos_bol in
  sprintf "%s:%d:%d" pos_fname pos_lnum column
(* [set_filename fname lexbuf] records [fname] as the file name of the
   current position of [lexbuf] and returns that same, updated [lexbuf]
   so the call can be chained. *)
let set_filename (fname:string) (lexbuf:L.lexbuf) =
  let here = lexbuf.L.lex_curr_p in
  lexbuf.L.lex_curr_p <- { here with L.pos_fname = fname };
  lexbuf
(* Raised by the scanner on input it cannot handle. The payload is the
   position reported by [position], a space, and a description. *)
exception Error of string

(* [error lexbuf fmt ...] formats a message printf-style, prefixes it with
   the current position of [lexbuf] and raises [Error]. It never returns
   normally, so it can be used in a lexer action of any result type.
   Uses Printf.ksprintf; Printf.kprintf is a deprecated synonym for it. *)
let error lexbuf fmt =
  Printf.ksprintf
    (fun msg -> raise (Error (position lexbuf ^ " " ^ msg)))
    fmt
}
(* named regular expressions shared by the rules below *)
let ws    = ' ' | '\t'               (* horizontal white space *)
let nl    = '\n'                     (* line terminator *)
let digit = ['0'-'9']
let alpha = ['a'-'z'] | ['A'-'Z']
let id    = alpha (alpha | digit)*   (* a letter, then letters or digits *)
(* [token lexbuf] returns the next token of the input. White space and
   newlines are skipped; each newline also advances the line counter of
   [lexbuf]. An opening double quote hands over to the [string] rule.
   Raises [Error] on a character that cannot start a token and on an
   integer literal that does not fit into an int. *)
rule token = parse
| ws+      { token lexbuf }
| nl       { L.new_line lexbuf; token lexbuf }
| digit+   { let digits = get lexbuf in
             (* int_of_string raises Failure when the literal is too large
                for an int; report it as a scanner error with a position
                instead of letting the Failure escape *)
             try INT (int_of_string digits)
             with Failure _ ->
               error lexbuf "integer literal '%s' is out of range" digits }
| id       { ID (get lexbuf) }
| '+'      { PLUS }
| '-'      { MINUS }
| '*'      { STAR }
| '/'      { SLASH }
| "+="     { PLUSEQ }
| "-="     { MINUSEQ }
| "*="     { STAREQ }
| "/="     { SLASHEQ }
| ":="     { ASSIGN }
| '"'      { STR (string (B.create 100) lexbuf) } (* see below *)
| eof      { EOF }
| _        { error lexbuf
               "found '%s' - don't know how to handle" @@ get lexbuf }
(* [escape b lexbuf] copies the input of [lexbuf] into buffer [b],
   replacing ampersand, double quote, apostrophe, greater-than and
   less-than by the corresponding XML entity. At end of input it returns
   the contents of [b] and clears the buffer. *)
and escape b = parse
| '&'      { B.add_string b "&amp;";  escape b lexbuf }
| '"'      { B.add_string b "&quot;"; escape b lexbuf }
| '\''     { B.add_string b "&apos;"; escape b lexbuf }
| '>'      { B.add_string b "&gt;";   escape b lexbuf }
| '<'      { B.add_string b "&lt;";   escape b lexbuf }
| [^'&' '"' '\'' '>' '<']+
           { B.add_string b (get lexbuf); escape b lexbuf }
| eof      { let result = B.contents b in B.clear b; result }
(* no catch-all clause: the five special characters and the negated
   character class above already match every possible character *)
(* [string buf lexbuf] scans the rest of a string literal after [token]
   has consumed the opening double quote. The contents are accumulated in
   [buf] and returned when the closing double quote is reached. A
   backslash followed by a double quote yields a plain double quote; any
   other backslash is kept as it is. Newlines are allowed inside a string
   and advance the line counter of [lexbuf]. Raises [Error] when the input
   ends before the closing double quote. *)
and string buf = parse (* use buf to build up result *)
| [^'"' '\n' '\\']+
           { B.add_string buf (get lexbuf); string buf lexbuf }
| '\n'     { B.add_string buf (get lexbuf)
           ; L.new_line lexbuf
           ; string buf lexbuf
           }
| '\\' '"' { B.add_char buf '"'; string buf lexbuf }
| '\\'     { B.add_char buf '\\'; string buf lexbuf }
| '"'      { B.contents buf } (* closing quote: return the result *)
| eof      { error lexbuf "end of input inside of a string" }
(* no catch-all clause: every character is matched by one of the clauses
   above, so such a clause could never be selected *)
{
(* [escape str] is [str] with XML special characters replaced by their
   entities. This shadows the lexer rule of the same name with a plain
   string-to-string function; the definition is not recursive, so the
   call below still refers to the rule. *)
let escape str =
  let scratch = B.create 100 in
  escape scratch (L.from_string str)
(* [to_string tok] is a printable name for [tok]. Tokens with a payload
   show it in parentheses; the payload of a string token is XML-escaped. *)
let to_string tok =
  match tok with
  | STR text -> sprintf "STR(%s)" (escape text)
  | INT n -> sprintf "INT(%d)" n
  | ID name -> sprintf "ID(%s)" name
  | PLUS -> "PLUS"
  | MINUS -> "MINUS"
  | STAR -> "STAR"
  | SLASH -> "SLASH"
  | PLUSEQ -> "PLUSEQ"
  | MINUSEQ -> "MINUSEQ"
  | STAREQ -> "STAREQ"
  | SLASHEQ -> "SLASHEQ"
  | ASSIGN -> "ASSIGN"
  | EOF -> "EOF"
(* [main ()] scans standard input to the end and prints the names of all
   tokens, including the final EOF, separated by spaces on one line. *)
let main () =
  let lexbuf = L.from_channel stdin |> set_filename "stdin" in
  (* [seen] holds the printed tokens so far, most recent first *)
  let rec collect seen =
    match token lexbuf with
    | EOF -> List.rev (to_string EOF :: seen)
    | tok -> collect (to_string tok :: seen)
  in
  print_endline (String.concat " " (collect []))

let () = main () (* call main function on startup *)
}
@liweijian
Copy link

Awesome thanks for your sharing, it's good to read this example.

Just wondering how to use menhir with this gist? From my understanding, we should define the tokens in parser.mly, and use open Parser in the tokenizer.mll file.

In this gist, it seems that the tokens are defined in tokenizer.mll instead. I tried to define a simple parser.mly like this:

%{
open Tokenizer
%}
%start <int option> prog

%%

prog:
  | v = value { Some v }
  | EOF       { None   } ;

value:
  | i = INT                                   { i      }

And there's an error like:

$ dune build
File "bin/parser.mly", line 12, characters 4-7:
Error: EOF is undefined.

@lindig
Copy link
Author

lindig commented Dec 9, 2022

Sorry, I have not used Menhir and can't provide an answer without porting it to Menhir myself. From the documentation, the same trick used here could be available: I am defining the tokens in the header section at the top of the file, which is simply copied into the generated code. Such a header section is available in Menhir as well. The same trick is used at the bottom of the file to define the main function.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment