Created
October 18, 2014 17:00
-
-
Save kandu/6fb5777c92bc0369076c to your computer and use it in GitHub Desktop.
str2regexp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let epsilon= "ϵ" | |
type buffer= { | |
str: string; | |
len: int; | |
pos: int; | |
} | |
let initBuffer str= | |
{ str; | |
len= String.length str; | |
pos= 0; | |
} | |
type token= | |
| OpC | OpO | |
| TkE | TkC of char | TkS of string | |
type regexp= | |
| Ce | |
| C of char (* char *) | |
| A of regexp * regexp (* concatenation *) | |
| O of regexp * regexp (* alternation *) | |
| Cl of regexp (* closure aka Kleene Star *) | |
let getToken buf= | |
let rec paired str count pos= | |
if count > 0 then | |
match str.[pos] with | |
| '(' -> paired str (count+1) (pos+1) | |
| ')' -> paired str (count-1) (pos+1) | |
| _ -> paired str count (pos+1) | |
else pos-1 | |
in | |
match buf.str.[buf.pos] with | |
| '('-> | |
let tail= paired buf.str 1 (buf.pos+1) in | |
let paren= String.sub buf.str (buf.pos+1) (tail-buf.pos-1) | |
in (TkS paren, {buf with pos= tail+1}) | |
| '*'-> (OpC, {buf with pos= buf.pos+1}) | |
| '|'-> (OpO, {buf with pos= buf.pos+1}) | |
| '#'-> (TkE, {buf with pos= buf.pos+1}) | |
| c -> (TkC c, {buf with pos= buf.pos+1}) (* TODO: escape character *) | |
let rec str2regexp buf= | |
let getItem= function | |
| TkE-> Ce | |
| TkC c-> C c | |
| TkS s -> str2regexp @@ initBuffer s | |
| other -> raise @@ Failure | |
(Printf.sprintf "match %s" | |
(match other with OpC -> "OpC" | OpO -> "OpO" | _ -> "Tk")) | |
in | |
let rec parse curr nextbuf= | |
if nextbuf.pos >= nextbuf.len then | |
curr | |
else | |
let nextTk, lastbuf= getToken nextbuf in | |
match nextTk with | |
| OpO -> O (curr, str2regexp lastbuf) | |
| OpC-> let curr= match curr with Cl r-> curr | _ -> Cl curr in | |
parse curr lastbuf | |
| _ -> | |
let next= getItem nextTk in | |
if lastbuf.pos >= lastbuf.len then | |
A (curr, next) | |
else | |
let lastTk, tmpbuf= getToken lastbuf in | |
match lastTk with | |
| OpC-> parse (A (curr, Cl next)) tmpbuf | |
| OpO-> | |
O (A (curr, next), str2regexp tmpbuf) | |
| _ -> parse (A (curr, next)) lastbuf | |
in | |
let curr, buf= getToken buf in | |
parse (getItem curr) buf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment