Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Implementation of DFA-based regexp matching using Antimirov derviatives
type re = C of char | Nil | Seq of re * re | Bot | Alt of re * re | Star of re
let rec null = function
| C _ | Bot -> false
| Nil | Star _ -> true
| Alt(r1, r2) -> null r1 || null r2
| Seq(r1, r2) -> null r1 && null r2
module R = Set.Make(struct type t = re let compare = compare end)
let rmap f rs = R.fold (fun r -> R.add (f r)) rs R.empty
module M = Map.Make(R)
module I = Set.Make(struct type t = int let compare = compare end)
let rec aderiv c = function
| C c' when c = c' -> R.singleton Nil
| C _ | Nil | Bot -> R.empty
| Alt(r, r') -> R.union (aderiv c r) (aderiv c r')
| Seq(r1, r2) -> R.union (rmap (fun r1' -> Seq(r1', r2)) (aderiv c r1))
(if null r1 then aderiv c r2 else R.empty)
| Star r -> rmap (fun r' -> Seq(r', Star r)) (aderiv c r)
let deriv c rs = R.fold (fun r acc -> R.union (aderiv c r) acc) rs R.empty
type dfa = {size : int; fail : int; trans : (int * char * int) list; final : int list}
let rec enum f v i max = if i < max then enum f (f i v) (i+1) max else v
let charfold f init = enum (fun i -> f (Char.chr i)) init 0 256
let dfa r =
let find rs (n,m) = try M.find rs m, (n,m) with _ -> n, (n+1, M.add rs n m) in
let rec loop s v t f rs =
let (x, s) = find rs s in
if I.mem x v then (s, v, t, f)
else charfold (fun c (s, v, t, f) ->
let rs' = deriv c rs in
let (y, s) = find rs' s in
loop s v ((x,c,y) :: t) f rs')
(s, I.add x v, t, if R.exists null rs then x :: f else f) in
let (s, v, t, f) = loop (0, M.empty) I.empty [] [] (R.singleton r) in
let (fail, (n, m)) = find R.empty s in
{ size = n; fail = fail; trans= t; final = f }
type table = { m : int array array; accept : bool array; error : int }
let table d =
{ error = d.fail;
accept = Array.init d.size (fun i -> List.mem i d.final);
m = (let a = Array.init d.size (fun _ -> Array.make 256 0) in
List.iter (fun (x, c, y) -> a.(x).(Char.code c) <- y) d.trans; a) }
let rec matches' t s i x =
if i < String.length s && x != t.error
then matches' t s (i+1) t.m.(x).(Char.code s.[i])
else t.accept.(x)
let re_match t s = matches' t s 0 0
let charset s = enum (fun i r -> Alt(C s.[i], r)) Bot 0 (String.length s)
let string s = enum (fun i r -> Seq(r, C s.[i])) Nil 0 (String.length s)
let seq rs = List.fold_right (fun r rs -> Seq(r, rs)) rs Nil
let alt rs = List.fold_right (fun r rs -> Alt(r, rs)) rs Bot
let opt r = Alt(r, Nil)
let star r = Star r
let plus r = Seq(r, star r)
let print_table out t =
Array.iteri (fun x row ->
Array.iteri (fun c y ->
if x != t.error && y != t.error then
(Format.fprintf out "%d '%c' --> %d " x (Char.chr c) y;
(if t.accept.(y) then Format.fprintf out "*");
Format.fprintf out "\n"))
row)
t.m
module Test = struct
let digit = charset "0123456789"
let sign = charset "+-"
let dot = C '.'
let dotted = alt [ seq [star digit; dot; plus digit];
seq [plus digit; dot; star digit] ]
let exponent = seq [charset "eE"; opt sign; plus digit]
let float = alt [seq [opt sign; dotted; opt exponent];
seq [opt sign; plus digit; exponent] ]
let t_float = table (dfa float)
end
@neel-krishnaswami

This comment has been minimized.

Copy link
Owner Author

neel-krishnaswami commented Nov 7, 2013

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.