lascar-pacagi/dfa2.ml

## dfa2.ml
module DFA2 : Matching = struct
  type t1 = (int * state) ref
  and state =
    | Split of t1 * t1
    | CharSet of CSet.t * t1
    | Match

  module S =
    Set.Make(
        struct
          type t = t1
          let compare { contents = (i1, _) } { contents = (i2, _) } =
            compare i1 i2
        end)

  (* [set] represents a state of the DFA (a set of NFA states).
     [transitions] are the memoized transitions from this state. *)
  type t2 = { transitions : (char, t2) Hashtbl.t; set : S.t }

  module Memo = Map.Make(S)

  (* The first element of the pair [t] remembers the states of the DFA that we have already met so that
     we can get the associated t2 value when we get this state from another transition in the automaton.
     The second element of the pair [t] is the starting state of the DFA with its memoized transitions. *)
  type t = t2 Memo.t ref * t2

  let make_t1 =
    let counter = ref 1 in
    (fun s ->
      if s = Match then
        ref (0, s)
      else
        begin
          let i = !counter in
          incr counter;
          ref (i, s)
        end)

  let rec follow ({ contents = (_, s) } as t1) next =
    if S.mem t1 next then
      next
    else
      let next = S.add t1 next in
      match s with
      | Split (t1a, t1b) ->
         follow t1a next
         |> follow t1b

      | _ ->
         next

  let new_hashtbl () = Hashtbl.create 11

  let init re =
    let rec init re next =
      match re with
      | RE.CharSet set ->
         make_t1 (CharSet (set, next))

      | RE.Concatenation (re1, re2) ->
         init re2 next
         |> init re1

      | RE.Union (re1, re2) ->
         make_t1 (Split (init re1 next, init re2 next))

      | RE.ZeroOrOne re ->
         make_t1 (Split (init re next, next))

      | RE.ZeroOrMore re ->
         let t1 = make_t1 Match in
         let res = make_t1 (Split (init re t1, next)) in
         t1 := !res;
         res

      | RE.OneOrMore re ->
         let t1 = make_t1 Match in
         let res = init re t1 in
         t1 := !(make_t1 (Split (res, next)));
         res
    in
    (* [set] is the starting state of the DFA. *)
    let set =
      let nfa_start = init re (make_t1 Match) in
      follow nfa_start S.empty
    in
    (* [t2] puts together the starting state with the memoized transitions from this state (initially empty). *)
    let t2 = { transitions = new_hashtbl (); set } in
    (* We memoize the association [set] --> [t2] to be able to get this [t2] if we reach [set] from a transition
       during the construction. *)
    ref (Memo.singleton set t2), t2

  let full_match (memo, t2) s =
    let hit = ref 0 in
    let miss = ref 0 in
    let step { transitions; set } c =
      try
        (* We first test if this transition has already been seen. *)
        let res = Hashtbl.find transitions c in
        incr hit;
        res
      with Not_found ->
        incr miss;
        (* If this is not the case, we have to build the new state. *)
        let set' =
          S.fold
            (fun { contents = (_, s) } next ->
              match s with
              | CharSet (chars, t1) when CSet.mem c chars ->
                 follow t1 next

              | _ ->
                 next
            ) set S.empty
        in
        let res =
          try
            (* We try to find [set'] in [memo] to get the associated [t2] if we already found this state from another
               transition. *)
            Memo.find set' !memo
          with Not_found ->
            (* If not, we need to build a new [t2]. *)
            let t2 = { transitions = new_hashtbl (); set = set' } in
            (* We then remember the association [set'] --> [t2] in [memo] in case we get to [set'] afterward from another
               transition. *)
            memo := Memo.add set' t2 !memo;
            t2
        in
        (* We remember the new transition [set] --[c]--> [set'] in the DFA. *)
        Hashtbl.add transitions c res;
        res
    in
    let rec full_match l t2 =
      match l with
      | [] ->
         S.mem (make_t1 Match) t2.set

      | c :: r ->
         step t2 c
         |> full_match r
    in
    let res = full_match (explode s) t2 in
    Printf.printf "states: %d, hits: %d, misses: %d\n" (Memo.cardinal !memo) !hit !miss;
    res

end
	module DFA2 : Matching = struct
	type t1 = (int * state) ref
	and state =
	\| Split of t1 * t1
	\| CharSet of CSet.t * t1
	\| Match

	module S =
	Set.Make(
	struct
	type t = t1
	let compare { contents = (i1, _) } { contents = (i2, _) } =
	compare i1 i2
	end)

	(* [set] represents a state of the DFA (a set of NFA states).
	[transitions] are the memoized transitions from this state. *)
	type t2 = { transitions : (char, t2) Hashtbl.t; set : S.t }

	module Memo = Map.Make(S)

	(* The first element of the pair [t] remembers the states of the DFA that we have already met so that
	we can get the associated t2 value when we get this state from another transition in the automaton.
	The second element of the pair [t] is the starting state of the DFA with its memoized transitions. *)
	type t = t2 Memo.t ref * t2

	let make_t1 =
	let counter = ref 1 in
	(fun s ->
	if s = Match then
	ref (0, s)
	else
	begin
	let i = !counter in
	incr counter;
	ref (i, s)
	end)

	let rec follow ({ contents = (_, s) } as t1) next =
	if S.mem t1 next then
	next
	else
	let next = S.add t1 next in
	match s with
	\| Split (t1a, t1b) ->
	follow t1a next
	\|> follow t1b

	\| _ ->
	next

	let new_hashtbl () = Hashtbl.create 11

	let init re =
	let rec init re next =
	match re with
	\| RE.CharSet set ->
	make_t1 (CharSet (set, next))

	\| RE.Concatenation (re1, re2) ->
	init re2 next
	\|> init re1

	\| RE.Union (re1, re2) ->
	make_t1 (Split (init re1 next, init re2 next))

	\| RE.ZeroOrOne re ->
	make_t1 (Split (init re next, next))

	\| RE.ZeroOrMore re ->
	let t1 = make_t1 Match in
	let res = make_t1 (Split (init re t1, next)) in
	t1 := !res;
	res

	\| RE.OneOrMore re ->
	let t1 = make_t1 Match in
	let res = init re t1 in
	t1 := !(make_t1 (Split (res, next)));
	res
	in
	(* [set] is the starting state of the DFA. *)
	let set =
	let nfa_start = init re (make_t1 Match) in
	follow nfa_start S.empty
	in
	(* [t2] puts together the starting state with the memoized transitions from this state (initially empty). *)
	let t2 = { transitions = new_hashtbl (); set } in
	(* We memoize the association [set] --> [t2] to be able to get this [t2] if we reach [set] from a transition
	during the construction. *)
	ref (Memo.singleton set t2), t2

	let full_match (memo, t2) s =
	let hit = ref 0 in
	let miss = ref 0 in
	let step { transitions; set } c =
	try
	(* We first test if this transition has already been seen. *)
	let res = Hashtbl.find transitions c in
	incr hit;
	res
	with Not_found ->
	incr miss;
	(* If this is not the case, we have to build the new state. *)
	let set' =
	S.fold
	(fun { contents = (_, s) } next ->
	match s with
	\| CharSet (chars, t1) when CSet.mem c chars ->
	follow t1 next

	\| _ ->
	next
	) set S.empty
	in
	let res =
	try
	(* We try to find [set'] in [memo] to get the associated [t2] if we already found this state from another
	transition. *)
	Memo.find set' !memo
	with Not_found ->
	(* If not, we need to build a new [t2]. *)
	let t2 = { transitions = new_hashtbl (); set = set' } in
	(* We then remember the association [set'] --> [t2] in [memo] in case we get to [set'] afterward from another
	transition. *)
	memo := Memo.add set' t2 !memo;
	t2
	in
	(* We remember the new transition [set] --[c]--> [set'] in the DFA. *)
	Hashtbl.add transitions c res;
	res
	in
	let rec full_match l t2 =
	match l with
	\| [] ->
	S.mem (make_t1 Match) t2.set

	\| c :: r ->
	step t2 c
	\|> full_match r
	in
	let res = full_match (explode s) t2 in
	Printf.printf "states: %d, hits: %d, misses: %d\n" (Memo.cardinal !memo) !hit !miss;
	res

	end