Skip to content

Instantly share code, notes, and snippets.

@lascar-pacagi
Created January 17, 2020 09:01
Show Gist options
  • Save lascar-pacagi/02fe4e05b97b5fd5d8efa89c9c2ebf33 to your computer and use it in GitHub Desktop.
Save lascar-pacagi/02fe4e05b97b5fd5d8efa89c9c2ebf33 to your computer and use it in GitHub Desktop.
(** [explode s] returns the list of the characters of the string [s].
For example, [explode "hello"] is ['h'; 'e'; 'l'; 'l'; 'o']. *)
let explode s =
List.init (String.length s) (fun i -> String.get s i)
let implode l =
let res = Bytes.create (List.length l) in
let rec imp i = function
| [] -> ()
| c :: l -> Bytes.set res i c; imp (i + 1) l in
imp 0 l;
Bytes.unsafe_to_string res
(** [i -- j] creates a list of the integers [i, i+1, ..., j-1, j].
For example, [1 -- 5] is [1;2;3;4;5]. *)
let (--) i j =
let rec aux n acc =
if n < i then acc else aux (n-1) (n :: acc)
in aux j []
(** [CSet] is a character set module. *)
module CSet = Set.Make(Char)
(** [ascii_set] is a character set containing all the ascii characters. *)
let ascii_set =
List.init 128 Char.chr
|> CSet.of_list
module type RegularExpression = sig
type regex =
| Empty (** Empty represents the regex ø *)
| Epsilon (** Epsilon represents the regex ε *)
| CharSet of CSet.t (** [CharSet {c1, c2, ..., cn}] represents the regex (c1 | c2 | ... | cn),
where each ci is an ascii character. *)
| Concatenation of regex * regex (** [Concatenation (r1, r2)] represents the regex (r1r2). *)
| Union of regex * regex (** [Union (r1, r2)] represents the regex (r1 | r2). *)
| ZeroOrOne of regex (** [ZeroOrOne r] represents the regex (r | epsilon). *)
| ZeroOrMore of regex (** [ZeroOrMore r] represents the regex ( r* ). *)
| OneOrMore of regex (** [OneOrMore r] represents the regex ( rr* ). *)
(** [empty] is the empty regular expression. *)
val empty : regex
(** [epsilon] is the epsilon regular expression. *)
val epsilon : regex
(** [singleton c] is the regular expression for the character [c]. *)
val singleton : char -> regex
(** [charset set] is a regular expression for the set of characters [set]. *)
val charset : CSet.t -> regex
(** [concat r1 r2] is a regular expression for the concatenation of [r1] and [r2]. *)
val concat : regex -> regex -> regex
(** [union r1 r2] is a regular expression for the union of [r1] and [r2]. *)
val union : regex -> regex -> regex
(** [zero_or_one r] is a regular expression for zero or one [r]. *)
val zero_or_one : regex -> regex
(** [zero_or_more r] is a regular expression for zero or more [r]. *)
val zero_or_more : regex -> regex
(** [one_or_more r] is a regular expression for at least one [r]. *)
val one_or_more : regex -> regex
(** [regex_from_string s] returns a [regex] that represents the regular expression [s].
In the following a character c will be denoted by 'c'. We suppose that we do not need to escape
a character between simple quotes to avoid confusion. For example, we will write '\' and not '\\'.
The different operators in the regular expression are
- '?' represents zero or one.
- '*' represents zero or more.
- '+' represents one or more.
- '.' represents any character.
- '(' and ')' are used to group regular expressions.
- '[' and ']' are explained below.
- the special characters ('?', '*', '+', '.', '(', ')', '[', ']') can be escaped with the character '\'.
For example, '\' followed by '*' represents the character star and not the operator zero or more. '\' can be escaped too
with '\' followed by '\'. Note that in a normal string, in OCaml, to obtain '\' followed by '*', you need to type "\\*".
You can also use the quoted string {|\*|} in OCaml.
To obtain '\' followed by '\' you need to type "\\\\" or use the quoted string {|\\|}.
- "[c1-c2]" represents the range of ascii characters between c1 and c2. For example, "[a-zA-Z]"
is the set of lower case and upper case letters.
- "[c1c2...cn]" represents the ascii characters c1, c2, ..., cn. For example, "[cofe]" represents
the four characters 'c', 'o', 'f' and 'e'.
- "[^c1-c2]" represents all the ascii characters except for those between c1 and c2. For example,
"[^a-z]" represents all the ascii characters except for the lower case letters.
- "[^c1c2...cn]" represents all the ascii characters except for c1, c2, ..., cn. For example, "[^/*]" represents
all the ascii characters except for the two characters '/' and '*'.
- Inside "[]" we don't escape the special characters except for ']', for example, we have to write "[\\]]" to get ']'.
- ø represents the empty regular expression
- ε represents epsilon
*)
val regex_from_string : string -> regex
(** [print fmt re] prints the regular expression [re] with the formatter [fmt]. *)
val print : Format.formatter -> regex -> unit
end
module RE : RegularExpression = struct
type regex =
| Empty
| Epsilon
| CharSet of CSet.t
| Concatenation of regex * regex
| Union of regex * regex
| ZeroOrOne of regex
| ZeroOrMore of regex
| OneOrMore of regex
let empty = Empty
let epsilon = Epsilon
let singleton c = CharSet (CSet.singleton c)
let charset set = CharSet set
let concat re1 re2 =
match re1, re2 with
| Empty, _ | _, Empty ->
Empty
| Epsilon, re | re, Epsilon ->
re
| _ ->
Concatenation (re1, re2)
let union re1 re2 =
match re1, re2 with
| Empty, re | re, Empty ->
re
(* | _ when re1 = re2 ->
* re1 *)
| _ ->
Union (re1, re2)
let zero_or_one re =
match re with
| Empty
| Epsilon ->
Epsilon
| ZeroOrOne _
| ZeroOrMore _ ->
re
| OneOrMore r ->
ZeroOrMore r
| _ ->
ZeroOrOne re
let zero_or_more re =
match re with
| Empty
| Epsilon ->
Epsilon
| ZeroOrOne r
| ZeroOrMore r
| OneOrMore r ->
ZeroOrMore r
| _ ->
ZeroOrMore re
let one_or_more re =
match re with
| Empty ->
Empty
| Epsilon ->
Epsilon
| ZeroOrOne r
| ZeroOrMore r ->
ZeroOrMore r
| _ ->
OneOrMore re
(* let concat re1 re2 = Concatenation (re1, re2)
*
* let union re1 re2 = Union (re1, re2)
*
* let zero_or_one re = ZeroOrOne re
*
* let zero_or_more re = ZeroOrMore re
*
* let one_or_more re = OneOrMore re *)
let regex_from_string s =
let rec re l =
let e, l = re1 l in
match l with
| '|' :: r ->
let e', l = re r in
Union (e, e'), l
| _ ->
e, l
and re1 l =
let e, l = re2 l in
let e, l =
let rec re1' e l =
match l with
| '?' :: r ->
re1' (ZeroOrOne e) r
| '*' :: r ->
re1' (ZeroOrMore e) r
| '+' :: r ->
re1' (OneOrMore e) r
| _ ->
e, l
in
re1' e l
in
match l with
| c :: _ when c <> ')' && c <> '|' ->
let e', l = re1 l in
Concatenation (e, e'), l
| _ ->
e, l
and re2 l =
match l with
| '(' :: r ->
begin
let e, l = re r in
match l with
| ')' :: r ->
e, r
| _ ->
failwith "re2: ')' expected in re2"
end
| '.' :: r ->
CharSet ascii_set, r
| '\xc3' :: '\x98' :: r ->
Empty, r
| '\xce' :: '\xb5' :: r ->
Epsilon, r
| '[' :: '^' :: r ->
let set, r = char_set r CSet.empty in
CharSet (CSet.diff ascii_set set), r
| '[' :: r ->
let set, r = char_set r CSet.empty in
CharSet set, r
| '\\' :: ('.' | '*' | '+' | '?' | '\\' | '(' | ')' | '[' | ']' as c) :: r ->
CharSet (CSet.singleton c), r
| c :: r ->
CharSet (CSet.singleton c), r
| [] ->
failwith "char_set: character expected"
and char_set l acc =
match l with
| ']' :: r ->
if CSet.is_empty acc then
failwith "char_set: empty";
acc, r
| c1 :: '-' :: c2 :: r ->
chars_between c1 c2
|> CSet.union acc
|> char_set r
| '\\' :: (']' as c) :: r
| '\\' :: ('\\' as c) :: r
| c :: r ->
CSet.add c acc
|> char_set r
| [] ->
failwith "char_set: character expected"
and chars_between c1 c2 =
(Char.code c1) -- (Char.code c2)
|> List.map Char.chr
|> CSet.of_list
in
let e, l = re (explode s) in
if l <> [] then
failwith "regexp_from_string : not a valid regexp"
else
e
let print ppf re =
let range_from_cset cset =
let rec aux (a, b) acc = function
| [] ->
(a, b) :: acc
| c :: r ->
if c = b + 1 then
aux (a, c) acc r
else
aux (c, c) ((a, b) :: acc) r
in
let elts =
CSet.elements cset
|> List.map Char.code
in
let first_code = List.hd elts in
aux (first_code, first_code) [] (List.tl elts)
|> List.map (fun (a, b) -> (Char.chr a, Char.chr b))
in
let char_to_string_escaped c =
String.make 1 c
|> String.escaped
in
let open Format in
let rec print_range_list ppf = function
| [] ->
()
| (a, b) :: r ->
(if a = b then
fprintf ppf "%s%a" (char_to_string_escaped a)
else
fprintf ppf "[%s-%s]%a"
(char_to_string_escaped a)
(char_to_string_escaped b))
print_range_list r
in
let rec print_alternative ppf = function
| Union(re1, re2) ->
fprintf ppf "%a\xe2\x94\x82%a"
print_alternative re1
print_alternative re2
| re ->
print_concatenation ppf re
and print_concatenation ppf = function
| Concatenation (re1, re2) ->
fprintf ppf "%a%a"
print_concatenation re1
print_concatenation re2
| re ->
print_unary_operators ppf re
and print_unary_operators ppf = function
| ZeroOrOne re ->
fprintf ppf "%a\xef\xbc\x9f"
print_unary_operators re
| ZeroOrMore re ->
fprintf ppf "%a\xe2\xad\x91"
print_unary_operators re
| OneOrMore re ->
fprintf ppf "%a\xe2\x81\xba"
print_unary_operators re
| re ->
print_basics ppf re
and print_basics ppf = function
| Empty ->
fprintf ppf "\xc3\x98"
| Epsilon ->
fprintf ppf "\xce\xb5"
| CharSet set ->
fprintf ppf "%a"
print_range_list (range_from_cset set)
| re ->
fprintf ppf "\xe2\x9f\xae%a\xe2\x9f\xaf"
print_regex re
and print_regex ppf re =
print_alternative ppf re
in
fprintf ppf "%a@."
print_regex re
end
type graph = RE.regex array array
let all_except_aa_bb () =
let a = RE.singleton 'a' in
let b = RE.singleton 'b' in
[| (* 0 1 2 3 4 *)
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; |];
(*1*) [| RE.empty; RE.empty; a; b; RE.epsilon; |];
(*2*) [| RE.empty; RE.empty; RE.empty; b; RE.epsilon; |];
(*3*) [| RE.empty; RE.empty; a; RE.empty; RE.epsilon; |];
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |];
|]
let odd_a_even_b () =
let a = RE.singleton 'a' in
let b = RE.singleton 'b' in
[| (* 0 1 2 3 4 5 *)
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |];
(*1*) [| RE.empty; RE.empty; a; RE.empty; b; RE.empty; |];
(*2*) [| RE.empty; a; RE.empty; b; RE.empty; RE.epsilon; |];
(*3*) [| RE.empty; RE.empty; b; RE.empty; a; RE.empty; |];
(*4*) [| RE.empty; b; RE.empty; a; RE.empty; RE.empty; |];
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |];
|]
let any_aba_any () =
let a = RE.singleton 'a' in
let b = RE.singleton 'b' in
[| (* 0 1 2 3 4 5 *)
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |];
(*1*) [| RE.empty; RE.union a b; a; RE.empty; RE.empty; RE.empty; |];
(*2*) [| RE.empty; RE.empty; RE.empty; b; RE.empty; RE.empty; |];
(*3*) [| RE.empty; RE.empty; RE.empty; RE.empty; a; RE.empty; |];
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.union a b; RE.epsilon; |];
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |];
|]
let running_example () =
let a = RE.singleton 'a' in
let b = RE.singleton 'b' in
let c = RE.singleton 'c' in
[| (* 0 1 2 3 4 5 *)
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |];
(*1*) [| RE.empty; a; b; c; RE.empty; RE.empty; |];
(*2*) [| RE.empty; a; b; RE.empty; c; RE.empty; |];
(*3*) [| RE.empty; RE.empty; RE.empty; RE.union a b; c; RE.epsilon; |];
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; b; RE.epsilon; |];
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |];
|]
let comments () =
let a = RE.singleton 'a' in
let b = RE.singleton 'b' in
let star = RE.singleton '*' in
let slash = RE.singleton '/' in
let a_b = RE.union a b in
let a_b_slash = RE.union a_b slash in
[| (* 0 1 2 3 4 *)
(*0*) [| RE.empty; slash; RE.empty; RE.empty; RE.empty; |];
(*1*) [| RE.empty; RE.empty; star; RE.empty; RE.empty; |];
(*2*) [| RE.empty; RE.empty; a_b_slash; star; RE.empty; |];
(*3*) [| RE.empty; RE.empty; a_b; star; slash; |];
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |];
|]
let concat_list l =
List.fold_left (fun acc re -> RE.concat acc re) RE.epsilon l
let floyd_warshall (g : graph) : RE.regex =
let n = Array.length g in
for k = 1 to n - 2 do
for i = 0 to n - 1 do
if i = 0 || i > k then
for j = 0 to n - 1 do
if j = 0 || j > k then
let re1 = g.(i).(k) in
let re2 = g.(k).(j) in
let re =
concat_list [re1; (RE.zero_or_more g.(k).(k)); re2]
|> RE.union g.(i).(j)
in
g.(i).(j) <- re
done
done
done;
g.(0).(n - 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment