Created
January 17, 2020 09:01
-
-
Save lascar-pacagi/02fe4e05b97b5fd5d8efa89c9c2ebf33 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(** [explode s] returns the list of the characters of the string [s]. | |
For example, [explode "hello"] is ['h'; 'e'; 'l'; 'l'; 'o']. *) | |
let explode s = | |
List.init (String.length s) (fun i -> String.get s i) | |
let implode l = | |
let res = Bytes.create (List.length l) in | |
let rec imp i = function | |
| [] -> () | |
| c :: l -> Bytes.set res i c; imp (i + 1) l in | |
imp 0 l; | |
Bytes.unsafe_to_string res | |
(** [i -- j] creates a list of the integers [i, i+1, ..., j-1, j]. | |
For example, [1 -- 5] is [1;2;3;4;5]. *) | |
let (--) i j = | |
let rec aux n acc = | |
if n < i then acc else aux (n-1) (n :: acc) | |
in aux j [] | |
(** [CSet] is a character set module. *) | |
module CSet = Set.Make(Char) | |
(** [ascii_set] is a character set containing all the ascii characters. *) | |
let ascii_set = | |
List.init 128 Char.chr | |
|> CSet.of_list | |
module type RegularExpression = sig | |
type regex = | |
| Empty (** Empty represents the regex ø *) | |
| Epsilon (** Epsilon represents the regex ε *) | |
| CharSet of CSet.t (** [CharSet {c1, c2, ..., cn}] represents the regex (c1 | c2 | ... | cn), | |
where each ci is an ascii character. *) | |
| Concatenation of regex * regex (** [Concatenation (r1, r2)] represents the regex (r1r2). *) | |
| Union of regex * regex (** [Union (r1, r2)] represents the regex (r1 | r2). *) | |
| ZeroOrOne of regex (** [ZeroOrOne r] represents the regex (r | epsilon). *) | |
| ZeroOrMore of regex (** [ZeroOrMore r] represents the regex ( r* ). *) | |
| OneOrMore of regex (** [OneOrMore r] represents the regex ( rr* ). *) | |
(** [empty] is the empty regular expression. *) | |
val empty : regex | |
(** [epsilon] is the epsilon regular expression. *) | |
val epsilon : regex | |
(** [singleton c] is the regular expression for the character [c]. *) | |
val singleton : char -> regex | |
(** [charset set] is a regular expression for the set of characters [set]. *) | |
val charset : CSet.t -> regex | |
(** [concat r1 r2] is a regular expression for the concatenation of [r1] and [r2]. *) | |
val concat : regex -> regex -> regex | |
(** [union r1 r2] is a regular expression for the union of [r1] and [r2]. *) | |
val union : regex -> regex -> regex | |
(** [zero_or_one r] is a regular expression for zero or one [r]. *) | |
val zero_or_one : regex -> regex | |
(** [zero_or_more r] is a regular expression for zero or more [r]. *) | |
val zero_or_more : regex -> regex | |
(** [one_or_more r] is a regular expression for at least one [r]. *) | |
val one_or_more : regex -> regex | |
(** [regex_from_string s] returns a [regex] that represents the regular expression [s]. | |
In the following a character c will be denoted by 'c'. We suppose that we do not need to escape | |
a character between simple quotes to avoid confusion. For example, we will write '\' and not '\\'. | |
The different operators in the regular expression are | |
- '?' represents zero or one. | |
- '*' represents zero or more. | |
- '+' represents one or more. | |
- '.' represents any character. | |
- '(' and ')' are used to group regular expressions. | |
- '[' and ']' are explained below. | |
- the special characters ('?', '*', '+', '.', '(', ')', '[', ']') can be escaped with the character '\'. | |
For example, '\' followed by '*' represents the character star and not the operator zero or more. '\' can be escaped too | |
with '\' followed by '\'. Note that in a normal string, in OCaml, to obtain '\' followed by '*', you need to type "\\*". | |
You can also use the quoted string {|\*|} in OCaml. | |
To obtain '\' followed by '\' you need to type "\\\\" or use the quoted string {|\\|}. | |
- "[c1-c2]" represents the range of ascii characters between c1 and c2. For example, "[a-zA-Z]" | |
is the set of lower case and upper case letters. | |
- "[c1c2...cn]" represents the ascii characters c1, c2, ..., cn. For example, "[cofe]" represents | |
the four characters 'c', 'o', 'f' and 'e'. | |
- "[^c1-c2]" represents all the ascii characters except for those between c1 and c2. For example, | |
"[^a-z]" represents all the ascii characters except for the lower case letters. | |
- "[^c1c2...cn]" represents all the ascii characters except for c1, c2, ..., cn. For example, "[^/*]" represents | |
all the ascii characters except for the two characters '/' and '*'. | |
- Inside "[]" we don't escape the special characters except for ']', for example, we have to write "[\\]]" to get ']'. | |
- ø represents the empty regular expression | |
- ε represents epsilon | |
*) | |
val regex_from_string : string -> regex | |
(** [print fmt re] prints the regular expression [re] with the formatter [fmt]. *) | |
val print : Format.formatter -> regex -> unit | |
end | |
module RE : RegularExpression = struct | |
type regex = | |
| Empty | |
| Epsilon | |
| CharSet of CSet.t | |
| Concatenation of regex * regex | |
| Union of regex * regex | |
| ZeroOrOne of regex | |
| ZeroOrMore of regex | |
| OneOrMore of regex | |
let empty = Empty | |
let epsilon = Epsilon | |
let singleton c = CharSet (CSet.singleton c) | |
let charset set = CharSet set | |
let concat re1 re2 = | |
match re1, re2 with | |
| Empty, _ | _, Empty -> | |
Empty | |
| Epsilon, re | re, Epsilon -> | |
re | |
| _ -> | |
Concatenation (re1, re2) | |
let union re1 re2 = | |
match re1, re2 with | |
| Empty, re | re, Empty -> | |
re | |
(* | _ when re1 = re2 -> | |
* re1 *) | |
| _ -> | |
Union (re1, re2) | |
let zero_or_one re = | |
match re with | |
| Empty | |
| Epsilon -> | |
Epsilon | |
| ZeroOrOne _ | |
| ZeroOrMore _ -> | |
re | |
| OneOrMore r -> | |
ZeroOrMore r | |
| _ -> | |
ZeroOrOne re | |
let zero_or_more re = | |
match re with | |
| Empty | |
| Epsilon -> | |
Epsilon | |
| ZeroOrOne r | |
| ZeroOrMore r | |
| OneOrMore r -> | |
ZeroOrMore r | |
| _ -> | |
ZeroOrMore re | |
let one_or_more re = | |
match re with | |
| Empty -> | |
Empty | |
| Epsilon -> | |
Epsilon | |
| ZeroOrOne r | |
| ZeroOrMore r -> | |
ZeroOrMore r | |
| _ -> | |
OneOrMore re | |
(* let concat re1 re2 = Concatenation (re1, re2) | |
* | |
* let union re1 re2 = Union (re1, re2) | |
* | |
* let zero_or_one re = ZeroOrOne re | |
* | |
* let zero_or_more re = ZeroOrMore re | |
* | |
* let one_or_more re = OneOrMore re *) | |
let regex_from_string s = | |
let rec re l = | |
let e, l = re1 l in | |
match l with | |
| '|' :: r -> | |
let e', l = re r in | |
Union (e, e'), l | |
| _ -> | |
e, l | |
and re1 l = | |
let e, l = re2 l in | |
let e, l = | |
let rec re1' e l = | |
match l with | |
| '?' :: r -> | |
re1' (ZeroOrOne e) r | |
| '*' :: r -> | |
re1' (ZeroOrMore e) r | |
| '+' :: r -> | |
re1' (OneOrMore e) r | |
| _ -> | |
e, l | |
in | |
re1' e l | |
in | |
match l with | |
| c :: _ when c <> ')' && c <> '|' -> | |
let e', l = re1 l in | |
Concatenation (e, e'), l | |
| _ -> | |
e, l | |
and re2 l = | |
match l with | |
| '(' :: r -> | |
begin | |
let e, l = re r in | |
match l with | |
| ')' :: r -> | |
e, r | |
| _ -> | |
failwith "re2: ')' expected in re2" | |
end | |
| '.' :: r -> | |
CharSet ascii_set, r | |
| '\xc3' :: '\x98' :: r -> | |
Empty, r | |
| '\xce' :: '\xb5' :: r -> | |
Epsilon, r | |
| '[' :: '^' :: r -> | |
let set, r = char_set r CSet.empty in | |
CharSet (CSet.diff ascii_set set), r | |
| '[' :: r -> | |
let set, r = char_set r CSet.empty in | |
CharSet set, r | |
| '\\' :: ('.' | '*' | '+' | '?' | '\\' | '(' | ')' | '[' | ']' as c) :: r -> | |
CharSet (CSet.singleton c), r | |
| c :: r -> | |
CharSet (CSet.singleton c), r | |
| [] -> | |
failwith "char_set: character expected" | |
and char_set l acc = | |
match l with | |
| ']' :: r -> | |
if CSet.is_empty acc then | |
failwith "char_set: empty"; | |
acc, r | |
| c1 :: '-' :: c2 :: r -> | |
chars_between c1 c2 | |
|> CSet.union acc | |
|> char_set r | |
| '\\' :: (']' as c) :: r | |
| '\\' :: ('\\' as c) :: r | |
| c :: r -> | |
CSet.add c acc | |
|> char_set r | |
| [] -> | |
failwith "char_set: character expected" | |
and chars_between c1 c2 = | |
(Char.code c1) -- (Char.code c2) | |
|> List.map Char.chr | |
|> CSet.of_list | |
in | |
let e, l = re (explode s) in | |
if l <> [] then | |
failwith "regexp_from_string : not a valid regexp" | |
else | |
e | |
let print ppf re = | |
let range_from_cset cset = | |
let rec aux (a, b) acc = function | |
| [] -> | |
(a, b) :: acc | |
| c :: r -> | |
if c = b + 1 then | |
aux (a, c) acc r | |
else | |
aux (c, c) ((a, b) :: acc) r | |
in | |
let elts = | |
CSet.elements cset | |
|> List.map Char.code | |
in | |
let first_code = List.hd elts in | |
aux (first_code, first_code) [] (List.tl elts) | |
|> List.map (fun (a, b) -> (Char.chr a, Char.chr b)) | |
in | |
let char_to_string_escaped c = | |
String.make 1 c | |
|> String.escaped | |
in | |
let open Format in | |
let rec print_range_list ppf = function | |
| [] -> | |
() | |
| (a, b) :: r -> | |
(if a = b then | |
fprintf ppf "%s%a" (char_to_string_escaped a) | |
else | |
fprintf ppf "[%s-%s]%a" | |
(char_to_string_escaped a) | |
(char_to_string_escaped b)) | |
print_range_list r | |
in | |
let rec print_alternative ppf = function | |
| Union(re1, re2) -> | |
fprintf ppf "%a\xe2\x94\x82%a" | |
print_alternative re1 | |
print_alternative re2 | |
| re -> | |
print_concatenation ppf re | |
and print_concatenation ppf = function | |
| Concatenation (re1, re2) -> | |
fprintf ppf "%a%a" | |
print_concatenation re1 | |
print_concatenation re2 | |
| re -> | |
print_unary_operators ppf re | |
and print_unary_operators ppf = function | |
| ZeroOrOne re -> | |
fprintf ppf "%a\xef\xbc\x9f" | |
print_unary_operators re | |
| ZeroOrMore re -> | |
fprintf ppf "%a\xe2\xad\x91" | |
print_unary_operators re | |
| OneOrMore re -> | |
fprintf ppf "%a\xe2\x81\xba" | |
print_unary_operators re | |
| re -> | |
print_basics ppf re | |
and print_basics ppf = function | |
| Empty -> | |
fprintf ppf "\xc3\x98" | |
| Epsilon -> | |
fprintf ppf "\xce\xb5" | |
| CharSet set -> | |
fprintf ppf "%a" | |
print_range_list (range_from_cset set) | |
| re -> | |
fprintf ppf "\xe2\x9f\xae%a\xe2\x9f\xaf" | |
print_regex re | |
and print_regex ppf re = | |
print_alternative ppf re | |
in | |
fprintf ppf "%a@." | |
print_regex re | |
end | |
type graph = RE.regex array array | |
let all_except_aa_bb () = | |
let a = RE.singleton 'a' in | |
let b = RE.singleton 'b' in | |
[| (* 0 1 2 3 4 *) | |
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; |]; | |
(*1*) [| RE.empty; RE.empty; a; b; RE.epsilon; |]; | |
(*2*) [| RE.empty; RE.empty; RE.empty; b; RE.epsilon; |]; | |
(*3*) [| RE.empty; RE.empty; a; RE.empty; RE.epsilon; |]; | |
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
|] | |
let odd_a_even_b () = | |
let a = RE.singleton 'a' in | |
let b = RE.singleton 'b' in | |
[| (* 0 1 2 3 4 5 *) | |
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
(*1*) [| RE.empty; RE.empty; a; RE.empty; b; RE.empty; |]; | |
(*2*) [| RE.empty; a; RE.empty; b; RE.empty; RE.epsilon; |]; | |
(*3*) [| RE.empty; RE.empty; b; RE.empty; a; RE.empty; |]; | |
(*4*) [| RE.empty; b; RE.empty; a; RE.empty; RE.empty; |]; | |
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
|] | |
let any_aba_any () = | |
let a = RE.singleton 'a' in | |
let b = RE.singleton 'b' in | |
[| (* 0 1 2 3 4 5 *) | |
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
(*1*) [| RE.empty; RE.union a b; a; RE.empty; RE.empty; RE.empty; |]; | |
(*2*) [| RE.empty; RE.empty; RE.empty; b; RE.empty; RE.empty; |]; | |
(*3*) [| RE.empty; RE.empty; RE.empty; RE.empty; a; RE.empty; |]; | |
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.union a b; RE.epsilon; |]; | |
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
|] | |
let running_example () = | |
let a = RE.singleton 'a' in | |
let b = RE.singleton 'b' in | |
let c = RE.singleton 'c' in | |
[| (* 0 1 2 3 4 5 *) | |
(*0*) [| RE.empty; RE.epsilon; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
(*1*) [| RE.empty; a; b; c; RE.empty; RE.empty; |]; | |
(*2*) [| RE.empty; a; b; RE.empty; c; RE.empty; |]; | |
(*3*) [| RE.empty; RE.empty; RE.empty; RE.union a b; c; RE.epsilon; |]; | |
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; b; RE.epsilon; |]; | |
(*5*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
|] | |
let comments () = | |
let a = RE.singleton 'a' in | |
let b = RE.singleton 'b' in | |
let star = RE.singleton '*' in | |
let slash = RE.singleton '/' in | |
let a_b = RE.union a b in | |
let a_b_slash = RE.union a_b slash in | |
[| (* 0 1 2 3 4 *) | |
(*0*) [| RE.empty; slash; RE.empty; RE.empty; RE.empty; |]; | |
(*1*) [| RE.empty; RE.empty; star; RE.empty; RE.empty; |]; | |
(*2*) [| RE.empty; RE.empty; a_b_slash; star; RE.empty; |]; | |
(*3*) [| RE.empty; RE.empty; a_b; star; slash; |]; | |
(*4*) [| RE.empty; RE.empty; RE.empty; RE.empty; RE.empty; |]; | |
|] | |
let concat_list l = | |
List.fold_left (fun acc re -> RE.concat acc re) RE.epsilon l | |
let floyd_warshall (g : graph) : RE.regex = | |
let n = Array.length g in | |
for k = 1 to n - 2 do | |
for i = 0 to n - 1 do | |
if i = 0 || i > k then | |
for j = 0 to n - 1 do | |
if j = 0 || j > k then | |
let re1 = g.(i).(k) in | |
let re2 = g.(k).(j) in | |
let re = | |
concat_list [re1; (RE.zero_or_more g.(k).(k)); re2] | |
|> RE.union g.(i).(j) | |
in | |
g.(i).(j) <- re | |
done | |
done | |
done; | |
g.(0).(n - 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment