Last active
April 23, 2021 19:55
-
-
Save cloudRoutine/7725ab333338bed71e6a to your computer and use it in GitHub Desktop.
String Search Algorithms ( Boyer-Moore & Knuth-Morris-Pratt )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module StringMatching = | |
open System | |
/// Knuth-Morris-Pratt String Searching Algorithm /// | |
let kmp_search (text:string) (word:string) : int = | |
let kmp_table (word:string) = | |
let table = [| 1..word.Length |] | |
table.[0] <- -1 | |
table.[1] <- 0 | |
let rec build pos cnd = | |
match word.[pos-1] with | |
| x when pos = word.Length -> table | |
| x when x = word.[cnd] -> table.[pos] <- cnd + 1 | |
build (pos+1) (cnd+1) | |
| x when cnd > 0 -> build pos (table.[cnd]) | |
| x -> table.[pos] <- 0 | |
build (pos+1) cnd | |
build 2 0 | |
let table = kmp_table word | |
let rec traverse pos ind = | |
match word.[ind] with | |
| x when (ind + pos) > text.Length -> -1 | |
| x when x = text.[pos+ind] | |
&& ind = word.Length - 1 -> pos | |
| x when x = text.[pos+ind] -> traverse pos (ind+1) | |
| x -> let pos' = pos + ind + table.[ind] | |
let ind' = if table.[ind] > -1 | |
then table.[ind] | |
else 0 | |
traverse pos' ind' | |
traverse 0 0 | |
/// #region Boyer Moore String Search Algorithm /// | |
/// Returns the index of the given character in the English alphabet counting from 0 | |
let alphabet_index (ch:char) = | |
let chlow = Char.ToLower( ch) | |
let charnum = Convert.ToInt32 chlow | |
charnum - 97 // 'a' is ASCII character 97 | |
/// Returns the length of the match of the substrings of str | |
/// beginning at index1 and index2 | |
let match_length (str:string) (index1:int) (index2:int) = | |
if index1 = index2 | |
then str.Length - index1 | |
else let rec accumulate_matches acc ind1 ind2 = | |
match ind1, ind2 with | |
| ind1, ind2 when ind1 < str.Length | |
&& ind2 < str.Length | |
&& str.[ind1] = str.[ind2] | |
-> accumulate_matches (acc+1) (ind1+1)(ind2+1) | |
| ind1, ind2 -> acc | |
accumulate_matches 0 index1 index2 | |
/// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring | |
/// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time, | |
/// where n is the length of S. | |
let fundamental_preprocess (str:string) : int [] = | |
match str.Length with | |
| 0 -> [||] // Handles the case of empty string | |
| 1 -> [|1|] // Handles the case of single character string | |
| _ -> let z = [| for x in str -> 0 |] | |
z.[0] <- str.Length | |
z.[1] <- match_length str 0 1 | |
for i in (2)..(z.[1]) do | |
z.[i] <- z.[1] - i + 1 | |
let rec preprocess l r range = | |
match range with | |
| i::tl when i <= r -> let k = i - l | |
let b = z.[k] | |
let a = r - i + 1 | |
if b < a | |
then z.[i] <- b | |
preprocess l r tl | |
else z.[i] <- b + ( match_length str a (r+1)) | |
preprocess i (i+z.[i]-1) tl | |
| i::tl -> z.[i] <- match_length str 0 i | |
if z.[i] > 0 | |
then preprocess i (i+z.[i]-1) tl | |
else preprocess l r tl | |
| [] -> z | |
preprocess 0 0 [ (2+z.[1])..(str.Length-1) ] | |
/// Generates R for S, which is an array indexed by the position of some | |
/// character c in the English alphabet. At that index in R is an array | |
/// of length |S|+1, specifying for each index i in S (plus the index after S) | |
/// the next location of character c encountered when traversing S from | |
/// right to left starting at i. | |
let bad_character_table (str:string) = | |
if str.Length = 0 | |
then [| for a in 0..25 -> [||] |] | |
else let R = [| for a in 0..25 -> [|-1|] |] | |
let alpha = [| for a in 0..25 -> -1 |] | |
str.ToCharArray() |> Array.iteri | |
( fun i c | |
-> alpha.[ alphabet_index c ] <- i | |
alpha |> Array.iteri | |
( fun j a -> R.[j] <- Array.append R.[j] [|a|]) ) | |
R | |
/// Generates L for S, an array used in the implementation of the strong | |
/// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i) | |
/// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring | |
/// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P | |
/// relative to T such that no instances of P in T are skipped and a suffix | |
/// of P[:L[ i]] matches the substring of T matched by a suffix of P in the | |
/// previous match attempt. Specifically, if the mismatch took place at position | |
/// i-1 in P, the shift magnitude is given by the equation P.length - L.[i]. | |
/// In the case that L.[i] = -1, the full shift table is used. | |
/// Since only proper suffixes matter, L.[0] = -1. | |
let good_suffix_table (str:string) = | |
let L = [| for c in str -> -1 |] | |
let N = fundamental_preprocess ( new string(str.ToCharArray() |> Array.rev )) | |
|> Array.rev | |
[| 0..(str.Length-2) |] | |
|> Array.iter( fun j -> let i = str.Length - N.[j] | |
if i <> str.Length | |
then L.[i] <- j ) | |
L | |
/// Generates F for S, an array used in a special case of the good suffix rule in | |
/// the Boyer-Moore string search algorithm. F.[i] is the length of the longest | |
/// suffix of S[i:] that is also a prefix of S. In the cases it is used, the | |
/// shift magnitude of the pattern P relative to the text T is P.length - F[i] | |
/// for a mismatch occurring at i-1 | |
let full_shift_table (str:string) = | |
let F = [| for x in str -> 0 |] | |
let Z = fundamental_preprocess str | |
Z |> Array.rev | |
|> Array.fold | |
( fun ( i:int, longest:int ) zv -> | |
let lg = if zv = i + 1 | |
then max zv longest | |
else longest | |
F.[str.Length - i - 1 ] <- lg | |
(i+1,lg) ) | |
(0,0) |> ignore | |
F | |
/// Implementation of the Boyer-Moore string search algorithm. This finds all | |
/// occurrences of P in T, and incorporates numerous ways of pre-processing | |
/// the pattern to determine the optimal amount to shift the string and skip | |
/// comparisons. In practice it runs in O(m) (and even sublinear) time, where | |
/// m is the length of T. This implementation performs a case-insensitive | |
/// search on ASCII alphabetic characters, spaces not included. | |
let boyer_moore_search (P:string) (T:string) = | |
if P.Length = 0 || T.Length = 0 || T.Length < P.Length | |
then [] else | |
let R = bad_character_table P | |
let L = good_suffix_table P | |
let F = full_shift_table P | |
let rec find_matches k prev_k matches = | |
if k < T.Length | |
then // Matches starting from end of P | |
let rec compare_chars i' h' = | |
match i', h' with | |
| i', h' when i' >= 0 && h' > prev_k && P.[i'] = T.[h'] | |
-> compare_chars (i'-1)(h'-1) | |
| i', h' -> i', h' | |
let i,h = compare_chars (P.Length - 1) k | |
// Match has been found | |
if i = -1 || h = prev_k | |
then let acc = ( k - P.Length + 1 ):: matches | |
let k' = if P.Length > 1 | |
then k + P.Length - F.[1] | |
else 1 | |
find_matches k' prev_k acc | |
// No match, shift by max of bad character and good suffix rules | |
else let char_shift = i - R.[(alphabet_index T.[h])].[i] | |
let suffix_shift = | |
// Mismatch happened on first attempt | |
if i + 1 = P.Length | |
then 1 | |
// Matched suffix does not appear anywhere in P | |
elif L.[i+1] = -1 | |
then P.Length - F.[i+1] | |
// Matched Suffix appears in P | |
else P.Length - L.[i+1] | |
let shift = max char_shift suffix_shift | |
// Galil's rule | |
let prev_k' = if shift >= i + 1 | |
then k | |
else prev_k | |
let k' = k + shift | |
find_matches k' prev_k' matches | |
else matches | |
find_matches (P.Length - 1) (-1) [] | |
// #endregion |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment