Skip to content

Instantly share code, notes, and snippets.

@cloudRoutine
Last active April 23, 2021 19:55
Show Gist options
  • Save cloudRoutine/7725ab333338bed71e6a to your computer and use it in GitHub Desktop.
Save cloudRoutine/7725ab333338bed71e6a to your computer and use it in GitHub Desktop.
String Search Algorithms ( Boyer-Moore & Knuth-Morris-Pratt )
module StringMatching =
open System
/// Knuth-Morris-Pratt String Searching Algorithm ///
let kmp_search (text:string) (word:string) : int =
let kmp_table (word:string) =
let table = [| 1..word.Length |]
table.[0] <- -1
table.[1] <- 0
let rec build pos cnd =
match word.[pos-1] with
| x when pos = word.Length -> table
| x when x = word.[cnd] -> table.[pos] <- cnd + 1
build (pos+1) (cnd+1)
| x when cnd > 0 -> build pos (table.[cnd])
| x -> table.[pos] <- 0
build (pos+1) cnd
build 2 0
let table = kmp_table word
let rec traverse pos ind =
match word.[ind] with
| x when (ind + pos) > text.Length -> -1
| x when x = text.[pos+ind]
&& ind = word.Length - 1 -> pos
| x when x = text.[pos+ind] -> traverse pos (ind+1)
| x -> let pos' = pos + ind + table.[ind]
let ind' = if table.[ind] > -1
then table.[ind]
else 0
traverse pos' ind'
traverse 0 0
/// #region Boyer Moore String Search Algorithm ///
/// Returns the index of the given character in the English alphabet counting from 0
let alphabet_index (ch:char) =
let chlow = Char.ToLower( ch)
let charnum = Convert.ToInt32 chlow
charnum - 97 // 'a' is ASCII character 97
/// Returns the length of the match of the substrings of str
/// beginning at index1 and index2
let match_length (str:string) (index1:int) (index2:int) =
if index1 = index2
then str.Length - index1
else let rec accumulate_matches acc ind1 ind2 =
match ind1, ind2 with
| ind1, ind2 when ind1 < str.Length
&& ind2 < str.Length
&& str.[ind1] = str.[ind2]
-> accumulate_matches (acc+1) (ind1+1)(ind2+1)
| ind1, ind2 -> acc
accumulate_matches 0 index1 index2
/// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring
/// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
/// where n is the length of S.
let fundamental_preprocess (str:string) : int [] =
match str.Length with
| 0 -> [||] // Handles the case of empty string
| 1 -> [|1|] // Handles the case of single character string
| _ -> let z = [| for x in str -> 0 |]
z.[0] <- str.Length
z.[1] <- match_length str 0 1
for i in (2)..(z.[1]) do
z.[i] <- z.[1] - i + 1
let rec preprocess l r range =
match range with
| i::tl when i <= r -> let k = i - l
let b = z.[k]
let a = r - i + 1
if b < a
then z.[i] <- b
preprocess l r tl
else z.[i] <- b + ( match_length str a (r+1))
preprocess i (i+z.[i]-1) tl
| i::tl -> z.[i] <- match_length str 0 i
if z.[i] > 0
then preprocess i (i+z.[i]-1) tl
else preprocess l r tl
| [] -> z
preprocess 0 0 [ (2+z.[1])..(str.Length-1) ]
/// Generates R for S, which is an array indexed by the position of some
/// character c in the English alphabet. At that index in R is an array
/// of length |S|+1, specifying for each index i in S (plus the index after S)
/// the next location of character c encountered when traversing S from
/// right to left starting at i.
let bad_character_table (str:string) =
if str.Length = 0
then [| for a in 0..25 -> [||] |]
else let R = [| for a in 0..25 -> [|-1|] |]
let alpha = [| for a in 0..25 -> -1 |]
str.ToCharArray() |> Array.iteri
( fun i c
-> alpha.[ alphabet_index c ] <- i
alpha |> Array.iteri
( fun j a -> R.[j] <- Array.append R.[j] [|a|]) )
R
/// Generates L for S, an array used in the implementation of the strong
/// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
/// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring
/// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P
/// relative to T such that no instances of P in T are skipped and a suffix
/// of P[:L[ i]] matches the substring of T matched by a suffix of P in the
/// previous match attempt. Specifically, if the mismatch took place at position
/// i-1 in P, the shift magnitude is given by the equation P.length - L.[i].
/// In the case that L.[i] = -1, the full shift table is used.
/// Since only proper suffixes matter, L.[0] = -1.
let good_suffix_table (str:string) =
let L = [| for c in str -> -1 |]
let N = fundamental_preprocess ( new string(str.ToCharArray() |> Array.rev ))
|> Array.rev
[| 0..(str.Length-2) |]
|> Array.iter( fun j -> let i = str.Length - N.[j]
if i <> str.Length
then L.[i] <- j )
L
/// Generates F for S, an array used in a special case of the good suffix rule in
/// the Boyer-Moore string search algorithm. F.[i] is the length of the longest
/// suffix of S[i:] that is also a prefix of S. In the cases it is used, the
/// shift magnitude of the pattern P relative to the text T is P.length - F[i]
/// for a mismatch occurring at i-1
let full_shift_table (str:string) =
let F = [| for x in str -> 0 |]
let Z = fundamental_preprocess str
Z |> Array.rev
|> Array.fold
( fun ( i:int, longest:int ) zv ->
let lg = if zv = i + 1
then max zv longest
else longest
F.[str.Length - i - 1 ] <- lg
(i+1,lg) )
(0,0) |> ignore
F
/// Implementation of the Boyer-Moore string search algorithm. This finds all
/// occurrences of P in T, and incorporates numerous ways of pre-processing
/// the pattern to determine the optimal amount to shift the string and skip
/// comparisons. In practice it runs in O(m) (and even sublinear) time, where
/// m is the length of T. This implementation performs a case-insensitive
/// search on ASCII alphabetic characters, spaces not included.
let boyer_moore_search (P:string) (T:string) =
if P.Length = 0 || T.Length = 0 || T.Length < P.Length
then [] else
let R = bad_character_table P
let L = good_suffix_table P
let F = full_shift_table P
let rec find_matches k prev_k matches =
if k < T.Length
then // Matches starting from end of P
let rec compare_chars i' h' =
match i', h' with
| i', h' when i' >= 0 && h' > prev_k && P.[i'] = T.[h']
-> compare_chars (i'-1)(h'-1)
| i', h' -> i', h'
let i,h = compare_chars (P.Length - 1) k
// Match has been found
if i = -1 || h = prev_k
then let acc = ( k - P.Length + 1 ):: matches
let k' = if P.Length > 1
then k + P.Length - F.[1]
else 1
find_matches k' prev_k acc
// No match, shift by max of bad character and good suffix rules
else let char_shift = i - R.[(alphabet_index T.[h])].[i]
let suffix_shift =
// Mismatch happened on first attempt
if i + 1 = P.Length
then 1
// Matched suffix does not appear anywhere in P
elif L.[i+1] = -1
then P.Length - F.[i+1]
// Matched Suffix appears in P
else P.Length - L.[i+1]
let shift = max char_shift suffix_shift
// Galil's rule
let prev_k' = if shift >= i + 1
then k
else prev_k
let k' = k + shift
find_matches k' prev_k' matches
else matches
find_matches (P.Length - 1) (-1) []
// #endregion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment