cloudRoutine/booyer-moore_knuth-morris-pratt.fs

## booyer-moore_knuth-morris-pratt.fs
module StringMatching =
    open System


    /// Knuth-Morris-Pratt String Searching Algorithm  ///

    let kmp_search (text:string) (word:string) : int =
        let kmp_table (word:string) =
            let table   = [| 1..word.Length |]
            table.[0]   <-  -1
            table.[1]   <-   0

            let rec build pos cnd =
                match word.[pos-1] with
                | x when pos = word.Length  ->  table
                | x when x   = word.[cnd]   ->  table.[pos] <- cnd + 1
                                                build (pos+1) (cnd+1)
                | x when cnd > 0            ->  build  pos    (table.[cnd])
                | x                         ->  table.[pos] <- 0
                                                build (pos+1)  cnd
            build 2 0
        let table   = kmp_table word

        let rec traverse pos ind =
            match word.[ind] with
            | x when (ind + pos) > text.Length  ->  -1
            | x when x      =  text.[pos+ind]
                &&   ind    =  word.Length - 1  ->  pos
            | x when x      =  text.[pos+ind]   ->  traverse pos (ind+1)
            | x                                 ->  let pos' =  pos + ind + table.[ind]
                                                    let ind' =  if   table.[ind] > -1
                                                                then table.[ind]
                                                                else 0
                                                    traverse pos' ind'
        traverse 0 0


/// #region  Boyer Moore String Search Algorithm    ///


    /// Returns the index of the given character in the English alphabet counting from 0
    let alphabet_index (ch:char) =
        let chlow   =   Char.ToLower( ch)
        let charnum =   Convert.ToInt32 chlow
        charnum - 97        // 'a' is ASCII character 97


    /// Returns the length of the match of the substrings of str
    /// beginning at index1 and index2
    let match_length (str:string) (index1:int) (index2:int) =
        if      index1 = index2
        then    str.Length - index1
        else    let rec accumulate_matches acc ind1 ind2  =
                    match ind1, ind2 with
                    | ind1, ind2  when  ind1 < str.Length
                                    &&  ind2 < str.Length
                                    &&  str.[ind1] = str.[ind2]
                                    -> accumulate_matches (acc+1) (ind1+1)(ind2+1)
                    | ind1, ind2    -> acc
                accumulate_matches 0 index1 index2


    /// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring
    /// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
    /// where n is the length of S.
    let fundamental_preprocess (str:string) : int [] =
        match str.Length with
        | 0 ->  [||]        //  Handles the case of empty string
        | 1 ->  [|1|]       //  Handles the case of single character string
        | _ ->  let z = [| for x in str -> 0 |]
                z.[0] <- str.Length
                z.[1] <- match_length str 0 1
                for i in (2)..(z.[1]) do
                    z.[i] <- z.[1] - i + 1
                let rec preprocess l r range =
                    match range with
                    | i::tl when i <= r ->  let k = i - l
                                            let b = z.[k]
                                            let a = r - i + 1
                                            if      b < a
                                            then    z.[i] <- b
                                                    preprocess l r tl
                                            else    z.[i] <- b + ( match_length str a (r+1))
                                                    preprocess i (i+z.[i]-1) tl
                    | i::tl         ->  z.[i] <- match_length str 0 i
                                        if      z.[i] > 0
                                        then    preprocess i (i+z.[i]-1) tl
                                        else    preprocess l r tl
                    | []            ->  z
                preprocess 0 0 [ (2+z.[1])..(str.Length-1) ]


    /// Generates R for S, which is an array indexed by the position of some
    /// character c in the English alphabet. At that index in R is an array
    /// of length |S|+1, specifying for each index i in S (plus the index after S)
    /// the next location of character c encountered when traversing S from
    /// right to left starting at i.
    let bad_character_table (str:string) =
        if      str.Length  = 0
        then    [| for a in 0..25 -> [||] |]
        else    let R       = [| for a in 0..25 ->  [|-1|] |]
                let alpha   = [| for a in 0..25 ->   -1    |]
                str.ToCharArray() |>  Array.iteri
                    ( fun i c
                        ->  alpha.[ alphabet_index c ] <- i
                            alpha |>  Array.iteri
                                ( fun j a -> R.[j] <- Array.append R.[j] [|a|]) )
                R


    /// Generates L for S, an array used in the implementation of the strong
    /// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
    /// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring
    /// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P
    /// relative to T such that no instances of P in T are skipped and a suffix
    /// of P[:L[ i]] matches the substring of T matched by a suffix of P in the
    /// previous match attempt. Specifically, if the mismatch took place at position
    /// i-1 in P, the shift magnitude is given by the equation P.length - L.[i].
    /// In the case that L.[i] = -1, the full shift table is used.
    /// Since only proper suffixes matter, L.[0] = -1.
    let good_suffix_table (str:string) =
        let L   = [| for c in str -> -1 |]
        let N   = fundamental_preprocess ( new string(str.ToCharArray() |> Array.rev ))
                |> Array.rev
        [|  0..(str.Length-2) |]
            |>  Array.iter( fun j -> let i = str.Length - N.[j]
                                     if      i <> str.Length
                                     then    L.[i] <- j          )
        L


    /// Generates F for S, an array used in a special case of the good suffix rule in
    /// the Boyer-Moore string search algorithm. F.[i] is the length of the longest
    /// suffix of S[i:] that is also a prefix of S. In the cases it is used, the
    /// shift magnitude of the pattern P relative to the text T is P.length - F[i]
    /// for a mismatch occurring at i-1
    let full_shift_table (str:string) =
        let F = [| for x in str -> 0 |]
        let Z = fundamental_preprocess str
        Z   |>  Array.rev
            |>  Array.fold
            (   fun ( i:int, longest:int ) zv ->
                    let lg  =   if      zv = i + 1
                                then    max zv longest
                                else    longest
                    F.[str.Length - i - 1 ] <- lg
                    (i+1,lg)                            )
            (0,0)   |> ignore
        F


    /// Implementation of the Boyer-Moore string search algorithm. This finds all
    /// occurrences of P in T, and incorporates numerous ways of pre-processing
    /// the pattern to determine the optimal amount to shift the string and skip
    /// comparisons. In practice it runs in O(m) (and even sublinear) time, where
    /// m is the length of T. This implementation performs a case-insensitive
    /// search on ASCII alphabetic characters, spaces not included.
    let boyer_moore_search (P:string) (T:string) =
        if      P.Length = 0 || T.Length = 0 || T.Length < P.Length
        then    []  else
            let R   = bad_character_table   P
            let L   = good_suffix_table     P
            let F   = full_shift_table      P

            let rec find_matches k prev_k matches =
                if  k < T.Length
                then // Matches starting from end of P
                    let rec compare_chars i' h' =
                        match i', h' with
                        | i', h'  when i' >= 0 && h' > prev_k && P.[i'] = T.[h']
                                    ->  compare_chars (i'-1)(h'-1)
                        | i', h'    ->  i', h'

                    let i,h = compare_chars (P.Length - 1) k
                //  Match has been found
                    if      i  =   -1 || h = prev_k
                    then    let acc =   ( k - P.Length + 1 ):: matches
                            let k'  =   if      P.Length > 1
                                        then    k +  P.Length - F.[1]
                                        else    1
                            find_matches k' prev_k acc
                //  No match, shift by max of bad character and good suffix rules
                    else    let char_shift      = i - R.[(alphabet_index T.[h])].[i]
                            let suffix_shift    =
                            //  Mismatch happened on first attempt
                                if      i + 1 = P.Length
                                then    1
                            //  Matched suffix does not appear anywhere in P
                                elif    L.[i+1] = -1
                                then    P.Length - F.[i+1]
                            //  Matched Suffix appears in P
                                else    P.Length - L.[i+1]
                            let shift = max char_shift  suffix_shift
                            //  Galil's rule
                            let prev_k' =   if      shift >= i + 1
                                            then    k
                                            else    prev_k
                            let k'  =   k + shift
                            find_matches k' prev_k' matches
                else    matches
            find_matches (P.Length - 1) (-1) []

//  #endregion
	module StringMatching =
	open System


	/// Knuth-Morris-Pratt String Searching Algorithm ///

	let kmp_search (text:string) (word:string) : int =
	let kmp_table (word:string) =
	let table = [\| 1..word.Length \|]
	table.[0] <- -1
	table.[1] <- 0

	let rec build pos cnd =
	match word.[pos-1] with
	\| x when pos = word.Length -> table
	\| x when x = word.[cnd] -> table.[pos] <- cnd + 1
	build (pos+1) (cnd+1)
	\| x when cnd > 0 -> build pos (table.[cnd])
	\| x -> table.[pos] <- 0
	build (pos+1) cnd
	build 2 0
	let table = kmp_table word

	let rec traverse pos ind =
	match word.[ind] with
	\| x when (ind + pos) > text.Length -> -1
	\| x when x = text.[pos+ind]
	&& ind = word.Length - 1 -> pos
	\| x when x = text.[pos+ind] -> traverse pos (ind+1)
	\| x -> let pos' = pos + ind + table.[ind]
	let ind' = if table.[ind] > -1
	then table.[ind]
	else 0
	traverse pos' ind'
	traverse 0 0




	/// #region Boyer Moore String Search Algorithm ///


	/// Returns the index of the given character in the English alphabet counting from 0
	let alphabet_index (ch:char) =
	let chlow = Char.ToLower( ch)
	let charnum = Convert.ToInt32 chlow
	charnum - 97 // 'a' is ASCII character 97


	/// Returns the length of the match of the substrings of str
	/// beginning at index1 and index2
	let match_length (str:string) (index1:int) (index2:int) =
	if index1 = index2
	then str.Length - index1
	else let rec accumulate_matches acc ind1 ind2 =
	match ind1, ind2 with
	\| ind1, ind2 when ind1 < str.Length
	&& ind2 < str.Length
	&& str.[ind1] = str.[ind2]
	-> accumulate_matches (acc+1) (ind1+1)(ind2+1)
	\| ind1, ind2 -> acc
	accumulate_matches 0 index1 index2


	/// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring
	/// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
	/// where n is the length of S.
	let fundamental_preprocess (str:string) : int [] =
	match str.Length with
	\| 0 -> [\|\|] // Handles the case of empty string
	\| 1 -> [\|1\|] // Handles the case of single character string
	\| _ -> let z = [\| for x in str -> 0 \|]
	z.[0] <- str.Length
	z.[1] <- match_length str 0 1
	for i in (2)..(z.[1]) do
	z.[i] <- z.[1] - i + 1
	let rec preprocess l r range =
	match range with
	\| i::tl when i <= r -> let k = i - l
	let b = z.[k]
	let a = r - i + 1
	if b < a
	then z.[i] <- b
	preprocess l r tl
	else z.[i] <- b + ( match_length str a (r+1))
	preprocess i (i+z.[i]-1) tl
	\| i::tl -> z.[i] <- match_length str 0 i
	if z.[i] > 0
	then preprocess i (i+z.[i]-1) tl
	else preprocess l r tl
	\| [] -> z
	preprocess 0 0 [ (2+z.[1])..(str.Length-1) ]


	/// Generates R for S, which is an array indexed by the position of some
	/// character c in the English alphabet. At that index in R is an array
	/// of length \|S\|+1, specifying for each index i in S (plus the index after S)
	/// the next location of character c encountered when traversing S from
	/// right to left starting at i.
	let bad_character_table (str:string) =
	if str.Length = 0
	then [\| for a in 0..25 -> [\|\|] \|]
	else let R = [\| for a in 0..25 -> [\|-1\|] \|]
	let alpha = [\| for a in 0..25 -> -1 \|]
	str.ToCharArray() \|> Array.iteri
	( fun i c
	-> alpha.[ alphabet_index c ] <- i
	alpha \|> Array.iteri
	( fun j a -> R.[j] <- Array.append R.[j] [\|a\|]) )
	R


	/// Generates L for S, an array used in the implementation of the strong
	/// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
	/// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring
	/// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P
	/// relative to T such that no instances of P in T are skipped and a suffix
	/// of P[:L[ i]] matches the substring of T matched by a suffix of P in the
	/// previous match attempt. Specifically, if the mismatch took place at position
	/// i-1 in P, the shift magnitude is given by the equation P.length - L.[i].
	/// In the case that L.[i] = -1, the full shift table is used.
	/// Since only proper suffixes matter, L.[0] = -1.
	let good_suffix_table (str:string) =
	let L = [\| for c in str -> -1 \|]
	let N = fundamental_preprocess ( new string(str.ToCharArray() \|> Array.rev ))
	\|> Array.rev
	[\| 0..(str.Length-2) \|]
	\|> Array.iter( fun j -> let i = str.Length - N.[j]
	if i <> str.Length
	then L.[i] <- j )
	L



	/// Generates F for S, an array used in a special case of the good suffix rule in
	/// the Boyer-Moore string search algorithm. F.[i] is the length of the longest
	/// suffix of S[i:] that is also a prefix of S. In the cases it is used, the
	/// shift magnitude of the pattern P relative to the text T is P.length - F[i]
	/// for a mismatch occurring at i-1
	let full_shift_table (str:string) =
	let F = [\| for x in str -> 0 \|]
	let Z = fundamental_preprocess str
	Z \|> Array.rev
	\|> Array.fold
	( fun ( i:int, longest:int ) zv ->
	let lg = if zv = i + 1
	then max zv longest
	else longest
	F.[str.Length - i - 1 ] <- lg
	(i+1,lg) )
	(0,0) \|> ignore
	F



	/// Implementation of the Boyer-Moore string search algorithm. This finds all
	/// occurrences of P in T, and incorporates numerous ways of pre-processing
	/// the pattern to determine the optimal amount to shift the string and skip
	/// comparisons. In practice it runs in O(m) (and even sublinear) time, where
	/// m is the length of T. This implementation performs a case-insensitive
	/// search on ASCII alphabetic characters, spaces not included.
	let boyer_moore_search (P:string) (T:string) =
	if P.Length = 0 \|\| T.Length = 0 \|\| T.Length < P.Length
	then [] else
	let R = bad_character_table P
	let L = good_suffix_table P
	let F = full_shift_table P

	let rec find_matches k prev_k matches =
	if k < T.Length
	then // Matches starting from end of P
	let rec compare_chars i' h' =
	match i', h' with
	\| i', h' when i' >= 0 && h' > prev_k && P.[i'] = T.[h']
	-> compare_chars (i'-1)(h'-1)
	\| i', h' -> i', h'

	let i,h = compare_chars (P.Length - 1) k
	// Match has been found
	if i = -1 \|\| h = prev_k
	then let acc = ( k - P.Length + 1 ):: matches
	let k' = if P.Length > 1
	then k + P.Length - F.[1]
	else 1
	find_matches k' prev_k acc
	// No match, shift by max of bad character and good suffix rules
	else let char_shift = i - R.[(alphabet_index T.[h])].[i]
	let suffix_shift =
	// Mismatch happened on first attempt
	if i + 1 = P.Length
	then 1
	// Matched suffix does not appear anywhere in P
	elif L.[i+1] = -1
	then P.Length - F.[i+1]
	// Matched Suffix appears in P
	else P.Length - L.[i+1]
	let shift = max char_shift suffix_shift
	// Galil's rule
	let prev_k' = if shift >= i + 1
	then k
	else prev_k
	let k' = k + shift
	find_matches k' prev_k' matches
	else matches
	find_matches (P.Length - 1) (-1) []

	// #endregion