Skip to content

Instantly share code, notes, and snippets.

@bruisedsamurai
Last active October 23, 2020 11:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bruisedsamurai/495b98c362ccd27b11f5f311bf2a58a2 to your computer and use it in GitHub Desktop.
Save bruisedsamurai/495b98c362ccd27b11f5f311bf2a58a2 to your computer and use it in GitHub Desktop.
Boyer-Moore implementation in Rust
/*
Copyright 2020 Subhanshu Goel
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
Boyer–Moore string-search algorithm is an efficient string-searching algorithm that is the standard benchmark for practical string-search literature.
Worst-case performance Θ(m) preprocessing + O(mn) matching
Best-case performance Θ(m) preprocessing + Ω(n/m) matching
https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
[1] Gusfield, Dan. Algorithms on Strings, Trees, and Sequences: Computer Science and Computational Biology. United Kingdom, Cambridge University Press, 1997.
*/
pub struct BM {
pattern: String,
char_occr: HashMap<char, Vec<usize>>,
z_values: Vec<usize>,
L_values: Vec<usize>,
l_values: Vec<usize>,
}
impl BM {
pub fn new(pattern: &str) -> BM {
let char_occr = Self::find_character_occurences(pattern);
let rev_string: String = pattern.chars().rev().collect();
// In boyer moore algorithm z values are calculated after reversing the pattern
let mut z_values = Self::compute_z_algorithm(&rev_string);
// The index we get are reversed i.e 0 starts from the end
// We will reverse that sequence i.e. 0 starts from beginning of original pattern
z_values.reverse();
// Bottom two methods together calculate values for good suffix rule
// L values serving primary purpose
let L_values = Self::compute_L_values(&z_values);
// l(small L) is used when a perfect match is found in `text`(we try to find another match in the text)
// or L[i] equals zero.
let l_values = Self::compute_l_values(&z_values);
BM {
pattern: pattern.to_owned(),
char_occr,
z_values,
L_values,
l_values,
}
}
pub fn search(&self, text: &str) -> Vec<usize> {
let mut ret = vec![];
let pattern: Vec<char> = self.pattern.chars().collect();
let text: Vec<char> = text.chars().collect();
let len = pattern.len();
let mut text_curr_right_end = len - 1;
while text_curr_right_end < text.len() {
let mut i = len - 1;
let mut h = text_curr_right_end;
while i != 0 && pattern[i] == text[h] {
i -= 1;
h -= 1;
}
let mut patt_match = false;
if i == 0 {
if pattern[i] == text[h] {
ret.push(text_curr_right_end);
text_curr_right_end = text_curr_right_end + len - self.l_values[1];
patt_match = true;
} else {
//This is done for the case when first char mismatches.
// Ideally in if clause `i` should be checked against -1
//But type of `i` is usize and changing it to isize will force us to do lot of explcit conversions here and there.
i = 1;
}
}
if !patt_match {
if i == len - 1 {
text_curr_right_end += 1;
} else {
let mut l = self.L_values[i + 1];
if l == 0 {
l = self.l_values[i + 1];
}
let r_x = self.find_character_position(pattern[i - 1], i - 1);
let max = cmp::max(l, r_x);
text_curr_right_end = text_curr_right_end + len - max;
}
}
}
ret
}
fn find_character_position(&self, ch: char, index: usize) -> usize {
let vec = self.char_occr.get(&ch).unwrap();
let cmp = |&x| {
if index > x {
Ordering::Less
} else if index == x {
Ordering::Equal
} else {
Ordering::Greater
}
};
let i = vec.binary_search_by(cmp).unwrap();
if i + 1 < vec.len() {
vec[i + 1]
} else {
0
}
}
fn find_character_occurences(pattern: &str) -> HashMap<char, Vec<usize>> {
//! Find all positions of occurences of a character in the pattern
//! Positions are in descending order
let mut map: HashMap<char, Vec<usize>> = HashMap::new();
for (i, ch) in pattern.char_indices().rev() {
if map.contains_key(&ch) {
if let Some(vec) = map.get_mut(&ch) {
vec.push(i);
}
} else {
map.insert(ch, vec![i]);
}
}
map
}
fn compute_z_algorithm(pattern: &str) -> Vec<usize> {
//! Computer length of longest prefix of pattern[k..pattern.len()]
//! That matches prefix of `pattern`
let pattern_vec: Vec<char> = pattern.chars().collect();
let mut z_vec: Vec<usize> = vec![0];
let mut r = 0;
let mut l = 0;
for k in 1..pattern_vec.len() {
if k > r {
for (z_len, ch) in pattern_vec.iter().enumerate() {
if let Some(pre_ch) = pattern_vec.get(k + z_len) {
if pre_ch != ch {
z_vec.push(z_len);
if z_len > 0 {
l = k;
r = k + z_len - 1;
}
break;
}
} else {
z_vec.push(z_len);
if z_len > 0 {
l = k;
r = k + z_len - 1;
}
break;
}
}
} else if k <= r {
let k1 = k - l;
let beta = r - k + 1;
let z_len1 = z_vec[k1];
if z_len1 < beta {
z_vec.push(z_len1);
} else {
let mut z_len = beta;
for (i, ch) in pattern_vec[(beta + 1)..].iter().enumerate() {
if let Some(pre_ch) = pattern_vec.get(r + i + 1) {
if pre_ch != ch {
z_len += i;
z_vec.push(z_len);
l = k;
r = r + i - 1;
break;
}
} else {
z_len += i;
z_vec.push(z_len);
l = k;
r = r + i - 1;
break;
}
}
}
}
}
z_vec
}
fn compute_L_values(vec: &Vec<usize>) -> Vec<usize> {
//! Compute largest index j such that vec[j] = pattern[i..pattern.len()].len()
let mut ret = vec![0; vec.len()];
let len = vec.len();
for j in 0..(len - 1) {
let i = len - 1 - vec[j];
ret[i] = j;
}
ret
}
fn compute_l_values(vec: &Vec<usize>) -> Vec<usize> {
//! Computes largest length of suffix pattern[i..pattern.len()] that is also prefix of pattern
let mut ret = vec![0; vec.len()];
let len = vec.len();
let mut max = 0;
for j in 0..(len - 1) {
let j_len = vec[j];
max = if j_len > j && j_len > max { j_len } else { max };
let i = len - 1 - j;
ret[i] = max;
}
ret
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment