/*
Ratcliff/Obershelp distance
SEE tokenizer
[NIST Ratcliff/Obershelp pattern recognition](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html):
Compute the similarity of two strings as the number of matching characters divided by the total number of characters in the two strings.
Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
*/
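/*
Not part of the original gist: a minimal sketch of the similarity ratio described
above, built on `matching_blocks` defined below. Following the convention used by
Python's difflib and StringDistances.jl, it divides twice the number of matched
bytes by the combined length of the two strings; two empty strings are treated as
identical (ratio 1.0). The function name is an illustration, not a library API.
*/
pub fn ratcliff_obershelp_ratio(shorter: &str, longer: &str) -> f64 {
    let total = shorter.len() + longer.len();
    if total == 0 {
        return 1.0;
    }
    // Sum the lengths of all matched blocks (the terminal sentinel contributes 0).
    let matched: usize = matching_blocks(shorter, longer)
        .iter()
        .map(|&(_, _, k)| k)
        .sum();
    (2 * matched) as f64 / total as f64
}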
/*
// use this to replace longest_common_substring
// Return start of common substring in s1, start of common substring in s2, and length of substring
// Indexes refer to character number, not index (differ for Unicode strings)
fn longest_common_substring() -> (usize, usize, usize) {
//https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L3
(0, 0, 0)
}
*/
fn longest_common_substring<'a>(
    shorter: &'a str,
    longer: &'a str,
    low1: usize,
    high1: usize,
    low2: usize,
    high2: usize,
) -> (usize, usize, usize) {
    // Offsets are byte indices; slicing multi-byte UTF-8 off a character boundary will panic.
    let longsub = &longer[low2..high2];
    let slen = high1 - low1;
    // Try substrings of the shorter slice from longest to shortest and return the
    // first (leftmost) occurrence found in the longer slice. The upper bound is
    // `slen`, not `slen + 1`: a larger size would underflow `slen - size` and
    // slice past `high1`.
    for size in (1..=slen).rev() {
        for start in 0..=slen - size {
            let substr = &shorter[low1 + start..low1 + start + size];
            if let Some((startb, matchstr)) = longsub.match_indices(substr).next() {
                return (low1 + start, low2 + startb, matchstr.len());
            }
        }
    }
    (low1, low2, 0)
}
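// Illustrative check (not in the original gist): over the full byte ranges, the
// longest common substring of "ab" (0..2) within "xaby" (0..4) is "ab", starting
// at byte 0 of the shorter string and byte 1 of the longer one.
#[test]
fn longest_common_substring_example() {
    assert_eq!(longest_common_substring("ab", "xaby", 0, 2, 0, 4), (0, 1, 2));
}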
/*
// use this to replace matching_blocks
fn matching_blocks() -> Vec<(usize, usize, usize)> {
//https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L31
vec![(0, 0, 0)]
}
*/
pub fn matching_blocks<'a>(shorter: &'a str, longer: &'a str) -> Vec<(usize, usize, usize)> {
    let (len1, len2) = (shorter.len(), longer.len());
    let mut queue: Vec<(usize, usize, usize, usize)> = vec![(0, len1, 0, len2)];
    let mut matching_blocks = Vec::new();
    while let Some((low1, high1, low2, high2)) = queue.pop() {
        let (i, j, k) = longest_common_substring(shorter, longer, low1, high1, low2, high2);
        if k != 0 {
            matching_blocks.push((i, j, k));
            // Recurse into the unmatched regions on either side of the match.
            if low1 < i && low2 < j {
                queue.push((low1, i, low2, j));
            }
            if i + k < high1 && j + k < high2 {
                queue.push((i + k, high1, j + k, high2));
            }
        }
    }
    // The stack yields blocks out of order, so sort before merging adjacent
    // blocks below (as Python's difflib does in get_matching_blocks).
    matching_blocks.sort();
    // Merge adjacent blocks into maximal non-adjacent runs.
    let (mut i1, mut j1, mut k1) = (0, 0, 0);
    let mut non_adjacent = Vec::new();
    for (i2, j2, k2) in matching_blocks {
        if i1 + k1 == i2 && j1 + k1 == j2 {
            k1 += k2;
        } else {
            if k1 != 0 {
                non_adjacent.push((i1, j1, k1));
            }
            i1 = i2;
            j1 = j2;
            k1 = k2;
        }
    }
    if k1 != 0 {
        non_adjacent.push((i1, j1, k1));
    }
    // Terminal sentinel block of length 0.
    non_adjacent.push((len1, len2, 0));
    non_adjacent
}
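// Illustrative check (not in the original gist): "abcd" vs "abxcd" matches "ab"
// and "cd", leaving the extra "x" unmatched, plus the terminal sentinel block.
// The expected ratio uses the 2*M / T formula from the sketch above: 8 / 9.
#[test]
fn matching_blocks_example() {
    assert_eq!(
        matching_blocks("abcd", "abxcd"),
        vec![(0, 0, 2), (2, 3, 2), (4, 5, 0)]
    );
    assert!((ratcliff_obershelp_ratio("abcd", "abxcd") - 8.0 / 9.0).abs() < 1e-12);
}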
/*
token_sort evaluates the similarity of the longest substrings (Ratcliff/Obershelp) based on the sorted order of the tokens.
Pass in two strings, the sorter, and the similarity function:
token_sort(s1, s2, &TokenCmp::new_sort, &TokenCmp::partial_similarity)
token_sort(s1, s2, &TokenCmp::new_sort_join, &TokenCmp::similarity)
'new_sort' concatenates the sorted tokens with no whitespace in the compared strings; 'new_sort_join' joins them with a single space.
*/
pub fn token_sort<'a>(
    t1: &'a str,
    t2: &'a str,
    sorter: &dyn Fn(
        std::vec::Vec<std::borrow::Cow<'a, str>>,
        std::vec::Vec<std::borrow::Cow<'a, str>>,
    ) -> TokenCmp<'a>,
    rat: &dyn Fn(&TokenCmp<'a>) -> u8,
) -> u8 {
    let an = AlphaNumericTokenizer; // NOT INCLUDED IN THIS GIST
    rat(&sorter(an.sequencer(t1), an.sequencer(t2)))
}
/*
token_set evaluates the similarity of the longest substrings (Ratcliff/Obershelp) based on the intersection and union of the token sets.
Pass in two strings and the similarity function:
token_set(s1, s2, &TokenCmp::partial_similarity)
token_set(s1, s2, &TokenCmp::similarity)
*/
pub fn token_set<'a>(s1: &'a str, s2: &'a str, rat: &dyn Fn(&TokenCmp<'a>) -> u8) -> u8 {
    // AlphaNumericTokenizer, TokenCmp, and the intersect/uniq/union/u8_max
    // helpers are not included in this gist.
    let an = AlphaNumericTokenizer;
    let (p1, p2) = (an.sequencer(s1), an.sequencer(s2));
    let mut s1_i_s2 = p1.intersect(p2.clone());
    let mut s1q = p1.uniq(p2.clone()); // tokens only in s1 (diff1to2)
    let mut s2q = p2.uniq(p1.clone()); // tokens only in s2 (diff2to1)
    s1_i_s2.sort();
    s1q.sort();
    s2q.sort();
    let s1_i_s2_u_s1q = s1_i_s2.union(s1q.clone()); // combined_1to2
    let s1_i_s2_u_s2q = s1_i_s2.union(s2q.clone()); // combined_2to1
    // Return the best of the three pairwise comparisons.
    vec![
        rat(&TokenCmp::new_set(s1_i_s2.clone(), s1_i_s2_u_s1q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2, s1_i_s2_u_s2q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2_u_s1q, s1_i_s2_u_s2q)),
    ]
    .iter()
    .cloned()
    .u8_max()
}