/*
Ratcliff/Obershelp distance
SEE tokenizer
[NIST Ratcliff/Obershelp pattern recognition](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html):
Compute the similarity of two strings as the number of matching characters divided by the total number of characters in the two strings.
Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
*/
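/*
Not part of the original gist: a minimal sketch of the similarity ratio described
above, built on `matching_blocks` defined below. Following the convention used by
Python's difflib and StringDistances.jl, it divides twice the number of matched
bytes by the combined length of the two strings; two empty strings are treated as
identical (ratio 1.0). The function name is an illustration, not a library API.
*/
pub fn ratcliff_obershelp_ratio(shorter: &str, longer: &str) -> f64 {
    let total = shorter.len() + longer.len();
    if total == 0 {
        return 1.0;
    }
    // Sum the lengths of all matched blocks (the terminal sentinel contributes 0).
    let matched: usize = matching_blocks(shorter, longer)
        .iter()
        .map(|&(_, _, k)| k)
        .sum();
    (2 * matched) as f64 / total as f64
}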
/*
// use this to replace longest_common_substring
// Return start of common substring in s1, start of common substring in s2, and length of substring
// Indexes refer to character number, not index (differ for Unicode strings)
fn longest_common_substring() -> (usize, usize, usize) {
//https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L3
(0, 0, 0)
}
*/
fn longest_common_substring<'a>(
    shorter: &'a str,
    longer: &'a str,
    low1: usize,
    high1: usize,
    low2: usize,
    high2: usize,
) -> (usize, usize, usize) {
    // Offsets are byte indices; slicing multi-byte UTF-8 off a character boundary will panic.
    let longsub = &longer[low2..high2];
    let slen = high1 - low1;
    // Try substrings of the shorter slice from longest to shortest and return the
    // first (leftmost) occurrence found in the longer slice. The upper bound is
    // `slen`, not `slen + 1`: a larger size would underflow `slen - size` and
    // slice past `high1`.
    for size in (1..=slen).rev() {
        for start in 0..=slen - size {
            let substr = &shorter[low1 + start..low1 + start + size];
            if let Some((startb, matchstr)) = longsub.match_indices(substr).next() {
                return (low1 + start, low2 + startb, matchstr.len());
            }
        }
    }
    (low1, low2, 0)
}
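// Illustrative check (not in the original gist): over the full byte ranges, the
// longest common substring of "ab" (0..2) within "xaby" (0..4) is "ab", starting
// at byte 0 of the shorter string and byte 1 of the longer one.
#[test]
fn longest_common_substring_example() {
    assert_eq!(longest_common_substring("ab", "xaby", 0, 2, 0, 4), (0, 1, 2));
}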
/*
// use this to replace matching_blocks
fn matching_blocks() -> Vec<(usize, usize, usize)> {
//https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L31
vec![(0, 0, 0)]
}
*/
pub fn matching_blocks<'a>(shorter: &'a str, longer: &'a str) -> Vec<(usize, usize, usize)> {
    let (len1, len2) = (shorter.len(), longer.len());
    let mut queue: Vec<(usize, usize, usize, usize)> = vec![(0, len1, 0, len2)];
    let mut matching_blocks = Vec::new();
    while let Some((low1, high1, low2, high2)) = queue.pop() {
        let (i, j, k) = longest_common_substring(shorter, longer, low1, high1, low2, high2);
        if k != 0 {
            matching_blocks.push((i, j, k));
            // Recurse into the unmatched regions on either side of the match.
            if low1 < i && low2 < j {
                queue.push((low1, i, low2, j));
            }
            if i + k < high1 && j + k < high2 {
                queue.push((i + k, high1, j + k, high2));
            }
        }
    }
    // The stack yields blocks out of order, so sort before merging adjacent
    // blocks below (as Python's difflib does in get_matching_blocks).
    matching_blocks.sort();
    // Merge adjacent blocks into maximal non-adjacent runs.
    let (mut i1, mut j1, mut k1) = (0, 0, 0);
    let mut non_adjacent = Vec::new();
    for (i2, j2, k2) in matching_blocks {
        if i1 + k1 == i2 && j1 + k1 == j2 {
            k1 += k2;
        } else {
            if k1 != 0 {
                non_adjacent.push((i1, j1, k1));
            }
            i1 = i2;
            j1 = j2;
            k1 = k2;
        }
    }
    if k1 != 0 {
        non_adjacent.push((i1, j1, k1));
    }
    // Terminal sentinel block of length 0.
    non_adjacent.push((len1, len2, 0));
    non_adjacent
}
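// Illustrative check (not in the original gist): "abcd" vs "abxcd" matches "ab"
// and "cd", leaving the extra "x" unmatched, plus the terminal sentinel block.
// The expected ratio uses the 2*M / T formula from the sketch above: 8 / 9.
#[test]
fn matching_blocks_example() {
    assert_eq!(
        matching_blocks("abcd", "abxcd"),
        vec![(0, 0, 2), (2, 3, 2), (4, 5, 0)]
    );
    assert!((ratcliff_obershelp_ratio("abcd", "abxcd") - 8.0 / 9.0).abs() < 1e-12);
}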
/*
token_sort evaluates the similarity of the longest substrings (Ratcliff/Obershelp) based on the sorted order of the tokens.
Pass in two strings, the sorter, and the similarity function:
token_sort(s1, s2, &TokenCmp::new_sort, &TokenCmp::partial_similarity)
token_sort(s1, s2, &TokenCmp::new_sort_join, &TokenCmp::similarity)
'new_sort' concatenates the sorted tokens with no whitespace in the compared strings; 'new_sort_join' joins them with a single space.
*/
pub fn token_sort<'a>(
    t1: &'a str,
    t2: &'a str,
    sorter: &dyn Fn(
        std::vec::Vec<std::borrow::Cow<'a, str>>,
        std::vec::Vec<std::borrow::Cow<'a, str>>,
    ) -> TokenCmp<'a>,
    rat: &dyn Fn(&TokenCmp<'a>) -> u8,
) -> u8 {
    let an = AlphaNumericTokenizer; // NOT INCLUDED IN THIS GIST
    rat(&sorter(an.sequencer(t1), an.sequencer(t2)))
}
/*
token_set evaluates the similarity of the longest substrings (Ratcliff/Obershelp) based on the intersection and union of the token sets.
Pass in two strings and the similarity function:
token_set(s1, s2, &TokenCmp::partial_similarity)
token_set(s1, s2, &TokenCmp::similarity)
*/
pub fn token_set<'a>(s1: &'a str, s2: &'a str, rat: &dyn Fn(&TokenCmp<'a>) -> u8) -> u8 {
    // AlphaNumericTokenizer, TokenCmp, and the intersect/uniq/union/u8_max
    // helpers are not included in this gist.
    let an = AlphaNumericTokenizer;
    let (p1, p2) = (an.sequencer(s1), an.sequencer(s2));
    let mut s1_i_s2 = p1.intersect(p2.clone());
    let mut s1q = p1.uniq(p2.clone()); // tokens only in s1 (diff1to2)
    let mut s2q = p2.uniq(p1.clone()); // tokens only in s2 (diff2to1)
    s1_i_s2.sort();
    s1q.sort();
    s2q.sort();
    let s1_i_s2_u_s1q = s1_i_s2.union(s1q.clone()); // combined_1to2
    let s1_i_s2_u_s2q = s1_i_s2.union(s2q.clone()); // combined_2to1
    // Return the best of the three pairwise comparisons.
    vec![
        rat(&TokenCmp::new_set(s1_i_s2.clone(), s1_i_s2_u_s1q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2, s1_i_s2_u_s2q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2_u_s1q, s1_i_s2_u_s2q)),
    ]
    .iter()
    .cloned()
    .u8_max()
}