Skip to content

Instantly share code, notes, and snippets.

@econandrew
Last active May 3, 2023 03:49
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save econandrew/a9930d812eb420b20358 to your computer and use it in GitHub Desktop.
Save econandrew/a9930d812eb420b20358 to your computer and use it in GitHub Desktop.
R code for fuzzy sentence matching
############################################################################ ###
# FUZZY MATCHING FUNCTIONS ####
############################################################################ ###
fuzzy_prep_words <- function(words) {
# Prepares a list of words for fuzzy matching. All the other fuzzy matching
# functions will run word through this. Given a list of sentences, returns
# a list of words.
words <- unlist(strsplit(tolower(gsub("[[:punct:]]", " ", words)), "\\W+"))
return(words)
}
fuzzy_gen_word_freq <- function(l, fun = identity) {
# Returns a word frequency vector based on vector of sentences l and with
# frequencies post-processed by fun (e.g. log)
fun(sort(table(fuzzy_prep_words(unlist(strsplit(l, ' ')))), decreasing=T))+1
}
fuzzy_title_match <- function(a, b, wf) {
# Fuzzy matches a performance title based on a custom algorithm tuned for
# this purpose. Words are frequency-weighted (like tf-idf).
#
# Args:
# a, b: the two titles to match
# wf: a vector of word frequencies as generated by fuzzy_gen_word_freq
#
# Returns:
# A fuzzy match score, higher is better, +Inf for exact match
if (a == b) # Shortcut to make faster
return (Inf)
a.words <- fuzzy_prep_words(a)
b.words <- fuzzy_prep_words(b)
a.freqs <- sapply(a.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })
b.freqs <- sapply(b.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })
d <- adist(a.words, b.words)
a.matches <- 1-apply(d, 1, function(x) { min(x, 2) })/2
b.matches <- 1-apply(d, 2, function(x) { min(x, 2) })/2
matchsum <- min(sum(a.matches * 1/a.freqs), sum(b.matches * 1/b.freqs))
unmatchsum <- sum(floor(1-a.matches) * 1/a.freqs) + sum(floor(1-b.matches) * 1/b.freqs)
return(matchsum / unmatchsum)
}
A <- c(
"Plantains, green (large)",
"Plantains, yellow",
"Plantains, purple (small)",
"Beef, minced, lean",
"Beef, minced, extra lean",
"Beef, steak, filet",
"Lamb"
)
B <- c(
"Large Green Plaintains",
"Pork",
"A yellow plantan",
"Lean beef (minced)",
"Beef-steak-fillet",
"extra lean minced beef"
)
# Example - outer function needs a vectorised function so there's a little extra work, otherwise this is pretty simple
# The scores matrix contains all the pairwise scores. Then it would be a simple matter to pick the best match for each
# with details depending on whether there can be multiple matches, whether everything must match, etc.
wf <- fuzzy_gen_word_freq(c(A, B))
vectorised_match <- function (L1,L2) { mapply(function(a,b) { fuzzy_title_match(a, b, wf) }, L1, L2) }
scores <- outer(A, B, vectorised_match)
rownames(scores) <- A
colnames(scores) <- B
@ahcyip
Copy link

ahcyip commented Jun 6, 2017

Check out @dgrtwo fuzzyjoin::stringdist_join which includes Levenshtein, soundex, etc methods:
https://github.com/dgrtwo/fuzzyjoin/blob/master/R/stringdist_join.R

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment