econandrew/fuzzymatch.R

## fuzzymatch.R
############################################################################ ###
# FUZZY MATCHING FUNCTIONS                                                  ####
############################################################################ ###

fuzzy_prep_words <- function(words) {
  # Prepares a list of words for fuzzy matching. All the other fuzzy matching
  # functions will run word through this. Given a list of sentences, returns
  # a list of words.

  words <- unlist(strsplit(tolower(gsub("[[:punct:]]", " ", words)), "\\W+"))
  return(words)
}

fuzzy_gen_word_freq <- function(l, fun = identity) {
  # Returns a word frequency vector based on vector of sentences l and with
  # frequencies post-processed by fun (e.g. log)

  fun(sort(table(fuzzy_prep_words(unlist(strsplit(l, ' ')))), decreasing=T))+1
}

fuzzy_title_match <- function(a, b, wf) {
  # Fuzzy matches a performance title based on a custom algorithm tuned for
  # this purpose. Words are frequency-weighted (like tf-idf).
  #
  # Args:
  #   a, b: the two titles to match
  #   wf: a vector of word frequencies as generated by fuzzy_gen_word_freq
  #
  # Returns:
  #   A fuzzy match score, higher is better, +Inf for exact match

  if (a == b) # Shortcut to make faster
    return (Inf)
  a.words <- fuzzy_prep_words(a)
  b.words <- fuzzy_prep_words(b)
  a.freqs <- sapply(a.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })
  b.freqs <- sapply(b.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })

  d <- adist(a.words, b.words)
  a.matches <- 1-apply(d, 1, function(x) { min(x, 2) })/2
  b.matches <- 1-apply(d, 2, function(x) { min(x, 2) })/2

  matchsum <- min(sum(a.matches * 1/a.freqs), sum(b.matches * 1/b.freqs))
  unmatchsum <- sum(floor(1-a.matches) * 1/a.freqs) + sum(floor(1-b.matches) * 1/b.freqs)
  return(matchsum / unmatchsum)
}

A <- c(
  "Plantains, green (large)",
  "Plantains, yellow",
  "Plantains, purple (small)",
  "Beef, minced, lean",
  "Beef, minced, extra lean",
  "Beef, steak, filet",
  "Lamb"
)

B <- c(
  "Large Green Plaintains",
  "Pork",
  "A yellow plantan",
  "Lean beef (minced)",
  "Beef-steak-fillet",
  "extra lean minced beef"
)

# Example - outer function needs a vectorised function so there's a little extra work, otherwise this is pretty simple
# The scores matrix contains all the pairwise scores. Then it would be a simple matter to pick the best match for each
# with details depending on whether there can be multiple matches, whether everything must match, etc.

wf <- fuzzy_gen_word_freq(c(A, B))
vectorised_match <- function (L1,L2) { mapply(function(a,b) { fuzzy_title_match(a, b, wf) }, L1, L2) }
scores <- outer(A, B, vectorised_match)
rownames(scores) <- A
colnames(scores) <- B
	############################################################################ ###
	# FUZZY MATCHING FUNCTIONS ####
	############################################################################ ###

	fuzzy_prep_words <- function(words) {
	# Prepares a list of words for fuzzy matching. All the other fuzzy matching
	# functions will run word through this. Given a list of sentences, returns
	# a list of words.

	words <- unlist(strsplit(tolower(gsub("[[:punct:]]", " ", words)), "\\W+"))
	return(words)
	}

	fuzzy_gen_word_freq <- function(l, fun = identity) {
	# Returns a word frequency vector based on vector of sentences l and with
	# frequencies post-processed by fun (e.g. log)

	fun(sort(table(fuzzy_prep_words(unlist(strsplit(l, ' ')))), decreasing=T))+1
	}

	fuzzy_title_match <- function(a, b, wf) {
	# Fuzzy matches a performance title based on a custom algorithm tuned for
	# this purpose. Words are frequency-weighted (like tf-idf).
	#
	# Args:
	# a, b: the two titles to match
	# wf: a vector of word frequencies as generated by fuzzy_gen_word_freq
	#
	# Returns:
	# A fuzzy match score, higher is better, +Inf for exact match

	if (a == b) # Shortcut to make faster
	return (Inf)
	a.words <- fuzzy_prep_words(a)
	b.words <- fuzzy_prep_words(b)
	a.freqs <- sapply(a.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })
	b.freqs <- sapply(b.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) })

	d <- adist(a.words, b.words)
	a.matches <- 1-apply(d, 1, function(x) { min(x, 2) })/2
	b.matches <- 1-apply(d, 2, function(x) { min(x, 2) })/2

	matchsum <- min(sum(a.matches * 1/a.freqs), sum(b.matches * 1/b.freqs))
	unmatchsum <- sum(floor(1-a.matches) * 1/a.freqs) + sum(floor(1-b.matches) * 1/b.freqs)
	return(matchsum / unmatchsum)
	}

	A <- c(
	"Plantains, green (large)",
	"Plantains, yellow",
	"Plantains, purple (small)",
	"Beef, minced, lean",
	"Beef, minced, extra lean",
	"Beef, steak, filet",
	"Lamb"
	)

	B <- c(
	"Large Green Plaintains",
	"Pork",
	"A yellow plantan",
	"Lean beef (minced)",
	"Beef-steak-fillet",
	"extra lean minced beef"
	)

	# Example - outer function needs a vectorised function so there's a little extra work, otherwise this is pretty simple
	# The scores matrix contains all the pairwise scores. Then it would be a simple matter to pick the best match for each
	# with details depending on whether there can be multiple matches, whether everything must match, etc.

	wf <- fuzzy_gen_word_freq(c(A, B))
	vectorised_match <- function (L1,L2) { mapply(function(a,b) { fuzzy_title_match(a, b, wf) }, L1, L2) }
	scores <- outer(A, B, vectorised_match)
	rownames(scores) <- A
	colnames(scores) <- B