Skip to content

Instantly share code, notes, and snippets.

@gdmcdonald
Last active February 1, 2022 13:01
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save gdmcdonald/bacfaafe2cccff18b6a81b319cbc3580 to your computer and use it in GitHub Desktop.
Save gdmcdonald/bacfaafe2cccff18b6a81b319cbc3580 to your computer and use it in GitHub Desktop.
Efficient fuzzy match of two data frames by one common string column in R, outputing a list of the matching and non-matching rows
#Efficient fuzzy match of two data frames by one common column
library(dplyr)
library(fuzzyjoin)
library(stringdist)
eff_fuzzy_match<-function(data_frame_A,
data_frame_B,
by_what,
choose_p = 0.1,
choose_max_dist = 0.4,
best_only = TRUE,
make_lower_case = TRUE){
#set matching variables to lowercase if make_lower_case is true
if (make_lower_case == TRUE){
data_frame_A[,by_what]<-tolower(data_frame_A[,by_what])
data_frame_B[,by_what]<-tolower(data_frame_B[,by_what])
}
#take only unique entries
data_frame_A = unique(data_frame_A)
data_frame_B = unique(data_frame_B)
#Merge data frames with dplyr
ExactMatches<-inner_join(x = data_frame_A,
y = data_frame_B,
by = by_what)
#Now look at those which didn't match at all
NoMatchesA<-anti_join(x = data_frame_A,
y = data_frame_B,
by = by_what)
NoMatchesB<-anti_join(x = data_frame_B,
y = data_frame_A,
by = by_what)
#FuzzyMatch the remaining ones by string distance
FuzzyMatch<-stringdist_inner_join(x = NoMatchesA,
y = NoMatchesB,
by = by_what,
method= 'jw',
p = choose_p,
max_dist = choose_max_dist)
#Add a column specifying the string distance
FuzzyMatch$stdist<-stringdist(a = FuzzyMatch[,paste0(by_what,".x")],
b = FuzzyMatch[,paste0(by_what,".y")],
method = 'jw',
p = choose_p)
#order by smallest string distance first
FuzzyMatch <- FuzzyMatch[order(FuzzyMatch$stdist,decreasing=F),]
#take the best matching one only if best_only is TRUE
if(best_only==TRUE){
FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".x"))]),]
FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".y"))]),]
}
NotMatchedA<-anti_join(x = NoMatchesA,
y = FuzzyMatch,
by = setNames(paste0(by_what,".x"),
by_what))
NotMatchedB<-anti_join(x = NoMatchesB,
y = FuzzyMatch,
by = setNames(paste0(by_what,".y"),
by_what))
completeReturnList = list(exact = ExactMatches,
fuzzy = FuzzyMatch,
remainderA = NotMatchedA,
remainderB = NotMatchedB)
return(completeReturnList)
}
# #testing:
#
# dfa = data.frame(FirstNames=c("George",
# "Ketut",
# "Harriet",
# "Zhu",
# "Sarika",
# "Apple"))
# dfb = data.frame(FirstNames=c("GeorGe",
# "Ketut",
# "Harry",
# "Z.",
# "Rika",
# "Marion"))
# eff_fuzzy_match(dfa,dfb,by_what = "FirstNames")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment