Last active
September 4, 2017 00:30
-
-
Save gdmcdonald/9135ec8f7e903a0735a0b16d8cb97297 to your computer and use it in GitHub Desktop.
Finding and matching typos in strings in a dataframe in R. See the question at https://stackoverflow.com/questions/45990947/how-to-find-a-typo-in-a-data-frame-and-replace-it/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringdist) | |
library(dplyr) | |
# Toy data set to demonstrate typo detection: row 8 ("Ham") and row 9
# ("Wkywalker") are deliberate misspellings of entries that appear elsewhere.
my_df <- data.frame(
  BIRTH = c(1, 1, 2, 3, 1, 5, 3, 3, 1),
  NAME = c("Luke", "Luke", "Leia", "Han", "Ben", "Lando", "Han", "Ham", "Luke"),
  SURNAME = c("Skywalker", "Skywalker", "Organa", "Solo", "Solo",
              "Calrissian", "Solo", "Solo", "Wkywalker"),
  random_value = c(1, 2, 3, 7, 1, 3, 4, 4, 9)
)
# Build a single space-separated key per row (birth, name, surname) so that
# whole records can be compared as strings.
my_df$birth_and_names <- paste(my_df$BIRTH, my_df$NAME, my_df$SURNAME, sep = " ")
# Pairwise Jaro-Winkler string distances (method = "jw", prefix factor
# p = 0.1) between every pair of concatenated record keys.
dist.matrix <- stringdistmatrix(my_df$birth_and_names, my_df$birth_and_names,
                                method = "jw", p = 0.1)
# Label both dimensions with the record keys. colnames() is needed for the
# columns: names() on a matrix assigns per-element names, not column names.
rownames(dist.matrix) <- my_df$birth_and_names
colnames(dist.matrix) <- my_df$birth_and_names
# Convert the square matrix to a "dist" object, the input type hclust() expects.
dist.matrix <- as.dist(dist.matrix)
# Hierarchical clustering (Ward's method, "ward.D2") so that near-identical
# records end up in the same branch; plot the dendrogram for inspection.
clusts <- hclust(dist.matrix, method = "ward.D2")
plot(clusts)
# Cut the dendrogram at height 0.2 and store each row's cluster id; rows that
# share an id are treated as the same underlying record.
my_df$LikelyGroup <- cutree(clusts, h = 0.2)
# Most frequent value of a vector. When several values are tied for the
# highest count, the one that appears first in `x` is returned, so the result
# is always a single value.
#
# x: a vector (any type supported by unique() and match()).
# Returns: a length-one vector of the same type as `x`.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of each element of x within the distinct values, then count how
  # often each position occurs.
  positions <- match(x, distinct_values)
  counts <- tabulate(positions)
  # which.max() returns the first maximum, which gives the first-seen tie-break.
  winner <- which.max(counts)
  distinct_values[winner]
}
# For every cluster of likely-identical records, attach the modal birth value,
# name and surname as the "corrected" fields, and drop the temporary
# concatenated key. ungroup() is called at the end so that later operations on
# my_df are not silently performed per LikelyGroup.
my_df <- my_df %>%
  group_by(LikelyGroup) %>%
  mutate(Group_Birth = Mode(BIRTH),
         Group_Name = Mode(NAME),
         Group_Surname = Mode(SURNAME),
         birth_and_names = NULL) %>%
  ungroup()
Author
gdmcdonald
commented
Sep 1, 2017
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment