gdmcdonald/matchTypos.R

## matchTypos.R
library(stringdist)
library(dplyr)

# Example data frame containing typos to find and correct.
# Rows 8 ("Ham") and 9 ("Wkywalker") are deliberately misspelled entries.
my_df <- data.frame(
  BIRTH = c(1, 1, 2, 3, 1, 5, 3, 3, 1),
  NAME = c(
    "Luke", "Luke", "Leia", "Han", "Ben", "Lando", "Han", "Ham", "Luke"
  ),
  SURNAME = c(
    "Skywalker", "Skywalker", "Organa", "Solo", "Solo", "Calrissian",
    "Solo", "Solo", "Wkywalker"
  ),
  random_value = c(1, 2, 3, 7, 1, 3, 4, 4, 9)
)

# Build one comparison key per row by joining the birthday, first name and
# surname with single spaces (e.g. "1 Luke Skywalker").
my_df$birth_and_names <- paste(
  my_df$BIRTH, my_df$NAME, my_df$SURNAME,
  sep = " "
)

# Work out the string distance between each possible pair of entries.
# method = "jw" with p = 0.1 is the Jaro-Winkler distance with a prefix
# factor of 0.1.
dist.matrix <- stringdistmatrix(
  my_df$birth_and_names,
  my_df$birth_and_names,
  method = "jw",
  p = 0.1
)

# Label rows and columns with the keys. `names<-` on a matrix would attach a
# per-element names attribute instead of column names, so set dimnames.
dimnames(dist.matrix) <- list(my_df$birth_and_names, my_df$birth_and_names)

# Convert the square matrix to a "dist" object, the input hclust() expects.
dist.matrix <- as.dist(dist.matrix)

# Hierarchical clustering (Ward's method) to find the closest entries.
clusts <- hclust(dist.matrix, method = "ward.D2")

# Draw the dendrogram so the cut height below can be checked by eye.
plot(clusts)

# Cut the dendrogram at height 0.2; each row receives the integer id of the
# cluster it falls into.
my_df$LikelyGroup <- cutree(clusts, h = 0.2)

# Most frequent value of a vector.
#
# x: an atomic vector (numeric, character, ...).
# Returns a single value of the same type as x. When several values share
# the highest count, the one that appears first in x is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Count how often each distinct value occurs, in order of first appearance.
  occurrences <- tabulate(match(x, distinct_values))
  # which.max() picks the first maximum, which resolves ties.
  distinct_values[which.max(occurrences)]
}

# Select the modal birthday, name and surname within each likely group and
# store them next to the original (possibly misspelled) values.
my_df <- my_df %>%
  group_by(LikelyGroup) %>%
  mutate(
    Group_Birth = Mode(BIRTH),
    Group_Name = Mode(NAME),
    Group_Surname = Mode(SURNAME),
    # Assigning NULL drops the temporary concatenated key column.
    birth_and_names = NULL
  ) %>%
  # Remove the grouping so later operations on my_df are not silently
  # carried out per group.
  ungroup()
	library(stringdist)
	library(dplyr)

	# Example data frame containing typos to find and correct.
	# Rows 8 ("Ham") and 9 ("Wkywalker") are deliberately misspelled entries.
	my_df <- data.frame(
	  BIRTH = c(1, 1, 2, 3, 1, 5, 3, 3, 1),
	  NAME = c(
	    "Luke", "Luke", "Leia", "Han", "Ben", "Lando", "Han", "Ham", "Luke"
	  ),
	  SURNAME = c(
	    "Skywalker", "Skywalker", "Organa", "Solo", "Solo", "Calrissian",
	    "Solo", "Solo", "Wkywalker"
	  ),
	  random_value = c(1, 2, 3, 7, 1, 3, 4, 4, 9)
	)

	# Build one comparison key per row by joining the birthday, first name and
	# surname with single spaces (e.g. "1 Luke Skywalker").
	my_df$birth_and_names <- paste(
	  my_df$BIRTH, my_df$NAME, my_df$SURNAME,
	  sep = " "
	)

	# Work out the string distance between each possible pair of entries.
	# method = "jw" with p = 0.1 is the Jaro-Winkler distance with a prefix
	# factor of 0.1.
	dist.matrix <- stringdistmatrix(
	  my_df$birth_and_names,
	  my_df$birth_and_names,
	  method = "jw",
	  p = 0.1
	)

	# Label rows and columns with the keys. `names<-` on a matrix would attach a
	# per-element names attribute instead of column names, so set dimnames.
	dimnames(dist.matrix) <- list(my_df$birth_and_names, my_df$birth_and_names)

	# Convert the square matrix to a "dist" object, the input hclust() expects.
	dist.matrix <- as.dist(dist.matrix)

	# Hierarchical clustering (Ward's method) to find the closest entries.
	clusts <- hclust(dist.matrix, method = "ward.D2")

	# Draw the dendrogram so the cut height below can be checked by eye.
	plot(clusts)

	# Cut the dendrogram at height 0.2; each row receives the integer id of the
	# cluster it falls into.
	my_df$LikelyGroup <- cutree(clusts, h = 0.2)

	# Most frequent value of a vector.
	#
	# x: an atomic vector (numeric, character, ...).
	# Returns a single value of the same type as x. When several values share
	# the highest count, the one that appears first in x is returned.
	Mode <- function(x) {
	  distinct_values <- unique(x)
	  # Count how often each distinct value occurs, in order of first appearance.
	  occurrences <- tabulate(match(x, distinct_values))
	  # which.max() picks the first maximum, which resolves ties.
	  distinct_values[which.max(occurrences)]
	}

	# Select the modal birthday, name and surname within each likely group and
	# store them next to the original (possibly misspelled) values.
	my_df <- my_df %>%
	  group_by(LikelyGroup) %>%
	  mutate(
	    Group_Birth = Mode(BIRTH),
	    Group_Name = Mode(NAME),
	    Group_Surname = Mode(SURNAME),
	    # Assigning NULL drops the temporary concatenated key column.
	    birth_and_names = NULL
	  ) %>%
	  # Remove the grouping so later operations on my_df are not silently
	  # carried out per group.
	  ungroup()