Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
quick hack at get_fuzzy_dupes() function
library(pacman)
p_load(fuzzyjoin, dplyr)
# Finds pairs of near-matching (but not identical) strings in a one-column
# data frame and groups them into clusters.
#
# Arguments:
#   x        - a data frame with exactly one column of strings to compare.
#   max_dist - maximum string distance for two values to count as a match;
#              passed through to fuzzyjoin::stringdist_inner_join().
#
# Returns a tibble with one row per unordered pair of near-matches:
#   instance1, instance2 - the two values, ordered so instance1 <= instance2
#   distance             - the string distance reported by the join (numeric)
#   cluster              - the label added by assign_clusters()
get_fuzzy_dupes <- function(x, max_dist = 2) {
  if (!is.data.frame(x) || ncol(x) != 1L) {
    stop("`x` must be a data frame with exactly one column.", call. = FALSE)
  }

  # Self-join: every value is compared against every other value.
  joined <- stringdist_inner_join(
    x, x,
    max_dist = max_dist, distance_col = "distance"
  )

  left <- as.character(joined[[1]])
  right <- as.character(joined[[2]])
  dist <- if ("distance" %in% names(joined)) {
    joined[["distance"]]
  } else {
    rep(NA_real_, nrow(joined))
  }

  # Order each pair so that (A, B) and (B, A) become the same row. Only the
  # two strings are ordered; the distance stays in its own column, so it can
  # never be confused with a value that happens to sort before it and it
  # keeps its numeric type.
  pairs <- data.frame(
    instance1 = pmin(left, right),
    instance2 = pmax(left, right),
    distance = dist,
    stringsAsFactors = FALSE
  )

  # Drop exact (100%) matches, i.e. a value matched with itself.
  pairs <- pairs[which(left != right), , drop = FALSE]
  # Drop the mirrored copy of each pair. drop = FALSE keeps a data frame even
  # when only a single pair survives.
  pairs <- pairs[!duplicated(pairs), , drop = FALSE]

  pairs %>%
    as_tibble() %>%
    arrange(instance1) %>%
    assign_clusters()
}
# Assigns near-match pairs to clusters, for easier cleaning.
# Helper function called by get_fuzzy_dupes.
#
# Arguments:
#   dat - a data frame with columns `instance1` and `instance2`, one row per
#         matched pair.
#
# Returns `dat` with an added `cluster` column. Rows are processed top to
# bottom: a row that shares a value with any earlier row joins the cluster of
# the first such row; otherwise it starts a new cluster labelled with its own
# `instance1`. Clusters that only become connected by a later row are not
# merged retroactively.
assign_clusters <- function(dat) {
  n_pairs <- nrow(dat)
  first <- dat[["instance1"]]
  second <- dat[["instance2"]]

  # origin[i] is the row whose instance1 labels the cluster of row i.
  origin <- integer(n_pairs)

  # seq_len() makes zero-row and one-row inputs work (2:nrow(dat) would count
  # backwards for those).
  for (i in seq_len(n_pairs)) {
    earlier <- seq_len(i - 1L)
    current <- c(first[i], second[i])
    linked <- which(
      first[earlier] %in% current | second[earlier] %in% current
    )
    origin[i] <- if (length(linked) > 0L) origin[linked[1L]] else i
  }

  dat$cluster <- first[origin]
  dat
}
# Build a one-column data frame of car names to experiment with
dat <- transmute(mtcars, cars = row.names(mtcars))

# Example calls: a looser distance threshold first, then a stricter one
get_fuzzy_dupes(dat, max_dist = 2)
get_fuzzy_dupes(dat, max_dist = 1)
@sfirke

This comment has been minimized.

Copy link
Owner Author

@sfirke sfirke commented Oct 19, 2016

That "late night having fun not gonna comment it" code that will bite me later

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment