Created
February 11, 2017 16:25
-
-
Save DomBennett/eaf08157533561a5a1acab96e57e2454 to your computer and use it in GitHub Desktop.
Calculating text distances using string distance metrics and word frequencies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# EXAMPLE TEXT DISTANCES
# FUNCTIONS
justText <- function(txt) {
  # Strip a character vector down to its readable form: drop every
  # character that is not a latin letter or a space (punctuation,
  # digits, newlines, etc.). Vectorised over `txt`.
  cleaned <- gsub("[^a-zA-Z ]", "", txt)
  cleaned
}
calcStrDst <- function(txts) {
  # Summarise pairwise string distances between a vector of texts.
  #
  # Args:
  #   txts: a character vector of texts, or a list of character vectors;
  #         for a list, one text is sampled at random from each element
  #         (so results are stochastic in that case).
  # Returns:
  #   A one-row data.frame with min/max/mean/sd/median of the pairwise
  #   Levenshtein ("lv"), cosine and Jaro-Winkler ("jw") distances, in
  #   columns named e.g. lv_min, ..., jw_median.
  # Note: sd() is NA when there is only one pair (two texts).
  if (is.list(txts)) {  # is.list(), not class() == "list": robust to subclasses
    txts <- unlist(lapply(txts, function(x) sample(x, 1)))
  }
  if (length(txts) < 2) {
    stop("'txts' must contain at least two texts", call. = FALSE)
  }
  cmbs <- combn(length(txts), 2)  # all unordered pairs, one per column
  n_pairs <- ncol(cmbs)
  jw_dsts <- lv_dsts <- cosine_dsts <- rep(NA_real_, n_pairs)
  for (i in seq_len(n_pairs)) {
    a <- justText(txts[cmbs[, i]][[1]])
    b <- justText(txts[cmbs[, i]][[2]])
    lv_dsts[i] <- stringdist::stringdist(a, b, method = "lv")
    cosine_dsts[i] <- stringdist::stringdist(a, b, method = "cosine")
    jw_dsts[i] <- stringdist::stringdist(a, b, method = "jw")
  }
  # Build min/max/mean/sd/median columns for one metric, prefixed by its name.
  smry <- function(x, prefix) {
    res <- data.frame(min(x), max(x), mean(x), sd(x), median(x))
    names(res) <- paste(prefix, c("min", "max", "mean", "sd", "median"),
                        sep = "_")
    res
  }
  cbind(smry(lv_dsts, "lv"), smry(cosine_dsts, "cosine"), smry(jw_dsts, "jw"))
}
getWrdFrq <- function(txts, wts=rep(1, length(txts)), min_wrd_sz=5, min_freq=5) {
  # Weighted frequency of the unique words appearing in each text.
  #
  # Args:
  #   txts: character vector of texts.
  #   wts: one weight per text; each unique word in a text contributes
  #        that text's weight to the word's total (default: 1 each).
  #   min_wrd_sz: drop words shorter than this many characters.
  #   min_freq: keep only words whose summed weight is strictly greater
  #             than this.
  # Returns:
  #   A named numeric vector (word -> summed weight), sorted decreasing.
  cleanWrds <- function(txt) {
    # Tokenise one text into unique, lower-case, de-pluralised words.
    wrds <- strsplit(txt, " ")[[1]]
    wrds <- tolower(wrds)
    wrds <- tm::removePunctuation(wrds)
    wrds <- tm::removeNumbers(wrds)
    wrds <- wrds[nchar(wrds) >= min_wrd_sz]
    # Crude de-pluralisation: both subs apply in turn, so e.g.
    # "classes" -> "class" -> "clas".
    wrds <- sub('es$', "", wrds)  # remove plurals
    wrds <- sub('s$', "", wrds)
    wrds <- unique(wrds)  # each word counted once per text
    wrds
  }
  wrds <- lapply(txts, cleanWrds)  # functional map, no index loop needed
  nreps <- lengths(wrds)  # lengths() replaces unlist(lapply(x, length))
  wts <- rep(wts, times=nreps)  # expand one weight per (text, word) pair
  wrds <- unlist(wrds)
  tbl_wrds <- tapply(wts, wrds, sum)  # sum weights per distinct word
  tbl_wrds <- tbl_wrds[tbl_wrds > min_freq]
  sort(tbl_wrds, decreasing=TRUE)
}
# DATA
# Three sample passages, each describing a different string-similarity
# metric (Levenshtein, Jaccard, Jaro-Winkler). The "[1]"/"[2]:32" markers
# suggest these were copied from encyclopedia articles — TODO confirm
# provenance. Used below as inputs to calcStrDst() and getWrdFrq().
text_1 <-
"In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other. It is named after Vladimir Levenshtein, who considered this distance in 1965.[1]
Levenshtein distance may also be referred to as edit distance, although that may also denote a larger family of distance metrics.[2]:32 It is closely related to pairwise string alignments."
text_2 <- "The Jaccard index, also known as the Jaccard similarity coefficient (originally coined coefficient de communauté by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets"
text_3 <- "In computer science and statistics, the Jaro–Winkler distance (Winkler, 1990) is a measure of similarity between two strings. It is a variant of the Jaro distance metric (Jaro, 1989, 1995), a type of string edit distance, and was developed in the area of record linkage (duplicate detection) (Winkler, 1990). The lower the Jaro–Winkler distance for two strings is, the more similar the strings are. The Jaro-Winkler similarity (for equation see below) is given by 1 - Jaro Winkler distance. The Jaro–Winkler distance metric is designed and best suited for short strings such as person names. The similarity score is normalized such that 0 equates to no similarity and 1 is an exact match."
# TEXT DISTANCES
# Build a symmetric 3x3 distance matrix from the mean pairwise
# Levenshtein distance of each pair of texts, then cluster it.
lbls <- c('t1', 't2', 't3')
dmtrx <- matrix(0, nrow = 3, ncol = 3, dimnames = list(lbls, lbls))
# Mean Levenshtein distance between two single texts.
pairLv <- function(a, b) calcStrDst(c(a, b))[['lv_mean']]
dmtrx['t1', 't2'] <- dmtrx['t2', 't1'] <- pairLv(text_1, text_2)
dmtrx['t1', 't3'] <- dmtrx['t3', 't1'] <- pairLv(text_1, text_3)
dmtrx['t2', 't3'] <- dmtrx['t3', 't2'] <- pairLv(text_2, text_3)
plot(hclust(dist(dmtrx)))
# WORD DIFFERENCES
# Repeat each text ten times so word weights clear getWrdFrq's min_freq
# threshold, then compare texts by vocabulary overlap.
texts_1 <- rep(text_1, 10)
# A few artefacts (e.g. "\n\nlevenshtein") remain, which an amendment to
# the regex cleaning could resolve.
wrd_frq_1 <- getWrdFrq(texts_1)
texts_2 <- rep(text_2, 10)
wrd_frq_2 <- getWrdFrq(texts_2)  # avoid words like 'between', 'after' in post-filter
texts_3 <- rep(text_3, 10)
wrd_frq_3 <- getWrdFrq(texts_3)  # avoid words like 'between', 'after'
# Asymmetric distance: proportion of words in frq_a absent from frq_b.
propMissing <- function(frq_a, frq_b) {
  sum(!names(frq_a) %in% names(frq_b)) / length(frq_a)
}
lbls <- c('t1', 't2', 't3')
frqs <- list(t1 = wrd_frq_1, t2 = wrd_frq_2, t3 = wrd_frq_3)
dmtrx <- matrix(0, nrow = 3, ncol = 3, dimnames = list(lbls, lbls))
for (a in lbls) {
  for (b in lbls) {
    if (a != b) {
      dmtrx[a, b] <- propMissing(frqs[[a]], frqs[[b]])
    }
  }
}
plot(hclust(dist(dmtrx)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment