
@DomBennett
Created February 11, 2017 16:25
Calculating text distances using string distance metrics and word frequencies
# EXAMPLE TEXT DISTANCES
# FUNCTIONS
justText <- function(txt) {
  # strip everything that is not a letter or a space (punctuation, digits, newlines)
  gsub("[^a-zA-Z ]", "", txt)
}
calcStrDst <- function(txts) {
  # Calculate pairwise string distances between a vector of texts,
  # or between one randomly sampled text per element if given a list of text vectors
  if (is.list(txts)) {
    txts <- unlist(lapply(txts, function(x) sample(x, 1)))
  }
  cmbs <- combn(length(txts), 2)
  jw_dsts <- lv_dsts <- cosine_dsts <- rep(NA, ncol(cmbs))
  for (i in 1:ncol(cmbs)) {
    a <- justText(txts[cmbs[, i]][[1]])
    b <- justText(txts[cmbs[, i]][[2]])
    lv_dsts[i] <- stringdist::stringdist(a, b, method = "lv")
    cosine_dsts[i] <- stringdist::stringdist(a, b, method = "cosine")
    jw_dsts[i] <- stringdist::stringdist(a, b, method = "jw")
  }
  res <- data.frame('lv_min' = min(lv_dsts),
                    'lv_max' = max(lv_dsts),
                    'lv_mean' = mean(lv_dsts),
                    'lv_sd' = sd(lv_dsts),
                    'lv_median' = median(lv_dsts),
                    'cosine_min' = min(cosine_dsts),
                    'cosine_max' = max(cosine_dsts),
                    'cosine_mean' = mean(cosine_dsts),
                    'cosine_sd' = sd(cosine_dsts),
                    'cosine_median' = median(cosine_dsts),
                    'jw_min' = min(jw_dsts),
                    'jw_max' = max(jw_dsts),
                    'jw_mean' = mean(jw_dsts),
                    'jw_sd' = sd(jw_dsts),
                    'jw_median' = median(jw_dsts))
  res
}
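# For reference, the three stringdist methods used above are on different
# scales: "lv" is the raw count of single-character edits, while "cosine"
# (distance between q-gram count profiles) and "jw" are normalised to [0, 1].
# A minimal sketch on two toy strings (toy_a and toy_b are not part of the
# original gist):
toy_a <- "levenshtein distance"
toy_b <- "jaro winkler distance"
stringdist::stringdist(toy_a, toy_b, method = "lv")      # number of edits
stringdist::stringdist(toy_a, toy_b, method = "cosine")  # bounded in [0, 1]
stringdist::stringdist(toy_a, toy_b, method = "jw")      # bounded in [0, 1]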
getWrdFrq <- function(txts, wts = rep(1, length(txts)), min_wrd_sz = 5, min_freq = 5) {
  # Return the weighted frequency of words across the vector of texts `txts`.
  # Each text contributes its unique words once, weighted by the corresponding
  # entry of `wts`; words shorter than `min_wrd_sz` or with a summed weight of
  # `min_freq` or less are dropped.
  cleanWrds <- function(txt) {
    wrds <- strsplit(txt, " ")[[1]]
    wrds <- tolower(wrds)
    wrds <- tm::removePunctuation(wrds)
    wrds <- tm::removeNumbers(wrds)
    wrds <- wrds[nchar(wrds) >= min_wrd_sz]
    wrds <- sub('es$', "", wrds)  # crude plural stripping
    wrds <- sub('s$', "", wrds)
    wrds <- unique(wrds)
    wrds
  }
  wrds <- vector("list", length = length(txts))
  for (i in 1:length(txts)) {
    wrds[[i]] <- cleanWrds(txts[i])
  }
  nreps <- unlist(lapply(wrds, function(x) length(x)))
  wts <- rep(wts, times = nreps)
  wrds <- unlist(wrds)
  tbl_wrds <- tapply(wts, wrds, sum)
  tbl_wrds <- tbl_wrds[tbl_wrds > min_freq]
  sort(tbl_wrds, decreasing = TRUE)
}
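# Quick illustration of getWrdFrq on a toy input (toy_txts is an assumption,
# not part of the original gist): each text contributes its unique words once,
# so after six repeats every surviving word has a summed weight of 6, which
# passes the default min_freq of 5
toy_txts <- rep("the quick brown foxes jumped over the lazy brown dogs", 6)
getWrdFrq(toy_txts)  # e.g. brown, fox, jumped, quick -- each with weight 6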
# DATA
text_1 <-
"In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other. It is named after Vladimir Levenshtein, who considered this distance in 1965.[1]
Levenshtein distance may also be referred to as edit distance, although that may also denote a larger family of distance metrics.[2]:32 It is closely related to pairwise string alignments."
text_2 <- "The Jaccard index, also known as the Jaccard similarity coefficient (originally coined coefficient de communauté by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets"
text_3 <- "In computer science and statistics, the Jaro–Winkler distance (Winkler, 1990) is a measure of similarity between two strings. It is a variant of the Jaro distance metric (Jaro, 1989, 1995), a type of string edit distance, and was developed in the area of record linkage (duplicate detection) (Winkler, 1990). The lower the Jaro–Winkler distance for two strings is, the more similar the strings are. The Jaro-Winkler similarity (for equation see below) is given by 1 - Jaro Winkler distance. The Jaro–Winkler distance metric is designed and best suited for short strings such as person names. The similarity score is normalized such that 0 equates to no similarity and 1 is an exact match."
# TEXT DISTANCES
# let's create a distance matrix based on the mean Levenshtein distance for each pair of texts
dmtrx <- matrix(0, ncol=3, nrow=3)
colnames(dmtrx) <- rownames(dmtrx) <- c('t1', 't2', 't3')
dmtrx['t1', 't2'] <- dmtrx['t2', 't1'] <- calcStrDst(c(text_1, text_2))[['lv_mean']]
dmtrx['t1', 't3'] <- dmtrx['t3', 't1'] <- calcStrDst(c(text_1, text_3))[['lv_mean']]
dmtrx['t3', 't2'] <- dmtrx['t2', 't3'] <- calcStrDst(c(text_2, text_3))[['lv_mean']]
plot(hclust(dist(dmtrx)))
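# Note: dist() treats the rows of dmtrx as points and computes Euclidean
# distances between them; since dmtrx is already a symmetric distance matrix,
# an alternative (not in the original gist) is to hand it to hclust directly:
plot(hclust(as.dist(dmtrx)))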
# WORD DIFFERENCES
texts_1 <- rep(text_1, 10) # repeat to get a vector of texts
wrd_frq_1 <- getWrdFrq(texts_1) # a few artefacts remain (e.g. "\n\nlevenshtein") that amending the cleaning regexes could resolve
texts_2 <- rep(text_2, 10)
wrd_frq_2 <- getWrdFrq(texts_2) # stop words such as 'between' and 'after' could be dropped with a post-filter
texts_3 <- rep(text_3, 10)
wrd_frq_3 <- getWrdFrq(texts_3) # likewise, stop words such as 'between' and 'after'
dmtrx <- matrix(0, ncol=3, nrow=3)
colnames(dmtrx) <- rownames(dmtrx) <- c('t1', 't2', 't3')
# each entry: proportion of the row text's frequent words that are absent from
# the column text's frequent words
dmtrx['t1', 't2'] <- sum(!names(wrd_frq_1) %in% names(wrd_frq_2))/length(wrd_frq_1)
dmtrx['t2', 't1'] <- sum(!names(wrd_frq_2) %in% names(wrd_frq_1))/length(wrd_frq_2)
dmtrx['t1', 't3'] <- sum(!names(wrd_frq_1) %in% names(wrd_frq_3))/length(wrd_frq_1)
dmtrx['t3', 't1'] <- sum(!names(wrd_frq_3) %in% names(wrd_frq_1))/length(wrd_frq_3)
dmtrx['t3', 't2'] <- sum(!names(wrd_frq_3) %in% names(wrd_frq_2))/length(wrd_frq_3)
dmtrx['t2', 't3'] <- sum(!names(wrd_frq_2) %in% names(wrd_frq_3))/length(wrd_frq_2)
plot(hclust(dist(dmtrx)))
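# The word-overlap matrix above is asymmetric (the share of t1's frequent words
# missing from t2 need not equal the reverse), which dist() sidesteps by
# comparing whole rows. A possible alternative, not in the original gist, is to
# symmetrise by averaging and cluster on that directly:
sym_dmtrx <- (dmtrx + t(dmtrx)) / 2
plot(hclust(as.dist(sym_dmtrx)))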