Created
February 11, 2017 16:25
-
-
Save DomBennett/eaf08157533561a5a1acab96e57e2454 to your computer and use it in GitHub Desktop.
Calculating text distances using string distance metrics and word frequencies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# EXAMPLE TEXT DISTANCES
# FUNCTIONS
justText <- function(txt) {
  # Strip a character vector down to its readable form: drop every
  # character that is not a latin letter or a space (punctuation,
  # digits, newlines, etc.). Vectorised over `txt`.
  cleaned <- gsub("[^a-zA-Z ]", "", txt)
  cleaned
}
calcStrDst <- function(txts) {
  # Summarise pairwise string distances between a vector of texts.
  #
  # Args:
  #   txts: a character vector of texts, or a list of character vectors;
  #         for a list, one text is sampled at random from each element
  #         (so results are stochastic in that case).
  # Returns:
  #   A one-row data.frame with min/max/mean/sd/median of the pairwise
  #   Levenshtein ("lv"), cosine and Jaro-Winkler ("jw") distances, in
  #   columns named e.g. lv_min, ..., jw_median.
  # Note: sd() is NA when there is only one pair (two texts).
  if (is.list(txts)) {  # is.list(), not class() == "list": robust to subclasses
    txts <- unlist(lapply(txts, function(x) sample(x, 1)))
  }
  if (length(txts) < 2) {
    stop("'txts' must contain at least two texts", call. = FALSE)
  }
  cmbs <- combn(length(txts), 2)  # all unordered pairs, one per column
  n_pairs <- ncol(cmbs)
  jw_dsts <- lv_dsts <- cosine_dsts <- rep(NA_real_, n_pairs)
  for (i in seq_len(n_pairs)) {
    a <- justText(txts[cmbs[, i]][[1]])
    b <- justText(txts[cmbs[, i]][[2]])
    lv_dsts[i] <- stringdist::stringdist(a, b, method = "lv")
    cosine_dsts[i] <- stringdist::stringdist(a, b, method = "cosine")
    jw_dsts[i] <- stringdist::stringdist(a, b, method = "jw")
  }
  # Build min/max/mean/sd/median columns for one metric, prefixed by its name.
  smry <- function(x, prefix) {
    res <- data.frame(min(x), max(x), mean(x), sd(x), median(x))
    names(res) <- paste(prefix, c("min", "max", "mean", "sd", "median"),
                        sep = "_")
    res
  }
  cbind(smry(lv_dsts, "lv"), smry(cosine_dsts, "cosine"), smry(jw_dsts, "jw"))
}
getWrdFrq <- function(txts, wts=rep(1, length(txts)), min_wrd_sz=5, min_freq=5) {
  # Weighted frequency of the unique words appearing in each text.
  #
  # Args:
  #   txts: character vector of texts.
  #   wts: one weight per text; each unique word in a text contributes
  #        that text's weight to the word's total (default: 1 each).
  #   min_wrd_sz: drop words shorter than this many characters.
  #   min_freq: keep only words whose summed weight is strictly greater
  #             than this.
  # Returns:
  #   A named numeric vector (word -> summed weight), sorted decreasing.
  cleanWrds <- function(txt) {
    # Tokenise one text into unique, lower-case, de-pluralised words.
    wrds <- strsplit(txt, " ")[[1]]
    wrds <- tolower(wrds)
    wrds <- tm::removePunctuation(wrds)
    wrds <- tm::removeNumbers(wrds)
    wrds <- wrds[nchar(wrds) >= min_wrd_sz]
    # Crude de-pluralisation: both subs apply in turn, so e.g.
    # "classes" -> "class" -> "clas".
    wrds <- sub('es$', "", wrds)  # remove plurals
    wrds <- sub('s$', "", wrds)
    wrds <- unique(wrds)  # each word counted once per text
    wrds
  }
  wrds <- lapply(txts, cleanWrds)  # functional map, no index loop needed
  nreps <- lengths(wrds)  # lengths() replaces unlist(lapply(x, length))
  wts <- rep(wts, times=nreps)  # expand one weight per (text, word) pair
  wrds <- unlist(wrds)
  tbl_wrds <- tapply(wts, wrds, sum)  # sum weights per distinct word
  tbl_wrds <- tbl_wrds[tbl_wrds > min_freq]
  sort(tbl_wrds, decreasing=TRUE)
}
# DATA
# Three sample passages, each describing a different string-similarity
# metric (Levenshtein, Jaccard, Jaro-Winkler). The "[1]"/"[2]:32" markers
# suggest these were copied from encyclopedia articles — TODO confirm
# provenance. Used below as inputs to calcStrDst() and getWrdFrq().
text_1 <-
"In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other. It is named after Vladimir Levenshtein, who considered this distance in 1965.[1]
Levenshtein distance may also be referred to as edit distance, although that may also denote a larger family of distance metrics.[2]:32 It is closely related to pairwise string alignments."
text_2 <- "The Jaccard index, also known as the Jaccard similarity coefficient (originally coined coefficient de communauté by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets"
text_3 <- "In computer science and statistics, the Jaro–Winkler distance (Winkler, 1990) is a measure of similarity between two strings. It is a variant of the Jaro distance metric (Jaro, 1989, 1995), a type of string edit distance, and was developed in the area of record linkage (duplicate detection) (Winkler, 1990). The lower the Jaro–Winkler distance for two strings is, the more similar the strings are. The Jaro-Winkler similarity (for equation see below) is given by 1 - Jaro Winkler distance. The Jaro–Winkler distance metric is designed and best suited for short strings such as person names. The similarity score is normalized such that 0 equates to no similarity and 1 is an exact match."
# TEXT DISTANCES
# Build a symmetric 3x3 distance matrix from the mean pairwise
# Levenshtein distance of each pair of texts, then cluster it.
lbls <- c('t1', 't2', 't3')
dmtrx <- matrix(0, nrow = 3, ncol = 3, dimnames = list(lbls, lbls))
# Mean Levenshtein distance between two single texts.
pairLv <- function(a, b) calcStrDst(c(a, b))[['lv_mean']]
dmtrx['t1', 't2'] <- dmtrx['t2', 't1'] <- pairLv(text_1, text_2)
dmtrx['t1', 't3'] <- dmtrx['t3', 't1'] <- pairLv(text_1, text_3)
dmtrx['t2', 't3'] <- dmtrx['t3', 't2'] <- pairLv(text_2, text_3)
plot(hclust(dist(dmtrx)))
# WORD DIFFERENCES
# Repeat each text ten times so word weights clear getWrdFrq's min_freq
# threshold, then compare texts by vocabulary overlap.
texts_1 <- rep(text_1, 10)
# A few artefacts (e.g. "\n\nlevenshtein") remain, which an amendment to
# the regex cleaning could resolve.
wrd_frq_1 <- getWrdFrq(texts_1)
texts_2 <- rep(text_2, 10)
wrd_frq_2 <- getWrdFrq(texts_2)  # avoid words like 'between', 'after' in post-filter
texts_3 <- rep(text_3, 10)
wrd_frq_3 <- getWrdFrq(texts_3)  # avoid words like 'between', 'after'
# Asymmetric distance: proportion of words in frq_a absent from frq_b.
propMissing <- function(frq_a, frq_b) {
  sum(!names(frq_a) %in% names(frq_b)) / length(frq_a)
}
lbls <- c('t1', 't2', 't3')
frqs <- list(t1 = wrd_frq_1, t2 = wrd_frq_2, t3 = wrd_frq_3)
dmtrx <- matrix(0, nrow = 3, ncol = 3, dimnames = list(lbls, lbls))
for (a in lbls) {
  for (b in lbls) {
    if (a != b) {
      dmtrx[a, b] <- propMissing(frqs[[a]], frqs[[b]])
    }
  }
}
plot(hclust(dist(dmtrx)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment