# markziemann/text_similarity_analysis.R

## text_similarity_analysis.R
library(stringr)
library(text2vec)

# Collect every plain-text file in the working directory. `pattern` is a
# regular expression (not a shell glob), so the dot is escaped and the match
# is anchored to the end of the name; the previous ".*.txt" also matched
# names such as "mytxt.csv" or "notes.txt.bak".
filelist <- list.files(pattern = "\\.txt$")

# One list element per file, each a character vector of that file's lines.
x <- lapply(filelist, readLines)

#' Normalise raw text ahead of tokenisation.
#'
#' Lower-cases the text, replaces every non-alphanumeric character with a
#' space, and collapses runs of whitespace into a single space.
#'
#' @param x A character vector with one document per element, or a list of
#'   character vectors (such as the per-file output of `readLines()`), in
#'   which case the lines of each element are first joined with one space.
#' @return A character vector of cleaned documents, one per element of `x`.
prep_fun <- function(x) {
  if (is.list(x)) {
    # Handing a list straight to stringr makes it fall back to
    # as.character(), which deparses a multi-line element into text like
    # 'c("line 1", "line 2")' and so injects a bogus "c" token into every
    # document. Join each element's lines into a single string instead.
    x <- vapply(
      x,
      function(doc) paste(doc, collapse = " "),
      character(1),
      USE.NAMES = FALSE
    )
  }
  x %>%
    # make text lower case
    str_to_lower() %>%
    # remove non-alphanumeric symbols
    str_replace_all("[^[:alnum:]]", " ") %>%
    # collapse multiple spaces
    str_replace_all("\\s+", " ")
}
# Keep the cleaned documents next to the raw text, under the `clean` slot.
x$clean <- prep_fun(x)

# Token iterator over the cleaned documents (text2vec's default tokenizer).
it <- itoken(x$clean, progressbar = FALSE)

# Vocabulary restricted to terms seen at least 3 times overall and in at most
# 10% of the documents.
# NOTE(review): with fewer than 10 documents every term sits in more than 10%
# of them, so this prune would leave an empty vocabulary - confirm the corpus
# size this threshold was chosen for.
v <- prune_vocabulary(
  create_vocabulary(it),
  doc_proportion_max = 0.1,
  term_count_min = 3
)

vectorizer <- vocab_vectorizer(v)

# Document-term matrix: one row per document, one column per retained term.
dtm <- create_dtm(it, vectorizer)

# Pairwise cosine similarity between documents, drawn without rescaling.
heatmap(
  as.matrix(sim2(dtm, method = "cosine", norm = "l2")),
  scale = "none"
)

# First 9-digit run found in the first cleaned document (NA when absent).
as.vector(str_match(x$clean[1], regex("(\\d{9})")))[1]
	library(stringr)
	library(text2vec)

	# NOTE(review): from here on the file repeats the script above line for
	# line; it looks like an accidental paste and could probably be removed.
	# Collect every plain-text file in the working directory. `pattern` is a
	# regular expression (not a shell glob), so the dot is escaped and the
	# match is anchored to the end of the name; the previous ".*.txt" also
	# matched names such as "mytxt.csv" or "notes.txt.bak".
	filelist <- list.files(pattern = "\\.txt$")

	# One list element per file, each a character vector of that file's lines.
	x <- lapply(filelist, readLines)

	#' Normalise raw text ahead of tokenisation.
	#'
	#' Lower-cases the text, replaces every non-alphanumeric character with a
	#' space, and collapses runs of whitespace into a single space.
	#'
	#' @param x A character vector with one document per element, or a list of
	#'   character vectors (such as the per-file output of `readLines()`), in
	#'   which case the lines of each element are first joined with one space.
	#' @return A character vector of cleaned documents, one per element of `x`.
	prep_fun <- function(x) {
	  if (is.list(x)) {
	    # Handing a list straight to stringr makes it fall back to
	    # as.character(), which deparses a multi-line element into text like
	    # 'c("line 1", "line 2")' and so injects a bogus "c" token into every
	    # document. Join each element's lines into a single string instead.
	    x <- vapply(
	      x,
	      function(doc) paste(doc, collapse = " "),
	      character(1),
	      USE.NAMES = FALSE
	    )
	  }
	  x %>%
	    # make text lower case
	    str_to_lower() %>%
	    # remove non-alphanumeric symbols
	    str_replace_all("[^[:alnum:]]", " ") %>%
	    # collapse multiple spaces
	    str_replace_all("\\s+", " ")
	}
	# Keep the cleaned documents next to the raw text, under the `clean` slot.
	x$clean <- prep_fun(x)

	# Token iterator over the cleaned documents (text2vec's default tokenizer).
	it <- itoken(x$clean, progressbar = FALSE)

	# Vocabulary restricted to terms seen at least 3 times overall and in at
	# most 10% of the documents.
	# NOTE(review): with fewer than 10 documents every term sits in more than
	# 10% of them, so this prune would leave an empty vocabulary - confirm the
	# corpus size this threshold was chosen for.
	v <- prune_vocabulary(
	  create_vocabulary(it),
	  doc_proportion_max = 0.1,
	  term_count_min = 3
	)

	vectorizer <- vocab_vectorizer(v)

	# Document-term matrix: one row per document, one column per retained term.
	dtm <- create_dtm(it, vectorizer)

	# Pairwise cosine similarity between documents, drawn without rescaling.
	heatmap(
	  as.matrix(sim2(dtm, method = "cosine", norm = "l2")),
	  scale = "none"
	)

	# First 9-digit run found in the first cleaned document (NA when absent).
	as.vector(str_match(x$clean[1], regex("(\\d{9})")))[1]