# Detect candidate near-duplicate documents in the "posts" corpus using
# minhash signatures and locality-sensitive hashing (textreuse package),
# then write the scored candidate pairs to a CSV file.
library(textreuse)

# Locate the corpus directory. system.file() returns "" when the path is
# not shipped with the package, so fall back to a local "posts" directory.
# NOTE(review): confirm which of the two locations is the intended one.
dir <- system.file("posts", package = "textreuse")
if (!nzchar(dir)) {
  dir <- "posts"
}
if (!dir.exists(dir)) {
  stop("Corpus directory not found: ", dir, call. = FALSE)
}

# Minhash function producing 200 hashes per document; the fixed seed keeps
# the signatures reproducible between runs.
minhash <- minhash_generator(n = 200, seed = 235)

# Build the corpus from the files in `dir`, tokenising each document into
# 5-grams and attaching a minhash signature to it.
ats <- TextReuseCorpus(
  dir = dir,
  tokenizer = tokenize_ngrams,
  n = 5,
  minhash_func = minhash
)

# Bucket the signatures with LSH: 200 hashes split into 50 bands.
buckets <- lsh(ats, bands = 50, progress = FALSE)

# Document pairs that share at least one bucket are candidate matches.
candidates <- lsh_candidates(buckets)

# Score only the candidate pairs with Jaccard similarity.
scores <- lsh_compare(
  candidates,
  ats,
  jaccard_similarity,
  progress = FALSE
)

# Persist the scored pairs to the working directory.
write.csv(scores, file = "textreusescores.csv")
