Last active
December 7, 2018 19:31
-
-
Save shawngraham/d8ac1f2eadde9f21bd7ef6157275471c to your computer and use it in GitHub Desktop.
walking through textreuse for andrew
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Walkthrough: pairwise text-reuse detection with the textreuse package.
# In RStudio, use Ctrl+Enter to run each line in turn.

# Install textreuse only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("textreuse", quietly = TRUE)) {
  install.packages("textreuse")
}

# Display the package's introductory vignette (shown in the help window in
# RStudio).
vignette("textreuse-introduction", package = "textreuse")

# Replace the placeholder below with the full path to the directory you are
# working in before running this line.
setwd("full-path-to-the-directory-you're-working-in")
# Check what directory you're in.
getwd()

## Have the texts you're interested in in a subdirectory called "corpus".
library(textreuse)

# Put the subdirectory into a variable.
# Do NOT use `dir <- system.file("corpus", package = "textreuse")` here:
# system.file() looks for files inside the installed package, not inside your
# working directory, so it returns "" for your own folder and the corpus
# cannot be built from it.
dir <- "corpus"
if (!dir.exists(dir)) {
  stop(
    "No '", dir, "' subdirectory found in ", getwd(),
    "; check setwd() and the folder name.",
    call. = FALSE
  )
}

# Start the text reuse: the corpus is tokenized into sequences of words
# 7 words in length. You might want to change that for your purposes, so
# change `n = 7` in the TextReuseCorpus() call below to whatever seems
# appropriate.
corpus <- TextReuseCorpus(
  dir = dir,
  meta = list(title = "Collosal Cave Adventure"),
  tokenizer = tokenize_ngrams, n = 7
)

# Check that everything is there:
corpus
names(corpus)

# Compute similarity between every pair of documents.
comparisons <- pairwise_compare(corpus, jaccard_similarity)
comparisons

# Turn it into a data frame if you want (computed once, then printed).
df <- pairwise_candidates(comparisons)
df
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(df)
}

# Write the results to file!
write.csv(comparisons, file = "textreuse-comparisons.csv")
#### If you've got a lot of data, comparing every pair of documents can be
#### really slow, so you'd do this instead (minhash + locality-sensitive
#### hashing). Assumes library(textreuse) has already been loaded.

# Point at your own "corpus" subdirectory directly. Do NOT use
# `dir <- system.file("corpus", package = "textreuse")`: system.file() looks
# inside the installed package rather than your working directory.
dir <- "corpus"

# Minhash function producing 200 signatures per document; the seed is fixed
# so the same minhash function is generated on every run.
minhash <- minhash_generator(200, seed = 235)

# Build the corpus from 5-word n-grams, computing minhashes as it loads.
ats <- TextReuseCorpus(
  dir = dir,
  tokenizer = tokenize_ngrams, n = 5,
  minhash_func = minhash
)

### Then the scores:
# NOTE(review): per the textreuse docs, `bands` should divide the number of
# minhashes evenly (here 200 / 50) -- keep that in mind if you change either.
buckets <- lsh(ats, bands = 50, progress = FALSE)
candidates <- lsh_candidates(buckets)
# Score only the candidate pairs found by LSH, not every possible pair.
scores <- lsh_compare(candidates, ats, jaccard_similarity, progress = FALSE)
scores

write.csv(scores, file = "textreuse-scores.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
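While the vignette says to do
dir <- system.file("corpus", package = "textreuse")
that seems to throw an error every time, presumably because system.file() looks for the folder inside the installed package rather than in your working directory. Use dir <- "corpus" instead.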