# shawngraham/textreuse.r

## textreuse.r
# use ctrl+enter to run each line in turn

# Install textreuse only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("textreuse", quietly = TRUE)) {
  install.packages("textreuse")
}

# next line just displays the help file for the package in the help window in R studio
vignette("textreuse-introduction", package = "textreuse")

# EDIT ME: replace the placeholder below with the full path to the directory
# you are working in (the one that contains the `corpus` subdirectory).
setwd("full-path-to-the-directory-you're-working-in")

# check what directory you're in
getwd()

## have the text you're interested in a subdirectory called corpus

library(textreuse)

# put the subdirectory into a variable
dir <- "corpus"

# Fail early, with a readable message, if the corpus folder is not where the
# rest of the script expects it to be.
if (!dir.exists(dir)) {
  stop("No '", dir, "' subdirectory found in ", getwd(), call. = FALSE)
}

# start the text reuse; it compares sequences of words 7 words in length. You
# might want to change that for your purposes: edit the `n = 7` argument in the
# TextReuseCorpus() call below to whatever seems appropriate.
#
# Do NOT build `dir` with system.file("corpus", package = "textreuse"):
# system.file() looks for files inside the installed package, not in your
# working directory, so it will not find your own `corpus` folder. Keep the
# plain `dir <- "corpus"` form instead.
corpus <- TextReuseCorpus(
  dir = dir,
  meta = list(title = "Colossal Cave Adventure"),
  tokenizer = tokenize_ngrams,
  n = 7
)

# check that everything is there:
corpus
names(corpus)

# compute similarity
comparisons <- pairwise_compare(corpus, jaccard_similarity)
comparisons

# turn it into a dataframe if you want (computed once, then displayed)
df <- pairwise_candidates(comparisons)
df
# View() opens the data viewer, which only exists in an interactive session.
if (interactive()) {
  View(df)
}

# write the results to file!
write.csv(comparisons, file = "textreuse-comparisons.csv")

#### With a lot of data the pairwise comparison above can be really slow, so
#### use minhashes and locality-sensitive hashing (lsh) instead.
# As before, point `dir` at your own corpus folder; the alternative
# `dir <- system.file("corpus", package = "textreuse")` is left out on purpose.
dir <- "corpus"

# 200 minhashes per document, with a fixed seed
minhash <- minhash_generator(n = 200, seed = 235)

# corpus of 5-word n-grams, each document carrying its minhash signature
ats <- TextReuseCorpus(
  dir = dir,
  tokenizer = tokenize_ngrams,
  n = 5,
  minhash_func = minhash
)

### then the scores:
# bucket the documents, pull out the candidate pairs, and score only those
buckets <- lsh(x = ats, bands = 50, progress = FALSE)
candidates <- lsh_candidates(buckets)
scores <- lsh_compare(
  candidates,
  ats,
  jaccard_similarity,
  progress = FALSE
)
scores

# save the scored candidate pairs
write.csv(x = scores, file = "textreuse-scores.csv")
	# NOTE: a tab-indented, verbatim second copy of the whole script used to
	# follow here. Running it only repeated the package install, rebuilt the
	# same corpora, recomputed identical scores (the minhash seed is fixed) and
	# overwrote the same two CSV files, so the duplicate has been removed.
	# The single copy above is the complete script.