benmarwick/gist:57d1c1ba265a2e5ab6c5f33b729b8fdd

## gistfile1.txt
library(tidyverse)
library(textreuse)

# Load the Canvas export: one row per student. To get the data, go to
# Canvas -> quiz -> 'quiz stats' -> 'student analysis'.
cnvs <- read_csv("quiz-responses-downloaded-from-canvas.csv")

# Select only the column with the text we want to compare, renamed to `q5`.
cnvs_q5 <-
  cnvs %>%
  select(q5 = contains("peers' reflections"))

# `contains()` can match zero or several columns. In both cases the result
# has no column named exactly `q5`, and `cnvs_q5$q5` would be NULL when the
# corpus is built. Fail early with a clear message instead.
if (!identical(names(cnvs_q5), "q5")) {
  stop(
    "Expected exactly one column matching \"peers' reflections\", found ",
    ncol(cnvs_q5), ".",
    call. = FALSE
  )
}

# Hashing parameters: the number of minhashes computed per document, and the
# number of bands passed to the locality-sensitive hashing step.
n_hashes <- 1000
n_bands <- n_hashes / 2

# The number of hashes must be evenly divisible by the number of bands.
# Enforce this so the script halts here if the parameters are edited into
# an invalid combination, rather than merely evaluating to FALSE.
stopifnot(n_hashes %% n_bands == 0)

# Minhash function producing `n_hashes` hashes per document, with a fixed seed.
minhash <- minhash_generator(n = n_hashes, seed = 3552)

# Build the corpus from the q5 responses, hashing each document with
# `minhash` and keeping its tokens.
cnvs_q5_crps <- TextReuseCorpus(
  text = cnvs_q5$q5,
  minhash_func = minhash,
  keep_tokens = TRUE
)

# Optional diagnostic, left disabled:
# lsh_threshold(h = n_hashes, b = n_bands)

# Run the locality-sensitive hashing algorithm over the corpus, then list
# the document pairs it flags as candidate matches.
buckets <- lsh(cnvs_q5_crps, bands = n_bands)
candidates <- lsh_candidates(buckets)

# Table of pairwise Jaccard similarity scores, highest score first.
comparison_tbl <- arrange(
  lsh_compare(candidates, cnvs_q5_crps, jaccard_similarity),
  desc(score)
)

# Inspect the raw text of the top-scoring candidate pairs to see exactly
# what was written. Looping over at most four rows avoids indexing past the
# end of `comparison_tbl` when fewer candidate pairs were found, and the
# explicit print() shows the text even when the script is run via source().
n_inspect <- min(4L, nrow(comparison_tbl))
for (i in seq_len(n_inspect)) {
  cat(sprintf(
    "\n--- pair %d: %s vs %s (score %.3f) ---\n",
    i,
    comparison_tbl$a[i],
    comparison_tbl$b[i],
    comparison_tbl$score[i]
  ))
  print(content(cnvs_q5_crps[[comparison_tbl$a[i]]]))
  print(content(cnvs_q5_crps[[comparison_tbl$b[i]]]))
}
	library(tidyverse)
	library(textreuse)

	# Load the Canvas export: one row per student. To get the data, go to
	# Canvas -> quiz -> 'quiz stats' -> 'student analysis'.
	cnvs <- read_csv("quiz-responses-downloaded-from-canvas.csv")

	# Select only the column with the text we want to compare, renamed to `q5`.
	cnvs_q5 <-
	  cnvs %>%
	  select(q5 = contains("peers' reflections"))

	# `contains()` can match zero or several columns. In both cases the result
	# has no column named exactly `q5`, and `cnvs_q5$q5` would be NULL when the
	# corpus is built. Fail early with a clear message instead.
	if (!identical(names(cnvs_q5), "q5")) {
	  stop(
	    "Expected exactly one column matching \"peers' reflections\", found ",
	    ncol(cnvs_q5), ".",
	    call. = FALSE
	  )
	}

	# Hashing parameters: the number of minhashes computed per document, and the
	# number of bands passed to the locality-sensitive hashing step.
	n_hashes <- 1000
	n_bands <- n_hashes / 2

	# The number of hashes must be evenly divisible by the number of bands.
	# Enforce this so the script halts here if the parameters are edited into
	# an invalid combination, rather than merely evaluating to FALSE.
	stopifnot(n_hashes %% n_bands == 0)

	# Minhash function producing `n_hashes` hashes per document, with a fixed seed.
	minhash <- minhash_generator(n = n_hashes, seed = 3552)

	# Build the corpus from the q5 responses, hashing each document with
	# `minhash` and keeping its tokens.
	cnvs_q5_crps <- TextReuseCorpus(
	  text = cnvs_q5$q5,
	  minhash_func = minhash,
	  keep_tokens = TRUE
	)

	# Optional diagnostic, left disabled:
	# lsh_threshold(h = n_hashes, b = n_bands)

	# Run the locality-sensitive hashing algorithm over the corpus, then list
	# the document pairs it flags as candidate matches.
	buckets <- lsh(cnvs_q5_crps, bands = n_bands)
	candidates <- lsh_candidates(buckets)

	# Table of pairwise Jaccard similarity scores, highest score first.
	comparison_tbl <- arrange(
	  lsh_compare(candidates, cnvs_q5_crps, jaccard_similarity),
	  desc(score)
	)

	# Inspect the raw text of the top-scoring candidate pairs to see exactly
	# what was written. Looping over at most four rows avoids indexing past the
	# end of `comparison_tbl` when fewer candidate pairs were found, and the
	# explicit print() shows the text even when the script is run via source().
	n_inspect <- min(4L, nrow(comparison_tbl))
	for (i in seq_len(n_inspect)) {
	  cat(sprintf(
	    "\n--- pair %d: %s vs %s (score %.3f) ---\n",
	    i,
	    comparison_tbl$a[i],
	    comparison_tbl$b[i],
	    comparison_tbl$score[i]
	  ))
	  print(content(cnvs_q5_crps[[comparison_tbl$a[i]]]))
	  print(content(cnvs_q5_crps[[comparison_tbl$b[i]]]))
	}