Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active May 10, 2022 18:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benmarwick/57d1c1ba265a2e5ab6c5f33b729b8fdd to your computer and use it in GitHub Desktop.
Save benmarwick/57d1c1ba265a2e5ab6c5f33b729b8fdd to your computer and use it in GitHub Desktop.
Analyse text reuse using minhash and locality-sensitive hashing (LSH)
library(tidyverse)
library(textreuse)
# Load the quiz responses: one row per student. To get the data, go to
# Canvas -> quiz -> 'quiz stats' -> 'student analysis'.
cnvs <- read_csv("quiz-responses-downloaded-from-canvas.csv")

# Keep only the column holding the text we want to compare, renamed to `q5`.
cnvs_q5 <-
  cnvs %>%
  select(q5 = contains("peers' reflections"))

# `contains()` may match zero or several columns. With several matches the
# renamed columns get numeric suffixes (q51, q52, ...), so `cnvs_q5$q5` would
# not exist. Fail early here instead of handing a missing column to the
# corpus-building step.
if (ncol(cnvs_q5) != 1L) {
  stop(
    "Expected exactly one column matching \"peers' reflections\", found ",
    ncol(cnvs_q5), ".",
    call. = FALSE
  )
}
# Set the hashing parameters: the number of minhashes per document and the
# number of LSH bands those hashes are split into (two hashes per band).
n_hashes <- 1000
n_bands <- n_hashes / 2

# The number of hashes must be evenly divisible by the number of bands.
# Stop the script if it is not, rather than only printing TRUE/FALSE and
# carrying on with invalid settings.
stopifnot(n_hashes %% n_bands == 0)
# Create the minhash function, producing `n_hashes` hashes with a fixed seed.
minhash <- minhash_generator(n = n_hashes, seed = 3552)

# Build the corpus from the `q5` text, hashing with `minhash` and keeping the
# tokens alongside the hashes.
cnvs_q5_crps <- TextReuseCorpus(
  text = cnvs_q5$q5,
  minhash_func = minhash,
  keep_tokens = TRUE
)
# Uncomment to inspect the LSH threshold implied by these settings:
# lsh_threshold(h = n_hashes, b = n_bands)

# Run the locality-sensitive hashing algorithm over the corpus, then extract
# the candidate pairs from the resulting buckets.
buckets <- lsh(cnvs_q5_crps, bands = n_bands)
candidates <- lsh_candidates(buckets)
# Score each candidate pair with Jaccard similarity and sort the table so the
# highest-scoring pairs come first.
comparison_tbl <-
  candidates %>%
  lsh_compare(cnvs_q5_crps, jaccard_similarity) %>%
  arrange(desc(score))
# Inspect the raw text of the top-scoring candidate pairs to see exactly what
# each student wrote. Show at most four pairs, and fewer when the comparison
# table has fewer rows, so indexing never runs past the end of the table.
n_pairs_to_show <- min(4L, nrow(comparison_tbl))
for (i in seq_len(n_pairs_to_show)) {
  # print() is explicit because values are not auto-printed inside a loop.
  print(content(cnvs_q5_crps[[comparison_tbl$a[i]]]))
  print(content(cnvs_q5_crps[[comparison_tbl$b[i]]]))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment