Last active
May 10, 2022 18:56
-
-
Save benmarwick/57d1c1ba265a2e5ab6c5f33b729b8fdd to your computer and use it in GitHub Desktop.
Analyse text reuse using minhash and locality-sensitive hashing (LSH)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages: tidyverse supplies read_csv(), select(), arrange() and %>%;
# textreuse supplies the minhash / LSH functions used below.
library(tidyverse)
library(textreuse)

# One row per student. To get the data, go to
# Canvas -> quiz -> 'quiz stats' -> 'student analysis'.
cnvs <- read_csv("quiz-responses-downloaded-from-canvas.csv")

# Select only the column with the text we want to compare, renaming it to
# `q5` so the rest of the script can refer to it by a short, fixed name.
cnvs_q5 <-
  cnvs %>%
  select(q5 = contains("peers' reflections"))

# The rename only produces a column called `q5` when exactly one column name
# matches. No match leaves zero columns, and several matches get numbered
# names, so `cnvs_q5$q5` would not exist. Fail here with a clear message
# instead of further down.
if (!identical(names(cnvs_q5), "q5")) {
  stop(
    "Expected exactly one column matching \"peers' reflections\", found ",
    ncol(cnvs_q5), ".",
    call. = FALSE
  )
}
# Set the hashing parameters: how many minhash values are computed per
# document, and how many bands those values are split into for LSH.
n_hashes <- 1000
n_bands <- n_hashes / 2

# The number of hashes must be evenly divisible by the number of bands.
# Enforce it rather than just printing TRUE/FALSE and carrying on.
if (n_hashes %% n_bands != 0) {
  stop(
    "n_hashes (", n_hashes, ") must be evenly divisible by n_bands (",
    n_bands, ").",
    call. = FALSE
  )
}

# Minhash function producing `n_hashes` values per document, created with a
# fixed seed.
minhash <- minhash_generator(n = n_hashes, seed = 3552)

# Build the corpus from the selected response column, hashing each document
# with the minhash function above and keeping the tokens alongside the hashes.
cnvs_q5_crps <-
  TextReuseCorpus(
    text = cnvs_q5$q5,
    minhash_func = minhash,
    keep_tokens = TRUE
  )
# Optional check of the settings chosen above (left disabled, as before):
# lsh_threshold(h = n_hashes, b = n_bands)

# Locality-sensitive hashing: bucket the corpus using `n_bands` bands, then
# extract the candidate document pairs from those buckets.
buckets <- lsh(cnvs_q5_crps, bands = n_bands)
candidates <- lsh_candidates(buckets)

# Make a table of pairwise similarity scores for the candidate pairs, using
# Jaccard similarity, sorted so the highest-scoring pairs come first.
comparison_tbl <-
  lsh_compare(candidates, cnvs_q5_crps, jaccard_similarity) %>%
  arrange(desc(score))
# Inspect the raw text of the top scoring candidate pairs to see exactly
# what they wrote. Each pair is printed as document `a` followed by
# document `b`, in descending score order.
#
# At most four pairs are shown. The count is capped at the number of rows in
# `comparison_tbl` so that a short (or empty) table does not lead to indexing
# the corpus with a missing document id.
n_top <- min(4L, nrow(comparison_tbl))

for (i in seq_len(n_top)) {
  # explicit print(): values are not auto-printed inside a loop
  print(content(cnvs_q5_crps[[comparison_tbl$a[i]]]))
  print(content(cnvs_q5_crps[[comparison_tbl$b[i]]]))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment