Skip to content

Instantly share code, notes, and snippets.

@dhicks
Created August 24, 2020 16:55
Show Gist options
  • Save dhicks/757698b289d6e762eb2d74c87cf42c40 to your computer and use it in GitHub Desktop.
Save dhicks/757698b289d6e762eb2d74c87cf42c40 to your computer and use it in GitHub Desktop.
## Parses a roster page saved from Canvas and allocates students to groups within discussion sections
## To use:
## 1. Navigate to the People page in a Canvas course using your browser
## 2. Right-click on the page and select "Save as"
## 3a. `Rscript parse.R path/to/roster.html`
## -OR-
## 3b. Save it to `data_folder` as `roster.html`, then run `Rscript parse.R` or run the script in RStudio.
## Setup ----
library(tidyverse)
library(rvest)
library(assertthat)
library(randomizr)
## Tunable settings
group_size = 5 ## Desired number of students per group
data_folder = file.path('..', 'data') ## Default location of the saved roster
out_folder = file.path('..', 'out') ## Where the allocation files are written
## Create the output folder on first run
if (!dir.exists(out_folder)) dir.create(out_folder)
## Roster path: the first command-line argument when running non-interactively
## with arguments, otherwise `roster.html` inside `data_folder`
args = commandArgs(trailingOnly = TRUE)
# message(args)
roster = if (!interactive() && length(args) > 0) {
  args[[1]]
} else {
  file.path(data_folder, 'roster.html')
}
# message(roster)
## Load HTML ----
raw = read_html(roster)
## Every <table> found in the saved page, as a list
roster_tables = html_table(raw)
## Fail with a readable message (rather than "subscript out of bounds")
## when the saved page contains no table at all
assert_that(length(roster_tables) > 0,
            msg = 'no table found in roster file')
## Parsing ----
## Result: one row per student, with one column per section type
## (the validation below expects `lecture` and `discussion`)
dataf = roster_tables[[1]] %>%
  ## The roster is taken to be the first table on the page.
  ## Convert to a tibble, w/ syntactic, lower-case names
  as_tibble(.name_repair = 'universal') %>%
  rename_all(tolower) %>%
  ## Columns we care about
  select(name, login.id, section, role) %>%
  ## `section` and `role` can hold multiple entries per student,
  ## corresponding to lecture and different discussion sections;
  ## split them on runs of newlines/spaces into list-columns
  rowwise() %>%
  mutate_at(vars(section, role),
            ~ str_split(., '[\n ]+')) %>%
  ## Unnest both list-columns in parallel, so each section entry stays
  ## paired with its role; filter down to only students
  unnest(c(section, role)) %>%
  filter(role == 'Student') %>%
  select(-role) %>%
  ## Split each section entry into its `type` and `number` parts,
  ## then spread the types into columns of their own
  separate(section, into = c('type', 'number')) %>%
  pivot_wider(names_from = type,
              values_from = number) %>%
  ## The new columns are named after the section types; lower-case them
  rename_all(tolower)
## Data validation ----
## Each check stops the script with its `msg` when it fails.
## Every student has a lecture value
assert_that(!anyNA(dataf$lecture),
            msg = 'missing value for lecture')
## Disabled check: every student has a discussion value
# assert_that(all(!is.na(dataf$discussion)),
# msg = 'missing value for discussion')
# dataf %>%
# filter(is.na(discussion))
## No row appears more than once
assert_that(!any(duplicated(dataf)),
            msg = 'duplicated rows')
## No login ID appears more than once
assert_that(anyDuplicated(dataf$login.id) == 0,
            msg = 'duplicated login IDs')
## Group allocation ----
## The seed was originally written as `2020-08-24`, which R evaluates as the
## arithmetic 2020 - 8 - 24 = 1988. The value is spelled out here so that
## previously generated allocations stay reproducible.
set.seed(1988)
## Only students who have a discussion section can be placed in a group
with_discussion = dataf %>%
  filter(!is.na(discussion))
## Number of distinct discussion sections
num_sections = length(unique(with_discussion$discussion))
assert_that(num_sections > 0,
            msg = 'no students with a discussion section')
## Groups per section, from the average section size.
## Counts only the students actually being allocated; students without a
## discussion section must not inflate the number of groups.
num_arms = ceiling(nrow(with_discussion) / group_size / num_sections)
## Randomly assign students to `num_arms` groups within each section
## (blocked on `discussion`), then label groups as `<discussion>-<group>`
allocation = with_discussion %>%
  mutate(group = block_ra(discussion, num_arms = num_arms),
         group = str_c(discussion, '-', group))
## Validation: All groups within 1 of the target size
## NB: every section gets the same number of groups, so sections of very
## different sizes can fail this check.
group_sizes = allocation %>%
  count(group) %>%
  pull(n)
assert_that(all(abs(group_sizes - group_size) <= 1),
            msg = 'group too far from group_size')
## Output ----
## Save the allocation twice in `out_folder`: as Rds and as CSV
rds_path = file.path(out_folder, 'roster.Rds')
csv_path = file.path(out_folder, 'roster.csv')
write_rds(allocation, rds_path)
write_csv(allocation, csv_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment