Created
August 24, 2020 16:55
-
-
Save dhicks/757698b289d6e762eb2d74c87cf42c40 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Parses a roster page saved from Canvas and allocates students to groups within discussion sections
## To use:
## 1. Navigate to the People page in a Canvas course using your browser
## 2. Right-click on the page and select "Save as"
## 3a. Run `Rscript parse.R path/to/roster.html`
##     -OR-
## 3b. Save the page to `data_folder` as `roster.html`, then run `Rscript parse.R` or run the script in RStudio.
## Setup ---- | |
library(tidyverse) | |
library(rvest) | |
library(assertthat) | |
library(randomizr) | |
## Target number of students per group
group_size = 5

## Input and output locations, given relative to the working directory
data_folder = file.path('..', 'data')
out_folder = file.path('..', 'out')

## Create the output folder if it is not there yet
if (!dir.exists(out_folder)) dir.create(out_folder)
## Path to the roster file.
## Default: `roster.html` inside `data_folder`.
## Override: when R is not running interactively and at least one
## command-line argument was passed, the first argument is used instead.
args = commandArgs(trailingOnly = TRUE)
roster = file.path(data_folder, 'roster.html')
if (!interactive() && length(args) > 0) {
  roster = args[[1]]
}
## Debugging aids:
# message(args)
# message(roster)
## Load HTML ----
raw = read_html(roster)

## Parsing ----
## The roster is taken to be the first table found in the page
roster_table = html_table(raw)[[1]]

## Tidy up the table and expand it to one row per enrollment.
## `section` and `role` can each hold several entries (lecture plus different
## discussion sections) separated by newlines or spaces, so both columns are
## split into list columns and then unnested together.
enrollments = roster_table %>%
  as_tibble(.name_repair = 'universal') %>%
  rename_all(tolower) %>%
  select(name, login.id, section, role) %>%
  rowwise() %>%
  mutate_at(vars(section, role),
            ~ str_split(., '[\n ]+')) %>%
  unnest(c(section, role))

## Keep student enrollments only, then split each section entry into its type
## and number and spread the types into their own lower-cased columns
## (the validation further down relies on `lecture` and `discussion`).
dataf = enrollments %>%
  filter(role == 'Student') %>%
  select(-role) %>%
  separate(section, into = c('type', 'number')) %>%
  pivot_wider(names_from = type,
              values_from = number) %>%
  rename_all(tolower)
## Data validation ----
## Every student must have a lecture value
assert_that(!any(is.na(dataf$lecture)),
            msg = 'missing value for lecture')

## The equivalent check for discussion is disabled; rows with a missing
## discussion are filtered out in the group allocation step instead.
# assert_that(!any(is.na(dataf$discussion)),
#             msg = 'missing value for discussion')
# dataf %>%
#   filter(is.na(discussion))

## No row may occur more than once
assert_that(!any(duplicated(dataf)),
            msg = 'duplicated rows')

## No login ID may occur more than once:
## the number of distinct IDs has to match the number of rows
assert_that(n_distinct(dataf$login.id) == nrow(dataf),
            msg = 'duplicated login IDs')
## Group allocation ----
## NB: R evaluates `2020-08-24` as arithmetic (2020 - 8 - 24), so the seed
## actually used is 1988. The expression is left as-is to keep that seed.
set.seed(2020-08-24)

## Only students who have a discussion section can be placed in a group.
## Everything below is computed from this subset, so that students without a
## discussion do not inflate the number of groups per section.
sectioned = dataf %>%
  filter(!is.na(discussion))

## Number of distinct discussion sections
num_sections = n_distinct(sectioned$discussion)
if (num_sections == 0) {
  ## Fail early with a clear message rather than dividing by zero below
  stop('no students are assigned to a discussion section', call. = FALSE)
}

## Groups ("arms") per section: average section size over the target group
## size, rounded up. The same number of arms is used in every section.
num_arms = ceiling(nrow(sectioned) / group_size / num_sections)

## Randomly assign students to arms, blocking on discussion section, then
## label each group as `<discussion>-<arm>`
allocation = sectioned %>%
  mutate(group = block_ra(discussion, num_arms = num_arms),
         group = str_c(discussion, '-', group))
## Validation: every group's size differs from the target by at most 1
group_counts = allocation %>%
  count(group) %>%
  pull(n)
assert_that(all(abs(group_counts - group_size) <= 1),
            msg = 'group too far from group_size')
## Output ----
## Write the allocation to the output folder twice: once as RDS, once as CSV
rds_path = file.path(out_folder, 'roster.Rds')
csv_path = file.path(out_folder, 'roster.csv')
write_rds(allocation, rds_path)
write_csv(allocation, csv_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment