dhicks/parse_and_allocate.R

## parse_and_allocate.R
## Parses a roster page saved from Canvas and allocates students to groups within discussion sections

## To use:
## 1. Navigate to the People page in a Canvas course using your browser
## 2. Right-click on the page and select "Save as"
## 3a. `Rscript parse_and_allocate.R path/to/roster.html`
## -OR-
## 3b. Save it to `data_folder` as `roster.html`, then run `Rscript parse_and_allocate.R` or run the script in RStudio.

## Setup ----
library(tidyverse)
library(rvest)
library(assertthat)
library(randomizr)

## Target number of students per group
group_size = 5

## Input and output folders, given relative to the working directory
data_folder = file.path('..', 'data')
out_folder = file.path('..', 'out')
## Create the output folder if it is not there yet
if (!dir.exists(out_folder)) dir.create(out_folder)

## Choose the roster file: the first command-line argument when R is run
## non-interactively with at least one argument; otherwise fall back to
## `roster.html` in `data_folder`
args = commandArgs(trailingOnly = TRUE)
# message(args)
roster = if (interactive() || length(args) == 0) {
    file.path(data_folder, 'roster.html')
} else {
    args[[1]]
}
# message(roster)

## Load HTML ----
## Read the saved People page
raw = read_html(roster)

## Parsing ----
## Builds `dataf`: one row per student, with one lower-cased column per
## section type (the later steps use `lecture` and `discussion`)
dataf = raw %>%
    ## Extract roster table (the first table on the page)
    html_table() %>%
    .[[1]] %>%
    ## Convert to a tibble, w/ nicer names
    as_tibble(.name_repair = 'universal') %>%
    rename_all(tolower) %>%
    ## Columns we care about
    select(name, login.id, section, role) %>%
    ## Need to parse `section` and `role`:
    ## Can have multiple entries, corresponding to lecture and different discussion sections
    ## `str_split()` is vectorised and already returns one list element per
    ## row, so no `rowwise()` is needed to build the list-columns
    mutate(section = str_split(section, '[\n ]+'),
           role = str_split(role, '[\n ]+')) %>%
    ## Unnest both list-columns in parallel; filter down to only students
    unnest(c(section, role)) %>%
    filter(role == 'Student') %>%
    select(-role) %>%
    ## Split section into `type` and `number`, then spread into one column
    ## per type (e.g. `lecture` and `discussion`)
    separate(section, into = c('type', 'number')) %>%
    pivot_wider(names_from = type,
                values_from = number) %>%
    rename_all(tolower)

## Data validation ----
## Every student has a lecture value
assert_that(!anyNA(dataf$lecture),
            msg = 'missing value for lecture')
## The matching check for discussion is currently disabled
# assert_that(all(!is.na(dataf$discussion)),
#             msg = 'missing value for discussion')
# dataf %>%
#     filter(is.na(discussion))

## No row appears more than once
assert_that(!any(duplicated(dataf)),
            msg = 'duplicated rows')
## No login ID appears more than once
assert_that(anyDuplicated(dataf$login.id) == 0,
            msg = 'duplicated login IDs')

## Group allocation ----
## NB: the seed is the arithmetic expression 2020 - 8 - 24 (= 1988), not a
## date; it is left as written so reruns reproduce earlier allocations
set.seed(2020-08-24)
## Only students with a discussion section can be allocated
allocation = dataf %>%
    filter(!is.na(discussion))
## Number of distinct discussion sections
num_sections = n_distinct(allocation$discussion)
assert_that(num_sections > 0,
            msg = 'no students with a discussion section')
## Groups per section, sized from the students actually being allocated.
## Counting students without a discussion section (as `nrow(dataf)` does)
## would inflate the number of groups and leave them short of `group_size`
num_arms = ceiling(nrow(allocation) / group_size / num_sections)
## Randomly assign students to `num_arms` groups within each discussion
## section, then prefix the group label with the section
allocation = allocation %>%
    mutate(group = block_ra(discussion, num_arms = num_arms),
           group = str_c(discussion, '-', group))

## Validation: All groups within 1 of the target size
## Count students per group and require |size - group_size| <= 1 throughout
assert_that(
    all(abs(count(allocation, group)$n - group_size) <= 1),
    msg = 'group too far from group_size'
)

## Output ----
## Save the allocation to `out_folder` in two formats: CSV and Rds
write_csv(allocation, file.path(out_folder, 'roster.csv'))
write_rds(allocation, file.path(out_folder, 'roster.Rds'))
	## Parses a roster page saved from Canvas and allocates students to groups within discussion sections

	## To use:
	## 1. Navigate to the People page in a Canvas course using your browser
	## 2. Right-click on the page and select "Save as"
	## 3a. `Rscript parse_and_allocate.R path/to/roster.html`
	## -OR-
	## 3b. Save it to `data_folder` as `roster.html`, then run `Rscript parse_and_allocate.R` or run the script in RStudio.

	## Setup ----
	library(tidyverse)
	library(rvest)
	library(assertthat)
	library(randomizr)

	## Target number of students per group
	group_size = 5

	## Input and output folders, given relative to the working directory
	data_folder = file.path('..', 'data')
	out_folder = file.path('..', 'out')
	## Create the output folder if it is not there yet
	if (!dir.exists(out_folder)) dir.create(out_folder)

	## Choose the roster file: the first command-line argument when R is run
	## non-interactively with at least one argument; otherwise fall back to
	## `roster.html` in `data_folder`
	args = commandArgs(trailingOnly = TRUE)
	# message(args)
	roster = if (interactive() || length(args) == 0) {
	    file.path(data_folder, 'roster.html')
	} else {
	    args[[1]]
	}
	# message(roster)

	## Load HTML ----
	## Read the saved People page
	raw = read_html(roster)

	## Parsing ----
	## Builds `dataf`: one row per student, with one lower-cased column per
	## section type (the later steps use `lecture` and `discussion`)
	dataf = raw %>%
	    ## Extract roster table (the first table on the page)
	    html_table() %>%
	    .[[1]] %>%
	    ## Convert to a tibble, w/ nicer names
	    as_tibble(.name_repair = 'universal') %>%
	    rename_all(tolower) %>%
	    ## Columns we care about
	    select(name, login.id, section, role) %>%
	    ## Need to parse `section` and `role`:
	    ## Can have multiple entries, corresponding to lecture and different discussion sections
	    ## `str_split()` is vectorised and already returns one list element per
	    ## row, so no `rowwise()` is needed to build the list-columns
	    mutate(section = str_split(section, '[\n ]+'),
	           role = str_split(role, '[\n ]+')) %>%
	    ## Unnest both list-columns in parallel; filter down to only students
	    unnest(c(section, role)) %>%
	    filter(role == 'Student') %>%
	    select(-role) %>%
	    ## Split section into `type` and `number`, then spread into one column
	    ## per type (e.g. `lecture` and `discussion`)
	    separate(section, into = c('type', 'number')) %>%
	    pivot_wider(names_from = type,
	                values_from = number) %>%
	    rename_all(tolower)

	## Data validation ----
	## Every student has a lecture value
	assert_that(!anyNA(dataf$lecture),
	            msg = 'missing value for lecture')
	## The matching check for discussion is currently disabled
	# assert_that(all(!is.na(dataf$discussion)),
	# msg = 'missing value for discussion')
	# dataf %>%
	# filter(is.na(discussion))

	## No row appears more than once
	assert_that(!any(duplicated(dataf)),
	            msg = 'duplicated rows')
	## No login ID appears more than once
	assert_that(anyDuplicated(dataf$login.id) == 0,
	            msg = 'duplicated login IDs')

	## Group allocation ----
	## NB: the seed is the arithmetic expression 2020 - 8 - 24 (= 1988), not a
	## date; it is left as written so reruns reproduce earlier allocations
	set.seed(2020-08-24)
	## Only students with a discussion section can be allocated
	allocation = dataf %>%
	    filter(!is.na(discussion))
	## Number of distinct discussion sections
	num_sections = n_distinct(allocation$discussion)
	assert_that(num_sections > 0,
	            msg = 'no students with a discussion section')
	## Groups per section, sized from the students actually being allocated.
	## Counting students without a discussion section (as `nrow(dataf)` does)
	## would inflate the number of groups and leave them short of `group_size`
	num_arms = ceiling(nrow(allocation) / group_size / num_sections)
	## Randomly assign students to `num_arms` groups within each discussion
	## section, then prefix the group label with the section
	allocation = allocation %>%
	    mutate(group = block_ra(discussion, num_arms = num_arms),
	           group = str_c(discussion, '-', group))

	## Validation: All groups within 1 of the target size
	## Count students per group and require |size - group_size| <= 1 throughout
	assert_that(
	    all(abs(count(allocation, group)$n - group_size) <= 1),
	    msg = 'group too far from group_size'
	)

	## Output ----
	## Save the allocation to `out_folder` in two formats: CSV and Rds
	write_csv(allocation, file.path(out_folder, 'roster.csv'))
	write_rds(allocation, file.path(out_folder, 'roster.Rds'))