Skip to content

Instantly share code, notes, and snippets.

@davclark
Created February 5, 2015 19:13
Show Gist options
  • Save davclark/6b6ed503d6a1e5e48516 to your computer and use it in GitHub Desktop.
Save davclark/6b6ed503d6a1e5e48516 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
'''ungraded.py - extract answers to ungraded problems
Data is obtained from the `course_structure` and `courseware_studentmodule`
data, which are currently specified at the top of the file.
This was developed in response to a request from an instructor to obtain
individual student responses to ungraded questions *with 0 weight*. However,
upon inspection, I noted that all items had either 0 weight, or had weight
unspecified, so I extract answers for the (maybe?) more general class of *all*
ungraded problems. It's easy enough for an instructor to ignore a given file,
and once we've read the data in, that process is pretty fast - so this slightly
more general approach seems appropriate.
'''
from os import makedirs # Nicer than mkdir
import json
import pandas as pd
# Could make these function parameters
# Most folks will need to replace with their own data!
structure_fname = 'BerkeleyX-GG101x-1T2014-course_structure-prod-analytics.json'
student_fname = 'BerkeleyX-GG101x-1T2014-courseware_studentmodule-prod-analytics.sql'

# A JSON file with information about course content. JSON interchange files
# are UTF-8, so name the encoding instead of relying on the platform default
# (which is not UTF-8 everywhere, e.g. on Windows).
with open(structure_fname, encoding='utf-8') as f:
    course_structure = json.load(f)

# A sql dump that's actually a TSV file
# Reducing the amount of parsing & data retained here is likely the biggest
# optimization target.
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only, so a positional '\t' raises TypeError there.
studentmodule = pd.read_csv(student_fname, sep='\t', na_values='na')
# Student answers are unfortunately heavily quoted JSON (so you see things like
# "The following is \\"Quoted\\""), as well as ASCII-escaped unicode characters.
# I don't want to process ALL student answers, so I only process selected rows
# below using this function
def extract_student_answers(s):
    '''Return the ``student_answers`` mapping stored in one raw state value.

    Parameters:
        s: one value of the ``state`` column - a JSON document whose
           backslash escapes carry an extra layer of quoting (so you see
           things like ``\\\\"Quoted\\\\"``). Missing values (``None``, NaN or a
           blank string) are accepted and treated as "no answers".

    Returns:
        The object stored under the ``'student_answers'`` key of the decoded
        JSON, or an empty dict when that key (or the whole state) is absent.

    Raises:
        json.JSONDecodeError: if the un-escaped text is not valid JSON.
        UnicodeDecodeError: if the text contains a malformed backslash escape.
    '''
    # pandas hands us NaN (a float) for empty cells; there is nothing to parse
    # for those, and calling .encode() on them would abort the whole .apply().
    if not isinstance(s, str) or not s.strip():
        return {}
    # Strip one layer of backslash escaping. The 'unicode_escape' codec reads
    # its input bytes as Latin-1, so the string must be encoded as Latin-1
    # too - a plain .encode() (UTF-8) would turn every non-ASCII character
    # into mojibake. Characters outside Latin-1 are written as \uXXXX escapes
    # by 'backslashreplace', which the decode step then turns back into the
    # original characters.
    s = s.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    data = json.loads(s)
    # This appears to be where the student answers reliably occur. Of the data
    # I've looked at, this is a blob that also includes an escaped version of
    # the i4x index. I'm scared to delete it, so I leave it. It looks like this:
    # i4x-BerkeleyX-GG101x-problem-db71da27320a44bdb45df31d0d801e20_2_1
    # The initial index looked like this:
    # i4x://BerkeleyX/GG101x/problem/db71da27320a44bdb45df31d0d801e20
    # Note the lack of the trailing _2_1
    return data.get('student_answers', {})
# Maps a vertical's display name to a list of
# (problem display name, problem id) tuples for every ungraded problem found.
ungraded = {}
# Essentially implementing Xpath with for loops. Maybe better to just convert to
# XML (or mongo). You'd need to convert from the reference approach to actual
# containment for such an approach to work
for desc in course_structure.values():
    # Based on Dav's exploration, all top-level containers are 'sequential'
    # is this guaranteed to be true? I don't have documentation.
    if desc['category'] != 'sequential':
        continue
    # Based on Dav's explorations, graded 'sequential' objects have
    # 'graded': True and ungraded objects lack this attribute. An explicit
    # falsy value (e.g. 'graded': False) is treated as ungraded as well.
    if desc['metadata'].get('graded'):
        continue
    # Children are stored as ids that must be looked up in course_structure
    for vid in desc['children']:
        vert = course_structure[vid]
        vert_name = vert['metadata']['display_name']
        for child_id in vert['children']:
            child = course_structure[child_id]
            # There are many other categories, but I'm not sure how to
            # make sense of all of them. Some instructors are interested
            # in, e.g., seeing how much of a video was played.
            if child['category'] == 'problem':
                # If you want to debug, print stuff here:
                # print('\t', child['metadata']['display_name'])
                ungraded.setdefault(vert_name, []).append(
                    (child['metadata']['display_name'], child_id))
# Create a hierarchy of directories and files corresponding to sections and
# student answers to ungraded selected problems
for section, problems in ungraded.items():
    # Display names are free text. A '/' inside one would be read as a path
    # separator (nesting directories, or pointing to_csv at a directory that
    # does not exist), so replace it before building paths.
    section_dir = 'ungraded_problems/' + section.replace('/', '_')
    # Might not work on Windows (same with .to_csv() below)
    makedirs(section_dir, exist_ok=True)
    for name, pid in problems:
        # Take an explicit copy of the matching rows: we only want to parse
        # answers for these rows (not the whole table), and adding a column
        # to a copy avoids pandas' SettingWithCopyWarning.
        raw_records = studentmodule.loc[studentmodule.module_id == pid].copy()
        raw_records['student_answers'] = raw_records.state.apply(
            extract_student_answers)
        # NOTE(review): problems sharing a display name within one section
        # write to the same file, so the later one overwrites the earlier.
        outfname = '{}/{}.tsv'.format(section_dir, name.replace('/', '_'))
        raw_records.to_csv(outfname, sep='\t')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment