davclark/ungraded.py

## ungraded.py
#!/usr/bin/env python3

'''ungraded.py - extract answers to ungraded problems

Data is obtained from the `course_structure` and `courseware_studentmodule`
data, which are currently specified at the top of the file.

This was developed in response to a request from an instructor to obtain
individual student responses to ungraded questions *with 0 weight*. However,
upon inspection, I noted that all items had either 0 weight, or had weight
unspecified, so I extract answers for the (maybe?) more general class of *all*
ungraded problems. It's easy enough for an instructor to ignore a given file,
and once we've read the data in, that process is pretty fast - so this slightly
more general approach seems appropriate.
'''

from os import makedirs # Nicer than mkdir
import json

import pandas as pd

# Input files for one course export. These could become function parameters;
# most folks will need to replace them with the names of their own data files!
structure_fname = 'BerkeleyX-GG101x-1T2014-course_structure-prod-analytics.json'
student_fname = 'BerkeleyX-GG101x-1T2014-courseware_studentmodule-prod-analytics.sql'

# A JSON file with information about course content. JSON is UTF-8 by
# specification, so decode it explicitly rather than relying on the platform's
# default text encoding.
with open(structure_fname, encoding='utf-8') as f:
    course_structure = json.load(f)

# A sql dump that's actually a TSV file.
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after the
# path keyword-only, so a positional '\t' raises TypeError there.
# Reducing the amount of parsing & data retained here is likely the biggest
# optimization target.
studentmodule = pd.read_csv(student_fname, sep='\t', na_values='na')

# Student answers are unfortunately heavily quoted JSON (so you see things like
# "The following is \\"Quoted\\""), as well as ASCII-escaped unicode characters.
# I don't want to process ALL student answers, so I only process selected rows
# below using this function
def extract_student_answers(s):
    '''Return the 'student_answers' entry of one raw `state` value.

    The text is first passed through the `unicode_escape` codec, which undoes
    the extra layer of backslash quoting and the ASCII-escaped unicode
    characters, and is then parsed as JSON.

    Parameters:
        s: the raw state text. Anything that is not a `str` (for example the
           float NaN pandas uses for a missing cell) is treated as "no data".

    Returns:
        The value stored under 'student_answers', or an empty dict when that
        key is absent or `s` is not a string.

    Raises:
        json.JSONDecodeError: if the unescaped text is not valid JSON.
    '''
    # Missing cells are not strings and have no .encode(); report no answers
    # instead of crashing on them.
    if not isinstance(s, str):
        return {}
    # unicode_escape reads its input bytes as Latin-1. Encoding with Latin-1
    # therefore keeps characters up to U+00FF unchanged, and backslashreplace
    # turns everything above that into escapes which the decode step restores.
    # A plain encode() (UTF-8) would garble any non-ASCII character here.
    s = s.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    data = json.loads(s)
    # This appears to be where the student answers reliably occur. Of the data
    # I've looked at, this is a blob that also includes an escaped version of
    # the i4x index. I'm scared to delete it, so I leave it. It looks like this:
    # i4x-BerkeleyX-GG101x-problem-db71da27320a44bdb45df31d0d801e20_2_1
    # The initial index looked like this:
    # i4x://BerkeleyX/GG101x/problem/db71da27320a44bdb45df31d0d801e20
    # Note the lack of the trailing _2_1
    return data.get('student_answers', {})

# Maps a vertical's display name to a list of (problem display name,
# problem id) tuples for every problem found under an ungraded sequential.
ungraded = {}

# Essentially implementing Xpath with for loops. Maybe better to just convert to
# XML (or mongo). You'd need to convert from the reference approach to actual
# containment for such an approach to work
for desc in course_structure.values():
    # Based on Dav's exploration, all top-level containers are 'sequential'
    # is this guaranteed to be true? I don't have documentation.
    if desc['category'] != 'sequential':
        continue
    metadata = desc['metadata']
    # Based on Dav's explorations, graded 'sequential' objects have
    # 'graded': True and ungraded objects lack the attribute. Testing the
    # value rather than mere presence also treats an explicit
    # 'graded': False as ungraded.
    if metadata.get('graded'):
        continue
    for vid in desc['children']:
        vert = course_structure[vid]
        vert_name = vert['metadata']['display_name']
        for child_id in vert['children']:
            child = course_structure[child_id]
            # There are many other categories, but I'm not sure how to
            # make sense of all of them. Some instructors are interested
            # in, e.g., seeing how much of a video was played.
            if child['category'] == 'problem':
                # If you want to debug, print stuff here:
                # print('\t', child['metadata']['display_name'])
                ungraded.setdefault(vert_name, []).append(
                    (child['metadata']['display_name'], child_id))

# Create a hierarchy of directories and files corresponding to sections and
# student answers to ungraded selected problems: one directory per section,
# one TSV file per problem.
for section, problems in ungraded.items():
    # Might not work on Windows (same with .to_csv() below)
    # NOTE(review): display names are used verbatim as path components, so a
    # name containing '/' would change the layout - confirm names are path-safe.
    makedirs('ungraded_problems/' + section, exist_ok=True)
    for name, pid in problems:
        # Take an explicit copy of the matching rows. Adding a column to the
        # bare selection triggers pandas' SettingWithCopyWarning, and we only
        # want to decode answers for these rows, not for the whole table.
        raw_records = studentmodule.loc[studentmodule.module_id == pid].copy()
        raw_records['student_answers'] = raw_records.state.apply(
            extract_student_answers)
        outfname = 'ungraded_problems/{}/{}.tsv'.format(section, name)
        raw_records.to_csv(outfname, sep='\t')
	#!/usr/bin/env python3

	'''ungraded.py - extract answers to ungraded problems

	Data is obtained from the `course_structure` and `courseware_studentmodule`
	data, which are currently specified at the top of the file.

	This was developed in response to a request from an instructor to obtain
	individual student responses to ungraded questions with 0 weight. However,
	upon inspection, I noted that all items had either 0 weight, or had weight
	unspecified, so I extract answers for the (maybe?) more general class of all
	ungraded problems. It's easy enough for an instructor to ignore a given file,
	and once we've read the data in, that process is pretty fast - so this slightly
	more general approach seems appropriate.
	'''

	from os import makedirs # Nicer than mkdir
	import json

	import pandas as pd

	# Input files for one course export. These could become function parameters;
	# most folks will need to replace them with the names of their own data files!
	structure_fname = 'BerkeleyX-GG101x-1T2014-course_structure-prod-analytics.json'
	student_fname = 'BerkeleyX-GG101x-1T2014-courseware_studentmodule-prod-analytics.sql'

	# A JSON file with information about course content. JSON is UTF-8 by
	# specification, so decode it explicitly rather than relying on the platform's
	# default text encoding.
	with open(structure_fname, encoding='utf-8') as f:
	    course_structure = json.load(f)

	# A sql dump that's actually a TSV file.
	# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after the
	# path keyword-only, so a positional '\t' raises TypeError there.
	# Reducing the amount of parsing & data retained here is likely the biggest
	# optimization target.
	studentmodule = pd.read_csv(student_fname, sep='\t', na_values='na')

	# Student answers are unfortunately heavily quoted JSON (so you see things like
	# "The following is \\"Quoted\\""), as well as ASCII-escaped unicode characters.
	# I don't want to process ALL student answers, so I only process selected rows
	# below using this function
	def extract_student_answers(s):
	    '''Return the 'student_answers' entry of one raw `state` value.

	    The text is first passed through the `unicode_escape` codec, which undoes
	    the extra layer of backslash quoting and the ASCII-escaped unicode
	    characters, and is then parsed as JSON.

	    Parameters:
	        s: the raw state text. Anything that is not a `str` (for example the
	           float NaN pandas uses for a missing cell) is treated as "no data".

	    Returns:
	        The value stored under 'student_answers', or an empty dict when that
	        key is absent or `s` is not a string.

	    Raises:
	        json.JSONDecodeError: if the unescaped text is not valid JSON.
	    '''
	    # Missing cells are not strings and have no .encode(); report no answers
	    # instead of crashing on them.
	    if not isinstance(s, str):
	        return {}
	    # unicode_escape reads its input bytes as Latin-1. Encoding with Latin-1
	    # therefore keeps characters up to U+00FF unchanged, and backslashreplace
	    # turns everything above that into escapes which the decode step restores.
	    # A plain encode() (UTF-8) would garble any non-ASCII character here.
	    s = s.encode('latin-1', 'backslashreplace').decode('unicode_escape')
	    data = json.loads(s)
	    # This appears to be where the student answers reliably occur. Of the data
	    # I've looked at, this is a blob that also includes an escaped version of
	    # the i4x index. I'm scared to delete it, so I leave it. It looks like this:
	    # i4x-BerkeleyX-GG101x-problem-db71da27320a44bdb45df31d0d801e20_2_1
	    # The initial index looked like this:
	    # i4x://BerkeleyX/GG101x/problem/db71da27320a44bdb45df31d0d801e20
	    # Note the lack of the trailing _2_1
	    return data.get('student_answers', {})

	# Maps a vertical's display name to a list of (problem display name,
	# problem id) tuples for every problem found under an ungraded sequential.
	ungraded = {}

	# Essentially implementing Xpath with for loops. Maybe better to just convert to
	# XML (or mongo). You'd need to convert from the reference approach to actual
	# containment for such an approach to work
	for desc in course_structure.values():
	    # Based on Dav's exploration, all top-level containers are 'sequential'
	    # is this guaranteed to be true? I don't have documentation.
	    if desc['category'] != 'sequential':
	        continue
	    metadata = desc['metadata']
	    # Based on Dav's explorations, graded 'sequential' objects have
	    # 'graded': True and ungraded objects lack the attribute. Testing the
	    # value rather than mere presence also treats an explicit
	    # 'graded': False as ungraded.
	    if metadata.get('graded'):
	        continue
	    for vid in desc['children']:
	        vert = course_structure[vid]
	        vert_name = vert['metadata']['display_name']
	        for child_id in vert['children']:
	            child = course_structure[child_id]
	            # There are many other categories, but I'm not sure how to
	            # make sense of all of them. Some instructors are interested
	            # in, e.g., seeing how much of a video was played.
	            if child['category'] == 'problem':
	                # If you want to debug, print stuff here:
	                # print('\t', child['metadata']['display_name'])
	                ungraded.setdefault(vert_name, []).append(
	                    (child['metadata']['display_name'], child_id))

	# Create a hierarchy of directories and files corresponding to sections and
	# student answers to ungraded selected problems: one directory per section,
	# one TSV file per problem.
	for section, problems in ungraded.items():
	    # Might not work on Windows (same with .to_csv() below)
	    # NOTE(review): display names are used verbatim as path components, so a
	    # name containing '/' would change the layout - confirm names are path-safe.
	    makedirs('ungraded_problems/' + section, exist_ok=True)
	    for name, pid in problems:
	        # Take an explicit copy of the matching rows. Adding a column to the
	        # bare selection triggers pandas' SettingWithCopyWarning, and we only
	        # want to decode answers for these rows, not for the whole table.
	        raw_records = studentmodule.loc[studentmodule.module_id == pid].copy()
	        raw_records['student_answers'] = raw_records.state.apply(
	            extract_student_answers)
	        outfname = 'ungraded_problems/{}/{}.tsv'.format(section, name)
	        raw_records.to_csv(outfname, sep='\t')