Skip to content

Instantly share code, notes, and snippets.

@jasonost
Last active August 29, 2015 14:10
Show Gist options
  • Save jasonost/7ca980986a9d6aa7a774 to your computer and use it in GitHub Desktop.
Save jasonost/7ca980986a9d6aa7a774 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "### Features created\n\n##### Assignment behavior\n\n* Number of unique assignments submitted\n* Number of total assignments submitted\n* Can calculate average number of times each assignment was submitted\n* Average improvement from first assignment submission to last\n\n##### Reading behavior\n\n"
},
{
"metadata": {},
"cell_type": "code",
"input": "import pandas as pd, numpy as np, datetime, random, cPickle as pickle\nfrom __future__ import division\npd.set_option('max_colwidth', 200)\npd.set_option('max_rows', 200)",
"prompt_number": 1,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Read 2011 data"
},
{
"metadata": {},
"cell_type": "code",
"input": "base_path = '../data/'\nfile_2011 = 'stark_2011_events.csv'\ngrades_2011 = 'anonymized11.csv'",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "events = pd.read_csv('%s%s' % (base_path,file_2011),\n skiprows=1,\n names=['userId',\n 'updatedAt',\n 'eventType',\n 'activityType',\n 'activityTitle',\n 'topicTitle',\n 'timeMinutes',\n 'scorePercent',\n 'anchorValue',\n 'anchorSelection',\n 'topicId',\n 'activityId',\n 'activityEventId',\n 'updateCount'],\n parse_dates=[\"updatedAt\"],\n na_values=[\"\\N\"],\n delimiter=\"\\t\"\n )\n\ngrades = pd.read_csv('%s%s' % (base_path,grades_2011),\n skiprows=1,\n names=[\"dropcol\",\n \"userId\",\n \"Set0\",\n \"Set1\",\n \"Set2\",\n \"Set3\",\n \"Set4\",\n \"Set5\",\n \"Set6\",\n \"Set7\",\n \"Set8\",\n \"Set9\",\n \"Set10\",\n \"Set11\",\n \"Set12\",\n \"Set13\",\n \"Set14\",\n \"Set15\",\n \"Set16\",\n \"Set17\",\n \"Set18\",\n \"Set19\",\n \"Set20\",\n \"Set21\",\n \"Set22\",\n \"Set23\",\n \"Set24\",\n \"Set25\",\n \"Set26\",\n \"Set27\",\n \"Set28\",\n \"demerit\",\n \"s0\",\n \"s1\",\n \"s2\",\n \"s3\",\n \"s4\",\n \"s5\",\n \"s6\",\n \"s7\",\n \"s8\",\n \"s9\",\n \"s10\",\n \"s11\",\n \"s12\",\n \"s13\",\n \"s14\",\n \"s15\",\n \"s16\",\n \"s17\",\n \"s18\",\n \"s19\",\n \"s20\",\n \"s21\",\n \"s22\",\n \"s23\",\n \"s24\",\n \"s25\",\n \"s26\",\n \"s27\",\n \"s28\",\n \"set2Adj\",\n \"set3Adj\",\n \"set4Adj\",\n \"set5Adj\",\n \"set9Adj\",\n \"set13Adj\",\n \"set23Adj\",\n \"Cred.Code\",\n \"hw\",\n \"final\",\n \"course\",\n \"letter\",\n \"Comment\"])\n\ngrades = grades[grades.course.apply(lambda x: pd.notnull(x))]\ndel grades['dropcol']",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "events[:5]",
"prompt_number": 4,
"outputs": [
{
"output_type": "pyout",
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>userId</th>\n <th>updatedAt</th>\n <th>eventType</th>\n <th>activityType</th>\n <th>activityTitle</th>\n <th>topicTitle</th>\n <th>timeMinutes</th>\n <th>scorePercent</th>\n <th>anchorValue</th>\n <th>anchorSelection</th>\n <th>topicId</th>\n <th>activityId</th>\n <th>activityEventId</th>\n <th>updateCount</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> b678938e-c784-4a46-8c32-9babd9dd8b05</td>\n <td>2011-07-20 00:26:38</td>\n <td> CLOSED</td>\n <td> LISTEN</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> 0</td>\n <td>NaN</td>\n <td> NaN</td>\n <td>NaN</td>\n <td> 0OL-NSOMP_DA8YG95U85S1CUL3</td>\n <td> 0NL-TX9HXSEAYIDJBU85S1CUL3</td>\n <td> -6A-LS9UAFGA_7ER2IRQZNBUL3</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> ec43374b-90d3-4b95-9590-c1f5a731bee8</td>\n <td>2011-07-16 00:25:54</td>\n <td> OPENED</td>\n <td> LISTEN</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td>NaN</td>\n <td>NaN</td>\n <td> NaN</td>\n <td>NaN</td>\n <td> 0OL-NSOMP_DA8YG95U85S1CUL3</td>\n <td> 0NL-TX9HXSEAYIDJBU85S1CUL3</td>\n <td> -SKJ3R2X7GA_7ER2IRQZNBUL3</td>\n <td> 0</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 8feddd5e-08fb-440e-947f-6026bad628ad</td>\n <td>2011-07-19 03:22:14</td>\n <td> CLOSED</td>\n <td> LISTEN</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> 0</td>\n <td>NaN</td>\n <td> NaN</td>\n <td>NaN</td>\n <td> 0OL-NSOMP_DA8YG95U85S1CUL3</td>\n <td> 0NL-TX9HXSEAYIDJBU85S1CUL3</td>\n <td> -TJAKFV3CGA_7ER2IRQZNBUL3</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>3</th>\n <td> cba5cb38-f633-4b41-a0f2-751b75f34940</td>\n <td>2011-07-20 01:35:23</td>\n <td> CLOSED</td>\n <td> LISTEN</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> 0</td>\n <td>NaN</td>\n <td> NaN</td>\n <td>NaN</td>\n <td> 0OL-NSOMP_DA8YG95U85S1CUL3</td>\n <td> 0NL-TX9HXSEAYIDJBU85S1CUL3</td>\n <td> 229-LS9UAFGA_7ER2IRQZNBUL3</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>4</th>\n <td> 0fc9a23f-a41e-4a07-ba18-8f63cf37cee4</td>\n <td>2011-07-19 14:16:58</td>\n <td> CLOSED</td>\n <td> LISTEN</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> A Box Model for the Let's Make a Deal Problem</td>\n <td> 0</td>\n <td>NaN</td>\n <td> NaN</td>\n <td>NaN</td>\n <td> 0OL-NSOMP_DA8YG95U85S1CUL3</td>\n <td> 0NL-TX9HXSEAYIDJBU85S1CUL3</td>\n <td> 29E-LS9UAFGA_7ER2IRQZNBUL3</td>\n <td> 1</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"prompt_number": 4,
"text": " userId updatedAt eventType \\\n0 b678938e-c784-4a46-8c32-9babd9dd8b05 2011-07-20 00:26:38 CLOSED \n1 ec43374b-90d3-4b95-9590-c1f5a731bee8 2011-07-16 00:25:54 OPENED \n2 8feddd5e-08fb-440e-947f-6026bad628ad 2011-07-19 03:22:14 CLOSED \n3 cba5cb38-f633-4b41-a0f2-751b75f34940 2011-07-20 01:35:23 CLOSED \n4 0fc9a23f-a41e-4a07-ba18-8f63cf37cee4 2011-07-19 14:16:58 CLOSED \n\n activityType activityTitle \\\n0 LISTEN A Box Model for the Let's Make a Deal Problem \n1 LISTEN A Box Model for the Let's Make a Deal Problem \n2 LISTEN A Box Model for the Let's Make a Deal Problem \n3 LISTEN A Box Model for the Let's Make a Deal Problem \n4 LISTEN A Box Model for the Let's Make a Deal Problem \n\n topicTitle timeMinutes scorePercent \\\n0 A Box Model for the Let's Make a Deal Problem 0 NaN \n1 A Box Model for the Let's Make a Deal Problem NaN NaN \n2 A Box Model for the Let's Make a Deal Problem 0 NaN \n3 A Box Model for the Let's Make a Deal Problem 0 NaN \n4 A Box Model for the Let's Make a Deal Problem 0 NaN \n\n anchorValue anchorSelection topicId \\\n0 NaN NaN 0OL-NSOMP_DA8YG95U85S1CUL3 \n1 NaN NaN 0OL-NSOMP_DA8YG95U85S1CUL3 \n2 NaN NaN 0OL-NSOMP_DA8YG95U85S1CUL3 \n3 NaN NaN 0OL-NSOMP_DA8YG95U85S1CUL3 \n4 NaN NaN 0OL-NSOMP_DA8YG95U85S1CUL3 \n\n activityId activityEventId updateCount \n0 0NL-TX9HXSEAYIDJBU85S1CUL3 -6A-LS9UAFGA_7ER2IRQZNBUL3 1 \n1 0NL-TX9HXSEAYIDJBU85S1CUL3 -SKJ3R2X7GA_7ER2IRQZNBUL3 0 \n2 0NL-TX9HXSEAYIDJBU85S1CUL3 -TJAKFV3CGA_7ER2IRQZNBUL3 1 \n3 0NL-TX9HXSEAYIDJBU85S1CUL3 229-LS9UAFGA_7ER2IRQZNBUL3 1 \n4 0NL-TX9HXSEAYIDJBU85S1CUL3 29E-LS9UAFGA_7ER2IRQZNBUL3 1 "
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "events.groupby(['eventType']).count()[['userId']]",
"prompt_number": 5,
"outputs": [
{
"output_type": "pyout",
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>userId</th>\n </tr>\n <tr>\n <th>eventType</th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>CLOSED</th>\n <td> 801847</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 27657</td>\n </tr>\n <tr>\n <th>SUSPENDED</th>\n <td> 10688</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 537958</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"prompt_number": 5,
"text": " userId\neventType \nCLOSED 801847\nOPENED 27657\nSUSPENDED 10688\nWORKED 537958"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "events.groupby(['activityType']).count()[['userId']]",
"prompt_number": 6,
"outputs": [
{
"output_type": "pyout",
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>userId</th>\n </tr>\n <tr>\n <th>activityType</th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>ASSIGNMENT</th>\n <td> 683908</td>\n </tr>\n <tr>\n <th>DISCUSS</th>\n <td> 616</td>\n </tr>\n <tr>\n <th>LISTEN</th>\n <td> 15699</td>\n </tr>\n <tr>\n <th>PRACTICE</th>\n <td> 164862</td>\n </tr>\n <tr>\n <th>READ</th>\n <td> 478772</td>\n </tr>\n <tr>\n <th>WATCH</th>\n <td> 34293</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"prompt_number": 6,
"text": " userId\nactivityType \nASSIGNMENT 683908\nDISCUSS 616\nLISTEN 15699\nPRACTICE 164862\nREAD 478772\nWATCH 34293"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "events.groupby(['activityType','eventType']).count()[['userId']]",
"prompt_number": 7,
"outputs": [
{
"output_type": "pyout",
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th></th>\n <th>userId</th>\n </tr>\n <tr>\n <th>activityType</th>\n <th>eventType</th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th rowspan=\"3\" valign=\"top\">ASSIGNMENT</th>\n <th>CLOSED</th>\n <td> 608775</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 3532</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 71601</td>\n </tr>\n <tr>\n <th rowspan=\"4\" valign=\"top\">DISCUSS</th>\n <th>CLOSED</th>\n <td> 504</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 31</td>\n </tr>\n <tr>\n <th>SUSPENDED</th>\n <td> 64</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 17</td>\n </tr>\n <tr>\n <th rowspan=\"4\" valign=\"top\">LISTEN</th>\n <th>CLOSED</th>\n <td> 14251</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 1122</td>\n </tr>\n <tr>\n <th>SUSPENDED</th>\n <td> 294</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 32</td>\n </tr>\n <tr>\n <th rowspan=\"3\" valign=\"top\">PRACTICE</th>\n <th>CLOSED</th>\n <td> 111957</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 18833</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 34072</td>\n </tr>\n <tr>\n <th rowspan=\"3\" valign=\"top\">READ</th>\n <th>CLOSED</th>\n <td> 45525</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 2496</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 430751</td>\n </tr>\n <tr>\n <th rowspan=\"4\" valign=\"top\">WATCH</th>\n <th>CLOSED</th>\n <td> 20835</td>\n </tr>\n <tr>\n <th>OPENED</th>\n <td> 1643</td>\n </tr>\n <tr>\n <th>SUSPENDED</th>\n <td> 10330</td>\n </tr>\n <tr>\n <th>WORKED</th>\n <td> 1485</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"prompt_number": 7,
"text": " userId\nactivityType eventType \nASSIGNMENT CLOSED 608775\n OPENED 3532\n WORKED 71601\nDISCUSS CLOSED 504\n OPENED 31\n SUSPENDED 64\n WORKED 17\nLISTEN CLOSED 14251\n OPENED 1122\n SUSPENDED 294\n WORKED 32\nPRACTICE CLOSED 111957\n OPENED 18833\n WORKED 34072\nREAD CLOSED 45525\n OPENED 2496\n WORKED 430751\nWATCH CLOSED 20835\n OPENED 1643\n SUSPENDED 10330\n WORKED 1485"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#writer = pd.ExcelWriter('../data/event_oneuser.xlsx')\n#events[events.userId == '03ad0a94-bbb6-4e23-9704-4de918908205'].to_excel(writer,'Sheet1')\n#writer.save()",
"prompt_number": 8,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Drop rows with sketchy data"
},
{
"metadata": {},
"cell_type": "code",
"input": "events = events[(events.activityType != 'DISCUSS') & (events.eventType != 'OPENED')]",
"prompt_number": 9,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "submits = events[(events.activityType == 'ASSIGNMENT') & \n (events.eventType == 'CLOSED') & \n (events.anchorValue.apply(lambda x: pd.isnull(x))) &\n (events.scorePercent.apply(lambda x: pd.notnull(x))) &\n ((events.scorePercent + events.timeMinutes).apply(lambda x: x > 0))][['userId','updatedAt','topicTitle','timeMinutes','scorePercent']]",
"prompt_number": 10,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "submits2 = submits.set_index(['userId','topicTitle'])",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# procedure to determine average improvement from first submission to last submission on each assignment\ndef improve(uid):\n '''\n df: pandas dataframe with submission data\n uid: string, userId\n '''\n ua = submits[submits.userId == uid].groupby('topicTitle')\n topics = ua.topicTitle.nunique().index\n diffs = []\n for t in topics:\n scores = submits2.xs(uid).xs(t)\n if isinstance(scores,pd.core.series.Series):\n diffs.append(0)\n else:\n vals = scores.sort('updatedAt').scorePercent.values\n diffs.append(vals[-1] - vals[0])\n return float(sum(diffs)) / len(diffs)",
"prompt_number": 12,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "good_users = list(grades['userId'].values)\nassign_behav = {u: {'total_submit': submits.groupby('userId').topicTitle.count().loc[u],\n 'uniq_assign': submits.groupby('userId').topicTitle.nunique().loc[u],\n 'avg_diff': improve(u)\n }\n for u in good_users if u in submits.userId.values}",
"prompt_number": 13,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "assign_behav.items()[:5]",
"prompt_number": 14,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 14,
"metadata": {},
"text": "[('bf7aa87b-444a-4eff-9f81-b4078e6dccd3',\n {'avg_diff': 36.2962962962963, 'total_submit': 104, 'uniq_assign': 27}),\n ('40a0d886-bb53-49bb-a216-233d62ca0937',\n {'avg_diff': 19.714285714285715, 'total_submit': 100, 'uniq_assign': 21}),\n ('f13130e9-f9c9-4121-a428-632878ee9b67',\n {'avg_diff': 18.96153846153846, 'total_submit': 104, 'uniq_assign': 26}),\n ('229a2d19-5c1f-4be0-9a64-283203642e8a',\n {'avg_diff': 16.37037037037037, 'total_submit': 111, 'uniq_assign': 27}),\n ('147fd7b8-422a-4c8e-b0d9-184e586edbe7',\n {'avg_diff': 12.2, 'total_submit': 24, 'uniq_assign': 5})]"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"gist_id": "7ca980986a9d6aa7a774",
"name": "",
"signature": "sha256:2430583ce0e1defe46a596c8e79dd3162fa27ad70e502f522e69db9538c7ca10"
},
"nbformat": 3
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment