Skip to content

Instantly share code, notes, and snippets.

@jasonost
Created October 27, 2014 19:30
Show Gist options
  • Save jasonost/5a81243a717060c358a6 to your computer and use it in GitHub Desktop.
Save jasonost/5a81243a717060c358a6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import pandas as pd, numpy as np, datetime, random, cPickle as pickle\nfrom sklearn import linear_model\nfrom sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix\nfrom sklearn.feature_selection import RFE\nfrom __future__ import division\npd.set_option('max_colwidth', 200)\n%matplotlib inline",
"prompt_number": 1,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Loading data"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Base file path"
},
{
"metadata": {},
"cell_type": "code",
"input": "base_path = 'data/HumanitiesSciences_Econ-1_Summer2014/'\nraw_file = 'HumanitiesSciences_Econ-1_Summer2014'\neng_file = 'engagement_HumanitiesSciences_Econ-1_Summer2014'",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Event Extract"
},
{
"metadata": {},
"cell_type": "code",
"input": "event = pd.read_csv('%s%s_EventXtract.csv' % (base_path,raw_file),\n skiprows=1,\n names=[\"anon_screen_name\",\n \"event_type\",\n \"ip_country\",\n \"time\",\n \"course_display_name\",\n \"resource_display_name\",\n \"success\",\n \"video_code\",\n \"video_current_time\",\n \"video_speed\",\n \"video_old_time\",\n \"video_new_time\",\n \"video_seek_type\",\n \"video_new_speed\",\n \"video_old_speed\",\n \"goto_from\",\n \"goto_dest\"],\n parse_dates=[\"time\"],\n na_values=[\"None\"]\n )",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Activity Grade"
},
{
"metadata": {},
"cell_type": "code",
"input": "grades = pd.read_csv('%s%s_ActivityGrade.csv' % (base_path,raw_file),\n skiprows=1,\n names=[\"activity_grade_id\",\n \"student_id\",\n \"course_display_name\",\n \"grade\",\n \"max_grade\",\n \"percent_grade\",\n \"parts_correctness\",\n \"answers\",\n \"num_attempts\",\n \"first_submit\",\n \"last_submit\",\n \"module_type\",\n \"anon_screen_name\",\n \"resource_display_name\",\n \"module_id\",\n \"name\",\n \"screen_name\"],\n parse_dates=[\"first_submit\",\"last_submit\"]\n )",
"prompt_number": 5,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Video Interaction"
},
{
"metadata": {},
"cell_type": "code",
"input": "video = pd.read_csv('%s%s_VideoInteraction.csv' % (base_path,raw_file),\n skiprows=1,\n names=[\"event_type\",\n \"resource_display_name\",\n \"video_current_time\",\n \"video_speed\",\n \"video_new_speed\",\n \"video_old_speed\",\n \"video_new_time\",\n \"video_old_time\",\n \"video_seek_type\",\n \"video_code\",\n \"time\",\n \"course_display_name\",\n \"anon_screen_name\",\n \"video_id\",\n \"name\",\n \"screen_name\"],\n parse_dates=[\"time\"],\n na_values=[\"None\"]\n )",
"prompt_number": 6,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Engagement data (all)"
},
{
"metadata": {},
"cell_type": "code",
"input": "eng_all = pd.read_csv('%s%s_allData.csv' % (base_path,eng_file),\n skiprows=1,\n names=[\"Platform\",\n \"Course\",\n \"anon_screen_name\",\n \"Date\",\n \"Time\",\n \"SessionLength\",\n \"NumEventsInSession\"]\n )",
"prompt_number": 7,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Weekly engagement data"
},
{
"metadata": {},
"cell_type": "code",
"input": "eng_weekly = pd.read_csv('%s%s_weeklyEffort.csv' % (base_path,eng_file),\n skiprows=1,\n names=[\"Platform\",\n \"Course\",\n \"anon_screen_name\",\n \"Week\",\n \"Effort\"]\n )",
"prompt_number": 8,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Creating useful extracts"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create useful grades extract"
},
{
"metadata": {},
"cell_type": "code",
"input": "grades_ok = grades[(grades.max_grade > 0) & \n (grades.resource_display_name.apply(lambda x: 'Quiz' in str(x))) &\n (grades.num_attempts == 1)][['grade',\n 'max_grade',\n 'parts_correctness',\n 'module_type',\n 'anon_screen_name',\n 'resource_display_name',\n 'module_id']]\ngrades_ok.update(grades_ok.module_id.apply(lambda x: x[x.rfind('/')+1:]))",
"prompt_number": 9,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create extract with first attempt at each quiz"
},
{
"metadata": {},
"cell_type": "code",
"input": "first_attempts = event[(event.event_type == 'problem_check') &\n (event.resource_display_name.apply(lambda x: 'Quiz' in str(x))) &\n (event.success.notnull())][['anon_screen_name',\n 'time',\n 'resource_display_name',\n 'success']]\n\n# adding order for each quiz, and selecting first\nfirst_attempts['order_quiz'] = first_attempts.groupby(['anon_screen_name','resource_display_name']).time.rank('min')\nfirst_attempts = first_attempts[first_attempts.order_quiz == 1][['anon_screen_name',\n 'time',\n 'resource_display_name',\n 'success']]\n\n# total number of quizzes\nfa = first_attempts.groupby('anon_screen_name').time.count()\nfirst_attempts['num_quiz'] = first_attempts.anon_screen_name.apply(lambda x: fa[x])\n\n# only looking at those with more than one quiz\nfirst_attempts = first_attempts[first_attempts.num_quiz > 1][['anon_screen_name',\n 'time',\n 'resource_display_name',\n 'success']]",
"prompt_number": 32,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Features"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Quiz-level measure"
},
{
"metadata": {},
"cell_type": "code",
"input": "quiz_total = first_attempts.groupby('resource_display_name').time.count()\nquiz_correct = first_attempts[first_attempts.success == 'correct'].groupby('resource_display_name').time.count()\n\nquiz_correct_share = quiz_correct / quiz_total",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Person-level measure"
},
{
"metadata": {},
"cell_type": "code",
"input": "person_total = first_attempts.groupby('anon_screen_name').time.count()\nperson_correct = first_attempts[first_attempts.success == 'correct'].groupby('anon_screen_name').time.count()\n\nperson_correct_share = person_correct / person_total",
"prompt_number": 12,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Activity prior to quiz"
},
{
"metadata": {},
"cell_type": "code",
"input": "# shuffling attempts\nattempt_sample = list(first_attempts.values)\nrandom.shuffle(attempt_sample)\n# attempt_sample = attempt_sample[:10000]",
"prompt_number": 33,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# function to summarize type of event based on resource_display_name and event_type\ndef type_summary(row):\n return 'LECTURE' if 'Lecture:' in str(row['resource_display_name']) else \\\n 'QUIZ' if str(row['resource_display_name']).startswith('Quiz') else \\\n 'INTERFACE' if str(row['resource_display_name']) in ['Multiple Choice','Checkboxes','Dropdown'] else \\\n 'COURSEWARE' if not pd.isnull(row['resource_display_name']) else \\\n 'VIDEO' if '_video' in row['event_type'] else \\\n 'PROBLEM' if '_problem;' in row['event_type'] or '/problem' in row['event_type'] else \\\n 'GRADES' if '_transcript' in row['event_type'] else \\\n 'DISCUSSION' if '/discussion' in row['event_type'] or 'forum.searched' in row['event_type'] else \\\n 'COURSEWARE' if '/courseware' in row['event_type'] else \\\n 'BLOG' if '/info' in row['event_type'] else \\\n 'PROGRESS' if '/progress' in row['event_type'] else \\\n 'MODULES' if row['event_type'].startswith('seq_') or row['event_type'].endswith('goto_position') else \\\n 'WIKI' if '/wiki' in row['event_type'] or '/course_wiki' in row['event_type'] else \\\n 'INSTRUCTOR' if '/instructor' in row['event_type'] else \\\n 'LINKS' if 'e0b95242b5db4fdba5da3dbca7097681' in row['event_type'] else \\\n 'USELESS' if '1e0faf66846c425dafe91eb33bb90c51' in row['event_type'] or row['event_type'].startswith('/') else \\\n row['event_type']\n\n# function to aggregate output of value_counts() on previous summaries\ndef count_types(summary):\n return [sum([summary[k] for k in summary if k in ['LECTURE','VIDEO']]), # video\n sum([summary[k] for k in summary if k in ['COURSEWARE','MODULES']]), # course\n sum([summary[k] for k in summary if k in ['DISCUSSION']]), # discussion\n sum([summary[k] for k in summary if k in ['BLOG','WIKI','LINKS']]), # wiki\n sum([summary[k] for k in summary if k in ['QUIZ','INTERFACE','PROBLEM']]), # quiz\n sum([summary[k] for k in summary if k in ['PROGRESS','GRADES']])] # grades",
"prompt_number": 151,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# storage = []\nfor i in range(21013,len(attempt_sample)):\n a, t, r, s = attempt_sample[i]\n activity = []\n acts = event[(event.anon_screen_name == a) & (event.time <= t) & (event.time >= t - datetime.timedelta(hours=6))]\n for h in [6,1,0.5,0.1]:\n subacts = acts[acts.time >= t - datetime.timedelta(hours=h)].apply(type_summary, axis=1).value_counts()\n activity += count_types(dict(subacts))\n storage.append([a,t,r,s] + activity)\n if i % 500 == 0: print i, datetime.datetime.now()",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "col_names = ['anon_screen_name',\n 'time',\n 'resource_display_name',\n 'success']\n\nfor t in ['6hours','1hour','30min','6min']:\n for r in ['video','course','wiki','quiz','grades']:\n col_names.append('resource_%s_%s' % (t, r))\n\nold_activity = pd.DataFrame(storage[:21013], columns=col_names)\nactivity = pd.DataFrame(storage[21013:], columns=col_names)",
"prompt_number": 121,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for c in col_names[4:]:\n maxval = activity[c].max()\n minval = activity[c].min()\n activity[c + '_any'] = activity[c].apply(lambda x: 1 if x > 0 else 0)\n activity[c] = activity[c].apply(lambda x: (x - minval) / (maxval - minval))\n\nactivity['quiz_correct_share'] = activity.resource_display_name.map(quiz_correct_share)\nactivity['person_correct_share'] = activity.anon_screen_name.map(person_correct_share)\nactivity.person_correct_share = activity.person_correct_share.replace(np.nan,0)\nactivity['success'] = activity.success.apply(lambda x: 1 if x == 'correct' else 0)",
"prompt_number": 125,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for c in col_names[4:]:\n maxval = old_activity[c].max()\n minval = old_activity[c].min()\n old_activity[c + '_any'] = old_activity[c].apply(lambda x: 1 if x > 0 else 0)\n old_activity[c] = old_activity[c].apply(lambda x: (x - minval) / (maxval - minval))\n\nold_activity['quiz_correct_share'] = old_activity.resource_display_name.map(quiz_correct_share)\nold_activity['person_correct_share'] = old_activity.anon_screen_name.map(person_correct_share)\nold_activity.person_correct_share = old_activity.person_correct_share.replace(np.nan,0)",
"prompt_number": 126,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "concatenated = pd.concat([old_activity, activity])\npickle.dump(concatenated, open('activity.pickle','wb'))",
"prompt_number": 128,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "activity = pickle.load(open('activity.pickle','rb'))",
"prompt_number": 58,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Trying to optimize this"
},
{
"metadata": {},
"cell_type": "code",
"input": "# initializing dictionary for dataframes of only those people we are looking at\ndfs = {}",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# populating dictionary\nfor k in first_attempts.groupby('anon_screen_name').groups.keys():\n if k not in dfs:\n dfs[k] = event[event.anon_screen_name == k]",
"prompt_number": 147,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(dfs, open('event_dict.pickle','wb'))",
"prompt_number": 149,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "storage2 = []\nfor i in range(len(attempt_sample)):\n a, t, r, s = attempt_sample[i]\n if a in dfs:\n activity = []\n cur_df = dfs[a]\n acts = cur_df[(cur_df.time <= t) & (cur_df.time >= t - datetime.timedelta(hours=6))]\n for h in [6,1,0.5,0.1]:\n subacts = acts[acts.time >= t - datetime.timedelta(hours=h)].apply(type_summary, axis=1).value_counts()\n activity += count_types(dict(subacts))\n storage2.append([a,t,r,s] + activity)\n if i % 500 == 0: print i, datetime.datetime.now()",
"prompt_number": 152,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "0 2014-10-27 11:03:34.237024\n500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:03:54.851823\n1000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:04:15.245450\n1500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:04:33.072404\n2000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:04:53.306127\n2500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:05:13.944672\n3000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:05:34.138569\n3500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:05:56.399671\n4000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:06:16.337065\n4500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:06:37.062941\n5000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:06:55.849502\n5500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:07:13.796666\n6000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:07:31.799823\n6500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:07:49.704206\n7000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:08:07.934125\n7500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:08:26.343233\n8000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:08:45.117150\n8500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:09:04.234663\n9000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:09:21.157156\n9500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:09:38.741805\n10000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:09:58.189321\n10500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:10:15.735977\n11000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:10:38.300154\n11500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:10:57.953401\n12000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:11:16.650318\n12500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:11:36.684224\n13000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:11:57.354762\n13500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:12:17.473436\n14000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:12:36.943896\n14500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:12:55.035823\n15000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:13:15.792556\n15500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:13:38.186435\n16000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:13:59.929229\n16500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:14:19.382506\n17000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:14:42.192833\n17500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:15:01.283186\n18000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:15:20.447119\n18500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:15:41.314251\n19000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:16:03.113775\n19500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:16:25.404203\n20000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:16:45.226568\n20500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:17:03.171314\n21000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:17:20.676830\n21500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:17:38.499961\n22000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:17:55.992648\n22500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:18:13.914992\n23000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:18:32.524462\n23500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:18:49.583950\n24000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:19:08.177233\n24500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:19:26.470250\n25000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:19:45.560429\n25500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:20:03.342082\n26000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:20:22.237072\n26500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:20:39.524033\n27000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:20:58.497706\n27500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:21:19.857610\n28000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:21:37.552539\n28500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:21:57.279325\n29000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:22:14.942724\n29500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:22:32.386296\n30000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:22:50.459661\n30500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:23:08.890897\n31000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:23:27.636047\n31500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:23:45.602049\n32000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:24:03.445068\n32500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:24:21.753831\n33000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:24:39.408077\n33500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:24:56.899293\n34000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:25:14.599139\n34500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:25:32.782685\n35000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:25:50.004346\n35500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:26:08.403383\n36000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:26:26.327823\n36500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:26:44.236467\n37000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:27:01.469091\n37500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:27:20.011740\n38000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:27:38.304257\n38500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:27:57.663245\n39000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:28:15.807952\n39500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:28:34.362414\n40000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:28:52.833261\n40500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:29:10.628461\n41000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:29:27.270448\n41500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:29:44.324895\n42000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:30:01.724763\n42500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:30:19.050785\n43000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:30:36.914742\n43500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:30:54.822246\n44000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:31:13.026938\n44500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:31:31.380504\n45000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:31:50.473807\n45500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:32:08.888659\n46000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:32:27.260992\n46500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:32:45.156315\n47000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:33:03.639859\n47500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:33:21.107805\n48000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:33:39.711672\n48500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:33:58.477618\n49000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:34:15.469009\n49500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:34:34.295875\n50000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:34:52.225234\n50500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:35:10.335047\n51000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:35:30.523229\n51500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:35:47.762531\n52000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:36:04.836141\n52500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:36:23.118148\n53000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:36:41.431932\n53500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:36:59.257263\n54000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:37:17.234768\n54500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:37:35.932701\n55000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:37:53.878923\n55500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:38:12.178446\n56000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:38:29.483291\n56500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:38:48.792852\n57000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:39:06.960973\n57500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:39:24.702827\n58000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:39:43.186532\n58500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:40:02.263123\n59000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:40:20.345162\n59500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:40:36.930184\n60000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:40:56.585237\n60500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:41:14.814615\n61000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:41:33.028213\n61500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:41:50.890852\n62000"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:42:12.007967\n62500"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 2014-10-27 11:42:32.935895\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "len(storage2)",
"prompt_number": 153,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 153,
"metadata": {},
"text": "62678"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "col_names = ['anon_screen_name',\n 'time',\n 'resource_display_name',\n 'success']\n\nfor t in ['6hours','1hour','30min','6min']:\n for r in ['video','course','discuss','wiki','quiz','grades']:\n col_names.append('resource_%s_%s' % (t, r))\n\nactivity = pd.DataFrame(storage2, columns=col_names)",
"prompt_number": 154,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "activity['quiz_correct_share'] = activity.resource_display_name.map(quiz_correct_share)\nactivity['person_correct_share'] = activity.anon_screen_name.map(person_correct_share)\nactivity.person_correct_share = activity.person_correct_share.replace(np.nan,0)\nactivity['success'] = activity.success.apply(lambda x: 1 if x == 'correct' else 0)",
"prompt_number": 156,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "activity.success.value_counts()",
"prompt_number": 160,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 160,
"metadata": {},
"text": "1 35565\n0 27113\ndtype: int64"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(activity, open('activity_raw.pickle','wb'))",
"prompt_number": 158,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for c in col_names[4:]:\n maxval = activity[c].max()\n minval = activity[c].min()\n activity[c + '_any'] = activity[c].apply(lambda x: 1 if x > 0 else 0)\n activity[c] = activity[c].apply(lambda x: (x - minval) / (maxval - minval))",
"prompt_number": 159,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(activity, open('activity.pickle','wb'))",
"prompt_number": 161,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:d2003c17d7155b60ad824f2352a8c3d7bd632f164c6909a6a78d8174f2fed380"
},
"nbformat": 3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment