Skip to content

Instantly share code, notes, and snippets.

@jasonost
Created October 28, 2014 22:33
Show Gist options
  • Save jasonost/db96f34cc80d3ba5fab6 to your computer and use it in GitHub Desktop.
Save jasonost/db96f34cc80d3ba5fab6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import pandas as pd, numpy as np, datetime, random, cPickle as pickle\nfrom sklearn import linear_model\nfrom sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix\nfrom sklearn.feature_selection import RFE\nfrom __future__ import division\npd.set_option('max_colwidth', 200)\n%matplotlib inline",
"prompt_number": 1,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Load data"
},
{
"metadata": {},
"cell_type": "code",
"input": "activity = pickle.load(open('activity.pickle','rb'))",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Building models"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Logistic regression"
},
{
"metadata": {},
"cell_type": "code",
"input": "cutoff = int(len(activity.index) * 0.75)\nrows = random.sample(activity.index, cutoff)\ntrainset = activity.ix[rows]\ntestset = activity.drop(rows)",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for t in ['6hours','1hour','30min','6min']:\n logit = linear_model.LogisticRegression()\n cols = [c for c in activity.columns if t in c or '_correct_' in c]\n logit.fit(trainset[cols], trainset['success'])\n pred = logit.predict(testset[cols])\n roc = roc_auc_score(testset['success'], pred)\n ls = logit.score(testset[cols], testset['success'])\n # print sorted(zip(cols, logit.coef_[0]), key=lambda x: abs(x[1]), reverse=True)\n print '%s\\t%g\\t%g' % (t,roc,ls)",
"prompt_number": 4,
"outputs": [
{
"output_type": "stream",
"text": "6hours\t0.827618\t0.832291\n1hour\t0.828469\t0.832993",
"stream": "stdout"
},
{
"output_type": "stream",
"text": "\n30min\t0.828691\t0.833184",
"stream": "stdout"
},
{
"output_type": "stream",
"text": "\n6min\t0.828863\t0.833504",
"stream": "stdout"
},
{
"output_type": "stream",
"text": "\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cols = activity.columns[4:]\nfeat = RFE(estimator=logit, n_features_to_select=1, step=1)\nfeat.fit(trainset[cols], trainset['success'])",
"prompt_number": 5,
"outputs": [
{
"text": "RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),\n estimator_params={}, n_features_to_select=1, step=1, verbose=0)",
"output_type": "pyout",
"metadata": {},
"prompt_number": 5
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "sorted(zip(cols,feat.ranking_), key=lambda x: x[1])",
"prompt_number": 6,
"outputs": [
{
"text": "[('person_correct_share', 1),\n ('quiz_correct_share', 2),\n ('resource_6min_quiz_any', 3),\n ('resource_30min_quiz_any', 4),\n ('resource_1hour_quiz_any', 5),\n ('resource_6hours_quiz_any', 6),\n ('resource_6min_quiz', 7),\n ('resource_6min_grades', 8),\n ('resource_6min_wiki', 9),\n ('resource_6hours_wiki', 10),\n ('resource_6hours_course_any', 11),\n ('resource_6hours_discuss', 12),\n ('resource_1hour_discuss', 13),\n ('resource_1hour_wiki', 14),\n ('resource_6hours_grades', 15),\n ('resource_30min_grades', 16),\n ('resource_1hour_video', 17),\n ('resource_6min_discuss', 18),\n ('resource_1hour_quiz', 19),\n ('resource_6hours_quiz', 20),\n ('resource_30min_wiki', 21),\n ('resource_30min_discuss', 22),\n ('resource_30min_quiz', 23),\n ('resource_6min_wiki_any', 24),\n ('resource_6hours_video', 25),\n ('resource_6min_video', 26),\n ('resource_6hours_video_any', 27),\n ('resource_6hours_grades_any', 28),\n ('resource_6min_course', 29),\n ('resource_30min_video', 30),\n ('resource_6min_course_any', 31),\n ('resource_1hour_course_any', 32),\n ('resource_1hour_course', 33),\n ('resource_1hour_wiki_any', 34),\n ('resource_30min_wiki_any', 35),\n ('resource_30min_discuss_any', 36),\n ('resource_6hours_discuss_any', 37),\n ('resource_1hour_discuss_any', 38),\n ('resource_1hour_grades_any', 39),\n ('resource_30min_grades_any', 40),\n ('resource_6hours_course', 41),\n ('resource_30min_course_any', 42),\n ('resource_1hour_grades', 43),\n ('resource_30min_course', 44),\n ('resource_6min_video_any', 45),\n ('resource_6hours_wiki_any', 46),\n ('resource_30min_video_any', 47),\n ('resource_6min_grades_any', 48),\n ('resource_1hour_video_any', 49),\n ('resource_6min_discuss_any', 50)]",
"output_type": "pyout",
"metadata": {},
"prompt_number": 6
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cols = ['quiz_correct_share',\n 'person_correct_share',\n 'resource_30min_course',\n 'resource_6hours_discuss',\n 'resource_6hours_grades',\n 'resource_6hours_quiz',\n 'resource_30min_wiki',\n 'resource_30min_video']\nlogit = linear_model.LogisticRegression(C=1e5)\nlogit.fit(trainset[cols], trainset['success'])\npred = logit.predict(testset[cols])\nroc = roc_auc_score(testset['success'], pred)\nls = logit.score(testset[cols], testset['success'])",
"prompt_number": 25,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "roc",
"prompt_number": 26,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 26,
"metadata": {},
"text": "0.83271090175215978"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "zip(cols,logit.coef_[0])",
"prompt_number": 13,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 13,
"metadata": {},
"text": "[('quiz_correct_share', 1.9588739831540602),\n ('person_correct_share', 1.4164212229647777),\n ('resource_30min_course', -0.0053761447780298725),\n ('resource_6hours_discuss', 0.002423759621709912),\n ('resource_6hours_grades', -0.0092037533471761752),\n ('resource_6hours_quiz', 0.012233299020303021),\n ('resource_30min_wiki', -0.0051592065705068184),\n ('resource_30min_video', 0.00015785408217884936)]"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### 10-fold cross validation to get average coefficient"
},
{
"metadata": {},
"cell_type": "code",
"input": "cols = ['quiz_correct_share',\n 'person_correct_share',\n 'resource_30min_course',\n 'resource_1hour_discuss',\n 'resource_6hours_grades',\n 'resource_6hours_quiz',\n 'resource_30min_wiki',\n 'resource_30min_video']\ncoef_dict = {k: [] for k in cols}\ncutoffs = np.linspace(0,len(activity.index),10).astype(int)\n\nfor i in range(len(cutoffs)-1):\n trainset = pd.concat([activity[:cutoffs[i]],activity[cutoffs[i+1]:]])\n testset = activity[cutoffs[i]:cutoffs[i+1]]\n \n logit = linear_model.LogisticRegression(C=1e5)\n logit.fit(trainset[cols], trainset['success'])\n pred = logit.predict(testset[cols])\n roc = roc_auc_score(testset['success'], pred)\n print roc\n ls = logit.score(testset[cols], testset['success'])\n cur_coef = dict(zip(cols,logit.coef_[0]))\n for c in cur_coef.keys():\n coef_dict[c].append(cur_coef[c])",
"prompt_number": 24,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "0.822335766423\n0.826636644675"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.830296227347"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.820790076235"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.826902885421"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.834583815565"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.827650411056"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.830916805743"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n0.832217852166"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "sorted([(k, sum(c) / len(c)) for k, c in coef_dict.items()], key=lambda x: abs(x[1]), reverse=True)",
"prompt_number": 25,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 25,
"metadata": {},
"text": "[('person_correct_share', 7.7775643794348861),\n ('quiz_correct_share', 6.6159103178916485),\n ('resource_6hours_quiz', 0.44101550329900574),\n ('resource_6hours_grades', -0.33686739046138942),\n ('resource_30min_video', 0.23319519041209835),\n ('resource_1hour_discuss', -0.12726112503349668),\n ('resource_30min_course', -0.10627057826122169),\n ('resource_30min_wiki', -0.00054306281933279917)]"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for v, c in sorted([(k, sum(c) / len(c)) for k, c in coef_dict.items()], key=lambda x: abs(x[1]), reverse=True):\n print '%s\\t%g' % (v, c)",
"prompt_number": 19,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "person_correct_share\t7.77582\nquiz_correct_share\t6.61585\nresource_6hours_quiz\t0.443215\nresource_6hours_grades\t-0.346701\nresource_30min_video\t0.225609\nresource_30min_course\t-0.110822\nresource_6hours_discuss\t0.0774493\nresource_30min_wiki\t-0.00638102\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:eb47e2e310cdf37c09ea9c66be2746bc0d72d0b41f39ef8f1cb07d1cb0bfe6b6"
},
"nbformat": 3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment