Created
October 28, 2014 22:33
-
-
Save jasonost/db96f34cc80d3ba5fab6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "import pandas as pd, numpy as np, datetime, random, cPickle as pickle\nfrom sklearn import linear_model\nfrom sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix\nfrom sklearn.feature_selection import RFE\nfrom __future__ import division\npd.set_option('max_colwidth', 200)\n%matplotlib inline", | |
"prompt_number": 1, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Load data" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "activity = pickle.load(open('activity.pickle','rb'))", | |
"prompt_number": 2, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Building models" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Logistic regression" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cutoff = int(len(activity.index) * 0.75)\nrows = random.sample(activity.index, cutoff)\ntrainset = activity.ix[rows]\ntestset = activity.drop(rows)", | |
"prompt_number": 3, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for t in ['6hours','1hour','30min','6min']:\n logit = linear_model.LogisticRegression()\n cols = [c for c in activity.columns if t in c or '_correct_' in c]\n logit.fit(trainset[cols], trainset['success'])\n pred = logit.predict(testset[cols])\n roc = roc_auc_score(testset['success'], pred)\n ls = logit.score(testset[cols], testset['success'])\n # print sorted(zip(cols, logit.coef_[0]), key=lambda x: abs(x[1]), reverse=True)\n print '%s\\t%g\\t%g' % (t,roc,ls)", | |
"prompt_number": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "6hours\t0.827618\t0.832291\n1hour\t0.828469\t0.832993", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n30min\t0.828691\t0.833184", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n6min\t0.828863\t0.833504", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cols = activity.columns[4:]\nfeat = RFE(estimator=logit, n_features_to_select=1, step=1)\nfeat.fit(trainset[cols], trainset['success'])", | |
"prompt_number": 5, | |
"outputs": [ | |
{ | |
"text": "RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),\n estimator_params={}, n_features_to_select=1, step=1, verbose=0)", | |
"output_type": "pyout", | |
"metadata": {}, | |
"prompt_number": 5 | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "sorted(zip(cols,feat.ranking_), key=lambda x: x[1])", | |
"prompt_number": 6, | |
"outputs": [ | |
{ | |
"text": "[('person_correct_share', 1),\n ('quiz_correct_share', 2),\n ('resource_6min_quiz_any', 3),\n ('resource_30min_quiz_any', 4),\n ('resource_1hour_quiz_any', 5),\n ('resource_6hours_quiz_any', 6),\n ('resource_6min_quiz', 7),\n ('resource_6min_grades', 8),\n ('resource_6min_wiki', 9),\n ('resource_6hours_wiki', 10),\n ('resource_6hours_course_any', 11),\n ('resource_6hours_discuss', 12),\n ('resource_1hour_discuss', 13),\n ('resource_1hour_wiki', 14),\n ('resource_6hours_grades', 15),\n ('resource_30min_grades', 16),\n ('resource_1hour_video', 17),\n ('resource_6min_discuss', 18),\n ('resource_1hour_quiz', 19),\n ('resource_6hours_quiz', 20),\n ('resource_30min_wiki', 21),\n ('resource_30min_discuss', 22),\n ('resource_30min_quiz', 23),\n ('resource_6min_wiki_any', 24),\n ('resource_6hours_video', 25),\n ('resource_6min_video', 26),\n ('resource_6hours_video_any', 27),\n ('resource_6hours_grades_any', 28),\n ('resource_6min_course', 29),\n ('resource_30min_video', 30),\n ('resource_6min_course_any', 31),\n ('resource_1hour_course_any', 32),\n ('resource_1hour_course', 33),\n ('resource_1hour_wiki_any', 34),\n ('resource_30min_wiki_any', 35),\n ('resource_30min_discuss_any', 36),\n ('resource_6hours_discuss_any', 37),\n ('resource_1hour_discuss_any', 38),\n ('resource_1hour_grades_any', 39),\n ('resource_30min_grades_any', 40),\n ('resource_6hours_course', 41),\n ('resource_30min_course_any', 42),\n ('resource_1hour_grades', 43),\n ('resource_30min_course', 44),\n ('resource_6min_video_any', 45),\n ('resource_6hours_wiki_any', 46),\n ('resource_30min_video_any', 47),\n ('resource_6min_grades_any', 48),\n ('resource_1hour_video_any', 49),\n ('resource_6min_discuss_any', 50)]", | |
"output_type": "pyout", | |
"metadata": {}, | |
"prompt_number": 6 | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cols = ['quiz_correct_share',\n 'person_correct_share',\n 'resource_30min_course',\n 'resource_6hours_discuss',\n 'resource_6hours_grades',\n 'resource_6hours_quiz',\n 'resource_30min_wiki',\n 'resource_30min_video']\nlogit = linear_model.LogisticRegression(C=1e5)\nlogit.fit(trainset[cols], trainset['success'])\npred = logit.predict(testset[cols])\nroc = roc_auc_score(testset['success'], pred)\nls = logit.score(testset[cols], testset['success'])", | |
"prompt_number": 25, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "roc", | |
"prompt_number": 26, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"metadata": {}, | |
"text": "0.83271090175215978" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "zip(cols,logit.coef_[0])", | |
"prompt_number": 13, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"metadata": {}, | |
"text": "[('quiz_correct_share', 1.9588739831540602),\n ('person_correct_share', 1.4164212229647777),\n ('resource_30min_course', -0.0053761447780298725),\n ('resource_6hours_discuss', 0.002423759621709912),\n ('resource_6hours_grades', -0.0092037533471761752),\n ('resource_6hours_quiz', 0.012233299020303021),\n ('resource_30min_wiki', -0.0051592065705068184),\n ('resource_30min_video', 0.00015785408217884936)]" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "#### 10-fold cross validation to get average coefficient" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cols = ['quiz_correct_share',\n 'person_correct_share',\n 'resource_30min_course',\n 'resource_1hour_discuss',\n 'resource_6hours_grades',\n 'resource_6hours_quiz',\n 'resource_30min_wiki',\n 'resource_30min_video']\ncoef_dict = {k: [] for k in cols}\ncutoffs = np.linspace(0,len(activity.index),10).astype(int)\n\nfor i in range(len(cutoffs)-1):\n trainset = pd.concat([activity[:cutoffs[i]],activity[cutoffs[i+1]:]])\n testset = activity[cutoffs[i]:cutoffs[i+1]]\n \n logit = linear_model.LogisticRegression(C=1e5)\n logit.fit(trainset[cols], trainset['success'])\n pred = logit.predict(testset[cols])\n roc = roc_auc_score(testset['success'], pred)\n print roc\n ls = logit.score(testset[cols], testset['success'])\n cur_coef = dict(zip(cols,logit.coef_[0]))\n for c in cur_coef.keys():\n coef_dict[c].append(cur_coef[c])", | |
"prompt_number": 24, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "0.822335766423\n0.826636644675" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.830296227347" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.820790076235" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.826902885421" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.834583815565" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.827650411056" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.830916805743" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n0.832217852166" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "sorted([(k, sum(c) / len(c)) for k, c in coef_dict.items()], key=lambda x: abs(x[1]), reverse=True)", | |
"prompt_number": 25, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"metadata": {}, | |
"text": "[('person_correct_share', 7.7775643794348861),\n ('quiz_correct_share', 6.6159103178916485),\n ('resource_6hours_quiz', 0.44101550329900574),\n ('resource_6hours_grades', -0.33686739046138942),\n ('resource_30min_video', 0.23319519041209835),\n ('resource_1hour_discuss', -0.12726112503349668),\n ('resource_30min_course', -0.10627057826122169),\n ('resource_30min_wiki', -0.00054306281933279917)]" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for v, c in sorted([(k, sum(c) / len(c)) for k, c in coef_dict.items()], key=lambda x: abs(x[1]), reverse=True):\n print '%s\\t%g' % (v, c)", | |
"prompt_number": 19, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "person_correct_share\t7.77582\nquiz_correct_share\t6.61585\nresource_6hours_quiz\t0.443215\nresource_6hours_grades\t-0.346701\nresource_30min_video\t0.225609\nresource_30min_course\t-0.110822\nresource_6hours_discuss\t0.0774493\nresource_30min_wiki\t-0.00638102\n" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
} | |
], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"name": "", | |
"signature": "sha256:eb47e2e310cdf37c09ea9c66be2746bc0d72d0b41f39ef8f1cb07d1cb0bfe6b6" | |
}, | |
"nbformat": 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment