Created
May 11, 2018 20:54
-
-
Save cle-ment/178a3dd6fbb7bf46a3ea9b8ab79f82b3 to your computer and use it in GitHub Desktop.
duolingo_pred.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "%matplotlib inline", | |
"execution_count": 137, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd\nimport numpy as np\nimport scipy.stats\nimport random\nfrom sklearn import linear_model\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import mean_absolute_error", | |
"execution_count": 118, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Sample and preprocess data" | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "learning_traces_path = \"./Studio/data/learning_traces.13m.csv\"", | |
"execution_count": 119, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "# number of records in file (excludes header)\nn = sum(1 for line in open(learning_traces_path)) - 1 \ns = 100000 #desired sample size\n#the 0-indexed header will not be included in the skip list\nskip = sorted(random.sample(range(1,n+1),n-s)) ", | |
"execution_count": 120, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "learning_traces = pd.read_csv(learning_traces_path, skiprows=skip)", | |
"execution_count": 121, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "learning_traces.head(10)", | |
"execution_count": 122, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " p_recall timestamp delta user_id learning_language ui_language \\\n0 1.00 1362082527 179 u:h5ex de en \n1 1.00 1362082533 443 u:ezFh es en \n2 1.00 1362082561 269 u:gZQD en es \n3 1.00 1362082574 547 u:mKI es en \n4 1.00 1362082598 51714 u:iISw es en \n5 1.00 1362082603 133 u:hZ9T es en \n6 0.75 1362082610 1151 u:gA8I en pt \n7 1.00 1362082614 185547 u:idJd fr en \n8 1.00 1362082652 438 u:ecYb en es \n9 1.00 1362082655 197936 u:ubz es en \n\n lexeme_id lexeme_string \\\n0 9e1f56b08922d1d9f7ab663b58d88367 freundin/freundin<n><f><sg><acc> \n1 99cd0848be9239250ffd99e11add7338 de/de<pr> \n2 7db6cb705cd5fecf3fa67ee708c8854b between/between<pr> \n3 07605081e461f8cc70d0e27a91d1ba37 tortugas/tortuga<n><f><pl> \n4 d64b11ab44cc6fa1b32ce3d4998bd5af pájaros/pájaro<n><m><pl> \n5 36cc2f758a27a4ebeb9f23d82881a811 pescados/pescado<n><m><pl> \n6 f254244207240db11107ffbe433d1e0e bird/bird<n><sg> \n7 3a05062c9117beb722ae9acb73b53eb6 bonnes/bon<adj><f><pl> \n8 384ecdedd0e8caea25f8ba4e2798bf2b orange/orange<adj> \n9 2a5f9687f146bdb703ab03f7bedf2d3c arroz/arroz<n><m><sg> \n\n history_seen history_correct session_seen session_correct \n0 4 4 2 2 \n1 368 330 1 1 \n2 36 34 2 2 \n3 5 4 1 1 \n4 1 1 1 1 \n5 6 6 3 3 \n6 1 1 4 3 \n7 4 4 2 2 \n8 5 4 1 1 \n9 35 35 2 2 ", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>p_recall</th>\n <th>timestamp</th>\n <th>delta</th>\n <th>user_id</th>\n <th>learning_language</th>\n <th>ui_language</th>\n <th>lexeme_id</th>\n <th>lexeme_string</th>\n <th>history_seen</th>\n <th>history_correct</th>\n <th>session_seen</th>\n <th>session_correct</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.00</td>\n <td>1362082527</td>\n <td>179</td>\n <td>u:h5ex</td>\n <td>de</td>\n <td>en</td>\n <td>9e1f56b08922d1d9f7ab663b58d88367</td>\n <td>freundin/freundin<n><f><sg><acc></td>\n <td>4</td>\n <td>4</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.00</td>\n <td>1362082533</td>\n <td>443</td>\n <td>u:ezFh</td>\n <td>es</td>\n <td>en</td>\n <td>99cd0848be9239250ffd99e11add7338</td>\n <td>de/de<pr></td>\n <td>368</td>\n <td>330</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.00</td>\n <td>1362082561</td>\n <td>269</td>\n <td>u:gZQD</td>\n <td>en</td>\n <td>es</td>\n <td>7db6cb705cd5fecf3fa67ee708c8854b</td>\n <td>between/between<pr></td>\n <td>36</td>\n <td>34</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1.00</td>\n <td>1362082574</td>\n <td>547</td>\n <td>u:mKI</td>\n <td>es</td>\n <td>en</td>\n <td>07605081e461f8cc70d0e27a91d1ba37</td>\n <td>tortugas/tortuga<n><f><pl></td>\n <td>5</td>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1.00</td>\n <td>1362082598</td>\n <td>51714</td>\n <td>u:iISw</td>\n <td>es</td>\n <td>en</td>\n <td>d64b11ab44cc6fa1b32ce3d4998bd5af</td>\n <td>pájaros/pájaro<n><m><pl></td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>1.00</td>\n <td>1362082603</td>\n <td>133</td>\n <td>u:hZ9T</td>\n <td>es</td>\n <td>en</td>\n <td>36cc2f758a27a4ebeb9f23d82881a811</td>\n <td>pescados/pescado<n><m><pl></td>\n <td>6</td>\n <td>6</td>\n <td>3</td>\n <td>3</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.75</td>\n <td>1362082610</td>\n <td>1151</td>\n <td>u:gA8I</td>\n <td>en</td>\n <td>pt</td>\n <td>f254244207240db11107ffbe433d1e0e</td>\n <td>bird/bird<n><sg></td>\n <td>1</td>\n <td>1</td>\n <td>4</td>\n <td>3</td>\n </tr>\n <tr>\n <th>7</th>\n <td>1.00</td>\n <td>1362082614</td>\n <td>185547</td>\n <td>u:idJd</td>\n <td>fr</td>\n <td>en</td>\n <td>3a05062c9117beb722ae9acb73b53eb6</td>\n <td>bonnes/bon<adj><f><pl></td>\n <td>4</td>\n <td>4</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>8</th>\n <td>1.00</td>\n <td>1362082652</td>\n <td>438</td>\n <td>u:ecYb</td>\n <td>en</td>\n <td>es</td>\n <td>384ecdedd0e8caea25f8ba4e2798bf2b</td>\n <td>orange/orange<adj></td>\n <td>5</td>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9</th>\n <td>1.00</td>\n <td>1362082655</td>\n <td>197936</td>\n <td>u:ubz</td>\n <td>es</td>\n <td>en</td>\n <td>2a5f9687f146bdb703ab03f7bedf2d3c</td>\n <td>arroz/arroz<n><m><sg></td>\n <td>35</td>\n <td>35</td>\n <td>2</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 122 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "data = learning_traces[[\"delta\", \"history_seen\", \"history_correct\", \"session_seen\", \"session_correct\", \"p_recall\"]].as_matrix()", | |
"execution_count": 123, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "X = data[:, :5]\nY = data[:, 5:]", | |
"execution_count": 124, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "train_test_ratio = 0.8\ntrain_size = int(np.floor(train_test_ratio * X.shape[0]))", | |
"execution_count": 125, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "train_size", | |
"execution_count": 136, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "80000" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 136 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "X_train, X_test = X[:train_size,:], X[train_size:,:]\nY_train, Y_test = Y[:train_size,:], Y[train_size:,:]", | |
"execution_count": 126, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Predict with baseline constant" | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "Y_train_predicted_constant, Y_test_predicted_constant = np.ones(Y_train.shape) * 0.859, np.ones(Y_test.shape) * 0.859", | |
"execution_count": 127, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mean_absolute_error(Y_train, Y_train_predicted_constant)", | |
"execution_count": 128, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "0.19978840807296061" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 128 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mean_absolute_error(Y_test, Y_test_predicted_constant)", | |
"execution_count": 129, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "0.19901131964983185" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 129 | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Predict with linear regression" | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "reg = linear_model.LinearRegression()", | |
"execution_count": 130, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "reg.fit(X_train, Y_train)", | |
"execution_count": 131, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 131 | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Predict and measure error on training data" | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "Y_train_predicted_linear_regression = reg.predict(X_train)", | |
"execution_count": 132, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mean_absolute_error(Y_train, Y_train_predicted_linear_regression)", | |
"execution_count": 133, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "0.08409331230676749" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 133 | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Predict and measure error on test data" | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "Y_test_predicted_linear_regression = reg.predict(X_test)", | |
"execution_count": 134, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mean_absolute_error(Y_test, Y_test_predicted_linear_regression)", | |
"execution_count": 135, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "0.084153068502124867" | |
}, | |
"output_type": "execute_result", | |
"metadata": {}, | |
"execution_count": 135 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"language_info": { | |
"file_extension": ".py", | |
"codemirror_mode": { | |
"version": 3, | |
"name": "ipython" | |
}, | |
"version": "3.5.4", | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"name": "python" | |
}, | |
"kernelspec": { | |
"name": "conda-root-py", | |
"display_name": "Python [conda root]", | |
"language": "python" | |
}, | |
"gist_info": {}, | |
"toc": { | |
"toc_threshold": 6, | |
"toc_number_sections": true, | |
"toc_cell": false, | |
"toc_window_display": false | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "duolingo_pred.ipynb", | |
"public": false | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment