Skip to content

Instantly share code, notes, and snippets.

@cle-ment
Created May 11, 2018 20:54
Show Gist options
  • Save cle-ment/178a3dd6fbb7bf46a3ea9b8ab79f82b3 to your computer and use it in GitHub Desktop.
Save cle-ment/178a3dd6fbb7bf46a3ea9b8ab79f82b3 to your computer and use it in GitHub Desktop.
duolingo_pred.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "%matplotlib inline",
"execution_count": 137,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import pandas as pd\nimport numpy as np\nimport scipy.stats\nimport random\nfrom sklearn import linear_model\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import mean_absolute_error",
"execution_count": 118,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Sample and preprocess data"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "learning_traces_path = \"./Studio/data/learning_traces.13m.csv\"",
"execution_count": 119,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# number of records in file (excludes header)\nn = sum(1 for line in open(learning_traces_path)) - 1 \ns = 100000 #desired sample size\n#the 0-indexed header will not be included in the skip list\nskip = sorted(random.sample(range(1,n+1),n-s)) ",
"execution_count": 120,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "learning_traces = pd.read_csv(learning_traces_path, skiprows=skip)",
"execution_count": 121,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learning_traces.head(10)",
"execution_count": 122,
"outputs": [
{
"data": {
"text/plain": " p_recall timestamp delta user_id learning_language ui_language \\\n0 1.00 1362082527 179 u:h5ex de en \n1 1.00 1362082533 443 u:ezFh es en \n2 1.00 1362082561 269 u:gZQD en es \n3 1.00 1362082574 547 u:mKI es en \n4 1.00 1362082598 51714 u:iISw es en \n5 1.00 1362082603 133 u:hZ9T es en \n6 0.75 1362082610 1151 u:gA8I en pt \n7 1.00 1362082614 185547 u:idJd fr en \n8 1.00 1362082652 438 u:ecYb en es \n9 1.00 1362082655 197936 u:ubz es en \n\n lexeme_id lexeme_string \\\n0 9e1f56b08922d1d9f7ab663b58d88367 freundin/freundin<n><f><sg><acc> \n1 99cd0848be9239250ffd99e11add7338 de/de<pr> \n2 7db6cb705cd5fecf3fa67ee708c8854b between/between<pr> \n3 07605081e461f8cc70d0e27a91d1ba37 tortugas/tortuga<n><f><pl> \n4 d64b11ab44cc6fa1b32ce3d4998bd5af pájaros/pájaro<n><m><pl> \n5 36cc2f758a27a4ebeb9f23d82881a811 pescados/pescado<n><m><pl> \n6 f254244207240db11107ffbe433d1e0e bird/bird<n><sg> \n7 3a05062c9117beb722ae9acb73b53eb6 bonnes/bon<adj><f><pl> \n8 384ecdedd0e8caea25f8ba4e2798bf2b orange/orange<adj> \n9 2a5f9687f146bdb703ab03f7bedf2d3c arroz/arroz<n><m><sg> \n\n history_seen history_correct session_seen session_correct \n0 4 4 2 2 \n1 368 330 1 1 \n2 36 34 2 2 \n3 5 4 1 1 \n4 1 1 1 1 \n5 6 6 3 3 \n6 1 1 4 3 \n7 4 4 2 2 \n8 5 4 1 1 \n9 35 35 2 2 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>p_recall</th>\n <th>timestamp</th>\n <th>delta</th>\n <th>user_id</th>\n <th>learning_language</th>\n <th>ui_language</th>\n <th>lexeme_id</th>\n <th>lexeme_string</th>\n <th>history_seen</th>\n <th>history_correct</th>\n <th>session_seen</th>\n <th>session_correct</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.00</td>\n <td>1362082527</td>\n <td>179</td>\n <td>u:h5ex</td>\n <td>de</td>\n <td>en</td>\n <td>9e1f56b08922d1d9f7ab663b58d88367</td>\n <td>freundin/freundin&lt;n&gt;&lt;f&gt;&lt;sg&gt;&lt;acc&gt;</td>\n <td>4</td>\n <td>4</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.00</td>\n <td>1362082533</td>\n <td>443</td>\n <td>u:ezFh</td>\n <td>es</td>\n <td>en</td>\n <td>99cd0848be9239250ffd99e11add7338</td>\n <td>de/de&lt;pr&gt;</td>\n <td>368</td>\n <td>330</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.00</td>\n <td>1362082561</td>\n <td>269</td>\n <td>u:gZQD</td>\n <td>en</td>\n <td>es</td>\n <td>7db6cb705cd5fecf3fa67ee708c8854b</td>\n <td>between/between&lt;pr&gt;</td>\n <td>36</td>\n <td>34</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1.00</td>\n <td>1362082574</td>\n <td>547</td>\n <td>u:mKI</td>\n <td>es</td>\n <td>en</td>\n <td>07605081e461f8cc70d0e27a91d1ba37</td>\n <td>tortugas/tortuga&lt;n&gt;&lt;f&gt;&lt;pl&gt;</td>\n <td>5</td>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1.00</td>\n <td>1362082598</td>\n <td>51714</td>\n <td>u:iISw</td>\n <td>es</td>\n <td>en</td>\n <td>d64b11ab44cc6fa1b32ce3d4998bd5af</td>\n <td>pájaros/pájaro&lt;n&gt;&lt;m&gt;&lt;pl&gt;</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>1.00</td>\n <td>1362082603</td>\n <td>133</td>\n <td>u:hZ9T</td>\n <td>es</td>\n <td>en</td>\n <td>36cc2f758a27a4ebeb9f23d82881a811</td>\n <td>pescados/pescado&lt;n&gt;&lt;m&gt;&lt;pl&gt;</td>\n <td>6</td>\n <td>6</td>\n <td>3</td>\n <td>3</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.75</td>\n <td>1362082610</td>\n <td>1151</td>\n <td>u:gA8I</td>\n <td>en</td>\n <td>pt</td>\n <td>f254244207240db11107ffbe433d1e0e</td>\n <td>bird/bird&lt;n&gt;&lt;sg&gt;</td>\n <td>1</td>\n <td>1</td>\n <td>4</td>\n <td>3</td>\n </tr>\n <tr>\n <th>7</th>\n <td>1.00</td>\n <td>1362082614</td>\n <td>185547</td>\n <td>u:idJd</td>\n <td>fr</td>\n <td>en</td>\n <td>3a05062c9117beb722ae9acb73b53eb6</td>\n <td>bonnes/bon&lt;adj&gt;&lt;f&gt;&lt;pl&gt;</td>\n <td>4</td>\n <td>4</td>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>8</th>\n <td>1.00</td>\n <td>1362082652</td>\n <td>438</td>\n <td>u:ecYb</td>\n <td>en</td>\n <td>es</td>\n <td>384ecdedd0e8caea25f8ba4e2798bf2b</td>\n <td>orange/orange&lt;adj&gt;</td>\n <td>5</td>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9</th>\n <td>1.00</td>\n <td>1362082655</td>\n <td>197936</td>\n <td>u:ubz</td>\n <td>es</td>\n <td>en</td>\n <td>2a5f9687f146bdb703ab03f7bedf2d3c</td>\n <td>arroz/arroz&lt;n&gt;&lt;m&gt;&lt;sg&gt;</td>\n <td>35</td>\n <td>35</td>\n <td>2</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 122
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "data = learning_traces[[\"delta\", \"history_seen\", \"history_correct\", \"session_seen\", \"session_correct\", \"p_recall\"]].as_matrix()",
"execution_count": 123,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "X = data[:, :5]\nY = data[:, 5:]",
"execution_count": 124,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "train_test_ratio = 0.8\ntrain_size = int(np.floor(train_test_ratio * X.shape[0]))",
"execution_count": 125,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "train_size",
"execution_count": 136,
"outputs": [
{
"data": {
"text/plain": "80000"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 136
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "X_train, X_test = X[:train_size,:], X[train_size:,:]\nY_train, Y_test = Y[:train_size,:], Y[train_size:,:]",
"execution_count": 126,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Predict with baseline constant"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "Y_train_predicted_constant, Y_test_predicted_constant = np.ones(Y_train.shape) * 0.859, np.ones(Y_test.shape) * 0.859",
"execution_count": 127,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "mean_absolute_error(Y_train, Y_train_predicted_constant)",
"execution_count": 128,
"outputs": [
{
"data": {
"text/plain": "0.19978840807296061"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 128
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "mean_absolute_error(Y_test, Y_test_predicted_constant)",
"execution_count": 129,
"outputs": [
{
"data": {
"text/plain": "0.19901131964983185"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 129
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Predict with linear regression"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "reg = linear_model.LinearRegression()",
"execution_count": 130,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "reg.fit(X_train, Y_train)",
"execution_count": 131,
"outputs": [
{
"data": {
"text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 131
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Predict and measure error on training data"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "Y_train_predicted_linear_regression = reg.predict(X_train)",
"execution_count": 132,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "mean_absolute_error(Y_train, Y_train_predicted_linear_regression)",
"execution_count": 133,
"outputs": [
{
"data": {
"text/plain": "0.08409331230676749"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 133
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Predict and measure error on test data"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "Y_test_predicted_linear_regression = reg.predict(X_test)",
"execution_count": 134,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "mean_absolute_error(Y_test, Y_test_predicted_linear_regression)",
"execution_count": 135,
"outputs": [
{
"data": {
"text/plain": "0.084153068502124867"
},
"output_type": "execute_result",
"metadata": {},
"execution_count": 135
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"language_info": {
"file_extension": ".py",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"version": "3.5.4",
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"name": "python"
},
"kernelspec": {
"name": "conda-root-py",
"display_name": "Python [conda root]",
"language": "python"
},
"gist_info": {},
"toc": {
"toc_threshold": 6,
"toc_number_sections": true,
"toc_cell": false,
"toc_window_display": false
},
"gist": {
"id": "",
"data": {
"description": "duolingo_pred.ipynb",
"public": false
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment