Skip to content

Instantly share code, notes, and snippets.

@allisonmorgan
Last active May 3, 2021 23:04
Show Gist options
  • Save allisonmorgan/914ec8237a381b8ed124d0e695c95894 to your computer and use it in GitHub Desktop.
Save allisonmorgan/914ec8237a381b8ed124d0e695c95894 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's say you wanted to learn the fraction of women across a variety of subtopics. Your data contains, for each researcher, whether or not they are a woman and their primary subtopic."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fraction female across dataset: 0.31\n",
"Topic distribution: {'ml': 0.3756, 'nlp': 0.2112, 'hci': 0.2078, 'ai': 0.2054}\n"
]
}
],
"source": [
"N = 5000 # Number of researchers\n",
"\n",
"# Subset of topics with different probabilities of men and women specializing in them\n",
"topics = ['nlp', 'ai', 'ml', 'hci'] \n",
"women_topic_weights = np.array([0.5, 0.2, 0.1, 0.2])\n",
"men_topic_weights = np.array([0.1, 0.2, 0.5, 0.2])\n",
"\n",
"data = pd.DataFrame({'is_woman': np.random.choice([0, 1], N, p=[0.7, 0.3]),\n",
" 'department_id': np.random.choice(range(1, 100), N)})\n",
"# This next line is just assigning a primary topic based on some knowledge about the likliehood of women or men to work in that are. Assuming men and women are unevenly distributed across topics.\n",
"data['primary_topic'] = data['is_woman'].apply(\n",
" lambda x: np.random.choice(topics, 1, p=women_topic_weights)[0] if (x == 1)\n",
" else np.random.choice(topics, 1, p=men_topic_weights)[0])\n",
"\n",
"print('Fraction female across dataset: %.2f' % (sum(data.is_woman)/N))\n",
"print('Topic distribution: ', dict(data.primary_topic.value_counts(normalize=True)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see if we can recover the same distribution of fraction female in each topic via regression and data analysis."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import statsmodels.formula.api as smf\n",
"import statsmodels.api as sm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.489842\n",
" Iterations 6\n"
]
},
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<caption>Logit Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>is_woman</td> <th> No. Observations: </th> <td> 5000</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 4996</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 3</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Tue, 15 Dec 2020</td> <th> Pseudo R-squ.: </th> <td>0.2042</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>09:09:24</td> <th> Log-Likelihood: </th> <td> -2449.2</td> \n",
"</tr>\n",
"<tr>\n",
" <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td> -3077.7</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td>3.268e-272</td>\n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> -0.9088</td> <td> 0.069</td> <td> -13.178</td> <td> 0.000</td> <td> -1.044</td> <td> -0.774</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(primary_topic)[T.hci]</th> <td> 0.1307</td> <td> 0.096</td> <td> 1.361</td> <td> 0.173</td> <td> -0.057</td> <td> 0.319</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(primary_topic)[T.ml]</th> <td> -1.4649</td> <td> 0.108</td> <td> -13.609</td> <td> 0.000</td> <td> -1.676</td> <td> -1.254</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(primary_topic)[T.nlp]</th> <td> 1.7870</td> <td> 0.097</td> <td> 18.508</td> <td> 0.000</td> <td> 1.598</td> <td> 1.976</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary.Summary'>\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: is_woman No. Observations: 5000\n",
"Model: Logit Df Residuals: 4996\n",
"Method: MLE Df Model: 3\n",
"Date: Tue, 15 Dec 2020 Pseudo R-squ.: 0.2042\n",
"Time: 09:09:24 Log-Likelihood: -2449.2\n",
"converged: True LL-Null: -3077.7\n",
"Covariance Type: nonrobust LLR p-value: 3.268e-272\n",
"===========================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------------\n",
"Intercept -0.9088 0.069 -13.178 0.000 -1.044 -0.774\n",
"C(primary_topic)[T.hci] 0.1307 0.096 1.361 0.173 -0.057 0.319\n",
"C(primary_topic)[T.ml] -1.4649 0.108 -13.609 0.000 -1.676 -1.254\n",
"C(primary_topic)[T.nlp] 1.7870 0.097 18.508 0.000 1.598 1.976\n",
"===========================================================================================\n",
"\"\"\""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mod = smf.logit('is_woman ~ C(primary_topic)', data)\n",
"res = mod.fit()\n",
"res.summary()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'nlp': 0.7064393939393939,\n",
" 'ai': 0.2872444011684516,\n",
" 'ml': 0.08519701810436642,\n",
" 'hci': 0.3147256977863331}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dict([(t, res.predict(pd.DataFrame({'primary_topic': [t]}))[0]) for t in topics])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ai': 0.2872444011684518,\n",
" 'hci': 0.314725697786333,\n",
" 'ml': 0.08519701810436635,\n",
" 'nlp': 0.7064393939393939}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dict(data.groupby(['primary_topic']).mean()['is_woman'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Those two distributions look exactly the same :check:. Alright, now let's try to run this as a regression in which the unit of analysis is the department."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"departments = pd.DataFrame(data.groupby(['department_id']).mean()['is_woman']).reset_index()\n",
"\n",
"departments['nlp_researchers'] = departments['department_id'].apply(\n",
" lambda x: data[data.department_id == x].primary_topic.value_counts()['nlp'])\n",
"departments['ai_researchers'] = departments['department_id'].apply(\n",
" lambda x: data[data.department_id == x].primary_topic.value_counts()['ai'])\n",
"departments['ml_researchers'] = departments['department_id'].apply(\n",
" lambda x: data[data.department_id == x].primary_topic.value_counts()['ml'])\n",
"departments['hci_researchers'] = departments['department_id'].apply(\n",
" lambda x: data[data.department_id == x].primary_topic.value_counts()['hci'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.546718\n",
" Iterations 5\n"
]
},
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<caption>Logit Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>is_woman</td> <th> No. Observations: </th> <td> 99</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 98</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 0</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Tue, 15 Dec 2020</td> <th> Pseudo R-squ.: </th> <td>-77.09</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>09:09:24</td> <th> Log-Likelihood: </th> <td> -54.125</td>\n",
"</tr>\n",
"<tr>\n",
" <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td>-0.69315</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td> nan</td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> -0.8246</td> <td> 0.218</td> <td> -3.777</td> <td> 0.000</td> <td> -1.252</td> <td> -0.397</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary.Summary'>\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: is_woman No. Observations: 99\n",
"Model: Logit Df Residuals: 98\n",
"Method: MLE Df Model: 0\n",
"Date: Tue, 15 Dec 2020 Pseudo R-squ.: -77.09\n",
"Time: 09:09:24 Log-Likelihood: -54.125\n",
"converged: True LL-Null: -0.69315\n",
"Covariance Type: nonrobust LLR p-value: nan\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"Intercept -0.8246 0.218 -3.777 0.000 -1.252 -0.397\n",
"==============================================================================\n",
"\"\"\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"simple_mod = smf.logit('is_woman ~ 1', departments)\n",
"simple_res = simple_mod.fit()\n",
"simple_res.summary()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"null_model_predictions = simple_res.predict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's make a fancier model where we know how many researchers from each topic were hired. Does that help us predict the fraction of women at the department level? This model is like yours, but we are learning one coefficient per topic from the department-level researcher counts."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.541322\n",
" Iterations 5\n"
]
},
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<caption>Logit Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>is_woman</td> <th> No. Observations: </th> <td> 99</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 94</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 4</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Tue, 15 Dec 2020</td> <th> Pseudo R-squ.: </th> <td>-76.32</td> \n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>09:09:24</td> <th> Log-Likelihood: </th> <td> -53.591</td>\n",
"</tr>\n",
"<tr>\n",
" <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td>-0.69315</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td> 1.000</td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> -0.8307</td> <td> 1.589</td> <td> -0.523</td> <td> 0.601</td> <td> -3.946</td> <td> 2.285</td>\n",
"</tr>\n",
"<tr>\n",
" <th>nlp_researchers</th> <td> 0.0336</td> <td> 0.059</td> <td> 0.567</td> <td> 0.571</td> <td> -0.083</td> <td> 0.150</td>\n",
"</tr>\n",
"<tr>\n",
" <th>ai_researchers</th> <td> 0.0094</td> <td> 0.075</td> <td> 0.125</td> <td> 0.901</td> <td> -0.138</td> <td> 0.157</td>\n",
"</tr>\n",
"<tr>\n",
" <th>ml_researchers</th> <td> -0.0287</td> <td> 0.052</td> <td> -0.549</td> <td> 0.583</td> <td> -0.131</td> <td> 0.074</td>\n",
"</tr>\n",
"<tr>\n",
" <th>hci_researchers</th> <td> 0.0084</td> <td> 0.073</td> <td> 0.115</td> <td> 0.908</td> <td> -0.135</td> <td> 0.152</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary.Summary'>\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: is_woman No. Observations: 99\n",
"Model: Logit Df Residuals: 94\n",
"Method: MLE Df Model: 4\n",
"Date: Tue, 15 Dec 2020 Pseudo R-squ.: -76.32\n",
"Time: 09:09:24 Log-Likelihood: -53.591\n",
"converged: True LL-Null: -0.69315\n",
"Covariance Type: nonrobust LLR p-value: 1.000\n",
"===================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------\n",
"Intercept -0.8307 1.589 -0.523 0.601 -3.946 2.285\n",
"nlp_researchers 0.0336 0.059 0.567 0.571 -0.083 0.150\n",
"ai_researchers 0.0094 0.075 0.125 0.901 -0.138 0.157\n",
"ml_researchers -0.0287 0.052 -0.549 0.583 -0.131 0.074\n",
"hci_researchers 0.0084 0.073 0.115 0.908 -0.135 0.152\n",
"===================================================================================\n",
"\"\"\""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fancier_model = smf.logit(\n",
" 'is_woman ~ nlp_researchers + ai_researchers + ml_researchers + hci_researchers', departments)\n",
"fancier_res = fancier_model.fit()\n",
"fancier_res.summary()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"fancier_model_predictions = fancier_res.predict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I always struggle with: how do I interpret these parameters on logistic regression? This [link](https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/) is helpful."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Intercept -0.830695\n",
"nlp_researchers 0.033579\n",
"ai_researchers 0.009386\n",
"ml_researchers -0.028682\n",
"hci_researchers 0.008449\n",
"dtype: float64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fancier_res.params"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.03414924055928"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Holding everything else constant, adding one NLP researcher multiplies the odds of being a woman (the female:male ratio) by this factor\n",
"np.exp(fancier_res.params['nlp_researchers'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0094304471581492"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Holding everything else constant, adding one AI researcher multiplies the odds of being a woman (the female:male ratio) by this factor\n",
"np.exp(fancier_res.params['ai_researchers'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9717250353098399"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.exp(fancier_res.params['ml_researchers'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0084845088020287"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.exp(fancier_res.params['hci_researchers'])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(departments.department_id, departments.is_woman, label='Department averages')\n",
"plt.plot(departments.department_id, null_model_predictions, label='Overall average')\n",
"plt.scatter(departments.department_id, fancier_model_predictions, label='Prediction based on hiring in topics')\n",
"plt.legend(frameon=False)\n",
"plt.ylabel('Women to Men Ratio')\n",
"plt.xlabel('Department ID')\n",
"plt.ylim(0, 0.5)\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Is this fancier model better than the simpler one? An ANOVA F-test checks whether the larger of two nested models fits significantly better than the smaller one. The p-value says the fancier model results in a significant increase to the $R^2$."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>df_resid</th>\n",
" <th>ssr</th>\n",
" <th>df_diff</th>\n",
" <th>ss_diff</th>\n",
" <th>F</th>\n",
" <th>Pr(&gt;F)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>98.0</td>\n",
" <td>0.487476</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>94.0</td>\n",
" <td>0.338255</td>\n",
" <td>4.0</td>\n",
" <td>0.149221</td>\n",
" <td>10.36697</td>\n",
" <td>5.343899e-07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" df_resid ssr df_diff ss_diff F Pr(>F)\n",
"0 98.0 0.487476 0.0 NaN NaN NaN\n",
"1 94.0 0.338255 4.0 0.149221 10.36697 5.343899e-07"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"simple_res = smf.ols('is_woman ~ 1', departments).fit()\n",
"fancier_res = smf.ols(\n",
" 'is_woman ~ nlp_researchers + ai_researchers + ml_researchers + hci_researchers', departments).fit()\n",
"sm.stats.anova_lm(simple_res, \n",
" fancier_res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alright let's do this by hand based on [its definition](https://en.wikipedia.org/wiki/F-test#Regression_problems). The F-statistic is $\\frac{(RSS_{1} - RSS_{2})/(p_{2} - p_{1})}{RSS_{2}/(n-p_{2})}$ where $RSS_{1}$ and $RSS_{2}$ are the residual sums of squares of the two models, $p_{1}$ and $p_{2}$ are the number of parameters in each model, and $n$ is the number of observations."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"rss_1 = sum((simple_res.resid)**2)\n",
"rss_2 = sum((fancier_res.resid)**2)\n",
"\n",
"p_1 = len(simple_res.params)\n",
"p_2 = len(fancier_res.params)\n",
"\n",
"n = len(departments)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10.366970002659706\n"
]
}
],
"source": [
"f_stat = ((rss_1 - rss_2)/(p_2 - p_1))/(rss_2/(n - p_2))\n",
"print(f_stat) # Notice this is the same as in the table above!"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5.343899306398114e-07\n"
]
}
],
"source": [
"import scipy\n",
"p_value = 1-scipy.stats.f.cdf(f_stat, (p_2 - p_1), (n - p_2))\n",
"print(p_value)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice these same values are provided on the regression result for the fancier model. The F-statistic printed out always compares the model to an intercept-only model, and the probability of that F-statistic is used to figure out whether our fancy model significantly reduces the residual sum of squares compared to a simple intercept-only model."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<caption>OLS Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>is_woman</td> <th> R-squared: </th> <td> 0.306</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>OLS</td> <th> Adj. R-squared: </th> <td> 0.277</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 10.37</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Tue, 15 Dec 2020</td> <th> Prob (F-statistic):</th> <td>5.34e-07</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>09:09:25</td> <th> Log-Likelihood: </th> <td> 140.64</td>\n",
"</tr>\n",
"<tr>\n",
" <th>No. Observations:</th> <td> 99</td> <th> AIC: </th> <td> -271.3</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Residuals:</th> <td> 94</td> <th> BIC: </th> <td> -258.3</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Model:</th> <td> 4</td> <th> </th> <td> </td> \n",
"</tr>\n",
"<tr>\n",
" <th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[0.025</th> <th>0.975]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> 0.3045</td> <td> 0.044</td> <td> 6.987</td> <td> 0.000</td> <td> 0.218</td> <td> 0.391</td>\n",
"</tr>\n",
"<tr>\n",
" <th>nlp_researchers</th> <td> 0.0073</td> <td> 0.002</td> <td> 4.376</td> <td> 0.000</td> <td> 0.004</td> <td> 0.011</td>\n",
"</tr>\n",
"<tr>\n",
" <th>ai_researchers</th> <td> 0.0021</td> <td> 0.002</td> <td> 1.009</td> <td> 0.316</td> <td> -0.002</td> <td> 0.006</td>\n",
"</tr>\n",
"<tr>\n",
" <th>ml_researchers</th> <td> -0.0062</td> <td> 0.001</td> <td> -4.271</td> <td> 0.000</td> <td> -0.009</td> <td> -0.003</td>\n",
"</tr>\n",
"<tr>\n",
" <th>hci_researchers</th> <td> 0.0018</td> <td> 0.002</td> <td> 0.880</td> <td> 0.381</td> <td> -0.002</td> <td> 0.006</td>\n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <th>Omnibus:</th> <td> 0.661</td> <th> Durbin-Watson: </th> <td> 2.097</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Prob(Omnibus):</th> <td> 0.718</td> <th> Jarque-Bera (JB): </th> <td> 0.697</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Skew:</th> <td>-0.189</td> <th> Prob(JB): </th> <td> 0.706</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Kurtosis:</th> <td> 2.838</td> <th> Cond. No. </th> <td> 192.</td>\n",
"</tr>\n",
"</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
],
"text/plain": [
"<class 'statsmodels.iolib.summary.Summary'>\n",
"\"\"\"\n",
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: is_woman R-squared: 0.306\n",
"Model: OLS Adj. R-squared: 0.277\n",
"Method: Least Squares F-statistic: 10.37\n",
"Date: Tue, 15 Dec 2020 Prob (F-statistic): 5.34e-07\n",
"Time: 09:09:25 Log-Likelihood: 140.64\n",
"No. Observations: 99 AIC: -271.3\n",
"Df Residuals: 94 BIC: -258.3\n",
"Df Model: 4 \n",
"Covariance Type: nonrobust \n",
"===================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------\n",
"Intercept 0.3045 0.044 6.987 0.000 0.218 0.391\n",
"nlp_researchers 0.0073 0.002 4.376 0.000 0.004 0.011\n",
"ai_researchers 0.0021 0.002 1.009 0.316 -0.002 0.006\n",
"ml_researchers -0.0062 0.001 -4.271 0.000 -0.009 -0.003\n",
"hci_researchers 0.0018 0.002 0.880 0.381 -0.002 0.006\n",
"==============================================================================\n",
"Omnibus: 0.661 Durbin-Watson: 2.097\n",
"Prob(Omnibus): 0.718 Jarque-Bera (JB): 0.697\n",
"Skew: -0.189 Prob(JB): 0.706\n",
"Kurtosis: 2.838 Cond. No. 192.\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"\"\"\""
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fancier_res.summary()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3061085772316079\n"
]
}
],
"source": [
"ssr = sum((departments.is_woman - fancier_res.predict())**2)\n",
"sst = sum((departments.is_woman - departments.is_woman.mean())**2)\n",
"r2 = 1 - (ssr/sst)\n",
"print(r2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n"
]
}
],
"source": [
"ssr = sum((departments.is_woman - simple_res.predict())**2)\n",
"sst = sum((departments.is_woman - departments.is_woman.mean())**2)\n",
"r2 = 1 - (ssr/sst)\n",
"print(r2) # This should be zero because the sst == ssr. The simplest model is just predicting the average"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment