Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
{
"cells": [
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import xgboost as xgb\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import plot_confusion_matrix\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import RepeatedStratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.feature_selection import RFE\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"import numpy as np\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import umap\n",
"import umap.plot\n",
"\n",
"from imblearn.under_sampling import RandomUnderSampler"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>typealyzer</th>\n",
" <th>actual</th>\n",
" <th>e</th>\n",
" <th>s</th>\n",
" <th>t</th>\n",
" <th>sntf_s</th>\n",
" <th>sntf_n</th>\n",
" <th>sntf_t</th>\n",
" <th>sntf_f</th>\n",
" <th>...</th>\n",
" <th>sad</th>\n",
" <th>you</th>\n",
" <th>cogmech</th>\n",
" <th>auxverb</th>\n",
" <th>they</th>\n",
" <th>incl</th>\n",
" <th>money</th>\n",
" <th>feel</th>\n",
" <th>we</th>\n",
" <th>hear</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>http://adropofcolour.tumblr.com</td>\n",
" <td>ISFP</td>\n",
" <td>INFJ</td>\n",
" <td>0.291281</td>\n",
" <td>0.787844</td>\n",
" <td>0.460961</td>\n",
" <td>0.663515</td>\n",
" <td>0.178565</td>\n",
" <td>0.069282</td>\n",
" <td>0.088638</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.019704</td>\n",
" <td>0.098522</td>\n",
" <td>0.147783</td>\n",
" <td>0.000000</td>\n",
" <td>0.039409</td>\n",
" <td>0.009852</td>\n",
" <td>0.019704</td>\n",
" <td>0.044335</td>\n",
" <td>0.009852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://godheadcomplex.tumblr.com</td>\n",
" <td>ESFP</td>\n",
" <td>INFP</td>\n",
" <td>0.883579</td>\n",
" <td>0.951693</td>\n",
" <td>0.238407</td>\n",
" <td>0.855921</td>\n",
" <td>0.046931</td>\n",
" <td>0.021850</td>\n",
" <td>0.075297</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.017513</td>\n",
" <td>0.201401</td>\n",
" <td>0.084063</td>\n",
" <td>0.001751</td>\n",
" <td>0.056042</td>\n",
" <td>0.007005</td>\n",
" <td>0.017513</td>\n",
" <td>0.047285</td>\n",
" <td>0.003503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>http://chaotikaeon2.tumblr.com</td>\n",
" <td>INTJ</td>\n",
" <td>INTP</td>\n",
" <td>0.332444</td>\n",
" <td>0.357863</td>\n",
" <td>0.591322</td>\n",
" <td>0.147668</td>\n",
" <td>0.252326</td>\n",
" <td>0.339831</td>\n",
" <td>0.260175</td>\n",
" <td>...</td>\n",
" <td>0.003283</td>\n",
" <td>0.014540</td>\n",
" <td>0.181989</td>\n",
" <td>0.114916</td>\n",
" <td>0.000938</td>\n",
" <td>0.071295</td>\n",
" <td>0.010319</td>\n",
" <td>0.008912</td>\n",
" <td>0.054409</td>\n",
" <td>0.014540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>http://perpetually-in-transit.blogspot.com</td>\n",
" <td>ESFP</td>\n",
" <td>ENFJ</td>\n",
" <td>0.944394</td>\n",
" <td>0.943192</td>\n",
" <td>0.105527</td>\n",
" <td>0.778825</td>\n",
" <td>0.051134</td>\n",
" <td>0.017299</td>\n",
" <td>0.152742</td>\n",
" <td>...</td>\n",
" <td>0.002497</td>\n",
" <td>0.018727</td>\n",
" <td>0.207241</td>\n",
" <td>0.104869</td>\n",
" <td>0.002497</td>\n",
" <td>0.049938</td>\n",
" <td>0.014981</td>\n",
" <td>0.011236</td>\n",
" <td>0.041199</td>\n",
" <td>0.017478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>http://museofmystery.wordpress.com/2012/08/29/...</td>\n",
" <td>ISTP</td>\n",
" <td>INFP</td>\n",
" <td>0.073352</td>\n",
" <td>0.850472</td>\n",
" <td>0.608812</td>\n",
" <td>0.628322</td>\n",
" <td>0.112762</td>\n",
" <td>0.149270</td>\n",
" <td>0.109646</td>\n",
" <td>...</td>\n",
" <td>0.001031</td>\n",
" <td>0.005155</td>\n",
" <td>0.215464</td>\n",
" <td>0.122680</td>\n",
" <td>0.005155</td>\n",
" <td>0.043299</td>\n",
" <td>0.019588</td>\n",
" <td>0.002062</td>\n",
" <td>0.021649</td>\n",
" <td>0.012371</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25432</th>\n",
" <td>http://pistoche.tumblr.com</td>\n",
" <td>ESFP</td>\n",
" <td>INTJ</td>\n",
" <td>0.685653</td>\n",
" <td>0.969891</td>\n",
" <td>0.480241</td>\n",
" <td>0.960824</td>\n",
" <td>0.029758</td>\n",
" <td>0.004220</td>\n",
" <td>0.005199</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25433</th>\n",
" <td>http://lokh.tumblr.com</td>\n",
" <td>ISTP</td>\n",
" <td>INTP</td>\n",
" <td>0.201637</td>\n",
" <td>0.553602</td>\n",
" <td>0.662618</td>\n",
" <td>0.468074</td>\n",
" <td>0.374926</td>\n",
" <td>0.099968</td>\n",
" <td>0.057033</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25435</th>\n",
" <td>http://readerdye.tumblr.com</td>\n",
" <td>ISTP</td>\n",
" <td>INFP</td>\n",
" <td>0.375704</td>\n",
" <td>0.756593</td>\n",
" <td>0.740688</td>\n",
" <td>0.697536</td>\n",
" <td>0.229456</td>\n",
" <td>0.051684</td>\n",
" <td>0.021324</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25436</th>\n",
" <td>http://loveisart.tumblr.com</td>\n",
" <td>ISTP</td>\n",
" <td>ENFP</td>\n",
" <td>0.002516</td>\n",
" <td>0.848823</td>\n",
" <td>0.661502</td>\n",
" <td>0.584138</td>\n",
" <td>0.118812</td>\n",
" <td>0.192779</td>\n",
" <td>0.104271</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25437</th>\n",
" <td>http://angelalll.tumblr.com</td>\n",
" <td>ESTP</td>\n",
" <td>INFP</td>\n",
" <td>0.814616</td>\n",
" <td>0.652280</td>\n",
" <td>0.832608</td>\n",
" <td>0.518149</td>\n",
" <td>0.281291</td>\n",
" <td>0.163392</td>\n",
" <td>0.037168</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>22919 rows × 115 columns</p>\n",
"</div>"
],
"text/plain": [
" url typealyzer actual \\\n",
"1 http://adropofcolour.tumblr.com ISFP INFJ \n",
"2 http://godheadcomplex.tumblr.com ESFP INFP \n",
"3 http://chaotikaeon2.tumblr.com INTJ INTP \n",
"5 http://perpetually-in-transit.blogspot.com ESFP ENFJ \n",
"10 http://museofmystery.wordpress.com/2012/08/29/... ISTP INFP \n",
"... ... ... ... \n",
"25432 http://pistoche.tumblr.com ESFP INTJ \n",
"25433 http://lokh.tumblr.com ISTP INTP \n",
"25435 http://readerdye.tumblr.com ISTP INFP \n",
"25436 http://loveisart.tumblr.com ISTP ENFP \n",
"25437 http://angelalll.tumblr.com ESTP INFP \n",
"\n",
" e s t sntf_s sntf_n sntf_t sntf_f \\\n",
"1 0.291281 0.787844 0.460961 0.663515 0.178565 0.069282 0.088638 \n",
"2 0.883579 0.951693 0.238407 0.855921 0.046931 0.021850 0.075297 \n",
"3 0.332444 0.357863 0.591322 0.147668 0.252326 0.339831 0.260175 \n",
"5 0.944394 0.943192 0.105527 0.778825 0.051134 0.017299 0.152742 \n",
"10 0.073352 0.850472 0.608812 0.628322 0.112762 0.149270 0.109646 \n",
"... ... ... ... ... ... ... ... \n",
"25432 0.685653 0.969891 0.480241 0.960824 0.029758 0.004220 0.005199 \n",
"25433 0.201637 0.553602 0.662618 0.468074 0.374926 0.099968 0.057033 \n",
"25435 0.375704 0.756593 0.740688 0.697536 0.229456 0.051684 0.021324 \n",
"25436 0.002516 0.848823 0.661502 0.584138 0.118812 0.192779 0.104271 \n",
"25437 0.814616 0.652280 0.832608 0.518149 0.281291 0.163392 0.037168 \n",
"\n",
" ... sad you cogmech auxverb they incl \\\n",
"1 ... 0.000000 0.019704 0.098522 0.147783 0.000000 0.039409 \n",
"2 ... 0.000000 0.017513 0.201401 0.084063 0.001751 0.056042 \n",
"3 ... 0.003283 0.014540 0.181989 0.114916 0.000938 0.071295 \n",
"5 ... 0.002497 0.018727 0.207241 0.104869 0.002497 0.049938 \n",
"10 ... 0.001031 0.005155 0.215464 0.122680 0.005155 0.043299 \n",
"... ... ... ... ... ... ... ... \n",
"25432 ... NaN NaN NaN NaN NaN NaN \n",
"25433 ... NaN NaN NaN NaN NaN NaN \n",
"25435 ... NaN NaN NaN NaN NaN NaN \n",
"25436 ... NaN NaN NaN NaN NaN NaN \n",
"25437 ... NaN NaN NaN NaN NaN NaN \n",
"\n",
" money feel we hear \n",
"1 0.009852 0.019704 0.044335 0.009852 \n",
"2 0.007005 0.017513 0.047285 0.003503 \n",
"3 0.010319 0.008912 0.054409 0.014540 \n",
"5 0.014981 0.011236 0.041199 0.017478 \n",
"10 0.019588 0.002062 0.021649 0.012371 \n",
"... ... ... ... ... \n",
"25432 NaN NaN NaN NaN \n",
"25433 NaN NaN NaN NaN \n",
"25435 NaN NaN NaN NaN \n",
"25436 NaN NaN NaN NaN \n",
"25437 NaN NaN NaN NaN \n",
"\n",
"[22919 rows x 115 columns]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the enriched survey frame from its pickle file and display it.\n",
"survey_pickle = \"dataframe_survey_2018-01-23_enriched.pickle\"\n",
"df = pd.read_pickle(survey_pickle)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 f\n",
"2 t\n",
"3 f\n",
"5 f\n",
"10 t\n",
" ..\n",
"25432 f\n",
"25433 f\n",
"25435 t\n",
"25436 f\n",
"25437 n\n",
"Name: func, Length: 22919, dtype: object"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.func # 4 cognitive functions without their attitudinal directions introversion/extraversion"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/mindalyzer/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>negate</th>\n",
" <th>ppron</th>\n",
" <th>nonfl</th>\n",
" <th>i</th>\n",
" <th>relativ</th>\n",
" <th>percept</th>\n",
" <th>quant</th>\n",
" <th>affect</th>\n",
" <th>shehe</th>\n",
" <th>achieve</th>\n",
" <th>...</th>\n",
" <th>you</th>\n",
" <th>cogmech</th>\n",
" <th>auxverb</th>\n",
" <th>they</th>\n",
" <th>incl</th>\n",
" <th>money</th>\n",
" <th>feel</th>\n",
" <th>we</th>\n",
" <th>hear</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.034483</td>\n",
" <td>0.428571</td>\n",
" <td>0.049261</td>\n",
" <td>0.334975</td>\n",
" <td>0.197044</td>\n",
" <td>0.039409</td>\n",
" <td>0.024631</td>\n",
" <td>0.088670</td>\n",
" <td>0.029557</td>\n",
" <td>0.024631</td>\n",
" <td>...</td>\n",
" <td>0.019704</td>\n",
" <td>0.098522</td>\n",
" <td>0.147783</td>\n",
" <td>0.000000</td>\n",
" <td>0.039409</td>\n",
" <td>0.009852</td>\n",
" <td>0.019704</td>\n",
" <td>0.044335</td>\n",
" <td>0.009852</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.040280</td>\n",
" <td>0.416813</td>\n",
" <td>0.063047</td>\n",
" <td>0.285464</td>\n",
" <td>0.316988</td>\n",
" <td>0.031524</td>\n",
" <td>0.029772</td>\n",
" <td>0.082312</td>\n",
" <td>0.064799</td>\n",
" <td>0.012259</td>\n",
" <td>...</td>\n",
" <td>0.017513</td>\n",
" <td>0.201401</td>\n",
" <td>0.084063</td>\n",
" <td>0.001751</td>\n",
" <td>0.056042</td>\n",
" <td>0.007005</td>\n",
" <td>0.017513</td>\n",
" <td>0.047285</td>\n",
" <td>0.003503</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.017824</td>\n",
" <td>0.439962</td>\n",
" <td>0.068949</td>\n",
" <td>0.277674</td>\n",
" <td>0.353189</td>\n",
" <td>0.040807</td>\n",
" <td>0.031895</td>\n",
" <td>0.090994</td>\n",
" <td>0.092402</td>\n",
" <td>0.018293</td>\n",
" <td>...</td>\n",
" <td>0.014540</td>\n",
" <td>0.181989</td>\n",
" <td>0.114916</td>\n",
" <td>0.000938</td>\n",
" <td>0.071295</td>\n",
" <td>0.010319</td>\n",
" <td>0.008912</td>\n",
" <td>0.054409</td>\n",
" <td>0.014540</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.038702</td>\n",
" <td>0.377029</td>\n",
" <td>0.049938</td>\n",
" <td>0.233458</td>\n",
" <td>0.223471</td>\n",
" <td>0.046192</td>\n",
" <td>0.041199</td>\n",
" <td>0.072409</td>\n",
" <td>0.081149</td>\n",
" <td>0.024969</td>\n",
" <td>...</td>\n",
" <td>0.018727</td>\n",
" <td>0.207241</td>\n",
" <td>0.104869</td>\n",
" <td>0.002497</td>\n",
" <td>0.049938</td>\n",
" <td>0.014981</td>\n",
" <td>0.011236</td>\n",
" <td>0.041199</td>\n",
" <td>0.017478</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.014433</td>\n",
" <td>0.479381</td>\n",
" <td>0.091753</td>\n",
" <td>0.364948</td>\n",
" <td>0.319588</td>\n",
" <td>0.026804</td>\n",
" <td>0.047423</td>\n",
" <td>0.102062</td>\n",
" <td>0.082474</td>\n",
" <td>0.030928</td>\n",
" <td>...</td>\n",
" <td>0.005155</td>\n",
" <td>0.215464</td>\n",
" <td>0.122680</td>\n",
" <td>0.005155</td>\n",
" <td>0.043299</td>\n",
" <td>0.019588</td>\n",
" <td>0.002062</td>\n",
" <td>0.021649</td>\n",
" <td>0.012371</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22914</th>\n",
" <td>0.091241</td>\n",
" <td>0.390511</td>\n",
" <td>0.040146</td>\n",
" <td>0.244526</td>\n",
" <td>0.306569</td>\n",
" <td>0.025547</td>\n",
" <td>0.021898</td>\n",
" <td>0.065693</td>\n",
" <td>0.025547</td>\n",
" <td>0.021898</td>\n",
" <td>...</td>\n",
" <td>0.040146</td>\n",
" <td>0.244526</td>\n",
" <td>0.069343</td>\n",
" <td>0.003650</td>\n",
" <td>0.091241</td>\n",
" <td>0.007299</td>\n",
" <td>0.000000</td>\n",
" <td>0.076642</td>\n",
" <td>0.007299</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22915</th>\n",
" <td>0.020309</td>\n",
" <td>0.508530</td>\n",
" <td>0.053615</td>\n",
" <td>0.351340</td>\n",
" <td>0.287571</td>\n",
" <td>0.049553</td>\n",
" <td>0.045491</td>\n",
" <td>0.119415</td>\n",
" <td>0.084890</td>\n",
" <td>0.017059</td>\n",
" <td>...</td>\n",
" <td>0.019090</td>\n",
" <td>0.211210</td>\n",
" <td>0.150690</td>\n",
" <td>0.012998</td>\n",
" <td>0.045085</td>\n",
" <td>0.016247</td>\n",
" <td>0.016653</td>\n",
" <td>0.040211</td>\n",
" <td>0.010154</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22916</th>\n",
" <td>0.029458</td>\n",
" <td>0.482904</td>\n",
" <td>0.061021</td>\n",
" <td>0.307207</td>\n",
" <td>0.262493</td>\n",
" <td>0.039979</td>\n",
" <td>0.028406</td>\n",
" <td>0.110994</td>\n",
" <td>0.124671</td>\n",
" <td>0.025776</td>\n",
" <td>...</td>\n",
" <td>0.017885</td>\n",
" <td>0.197791</td>\n",
" <td>0.162020</td>\n",
" <td>0.006312</td>\n",
" <td>0.042083</td>\n",
" <td>0.010521</td>\n",
" <td>0.013677</td>\n",
" <td>0.026828</td>\n",
" <td>0.010521</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22917</th>\n",
" <td>0.113971</td>\n",
" <td>0.139706</td>\n",
" <td>0.025735</td>\n",
" <td>0.084559</td>\n",
" <td>0.136029</td>\n",
" <td>0.022059</td>\n",
" <td>0.018382</td>\n",
" <td>0.040441</td>\n",
" <td>0.025735</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.014706</td>\n",
" <td>0.113971</td>\n",
" <td>0.069853</td>\n",
" <td>0.007353</td>\n",
" <td>0.011029</td>\n",
" <td>0.000000</td>\n",
" <td>0.007353</td>\n",
" <td>0.007353</td>\n",
" <td>0.000000</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22918</th>\n",
" <td>0.003861</td>\n",
" <td>0.455598</td>\n",
" <td>0.057915</td>\n",
" <td>0.333977</td>\n",
" <td>0.322394</td>\n",
" <td>0.019305</td>\n",
" <td>0.036680</td>\n",
" <td>0.073359</td>\n",
" <td>0.075290</td>\n",
" <td>0.017375</td>\n",
" <td>...</td>\n",
" <td>0.027027</td>\n",
" <td>0.133205</td>\n",
" <td>0.113900</td>\n",
" <td>0.003861</td>\n",
" <td>0.055985</td>\n",
" <td>0.003861</td>\n",
" <td>0.003861</td>\n",
" <td>0.015444</td>\n",
" <td>0.011583</td>\n",
" <td>n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20819 rows × 65 columns</p>\n",
"</div>"
],
"text/plain": [
" negate ppron nonfl i relativ percept quant \\\n",
"1 0.034483 0.428571 0.049261 0.334975 0.197044 0.039409 0.024631 \n",
"2 0.040280 0.416813 0.063047 0.285464 0.316988 0.031524 0.029772 \n",
"3 0.017824 0.439962 0.068949 0.277674 0.353189 0.040807 0.031895 \n",
"5 0.038702 0.377029 0.049938 0.233458 0.223471 0.046192 0.041199 \n",
"10 0.014433 0.479381 0.091753 0.364948 0.319588 0.026804 0.047423 \n",
"... ... ... ... ... ... ... ... \n",
"22914 0.091241 0.390511 0.040146 0.244526 0.306569 0.025547 0.021898 \n",
"22915 0.020309 0.508530 0.053615 0.351340 0.287571 0.049553 0.045491 \n",
"22916 0.029458 0.482904 0.061021 0.307207 0.262493 0.039979 0.028406 \n",
"22917 0.113971 0.139706 0.025735 0.084559 0.136029 0.022059 0.018382 \n",
"22918 0.003861 0.455598 0.057915 0.333977 0.322394 0.019305 0.036680 \n",
"\n",
" affect shehe achieve ... you cogmech auxverb \\\n",
"1 0.088670 0.029557 0.024631 ... 0.019704 0.098522 0.147783 \n",
"2 0.082312 0.064799 0.012259 ... 0.017513 0.201401 0.084063 \n",
"3 0.090994 0.092402 0.018293 ... 0.014540 0.181989 0.114916 \n",
"5 0.072409 0.081149 0.024969 ... 0.018727 0.207241 0.104869 \n",
"10 0.102062 0.082474 0.030928 ... 0.005155 0.215464 0.122680 \n",
"... ... ... ... ... ... ... ... \n",
"22914 0.065693 0.025547 0.021898 ... 0.040146 0.244526 0.069343 \n",
"22915 0.119415 0.084890 0.017059 ... 0.019090 0.211210 0.150690 \n",
"22916 0.110994 0.124671 0.025776 ... 0.017885 0.197791 0.162020 \n",
"22917 0.040441 0.025735 0.000000 ... 0.014706 0.113971 0.069853 \n",
"22918 0.073359 0.075290 0.017375 ... 0.027027 0.133205 0.113900 \n",
"\n",
" they incl money feel we hear y \n",
"1 0.000000 0.039409 0.009852 0.019704 0.044335 0.009852 f \n",
"2 0.001751 0.056042 0.007005 0.017513 0.047285 0.003503 t \n",
"3 0.000938 0.071295 0.010319 0.008912 0.054409 0.014540 f \n",
"5 0.002497 0.049938 0.014981 0.011236 0.041199 0.017478 f \n",
"10 0.005155 0.043299 0.019588 0.002062 0.021649 0.012371 t \n",
"... ... ... ... ... ... ... .. \n",
"22914 0.003650 0.091241 0.007299 0.000000 0.076642 0.007299 t \n",
"22915 0.012998 0.045085 0.016247 0.016653 0.040211 0.010154 n \n",
"22916 0.006312 0.042083 0.010521 0.013677 0.026828 0.010521 n \n",
"22917 0.007353 0.011029 0.000000 0.007353 0.007353 0.000000 f \n",
"22918 0.003861 0.055985 0.003861 0.003861 0.015444 0.011583 n \n",
"\n",
"[20819 rows x 65 columns]"
]
},
"execution_count": 144,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"liwc_cols = [\"negate\",\"ppron\",\"nonfl\",\"i\",\"relativ\",\"percept\",\"quant\",\"affect\",\"shehe\",\"achieve\",\"bio\",\"leisure\",\"conj\",\"motion\",\"posemo\",\"adverb\",\"home\",\"future\",\"negemo\",\"number\",\"inhib\",\"humans\",\"pronoun\",\"excl\",\"space\",\"tentat\",\"see\",\"past\",\"anx\",\"family\",\"present\",\"health\",\"verb\",\"certain\",\"anger\",\"preps\",\"swear\",\"ingest\",\"discrep\",\"friend\",\"relig\",\"time\",\"cause\",\"article\",\"body\",\"social\",\"assent\",\"work\",\"sexual\",\"insight\",\"ipron\",\"filler\",\"death\",\"funct\",\"sad\",\"you\",\"cogmech\",\"auxverb\",\"they\",\"incl\",\"money\",\"feel\",\"we\",\"hear\"]\n",
"# Build the modelling frame in one chain: select the LIWC feature columns, attach the\n",
"# target column `y` (taken from df.func), and drop rows with any missing value.\n",
"# `.assign` returns a new frame, so nothing is written into a selection of `df`;\n",
"# the earlier in-place `data[\"y\"] = ...` triggered pandas' SettingWithCopyWarning.\n",
"data = df[liwc_cols].assign(y=df.func).dropna()\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65\n"
]
}
],
"source": [
"print(len(data.columns))\n",
"# Select columns by name rather than by position: the previous `iloc[:, 0:63]` slice\n",
"# stopped one column short and silently dropped the last LIWC feature (\"hear\").\n",
"y = data[[\"y\"]]  # kept as a one-column DataFrame, as before\n",
"X = data.drop(columns=[\"y\"])  # every column except the target\n"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=2045,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4164, 63)"
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test.values.shape"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4164,)"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.values.ravel().shape"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n 6883\n",
"f 4489\n",
"t 3278\n",
"s 2005\n",
"Name: y, dtype: int64"
]
},
"execution_count": 161,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train.y.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(y_train.iloc[:,0])"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=0, num_parallel_tree=1,\n",
" objective='multi:softprob', random_state=0, reg_alpha=0,\n",
" reg_lambda=1, scale_pos_weight=None, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit an XGBoost classifier with default hyper-parameters on the training split.\n",
"# The target is passed as a Series (the single `y` column of y_train).\n",
"model = xgb.XGBClassifier()\n",
"model.fit(X_train, y_train[\"y\"])"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
"y_pred = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.3547070124879923"
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of held-out rows whose predicted class equals the true label.\n",
"accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
"accuracy"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x864 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Confusion matrix of the fitted model on the held-out split, drawn on a 12x12 figure.\n",
"# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
"# on newer versions use ConfusionMatrixDisplay.from_estimator instead.\n",
"fig, ax = plt.subplots(figsize=(12, 12))\n",
"plot_confusion_matrix(estimator=model, X=X_test, y_true=y_test, ax=ax)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# With balanced classes"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"# Randomly undersample every class except the smallest one (seeded for reproducibility).\n",
"# `data` still contains the label column, so `data_balanced` carries `y` along as well;\n",
"# any later feature selection has to exclude it.\n",
"rus = RandomUnderSampler(\n",
"    sampling_strategy='not minority',\n",
"    random_state=1,\n",
")\n",
"data_balanced, balanced_y = rus.fit_resample(data, data['y'])"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>negate</th>\n",
" <th>ppron</th>\n",
" <th>nonfl</th>\n",
" <th>i</th>\n",
" <th>relativ</th>\n",
" <th>percept</th>\n",
" <th>quant</th>\n",
" <th>affect</th>\n",
" <th>shehe</th>\n",
" <th>achieve</th>\n",
" <th>...</th>\n",
" <th>you</th>\n",
" <th>cogmech</th>\n",
" <th>auxverb</th>\n",
" <th>they</th>\n",
" <th>incl</th>\n",
" <th>money</th>\n",
" <th>feel</th>\n",
" <th>we</th>\n",
" <th>hear</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.050562</td>\n",
" <td>0.252809</td>\n",
" <td>0.033708</td>\n",
" <td>0.188202</td>\n",
" <td>0.160112</td>\n",
" <td>0.025281</td>\n",
" <td>0.016854</td>\n",
" <td>0.070225</td>\n",
" <td>0.030899</td>\n",
" <td>0.008427</td>\n",
" <td>...</td>\n",
" <td>0.002809</td>\n",
" <td>0.115169</td>\n",
" <td>0.115169</td>\n",
" <td>0.002809</td>\n",
" <td>0.025281</td>\n",
" <td>0.005618</td>\n",
" <td>0.016854</td>\n",
" <td>0.028090</td>\n",
" <td>0.002809</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.038462</td>\n",
" <td>0.378205</td>\n",
" <td>0.054487</td>\n",
" <td>0.272436</td>\n",
" <td>0.285256</td>\n",
" <td>0.054487</td>\n",
" <td>0.038462</td>\n",
" <td>0.108974</td>\n",
" <td>0.057692</td>\n",
" <td>0.006410</td>\n",
" <td>...</td>\n",
" <td>0.012821</td>\n",
" <td>0.227564</td>\n",
" <td>0.108974</td>\n",
" <td>0.000000</td>\n",
" <td>0.083333</td>\n",
" <td>0.003205</td>\n",
" <td>0.019231</td>\n",
" <td>0.035256</td>\n",
" <td>0.022436</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.018786</td>\n",
" <td>0.552023</td>\n",
" <td>0.046243</td>\n",
" <td>0.341040</td>\n",
" <td>0.339595</td>\n",
" <td>0.052023</td>\n",
" <td>0.066474</td>\n",
" <td>0.114162</td>\n",
" <td>0.135838</td>\n",
" <td>0.026012</td>\n",
" <td>...</td>\n",
" <td>0.027457</td>\n",
" <td>0.225434</td>\n",
" <td>0.167630</td>\n",
" <td>0.020231</td>\n",
" <td>0.049133</td>\n",
" <td>0.010116</td>\n",
" <td>0.018786</td>\n",
" <td>0.027457</td>\n",
" <td>0.020231</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.039113</td>\n",
" <td>0.413299</td>\n",
" <td>0.061278</td>\n",
" <td>0.320730</td>\n",
" <td>0.251630</td>\n",
" <td>0.053455</td>\n",
" <td>0.032595</td>\n",
" <td>0.067797</td>\n",
" <td>0.053455</td>\n",
" <td>0.002608</td>\n",
" <td>...</td>\n",
" <td>0.010430</td>\n",
" <td>0.170795</td>\n",
" <td>0.088657</td>\n",
" <td>0.001304</td>\n",
" <td>0.045632</td>\n",
" <td>0.005215</td>\n",
" <td>0.013038</td>\n",
" <td>0.027379</td>\n",
" <td>0.007823</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.026490</td>\n",
" <td>0.394702</td>\n",
" <td>0.079470</td>\n",
" <td>0.270199</td>\n",
" <td>0.241060</td>\n",
" <td>0.046358</td>\n",
" <td>0.027815</td>\n",
" <td>0.092715</td>\n",
" <td>0.070199</td>\n",
" <td>0.023841</td>\n",
" <td>...</td>\n",
" <td>0.027815</td>\n",
" <td>0.143046</td>\n",
" <td>0.092715</td>\n",
" <td>0.000000</td>\n",
" <td>0.018543</td>\n",
" <td>0.006623</td>\n",
" <td>0.010596</td>\n",
" <td>0.026490</td>\n",
" <td>0.025166</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10031</th>\n",
" <td>0.027867</td>\n",
" <td>0.454984</td>\n",
" <td>0.065380</td>\n",
" <td>0.274920</td>\n",
" <td>0.310289</td>\n",
" <td>0.032154</td>\n",
" <td>0.029475</td>\n",
" <td>0.088424</td>\n",
" <td>0.138800</td>\n",
" <td>0.012326</td>\n",
" <td>...</td>\n",
" <td>0.013398</td>\n",
" <td>0.190782</td>\n",
" <td>0.150054</td>\n",
" <td>0.003751</td>\n",
" <td>0.043944</td>\n",
" <td>0.008039</td>\n",
" <td>0.007503</td>\n",
" <td>0.024116</td>\n",
" <td>0.013934</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10032</th>\n",
" <td>0.052083</td>\n",
" <td>0.322917</td>\n",
" <td>0.050000</td>\n",
" <td>0.229167</td>\n",
" <td>0.237500</td>\n",
" <td>0.018750</td>\n",
" <td>0.006250</td>\n",
" <td>0.093750</td>\n",
" <td>0.041667</td>\n",
" <td>0.010417</td>\n",
" <td>...</td>\n",
" <td>0.008333</td>\n",
" <td>0.129167</td>\n",
" <td>0.079167</td>\n",
" <td>0.002083</td>\n",
" <td>0.012500</td>\n",
" <td>0.002083</td>\n",
" <td>0.006250</td>\n",
" <td>0.041667</td>\n",
" <td>0.004167</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10033</th>\n",
" <td>0.128889</td>\n",
" <td>0.306667</td>\n",
" <td>0.017778</td>\n",
" <td>0.213333</td>\n",
" <td>0.284444</td>\n",
" <td>0.022222</td>\n",
" <td>0.013333</td>\n",
" <td>0.075556</td>\n",
" <td>0.022222</td>\n",
" <td>0.017778</td>\n",
" <td>...</td>\n",
" <td>0.026667</td>\n",
" <td>0.155556</td>\n",
" <td>0.053333</td>\n",
" <td>0.000000</td>\n",
" <td>0.031111</td>\n",
" <td>0.008889</td>\n",
" <td>0.008889</td>\n",
" <td>0.044444</td>\n",
" <td>0.000000</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10034</th>\n",
" <td>0.048276</td>\n",
" <td>0.439655</td>\n",
" <td>0.047414</td>\n",
" <td>0.289655</td>\n",
" <td>0.238793</td>\n",
" <td>0.046552</td>\n",
" <td>0.056034</td>\n",
" <td>0.106034</td>\n",
" <td>0.076724</td>\n",
" <td>0.015517</td>\n",
" <td>...</td>\n",
" <td>0.020690</td>\n",
" <td>0.199138</td>\n",
" <td>0.129310</td>\n",
" <td>0.000862</td>\n",
" <td>0.052586</td>\n",
" <td>0.008621</td>\n",
" <td>0.012069</td>\n",
" <td>0.051724</td>\n",
" <td>0.022414</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10035</th>\n",
" <td>0.060134</td>\n",
" <td>0.443207</td>\n",
" <td>0.044543</td>\n",
" <td>0.293987</td>\n",
" <td>0.360802</td>\n",
" <td>0.020045</td>\n",
" <td>0.042316</td>\n",
" <td>0.122494</td>\n",
" <td>0.073497</td>\n",
" <td>0.026726</td>\n",
" <td>...</td>\n",
" <td>0.022272</td>\n",
" <td>0.200445</td>\n",
" <td>0.113586</td>\n",
" <td>0.006682</td>\n",
" <td>0.069042</td>\n",
" <td>0.002227</td>\n",
" <td>0.011136</td>\n",
" <td>0.046771</td>\n",
" <td>0.004454</td>\n",
" <td>t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10036 rows × 65 columns</p>\n",
"</div>"
],
"text/plain": [
" negate ppron nonfl i relativ percept quant \\\n",
"0 0.050562 0.252809 0.033708 0.188202 0.160112 0.025281 0.016854 \n",
"1 0.038462 0.378205 0.054487 0.272436 0.285256 0.054487 0.038462 \n",
"2 0.018786 0.552023 0.046243 0.341040 0.339595 0.052023 0.066474 \n",
"3 0.039113 0.413299 0.061278 0.320730 0.251630 0.053455 0.032595 \n",
"4 0.026490 0.394702 0.079470 0.270199 0.241060 0.046358 0.027815 \n",
"... ... ... ... ... ... ... ... \n",
"10031 0.027867 0.454984 0.065380 0.274920 0.310289 0.032154 0.029475 \n",
"10032 0.052083 0.322917 0.050000 0.229167 0.237500 0.018750 0.006250 \n",
"10033 0.128889 0.306667 0.017778 0.213333 0.284444 0.022222 0.013333 \n",
"10034 0.048276 0.439655 0.047414 0.289655 0.238793 0.046552 0.056034 \n",
"10035 0.060134 0.443207 0.044543 0.293987 0.360802 0.020045 0.042316 \n",
"\n",
" affect shehe achieve ... you cogmech auxverb \\\n",
"0 0.070225 0.030899 0.008427 ... 0.002809 0.115169 0.115169 \n",
"1 0.108974 0.057692 0.006410 ... 0.012821 0.227564 0.108974 \n",
"2 0.114162 0.135838 0.026012 ... 0.027457 0.225434 0.167630 \n",
"3 0.067797 0.053455 0.002608 ... 0.010430 0.170795 0.088657 \n",
"4 0.092715 0.070199 0.023841 ... 0.027815 0.143046 0.092715 \n",
"... ... ... ... ... ... ... ... \n",
"10031 0.088424 0.138800 0.012326 ... 0.013398 0.190782 0.150054 \n",
"10032 0.093750 0.041667 0.010417 ... 0.008333 0.129167 0.079167 \n",
"10033 0.075556 0.022222 0.017778 ... 0.026667 0.155556 0.053333 \n",
"10034 0.106034 0.076724 0.015517 ... 0.020690 0.199138 0.129310 \n",
"10035 0.122494 0.073497 0.026726 ... 0.022272 0.200445 0.113586 \n",
"\n",
" they incl money feel we hear y \n",
"0 0.002809 0.025281 0.005618 0.016854 0.028090 0.002809 f \n",
"1 0.000000 0.083333 0.003205 0.019231 0.035256 0.022436 f \n",
"2 0.020231 0.049133 0.010116 0.018786 0.027457 0.020231 f \n",
"3 0.001304 0.045632 0.005215 0.013038 0.027379 0.007823 f \n",
"4 0.000000 0.018543 0.006623 0.010596 0.026490 0.025166 f \n",
"... ... ... ... ... ... ... .. \n",
"10031 0.003751 0.043944 0.008039 0.007503 0.024116 0.013934 t \n",
"10032 0.002083 0.012500 0.002083 0.006250 0.041667 0.004167 t \n",
"10033 0.000000 0.031111 0.008889 0.008889 0.044444 0.000000 t \n",
"10034 0.000862 0.052586 0.008621 0.012069 0.051724 0.022414 t \n",
"10035 0.006682 0.069042 0.002227 0.011136 0.046771 0.004454 t \n",
"\n",
"[10036 rows x 65 columns]"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_balanced"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"t 2509\n",
"n 2509\n",
"f 2509\n",
"s 2509\n",
"Name: y, dtype: int64"
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"balanced_y.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"t 2509\n",
"n 2509\n",
"f 2509\n",
"s 2509\n",
"Name: y, dtype: int64"
]
},
"execution_count": 182,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_balanced.y.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65\n"
]
}
],
"source": [
"print(len(data.columns))\n",
"y = data_balanced.iloc[:,[64]]\n",
"X = data_balanced.iloc[:,0:63]"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"s 2033\n",
"t 2017\n",
"f 1994\n",
"n 1984\n",
"Name: y, dtype: int64"
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train.y.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n 525\n",
"f 515\n",
"t 492\n",
"s 476\n",
"Name: y, dtype: int64"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.y.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=0, num_parallel_tree=1,\n",
" objective='multi:softprob', random_state=0, reg_alpha=0,\n",
" reg_lambda=1, scale_pos_weight=None, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = xgb.XGBClassifier()\n",
"model.fit(X_train.values, y_train.iloc[:,0])"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>negate</th>\n",
" <th>ppron</th>\n",
" <th>nonfl</th>\n",
" <th>i</th>\n",
" <th>relativ</th>\n",
" <th>percept</th>\n",
" <th>quant</th>\n",
" <th>affect</th>\n",
" <th>shehe</th>\n",
" <th>achieve</th>\n",
" <th>...</th>\n",
" <th>funct</th>\n",
" <th>sad</th>\n",
" <th>you</th>\n",
" <th>cogmech</th>\n",
" <th>auxverb</th>\n",
" <th>they</th>\n",
" <th>incl</th>\n",
" <th>money</th>\n",
" <th>feel</th>\n",
" <th>we</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7343</th>\n",
" <td>0.020151</td>\n",
" <td>0.458438</td>\n",
" <td>0.076826</td>\n",
" <td>0.332494</td>\n",
" <td>0.328715</td>\n",
" <td>0.030227</td>\n",
" <td>0.036524</td>\n",
" <td>0.095718</td>\n",
" <td>0.044081</td>\n",
" <td>0.028967</td>\n",
" <td>...</td>\n",
" <td>1.358942</td>\n",
" <td>0.006297</td>\n",
" <td>0.021411</td>\n",
" <td>0.152393</td>\n",
" <td>0.119647</td>\n",
" <td>0.010076</td>\n",
" <td>0.035264</td>\n",
" <td>0.013854</td>\n",
" <td>0.003778</td>\n",
" <td>0.050378</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2127</th>\n",
" <td>0.034139</td>\n",
" <td>0.470128</td>\n",
" <td>0.034139</td>\n",
" <td>0.301565</td>\n",
" <td>0.281650</td>\n",
" <td>0.046230</td>\n",
" <td>0.044808</td>\n",
" <td>0.088193</td>\n",
" <td>0.076814</td>\n",
" <td>0.014936</td>\n",
" <td>...</td>\n",
" <td>1.494310</td>\n",
" <td>0.002845</td>\n",
" <td>0.013514</td>\n",
" <td>0.256046</td>\n",
" <td>0.148649</td>\n",
" <td>0.008535</td>\n",
" <td>0.047653</td>\n",
" <td>0.009957</td>\n",
" <td>0.014936</td>\n",
" <td>0.069701</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2225</th>\n",
" <td>0.030955</td>\n",
" <td>0.484260</td>\n",
" <td>0.073977</td>\n",
" <td>0.351522</td>\n",
" <td>0.317419</td>\n",
" <td>0.049318</td>\n",
" <td>0.037775</td>\n",
" <td>0.107030</td>\n",
" <td>0.076600</td>\n",
" <td>0.031480</td>\n",
" <td>...</td>\n",
" <td>1.606506</td>\n",
" <td>0.003148</td>\n",
" <td>0.021511</td>\n",
" <td>0.178909</td>\n",
" <td>0.136411</td>\n",
" <td>0.006821</td>\n",
" <td>0.038300</td>\n",
" <td>0.013641</td>\n",
" <td>0.018363</td>\n",
" <td>0.027807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5473</th>\n",
" <td>0.067358</td>\n",
" <td>0.393782</td>\n",
" <td>0.058722</td>\n",
" <td>0.260794</td>\n",
" <td>0.303972</td>\n",
" <td>0.034542</td>\n",
" <td>0.058722</td>\n",
" <td>0.074266</td>\n",
" <td>0.072539</td>\n",
" <td>0.010363</td>\n",
" <td>...</td>\n",
" <td>1.298791</td>\n",
" <td>0.003454</td>\n",
" <td>0.005181</td>\n",
" <td>0.127807</td>\n",
" <td>0.100173</td>\n",
" <td>0.001727</td>\n",
" <td>0.032815</td>\n",
" <td>0.015544</td>\n",
" <td>0.013817</td>\n",
" <td>0.053541</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1515</th>\n",
" <td>0.037975</td>\n",
" <td>0.379747</td>\n",
" <td>0.042194</td>\n",
" <td>0.299578</td>\n",
" <td>0.248945</td>\n",
" <td>0.059072</td>\n",
" <td>0.046414</td>\n",
" <td>0.033755</td>\n",
" <td>0.063291</td>\n",
" <td>0.016878</td>\n",
" <td>...</td>\n",
" <td>1.059072</td>\n",
" <td>0.000000</td>\n",
" <td>0.004219</td>\n",
" <td>0.168776</td>\n",
" <td>0.067511</td>\n",
" <td>0.008439</td>\n",
" <td>0.025316</td>\n",
" <td>0.021097</td>\n",
" <td>0.021097</td>\n",
" <td>0.004219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>550</th>\n",
" <td>0.053942</td>\n",
" <td>0.448133</td>\n",
" <td>0.049793</td>\n",
" <td>0.319502</td>\n",
" <td>0.242739</td>\n",
" <td>0.020747</td>\n",
" <td>0.024896</td>\n",
" <td>0.080913</td>\n",
" <td>0.068465</td>\n",
" <td>0.006224</td>\n",
" <td>...</td>\n",
" <td>1.307054</td>\n",
" <td>0.014523</td>\n",
" <td>0.020747</td>\n",
" <td>0.155602</td>\n",
" <td>0.105809</td>\n",
" <td>0.008299</td>\n",
" <td>0.037344</td>\n",
" <td>0.020747</td>\n",
" <td>0.006224</td>\n",
" <td>0.031120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3477</th>\n",
" <td>0.025516</td>\n",
" <td>0.520656</td>\n",
" <td>0.102066</td>\n",
" <td>0.375456</td>\n",
" <td>0.284933</td>\n",
" <td>0.060753</td>\n",
" <td>0.034629</td>\n",
" <td>0.116039</td>\n",
" <td>0.078372</td>\n",
" <td>0.027339</td>\n",
" <td>...</td>\n",
" <td>1.597813</td>\n",
" <td>0.007290</td>\n",
" <td>0.029162</td>\n",
" <td>0.157959</td>\n",
" <td>0.156744</td>\n",
" <td>0.002430</td>\n",
" <td>0.033414</td>\n",
" <td>0.019441</td>\n",
" <td>0.026124</td>\n",
" <td>0.035237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3042</th>\n",
" <td>0.010163</td>\n",
" <td>0.443089</td>\n",
" <td>0.052846</td>\n",
" <td>0.239837</td>\n",
" <td>0.241870</td>\n",
" <td>0.034553</td>\n",
" <td>0.038618</td>\n",
" <td>0.083333</td>\n",
" <td>0.095528</td>\n",
" <td>0.010163</td>\n",
" <td>...</td>\n",
" <td>1.355691</td>\n",
" <td>0.008130</td>\n",
" <td>0.026423</td>\n",
" <td>0.172764</td>\n",
" <td>0.095528</td>\n",
" <td>0.012195</td>\n",
" <td>0.069106</td>\n",
" <td>0.000000</td>\n",
" <td>0.006098</td>\n",
" <td>0.069106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9734</th>\n",
" <td>0.045028</td>\n",
" <td>0.360225</td>\n",
" <td>0.061914</td>\n",
" <td>0.250469</td>\n",
" <td>0.275797</td>\n",
" <td>0.055347</td>\n",
" <td>0.034709</td>\n",
" <td>0.110694</td>\n",
" <td>0.059099</td>\n",
" <td>0.014071</td>\n",
" <td>...</td>\n",
" <td>1.252345</td>\n",
" <td>0.016886</td>\n",
" <td>0.009381</td>\n",
" <td>0.214822</td>\n",
" <td>0.119137</td>\n",
" <td>0.000938</td>\n",
" <td>0.045966</td>\n",
" <td>0.024390</td>\n",
" <td>0.037523</td>\n",
" <td>0.040338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9261</th>\n",
" <td>0.034934</td>\n",
" <td>0.425036</td>\n",
" <td>0.055313</td>\n",
" <td>0.216885</td>\n",
" <td>0.229985</td>\n",
" <td>0.039301</td>\n",
" <td>0.024745</td>\n",
" <td>0.090247</td>\n",
" <td>0.085881</td>\n",
" <td>0.011645</td>\n",
" <td>...</td>\n",
" <td>1.199418</td>\n",
" <td>0.001456</td>\n",
" <td>0.080058</td>\n",
" <td>0.177584</td>\n",
" <td>0.109170</td>\n",
" <td>0.001456</td>\n",
" <td>0.046579</td>\n",
" <td>0.008734</td>\n",
" <td>0.013100</td>\n",
" <td>0.040757</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2008 rows × 63 columns</p>\n",
"</div>"
],
"text/plain": [
" negate ppron nonfl i relativ percept quant \\\n",
"7343 0.020151 0.458438 0.076826 0.332494 0.328715 0.030227 0.036524 \n",
"2127 0.034139 0.470128 0.034139 0.301565 0.281650 0.046230 0.044808 \n",
"2225 0.030955 0.484260 0.073977 0.351522 0.317419 0.049318 0.037775 \n",
"5473 0.067358 0.393782 0.058722 0.260794 0.303972 0.034542 0.058722 \n",
"1515 0.037975 0.379747 0.042194 0.299578 0.248945 0.059072 0.046414 \n",
"... ... ... ... ... ... ... ... \n",
"550 0.053942 0.448133 0.049793 0.319502 0.242739 0.020747 0.024896 \n",
"3477 0.025516 0.520656 0.102066 0.375456 0.284933 0.060753 0.034629 \n",
"3042 0.010163 0.443089 0.052846 0.239837 0.241870 0.034553 0.038618 \n",
"9734 0.045028 0.360225 0.061914 0.250469 0.275797 0.055347 0.034709 \n",
"9261 0.034934 0.425036 0.055313 0.216885 0.229985 0.039301 0.024745 \n",
"\n",
" affect shehe achieve ... funct sad you \\\n",
"7343 0.095718 0.044081 0.028967 ... 1.358942 0.006297 0.021411 \n",
"2127 0.088193 0.076814 0.014936 ... 1.494310 0.002845 0.013514 \n",
"2225 0.107030 0.076600 0.031480 ... 1.606506 0.003148 0.021511 \n",
"5473 0.074266 0.072539 0.010363 ... 1.298791 0.003454 0.005181 \n",
"1515 0.033755 0.063291 0.016878 ... 1.059072 0.000000 0.004219 \n",
"... ... ... ... ... ... ... ... \n",
"550 0.080913 0.068465 0.006224 ... 1.307054 0.014523 0.020747 \n",
"3477 0.116039 0.078372 0.027339 ... 1.597813 0.007290 0.029162 \n",
"3042 0.083333 0.095528 0.010163 ... 1.355691 0.008130 0.026423 \n",
"9734 0.110694 0.059099 0.014071 ... 1.252345 0.016886 0.009381 \n",
"9261 0.090247 0.085881 0.011645 ... 1.199418 0.001456 0.080058 \n",
"\n",
" cogmech auxverb they incl money feel we \n",
"7343 0.152393 0.119647 0.010076 0.035264 0.013854 0.003778 0.050378 \n",
"2127 0.256046 0.148649 0.008535 0.047653 0.009957 0.014936 0.069701 \n",
"2225 0.178909 0.136411 0.006821 0.038300 0.013641 0.018363 0.027807 \n",
"5473 0.127807 0.100173 0.001727 0.032815 0.015544 0.013817 0.053541 \n",
"1515 0.168776 0.067511 0.008439 0.025316 0.021097 0.021097 0.004219 \n",
"... ... ... ... ... ... ... ... \n",
"550 0.155602 0.105809 0.008299 0.037344 0.020747 0.006224 0.031120 \n",
"3477 0.157959 0.156744 0.002430 0.033414 0.019441 0.026124 0.035237 \n",
"3042 0.172764 0.095528 0.012195 0.069106 0.000000 0.006098 0.069106 \n",
"9734 0.214822 0.119137 0.000938 0.045966 0.024390 0.037523 0.040338 \n",
"9261 0.177584 0.109170 0.001456 0.046579 0.008734 0.013100 0.040757 \n",
"\n",
"[2008 rows x 63 columns]"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"y_pred = model.predict(X_test.values)"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.24651394422310757"
]
},
"execution_count": 197,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy = accuracy_score(y_test, y_pred)\n",
"accuracy"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x864 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots(figsize=(12, 12))\n",
"plot_confusion_matrix(model, X_test.values, y_test, ax=ax)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" f 0.27 0.27 0.27 515\n",
" n 0.26 0.25 0.25 525\n",
" s 0.24 0.26 0.25 476\n",
" t 0.21 0.21 0.21 492\n",
"\n",
" accuracy 0.25 2008\n",
" macro avg 0.25 0.25 0.25 2008\n",
"weighted avg 0.25 0.25 0.25 2008\n",
"\n"
]
}
],
"source": [
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# With reduced dimensionality"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
"reducer = umap.UMAP()\n",
"mapper = umap.UMAP().fit(X) # for plotting\n",
"embedding = reducer.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10036, 2)"
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embedding.shape"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['f', 'f', 'f', ..., 't', 't', 't'], dtype=object)"
]
},
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.values.ravel()"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:>"
]
},
"execution_count": 203,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "