Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script used to produce the results in a blog post at www.mattiasostmar.se about an experiment classifying Jungian cognitive functions, with one classifier for the perceiving functions (sensing vs. intuition) and one classifier for the judging functions (thinking vs. feeling). The raw data (a pickled Pandas DataFrame) is available at https://osf.io/gyrc7/
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# All notebook imports in one place (duplicate `import numpy as np` removed)\n",
"import pandas as pd\n",
"import os\n",
"import requests\n",
"import numpy as np\n",
"import operator\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import confusion_matrix\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/mos/Dropbox/memeticscience/typealyzer-dataset/notebooks/jungian_classification\r\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df_pickle_path = \"../../pickles/dataframe_survey_2018-01-23_enriched.pickle\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actual</th>\n",
" <th>actual_temp</th>\n",
" <th>is_s</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>INFJ</td>\n",
" <td>nf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>INFP</td>\n",
" <td>nf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>INTP</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>ENFJ</td>\n",
" <td>nf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>INFP</td>\n",
" <td>nf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" actual actual_temp is_s\n",
"1 INFJ nf 0\n",
"2 INFP nf 0\n",
"3 INTP nt 0\n",
"5 ENFJ nf 0\n",
"10 INFP nf 0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata = pd.read_pickle(df_pickle_path)\n",
"indata[[\"actual\",\"actual_temp\",\"is_s\"]].head(5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tumblr 21938\n",
"blogspot 513\n",
"wordpress 468\n",
"Name: domain, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata.domain.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"en 22588\n",
"fr 50\n",
"da 34\n",
"de 25\n",
"no 23\n",
"Name: lang, dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata.lang.value_counts().head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Filter out English texts only"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22919"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(indata)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"indata = indata[indata.lang == \"en\"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22588"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(indata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add columns perceiving and judging for evaluation of s/n and t/f classifiers"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Use raw strings for the regexes: in plain strings \\w is an invalid Python\n",
"# escape (DeprecationWarning, an error in future versions). Behavior is\n",
"# unchanged - first letter of actual_temp is the perceiving function (s/n),\n",
"# second is the judging function (t/f).\n",
"indata[\"perc_func\"] = indata.actual_temp.str.extract(r\"(\\w)\\w\", expand=False)\n",
"indata[\"judg_func\"] = indata.actual_temp.str.extract(r\"\\w(\\w)\", expand=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(indata[pd.isnull(indata[\"tokens\"])])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>typealyzer</th>\n",
" <th>actual</th>\n",
" <th>e</th>\n",
" <th>s</th>\n",
" <th>t</th>\n",
" <th>sntf_s</th>\n",
" <th>sntf_n</th>\n",
" <th>sntf_t</th>\n",
" <th>sntf_f</th>\n",
" <th>...</th>\n",
" <th>cogmech</th>\n",
" <th>auxverb</th>\n",
" <th>they</th>\n",
" <th>incl</th>\n",
" <th>money</th>\n",
" <th>feel</th>\n",
" <th>we</th>\n",
" <th>hear</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [url, typealyzer, actual, e, s, t, sntf_s, sntf_n, sntf_t, sntf_f, date, text, tokens, domain, lang, datetime, func, att, funcatt, actual_temp, is_e, is_i, is_f, is_n, is_s, is_t, is_fe, is_fi, is_ne, is_ni, is_se, is_si, is_te, is_ti, is_enfj, is_enfp, is_entj, is_entp, is_esfj, is_esfp, is_estj, is_estp, is_unknown, is_infj, is_infp, is_intj, is_intp, is_isfj, is_isfp, is_istj, is_istp, negate, ppron, nonfl, i, relativ, percept, quant, affect, shehe, achieve, bio, leisure, conj, motion, posemo, adverb, home, future, negemo, number, inhib, humans, pronoun, excl, space, tentat, see, past, anx, family, present, health, verb, certain, anger, preps, swear, ingest, discrep, friend, relig, time, cause, article, body, social, assent, work, sexual, ...]\n",
"Index: []\n",
"\n",
"[0 rows x 117 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata[pd.isnull(indata[\"tokens\"])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inspect original data function distributions"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'f': 488.2908894968084,\n",
" 'n': 511.4496560721063,\n",
" 's': 457.63189127105665,\n",
" 't': 511.27211970074814}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s_series = indata[indata.perc_func == \"s\"][\"tokens\"]\n",
"n_series = indata[indata.perc_func == \"n\"][\"tokens\"]\n",
"t_series = indata[indata.judg_func == \"t\"][\"tokens\"]\n",
"f_series = indata[indata.judg_func == \"f\"][\"tokens\"]\n",
"\n",
"avg_tkns = {\n",
" \"s\":s_series.mean(),\n",
" \"n\":n_series.mean(),\n",
" \"t\":t_series.mean(),\n",
" \"f\":f_series.mean()\n",
"}\n",
"avg_tkns"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n 16864\n",
"s 5224\n",
"Name: perc_func, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata.perc_func.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"f 12063\n",
"t 10025\n",
"Name: judg_func, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indata.judg_func.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sample equal size text chunks for training and evaluation data\n",
"See: [Pandas sample()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perceiving function"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# We have 5224 cases in the smallest class s \n",
"perc_samples = pd.concat([\n",
" indata[indata.perc_func == \"s\"].sample(3000, random_state=123456)[[\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\"]],\n",
" indata[indata.perc_func == \"n\"].sample(3000, random_state=123456)[[\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\"]]\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6000"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(perc_samples)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"s 3000\n",
"n 3000\n",
"Name: perc_func, dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perc_samples.perc_func.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"f 3363\n",
"t 2637\n",
"Name: judg_func, dtype: int64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perc_samples.judg_func.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"perc_samples.to_pickle(\"jung_percieving_functions_samples_blogs_totn6000.pickle\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"perc_samples = pd.read_pickle(\"jung_percieving_functions_samples_blogs_totn6000.pickle\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Judging function"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# We have 10025 cases in the smallest class t\n",
"judg_samples = pd.concat([\n",
" indata[indata.judg_func == \"t\"].sample(3000, random_state=123456)[[\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\"]],\n",
" indata[indata.judg_func == \"f\"].sample(3000, random_state=123456)[[\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\"]]\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6000"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(judg_samples)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"t 3000\n",
"f 3000\n",
"Name: judg_func, dtype: int64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"judg_samples.judg_func.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n 4598\n",
"s 1402\n",
"Name: perc_func, dtype: int64"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"judg_samples.perc_func.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"judg_samples.to_pickle(\"jung_judging_functions_samples_blogs_totn6000.pickle\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"judg_samples = pd.read_pickle(\"jung_judging_functions_samples_blogs_totn6000.pickle\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup uClassify classifier and prepare training and evaluation datasets\n",
"The variable `os.environ[\"UCLASSIFY_WRITE\"]` is created by adding a line to e.g. `~/.profile`:\n",
"\n",
"`export UCLASSIFY_WRITE=\"<your_uclassify_write_key>\"`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split perceiving samples into train and eval subsets."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6000"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(perc_samples)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>perc_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8623</th>\n",
" <td>Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...</td>\n",
" <td>386</td>\n",
" <td>s</td>\n",
" <td>f</td>\n",
" <td>sf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11987</th>\n",
" <td>Log in | Tumblr Sign up Terms Privacy Posted b...</td>\n",
" <td>52</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5340</th>\n",
" <td>a thing of blood © hi im logan and i love the ...</td>\n",
" <td>440</td>\n",
" <td>s</td>\n",
" <td>f</td>\n",
" <td>sf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"8623 Sonny Jooooooooon INDEX ASK PAST THEME Sonny J... 386 s \n",
"11987 Log in | Tumblr Sign up Terms Privacy Posted b... 52 s \n",
"5340 a thing of blood © hi im logan and i love the ... 440 s \n",
"\n",
" judg_func actual_temp perc_training_set \n",
"8623 f sf 0 \n",
"11987 t st 0 \n",
"5340 f sf 0 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add an integer flag column (0 = evaluation, 1 = training) to track which\n",
"# rows get used for training; built in one step instead of zeros + astype.\n",
"perc_samples[\"perc_training_set\"] = np.zeros(len(perc_samples), dtype=\"int\")\n",
"perc_samples.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>perc_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8623</th>\n",
" <td>Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...</td>\n",
" <td>386</td>\n",
" <td>s</td>\n",
" <td>f</td>\n",
" <td>sf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11987</th>\n",
" <td>Log in | Tumblr Sign up Terms Privacy Posted b...</td>\n",
" <td>52</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5340</th>\n",
" <td>a thing of blood © hi im logan and i love the ...</td>\n",
" <td>440</td>\n",
" <td>s</td>\n",
" <td>f</td>\n",
" <td>sf</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"8623 Sonny Jooooooooon INDEX ASK PAST THEME Sonny J... 386 s \n",
"11987 Log in | Tumblr Sign up Terms Privacy Posted b... 52 s \n",
"5340 a thing of blood © hi im logan and i love the ... 440 s \n",
"\n",
" judg_func actual_temp perc_training_set \n",
"8623 f sf 0 \n",
"11987 t st 1 \n",
"5340 f sf 1 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use 70% of each class (2100 of 3000) for training. Seed the sampling so the\n",
"# train/eval split is reproducible, matching the random_state=123456 used in\n",
"# the sampling cells above.\n",
"sn_training_set_size = 2100 # e.g. 2100 is 70% of 3000 samples\n",
"perc_s_train = perc_samples[perc_samples.perc_func == \"s\"].sample(sn_training_set_size, random_state=123456).index\n",
"perc_n_train = perc_samples[perc_samples.perc_func == \"n\"].sample(sn_training_set_size, random_state=123456).index\n",
"\n",
"perc_train = perc_s_train.union(perc_n_train)\n",
"\n",
"# Mark the training rows; everything left at 0 becomes the evaluation set.\n",
"perc_samples.loc[perc_train, \"perc_training_set\"] = 1\n",
"perc_samples.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4200"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(perc_samples[perc_samples.perc_training_set == 1])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>perc_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8623</th>\n",
" <td>Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...</td>\n",
" <td>386</td>\n",
" <td>s</td>\n",
" <td>f</td>\n",
" <td>sf</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18909</th>\n",
" <td>Wit Beyond Measure Wit Beyond Measure Aug 14, ...</td>\n",
" <td>340</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7557</th>\n",
" <td>IT'S ALL COMIN' DOWN ON US, BOYS why is my das...</td>\n",
" <td>336</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"8623 Sonny Jooooooooon INDEX ASK PAST THEME Sonny J... 386 s \n",
"18909 Wit Beyond Measure Wit Beyond Measure Aug 14, ... 340 s \n",
"7557 IT'S ALL COMIN' DOWN ON US, BOYS why is my das... 336 s \n",
"\n",
" judg_func actual_temp perc_training_set \n",
"8623 f sf 0 \n",
"18909 t st 0 \n",
"7557 t st 0 "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Separate evaluation DataFrame\n",
"perc_eval_set = perc_samples[perc_samples.perc_training_set == 0]\n",
"perc_eval_set.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1800"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(perc_eval_set)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train SN classifier\n",
"\n",
"https://uclassify.com/manage/classifiers/jung-percieving-2100"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def train_jung_cognitive_functions_en_classes(func, classifier):\n",
"    \"\"\"Train one uClassify class with every training-set text of that class.\n",
"\n",
"    Presupposes that the classifier is created and that\n",
"    setup_jung_functions_en_classes() is already run.\n",
"    func: expects one of [\"s\",\"n\",\"t\",\"f\"] - the class to train\n",
"    classifier: expects one of [\"sn\", \"tf\"] - which classifier to train\n",
"    Returns a list with the DataFrame index of every text sent for training.\n",
"\n",
"    Bug fix: the original body used the global variable `name` (leaked from\n",
"    the calling loop) instead of the `func` parameter; it now uses `func`.\n",
"    \"\"\"\n",
"    # Pick the training rows and API endpoint for the requested classifier.\n",
"    # NOTE: 'jung-percieving-2100' is the actual remote classifier name, so\n",
"    # the misspelling must be kept.\n",
"    if classifier == \"sn\":\n",
"        rows = perc_samples.loc[(perc_samples.perc_func == func) & (perc_samples.perc_training_set == 1)]\n",
"        url = 'https://api.uclassify.com/v1/me/jung-percieving-2100/' + func + \"/train\"\n",
"    elif classifier == \"tf\":\n",
"        rows = judg_samples.loc[(judg_samples.judg_func == func) & (judg_samples.judg_training_set == 1)]\n",
"        url = 'https://api.uclassify.com/v1/me/jung-judging-2100/' + func + \"/train\"\n",
"    else:\n",
"        raise ValueError(\"classifier must be one of ['sn', 'tf']\")\n",
"\n",
"    header = {\"Content-Type\": \"application/json\",\n",
"              \"Authorization\": \"Token \" + os.environ[\"UCLASSIFY_WRITE\"]}\n",
"    trained_ix = []\n",
"    text_count = 1\n",
"    for ix, row in rows.iterrows():\n",
"        trained_ix.append(ix)\n",
"        data = {\"texts\":[row[\"text\"]]}\n",
"        # One POST per text; uClassify trains incrementally per call.\n",
"        response = requests.post(url, json = data, headers = header)\n",
"        if text_count % 100 == 0:\n",
"            print(\"{}:{}\".format(func, text_count))\n",
"        text_count += 1\n",
"\n",
"    print(\"Finished training Jung Cognitive Functions: {}\".format(func))\n",
"    return trained_ix"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"s:100\n",
"s:200\n",
"s:300\n",
"s:400\n",
"s:500\n",
"s:600\n",
"s:700\n",
"s:800\n",
"s:900\n",
"s:1000\n",
"s:1100\n",
"s:1200\n",
"s:1300\n",
"s:1400\n",
"s:1500\n",
"s:1600\n",
"s:1700\n",
"s:1800\n",
"s:1900\n",
"s:2000\n",
"s:2100\n",
"Finished training Jung Cognitive Functions: s\n",
"n:100\n",
"n:200\n",
"n:300\n",
"n:400\n",
"n:500\n",
"n:600\n",
"n:700\n",
"n:800\n",
"n:900\n",
"n:1000\n",
"n:1100\n",
"n:1200\n",
"n:1300\n",
"n:1400\n",
"n:1500\n",
"n:1600\n",
"n:1700\n",
"n:1800\n",
"n:1900\n",
"n:2000\n",
"n:2100\n",
"Finished training Jung Cognitive Functions: n\n"
]
}
],
"source": [
"perc_trained_ix = []\n",
"for name in [\"s\",\"n\"]:\n",
"    functions_trained_ix = train_jung_cognitive_functions_en_classes(name, classifier=\"sn\")\n",
"    # Bug fix: collect the indices returned for this class. The original\n",
"    # appended the outer list to itself (hence 'length perc_trained_ix: 2'\n",
"    # below), losing the trained indices entirely.\n",
"    perc_trained_ix.extend(functions_trained_ix)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"length perc_eval_set: 1800\n",
"length perc_trained_ix: 2\n"
]
}
],
"source": [
"print(\"length perc_eval_set: {}\".format(len(perc_eval_set)))\n",
"print(\"length perc_trained_ix: {}\".format(len(perc_trained_ix)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split judging samples into train and eval subsets."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>judg_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22981</th>\n",
" <td>it is what it is About Name: Heidi Age:16 Wher...</td>\n",
" <td>565</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24378</th>\n",
" <td>https://www.tumblr.com/themes/by/leentheme htt...</td>\n",
" <td>582</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5187</th>\n",
" <td>three things cannot be long hidden © three thi...</td>\n",
" <td>516</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"22981 it is what it is About Name: Heidi Age:16 Wher... 565 s \n",
"24378 https://www.tumblr.com/themes/by/leentheme htt... 582 n \n",
"5187 three things cannot be long hidden © three thi... 516 n \n",
"\n",
" judg_func actual_temp judg_training_set \n",
"22981 t st 0 \n",
"24378 t nt 0 \n",
"5187 t nt 0 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add an integer flag column (0 = evaluation, 1 = training) to track which\n",
"# rows get used for training; built in one step instead of zeros + astype.\n",
"judg_samples[\"judg_training_set\"] = np.zeros(len(judg_samples), dtype=\"int\")\n",
"judg_samples.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>judg_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22981</th>\n",
" <td>it is what it is About Name: Heidi Age:16 Wher...</td>\n",
" <td>565</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24378</th>\n",
" <td>https://www.tumblr.com/themes/by/leentheme htt...</td>\n",
" <td>582</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5187</th>\n",
" <td>three things cannot be long hidden © three thi...</td>\n",
" <td>516</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4307</th>\n",
" <td>none gf with left feel 12442 ★ August 1st, 201...</td>\n",
" <td>499</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2606</th>\n",
" <td>God, Faith, &amp; Fitness God, Faith, &amp; Fitness Me...</td>\n",
" <td>924</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18214</th>\n",
" <td>big hype, big letdown ♡ i'm charlotte and i li...</td>\n",
" <td>101</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10718</th>\n",
" <td>Love the life you live Live the life you love ...</td>\n",
" <td>353</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20277</th>\n",
" <td>❤❤❤ - - - - - ♚ - - | momo | 14 | ♎ | ESTJ | |...</td>\n",
" <td>473</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>846</th>\n",
" <td>Cynically Marvelous | It's Axiomatic. Cynicall...</td>\n",
" <td>6973</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16405</th>\n",
" <td>I'll just pretend that youth will never end I'...</td>\n",
" <td>693</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11753</th>\n",
" <td>Something-or-other Something-or-other Let's ju...</td>\n",
" <td>483</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18004</th>\n",
" <td>/ ._&gt; | | | '_&gt;/ ._&gt;| . \\/ ._&gt;| || |/ ._&gt; | _/...</td>\n",
" <td>180</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23577</th>\n",
" <td>WINTERFELL About Mariele. 21. Germany. ISTJ. M...</td>\n",
" <td>102</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4829</th>\n",
" <td>ʕ•ᴥ•ʔ The bears are in 684,715 plays we-r-who-...</td>\n",
" <td>445</td>\n",
" <td>s</td>\n",
" <td>t</td>\n",
" <td>st</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4225</th>\n",
" <td>XANADU XANADU</td>\n",
" <td>2</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"22981 it is what it is About Name: Heidi Age:16 Wher... 565 s \n",
"24378 https://www.tumblr.com/themes/by/leentheme htt... 582 n \n",
"5187 three things cannot be long hidden © three thi... 516 n \n",
"4307 none gf with left feel 12442 ★ August 1st, 201... 499 s \n",
"2606 God, Faith, & Fitness God, Faith, & Fitness Me... 924 n \n",
"18214 big hype, big letdown ♡ i'm charlotte and i li... 101 s \n",
"10718 Love the life you live Live the life you love ... 353 n \n",
"20277 ❤❤❤ - - - - - ♚ - - | momo | 14 | ♎ | ESTJ | |... 473 s \n",
"846 Cynically Marvelous | It's Axiomatic. Cynicall... 6973 n \n",
"16405 I'll just pretend that youth will never end I'... 693 n \n",
"11753 Something-or-other Something-or-other Let's ju... 483 n \n",
"18004 / ._> | | | '_>/ ._>| . \\/ ._>| || |/ ._> | _/... 180 n \n",
"23577 WINTERFELL About Mariele. 21. Germany. ISTJ. M... 102 s \n",
"4829 ʕ•ᴥ•ʔ The bears are in 684,715 plays we-r-who-... 445 s \n",
"4225 XANADU XANADU 2 n \n",
"\n",
" judg_func actual_temp judg_training_set \n",
"22981 t st 1 \n",
"24378 t nt 1 \n",
"5187 t nt 1 \n",
"4307 t st 1 \n",
"2606 t nt 1 \n",
"18214 t st 1 \n",
"10718 t nt 1 \n",
"20277 t st 1 \n",
"846 t nt 1 \n",
"16405 t nt 0 \n",
"11753 t nt 1 \n",
"18004 t nt 1 \n",
"23577 t st 1 \n",
"4829 t st 1 \n",
"4225 t nt 1 "
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use 70% of each class (2100 of 3000) for training. Seed the sampling so the\n",
"# train/eval split is reproducible, matching the random_state=123456 used in\n",
"# the sampling cells above.\n",
"tf_training_set_size = 2100 # e.g. 2100 is 70% of 3000 samples\n",
"judg_t_train = judg_samples[judg_samples.judg_func == \"t\"].sample(tf_training_set_size, random_state=123456).index\n",
"judg_f_train = judg_samples[judg_samples.judg_func == \"f\"].sample(tf_training_set_size, random_state=123456).index\n",
"\n",
"judg_train = judg_t_train.union(judg_f_train)\n",
"\n",
"# Mark the training rows; everything left at 0 becomes the evaluation set.\n",
"judg_samples.loc[judg_train, \"judg_training_set\"] = 1\n",
"judg_samples.head(15)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>perc_func</th>\n",
" <th>judg_func</th>\n",
" <th>actual_temp</th>\n",
" <th>judg_training_set</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16405</th>\n",
" <td>I'll just pretend that youth will never end I'...</td>\n",
" <td>693</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24806</th>\n",
" <td>The Queen The Queen Raquel Alexis | 17 | FL | ...</td>\n",
" <td>77</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15132</th>\n",
" <td>this could have been worse this could have bee...</td>\n",
" <td>492</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>nt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text tokens perc_func \\\n",
"16405 I'll just pretend that youth will never end I'... 693 n \n",
"24806 The Queen The Queen Raquel Alexis | 17 | FL | ... 77 n \n",
"15132 this could have been worse this could have bee... 492 n \n",
"\n",
" judg_func actual_temp judg_training_set \n",
"16405 t nt 0 \n",
"24806 t nt 0 \n",
"15132 t nt 0 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Separate evaluation DataFrame\n",
"judg_eval_set = judg_samples[judg_samples.judg_training_set == 0]\n",
"judg_eval_set.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train TF classifier"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"t:100\n",
"t:200\n",
"t:300\n",
"t:400\n",
"t:500\n",
"t:600\n",
"t:700\n",
"t:800\n",
"t:900\n",
"t:1000\n",
"t:1100\n",
"t:1200\n",
"t:1300\n",
"t:1400\n",
"t:1500\n",
"t:1600\n",
"t:1700\n",
"t:1800\n",
"t:1900\n",
"t:2000\n",
"t:2100\n",
"t:2200\n",
"t:2300\n",
"t:2400\n",
"t:2500\n",
"t:2600\n",
"t:2700\n",
"t:2800\n",
"t:2900\n",
"t:3000\n",
"t:3100\n",
"t:3200\n",
"t:3300\n",
"t:3400\n",
"t:3500\n",
"t:3600\n",
"t:3700\n",
"t:3800\n",
"t:3900\n",
"t:4000\n",
"t:4100\n",
"t:4200\n",
"t:4300\n",
"t:4400\n",
"t:4500\n",
"t:4600\n",
"t:4700\n",
"t:4800\n",
"t:4900\n",
"t:5000\n",
"t:5100\n",
"t:5200\n",
"t:5300\n",
"t:5400\n",
"t:5500\n",
"t:5600\n",
"t:5700\n",
"t:5800\n",
"t:5900\n",
"t:6000\n",
"t:6100\n",
"t:6200\n",
"t:6300\n",
"t:6400\n",
"t:6500\n",
"t:6600\n",
"t:6700\n",
"t:6800\n",
"t:6900\n",
"t:7000\n",
"Finished training Jung Cognitive Functions.\n",
"f:100\n",
"f:200\n",
"f:300\n",
"f:400\n",
"f:500\n",
"f:600\n",
"f:700\n",
"f:800\n",
"f:900\n",
"f:1000\n",
"f:1100\n",
"f:1200\n",
"f:1300\n",
"f:1400\n",
"f:1500\n",
"f:1600\n",
"f:1700\n",
"f:1800\n",
"f:1900\n",
"f:2000\n",
"f:2100\n",
"f:2200\n",
"f:2300\n",
"f:2400\n",
"f:2500\n",
"f:2600\n",
"f:2700\n",
"f:2800\n",
"f:2900\n",
"f:3000\n",
"f:3100\n",
"f:3200\n",
"f:3300\n",
"f:3400\n",
"f:3500\n",
"f:3600\n",
"f:3700\n",
"f:3800\n",
"f:3900\n",
"f:4000\n",
"f:4100\n",
"f:4200\n",
"f:4300\n",
"f:4400\n",
"f:4500\n",
"f:4600\n",
"f:4700\n",
"f:4800\n",
"f:4900\n",
"f:5000\n",
"f:5100\n",
"f:5200\n",
"f:5300\n",
"f:5400\n",
"f:5500\n",
"f:5600\n",
"f:5700\n",
"f:5800\n",
"f:5900\n",
"f:6000\n",
"f:6100\n",
"f:6200\n",
"f:6300\n",
"f:6400\n",
"f:6500\n",
"f:6600\n",
"f:6700\n",
"f:6800\n",
"f:6900\n",
"f:7000\n",
"Finished training Jung Cognitive Functions.\n"
]
}
],
"source": [
"# Train the thinking/feeling classifier on both classes, collecting the\n",
"# training-sample indices returned for each class.\n",
"# BUG FIX: the original loop rebound `tf_trained_ix` to the function's return\n",
"# value and then appended the list to itself, destroying the accumulator on\n",
"# the first iteration (see the NameError in the recorded output of the cell\n",
"# below, which also shows this cell had not been run in that session).\n",
"tf_trained_ix = []\n",
"for name in [\"t\",\"f\"]:\n",
"    try:\n",
"        trained_ix = train_jung_cognitive_functions_en_classes(name, classifier=\"tf\")\n",
"        tf_trained_ix.append(trained_ix)\n",
"    except Exception as e:\n",
"        print(e)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"length judg_eval_set: 1800\n"
]
},
{
"ename": "NameError",
"evalue": "name 'tf_trained_ix' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-48-9a63d8973ddd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"length judg_eval_set: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjudg_eval_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"length tf_trained_ix: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf_trained_ix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'tf_trained_ix' is not defined"
]
}
],
"source": [
"print(\"length judg_eval_set: {}\".format(len(judg_eval_set)))\n",
"print(\"length tf_trained_ix: {}\".format(len(tf_trained_ix)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classify perceiving function"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"def classify_jung_percieving_function_of_text(text):\n",
" \"\"\"Does what it says, pretty much.\"\"\"\n",
" header = {\"Content-Type\": \"application/json\",\n",
" \"Authorization\": \"Token \" + os.environ[\"UCLASSIFY_READ\"]}\n",
" data = {\"texts\":[text]} # send a one-item list for now, since we don't have a feel for sizes\n",
" result = requests.post(\"https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/classify\",\n",
" json = data,\n",
" headers = header)\n",
" json_result = result.json()\n",
" \n",
" res_dict = {\"s\":0, \"n\":0}\n",
" \n",
" for classItem in json_result[0][\"classification\"]:\n",
" res_dict[classItem[\"className\"]] = classItem[\"p\"]\n",
" \n",
" sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)\n",
" return sorted_dict"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"row: 1800 of 1800\r"
]
}
],
"source": [
"# Classify each held-out text with the sensing/intuition classifier and keep\n",
"# the top-ranked class label ('s' or 'n').\n",
"# (Removed an unused `zeros` array allocation; enumerate replaces the manual\n",
"# row counter.)\n",
"sn_results = []\n",
"for row_cnt, (ix, row) in enumerate(perc_eval_set.iterrows(), start=1):\n",
"    print(\"row: {} of {}\".format(row_cnt, len(perc_eval_set)),end=\"\\r\")\n",
"    res = classify_jung_percieving_function_of_text(row[\"text\"])\n",
"    # res is sorted by probability, descending; res[0][0] is the winning class\n",
"    sn_results.append(res[0][0])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1800"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sn_results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add the perceiving classification results to the evaluation dataset"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" text tokens perc_func \\\n",
"8623 Sonny Jooooooooon INDEX ASK PAST THEME Sonny J... 386 s \n",
"18909 Wit Beyond Measure Wit Beyond Measure Aug 14, ... 340 s \n",
"7557 IT'S ALL COMIN' DOWN ON US, BOYS why is my das... 336 s \n",
"\n",
" judg_func actual_temp perc_training_set sn \n",
"8623 f sf 0 s \n",
"18909 t st 0 n \n",
"7557 t st 0 s \n"
]
}
],
"source": [
"perc_eval_set = pd.concat([perc_eval_set,\n",
" pd.DataFrame(sn_results, index=perc_eval_set.index)\n",
" ], axis=1, ignore_index=True)\n",
"perc_eval_set.columns = [\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\",\"perc_training_set\",\"sn\"]\n",
"perc_eval_set.to_pickle(\"classification_results_percieving_function_blogs_n5000_dataframe.pickle\")\n",
"print(perc_eval_set.head(3))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classify TF"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"def classify_jung_judging_function_of_text(text):\n",
" \"\"\"Does what it says, pretty much.\"\"\"\n",
" header = {\"Content-Type\": \"application/json\",\n",
" \"Authorization\": \"Token \" + os.environ[\"UCLASSIFY_READ\"]}\n",
" data = {\"texts\":[text]} # send a one-item list for now, since we don't have a feel for sizes\n",
" result = requests.post(\"https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/classify\",\n",
" json = data,\n",
" headers = header)\n",
" json_result = result.json()\n",
" \n",
" res_dict = {\"t\":0, \"f\":0}\n",
" \n",
" for classItem in json_result[0][\"classification\"]:\n",
" res_dict[classItem[\"className\"]] = classItem[\"p\"]\n",
" \n",
" sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)\n",
" return sorted_dict"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"row: 1800 of 1800\r"
]
}
],
"source": [
"# Classify each held-out text with the thinking/feeling classifier and keep\n",
"# the top-ranked class label ('t' or 'f').\n",
"# (Removed an unused `zeros` array allocation; enumerate replaces the manual\n",
"# row counter.)\n",
"tf_results = []\n",
"for row_cnt, (ix, row) in enumerate(judg_eval_set.iterrows(), start=1):\n",
"    print(\"row: {} of {}\".format(row_cnt, len(judg_eval_set)),end=\"\\r\")\n",
"    res = classify_jung_judging_function_of_text(row[\"text\"])\n",
"    # res is sorted by probability, descending; res[0][0] is the winning class\n",
"    tf_results.append(res[0][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add judging classification results to evaluation set"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" text tokens perc_func \\\n",
"16405 I'll just pretend that youth will never end I'... 693 n \n",
"24806 The Queen The Queen Raquel Alexis | 17 | FL | ... 77 n \n",
"15132 this could have been worse this could have bee... 492 n \n",
"\n",
" judg_func actual_temp judg_training_set tf \n",
"16405 t nt 0 t \n",
"24806 t nt 0 t \n",
"15132 t nt 0 t \n"
]
}
],
"source": [
"judg_eval_set = pd.concat([judg_eval_set,\n",
" pd.DataFrame(tf_results, index=judg_eval_set.index)\n",
" ], axis=1, ignore_index=True)\n",
"judg_eval_set.columns = [\"text\",\"tokens\",\"perc_func\",\"judg_func\",\"actual_temp\",\"judg_training_set\",\"tf\"]\n",
"judg_eval_set.to_pickle(\"classification_results_judging_function_blogs_n10000_dataframe.pickle\")\n",
"print(judg_eval_set.head(3))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluation of perceiving classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classification report perceiving"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" n 0.87 0.86 0.87 900\n",
" s 0.86 0.88 0.87 900\n",
"\n",
"avg / total 0.87 0.87 0.87 1800\n",
"\n"
]
}
],
"source": [
"sn_cr = classification_report(perc_eval_set['perc_func'], perc_eval_set['sn'])\n",
"print(sn_cr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perceiving accuracy"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8688888888888889\n"
]
}
],
"source": [
"sn_accuracy = sum(perc_eval_set['perc_func']==perc_eval_set['sn'])/len(perc_eval_set)\n",
"print(sn_accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perceiving Kappa"