Skip to content

Instantly share code, notes, and snippets.

@louisguitton
Last active January 4, 2023 07:20
Show Gist options
  • Save louisguitton/d2656df1069c9bf3b4d476d9a4bbc629 to your computer and use it in GitHub Desktop.
Save louisguitton/d2656df1069c9bf3b4d476d9a4bbc629 to your computer and use it in GitHub Desktop.
Proof of concept with tagspace and starspace
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Load the labelled articles. The file is read with header=None, so the column names are supplied explicitly.\n",
"df = pd.read_csv(\n",
"    \"s3://of-article-entities/textcat/date_at=2020-02-28/language_code=en/articles_for_comprehend.csv\",\n",
"    names=[\"label\", \"text\"],\n",
"    header=None,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Put the text column first, rename both columns to content/labels, and replace newlines in the text with spaces.\n",
"df = (\n",
"    df[[\"text\", \"label\"]]\n",
"    .rename(columns={\"text\": \"content\", \"label\": \"labels\"})\n",
"    .assign(content=lambda frame: frame[\"content\"].str.replace(\"\\n\", \" \"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 10% of the rows as a test set, stratified on the labels column; the fixed seed makes the split repeatable.\n",
"df_train, df_test = train_test_split(\n",
"    df,\n",
"    test_size=0.1,\n",
"    shuffle=True,\n",
"    stratify=df[\"labels\"],\n",
"    random_state=42,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Write the splits as tab-separated text files, one example per line, for StarSpace to read.\n",
"# header=False keeps the pandas column-name row (content / labels) out of the files: StarSpace\n",
"# reads every line as data, so a header row would be parsed as an extra, label-less example.\n",
"df_train.to_csv(\"data/en_onefootball_train.csv\", index=False, header=False, sep=\"\\t\")\n",
"df_test.to_csv(\"data/en_onefootball_test.csv\", index=False, header=False, sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import starwrap as sw\n",
"from operator import itemgetter"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"arg = sw.args()\n",
"\n",
"# --- Input data ---\n",
"arg.trainFile = 'data/en_onefootball_train.csv'\n",
"arg.label = \"__label__\"\n",
"# NOTE: normalizeText has little effect in practice: quotes and punctuation are still present\n",
"# afterwards (and the remaining quotes break spaCy).\n",
"arg.normalizeText = 1\n",
"\n",
"# --- Dictionary (bucket is deliberately left unset) ---\n",
"arg.ngrams = 3\n",
"arg.minCount = 2\n",
"arg.minCountLabel = 5\n",
"\n",
"# --- Model ---\n",
"arg.trainMode = 0\n",
"arg.dim = 100\n",
"arg.similarity = \"cosine\"\n",
"arg.loss = \"hinge\"\n",
"arg.initRandSd = 0.01\n",
"\n",
"# --- Optimisation ---\n",
"arg.epoch = 10\n",
"arg.lr = 0.01\n",
"arg.adagrad = False\n",
"arg.negSearchLimit = 50\n",
"arg.thread = 20\n",
"\n",
"# Build the StarSpace wrapper from the settings above.\n",
"sp = sw.starSpace(arg)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"sp.init()\n",
"sp.train()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"sp.saveModel('model/onefootball')\n",
"sp.saveModelTsv('model/onefootball.tsv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Excerpt of the StarSpace training output (epoch 9):\n",
"\n",
"```\n",
"Training epoch 9: 0.000999999 0.001\n",
"Epoch: 99.6% lr: -0.000000 loss: 0.005281 eta: <1min tot: 0h0m34s (100.0%)\n",
" ---+++ Epoch 9 Train error : 0.00490574 +++--- ☃\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/bin/sh: line 1: starspace: command not found\n"
]
}
],
"source": [
"# Evaluate the saved model on the held-out file with the StarSpace command-line tool.\n",
"# The paths are relative to the notebook's working directory and are the same ones the earlier\n",
"# cells write to (model/onefootball and data/en_onefootball_test.csv), so this cell no longer\n",
"# depends on one user's home directory.\n",
"# Each line ends with a backslash, so Python joins the string into a single shell command.\n",
"cmd = \"\"\"\n",
"./starspace test \\\n",
" -model model/onefootball \\\n",
" -testFile data/en_onefootball_test.csv \\\n",
" -ngrams 3 \\\n",
" -dim 100 \\\n",
" -label \"__label__\" \\\n",
" -thread 10 \\\n",
" -similarity \"cosine\" \\\n",
" -trainMode 0 \\\n",
" -verbose true \\\n",
" -minCount 2 \\\n",
" -minCountLabel 5 \\\n",
" -normalizeText 1\n",
"\"\"\"\n",
"!{cmd}"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>__label__player_focus</td>\n",
" <td>bayern munich negotiating with kingsley coman ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>__label__fixture_piece</td>\n",
" <td>city football group ' close ' to completing ta...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>__label__player_focus</td>\n",
" <td>_ emoji _ ballon d'onefootball : our world 's ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>__label__one_sided</td>\n",
" <td>' pl will cancel rest of season ' . the premi...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>__label__player_focus</td>\n",
" <td>four decades in a row ! zlatan ibrahimović sco...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4093</td>\n",
" <td>__label__match_day_round_up</td>\n",
" <td>messi ' loves being at barcelona ' . quique s...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4094</td>\n",
" <td>__label__player_focus</td>\n",
" <td>richarlison agrees new everton contract . ric...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4095</td>\n",
" <td>__label__player_focus</td>\n",
" <td>handanovic wants inter trophies . samir handa...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4096</td>\n",
" <td>__label__fixture_piece</td>\n",
" <td>liverpool fan sean cox returns home . liverpo...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4097</td>\n",
" <td>__label__one_sided</td>\n",
" <td>nuno ' delighted ' after wolves power through ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4098 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" label \\\n",
"0 __label__player_focus \n",
"1 __label__fixture_piece \n",
"2 __label__player_focus \n",
"3 __label__one_sided \n",
"4 __label__player_focus \n",
"... ... \n",
"4093 __label__match_day_round_up \n",
"4094 __label__player_focus \n",
"4095 __label__player_focus \n",
"4096 __label__fixture_piece \n",
"4097 __label__one_sided \n",
"\n",
" text \n",
"0 bayern munich negotiating with kingsley coman ... \n",
"1 city football group ' close ' to completing ta... \n",
"2 _ emoji _ ballon d'onefootball : our world 's ... \n",
"3 ' pl will cancel rest of season ' . the premi... \n",
"4 four decades in a row ! zlatan ibrahimović sco... \n",
"... ... \n",
"4093 messi ' loves being at barcelona ' . quique s... \n",
"4094 richarlison agrees new everton contract . ric... \n",
"4095 handanovic wants inter trophies . samir handa... \n",
"4096 liverpool fan sean cox returns home . liverpo... \n",
"4097 nuno ' delighted ' after wolves power through ... \n",
"\n",
"[4098 rows x 2 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(\n",
" \"s3://of-article-entities/textcat/date_at=2020-02-28/language_code=en/fasttext/train_channel/data.txt\",\n",
" sep=\"\\t\",\n",
" header=None,\n",
" names=[\"label\", \"text\"],\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>content</th>\n",
" <th>labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>4111</td>\n",
" <td>Andy Robertson regrets attacking Messi in Cham...</td>\n",
" <td>__label__fixture_piece</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1480</td>\n",
" <td>_EMOJI_ Remembering when Edgar Davids nearly k...</td>\n",
" <td>__label__fixture_piece</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3177</td>\n",
" <td>Report: Aston Villa 1-2 Man City. Manchester ...</td>\n",
" <td>__label__fixture_piece</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1488</td>\n",
" <td>Chelsea's best ever jersey countdown ... numbe...</td>\n",
" <td>__label__one_sided</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2525</td>\n",
" <td>Muller: I won't discuss future yet. Thomas Mu...</td>\n",
" <td>__label__player_focus</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3275</td>\n",
" <td>Leicester eye experienced left-back to replace...</td>\n",
" <td>__label__match_day_round_up</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1432</td>\n",
" <td>Holgate signs new deal with Everton. Mason Ho...</td>\n",
" <td>__label__player_focus</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1799</td>\n",
" <td>Mbappe included in PSG squad. Kylian Mbappe i...</td>\n",
" <td>__label__fixture_piece</td>\n",
" </tr>\n",
" <tr>\n",
" <td>917</td>\n",
" <td>The big Friday Premier League quiz. All 20 Pr...</td>\n",
" <td>__label__match_day_round_up</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4195</td>\n",
" <td>_EMOJI_ Slick City, we love the EFL Cup and th...</td>\n",
" <td>__label__others</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>456 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" content \\\n",
"4111 Andy Robertson regrets attacking Messi in Cham... \n",
"1480 _EMOJI_ Remembering when Edgar Davids nearly k... \n",
"3177 Report: Aston Villa 1-2 Man City. Manchester ... \n",
"1488 Chelsea's best ever jersey countdown ... numbe... \n",
"2525 Muller: I won't discuss future yet. Thomas Mu... \n",
"... ... \n",
"3275 Leicester eye experienced left-back to replace... \n",
"1432 Holgate signs new deal with Everton. Mason Ho... \n",
"1799 Mbappe included in PSG squad. Kylian Mbappe i... \n",
"917 The big Friday Premier League quiz. All 20 Pr... \n",
"4195 _EMOJI_ Slick City, we love the EFL Cup and th... \n",
"\n",
" labels \n",
"4111 __label__fixture_piece \n",
"1480 __label__fixture_piece \n",
"3177 __label__fixture_piece \n",
"1488 __label__one_sided \n",
"2525 __label__player_focus \n",
"... ... \n",
"3275 __label__match_day_round_up \n",
"1432 __label__player_focus \n",
"1799 __label__fixture_piece \n",
"917 __label__match_day_round_up \n",
"4195 __label__others \n",
"\n",
"[456 rows x 2 columns]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"sp.initFromSavedModel('model/onefootball')\n",
"sp.initFromTsv('model/onefootball.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# For every test article, request a single tag (second argument 1) and keep the first key of the returned mapping.\n",
"y_pred = [\n",
"    list(sp.predictTags(article, 1).keys())[0]\n",
"    for article in df_test.content\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import (\n",
" accuracy_score,\n",
" classification_report,\n",
" confusion_matrix,\n",
" f1_score,\n",
" precision_score,\n",
" recall_score,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"y_test = df_test.labels"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.41885070925922685"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f1_score(y_test, y_pred, average=\"weighted\")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[16, 50, 13, 16, 5],\n",
" [ 9, 77, 4, 10, 0],\n",
" [ 4, 31, 51, 9, 5],\n",
" [ 2, 14, 0, 40, 0],\n",
" [11, 43, 15, 10, 21]])"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.44956140350877194"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5031794986326356"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"precision_score(y_test, y_pred, average=\"weighted\")"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" __label__fixture_piece 0.38 0.16 0.23 100\n",
"__label__match_day_round_up 0.36 0.77 0.49 100\n",
" __label__one_sided 0.61 0.51 0.56 100\n",
" __label__others 0.47 0.71 0.57 56\n",
" __label__player_focus 0.68 0.21 0.32 100\n",
"\n",
" accuracy 0.45 456\n",
" macro avg 0.50 0.47 0.43 456\n",
" weighted avg 0.50 0.45 0.42 456\n",
"\n"
]
}
],
"source": [
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment