Skip to content

Instantly share code, notes, and snippets.

@louisguitton
Last active January 4, 2023 07:20
Show Gist options
  • Save louisguitton/d2656df1069c9bf3b4d476d9a4bbc629 to your computer and use it in GitHub Desktop.
Save louisguitton/d2656df1069c9bf3b4d476d9a4bbc629 to your computer and use it in GitHub Desktop.
Proof of concept with tagspace and starspace
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create training set"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import dask.dataframe as dd\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!aws s3 sync s3://of-article-entities/ready/ data/ --exclude \"*\" --include \"*/*/*/language=en/articles.json\""
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"_path = 'data/*/*/*/language={lang}/articles.json'"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"ENGLISH_ARTICLES = dd.read_json(_path.format(lang=\"en\")).compute()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(23753, 9)"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ENGLISH_ARTICLES.shape"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def _starspace_labels(entities):\n",
"    \"\"\"Build the space-separated StarSpace label string for one article.\n",
"\n",
"    ``entities`` is an iterable of entities, each itself an iterable of strings.\n",
"    Spaces inside every string become underscores, the strings of one entity are\n",
"    joined with underscores, and the result is prefixed with ``__label__``.\n",
"    \"\"\"\n",
"    labels = []\n",
"    for entity in entities:\n",
"        parts = [part.replace(\" \", \"_\") for part in entity]\n",
"        labels.append(\"__label__\" + \"_\".join(parts))\n",
"    return \" \".join(labels)\n",
"\n",
"\n",
"ENGLISH_ARTICLES[\"labels\"] = ENGLISH_ARTICLES.entity_list.apply(_starspace_labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# tokenize properly with spacy to remove punctuation\n",
"# import spacy\n",
"\n",
"# texts = [\n",
"# \"Net income was $9.4 million compared to the prior year of $2.7 million.\",\n",
"# \"Revenue exceeded twelve billion dollars, with a loss of $1b.\",\n",
"# ]\n",
"\n",
"# nlp = spacy.load(\"en_core_web_sm\")\n",
"# for doc in nlp.pipe(texts, disable=[\"tagger\", \"parser\"]):\n",
"# # Do something with the doc here\n",
"# print([(ent.text, ent.label_) for ent in doc.ents])"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"# Make every article safe for a one-example-per-line, tab-separated training file:\n",
"#   * drop double quotes (they trigger CSV quoting on export),\n",
"#   * collapse tabs, newlines and repeated spaces into single spaces,\n",
"#   * trim leading/trailing whitespace.\n",
"# The vectorised .str accessor leaves missing contents as NaN instead of raising,\n",
"# which the previous per-row lambda did on any non-string value.\n",
"ENGLISH_ARTICLES[\"content\"] = (\n",
"    ENGLISH_ARTICLES.content\n",
"    .str.replace('\"', '', regex=False)\n",
"    .str.replace(r'\\s+', ' ', regex=True)\n",
"    .str.strip()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"train_test_split = 0.9\n",
"split_index = int(len(ENGLISH_ARTICLES) * train_test_split)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"train_set = ENGLISH_ARTICLES.iloc[:split_index]\n",
"validation_set = ENGLISH_ARTICLES.iloc[split_index:]"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"# header=False: the file is consumed line by line as training examples, so the\n",
"# default \"content<TAB>labels\" column-name row must not be written.\n",
"train_set[[\"content\", \"labels\"]].to_csv(\"data/onefootball.train\", sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"# header=False: the file is consumed line by line as test examples, so the\n",
"# default \"content<TAB>labels\" column-name row must not be written.\n",
"validation_set[[\"content\", \"labels\"]].to_csv(\"data/onefootball.test\", sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model training"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import starwrap as sw\n",
"from operator import itemgetter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"arg = sw.args()\n",
"\n",
"# Options are applied to the args object in the order listed.\n",
"for option, value in [\n",
"    # training data\n",
"    (\"trainFile\", 'data/onefootball.train'),\n",
"    # dictionary options (bucket is left at its default)\n",
"    (\"minCount\", 2),\n",
"    (\"minCountLabel\", 5),\n",
"    (\"ngrams\", 3),\n",
"    (\"label\", \"__label__\"),\n",
"    # training options\n",
"    (\"trainMode\", 0),\n",
"    (\"lr\", 0.01),\n",
"    (\"dim\", 100),\n",
"    (\"epoch\", 10),\n",
"    (\"negSearchLimit\", 50),\n",
"    (\"loss\", \"hinge\"),\n",
"    (\"similarity\", \"cosine\"),\n",
"    (\"adagrad\", False),\n",
"    (\"initRandSd\", 0.01),\n",
"    # normalizeText was observed to do little here: quotes (which break spaCy)\n",
"    # and punctuation are still present in the text afterwards.\n",
"    (\"normalizeText\", 1),\n",
"    (\"thread\", 20),\n",
"]:\n",
"    setattr(arg, option, value)\n",
"\n",
"sp = sw.starSpace(arg)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"sp.init()\n",
"sp.train()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"sp.nearestNeighbor('__label__team_198_west_ham', 10)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"sp.nearestNeighbor('__label__competition_76_CONMEBOL_Libertadores', 10)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"sp.saveModel('model/onefootball')\n",
"sp.saveModelTsv('model/onefootball.tsv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Accuracy analysis"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"sp.initFromSavedModel('model/onefootball')\n",
"sp.initFromTsv('model/onefootball.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('__label__player_31540_andré_gomes', 0.3621276617050171),\n",
" ('__label__team_18_liverpool', 0.316768079996109),\n",
" ('__label__player_28543_raul_jimenez', 0.28684937953948975),\n",
" ('__label__team_584_thailand', 0.2846275269985199),\n",
" ('__label__player_218631_none', 0.2717776298522949),\n",
" ('__label__player_3111_jordan_henderson', 0.268502414226532),\n",
" ('__label__player_18433_mateo_kovacic', 0.26080992817878723),\n",
" ('__label__player_1342_daniele_de_rossi', 0.25639188289642334),\n",
" ('__label__team_8936_los_angeles_fc', 0.251379132270813),\n",
" ('__label__player_236331_jean-clair_todibo', 0.24982589483261108)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TEST_ARTICLE = \"\"\"Liverpool are keeping a close eye on Napoli midfielder Fabian Ruiz, who put in a solid performance in the middle of the park in their 2-0 victory against the Reds on Tuesday.\n",
"\n",
"Also wanted by the likes of Real Madrid and Barcelona, the 23-year old could cost over £40 million next summer, according to reports from the Daily Mail.\n",
"\n",
"So should Liverpool retain their interest in this midfielder?\n",
"\n",
"Ruiz And His Growth\n",
"Having spent four years with the Betis senior side in La Liga, Ruiz only moved to Italy in 2018 under Carlo Ancelotti, and took no time to cement his place in the first team.\n",
"\n",
"He started 31 games in all competitions for the Italian outfit last season, registering six goals and four assists and has started every game for the club this season.\n",
"\n",
"A fleet-footed midfielder with the ability to control the tempo of the game, Ruiz is a strong presence in the midfield.\n",
"\n",
"Playing in a 4-3-3 or a 4-2-3-1 system, the 23-year old is adept at breaking up play and creating chances for his side but what makes him that much more attractive for Liverpool is his strength and ability to hold up play.\n",
"\n",
"Unlike many silky Spaniards who wriggle their way through defenders, Ruiz is a more complete midfielder, happy to put in a shift for his side when needed.\n",
"\n",
"He idolised former Barcelona star Xavi and though he is not as mesmerising, Ruiz is certainly a player that can win games for his side.\n",
"\n",
"The former Betis man has two international caps and has enough experience to be a real asset for Jürgen Klopp at Liverpool.\n",
"\n",
"A Good Option For Klopp\n",
"The Reds have a number of midfield options in their squad, some of who are even struggling for playing time. The likes of Fabinho, Jordan Henderson and Georginio Wijnaldum are expected to be the first choice for Klopp while Alex Oxlade-Chamberlain, Naby Keita and James Milner are all waiting in the ranks for their chances.\n",
"\n",
"But Ruiz adds a different dimension to this midfield. The Napoli midfielder adds a physical presence, can replace Henderson in the long run and at just 23, is only going to get better playing in a squad like Liverpool under a manager like Klopp.\n",
"\n",
"He can also be an option to replace James Milner, who is in the final year of his contract at Merseyside.\n",
"\n",
"Ruiz may not be the best midfielder up for grabs in the transfer market, but he is certainly a player who will improve the Liverpool squad and is an investment worth the risk for the club.\"\"\"\n",
"\n",
"dict_obj = sp.predictTags(\n",
" TEST_ARTICLE,\n",
" 10\n",
")\n",
"dict_obj = sorted(dict_obj.items(), key=itemgetter(1), reverse=True)\n",
"dict_obj"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boca Juniors set up a first ever Superclasico Copa Libertadores final after overcoming Palmeiras on Wednesday. A 2-2 draw in Sao Paulo was enough for Boca to move into the decider courtesy of a 4-2 aggregate win, and fierce rivals River Plate await in the final. Dario Benedetto, Boca's hero in the semi-final first leg, scored once again off the bench, ending Palmeiras' challenge after Luan and Gustavo Gomez had dragged them back into the tie. Ramon Abila's opener had put Boca in complete control, before Palmeiras' fightback. But the six-time champions moved into the final for a record 11th time and rivals River will be aiming to deny them a seventh title. Palmeiras made the better start in the second leg and thought they had a 10th-minute goal through Bruno Henrique, but the Video Assistant Referee (VAR) was used to correctly rule out the effort for offside. Instead, Boca went ahead in the 18th minute, Abila tapping in after superb work by Sebastian Villa. But Luiz Felipe Scolari's Palmeiras worked their way back into the tie to begin the second half, Luan drilling in a low finish after a set-piece. The hosts then struck again through a Gomez penalty, awarded after Carlos Izquierdoz clumsily brought down Dudu. However, Benedetto came on to haunt Palmeiras once more, drilling in a brilliant 25-yard finish to end the tie.\n"
]
},
{
"data": {
"text/plain": [
"['__label__competition_76_CONMEBOL_Libertadores',\n",
" '__label__player_41959_Gustavo_Gomez',\n",
" '__label__player_44649_None',\n",
" '__label__team_1693_Palmeiras']"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TEST_ARTICLE = \"Boca Juniors set up a first ever Superclasico Copa Libertadores final after overcoming Palmeiras on Wednesday. A 2-2 draw in Sao Paulo was enough for Boca to move into the decider courtesy of a 4-2 aggregate win, and fierce rivals River Plate await in the final. Dario Benedetto, Boca's hero in the semi-final first leg, scored once again off the bench, ending Palmeiras' challenge after Luan and Gustavo Gomez had dragged them back into the tie. Ramon Abila's opener had put Boca in complete control, before Palmeiras' fightback. But the six-time champions moved into the final for a record 11th time and rivals River will be aiming to deny them a seventh title. Palmeiras made the better start in the second leg and thought they had a 10th-minute goal through Bruno Henrique, but the Video Assistant Referee (VAR) was used to correctly rule out the effort for offside. Instead, Boca went ahead in the 18th minute, Abila tapping in after superb work by Sebastian Villa. But Luiz Felipe Scolari's Palmeiras worked their way back into the tie to begin the second half, Luan drilling in a low finish after a set-piece. The hosts then struck again through a Gomez penalty, awarded after Carlos Izquierdoz clumsily brought down Dudu. However, Benedetto came on to haunt Palmeiras once more, drilling in a brilliant 25-yard finish to end the tie.\"\n",
"print(TEST_ARTICLE)\n",
"ground_truth = [\"__label__competition_76_CONMEBOL_Libertadores\", \"__label__player_41959_Gustavo_Gomez\", \"__label__player_44649_None\", \"__label__team_1693_Palmeiras\"]\n",
"ground_truth"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('__label__competition_76_conmebol_libertadores', 0.44700634479522705),\n",
" ('__label__team_1693_palmeiras', 0.4286065697669983),\n",
" ('__label__team_1677_sao_paulo', 0.4140372574329376),\n",
" ('__label__team_857_real_salt_lake', 0.4024886190891266),\n",
" ('__label__team_7535_atlanta_united_fc', 0.39739909768104553),\n",
" ('__label__team_1670_gremio', 0.39005374908447266),\n",
" ('__label__team_1696_boca_juniors', 0.37965211272239685),\n",
" ('__label__team_1862_athletico_paranaense', 0.3795163631439209),\n",
" ('__label__competition_183_superliga_argentina', 0.3694731593132019),\n",
" ('__label__team_1829_river_plate', 0.36560237407684326)]"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_tags = sp.predictTags(TEST_ARTICLE, 10)\n",
"# Order the (label, score) pairs from the highest score down.\n",
"dict_obj = sorted(top_tags.items(), key=lambda pair: pair[1], reverse=True)\n",
"dict_obj"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Barcelona midfielder Arturo Vidal is desperate to make the most of his chances to push for a regular spot after helping his team edge Cultural Leonesa. Vidal made just his third start for Barca as the LaLiga champions recorded a 1-0 win in the first leg of the Copa del Rey tie on Wednesday. The Chile international, who joined Barca from Bayern Munich in August, said he wanted to capitalise on his opportunities when given the chance. I am taking advantage of the minutes that come my way to fight for a starting place, Vidal said, via the club's website. Barca needed a 91st-minute goal from Clement Lenglet to claim victory over their third-tier opponents. Ernesto Valverde praised the performance of his team, who had three teenagers in the starting XI. The team had the right winning attitude, but it was hard to get our game flowing and we lost a lot of balls because they were coming in so strong, he said. Football is always a permanent test, but I did see a lot of good things. My players looked good. They shouldered a lot of responsibility. The youngsters are used to playing in Division 2B, but it was much harder because Cultural upped the intensity.\n"
]
},
{
"data": {
"text/plain": [
"['__label__competition_18_Copa_del_Rey',\n",
" '__label__player_1203_Arturo_Vidal',\n",
" '__label__team_5_FC_Barcelona',\n",
" '__label__team_3511_Cultural_Leonesa']"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TEST_ARTICLE = \"Barcelona midfielder Arturo Vidal is desperate to make the most of his chances to push for a regular spot after helping his team edge Cultural Leonesa. Vidal made just his third start for Barca as the LaLiga champions recorded a 1-0 win in the first leg of the Copa del Rey tie on Wednesday. The Chile international, who joined Barca from Bayern Munich in August, said he wanted to capitalise on his opportunities when given the chance. \"\"I am taking advantage of the minutes that come my way to fight for a starting place,\"\" Vidal said, via the club's website. Barca needed a 91st-minute goal from Clement Lenglet to claim victory over their third-tier opponents. Ernesto Valverde praised the performance of his team, who had three teenagers in the starting XI. \"\"The team had the right winning attitude, but it was hard to get our game flowing and we lost a lot of balls because they were coming in so strong,\"\" he said. \"\"Football is always a permanent test, but I did see a lot of good things. \"\"My players looked good. They shouldered a lot of responsibility. The youngsters are used to playing in Division 2B, but it was much harder because Cultural upped the intensity.\"\"\"\t\n",
"ground_truth = [\"__label__competition_18_Copa_del_Rey\", \"__label__player_1203_Arturo_Vidal\", \"__label__team_5_FC_Barcelona\", \"__label__team_3511_Cultural_Leonesa\"]\n",
"print(TEST_ARTICLE)\n",
"ground_truth"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('__label__player_1203_arturo_vidal', 0.48603254556655884),\n",
" ('__label__player_167741_ousmane_dembele', 0.369517058134079),\n",
" ('__label__player_236331_jean-clair_todibo', 0.3270140290260315),\n",
" ('__label__player_1823_ivan_rakitic', 0.3263510465621948),\n",
" ('__label__team_5_fc_barcelona', 0.3189865052700043),\n",
" ('__label__team_216_getafe', 0.31509459018707275),\n",
" ('__label__player_121372_arthur', 0.3119848668575287),\n",
" ('__label__player_4833_serge_aurier', 0.30515265464782715),\n",
" ('__label__player_51793_nabil_fekir', 0.29782170057296753),\n",
" ('__label__player_20158_jasper_cillessen', 0.2864113450050354)]"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_tags = sp.predictTags(TEST_ARTICLE, 10)\n",
"# Order the (label, score) pairs from the highest score down.\n",
"dict_obj = sorted(top_tags.items(), key=lambda pair: pair[1], reverse=True)\n",
"dict_obj"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Gianfranco Zola was impressed by what he saw from Chelsea loanees Mason Mount and Fikayo Tomori during his side's 3-2 win over Derby County. The Premier League giants moved into the EFL Cup quarter-finals after the tight win at Stamford Bridge on Wednesday. Tomori and Richard Keogh scored own goals for Derby, who fought back on both occasions through Jack Marriott and Martyn Waghorn, before Cesc Fabregas' 41st-minute strike proved to be the winner. Mount and Tomori are both on loan from Chelsea and Zola enjoyed the duo's performances. He [Tomori] has been unlucky with the own goal, but the performance he and Keogh put on afterwards was excellent, the Chelsea assistant told a news conference. It happens. If he hadn't touched that ball, [Alvaro] Morata would have scored a goal. He played well. Mason Mount played excellent. It looks like they're in good hands. They're doing very well for the team. The coaching they're getting is excellent. Zola was unhappy with the way Chelsea finished the encounter, while he praised Frank Lampard's Derby. More than a little bit [shaky], to be honest. Especially towards the end, he said. A lot is down to them. They really played well and surprised us, played with a lot of quality. A lot was down to us. The passing wasn't as sharp as it normally is, and the last 10-15 minutes we lost control. The distances were too far apart, and when you give space to the players Derby have got, it becomes difficult. It was positive in some ways, but others didn't work properly.\n"
]
},
{
"data": {
"text/plain": [
"['__label__competition_41_EFL_Cup',\n",
" '__label__team_9_Chelsea',\n",
" '__label__team_568_Derby_County']"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TEST_ARTICLE = \"Gianfranco Zola was impressed by what he saw from Chelsea loanees Mason Mount and Fikayo Tomori during his side's 3-2 win over Derby County. The Premier League giants moved into the EFL Cup quarter-finals after the tight win at Stamford Bridge on Wednesday. Tomori and Richard Keogh scored own goals for Derby, who fought back on both occasions through Jack Marriott and Martyn Waghorn, before Cesc Fabregas' 41st-minute strike proved to be the winner. Mount and Tomori are both on loan from Chelsea and Zola enjoyed the duo's performances. \"\"He [Tomori] has been unlucky with the own goal, but the performance he and Keogh put on afterwards was excellent,\"\" the Chelsea assistant told a news conference. \"\"It happens. If he hadn't touched that ball, [Alvaro] Morata would have scored a goal. He played well. \"\"Mason Mount played excellent. It looks like they're in good hands. They're doing very well for the team. The coaching they're getting is excellent.\"\" Zola was unhappy with the way Chelsea finished the encounter, while he praised Frank Lampard's Derby. \"\"More than a little bit [shaky], to be honest. Especially towards the end,\"\" he said. \"\"A lot is down to them. They really played well and surprised us, played with a lot of quality. \"\"A lot was down to us. The passing wasn't as sharp as it normally is, and the last 10-15 minutes we lost control. The distances were too far apart, and when you give space to the players Derby have got, it becomes difficult. \"\"It was positive in some ways, but others didn't work properly.\"\"\"\n",
"ground_truth = [\"__label__competition_41_EFL_Cup\", \"__label__team_9_Chelsea\", \"__label__team_568_Derby_County\"]\n",
"print(TEST_ARTICLE)\n",
"ground_truth"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('__label__team_568_derby_county', 0.49994295835494995),\n",
" ('__label__team_9_chelsea', 0.444227933883667),\n",
" ('__label__player_244_frank_james_lampard_jr.', 0.44013872742652893),\n",
" ('__label__team_581_reading', 0.4061957001686096),\n",
" ('__label__player_44582_emerson', 0.3675287663936615),\n",
" ('__label__player_2944_gary_cahill', 0.3649985194206238),\n",
" ('__label__player_35_cesc_fábregas', 0.3624136447906494),\n",
" ('__label__competition_41_efl_cup', 0.35637256503105164),\n",
" ('__label__player_165454_christian_pulisic', 0.34450763463974),\n",
" ('__label__team_169_paderborn', 0.3281993269920349)]"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_tags = sp.predictTags(TEST_ARTICLE, 10)\n",
"# Order the (label, score) pairs from the highest score down.\n",
"dict_obj = sorted(top_tags.items(), key=lambda pair: pair[1], reverse=True)\n",
"dict_obj"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the StarSpace environment:\n",
"```sh\n",
"./starspace test \\\n",
" -model /Users/louis.guitton/workspace/nlp/tagspace/model/onefootball \\\n",
" -testFile /Users/louis.guitton/workspace/nlp/tagspace/data/onefootball.test \\\n",
" -ngrams 3 \\\n",
" -dim 100 \\\n",
" -label \"__label__\" \\\n",
" -thread 10 \\\n",
" -similarity \"cosine\" \\\n",
" -trainMode 0 \\\n",
" -verbose true \\\n",
" -minCount 2 \\\n",
" -minCountLabel 5 \\\n",
" -normalizeText 1\n",
"```\n",
"```\n",
"Predictions use 929 known labels.\n",
"Evaluation Metrics :\n",
"hit@1: 0.151026 hit@10: 0.541056 hit@20: 0.644673 hit@50: 0.757576 mean ranks : 94.2087 Total examples : 2046\n",
"```\n",
"For comparison, the metrics on AG News:\n",
"```\n",
"Predictions use 4 known labels.\n",
"Evaluation Metrics :\n",
"hit@1: 0.456184 hit@10: 1 hit@20: 1 hit@50: 1 mean ranks : 1.70526 Total examples : 7600\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting embeddings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The n-gram embeddings are not saved in the .tsv model; that's why you can't see `West Ham` as the nearest neighbour of `__label__team_198_west_ham`.\n",
"Instead, use\n",
"```sh\n",
"./print_ngrams /Users/louis.guitton/workspace/nlp/tagspace/model/onefootball\n",
"```\n",
"```\n",
"West Ham\t-0.0244914\t0.00680907\t0.157752\t-0.0735132\t0.143847\t0.056452\t0.00722723\t-0.0353172\t-0.132659\t-0.0656663\t-0.112149\t-0.134879\t0.0470783\t-0.00737444\t0.0650546\t-0.0718336\t0.0160372\t-0.0271225\t0.0798726\t-0.0102446\t-0.0902533\t-0.0664017\t-0.0650377\t0.06048\t0.000442296\t-0.09967\t0.0335253\t-0.00222331\t0.154808\t-0.113908\t0.052111\t-0.0760448\t-0.0700552\t-0.096797\t0.138114\t0.0166862\t-0.122502\t0.0319883\t0.0259743\t0.0519145\t-0.0705131\t-0.0235085\t-0.00520651\t0.0568472\t-0.0697021\t0.000675455\t-0.146852\t-0.0845096\t0.0178718\t0.0640614\t0.146261\t-0.194477\t-0.0523474\t0.137312\t0.124314\t-0.0406497\t-0.090948\t-0.0736607\t-0.111932\t-0.0375369\t-0.120031\t0.00686499\t-0.0730754\t-0.144816\t-0.173841\t0.137984\t0.183118\t-0.0604024\t-0.012861\t0.113193\t-0.0398777\t0.135168\t-0.112527\t0.0188564\t0.0822087\t-0.218082\t-0.0508953\t0.0607064\t0.023897\t-0.076177\t0.028344\t-0.08976\t-0.13332\t0.087658\t0.094266\t0.115975\t0.0862276\t0.126824\t0.0160287\t-0.119148\t0.0502826\t0.0887933\t0.133288\t0.0113388\t-0.0045052\t0.23863\t0.131842\t0.0359911\t-0.0231085\t0.109756\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-1.58090472e-01, -1.90266907e-01, 2.77571119e-02,\n",
" -2.61785239e-02, 8.93533677e-02, -2.05935746e-01,\n",
" -1.75044522e-01, 2.00909898e-02, 6.47536367e-02,\n",
" 4.29923600e-03, 8.37889686e-02, 7.56595656e-02,\n",
" 1.30969426e-02, 2.01743454e-01, 2.85478439e-02,\n",
" 1.54873123e-02, -7.28713945e-02, 1.14682987e-01,\n",
" 1.01946019e-01, 1.77947462e-01, -1.56516861e-02,\n",
" 4.87061962e-02, 4.14418988e-02, -1.05563611e-01,\n",
" 2.19263693e-05, 1.38643518e-01, -5.18400073e-02,\n",
" -3.00905257e-02, -1.30034268e-01, -5.03365658e-02,\n",
" -1.42969385e-01, -8.97409022e-03, 9.83169377e-02,\n",
" 1.53958917e-01, -8.38928670e-02, 3.06476317e-02,\n",
" 2.24182149e-03, -1.00615226e-01, -8.42559040e-02,\n",
" -1.25957832e-01, 1.05084844e-01, 4.47565466e-02,\n",
" -5.95831200e-02, 6.12190515e-02, 2.42323969e-02,\n",
" 1.21473270e-02, -1.15984283e-01, 1.93821371e-01,\n",
" 6.35505319e-02, 4.54409374e-03, -2.06738770e-01,\n",
" -5.37876189e-02, -1.71813235e-01, -4.64049093e-02,\n",
" -2.34730188e-02, 1.25759184e-01, -6.36858493e-02,\n",
" -7.31229261e-02, -1.15670480e-01, -5.30677438e-02,\n",
" 1.07386880e-01, -6.95242658e-02, -8.97739157e-02,\n",
" -4.65730615e-02, 1.82087481e-01, 1.00765660e-01,\n",
" -7.22700963e-03, -1.89503461e-01, -6.72605783e-02,\n",
" -2.92470634e-01, -7.16677830e-02, 1.35851875e-01,\n",
" 8.08467790e-02, 8.78363028e-02, 6.93718567e-02,\n",
" 7.46491104e-02, -2.12936401e-02, 4.59866077e-02,\n",
" -8.51631239e-02, -3.61795388e-02, -2.83122770e-02,\n",
" 4.48369347e-02, 1.42192552e-02, 2.55957581e-02,\n",
" -6.12656102e-02, -2.49524876e-01, 3.31865884e-02,\n",
" -9.19917002e-02, -5.81201026e-03, -2.47431397e-02,\n",
" -5.87100536e-02, 1.83010660e-02, 1.06517691e-02,\n",
" -8.11685920e-02, -1.22607863e-02, -1.03914328e-02,\n",
" -5.51938489e-02, 1.78675920e-01, -6.31804243e-02,\n",
" 5.91063388e-02]], dtype=float32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TEST_SENTENCE = \"Gianfranco Zola was impressed by what he saw from Chelsea loanees Mason Mount and Fikayo Tomori during his side's 3-2 win over Derby County. The Premier League giants moved into the EFL Cup quarter-finals after the tight win at Stamford Bridge on Wednesday.\"\n",
"np.array(sp.getDocVector(TEST_SENTENCE, ' '))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using the vectors in spacy"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"# One row per token: column 0 is the token (used as the index), the remaining\n",
"# columns are its vector components.\n",
"# na_filter=False keeps tokens such as \"null\", \"nan\" or \"NA\" as literal strings;\n",
"# with the default NA detection they would become NaN index entries and be\n",
"# exported later as rows without a word.\n",
"embeddings = pd.read_csv('model/onefootball.tsv', sep='\\t', header=None, index_col=0, na_filter=False)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"embeddings.to_csv('model/onefootball_for_spacy.csv', sep=' ', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# Write the \"<rows> <dims>\" header line followed by the vectors in a single pass.\n",
"# Rebuilding the file from `embeddings` (instead of prepending to whatever is on\n",
"# disk) keeps this cell idempotent: re-running it no longer stacks a second\n",
"# header line on top of the first.\n",
"# newline='' is what pandas asks for when it is handed an open file object, and\n",
"# the explicit encoding matches the UTF-8 that to_csv writes by default.\n",
"with open('model/onefootball_for_spacy.csv', 'w', newline='', encoding='utf-8') as f:\n",
"    f.write(' '.join(map(str, embeddings.shape)) + '\\n')\n",
"    embeddings.to_csv(f, sep=' ', header=False)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[38;5;2m✔ Successfully created model\u001b[0m\n",
"62572it [00:01, 32302.48it/s]onefootball_for_spacy.csv\n",
"\u001b[2K\u001b[38;5;2m✔ Loaded vectors from model/onefootball_for_spacy.csv\u001b[0m\n",
"\u001b[38;5;2m✔ Sucessfully compiled vocab\u001b[0m\n",
"62962 entries, 62572 vectors\n"
]
}
],
"source": [
"!python -m spacy init-model en spacy/onefootball_vectors_lg --vectors-loc model/onefootball_for_spacy.csv"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py:193: UserWarning: [W019] Changing vectors name from en_model.vectors to en_model.vectors_62572, to avoid clash with previously loaded vectors. See Issue #3853.\n",
" \"__main__\", mod_spec)\n"
]
},
{
"data": {
"text/plain": [
"-0.019760523523685992"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp_onefootball = spacy.load(\"spacy/onefootball_vectors_lg\")\n",
"\n",
"# The vectors were exported from the StarSpace model, whose labels come back\n",
"# lower-cased, while spaCy looks a token's vector up by its exact text.\n",
"# NOTE(review): capitalised tokens (\"Eden\", \"Liverpool\", ...) are therefore\n",
"# presumably out-of-vocabulary, which would leave only \"scored for\" to drive the\n",
"# comparison - confirm against the exported vocabulary.\n",
"# Lower-case the text before parsing so that names can match their vectors.\n",
"doc1 = nlp_onefootball(\"Eden Hazard scored for Real Madrid\".lower())\n",
"doc2 = nlp_onefootball(\"Eden Hazard used to play in the premier League\".lower())\n",
"doc3 = nlp_onefootball(\"Firmino scored for Liverpool\".lower())\n",
"\n",
"doc1.similarity(doc2)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0000000218686238"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc1.similarity(doc3)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.019760515161368584"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc3.similarity(doc2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment