Skip to content

Instantly share code, notes, and snippets.

@digitalTranshumant
Last active January 25, 2023 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save digitalTranshumant/304be239b848d5858fd374ac3e59bbcb to your computer and use it in GitHub Desktop.
Save digitalTranshumant/304be239b848d5858fd374ac3e59bbcb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "f047e3cb",
"metadata": {},
"source": [
"# 0 Clone repo\n",
"Clone this Repo: https://gitlab.wikimedia.org/repos/research/section-image-recs\n",
"\n",
"# 1 First run this (on bash)\n",
"\n",
"PYSPARK_DRIVER_PYTHON=python \\\n",
"PYSPARK_PYTHON=./environment/bin/python \\\n",
"spark2-submit \\\n",
" --master yarn \\\n",
" --executor-cores 4 \\\n",
" --executor-memory 8G \\\n",
" --conf spark.dynamicAllocation.maxExecutors=128 \\\n",
" --conf spark.sql.shuffle.partitions=1024 \\\n",
" --archives imagerec_env.tar.gz#environment \\\n",
" imagerec/article_images.py \\\n",
" --wikitext-snapshot 2022-12 \\\n",
" --item-page-link-snapshot 2023-01-02 \\\n",
" --output /user/dsaez/sec_english_without_images \\\n",
" --wp-codes-file wikipedias_codes.json\n",
" \n",
"# 2 Second this \n",
"\n",
"PYSPARK_DRIVER_PYTHON=python \\\n",
"PYSPARK_PYTHON=./environment/bin/python \\\n",
"spark2-submit \\\n",
" --master yarn \\\n",
" --executor-cores 4 \\\n",
" --executor-memory 8G \\\n",
" --conf spark.dynamicAllocation.maxExecutors=128 \\\n",
" --conf spark.sql.shuffle.partitions=2048 \\\n",
" --archives imagerec_env.tar.gz#environment \\\n",
" imagerec/recommendation.py \\\n",
" --section-images /user/dsaez/sec_english_without_images \\\n",
" --section-alignments /user/mnz/secmap_results/aligned_sections_subset/aligned_sections_subset_9.0_2022-02.parquet \\\n",
" --output /user/dsaez/sec_english_without_images_recs \\\n",
" --wp-codes en --max-target-images 0 \n",
" \n",
"# Post Process "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a595e675",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using wmfdata v1.3.3, but v2.0.0 is available.\n",
"\n",
"To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release --ignore-installed`.\n",
"\n",
"To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md\n",
"PySpark executors will use /opt/conda-analytics/bin/python3.\n"
]
}
],
"source": [
"import wmfdata\n",
"spark = wmfdata.spark.get_session(type='yarn-regular')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7c13ed6f",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.parquet('/user/dsaez/sec_english_without_images_recs')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "e43d54a1",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import map_values,explode"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "354c3a12",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+--------------+----+\n",
"| item_id| target_title| target_heading| recommended_images|target_wiki_db|imgs|\n",
"+----------+--------------------+--------------------+--------------------+--------------+----+\n",
"| Q1003844| Newberry, Florida| geography|[[iowiki -> [FLMa...| enwiki| 2|\n",
"| Q1008| Ivory Coast| external links|[[gdwiki -> [Côte...| enwiki| 2|\n",
"| Q1010602| Giorgio Gaber| discography|[[eswiki -> [Ombr...| enwiki| 2|\n",
"| Q1022913| Haizhu District| transportation|[[ruwiki -> [Brid...| enwiki| 1|\n",
"| Q1023770| Taiji, Wakayama| sister cities|[[zhwiki -> [Broo...| enwiki| 1|\n",
"| Q1025260| Huncovce| geography|[[dewiki -> [Hunc...| enwiki| 1|\n",
"| Q1027725| Gotland Russ| references|[[euwiki -> [Gotl...| enwiki| 1|\n",
"| Q10284326|People's Armed Fo...| history|[[eswiki -> [East...| enwiki| 1|\n",
"| Q1029041| Tupi Football Club| history|[[ruwiki -> [Крас...| enwiki| 1|\n",
"| Q1029522| Le Ponchel| geography|[[ocwiki -> [Blan...| enwiki| 1|\n",
"| Q1059798| Baho| population|[[anwiki -> [Popu...| enwiki| 7|\n",
"|Q106218052|Rollin' (Brave Gi...| reception|[[ptwiki -> [Brav...| enwiki| 1|\n",
"| Q1064421| Sei Itō| life|[[jawiki -> [Niho...| enwiki| 1|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki| 3|\n",
"|Q107674650| Percy Liza| career statistics|[[frpwiki -> [Fla...| enwiki| 4|\n",
"| Q1090156| Chvalšiny| history|[[dewiki -> [Kals...| enwiki| 1|\n",
"|Q109466988|List of minor pla...| external links|[[itwiki -> [Gali...| enwiki| 1|\n",
"| Q1102383| Bassoncourt| population|[[zhwiki -> [Popu...| enwiki| 1|\n",
"| Q1137509|2006–07 Coupe de ...| final|[[frwiki -> [Olym...| enwiki| 3|\n",
"| Q1137605| Martizay| population|[[zhwiki -> [Popu...| enwiki| 1|\n",
"+----------+--------------------+--------------------+--------------------+--------------+----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.selectExpr('*','size(recommended_images) as imgs').show()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "69d99f4c",
"metadata": {},
"outputs": [],
"source": [
"df2 = df.selectExpr('*','explode(recommended_images) as wiki_to_images')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "933d430b",
"metadata": {},
"outputs": [],
"source": [
"df3 = df2.withColumn('imgs',explode(map_values('wiki_to_images')))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "fff04bf5",
"metadata": {},
"outputs": [],
"source": [
"df4 = df3.withColumn('img',explode('imgs'))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "b36406eb",
"metadata": {},
"outputs": [],
"source": [
"import pyspark.sql.functions as sql_fun\n",
"df5 = df4.filter(sql_fun.lower(df4.img).contains(\".jpg\"))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "843e6d02",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+\n",
"| item_id| target_title| target_heading| recommended_images|target_wiki_db| wiki_to_images| imgs| img|\n",
"+----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+\n",
"| Q1008| Ivory Coast| external links|[[gdwiki -> [Côte...| enwiki|[gdwiki -> [Côte ...|[Côte d'Ivoire Ma...|Côte d'Ivoire Map...|\n",
"| Q1010602| Giorgio Gaber| discography|[[eswiki -> [Ombr...| enwiki|[eswiki -> [Ombre...|[Ombretta Colli.jpg]| Ombretta Colli.jpg|\n",
"| Q1010602| Giorgio Gaber| discography|[[eswiki -> [Ombr...| enwiki|[plwiki -> [Giorg...|[Giorgio Gaber li...|Giorgio Gaber liv...|\n",
"| Q1022913| Haizhu District| transportation|[[ruwiki -> [Brid...| enwiki|[ruwiki -> [Bridg...|[Bridge at Guangz...|Bridge at Guangzh...|\n",
"| Q1022913| Haizhu District| transportation|[[ruwiki -> [Brid...| enwiki|[ruwiki -> [Bridg...|[Bridge at Guangz...|THZ 004 Pass thru...|\n",
"| Q1022913| Haizhu District| transportation|[[ruwiki -> [Brid...| enwiki|[ruwiki -> [Bridg...|[Bridge at Guangz...|Canton Tower Stat...|\n",
"| Q1023770| Taiji, Wakayama| sister cities|[[zhwiki -> [Broo...| enwiki|[zhwiki -> [Broom...|[BroomeJapaneseCe...|BroomeJapaneseCem...|\n",
"| Q1025260| Huncovce| geography|[[dewiki -> [Hunc...| enwiki|[dewiki -> [Hunco...|[Huncovce, widok ...|Huncovce, widok n...|\n",
"| Q1027725| Gotland Russ| references|[[euwiki -> [Gotl...| enwiki|[euwiki -> [Gotla...|[GotlandrussAussc...|GotlandrussAussch...|\n",
"| Q1059798| Baho| population|[[anwiki -> [Popu...| enwiki|[frwiki -> [Caste...|[Castells bao 200...|Castells bao 2005...|\n",
"|Q106218052|Rollin' (Brave Gi...| reception|[[ptwiki -> [Brav...| enwiki|[ptwiki -> [Brave...|[Brave Girls PAK....| Brave Girls PAK.jpg|\n",
"| Q1064421| Sei Itō| life|[[jawiki -> [Niho...| enwiki|[jawiki -> [Nihon...|[Nihon-kindai-bun...|Nihon-kindai-bung...|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki|[astwiki -> [Solj...|[Solja Tinganes.j...| Solja Tinganes.jpg|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki|[astwiki -> [Solj...|[Solja Tinganes.j...|Torshavn town hal...|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki|[dewiki -> [Tinga...| [Tinganes.jpg]| Tinganes.jpg|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki|[eswiki -> [Solja...|[Solja Tinganes.j...| Solja Tinganes.jpg|\n",
"| Q10704| Tórshavn|politics and gove...|[[astwiki -> [Sol...| enwiki|[eswiki -> [Solja...|[Solja Tinganes.j...|Torshavn town hal...|\n",
"| Q1090156| Chvalšiny| history|[[dewiki -> [Kals...| enwiki|[dewiki -> [Kalsc...|[Kalsching-2007-0...|Kalsching-2007-07...|\n",
"|Q109466988|List of minor pla...| external links|[[itwiki -> [Gali...| enwiki|[itwiki -> [Galil...|[Galileo Ida n Da...|Galileo Ida n Dac...|\n",
"| Q1137509|2006–07 Coupe de ...| final|[[frwiki -> [Olym...| enwiki|[frwiki -> [Olymp...|[Olympique de Mar...|Olympique de Mars...|\n",
"+----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df5.show()"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "48018001",
"metadata": {},
"outputs": [],
"source": [
"# Count the total number of recommendations (even if images are repated across wikis) per section\n",
"df6 = df5.select('item_id','target_title','target_heading').groupby('item_id','target_title','target_heading').count()"
]
},
{
"cell_type": "code",
"execution_count": 153,
"id": "215f9573",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"352751"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df6.count()"
]
},
{
"cell_type": "code",
"execution_count": 154,
"id": "3c87e79b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[item_id: string, target_title: string, target_heading: string, count: bigint]"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df6.cache()"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "4d04742a",
"metadata": {},
"outputs": [],
"source": [
"page_ids = spark.sql(\"\"\"\n",
"SELECT\n",
" item_id,\n",
" page_id,\n",
"FROM wmf.wikidata_item_page_link\n",
"WHERE snapshot='2023-01-02' \n",
" AND wiki_db = 'enwiki'\n",
" AND page_namespace = 0\n",
"\"\"\")\n",
"\n",
"non_redirects = spark.sql(\"\"\"\n",
"\n",
"SELECT DISTINCT page_id\n",
"FROM wmf.mediawiki_history\n",
"WHERE snapshot='2022-12' \n",
" AND wiki_db = 'enwiki'\n",
" AND page_is_redirect = False\n",
" AND page_namespace=0\n",
"\"\"\")\n",
"page_ids = page_ids.join(non_redirects,'page_id')"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "9dbb4d25",
"metadata": {},
"outputs": [],
"source": [
"df7 = df6.join(page_ids,'item_id')"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "c307cbc4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"352750"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df7.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3eeb57d5",
"metadata": {},
"outputs": [],
"source": [
"output = df7.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 172,
"id": "8b00710a",
"metadata": {},
"outputs": [],
"source": [
"output.rename(columns={'count':'n_recommendations','target_title':'page_title','target_heading':'section_heading','item_id':'wikidata_id'},inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 173,
"id": "31faad48",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>wikidata_id</th>\n",
" <th>page_title</th>\n",
" <th>section_heading</th>\n",
" <th>n_recommendations</th>\n",
" <th>page_id</th>\n",
" <th>page_title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Q1012273</td>\n",
" <td>Heinzenberg Castle</td>\n",
" <td>history</td>\n",
" <td>4</td>\n",
" <td>53359722</td>\n",
" <td>Heinzenberg_Castle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Q1013544</td>\n",
" <td>Kaza, Himachal Pradesh</td>\n",
" <td>festivals &amp; tourism</td>\n",
" <td>1</td>\n",
" <td>21521011</td>\n",
" <td>Kaza,_Himachal_Pradesh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Q1014119</td>\n",
" <td>Fort McPherson, Northwest Territories</td>\n",
" <td>history</td>\n",
" <td>1</td>\n",
" <td>3122565</td>\n",
" <td>Fort_McPherson,_Northwest_Territories</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Q1018115</td>\n",
" <td>2007–08 Boston Celtics season</td>\n",
" <td>roster</td>\n",
" <td>1</td>\n",
" <td>11959669</td>\n",
" <td>2007–08_Boston_Celtics_season</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Q1018191</td>\n",
" <td>Petrovec Municipality</td>\n",
" <td>geography</td>\n",
" <td>1</td>\n",
" <td>4823615</td>\n",
" <td>Petrovec_Municipality</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>352745</th>\n",
" <td>Q977915</td>\n",
" <td>Golden Raspberry Award for Worst Supporting Ac...</td>\n",
" <td>winners and nominees</td>\n",
" <td>37</td>\n",
" <td>20999821</td>\n",
" <td>Golden_Raspberry_Award_for_Worst_Supporting_Ac...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>352746</th>\n",
" <td>Q977926</td>\n",
" <td>2006 Champ Car season</td>\n",
" <td>drivers and teams</td>\n",
" <td>1</td>\n",
" <td>4795854</td>\n",
" <td>2006_Champ_Car_season</td>\n",
" </tr>\n",
" <tr>\n",
" <th>352747</th>\n",
" <td>Q985972</td>\n",
" <td>Palmeiras de Goiás</td>\n",
" <td>location</td>\n",
" <td>3</td>\n",
" <td>4322036</td>\n",
" <td>Palmeiras_de_Goiás</td>\n",
" </tr>\n",
" <tr>\n",
" <th>352748</th>\n",
" <td>Q993714</td>\n",
" <td>Kenora</td>\n",
" <td>economy</td>\n",
" <td>1</td>\n",
" <td>179037</td>\n",
" <td>Kenora</td>\n",
" </tr>\n",
" <tr>\n",
" <th>352749</th>\n",
" <td>Q999772</td>\n",
" <td>Fender Starcaster</td>\n",
" <td>construction</td>\n",
" <td>1</td>\n",
" <td>736150</td>\n",
" <td>Fender_Starcaster</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>352750 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" wikidata_id page_title \\\n",
"0 Q1012273 Heinzenberg Castle \n",
"1 Q1013544 Kaza, Himachal Pradesh \n",
"2 Q1014119 Fort McPherson, Northwest Territories \n",
"3 Q1018115 2007–08 Boston Celtics season \n",
"4 Q1018191 Petrovec Municipality \n",
"... ... ... \n",
"352745 Q977915 Golden Raspberry Award for Worst Supporting Ac... \n",
"352746 Q977926 2006 Champ Car season \n",
"352747 Q985972 Palmeiras de Goiás \n",
"352748 Q993714 Kenora \n",
"352749 Q999772 Fender Starcaster \n",
"\n",
" section_heading n_recommendations page_id \\\n",
"0 history 4 53359722 \n",
"1 festivals & tourism 1 21521011 \n",
"2 history 1 3122565 \n",
"3 roster 1 11959669 \n",
"4 geography 1 4823615 \n",
"... ... ... ... \n",
"352745 winners and nominees 37 20999821 \n",
"352746 drivers and teams 1 4795854 \n",
"352747 location 3 4322036 \n",
"352748 economy 1 179037 \n",
"352749 construction 1 736150 \n",
"\n",
" page_title \n",
"0 Heinzenberg_Castle \n",
"1 Kaza,_Himachal_Pradesh \n",
"2 Fort_McPherson,_Northwest_Territories \n",
"3 2007–08_Boston_Celtics_season \n",
"4 Petrovec_Municipality \n",
"... ... \n",
"352745 Golden_Raspberry_Award_for_Worst_Supporting_Ac... \n",
"352746 2006_Champ_Car_season \n",
"352747 Palmeiras_de_Goiás \n",
"352748 Kenora \n",
"352749 Fender_Starcaster \n",
"\n",
"[352750 rows x 6 columns]"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "07348808",
"metadata": {},
"outputs": [],
"source": [
"#sort columns\n",
"output = output[['wikidata_id','page_id','page_title','section_heading','n_recommendations']]"
]
},
{
"cell_type": "code",
"execution_count": 176,
"id": "2c270181",
"metadata": {},
"outputs": [],
"source": [
"output.to_csv('RecsPerSectionEnwiki.csv.gz',index=False,compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment