Skip to content

Instantly share code, notes, and snippets.

@digitalTranshumant
Created June 29, 2022 16:19
Show Gist options
  • Save digitalTranshumant/35a182eb226392c08f46db489162fd01 to your computer and use it in GitHub Desktop.
Save digitalTranshumant/35a182eb226392c08f46db489162fd01 to your computer and use it in GitHub Desktop.
blue links score
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "cc673716",
"metadata": {},
"source": [
"# Section \"topics\"\n",
"First we produce some samples\n",
"\n",
"Next, we compute relevance scores"
]
},
{
"cell_type": "markdown",
"id": "d9ae7e90",
"metadata": {},
"source": [
"## Samples\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b10c543a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using wmfdata v1.3.2, but v1.3.3 is available.\n",
"\n",
"To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release --ignore-installed`.\n",
"\n",
"To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md\n",
"PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n"
]
}
],
"source": [
"#Create a spark context\n",
"import wmfdata\n",
"\n",
"spark = wmfdata.spark.get_session(type='yarn-regular')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f9eeb11b",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.parquet('/user/mnz/secmap_results/sections/sections_2022-04.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2ca43a28",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[item_id: string, wiki_db: string, section_attributes: array<struct<heading:string,links:array<string>,count:bigint,pos_start_mean:double,pos_end_mean:double>>]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.cache()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d044ba51",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import explode\n",
"\n",
"# wikis covered by the samples\n",
"langs = ['enwiki','ptwiki','idwiki','eswiki','arwiki','cswiki','bnwiki','frwiki','ruwiki']\n",
"\n",
"# one row per element of section_attributes for the pages of those wikis;\n",
"# explode() puts each struct in a column named `col`, which selectExpr unpacks below\n",
"tmp = (df.where(df['wiki_db'].isin(langs))\n",
"         .select('wiki_db','item_id',explode('section_attributes')))\n",
"wikis = tmp.selectExpr(\n",
"    'wiki_db',\n",
"    'item_id as page_wikidata_item',\n",
"    'col.heading as section_heading',\n",
"    'col.links as section_links',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e29f47fa",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import explode_outer\n",
"\n",
"# explode_outer (unlike explode) still emits a row, with a null link, for sections without links\n",
"wikis2 = wikis.select(\n",
"    'wiki_db',\n",
"    'page_wikidata_item',\n",
"    'section_heading',\n",
"    explode_outer('section_links').alias('section_link_item_id'),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e52fcd9a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+\n",
"| partition|\n",
"+-------------------+\n",
"|snapshot=2022-03-28|\n",
"|snapshot=2022-04-04|\n",
"|snapshot=2022-04-11|\n",
"|snapshot=2022-04-18|\n",
"|snapshot=2022-04-25|\n",
"|snapshot=2022-05-02|\n",
"|snapshot=2022-05-09|\n",
"+-------------------+\n",
"\n"
]
}
],
"source": [
"spark.sql('show partitions wmf.wikidata_item_page_link ').show()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2ad68f02",
"metadata": {},
"outputs": [],
"source": [
"# titles of the articles (page_namespace=0) per (wiki, Wikidata item)\n",
"titles = spark.sql('''SELECT wiki_db, page_title,item_id as page_wikidata_item\n",
" FROM wmf.wikidata_item_page_link \n",
" WHERE snapshot='2022-04-04' AND page_namespace=0 ''')\n",
"# keep the sampled wikis only; the same (wikidata_item, page_title) pair can be listed under\n",
"# several page_ids (redirects), and those repeats are not wanted, so duplicates are dropped\n",
"titles = titles.filter(titles['wiki_db'].isin(langs)).dropDuplicates()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "af86fe62",
"metadata": {},
"outputs": [],
"source": [
"# attach each page's own title (default inner join, so pages without a title row drop out)\n",
"output_data = (\n",
"    wikis2.join(titles, ['page_wikidata_item','wiki_db'])\n",
"    .select('wiki_db','page_wikidata_item','page_title','section_heading','section_link_item_id')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "4fb9f77e",
"metadata": {},
"outputs": [],
"source": [
"# label each linked item with its title in the same wiki; with the left join the label\n",
"# stays null when no title row matches\n",
"output_data = output_data.join(\n",
"    titles.selectExpr(\n",
"        'wiki_db',\n",
"        'page_wikidata_item as section_link_item_id',\n",
"        'page_title as section_link_item_id_label',\n",
"    ),\n",
"    on=['wiki_db','section_link_item_id'],\n",
"    how='left',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7050a9c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[wiki_db: string, section_link_item_id: string, page_wikidata_item: string, page_title: string, section_heading: string, section_link_item_id_label: string]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_data.cache()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "cfad2fb9",
"metadata": {},
"outputs": [],
"source": [
"# put the columns in the expected order (page_wikidata_item is not selected)\n",
"output_data = output_data.select(\n",
"    'wiki_db',\n",
"    'page_title',\n",
"    'section_heading',\n",
"    'section_link_item_id',\n",
"    'section_link_item_id_label',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "964e3335",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"enwiki\n",
"ptwiki\n",
"idwiki\n",
"eswiki\n",
"arwiki\n",
"cswiki\n",
"bnwiki\n",
"frwiki\n",
"ruwiki\n"
]
}
],
"source": [
"from pyspark.sql.functions import rand\n",
"\n",
"N_ARTICLES = 100  # articles sampled per wiki\n",
"SEED = 8  # fixed seed passed to rand() when shuffling the candidate articles\n",
"\n",
"\n",
"def wikidata_link(item_id):\n",
"    \"\"\"Return an Excel HYPERLINK formula for a Wikidata item, or '' when item_id is empty/None.\"\"\"\n",
"    if not item_id:\n",
"        return ''\n",
"    return '=HYPERLINK(\"https://www.wikidata.org/wiki/{0}\",\"{0}\")'.format(item_id)\n",
"\n",
"\n",
"def article_link(wiki_db, title):\n",
"    \"\"\"Return an Excel HYPERLINK formula for an article of wiki_db, or '' when title is empty/None.\"\"\"\n",
"    if not title:\n",
"        return ''\n",
"    # 'enwiki' -> 'en', 'zh_min_nanwiki' -> 'zh-min-nan' (taking two characters only fits two-letter codes)\n",
"    subdomain = wiki_db[:-len('wiki')].replace('_', '-')\n",
"    # a double quote inside the title would end the formula's string literal early:\n",
"    # percent-encode it in the URL and double it in the displayed text\n",
"    url = 'https://{0}.wikipedia.org/wiki/{1}'.format(subdomain, title.replace('\"', '%22'))\n",
"    return '=HYPERLINK(\"{0}\",\"{1}\")'.format(url, title.replace('\"', '\"\"'))\n",
"\n",
"\n",
"samples = {}\n",
"for lang in langs:\n",
"    print(lang)\n",
"    lang_tmp = output_data.where(output_data['wiki_db'] == lang).drop('page_wikidata_item')\n",
"    # draw random articles among the ones that have at least one bluelink\n",
"    random_articles = (lang_tmp.where(lang_tmp['section_link_item_id'].isNotNull())\n",
"                       .select('page_title').distinct()\n",
"                       .orderBy(rand(seed=SEED)).limit(N_ARTICLES).collect())\n",
"    page_titles = [p.page_title for p in random_articles]\n",
"    # all rows (every section and link) of the sampled articles, as a pandas frame\n",
"    sample = lang_tmp.where(lang_tmp['page_title'].isin(page_titles)).toPandas()\n",
"    sample = sample.sort_values(['page_title','section_heading'])\n",
"    sample['section_link_item_id'] = sample['section_link_item_id'].apply(wikidata_link)\n",
"    sample['page_title'] = sample['page_title'].apply(lambda title: article_link(lang, title))\n",
"    samples[lang] = sample"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65e8a1e8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"id": "1e0d65e9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# a single workbook with one sheet per wiki\n",
"with pd.ExcelWriter('100_articles_per_sample.xlsx') as writer:\n",
"    for sheet, frame in samples.items():\n",
"        frame.to_excel(writer, sheet_name=sheet, index=False)"
]
},
{
"cell_type": "markdown",
"id": "73166347",
"metadata": {},
"source": [
"## Relevance Score\n",
"\n",
"(This code is standalone: apart from the Spark session created at the top, it does not depend on the previous code, and it repeats some of it.)\n",
"\n",
"The relevance score is based on counting the number of languages containing a given pair (page, bluelink)."
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "44a3b4e8",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.parquet('/user/mnz/secmap_results/sections/sections_2022-04.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "5611846f",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import explode, explode_outer\n",
"\n",
"# one row per element of section_attributes, for every wiki in the dataset;\n",
"# explode() puts each struct in a column named `col`, which selectExpr unpacks\n",
"df_explode = (\n",
"    df.select('wiki_db','item_id',explode('section_attributes'))\n",
"    .selectExpr(\n",
"        'wiki_db',\n",
"        'item_id as page_wikidata_item',\n",
"        'col.heading as section_heading',\n",
"        'col.links as section_links',\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "b5c07562",
"metadata": {},
"outputs": [],
"source": [
"# one row per link; explode_outer emits a null link for sections without links\n",
"df_explode = df_explode.select(\n",
"    'wiki_db',\n",
"    'page_wikidata_item',\n",
"    'section_heading',\n",
"    explode_outer('section_links').alias('section_link_item_id'),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "d57d1913",
"metadata": {},
"outputs": [],
"source": [
"# titles of the articles (page_namespace=0) per (wiki, Wikidata item), for all wikis\n",
"titles = spark.sql('''SELECT wiki_db, page_title,item_id as page_wikidata_item\n",
" FROM wmf.wikidata_item_page_link \n",
" WHERE snapshot='2022-05-09' AND page_namespace=0 ''')\n",
"# the same (wikidata_item, page_title) pair can be listed under several page_ids (redirects);\n",
"# those repeats are not wanted, so duplicates are dropped\n",
"titles = titles.dropDuplicates()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "c04b6dc0",
"metadata": {},
"outputs": [],
"source": [
"# attach each page's own title (default inner join, so pages without a title row drop out)\n",
"df_explode = (\n",
"    df_explode.join(titles, ['page_wikidata_item','wiki_db'])\n",
"    .select('wiki_db','page_wikidata_item','page_title','section_heading','section_link_item_id')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "b56cbd00",
"metadata": {},
"outputs": [],
"source": [
"# label each linked item with its title in the same wiki; with the left join the label\n",
"# stays null when no title row matches\n",
"df_all = df_explode.join(\n",
"    titles.selectExpr(\n",
"        'wiki_db',\n",
"        'page_wikidata_item as section_link_item_id',\n",
"        'page_title as section_link_item_id_label',\n",
"    ),\n",
"    on=['wiki_db','section_link_item_id'],\n",
"    how='left',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "c2c17a10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+--------------------+------------------+--------------------+--------------------+--------------------------+\n",
"|wiki_db|section_link_item_id|page_wikidata_item| page_title| section_heading|section_link_item_id_label|\n",
"+-------+--------------------+------------------+--------------------+--------------------+--------------------------+\n",
"| abwiki| Q14399| Q9129| Абырзен_бызшәа| абырзен алфавит| Ι|\n",
"| abwiki| Q14402| Q9129| Абырзен_бызшәа| абырзен алфавит| Ν|\n",
"| abwiki| Q18792| Q1047| Неру,_Џьавахарлал| ақәыԥшра| 1929|\n",
"| abwiki| Q18792| Q40349| Гагра_араион| аҭоурых| 1929|\n",
"| abwiki| Q239346| Q21199|Иԥсабаратәу_ахыԥх...| ахыԥхьаӡарақәа 0–99| 90_(ахыԥхьаӡара)|\n",
"| abwiki| Q25705093| Q3596063|Арасаӡыхь_(Очамчыра)| ажәлақәа| Амҷба_(аҵакырацәара)|\n",
"| abwiki| Q25705093| Q4151202| Ӷәада| ажәлақәа| Амҷба_(аҵакырацәара)|\n",
"| abwiki| Q2774| Q1047| Неру,_Џьавахарлал|индиа раԥхьатәи а...| Нанҳәамза_15|\n",
"| abwiki| Q673849| Q21199|Иԥсабаратәу_ахыԥх...| ахыԥхьаӡарақәа 0–99| 97_(ахыԥхьаӡара)|\n",
"| abwiki| Q6776| Q16365995|Бедиатәи_аепископ...| аҭоурых| 1613|\n",
"| abwiki| Q6776| Q896086|Урыс_аҳәынҭқарра_...|семибоярщина и зе...| 1613|\n",
"| abwiki| Q6776| Q896086|Урыс_аҳәынҭқарра_...| урыс ацарцәа| 1613|\n",
"| abwiki| Q6776| Q31355368|Мықәтәи_аепископц...| аҭоурых| 1613|\n",
"| abwiki| Q6776| Q31355371|Драндатәи_аеписко...| аҭоурых| 1613|\n",
"| abwiki| Q6776| Q2646612| Драндатәи_ауахәама| аҭоурых| 1613|\n",
"| abwiki| Q712764| Q21199|Иԥсабаратәу_ахыԥх...| ахыԥхьаӡарақәа 0–99| 48_(ахыԥхьаӡара)|\n",
"|acewiki| Q12494691| Q12125983|Setia_Bakti,_Acèh...| gampông| Lhôk_Bot,_Setia_B...|\n",
"|acewiki| Q12508066| Q9637989|Krueng_Sabee,_Acè...| gampông| Ranto_Panyang,_Kr...|\n",
"|acewiki| Q150| Q142| Peurancih| peunawôt luwa| Bahsa_Peurancih|\n",
"|acewiki| Q150| Q90| Paris| peunawôt luwa| Bahsa_Peurancih|\n",
"+-------+--------------------+------------------+--------------------+--------------------+--------------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df_all.show()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "30b27ff6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- wiki_db: string (nullable = true)\n",
" |-- section_link_item_id: string (nullable = true)\n",
" |-- page_wikidata_item: string (nullable = true)\n",
" |-- page_title: string (nullable = true)\n",
" |-- section_heading: string (nullable = true)\n",
" |-- section_link_item_id_label: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"df_all.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "337a8835",
"metadata": {},
"outputs": [],
"source": [
"# Score = number of distinct wikis in which the (page, bluelink) pair occurs.\n",
"# df_all holds one row per section occurrence of a link, so the rows are first reduced to\n",
"# one per (wiki, page, link); counting df_all directly would count a link that appears in\n",
"# several sections of the same article once per section instead of once per language.\n",
"scores = (\n",
"    df_all.select('wiki_db','page_wikidata_item','section_link_item_id')\n",
"    .distinct()\n",
"    .groupby(['page_wikidata_item','section_link_item_id'])\n",
"    .count()\n",
"    .withColumnRenamed('count','score')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "4b800523",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+--------------------+-----+\n",
"|page_wikidata_item|section_link_item_id|score|\n",
"+------------------+--------------------+-----+\n",
"| Q5468519| Q1218| 2|\n",
"| Q19325| Q134128| 50|\n",
"| Q25289| Q15878| 1|\n",
"| Q105338| Q168828| 4|\n",
"| Q159915| Q1726| 11|\n",
"| Q221603| Q172843| 24|\n",
"| Q25614315| Q2064| 1|\n",
"| Q61359125| Q2104692| 2|\n",
"| Q5160564| Q2832| 1|\n",
"| Q817393| Q432| 4|\n",
"| Q5428| Q432| 17|\n",
"| Q210398| Q561490| 9|\n",
"| Q200325| Q635162| 7|\n",
"| Q5468| Q763039| 1|\n",
"| Q44| Q13049268| 1|\n",
"| Q5527| Q2725| 14|\n",
"| Q18190945| Q9333879| 1|\n",
"| Q5097794| Q1029430| 2|\n",
"| Q19568369| Q104228| 1|\n",
"| Q1762173| Q1141026| 4|\n",
"+------------------+--------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"scores.show()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "0c8dcc0a",
"metadata": {},
"outputs": [],
"source": [
"# attach the score to every row, then order descending by wiki, page title and score\n",
"# NOTE(review): this is an inner join on a nullable key, so rows whose section_link_item_id\n",
"# is null (sections without links) find no match and are dropped - confirm that is intended\n",
"df_all_with_score = (\n",
"    df_all.join(scores, ['page_wikidata_item','section_link_item_id'])\n",
"    .sort(['wiki_db','page_title','score'], ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "97839eaa",
"metadata": {},
"outputs": [],
"source": [
"df_all_with_score.write.parquet('sectionBlueLinksScores.parquet',mode='overwrite')"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "46d61c7c",
"metadata": {},
"outputs": [],
"source": [
"df_all_with_score_check = spark.read.parquet('sectionBlueLinksScores.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "d6df0ed8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+--------------------+-------+-------------------+--------------------+--------------------------+-----+\n",
"|page_wikidata_item|section_link_item_id|wiki_db| page_title| section_heading|section_link_item_id_label|score|\n",
"+------------------+--------------------+-------+-------------------+--------------------+--------------------------+-----+\n",
"| Q8075652| Q8023| zuwiki| Zwelakhe_Sisulu|umsebenzi awufundela| Nelson_Mandela| 3|\n",
"| Q8075652| Q83162| zuwiki| Zwelakhe_Sisulu| umlando wakhe siqu| African_National_...| 2|\n",
"| Q8075652| Q153081| zuwiki| Zwelakhe_Sisulu|umsebenzi awufundela| Ukuvukela_kweNtsh...| 2|\n",
"| Q8075652| Q5869016| zuwiki| Zwelakhe_Sisulu| bheka futhi| Umlando_we-ANC| 2|\n",
"| Q24235929| Q492042| zuwiki|Zvishavane_District| okuhlobene| Izifundazwe_zase_...| 1|\n",
"| Q24235929| Q5283558| zuwiki|Zvishavane_District| okuhlobene| Izifunda_zase_Zim...| 1|\n",
"| Q8075561| Q492042| zuwiki| Zvimba_District| okuhlobene| Izifundazwe_zase_...| 2|\n",
"| Q8075561| Q5283558| zuwiki| Zvimba_District| okuhlobene| Izifunda_zase_Zim...| 2|\n",
"| Q20020484| Q130840| zuwiki| Zukisa_Tshiqi| amareferensi| IMpumalanga_Kapa| 4|\n",
"| Q20020484| Q130840| zuwiki| Zukisa_Tshiqi|isiqalo sempilo y...| IMpumalanga_Kapa| 4|\n",
"| Q20020484| Q534643| zuwiki| Zukisa_Tshiqi| amareferensi| University_of_the...| 3|\n",
"| Q20020484| Q534643| zuwiki| Zukisa_Tshiqi|isiqalo sempilo y...| University_of_the...| 3|\n",
"| Q20020484| Q10610936| zuwiki| Zukisa_Tshiqi|isiqalo sempilo y...| INgcobo,_IMpumala...| 2|\n",
"| Q8073715| Q258| zuwiki| Zomato| umlando| IRiphabhuliki_yas...| 9|\n",
"| Q8073715| Q854| zuwiki| Zomato| umlando| Sri_Lanka| 9|\n",
"| Q8073715| Q664| zuwiki| Zomato| umlando| INyuzilandi| 9|\n",
"| Q8073715| Q145| zuwiki| Zomato| umlando| Umbuso_Ohlangeneyo| 9|\n",
"| Q8073715| Q155| zuwiki| Zomato| umlando| IBrazili| 8|\n",
"| Q8073715| Q252| zuwiki| Zomato| umlando| I-Indonesia| 8|\n",
"| Q8073715| Q43| zuwiki| Zomato| umlando| ITheki| 8|\n",
"+------------------+--------------------+-------+-------------------+--------------------+--------------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df_all_with_score_check.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7973b40d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment