Remove redundant interlanguage links from category pages when the same links are already provided by {{Interwiki from Wikidata}}.
# Script written by Gabriel Lee Chi Hong
# Commons user page: https://commons.wikimedia.org/wiki/User:Gabrielchihonglee
#
# With great support from Zhuyifei1999
# Commons user page: https://commons.wikimedia.org/wiki/User:Zhuyifei1999
#
# Written in late March 2018 (during my public exam, the DSE)
# BRFA: https://commons.wikimedia.org/wiki/Commons:Bots/Requests/Gabrielchihonglee-Bot_(4)
#
# Published under the terms of the MIT License
# https://choosealicense.com/licenses/mit/
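#
# Illustrative example (the category name is hypothetical): on a page that
# contains both
#   {{Interwiki from Wikidata}}
#   [[en:Category:Example]]
# the raw [[en:Category:Example]] link duplicates the English link the
# template already renders from Wikidata, so the script removes it.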
import re

import pywikibot
import mwparserfromhell
from pywikibot import xmlreader
# setting up pywikibot
site = pywikibot.Site('commons', 'commons', 'Gabrielchihonglee-Bot')
famname = 'wikipedia'
# language codes of the Wikipedia family, used to recognize interlanguage prefixes
famlang = site.family.load(famname).langs
def redirect_check(link):
    """Resolve a link through redirects, category redirects and move logs."""
    page = pywikibot.Page(link)
    processed = set()
    site = page.site
    while True:
        if page in processed:
            break  # redirect loop detected
        processed.add(page)
        if page.site != site:
            break  # cross-wiki redirect
        if page.section():
            break  # following sections loses information
        try:
            page.site.loadpageinfo(page)
        except Exception:
            pywikibot.exception()
            break
        if not hasattr(page, '_pageid'):
            break  # not a MediaWiki-accepted page, e.g. special pages
        if page.exists():
            if page.isRedirectPage():
                try:
                    page = page.getRedirectTarget()
                    continue
                except pywikibot.CircularRedirect:
                    break
            elif page.isCategoryRedirect():
                page = page.getCategoryRedirectTarget()
                continue
            else:
                break
        else:
            try:
                logs = page.site.logevents(logtype='move', page=page, total=1)
            except Exception:
                pywikibot.exception()
                break
            for log in logs:
                break  # take the most recent move log entry
            else:
                break  # no move log: page was deleted without moving
            try:
                page = log.target_page
                continue
            except Exception:
                pywikibot.exception()
                break
    return page
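
# For example (hypothetical titles): if [[en:Category:Old name]] redirects to
# [[en:Category:New name]], redirect_check returns the page for the new name,
# so a hard-coded link and the template's Wikidata-derived link compare equal
# even when they use different titles.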
def interwiki_check(wikitext):
    """Yield (resolved target page, original wikitext) for each interlanguage link."""
    for regex_obj in pywikibot.link_regex.finditer(wikitext):
        # extract and normalize the matched link title
        title = regex_obj.group('title').strip()
        if ':' not in title: continue
        if any(char in regex_obj.group(0) for char in '|#'): continue
        lang = title.split(':', 1)[0].strip()
        if lang not in famlang: continue
        # confirm it is an interlanguage link by parsing the target
        try:
            link_obj = pywikibot.Link(title, source=site)
            link_obj.parse()
        except pywikibot.Error:
            continue
        if link_obj.site == site: continue
        if link_obj.site.family.name != famname: continue
        if link_obj.site.code != lang: continue
        yield redirect_check(link_obj), regex_obj.group(0)
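
# For the wikitext '[[de:Kategorie:Beispiel]]' (a hypothetical example),
# interwiki_check yields the resolved de.wikipedia page together with the
# literal link text, so callers can compare resolved targets while still
# removing the exact original wikitext.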
def template_param(page):
    """Return the {{Interwiki from Wikidata}} call on the page, parameters included."""
    code = mwparserfromhell.parse(page.text)
    template_name_regex = re.compile(r'(?i)Interwiki from Wikidata')
    for template in code.filter_templates(recursive=False):
        template_name = template.name.strip().replace('_', ' ')
        if template_name_regex.match(template_name):
            return str(template)
    # fall back to a bare transclusion if the template call was not found
    return '{{Interwiki from Wikidata}}'
def worker(page):
    """Remove interlanguage links that duplicate what the template provides."""
    # expand the template call as it appears on this page
    api_result = site.expand_text(template_param(page), title=page.title())
    # pages the template links to
    links_template = set(dict(interwiki_check(api_result)))
    # links hard-coded in the page text
    links_page = list(interwiki_check(page.text))
    links_dup = set(wikitext
                    for target, wikitext in links_page
                    if target in links_template)
    if not links_dup: return
    old_page_text = page.text
    for link in links_dup:
        page.text = pywikibot.textlib.replaceExcept(
            page.text,
            # also consume the preceding newline, but only when the link
            # ends its own line (conditional group on the leading '\n')
            '(\n)?' + re.escape(link) + '(?(1)(?=\n))',
            '',
            # skip matches inside the following elements
            ['comment', 'header', 'pre', 'source', 'ref', 'table', 'gallery']
        )
    pywikibot.showDiff(old_page_text, page.text)
    num_removed = len(links_dup)
    num_left = len(links_page) - num_removed
    summary = '[[Commons:Bot|Bot]]: Removing {} interlanguage links provided by {{{{[[Template:Interwiki from Wikidata|Interwiki from Wikidata]]}}}}'.format(num_removed)
    if num_left:
        summary += ', {} links left'.format(num_left)
    page.save(summary)
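
# The dump path below assumes the script runs on a Wikimedia Toolforge or
# Cloud VPS host, where the public dumps are mounted under /public/dumps.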
dump = xmlreader.XmlDump('/public/dumps/public/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2')
template_page = pywikibot.Page(site, title='Template:Interwiki from Wikidata')
for i, page in enumerate(dump.parse()):
    if int(page.ns) != 14: continue  # only category pages
    text = page.text
    for x in interwiki_check(text):
        break  # at least one interlanguage link: keep processing this page
    else:
        continue  # no interlanguage links: next page in the dump
    # pywikibot.output('{}:{}'.format(i, page.title))
    page = pywikibot.Page(site, page.title)
    templates = page.templates()
    if template_page in templates:
        try:
            worker(page)
        except pywikibot.Error:
            pywikibot.exception()