Remove redundant interlanguage links from category pages that are already linked by {{Interwiki from Wikidata}}.
# Script written by Gabriel Lee Chi Hong
# Commons user page: https://commons.wikimedia.org/wiki/User:Gabrielchihonglee
#
# With great support from Zhuyifei1999
# Commons user page: https://commons.wikimedia.org/wiki/User:Zhuyifei1999
#
# Written in late March 2018 (during my public exam, the DSE)
# BRFA: https://commons.wikimedia.org/wiki/Commons:Bots/Requests/Gabrielchihonglee-Bot_(4)
#
# Published under the terms of MIT License
# https://choosealicense.com/licenses/mit/
import re
import pywikibot
import mwparserfromhell
from pywikibot import xmlreader
# setting up pywikibot
site = pywikibot.Site('commons', 'commons', 'Gabrielchihonglee-Bot')
famname = 'wikipedia'
famlang = site.family.load(famname).langs
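# redirect_check follows a chain of redirects, category redirects and page
# moves, returning the final target page. It stops on loops, cross-wiki
# redirects, section links and pages pywikibot cannot load, so the caller
# always gets back a usable Page object (possibly the unresolved original).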
def redirect_check(link):
    page = pywikibot.Page(link)
    processed = set()
    site = page.site
    while True:
        if page in processed:
            break  # redirect loop detected
        processed.add(page)
        if page.site != site:
            break  # cross-wiki redirect
        if page.section():
            break  # following section links loses information
        try:
            page.site.loadpageinfo(page)
        except Exception:
            pywikibot.exception()
            break
        if not hasattr(page, '_pageid'):
            break  # not a MediaWiki-accepted page, e.g. special pages
        if page.exists():
            if page.isRedirectPage():
                try:
                    page = page.getRedirectTarget()
                    continue
                except pywikibot.CircularRedirect:
                    break
            elif page.isCategoryRedirect():
                page = page.getCategoryRedirectTarget()
                continue
            else:
                break
        else:
            try:
                logs = page.site.logevents(logtype='move', page=page, total=1)
            except Exception:
                pywikibot.exception()
                break
            for log in logs:
                break  # take the most recent move entry
            else:
                break  # no move log: page was deleted without being moved
            try:
                page = log.target_page
                continue
            except Exception:
                pywikibot.exception()
                break
    return page
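# interwiki_check scans wikitext for plain [[lang:Title]] links pointing to a
# Wikipedia language edition, skipping piped links, section links and anything
# that does not resolve to the expected family and language code. For each
# genuine interwiki link it yields (resolved target page, original wikitext).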
def interwiki_check(wikitext):
    for regex_obj in pywikibot.link_regex.finditer(wikitext):
        # normalize the matched title
        title = regex_obj.group('title').strip()
        if ':' not in title:
            continue
        if any(char in regex_obj.group(0) for char in '|#'):
            continue
        lang = title.split(':', 1)[0].strip()
        if lang not in famlang:
            continue
        # confirm the match really is an interwiki link by parsing it
        try:
            link_obj = pywikibot.Link(title, source=site)
            link_obj.parse()
        except pywikibot.Error:
            continue
        if link_obj.site == site:
            continue
        if link_obj.site.family.name != famname:
            continue
        if link_obj.site.code != lang:
            continue
        yield redirect_check(link_obj), regex_obj.group(0)
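# template_param returns the exact {{Interwiki from Wikidata}} invocation used
# on the page, so any parameters it carries are preserved when the template is
# expanded later; it falls back to a bare transclusion if none is found.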
def template_param(page):
    code = mwparserfromhell.parse(page.text)
    template_name_regex = re.compile(r'(?i)Interwiki from Wikidata')
    for template in code.filter_templates(recursive=False):
        template_name = template.name.strip().replace('_', ' ')
        if template_name_regex.match(template_name):
            return str(template)
    return '{{Interwiki from Wikidata}}'
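# worker expands the page's {{Interwiki from Wikidata}} call via the API,
# collects the interwiki links that expansion produces, strips from the page
# text any hand-written link whose resolved target duplicates one of them,
# then saves with a summary noting how many links were removed and kept.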
def worker(page):
    api_result = site.expand_text(template_param(page), title=page.title())
    links_template = set(dict(interwiki_check(api_result)))
    links_page = list(interwiki_check(page.text))
    links_dup = set(wikitext
                    for target, wikitext in links_page
                    if target in links_template)
    if not links_dup:
        return
    old_page_text = page.text
    for link in links_dup:
        page.text = pywikibot.textlib.replaceExcept(
            page.text,
            '(\n)?' + re.escape(link) + '(?(1)(?=\n))',
            '',
            # do not touch matches inside these elements
            ['comment', 'header', 'pre', 'source', 'ref', 'table', 'gallery'])
    pywikibot.showDiff(old_page_text, page.text)
    num_removed = len(links_dup)
    num_left = len(links_page) - num_removed
    summary = '[[Commons:Bot|Bot]]: Removing {} interlanguage links provided by {{{{[[Template:Interwiki from Wikidata|Interwiki from Wikidata]]}}}}'.format(num_removed)
    if num_left:
        summary += ', {} links left'.format(num_left)
    page.save(summary)
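# Main loop: iterate over the latest Commons dump (the Toolforge public dump
# path below), pre-filter category pages (namespace 14) that contain at least
# one interwiki link, and only then hit the live wiki to confirm the page
# transcludes {{Interwiki from Wikidata}} before handing it to worker().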
dump = xmlreader.XmlDump('/public/dumps/public/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2')
template_page = pywikibot.Page(site, title='Template:Interwiki from Wikidata')
for i, page in enumerate(dump.parse()):
    if int(page.ns) != 14:
        continue  # categories only
    text = page.text
    for x in interwiki_check(text):
        # found at least one interwiki link; stop scanning
        break
    else:
        # no interwiki links on this page; go to the next dump entry
        continue
    # pywikibot.output('{}:{}'.format(i, page.title))
    page = pywikibot.Page(site, page.title)
    templates = page.templates()
    if template_page in templates:
        try:
            worker(page)
        except pywikibot.Error:
            pywikibot.exception()
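# A rough way to try the resolver on its own (hypothetical title, assuming a
# configured pywikibot environment with the bot account logged in):
#   target = redirect_check(pywikibot.Link('en:Example', source=site))
#   pywikibot.output(target.title())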