Remove redundant interlanguage links from category pages when the same links are already provided by {{Interwiki from Wikidata}}.
# Script written by Gabriel Lee Chi Hong
# Commons user page: https://commons.wikimedia.org/wiki/User:Gabrielchihonglee
#
# With great support from Zhuyifei1999
# Commons user page: https://commons.wikimedia.org/wiki/User:Zhuyifei1999
#
# Written in late March 2018 (during my public exam, the DSE)
# BRFA: https://commons.wikimedia.org/wiki/Commons:Bots/Requests/Gabrielchihonglee-Bot_(4)
#
# Published under the terms of the MIT License
# https://choosealicense.com/licenses/mit/
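#
# Illustrative example (the category name is hypothetical): on a page that
# contains both
#   {{Interwiki from Wikidata}}
#   [[en:Category:Example]]
# the raw [[en:Category:Example]] link duplicates the English link the
# template already renders from Wikidata, so the script removes it.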
import re

import pywikibot
import mwparserfromhell
from pywikibot import xmlreader
# setting up pywikibot
site = pywikibot.Site('commons', 'commons', 'Gabrielchihonglee-Bot')
famname = 'wikipedia'
# language codes of the Wikipedia family, used to recognize interlanguage prefixes
famlang = site.family.load(famname).langs
def redirect_check(link):
    """Resolve a link through redirects, category redirects and move logs."""
    page = pywikibot.Page(link)
    processed = set()
    site = page.site
    while True:
        if page in processed:
            break  # redirect loop detected
        processed.add(page)
        if page.site != site:
            break  # cross-wiki redirect
        if page.section():
            break  # following sections loses information
        try:
            page.site.loadpageinfo(page)
        except Exception:
            pywikibot.exception()
            break
        if not hasattr(page, '_pageid'):
            break  # not a MediaWiki-accepted page, e.g. special pages
        if page.exists():
            if page.isRedirectPage():
                try:
                    page = page.getRedirectTarget()
                    continue
                except pywikibot.CircularRedirect:
                    break
            elif page.isCategoryRedirect():
                page = page.getCategoryRedirectTarget()
                continue
            else:
                break
        else:
            try:
                logs = page.site.logevents(logtype='move', page=page, total=1)
            except Exception:
                pywikibot.exception()
                break
            for log in logs:
                break  # take the most recent move log entry
            else:
                break  # no move log: page was deleted without moving
            try:
                page = log.target_page
                continue
            except Exception:
                pywikibot.exception()
                break
    return page
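
# For example (hypothetical titles): if [[en:Category:Old name]] redirects to
# [[en:Category:New name]], redirect_check returns the page for the new name,
# so a hard-coded link and the template's Wikidata-derived link compare equal
# even when they use different titles.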
def interwiki_check(wikitext):
    """Yield (resolved target page, original wikitext) for each interlanguage link."""
    for regex_obj in pywikibot.link_regex.finditer(wikitext):
        # extract and normalize the matched link title
        title = regex_obj.group('title').strip()
        if ':' not in title: continue
        if any(char in regex_obj.group(0) for char in '|#'): continue
        lang = title.split(':', 1)[0].strip()
        if lang not in famlang: continue
        # confirm it is an interlanguage link by parsing the target
        try:
            link_obj = pywikibot.Link(title, source=site)
            link_obj.parse()
        except pywikibot.Error:
            continue
        if link_obj.site == site: continue
        if link_obj.site.family.name != famname: continue
        if link_obj.site.code != lang: continue
        yield redirect_check(link_obj), regex_obj.group(0)
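
# For the wikitext '[[de:Kategorie:Beispiel]]' (a hypothetical example),
# interwiki_check yields the resolved de.wikipedia page together with the
# literal link text, so callers can compare resolved targets while still
# removing the exact original wikitext.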
def template_param(page):
    """Return the {{Interwiki from Wikidata}} call on the page, parameters included."""
    code = mwparserfromhell.parse(page.text)
    template_name_regex = re.compile(r'(?i)Interwiki from Wikidata')
    for template in code.filter_templates(recursive=False):
        template_name = template.name.strip().replace('_', ' ')
        if template_name_regex.match(template_name):
            return str(template)
    # fall back to a bare transclusion if the template call was not found
    return '{{Interwiki from Wikidata}}'
def worker(page):
    """Remove interlanguage links that duplicate what the template provides."""
    # expand the template call as it appears on this page
    api_result = site.expand_text(template_param(page), title=page.title())
    # pages the template links to
    links_template = set(dict(interwiki_check(api_result)))
    # links hard-coded in the page text
    links_page = list(interwiki_check(page.text))
    links_dup = set(wikitext
                    for target, wikitext in links_page
                    if target in links_template)
    if not links_dup: return
    old_page_text = page.text
    for link in links_dup:
        page.text = pywikibot.textlib.replaceExcept(
            page.text,
            # also consume the preceding newline, but only when the link
            # ends its own line (conditional group on the leading '\n')
            '(\n)?' + re.escape(link) + '(?(1)(?=\n))',
            '',
            # skip matches inside the following elements
            ['comment', 'header', 'pre', 'source', 'ref', 'table', 'gallery']
        )
    pywikibot.showDiff(old_page_text, page.text)
    num_removed = len(links_dup)
    num_left = len(links_page) - num_removed
    summary = '[[Commons:Bot|Bot]]: Removing {} interlanguage links provided by {{{{[[Template:Interwiki from Wikidata|Interwiki from Wikidata]]}}}}'.format(num_removed)
    if num_left:
        summary += ', {} links left'.format(num_left)
    page.save(summary)
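
# The dump path below assumes the script runs on a Wikimedia Toolforge or
# Cloud VPS host, where the public dumps are mounted under /public/dumps.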
dump = xmlreader.XmlDump('/public/dumps/public/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2')
template_page = pywikibot.Page(site, title='Template:Interwiki from Wikidata')
for i, page in enumerate(dump.parse()):
    if int(page.ns) != 14: continue  # only category pages
    text = page.text
    for x in interwiki_check(text):
        break  # at least one interlanguage link: keep processing this page
    else:
        continue  # no interlanguage links: next page in the dump
    # pywikibot.output('{}:{}'.format(i, page.title))
    page = pywikibot.Page(site, page.title)
    templates = page.templates()
    if template_page in templates:
        try:
            worker(page)
        except pywikibot.Error:
            pywikibot.exception()