-
-
Save JJMC89/d62c412f6fdceed02aa0b20ec728a6c6 to your computer and use it in GitHub Desktop.
Builds a list of rural localities by district/okrug
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python3 rus_loc.py -catr:'Rural localities in Arkhangelsk Oblast' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Builds a list of rural localities by district/okrug.

&params;
"""
# Author : JJMC89
# License: MIT
import copy

import mwparserfromhell
import pywikibot
from pywikibot.pagegenerators import GeneratorFactory, parameterHelp
from pywikibot.textlib import removeDisabledParts

# Placeholder -> help text mapping in the pywikibot script convention.
# The key must be spelled exactly like the placeholder line in the module
# docstring above, otherwise the generator help is not substituted.
docuReplacements = {'&params;': parameterHelp}  # pylint: disable=invalid-name

# Infobox template name -> name of the infobox parameter that holds the
# district. main() adds further keys at runtime: the page objects returned
# by get_template_pages() for each name listed here, mapped to the same
# parameter name.
CONFIG = {
    'Infobox settlement': 'subdivision_name3',
    'Infobox Russian inhabited locality': 'adm_district_jur',
}
def get_template_pages(templates, site=None):
    """
    Collect template pages and the redirect pages that link to them.

    String entries are turned into pages in namespace 10 of ``site``.
    An entry that is a redirect is replaced by its redirect target, and
    entries that do not exist are skipped. For every remaining template
    the template itself and its backlinks (requested with
    ``filterRedirects=True``) are added to the result.

    @param templates: templates, given as titles or as page objects
    @type templates: iterable
    @param site: site used for string titles; pywikibot.Site() if falsy
    @rtype: set
    """
    site = site or pywikibot.Site()
    found = set()
    for entry in templates:
        if isinstance(entry, str):
            candidate = pywikibot.Page(site, entry, ns=10)
        else:
            candidate = entry
        if candidate.isRedirectPage():
            candidate = candidate.getRedirectTarget()
        if candidate.exists():
            found.add(candidate)
            found.update(candidate.backlinks(filterRedirects=True))
    return found
def get_district(page):
    """
    Get the district from the article lead.

    The page text is stripped of disabled parts and parsed; only the
    first section returned by get_sections(include_lead=True) is
    examined. The infobox lookup is tried first and the wikilink lookup
    is used when the infobox yields nothing.

    @param page: page whose text is examined
    @rtype: str
    @return: title of the district page, or '_Unknown_' if none is found
    """
    cleaned = removeDisabledParts(page.text, site=page.site)
    parsed = mwparserfromhell.parse(cleaned, skip_style_tags=True)
    lead = parsed.get_sections(include_lead=True)[0]
    found = (
        _get_district_from_infobox(lead, page.site)
        or _get_district_from_links(lead, page.site)
    )
    if found:
        return found.title()
    return '_Unknown_'
def _get_district_from_infobox(wikicode, site):
    """
    Helper function for get_district: read the district from an infobox.

    The first template that is a key of CONFIG and has a non-empty value
    for its configured parameter supplies the raw value. A raw value
    containing '[[' is parsed and searched for district/okrug links;
    any other value is used directly as a page title.

    @param wikicode: parsed wikitext to search for templates
    @param site: site used to build page objects
    @return: the district page, or a falsy value if none was found
    """
    raw = None
    for tpl in wikicode.ifilter_templates():
        key = pywikibot.Page(site, str(tpl.name), ns=10)
        if key not in CONFIG:
            continue
        param = CONFIG[key]
        if tpl.has(param, ignore_empty=True):
            raw = str(tpl.get(param).value)
            break
    if not raw:
        # Nothing usable was found; hand back the falsy value unchanged.
        return raw
    if '[[' not in raw:
        return pywikibot.Page(site, raw)
    linked = mwparserfromhell.parse(raw, skip_style_tags=True)
    return _get_district_from_links(linked, site)
def _get_district_from_links(wikicode, site):
    """
    Helper function for get_district: pick a district from wikilinks.

    Every wikilink is inspected. The first link whose page title contains
    'district' (case-insensitive) is preferred; failing that, the first
    link whose title contains 'okrug' is used.

    @param wikicode: parsed wikitext to search for wikilinks
    @param site: site used to build page objects
    @return: the matching page, or None if no link matches
    """
    first_district = None
    first_okrug = None
    for link in wikicode.ifilter_wikilinks():
        candidate = pywikibot.Page(site, str(link.title))
        lowered = candidate.title().lower()
        if 'district' in lowered:
            if not first_district:
                first_district = candidate
        elif 'okrug' in lowered:
            if not first_okrug:
                first_okrug = candidate
    return first_district or first_okrug
def main(*args):
    """
    Process command line arguments and invoke bot.

    Pages from the generator options are grouped by the district returned
    by get_district() and written as wikitext to 'rus_loc.txt' in the
    current directory. Pages with 'list' in their title are skipped.

    @param args: command line arguments
    @type args: list of unicode
    """
    local_args = pywikibot.handle_args(args)
    site = pywikibot.Site()
    site.login()
    gen_factory = GeneratorFactory(site)
    for arg in local_args:
        gen_factory.handleArg(arg)

    # Iterate over a copy: CONFIG gains page-object keys inside the loop.
    for infobox, param in copy.copy(CONFIG).items():
        for page in get_template_pages([infobox], site):
            CONFIG[page] = param

    gen = gen_factory.getCombinedGenerator()
    if gen is None:
        # No generator option was supplied; there is nothing to iterate.
        pywikibot.bot.suggest_help(missing_generator=True)
        return

    storage = {}
    for page in gen:
        if 'list' in page.title().lower():
            continue
        storage.setdefault(get_district(page), set()).add(page)

    parts = []
    for district in sorted(storage):
        parts.append(
            '== {0} ==\nRural localities in [[{0}]]:\n'.format(district)
        )
        parts.append('{{div col|colwidth=15em}}\n')
        for locality in sorted(storage[district]):
            title = locality.title()
            if ',' in title or '(' in title:
                # Link with an empty label after the pipe for titles that
                # carry a comma or parenthetical part.
                parts.append('* [[{}|]]\n'.format(title))
            else:
                parts.append('* {}\n'.format(locality.title(as_link=True)))
        parts.append('{{div col end}}\n\n')

    # The context manager closes the file even if the write fails.
    with open('rus_loc.txt', 'w', encoding='utf8') as out_file:
        out_file.write(''.join(parts))
# Run the script only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment