Last active
July 14, 2017 15:43
-
-
Save danmichaelo/266a72a1f959fd9127383e2d4acd1453 to your computer and use it in GitHub Desktop.
DanmicholoBot 8 : import establishment dates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pwb | |
import pywikibot | |
from pywikibot.page import Claim | |
from pywikibot.data import sparql | |
import re | |
import logging | |
import time | |
# Configure logging once for the whole script; `logger` is the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger()

# Source wiki ('no' Wikipedia), its data repository, and a SPARQL client
# bound to that repository.
site = pywikibot.Site('no', 'wikipedia')
repo = site.data_repository()
endpoint = sparql.SparqlQuery(repo=repo)

# Titles of pages already handled by process_page().
title_cache = set()

# Number of items for which process_page() proposed an edit.
edit_counter = 0
def get_item(page, year):
    """
    Return the Wikidata item linked to <page> *if* the item

    (1) is an instance of (a subclass of) one of the classes listed in the
        VALUES clause of the query below (organization, construction, ...) AND
    (2) doesn't have any P580/* claims, i.e. no value for P580 or for a
        property reachable from it through P1647.

    <year> is only used in the log message emitted when a date already
    exists. Returns None when the page has no item, when the SPARQL query
    gives no usable result or no rows, or when a date is already present.
    """
    try:
        item = page.data_item()
    except pywikibot.exceptions.NoPage:
        logger.warning('No wikidata item for page: %s', page.title())
        return None

    item_id = item.getID()
    query = """
SELECT ?date ?cls
WHERE
{
wd:%(item_id)s wdt:P31/wdt:P279* ?cls . # organization
OPTIONAL {
wd:%(item_id)s ?prop ?date .
?prop ^wikibase:directClaim/wdt:P1647* wd:P580 .
}
VALUES ?cls {
wd:Q43229 # organization
wd:Q811430 # construction
wd:Q2065736 # kulturminne
wd:Q1261026 # trykksak
wd:Q732577 # publikasjon
}
}
""" % {'item_id': item_id}

    rows = endpoint.select(query)

    # NOTE(review): pywikibot's SparqlQuery.select() is documented to return
    # None when the endpoint response has no results section; calling len()
    # on that would raise TypeError, so skip the page instead of crashing.
    if rows is None:
        logger.warning('SPARQL query returned no result for page: %s', page.title())
        return None

    # No rows: the item is not an instance of any of the accepted classes.
    if not rows:
        logger.info('Ignoring page: %s', page.title())
        return None

    # Only the first row is inspected, as in the original implementation.
    existing_date = rows[0]['date']
    if existing_date is not None:
        logger.info('Already a statement at %s: %s. Our year: %d', page.title(), existing_date, year)
        return None

    return item
def process_page(page, year):
    """
    Check if we should add <year> as establishment date for the item
    linked to <page>.

    Every page title is handled at most once: titles are recorded in the
    module-level title_cache and repeated calls for the same title return
    immediately. When get_item() returns an item, the proposed edit is
    logged and the module-level edit_counter is incremented.
    """
    # The docstring must come before this statement, otherwise Python does
    # not treat the string as the function's docstring.
    global edit_counter

    title = page.title()
    if title in title_cache:
        return
    title_cache.add(title)

    item = get_item(page, year)
    if item is None:
        return

    logger.info(' -> Add %d to %s (%s)?', year, title, item.getID())
    edit_counter += 1
    logger.info('Current: %d', edit_counter)
    # Dry run: the actual write is disabled. Re-enable the two lines below
    # to add the claim through process_item().
    # time.sleep(1)
    # process_item(item, year)
def process_item(item, year):
    """
    Add <year> as establishment date (P571) for <item>, sourced with an
    "imported from" (P143) reference pointing at Q191769.
    """
    # Main statement: P571 with a year-only time value.
    inception = Claim(repo, 'P571')
    inception.setTarget(pywikibot.WbTime(year=year))
    item.addClaim(inception)

    # Reference attached to the statement after it has been added.
    imported_from = Claim(repo, 'P143')  # imported from
    nowiki = pywikibot.ItemPage(repo, 'Q191769')  # nowiki
    imported_from.setTarget(nowiki)
    inception.addSource(imported_from)
def process_category(cat, _visited=None):
    """
    Traverse category tree starting from <cat>.

    The year is parsed from the end of the category title
    ("etableringer|etablert|grunnlagt i <digits>"). Page members that are
    not redirects are passed to process_page() with that year; if the title
    carries no year, a warning is logged for each such page instead.
    Subcategories are followed only when their title contains one of the
    three keywords.

    <_visited> is an internal set of category titles already traversed; it
    prevents infinite recursion on category cycles and repeated work when a
    category is reachable through several parents. Callers normally omit it.
    """
    if _visited is None:
        _visited = set()

    cat_title = cat.title()
    if cat_title in _visited:
        return
    _visited.add(cat_title)

    # logger.info(cat.title())
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    year_match = re.search(r'(?:etableringer|etablert|grunnlagt) i (\d+)$', cat_title, re.I)
    year = int(year_match.group(1)) if year_match else None

    for member in cat.members():
        # Check for Category first, preserving the original order of the tests.
        if isinstance(member, pywikibot.page.Category):
            if re.search('(etableringer|etablert|grunnlagt)', member.title(), re.I) is None:
                logger.warning('Will not follow subcategory: %s', member.title())
            else:
                process_category(member, _visited)
        elif isinstance(member, pywikibot.page.Page) and not member.isRedirectPage():
            if year is None:
                logger.warning('Did not match: %s', cat_title)
            else:
                process_page(member, year)
# Script entry: walk the tree rooted at the "companies by year of
# establishment" category.
ROOT_CATEGORY_TITLE = 'Kategori:Selskaper etter etableringsår'
cat = pywikibot.Category(site, ROOT_CATEGORY_TITLE)
process_category(cat)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment