Last active
April 6, 2023 03:41
-
-
Save Facenapalm/f4798e6ee3f1b73ec8c0fa0309d2e607 to your computer and use it in GitHub Desktop.
Working prototype of a Liquipedia grabber for Wikidata. Sample created item: https://www.wikidata.org/wiki/Q117440245
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path
import re
import time
from datetime import datetime, timezone

import requests
class LiquipediaLink:
    """Reference to one article inside one Liquipedia project (wiki).

    The project name is normalised to lower case and must be one of
    ``available_projects``; the article title is stored with spaces
    (underscores are converted) and surrounding whitespace removed.
    """

    # Project names accepted as the first path component of a slug.
    available_projects = {
        'ageofempires', 'apexlegends', 'arenafps', 'arenaofvalor', 'artifact',
        'autochess', 'battalion', 'battlerite', 'brawlhalla', 'brawlstars',
        'callofduty', 'clashroyale', 'counterstrike', 'criticalops',
        'crossfire', 'dota2', 'fifa', 'fighters', 'fortnite', 'freefire',
        'halo', 'hearthstone', 'heroes', 'leagueoflegends', 'magic',
        'mobilelegends', 'naraka', 'overwatch', 'paladins', 'pokemon', 'pubg',
        'pubgmobile', 'rainbowsix', 'rocketleague', 'runeterra', 'sideswipe',
        'simracing', 'smash', 'splitgate', 'squadrons', 'starcraft',
        'starcraft2', 'teamfortress', 'tft', 'trackmania', 'underlords',
        'valorant', 'warcraft', 'wildrift', 'worldofwarcraft'
    }

    def __init__(self, *args):
        """
        Build a link either from a single slug or from two separate parts:

            LiquipediaLink('starcraft2/Bly')
            LiquipediaLink('starcraft2', 'Bly')

        Raises RuntimeError when the argument count is not 1 or 2, when a
        single-argument slug does not consist of exactly two '/'-separated
        parts, or when the project is not in ``available_projects``.
        """
        argc = len(args)
        if argc not in (1, 2):
            raise RuntimeError(f'Unsupported LiquipediaLink syntax ({argc} arguments given)')

        if argc == 2:
            raw_project, raw_article = args
        else:
            pieces = args[0].split('/')
            if len(pieces) != 2:
                raise RuntimeError(f'Wrong slug `{args[0]}`')
            raw_project, raw_article = pieces

        normalized = raw_project.lower().strip()
        if normalized not in self.available_projects:
            raise RuntimeError(f'Unknown project `{normalized}`')

        self.project = normalized
        self.article = raw_article.replace('_', ' ').strip()

    def get_project(self):
        """Return the normalised (lower-case) project name."""
        return self.project

    def get_article(self):
        """Return the article title with spaces instead of underscores."""
        return self.article

    def get_slug(self):
        """Return ``project/Article_title`` with spaces turned into underscores."""
        underscored = self.article.replace(' ', '_')
        return f'{self.project}/{underscored}'

    def get_url(self):
        """Return the full liquipedia.net URL built from the slug."""
        return f'https://liquipedia.net/{self.get_slug()}'
class LiquipediaPage:
    """Wikitext of one Liquipedia article plus helpers to export it as QuickStatements.

    The wikitext is read from a local cache file ``<project>/<article>.wiki``
    when that file exists; otherwise it is downloaded through the project's
    ``api.php`` and written to the cache.

    Attributes:
        page: the LiquipediaLink this page was loaded from.
        content: raw wikitext of the article.
        retrieve_date: retrieval date as a QuickStatements time value
            (``+YYYY-MM-DDT00:00:00Z/11``).
    """

    # Seconds to wait for the Liquipedia API before giving up.
    REQUEST_TIMEOUT = 30

    # Country name as written in the infobox -> Wikidata item id.
    # A value of None means "recognised, but produces no citizenship statement".
    countries_map = {
        'Algeria': 'Q262',
        'Argentina': 'Q414',
        'Australia': 'Q408',
        'Austria': 'Q40',
        'Belarus': 'Q184',
        'Belgium': 'Q31',
        'Bolivia': 'Q750',
        'Bosnia and Herzegovina': 'Q225',
        'Brazil': 'Q155',
        'Bulgaria': 'Q219',
        'Canada': 'Q16',
        'Chile': 'Q298',
        'China': 'Q148',
        'Colombia': 'Q739',
        'Costa Rica': 'Q800',
        'Croatia': 'Q224',
        'Cuba': 'Q241',
        'Czech Republic': 'Q213',
        'Denmark': 'Q35',
        'Egypt': 'Q79',
        'Estonia': 'Q191',
        'Finland': 'Q33',
        'France': 'Q142',
        'Germany': 'Q183',
        'Greece': 'Q41',
        'Honduras': 'Q783',
        'Hong Kong': 'Q8646',
        'Hungary': 'Q28',
        'India': 'Q668',
        'Indonesia': 'Q252',
        'Iran': 'Q794',
        'Ireland': 'Q27',
        'Israel': 'Q801',
        'Italy': 'Q38',
        'Japan': 'Q17',
        'Jordan': 'Q810',
        'Kazakhstan': 'Q232',
        'Korea': 'Q884',
        'Latvia': 'Q211',
        'Lebanon': 'Q822',
        'Lithuania': 'Q37',
        'Luxembourg': 'Q32',
        'Malaysia': 'Q833',
        'Mexico': 'Q96',
        'Mongolia': 'Q711',
        'Morocco': 'Q1028',
        'Netherlands': 'Q29999',
        'New Zealand': 'Q664',
        'Norway': 'Q20',
        'Pakistan': 'Q843',
        'Panama': 'Q804',
        'Peru': 'Q419',
        'Philippines': 'Q928',
        'Poland': 'Q36',
        'Portugal': 'Q45',
        'Romania': 'Q218',
        'Russia': 'Q159',
        'Serbia': 'Q403',
        'Singapore': 'Q334',
        'Slovakia': 'Q214',
        'Slovenia': 'Q215',
        'South Africa': 'Q258',
        'South Korea': 'Q884',
        'Spain': 'Q29',
        'Sweden': 'Q34',
        'Switzerland': 'Q39',
        'Taiwan': 'Q865',
        'Thailand': 'Q869',
        'Tunisia': 'Q948',
        'Turkey': 'Q43',
        'Ukraine': 'Q212',
        'United Kingdom': 'Q145',
        'United States': 'Q30',
        'USA': 'Q30',
        'Uruguay': 'Q77',
        'Venezuela': 'Q717',
        'Vietnam': 'Q881',
        'Non-representing': None,
        'xx': None,
        '': None,
    }

    # Wikidata country item id -> adjective used in the English description.
    country_adjectives_map = {
        'Q408': 'Australian',
        'Q31': 'Belgian',
        'Q155': 'Brazilian',
        'Q219': 'Bulgarian',
        'Q16': 'Canadian',
        'Q148': 'Chinese',
        'Q213': 'Czech',
        'Q35': 'Danish',
        'Q33': 'Finnish',
        'Q142': 'French',
        'Q183': 'German',
        'Q38': 'Italian',
        'Q17': 'Japanese',
        'Q232': 'Kazakh',
        'Q96': 'Mexican',
        'Q29999': 'Dutch',
        'Q664': 'New Zealand',
        'Q20': 'Norwegian',
        'Q419': 'Peruvian',
        'Q36': 'Polish',
        'Q159': 'Russian',
        'Q334': 'Singaporean',
        'Q884': 'Korean',
        'Q29': 'Spanish',
        'Q34': 'Swedish',
        'Q39': 'Swiss',
        'Q865': 'Taiwanese',
        'Q869': 'Thai',
        'Q212': 'Ukrainian',
        'Q145': 'British',
        'Q30': 'American',
        'Q881': 'Vietnamese',
    }

    def __init__(self, page_link):
        """Load the article's wikitext from the cache file or from the API.

        Args:
            page_link: a LiquipediaLink, or a slug string such as
                ``'starcraft2/Bly'`` that is converted to one.

        Raises:
            RuntimeError: if the HTTP request fails or the API response
                contains no revisions for the article.
        """
        if isinstance(page_link, str):
            page_link = LiquipediaLink(page_link)
        project = page_link.get_project()
        article = page_link.get_article()
        slug = page_link.get_slug()
        self.page = page_link

        filename = f'{project}/{article}.wiki'
        if os.path.isfile(filename):
            print(f'{slug}: cached page used')
            with open(filename, encoding='utf-8') as cached_page:
                self.content = cached_page.read()
            # The cache file's modification time stands in for the retrieval date.
            retrieve_date = datetime.fromtimestamp(os.path.getmtime(filename), tz=timezone.utc)
        else:
            self.content = self._download_wikitext(project, article, slug)
            retrieve_date = datetime.now(tz=timezone.utc)
            print(f'{slug}: downloaded')
            # exist_ok avoids the check-then-create race of isdir() + mkdir().
            os.makedirs(project, exist_ok=True)
            with open(filename, 'w', encoding='utf-8') as cached_page:
                cached_page.write(self.content)

        # Only the date part is kept; "/11" is the precision suffix of the time value.
        self.retrieve_date = f'+{retrieve_date.year}-{retrieve_date.month:02d}-{retrieve_date.day:02d}T00:00:00Z/11'

    @classmethod
    def _download_wikitext(cls, project, article, slug):
        """Fetch the current wikitext of `article` from the `project` wiki API."""
        # https://liquipedia.net/starcraft2/api.php?action=query&prop=revisions&titles=Abbadon&rvslots=*&rvprop=content&format=json&formatversion=2
        params = [
            ('action', 'query'),
            ('prop', 'revisions'),
            ('titles', article),
            ('rvslots', '*'),
            ('rvprop', 'content'),
            ('format', 'json'),
            ('formatversion', '2'),
        ]
        headers = {
            'User-Agent': 'Wikidata bot/0.1',
        }
        # Pause before every request so consecutive downloads are spaced out.
        time.sleep(2)
        response = requests.get(
            f'https://liquipedia.net/{project}/api.php',
            params=params,
            headers=headers,
            timeout=cls.REQUEST_TIMEOUT,  # without a timeout the call may block forever
        )
        if not response.ok:
            raise RuntimeError(f'{slug}: HTTP request failed with status {response.status_code}')
        page_info = response.json()['query']['pages'][0]
        if 'revisions' not in page_info:
            raise RuntimeError(f'{slug}: page not found (no revisions returned)')
        return page_info['revisions'][0]['slots']['main']['content']

    def get_interwiki_links(self):
        """Return LiquipediaLink objects listed in the ``{{Liquipedia links}}`` template.

        Every ``project=Article`` argument of the template becomes one link.
        Returns an empty list when the template is absent.
        """
        template = re.search(r'\{\{\s*[Ll]iquipedia links\s*\|([^\{\}]+)', self.content)
        if not template:
            return []
        result = []
        for link in template.group(1).split('|'):
            # Split on the first '=' only, so article titles containing '/'
            # or '=' are not mistaken for a malformed slug.
            project, separator, article = link.partition('=')
            if separator:
                result.append(LiquipediaLink(project, article))
        return result

    def get_param(self, param, pattern=r'.*?'):
        """Return the value of template parameter `param`, or None if it is absent.

        Args:
            param: literal parameter name (it is regex-escaped).
            pattern: regular expression the value has to match; a parameter
                whose value does not match is treated as absent. It should
                not contain capturing groups.

        Raises:
            RuntimeError: if the parameter is found more than once.
        """
        matcher = re.compile(rf'\|\s*{re.escape(param)}\s*=\s*({pattern})\s*[\|\}}]')
        matches = matcher.findall(self.content)
        if len(matches) > 1:
            raise RuntimeError(f'several {param} parameters found at {self.page.get_slug()}')
        if not matches:
            return None
        return matches[0]

    def is_player(self):
        """Return True unless the page has a non-empty lower-case ``role`` parameter."""
        return not self.get_param('role', r'[a-z]*')

    def get_birth_date(self):
        """Return ``birth_date`` as a QuickStatements time value, or None.

        Only values of the form YYYY-MM-DD are recognised.
        """
        yyyymmdd = self.get_param('birth_date', r'\d{4}-\d{2}-\d{2}')
        if yyyymmdd is None:
            return None
        return f'+{yyyymmdd}T00:00:00Z/11'

    def get_citizenship(self):
        """Return Wikidata ids for ``country``, ``country2``, ``country3``, ...

        Parameters are read in order until the first one that is missing.
        Names mapped to None in ``countries_map`` are skipped.

        Raises:
            RuntimeError: if a country name is not in ``countries_map``.
        """
        result = []
        i = 1
        while True:
            param_name = f'country{i}' if i > 1 else 'country'
            # The hyphen is allowed so that the mapped value 'Non-representing'
            # is recognised instead of silently ending the lookup.
            country_name = self.get_param(param_name, r'[A-Za-z \-]*?')
            if country_name is None:
                # parameter not found - no more countries set
                return result
            if country_name not in self.countries_map:
                raise RuntimeError(f'Unknown country name `{country_name}`')
            if self.countries_map[country_name]:
                result.append(self.countries_map[country_name])
            i += 1

    def format_quickstatements_source(self):
        """Return the tab-separated source columns appended to sourced statements."""
        return f'S248\tQ105835728\tS10918\t"{self.page.get_slug()}"\tS813\t{self.retrieve_date}'

    def format_quickstatements_item(self):
        """Return QuickStatements commands creating an item for this player.

        Returns None when the page has neither ``romanized_name`` nor ``name``,
        or when no citizenship could be determined.
        """
        source = self.format_quickstatements_source()

        nickname = self.get_param('id')
        if nickname is None:
            nickname = self.page.get_article()

        real_name = self.get_param('romanized_name')
        if real_name is None:
            real_name = self.get_param('name')
        if real_name is None:
            return None

        countries = self.get_citizenship()
        if not countries:
            return None

        if len(countries) == 1 and countries[0] in self.country_adjectives_map:
            description = f'{self.country_adjectives_map[countries[0]]} progamer'
        else:
            description = 'progamer'

        birth_date = self.get_birth_date()

        lines = []

        def add_optional_value(prop, value):
            # Emit a sourced string statement only when the parameter has a value.
            if value:
                lines.append(f'LAST\t{prop}\t"{value}"\t{source}')

        # Entity
        lines.append('CREATE')
        lines.append(f'LAST\tLen\t"{nickname}"')
        lines.append(f'LAST\tDen\t"{description}"')
        if real_name:
            lines.append(f'LAST\tAen\t"{real_name}"')

        # Statements
        lines.append('LAST\tP31\tQ5')  # instance of = human
        if "Category:Female " in self.content:
            lines.append('LAST\tP21\tQ6581072')  # sex or gender = female
        lines.append('LAST\tP106\tQ4379701')  # occupation = professional gamer
        lines.append('LAST\tP641\tQ300920')  # sport = esports
        lines.append(f'LAST\tP742\t"{nickname}"\t{source}')
        if birth_date is not None:
            lines.append(f'LAST\tP569\t{birth_date}\t{source}')
        for country in countries:
            lines.append(f'LAST\tP27\t{country}\t{source}')

        # Identifiers
        lines.append(f'LAST\tP10918\t"{self.page.get_slug()}"')
        for interwiki in self.get_interwiki_links():
            lines.append(f'LAST\tP10918\t"{interwiki.get_slug()}"')
        add_optional_value('P2002', self.get_param('twitter', r'\S*?'))
        add_optional_value('P2003', self.get_param('instagram', r'\S*?'))
        add_optional_value('P2013', self.get_param('facebook', r'\S*?'))
        add_optional_value('P5797', self.get_param('twitch', r'\S*?'))
        add_optional_value('P11706', self.get_param('aligulac', r'\d+'))

        return '\n'.join(lines)
def main():
    """Convert the slugs listed in ``in.txt`` into QuickStatements in ``out.txt``.

    ``in.txt`` holds one Liquipedia slug (``project/Article``) per line; blank
    lines are ignored. Pages that are not players, that yield no item, or
    whose item has no P11706 statement are skipped. Items are written to
    ``out.txt``, each followed by an empty line.
    """
    items = []
    # `with` guarantees the input file is closed even if a page fails to load.
    with open('in.txt', encoding='utf-8') as slugs:
        for line in slugs:
            slug = line.strip()
            if not slug:
                # A blank line is not a slug; skip it instead of failing on it.
                continue
            page = LiquipediaPage(slug)
            if not page.is_player():
                continue
            item = page.format_quickstatements_item()
            if item is not None and 'P11706' in item:
                items.append(item + '\n\n')

    with open('out.txt', 'w', encoding='utf-8') as output:
        output.write(''.join(items))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment