Skip to content

Instantly share code, notes, and snippets.

@Facenapalm
Last active April 6, 2023 03:41
Show Gist options
  • Save Facenapalm/f4798e6ee3f1b73ec8c0fa0309d2e607 to your computer and use it in GitHub Desktop.
Save Facenapalm/f4798e6ee3f1b73ec8c0fa0309d2e607 to your computer and use it in GitHub Desktop.
Working prototype of Liquipedia grabber for Wikidata, sample created item: https://www.wikidata.org/wiki/Q117440245
import re
import time
import requests
import os.path
from datetime import datetime
class LiquipediaLink:
    """Reference to one article on one of the Liquipedia wikis.

    A link is a (project, article) pair. ``project`` is the wiki name used
    as the first URL path component (e.g. ``starcraft2``); ``article`` is
    the page title stored with spaces instead of underscores.
    """

    # Wiki names accepted as the `project` part of a link.
    available_projects = {
        'ageofempires', 'apexlegends', 'arenafps', 'arenaofvalor', 'artifact',
        'autochess', 'battalion', 'battlerite', 'brawlhalla', 'brawlstars',
        'callofduty', 'clashroyale', 'counterstrike', 'criticalops',
        'crossfire', 'dota2', 'fifa', 'fighters', 'fortnite', 'freefire',
        'halo', 'hearthstone', 'heroes', 'leagueoflegends', 'magic',
        'mobilelegends', 'naraka', 'overwatch', 'paladins', 'pokemon', 'pubg',
        'pubgmobile', 'rainbowsix', 'rocketleague', 'runeterra', 'sideswipe',
        'simracing', 'smash', 'splitgate', 'squadrons', 'starcraft',
        'starcraft2', 'teamfortress', 'tft', 'trackmania', 'underlords',
        'valorant', 'warcraft', 'wildrift', 'worldofwarcraft'
    }

    def __init__(self, *args):
        """
        Syntax:
            LiquipediaLink('starcraft2/Bly')
        or
            LiquipediaLink('starcraft2', 'Bly')

        The project name is lower-cased and stripped; underscores in the
        article title are turned into spaces and the title is stripped.

        Raises:
            RuntimeError: wrong number of arguments, a one-argument slug that
                does not consist of exactly two `/`-separated parts, an
                unknown project, or an empty article title.
        """
        if len(args) == 1:
            slug_parts = args[0].split('/')
            if len(slug_parts) != 2:
                raise RuntimeError(f'Wrong slug `{args[0]}`')
            project, article = slug_parts
        elif len(args) == 2:
            project, article = args
        else:
            raise RuntimeError(f'Unsupported LiquipediaLink syntax ({len(args)} arguments given)')

        project = project.lower().strip()
        if project not in self.available_projects:
            raise RuntimeError(f'Unknown project `{project}`')

        article = article.replace('_', ' ').strip()
        if not article:
            # An empty title would otherwise yield a slug such as
            # `starcraft2/`, which does not identify any article.
            raise RuntimeError(f'Empty article title for project `{project}`')

        self.project = project
        self.article = article

    def __repr__(self):
        return f'{type(self).__name__}({self.get_slug()!r})'

    def get_project(self):
        """Return the normalized (lower-case) project name."""
        return self.project

    def get_article(self):
        """Return the article title, with spaces rather than underscores."""
        return self.article

    def get_slug(self):
        """Return `project/Article_title`, with spaces as underscores."""
        return self.project + '/' + self.article.replace(' ', '_')

    def get_url(self):
        """Return the article URL on liquipedia.net (slug is not percent-encoded)."""
        return 'https://liquipedia.net/' + self.get_slug()
class LiquipediaPage:
    """Wikitext of one Liquipedia article plus helpers that read its
    template parameters and format them as QuickStatements commands.

    The wikitext is cached on disk as `<project>/<article>.wiki`; it is
    downloaded through the wiki's `api.php` only when no cached file exists.
    """

    # Country name as written in the `country` parameters -> Wikidata item
    # id. A value of None means "known value, but emit no statement".
    countries_map = {
        'Algeria': 'Q262',
        'Argentina': 'Q414',
        'Australia': 'Q408',
        'Austria': 'Q40',
        'Belarus': 'Q184',
        'Belgium': 'Q31',
        'Bolivia': 'Q750',
        'Bosnia and Herzegovina': 'Q225',
        'Brazil': 'Q155',
        'Bulgaria': 'Q219',
        'Canada': 'Q16',
        'Chile': 'Q298',
        'China': 'Q148',
        'Colombia': 'Q739',
        'Costa Rica': 'Q800',
        'Croatia': 'Q224',
        'Cuba': 'Q241',
        'Czech Republic': 'Q213',
        'Denmark': 'Q35',
        'Egypt': 'Q79',
        'Estonia': 'Q191',
        'Finland': 'Q33',
        'France': 'Q142',
        'Germany': 'Q183',
        'Greece': 'Q41',
        'Honduras': 'Q783',
        'Hong Kong': 'Q8646',
        'Hungary': 'Q28',
        'India': 'Q668',
        'Indonesia': 'Q252',
        'Iran': 'Q794',
        'Ireland': 'Q27',
        'Israel': 'Q801',
        'Italy': 'Q38',
        'Japan': 'Q17',
        'Jordan': 'Q810',
        'Kazakhstan': 'Q232',
        'Korea': 'Q884',
        'Latvia': 'Q211',
        'Lebanon': 'Q822',
        'Lithuania': 'Q37',
        'Luxembourg': 'Q32',
        'Malaysia': 'Q833',
        'Mexico': 'Q96',
        'Mongolia': 'Q711',
        'Morocco': 'Q1028',
        'Netherlands': 'Q29999',
        'New Zealand': 'Q664',
        'Norway': 'Q20',
        'Pakistan': 'Q843',
        'Panama': 'Q804',
        'Peru': 'Q419',
        'Philippines': 'Q928',
        'Poland': 'Q36',
        'Portugal': 'Q45',
        'Romania': 'Q218',
        'Russia': 'Q159',
        'Serbia': 'Q403',
        'Singapore': 'Q334',
        'Slovakia': 'Q214',
        'Slovenia': 'Q215',
        'South Africa': 'Q258',
        'South Korea': 'Q884',
        'Spain': 'Q29',
        'Sweden': 'Q34',
        'Switzerland': 'Q39',
        'Taiwan': 'Q865',
        'Thailand': 'Q869',
        'Tunisia': 'Q948',
        'Turkey': 'Q43',
        'Ukraine': 'Q212',
        'United Kingdom': 'Q145',
        'United States': 'Q30',
        'USA': 'Q30',
        'Uruguay': 'Q77',
        'Venezuela': 'Q717',
        'Vietnam': 'Q881',
        'Non-representing': None,
        'xx': None,
        '': None,
    }

    # Wikidata country item id -> adjective used in the item description.
    # Countries missing here get the plain description `progamer`.
    country_adjectives_map = {
        'Q408': 'Australian',
        'Q31': 'Belgian',
        'Q155': 'Brazilian',
        'Q219': 'Bulgarian',
        'Q16': 'Canadian',
        'Q148': 'Chinese',
        'Q213': 'Czech',
        'Q35': 'Danish',
        'Q33': 'Finnish',
        'Q142': 'French',
        'Q183': 'German',
        'Q38': 'Italian',
        'Q17': 'Japanese',
        'Q232': 'Kazakh',
        'Q96': 'Mexican',
        'Q29999': 'Dutch',
        'Q664': 'New Zealand',
        'Q20': 'Norwegian',
        'Q419': 'Peruvian',
        'Q36': 'Polish',
        'Q159': 'Russian',
        'Q334': 'Singaporean',
        'Q884': 'Korean',
        'Q29': 'Spanish',
        'Q34': 'Swedish',
        'Q39': 'Swiss',
        'Q865': 'Taiwanese',
        'Q869': 'Thai',
        'Q212': 'Ukrainian',
        'Q145': 'British',
        'Q30': 'American',
        'Q881': 'Vietnamese',
    }

    def __init__(self, page_link):
        """Load the wikitext of `page_link` from the disk cache or the API.

        Args:
            page_link: a LiquipediaLink, or a string accepted by the
                LiquipediaLink constructor (e.g. 'starcraft2/Bly').

        Sets `self.page`, `self.content` (the wikitext) and
        `self.retrieve_date` (a `+YYYY-MM-DDT00:00:00Z/11` string built from
        the cache file's modification time or the current UTC time).

        Raises:
            RuntimeError: the HTTP request failed or the API returned no
                revisions for the page.
        """
        if isinstance(page_link, str):
            page_link = LiquipediaLink(page_link)
        project = page_link.get_project()
        article = page_link.get_article()
        slug = page_link.get_slug()
        self.page = page_link

        filename = f'{project}/{article}.wiki'
        if os.path.isfile(filename):
            print(f'{slug}: cached page used')
            with open(filename, encoding='utf-8') as cached_page:
                self.content = cached_page.read()
            # The cache file's mtime stands in for the retrieval moment.
            retrieved = time.gmtime(os.path.getmtime(filename))
        else:
            self.content = self._download(project, article, slug)
            retrieved = time.gmtime()
            print(f'{slug}: downloaded')
            os.makedirs(project, exist_ok=True)
            with open(filename, 'w', encoding='utf-8') as cached_page:
                cached_page.write(self.content)

        self.retrieve_date = (
            f'+{retrieved.tm_year}-{retrieved.tm_mon:02d}-{retrieved.tm_mday:02d}'
            'T00:00:00Z/11'
        )

    @staticmethod
    def _download(project, article, slug):
        """Fetch the current wikitext of `article` from the project's api.php."""
        # https://liquipedia.net/starcraft2/api.php?action=query&prop=revisions&titles=Abbadon&rvslots=*&rvprop=content&format=json&formatversion=2
        params = [
            ('action', 'query'),
            ('prop', 'revisions'),
            ('titles', article),
            ('rvslots', '*'),
            ('rvprop', 'content'),
            ('format', 'json'),
            ('formatversion', '2'),
        ]
        headers = {
            'User-Agent': 'Wikidata bot/0.1',
        }
        # Fixed pause before every request; presumably rate limiting.
        time.sleep(2)
        response = requests.get(
            f'https://liquipedia.net/{project}/api.php',
            params=params,
            headers=headers,
            timeout=60,  # do not hang forever on a stalled connection
        )
        if not response:
            raise RuntimeError(f'ALARM: HTTP {response.status_code} while fetching {slug}')
        page_info = response.json()['query']['pages'][0]
        if 'revisions' not in page_info:
            raise RuntimeError(f'404: no revisions returned for {slug}')
        return page_info['revisions'][0]['slots']['main']['content']

    def get_interwiki_links(self):
        """Return LiquipediaLink objects for the `project=Article` arguments
        of the first `{{Liquipedia links|...}}` template, or [] if absent.

        Arguments without `=` or with an empty value are skipped. An unknown
        project name makes LiquipediaLink raise RuntimeError.
        """
        template = re.search(r'\{\{\s*[Ll]iquipedia links\s*\|([^\{\}]+)', self.content)
        if not template:
            return []
        result = []
        for link in template.group(1).split('|'):
            # Split on the first `=` only so titles containing `=` or `/`
            # are passed through intact.
            project, separator, article = link.partition('=')
            if not separator or not article.strip():
                continue
            result.append(LiquipediaLink(project, article))
        return result

    def get_param(self, param, pattern=r'.*?'):
        """Return the value of template parameter `param`, or None.

        Args:
            param: parameter name; inserted into the regex unescaped.
            pattern: regex the value must match. The value has to be
                followed (after optional whitespace) by `|` or `}`.

        Raises:
            RuntimeError: the parameter matches more than once in the page.
        """
        matcher = re.compile(rf'\|\s*{param}\s*=\s*({pattern})\s*[\|\}}]')
        matches = matcher.findall(self.content)
        if len(matches) > 1:
            raise RuntimeError(f'several {param} parameters found at {self.page.get_slug()}')
        return matches[0] if matches else None

    def is_player(self):
        """Return True unless a non-empty lower-case `role` parameter is set."""
        return not self.get_param('role', r'[a-z]*')

    def get_birth_date(self):
        """Return `birth_date` as `+YYYY-MM-DDT00:00:00Z/11`, or None when the
        parameter is missing or is not in YYYY-MM-DD form."""
        yyyymmdd = self.get_param('birth_date', r'\d{4}-\d{2}-\d{2}')
        if yyyymmdd is None:
            return None
        return f'+{yyyymmdd}T00:00:00Z/11'

    def get_citizenship(self):
        """Return the Wikidata ids for `country`, `country2`, `country3`, ...

        Reading stops at the first numbered parameter that is not found.
        Names mapped to None in `countries_map` are skipped.

        Raises:
            RuntimeError: a country name is not in `countries_map`.
        """
        result = []
        i = 1
        while True:
            param_name = f'country{i}' if i > 1 else 'country'
            country_name = self.get_param(param_name, r'[A-Za-z ]*?')
            if country_name is None:
                # parameter not found - no more countries set
                return result
            if country_name not in self.countries_map:
                raise RuntimeError(f'Unknown country name `{country_name}`')
            country_id = self.countries_map[country_name]
            if country_id:
                result.append(country_id)
            i += 1

    def format_quickstatements_source(self):
        """Return the tab-separated source suffix appended to statements:
        S248 (fixed item), S10918 (this page's slug) and S813 (retrieve date)."""
        return f'S248\tQ105835728\tS10918\t"{self.page.get_slug()}"\tS813\t{self.retrieve_date}'

    def format_quickstatements_item(self):
        """Return QuickStatements commands creating an item for this page.

        Returns None when neither `romanized_name` nor `name` is present, or
        when no citizenship could be determined. Otherwise returns the
        commands joined with newlines.
        """
        source = self.format_quickstatements_source()

        # An empty `|id=` must not become an empty label: fall back to the title.
        nickname = self.get_param('id') or self.page.get_article()

        real_name = self.get_param('romanized_name')
        if not real_name:
            # Missing or left empty: use `name` when that parameter exists.
            fallback_name = self.get_param('name')
            if fallback_name is not None:
                real_name = fallback_name
        if real_name is None:
            return None

        countries = self.get_citizenship()
        if not countries:
            return None

        if len(countries) == 1 and countries[0] in self.country_adjectives_map:
            description = f'{self.country_adjectives_map[countries[0]]} progamer'
        else:
            description = 'progamer'

        birth_date = self.get_birth_date()

        lines = []

        def add_optional_value(prop, value):
            # Emit a sourced string statement only for a non-empty value.
            if value:
                lines.append(f'LAST\t{prop}\t"{value}"\t{source}')

        # Entity
        lines.append('CREATE')
        lines.append(f'LAST\tLen\t"{nickname}"')
        lines.append(f'LAST\tDen\t"{description}"')
        if real_name:
            lines.append(f'LAST\tAen\t"{real_name}"')

        # Statements
        lines.append('LAST\tP31\tQ5')  # instance of = human
        if "Category:Female " in self.content:
            lines.append('LAST\tP21\tQ6581072')
        lines.append('LAST\tP106\tQ4379701')  # occupation = professional gamer
        lines.append('LAST\tP641\tQ300920')  # sport = esports
        lines.append(f'LAST\tP742\t"{nickname}"\t{source}')
        if birth_date is not None:
            lines.append(f'LAST\tP569\t{birth_date}\t{source}')
        for country in countries:
            lines.append(f'LAST\tP27\t{country}\t{source}')

        # Identifiers
        lines.append(f'LAST\tP10918\t"{self.page.get_slug()}"')
        for interwiki in self.get_interwiki_links():
            lines.append(f'LAST\tP10918\t"{interwiki.get_slug()}"')
        add_optional_value('P2002', self.get_param('twitter', r'\S*?'))
        add_optional_value('P2003', self.get_param('instagram', r'\S*?'))
        add_optional_value('P2013', self.get_param('facebook', r'\S*?'))
        add_optional_value('P5797', self.get_param('twitch', r'\S*?'))
        add_optional_value('P11706', self.get_param('aligulac', r'\d+'))

        return '\n'.join(lines)
def main():
    """Convert the pages listed in `in.txt` into QuickStatements in `out.txt`.

    `in.txt` holds one page reference per line, in any form accepted by
    LiquipediaPage (e.g. `starcraft2/Bly`); blank lines are ignored. Pages
    for which `is_player()` is false, or for which no item can be
    formatted, are skipped. Each kept item is followed by an empty line.
    """
    items = []
    with open('in.txt', encoding='utf-8') as page_list:
        for line in page_list:
            if not line.strip():
                # A blank (e.g. trailing) line is not a page reference.
                continue
            page = LiquipediaPage(line)
            if not page.is_player():
                continue
            item = page.format_quickstatements_item()
            # Keep only items containing P11706, the property emitted for
            # the `aligulac` parameter.
            if item is not None and "P11706" in item:
                items.append(item + "\n\n")
    with open('out.txt', 'w', encoding='utf-8') as output:
        output.write(''.join(items))
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment