Skip to content

Instantly share code, notes, and snippets.

@aucchen
Last active December 27, 2021 16:52
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aucchen/1ec479eb10a08370ab1a6d40b3eaa8f8 to your computer and use it in GitHub Desktop.
Save aucchen/1ec479eb10a08370ab1a6d40b3eaa8f8 to your computer and use it in GitHub Desktop.
Extract data from ifcomp/ifdb
# Extracts the current ifcomp games, and tries to link them to ifdb entries.
import datetime
import time
from bs4 import BeautifulSoup
import urllib.request
import ifdb
# Scrape the current IFComp ballot, link every entry to IFDB on a best-effort
# basis, and write one row per game to data_2021.tsv.
new_url = 'https://ifcomp.org/ballot?alphabetize=1'
# Passed to ifdb.get_ratings as the cut-off date for ratings.
end_date = datetime.datetime(2021, 11, 15)

# (phrase, minutes) pairs, tried in order with a substring match.
# 'longer than two hours' must come before 'two hours': the longer phrase
# contains the shorter one, so the opposite order can never produce 150.
playtime_minutes = (
    ('longer than two hours', 150),
    ('two hours', 120),
    ('an hour and a half', 90),
    ('one hour', 60),
    ('half an hour', 30),
    ('15 minutes or less', 15),
)

with urllib.request.urlopen(new_url) as fp:
    data = fp.read()
    html = data.decode("utf8")
soup = BeautifulSoup(html, 'html.parser')
rows = soup.find_all('div', attrs={'class': 'well'})

games = []
for i, row in enumerate(rows):
    print(i)
    heading = row.find('h2')
    if heading is None:
        # 'well' blocks without an <h2> are skipped.
        continue
    title = heading.text.strip().split('\n')[0]
    current_game = {'title': title}
    current_game['is_parser'] = int(row['ifcomp-style'] == 'parser')
    playtime = row['ifcomp-playtime']
    # 0 marks a playtime that matches none of the known phrases.
    current_game['time'] = next(
        (minutes for phrase, minutes in playtime_minutes
         if phrase in playtime),
        0,
    )
    try:
        ifdb_id = ifdb.find_ifdb_id(title)
        time.sleep(0.5)  # pause between consecutive IFDB requests
        current_game['ifdb_id'] = ifdb_id
        print(ifdb_id)
        rating, count = ifdb.get_ratings(ifdb_id, end_date)
        time.sleep(0.5)
        current_game['ifdb_rating'] = rating
        current_game['ifdb_rating_count'] = count
    except Exception as err:
        # Best effort: keep the game with placeholder values, but say why
        # the lookup failed.  KeyboardInterrupt is deliberately not caught
        # so the run can still be aborted with Ctrl-C.
        print('IFDB lookup failed for {0!r}: {1!r}'.format(title, err))
        current_game['ifdb_id'] = '???'
        current_game['ifdb_rating'] = 0
        current_game['ifdb_rating_count'] = 0
    print(current_game)
    games.append(current_game)

import pandas as pd
df = pd.DataFrame(games)
df.to_csv('data_2021.tsv', sep='\t', index=None)
# Extracts old ifcomp results (replace 2020 with the relevant year, and change the end_date)
import datetime
import time
from bs4 import BeautifulSoup
import urllib.request
import ifdb
# Scrape the results page of a finished competition.  The page's 'row' divs
# are consumed in repeating groups of three (title, info, rank), and one
# record per game is written to data_2020.tsv.
old_url = 'https://ifcomp.org/comp/2020'
end_date = datetime.datetime(2020, 11, 30)

# (label, minutes) pairs, tried in this order with a substring match.
playtime_labels = (
    ('Two hours', 120),
    ('An hour and a half', 90),
    ('One hour', 60),
    ('Longer than two hours', 150),
    ('Half an hour', 30),
    ('15 minutes or less', 15),
)

with urllib.request.urlopen(old_url) as fp:
    html = fp.read().decode("utf8")
soup = BeautifulSoup(html, 'html.parser')

games = []
entry = {}
stage = 'title'
for block in soup.find_all('div', attrs={'class': 'row'}):
    if stage == 'title':
        stage = 'info'
        # The second <h2> of the block is used; its first line is the title.
        heading = block.find_all('h2')[1]
        entry['title'] = heading.text.strip().split('\n')[0]
    elif stage == 'info':
        stage = 'rank'
        cells = block.find_all('td')
        entry['score'] = float(cells[0].text.strip())
        entry['rating_count'] = int(cells[1].text.strip())
        entry['stdev'] = float(cells[2].text.strip())
        block_text = block.text
        entry['is_parser'] = int('Parser' in block_text)
        # 0 marks a block that mentions none of the known playtime labels.
        entry['time'] = next(
            (minutes for label, minutes in playtime_labels
             if label in block_text),
            0,
        )
        # Only the first link whose href mentions 'ifdb' is used.
        for anchor in block.find_all('a'):
            target = anchor['href']
            if 'ifdb' not in target:
                continue
            game_id = target.split('=')[1]
            entry['ifdb_id'] = game_id
            print(game_id)
            rating, count = ifdb.get_ratings(game_id, end_date)
            time.sleep(0.5)
            entry['ifdb_rating'] = rating
            entry['ifdb_rating_count'] = count
            entry['system'] = ifdb.get_system(game_id)
            break
    elif stage == 'rank':
        # Third block of the group: the record is complete, so store it and
        # start a fresh one.
        print(entry)
        stage = 'title'
        games.append(entry)
        entry = {}

import pandas as pd
df = pd.DataFrame(games)
df.to_csv('data_2020.tsv', sep='\t', index=None)
import pandas
import ifdb
# Add an 'ifdb_rank' column to data_2021.tsv, taken from each game's position
# in the list returned by ifdb.get_rankings, and rewrite the file in place.
filled_data = pandas.read_csv('data_2021.tsv', sep='\t', index_col=None)

# 1-based position of every title in the returned list.
game_ranks = {}
for position, name in enumerate(ifdb.get_rankings('IFComp 2021'), start=1):
    game_ranks[name] = position
# Alias the short title to the full title's rank
# (presumably data_2021.tsv stores the short form -- verify).
game_ranks['Codex Sadistica'] = game_ranks['Codex Sadistica: A Heavy-Metal Minigame']

new_rows = []
for _, record in filled_data.iterrows():
    ranked = record.copy()
    # A title missing from the rankings raises KeyError here.
    ranked['ifdb_rank'] = game_ranks[ranked['title']]
    new_rows.append(ranked)

df = pandas.DataFrame(new_rows)
df.to_csv('data_2021.tsv', sep='\t', index=None)
import datetime
from bs4 import BeautifulSoup
import urllib.request
def find_ifdb_id(game_name):
    """
    Returns the ifdb id corresponding to a game name.

    Fetches the IFDB search page for ``game_name`` (the request URL is
    printed).  If the returned page text contains 'TUID', the id is read
    from the first ``span.notes`` element that mentions it; otherwise the id
    is taken from the href of the first link inside the first ``<td>``.

    Returns the id as a string, or None when the page mentions 'TUID' but no
    ``span.notes`` element does.  Network errors, and a page without the
    expected ``<td>``/``<a>`` structure, raise.
    """
    # Imported locally so the function does not depend on extra file-level
    # imports.
    from urllib.parse import quote_plus

    # quote_plus escapes every reserved or non-ASCII character (space -> '+',
    # '/' -> '%2F', '&' -> '%26', ...), so any title yields a valid query.
    url = 'https://ifdb.org/search?searchbar={0}'.format(quote_plus(game_name))
    print(url)
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
        html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')

    if 'TUID' in soup.text:
        for span in soup.find_all('span', attrs={'class': 'notes'}):
            if 'TUID' in span.text:
                # The id is whatever follows the last ':' in the span text.
                return span.text.split(':')[-1].strip()
        return None

    # No 'TUID' on the page: use the first link in the first table cell.
    url = soup.find('td').find('a')['href']
    return url.split('=')[-1]
def get_ratings(ifdb_id, end_date=None):
    """
    Given an IFDB game id, returns the game's rating and number of ratings (as of end_date)

    The ratings are read from the children of the first ``div.indented``
    element of the game's review page.  A star value is the first character
    of an ``<img>`` title attribute, converted to int.

    ifdb_id:  IFDB game id used in the page URL.
    end_date: optional datetime.datetime.  When given, a rating is counted
              only if a date in '%B %d, %Y' format can be parsed for it and
              that date is not later than end_date.  When None, every rating
              is counted.

    Returns ``(mean, count)``; mean is 0 when no rating was counted.
    Raises IndexError if the page has no ``div.indented`` element.
    """
    url = 'https://ifdb.org/viewgame?id={0}&reviews&sortby=&ratings&pg=all'.format(ifdb_id)
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
        html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')
    indented_div = soup.find_all('div', attrs={'class': 'indented'})[0]

    date_format = '%B %d, %Y'
    all_stars = []
    # Star value of the most recent <img>; a following <span> dates it.
    current_stars = 0
    for child in indented_div.children:
        if child.name == 'p':
            # A paragraph carries its own star image and, in the last two
            # comma-separated pieces of its text, its date.
            image = child.find('img')
            if not image:
                continue
            current_stars = int(image['title'][0])
            if end_date is not None:
                text = ','.join(child.text.split(',')[-2:]).strip()
                try:
                    rated_on = datetime.datetime.strptime(text, date_format)
                except ValueError:
                    # No parseable date: the rating cannot be placed before
                    # end_date, so it is not counted.
                    continue
                if rated_on > end_date:
                    continue
            all_stars.append(current_stars)
        elif child.name == 'img':
            current_stars = int(child['title'][0])
            # With a cut-off, counting waits for the <span> holding the date.
            if end_date is None:
                all_stars.append(current_stars)
        elif child.name == 'span' and end_date is not None:
            text = child.text.strip(', ')
            try:
                rated_on = datetime.datetime.strptime(text, date_format)
            except ValueError:
                # Spans whose text is not a date are ignored.
                continue
            if rated_on <= end_date:
                all_stars.append(current_stars)

    count = len(all_stars)
    mean = 0
    if count > 0:
        mean = float(sum(all_stars)) / count
    return mean, count
def get_rankings(tag="IFComp 2021"):
    """
    Returns the game titles listed on the IFDB search page for a tag.

    Spaces in ``tag`` are replaced with '+' to build the search URL.  A title
    is the text of a link inside the page's ``div.main`` that contains a
    ``<b>`` element; titles are printed and returned in page order
    (presumably IFDB's default ordering is by rank -- verify).
    """
    query = tag.replace(' ', '+')
    url = 'https://ifdb.org/search?searchfor=tag%3A{0}&sortby=&pg=all'.format(query)
    with urllib.request.urlopen(url) as fp:
        html = fp.read().decode("ISO-8859-1")
    page = BeautifulSoup(html, 'lxml')
    listing = page.find('div', attrs={'class':'main'})

    titles = []
    for anchor in listing.find_all('a'):
        # Links without a <b> child are not game titles.
        if not anchor.find('b'):
            continue
        name = anchor.text
        print(name)
        titles.append(name)
    return titles
def get_system(ifdb_id):
    """
    Returns the development system.

    Fetches the IFDB page of game ``ifdb_id`` and walks the children of its
    first ``span.notes`` element: the text of the child that follows the one
    containing 'Development System' is returned.

    Returns None when the page has no ``span.notes`` element, or when no
    child following a 'Development System' marker exists.
    """
    url = 'https://ifdb.org/viewgame?id={0}'.format(ifdb_id)
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
        html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')
    notes = soup.find('span', attrs={'class':'notes'})
    if notes is None:
        # Without this guard, iterating over None raises TypeError.
        return None

    take_next = False
    for el in notes:
        if take_next:
            # The child right after the marker holds the system name.
            return el.text
        if 'Development System' in el:
            take_next = True
    return None
# Updates the IFDB ratings for current ifcomp games.
import datetime
import time
import pandas
import ifdb
# Re-fetch the IFDB rating of every game in data_2021.tsv and rewrite the
# file.  A stored rating is replaced only when the fresh rating count is at
# least as large as the stored one.
filled_data = pandas.read_csv('data_2021.tsv', sep='\t', index_col=None)
new_rows = []
# Cut-off date passed to ifdb.get_ratings.
end_date = datetime.datetime(2021, 11, 15)
for i, row in filled_data.iterrows():
    new_row = row.copy()
    title = row['title']
    print(i, title)
    ifdb_id = row['ifdb_id']
    print(ifdb_id)
    print('old ratings: {0} {1}'.format(row['ifdb_rating'], row['ifdb_rating_count']))
    try:
        rating, count = ifdb.get_ratings(ifdb_id, end_date)
        if count >= row['ifdb_rating_count']:
            new_row['ifdb_rating'] = rating
            new_row['ifdb_rating_count'] = count
        print('new ratings: {0} {1}'.format(rating, count))
        time.sleep(0.5)  # pause between consecutive IFDB requests
    except Exception as err:
        # Best effort: the row keeps its old values, but the failure is
        # reported instead of being dropped silently.  KeyboardInterrupt is
        # not caught, so the run can still be aborted with Ctrl-C.
        print('could not refresh ratings for {0!r}: {1!r}'.format(title, err))
    new_rows.append(new_row)
df = pandas.DataFrame(new_rows)
df.to_csv('data_2021.tsv', sep='\t', index=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment