Last active
December 27, 2021 16:52
-
-
Save aucchen/1ec479eb10a08370ab1a6d40b3eaa8f8 to your computer and use it in GitHub Desktop.
Extract data from ifcomp/ifdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracts the current ifcomp games, and tries to link them to ifdb entries.
import datetime
import time
from bs4 import BeautifulSoup
import urllib.request
import ifdb

new_url = 'https://ifcomp.org/ballot?alphabetize=1'
# Ratings posted after this date are excluded (competition deadline).
end_date = datetime.datetime(2021, 11, 15)

with urllib.request.urlopen(new_url) as fp:
    data = fp.read()
html = data.decode("utf8")
soup = BeautifulSoup(html, 'html.parser')

# One <div class="well"> per game on the ballot page.
rows = soup.find_all('div', attrs={'class': 'well'})

# Playtime phrase -> minutes.  Ordered longest-phrase-first: the old
# if/elif chain tested 'two hours' before 'longer than two hours', so
# "longer than two hours" games were mis-bucketed as 120 minutes.
PLAYTIMES = [
    ('longer than two hours', 150),
    ('two hours', 120),
    ('an hour and a half', 90),
    ('one hour', 60),
    ('half an hour', 30),
    ('15 minutes or less', 15),
]

games = []
for i, row in enumerate(rows):
    print(i)
    # Wells without an <h2> are not game entries.
    if len(row.find_all('h2')) == 0:
        continue
    current_game = {}
    title = row.find('h2').text.strip().split('\n')[0]
    current_game['title'] = title
    current_game['is_parser'] = int(row['ifcomp-style'] == 'parser')
    playtime = row['ifcomp-playtime']
    current_game['time'] = 0  # 0 = unknown playtime
    for phrase, minutes in PLAYTIMES:
        if phrase in playtime:
            current_game['time'] = minutes
            break
    try:
        ifdb_id = ifdb.find_ifdb_id(title)
        time.sleep(0.5)  # be polite to ifdb.org between requests
        current_game['ifdb_id'] = ifdb_id
        print(ifdb_id)
        rating, count = ifdb.get_ratings(ifdb_id, end_date)
        time.sleep(0.5)
        current_game['ifdb_rating'] = rating
        current_game['ifdb_rating_count'] = count
    except Exception as e:
        # Best-effort: keep the game with placeholder IFDB data instead of
        # aborting the whole scrape (was a bare `except:`, which also
        # swallowed KeyboardInterrupt).
        print('IFDB lookup failed for {0}: {1}'.format(title, e))
        current_game['ifdb_id'] = '???'
        current_game['ifdb_rating'] = 0
        current_game['ifdb_rating_count'] = 0
    print(current_game)
    games.append(current_game)

import pandas as pd
df = pd.DataFrame(games)
df.to_csv('data_2021.tsv', sep='\t', index=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracts old ifcomp results (replace 2020 with the relevant year, and change the end_date)
import datetime
import time
from bs4 import BeautifulSoup
import urllib.request
import ifdb

old_url = 'https://ifcomp.org/comp/2020'
end_date = datetime.datetime(2020, 11, 30)

with urllib.request.urlopen(old_url) as fp:
    data = fp.read()
html = data.decode("utf8")
soup = BeautifulSoup(html, 'html.parser')
rows = soup.find_all('div', attrs={'class': 'row'})

# Playtime phrase -> minutes, longest phrase first so that a longer
# phrase is never shadowed by a substring of it.
PLAYTIMES = [
    ('Longer than two hours', 150),
    ('An hour and a half', 90),
    ('Two hours', 120),
    ('One hour', 60),
    ('Half an hour', 30),
    ('15 minutes or less', 15),
]

# The results page repeats three <div class="row"> blocks per game:
# title, info (scores / style / ifdb link), rank.  `mode` tracks which
# of the three the current row is.
mode = 'title'
games = []
current_game = {}
for i, row in enumerate(rows):
    if mode == 'title':
        mode = 'info'
        title = row.find_all('h2')[1].text.strip().split('\n')[0]
        current_game['title'] = title
    elif mode == 'info':
        mode = 'rank'
        entries = row.find_all('td')
        current_game['score'] = float(entries[0].text.strip())
        current_game['rating_count'] = int(entries[1].text.strip())
        current_game['stdev'] = float(entries[2].text.strip())
        current_game['is_parser'] = int('Parser' in row.text)
        current_game['time'] = 0  # 0 = unknown playtime
        for phrase, minutes in PLAYTIMES:
            if phrase in row.text:
                current_game['time'] = minutes
                break
        # Defaults so every game has the ifdb columns even when no ifdb
        # link is present or the lookup fails (matches the 2021 extractor).
        current_game['ifdb_id'] = '???'
        current_game['ifdb_rating'] = 0
        current_game['ifdb_rating_count'] = 0
        current_game['system'] = None
        for ref in row.find_all('a'):
            if 'ifdb' in ref['href']:
                ifdb_id = ref['href'].split('=')[1]
                current_game['ifdb_id'] = ifdb_id
                print(ifdb_id)
                try:
                    rating, count = ifdb.get_ratings(ifdb_id, end_date)
                    time.sleep(0.5)  # be polite to ifdb.org
                    current_game['ifdb_rating'] = rating
                    current_game['ifdb_rating_count'] = count
                    current_game['system'] = ifdb.get_system(ifdb_id)
                except Exception as e:
                    # Keep the defaults rather than aborting the whole scrape
                    # on one failed lookup.
                    print('IFDB lookup failed for {0}: {1}'.format(ifdb_id, e))
                break
    elif mode == 'rank':
        print(current_game)
        mode = 'title'
        games.append(current_game)
        current_game = {}

import pandas as pd
df = pd.DataFrame(games)
df.to_csv('data_2020.tsv', sep='\t', index=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adds an 'ifdb_rank' column to data_2021.tsv, ranking games by their
# position in the IFDB "IFComp 2021" tag search results.
import pandas
import ifdb

table = pandas.read_csv('data_2021.tsv', sep='\t', index_col=None)

# Rank is the 1-based position in the tag search results.
ranked_titles = ifdb.get_rankings('IFComp 2021')
rank_by_title = {title: pos for pos, title in enumerate(ranked_titles, start=1)}
# The ballot title differs from the IFDB title for this one game.
rank_by_title['Codex Sadistica'] = rank_by_title['Codex Sadistica: A Heavy-Metal Minigame']

updated_rows = []
for _, record in table.iterrows():
    record = record.copy()
    record['ifdb_rank'] = rank_by_title[record['title']]
    updated_rows.append(record)

df = pandas.DataFrame(updated_rows)
df.to_csv('data_2021.tsv', sep='\t', index=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
from bs4 import BeautifulSoup | |
import urllib.request | |
def find_ifdb_id(game_name):
    """
    Returns the ifdb id (TUID) corresponding to a game name.

    Searches ifdb.org for the name.  If the search lands directly on a
    game page, the TUID is read from the page notes; otherwise the first
    search result's link is used.  Returns None when neither is found.
    """
    import urllib.parse
    # Properly escape the whole query: the old manual
    # replace(' ', '+') / replace('/', '%2F') broke on titles containing
    # '&', '?', '#' or '+'.  quote_plus also maps spaces to '+'.
    query = urllib.parse.quote_plus(game_name)
    url = 'https://ifdb.org/search?searchbar={0}'.format(query)
    print(url)
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')
    if 'TUID' in soup.text:
        # Single exact match: ifdb redirected straight to the game page,
        # whose notes span contains a line like "TUID: <id>".
        spans = soup.find_all('span', attrs={'class': 'notes'})
        for span in spans:
            if 'TUID' in span.text:
                tuid = span.text.split(':')[-1].strip()
                return tuid
    else:
        # Search-results table: take the first linked game, whose href
        # ends with "id=<tuid>".
        url = soup.find('td').find('a')['href']
        return url.split('=')[-1]
def get_ratings(ifdb_id, end_date=None):
    """
    Given an IFDB game id, returns the game's rating and number of ratings
    (as of end_date).

    Returns (mean, count); mean is 0 when there are no countable ratings.
    When end_date is given, only ratings whose parsed date is on or before
    it are counted.  Parses the game's "all ratings" page, which lists
    full reviews as <p> blocks and bare star ratings as <img> + date <span>
    pairs.
    """
    url = 'https://ifdb.org/viewgame?id={0}&reviews&sortby=&ratings&pg=all'.format(ifdb_id)
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')
    # The first 'indented' div holds the reviews/ratings list.
    indented_div = soup.find_all('div', attrs={'class': 'indented'})[0]
    all_stars = []
    current_stars = 0
    current_date = datetime.datetime(2010, 1, 1)
    for child in indented_div.children:
        if child.name == 'p':
            # A full review: star <img> plus a "Month day, Year" date at
            # the end of the paragraph text.
            image = child.find('img')
            if image:
                # img title starts with the star count, e.g. "4 star ..."
                # -- presumably always a single digit; TODO confirm.
                current_stars = int(image['title'][0])
                if end_date is not None:
                    try:
                        # Date is the last "Month day, Year" chunk of the text.
                        text = ','.join(child.text.split(',')[-2:]).strip()
                        current_date = datetime.datetime.strptime(text, '%B %d, %Y')
                        if current_date > end_date:
                            continue  # posted after the cutoff: don't count
                    except ValueError:
                        # Undated review: can't compare, so skip it when
                        # filtering by date.  (Was a bare `except:`.)
                        continue
                all_stars.append(current_stars)
        elif child.name == 'img':
            # A bare rating (no review): the star image comes first ...
            current_stars = int(child['title'][0])
            if end_date is None:
                all_stars.append(current_stars)
        elif child.name == 'span' and end_date is not None:
            # ... followed by a <span> with its date; count the pending
            # rating only now that we can check it against end_date.
            text = child.text.strip(', ')
            try:
                current_date = datetime.datetime.strptime(text, '%B %d, %Y')
                if current_date > end_date:
                    continue
                all_stars.append(current_stars)
            except ValueError:
                pass  # span wasn't a date; ignore (was a bare `except:`)
    count = len(all_stars)
    mean = 0
    if count > 0:
        mean = float(sum(all_stars)) / count
    return mean, count
def get_rankings(tag="IFComp 2021"):
    """
    Returns a list of games sorted by their rank.

    Queries the ifdb tag search (sorted by its default ranking) and
    collects the bolded game-title links, in page order.
    """
    query = tag.replace(' ', '+')
    url = 'https://ifdb.org/search?searchfor=tag%3A{0}&sortby=&pg=all'.format(query)
    with urllib.request.urlopen(url) as fp:
        page = fp.read().decode("ISO-8859-1")
    soup = BeautifulSoup(page, 'lxml')
    main = soup.find('div', attrs={'class': 'main'})
    titles = []
    # Game titles are the only links rendered in bold in the results list.
    for anchor in main.find_all('a'):
        if anchor.find('b'):
            print(anchor.text)
            titles.append(anchor.text)
    return titles
def get_system(ifdb_id):
    """
    Returns the development system.

    Scans the game page's notes span for the "Development System" label
    and returns the text of the element immediately after it, or None
    when the label is absent.
    """
    url = 'https://ifdb.org/viewgame?id={0}'.format(ifdb_id)
    with urllib.request.urlopen(url) as fp:
        page = fp.read().decode("ISO-8859-1")
    soup = BeautifulSoup(page, 'lxml')
    notes = soup.find('span', attrs={'class': 'notes'})
    # Walk the span's children; the system name is the element that
    # directly follows the label.
    children = iter(notes)
    for el in children:
        if 'Development System' in el:
            try:
                return next(children).text
            except StopIteration:
                break  # label was the last element; no value follows
    return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Updates the IFDB ratings for current ifcomp games.
import datetime
import time
import pandas
import ifdb

filled_data = pandas.read_csv('data_2021.tsv', sep='\t', index_col=None)
new_rows = []
# Ratings posted after this date are excluded (competition deadline).
end_date = datetime.datetime(2021, 11, 15)
for i, row in filled_data.iterrows():
    new_row = row.copy()
    title = row['title']
    print(i, title)
    ifdb_id = row['ifdb_id']
    print(ifdb_id)
    print('old ratings: {0} {1}'.format(row['ifdb_rating'], row['ifdb_rating_count']))
    try:
        rating, count = ifdb.get_ratings(ifdb_id, end_date)
        # Only accept the refresh when it found at least as many ratings
        # as before; a smaller count suggests a partial/failed page fetch.
        if count >= row['ifdb_rating_count']:
            new_row['ifdb_rating'] = rating
            new_row['ifdb_rating_count'] = count
            print('new ratings: {0} {1}'.format(rating, count))
        time.sleep(0.5)  # be polite to ifdb.org between requests
    except Exception as e:
        # Keep the old row but report the failure.  (Was a bare
        # `except: pass`, which hid errors and caught KeyboardInterrupt.)
        print('update failed for {0}: {1}'.format(title, e))
    new_rows.append(new_row)

df = pandas.DataFrame(new_rows)
df.to_csv('data_2021.tsv', sep='\t', index=None)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment