Created
July 27, 2020 23:07
-
-
Save thelink2012/bf226e33bd0802e6a77b031c108870d5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Transforms a Grouvee CSV file (exported from a Grouvee profile) into a | |
CSV file listing the game and the required time to beat it in ascending | |
order. | |
Usage Examples: | |
cat grouvee.csv | ./howlongtobeat-grouvee.py | |
cat grouvee.csv | ./howlongtobeat-grouvee.py "Backlog" "Wish List" | |
""" | |
import datetime | |
import asyncio | |
import sys | |
import csv | |
import json | |
from dataclasses import dataclass | |
from typing import List, Tuple, Optional, Iterable | |
from howlongtobeatpy import HowLongToBeat, HowLongToBeatEntry | |
# Upper bound on the number of HowLongToBeat searches in flight at once.
# Large values make things slow and prone to rate limiting.
HLTB_CRAWLING_CONCURRENCY = 10

# A few games need their Grouvee name remapped before the HowLongToBeat
# search can find them. Keys are Grouvee names, values are the HLTB names.
GROOVE_TO_HLTB_NAME_MAP = {
    # Names that differ only in letter casing.
    "GRIS": "Gris",
    "Superhot": "SUPERHOT",
    "ICO": "Ico",
    "inFamous": "inFAMOUS",
    "Ōkami": "ŌKAMI",
    # Names that differ in punctuation or spacing.
    "Test Drive: Le Mans": "Test Drive Le Mans",
    "Split/Second": "Split / Second",
    # Names that differ in how a number is written.
    "Red Dead Redemption II": "Red Dead Redemption 2",
    "The Walking Dead: Season Two": "The Walking Dead: Season 2",
    # Names that differ in some other way (subtitles, disambiguation, ...).
    "Nelson Tethers: Puzzle Agent": "Puzzle Agent",
    "F.E.A.R.: First Encounter Assault Recon": "F.E.A.R.",
    "Half-Life: Counter-Strike": "Counter-Strike",
    "Driver 2: The Wheelman is Back": "Driver 2",
    "GRID": "Race Driver: Grid",
    "Black": "Black (2006)",
}
@dataclass
class HowLongToBeatGrouveeEntry:
    """Links a HowLongToBeat entry to a Grouvee entry."""

    # Identifier of the game on the Grouvee side.
    # NOTE(review): link_grouvee_and_hltb stores the raw CSV "id" column here,
    # which csv.DictReader yields as a string -- confirm the intended type.
    grouvee_id: int
    # Identifier and display name of the matched HowLongToBeat game.
    hltb_id: int
    hltb_name: str
    # Time to beat in hours, or None when HowLongToBeat has no time data.
    hltb_time: Optional[int]
def extract_game_time(hltb_entry: "HowLongToBeatEntry") -> Optional[int]:
    """Extracts the HowLongToBeat time (in hours) from a crawled HLTB entry.

    The first available time is used, in this priority order:
    'Main + Extra', then 'Completionist', then 'Main Story'. A time is
    considered unavailable when its attribute equals -1.

    Returns:
        The time in whole hours (a trailing "½" is dropped), 1 for any
        duration shorter than one hour (unit "Mins" or a bare "½" hour),
        or None when none of the three times is available.

    Raises:
        ValueError: if the unit is neither "Mins" nor "Hours", or the time
            value is not a whole number after dropping the "½" suffix.
    """
    # Prioritize 'Main + Extra' time over 'Completionist' over 'Main Story'.
    for attribute in (
        "gameplay_main_extra",
        "gameplay_completionist",
        "gameplay_main",
    ):
        time_string = getattr(hltb_entry, attribute)
        if time_string != -1:
            time_unit = getattr(hltb_entry, attribute + "_unit")
            break
    else:
        return None

    # Anything measured in minutes is rounded up to a single hour.
    if time_unit == "Mins":
        return 1

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if time_unit != "Hours":
        raise ValueError(f"unexpected HowLongToBeat time unit: {time_unit!r}")

    whole_hours = time_string.rstrip("½")
    if not whole_hours:
        # The value was just "½": half an hour, treated like the "Mins" case
        # rather than crashing on int("").
        return 1
    return int(whole_hours)
async def crawl_hltb_entry(name: str) -> Optional[HowLongToBeatEntry]:
    """Searches HowLongToBeat for the given game and returns the best match.

    The name is first translated through GROOVE_TO_HLTB_NAME_MAP (names
    without a mapping are searched as-is). Among the search results, the
    entry with the highest `similarity` is returned; None is returned when
    the search yields no results.
    """
    search_name = GROOVE_TO_HLTB_NAME_MAP.get(name, name)
    matches = await HowLongToBeat().async_search(search_name)
    if matches is None or len(matches) == 0:
        return None
    return max(matches, key=lambda candidate: candidate.similarity)
async def link_grouvee_and_hltb(grouvee_rows) -> List["HowLongToBeatGrouveeEntry"]:
    """Computes the HowLongToBeat time for each of the specified games.

    Every row is searched on HowLongToBeat (at most HLTB_CRAWLING_CONCURRENCY
    searches at a time) while a progress indicator is written to stderr.

    Args:
        grouvee_rows: sequence of Grouvee CSV rows; each row must provide the
            "id" and "name" keys.

    Returns:
        One HowLongToBeatGrouveeEntry per row that was found on HowLongToBeat,
        in the same order as `grouvee_rows`. Rows without a match are skipped.

    Raises:
        Whatever exception a search raises; the remaining searches are
        cancelled before it propagates.
    """
    if len(grouvee_rows) == 0:
        return []

    total_count = len(grouvee_rows)

    # Only run a certain number of crawlers at once to avoid being rate limited
    # by HowLongToBeat website.
    concurrency_semaphore = asyncio.Semaphore(HLTB_CRAWLING_CONCURRENCY)

    # Hacky progress bar. Assumes a VT100-compatible terminal.
    processed_count = 0

    async def crawl_row(row):
        nonlocal processed_count
        async with concurrency_semaphore:
            hltb_entry = await crawl_hltb_entry(row["name"])
        processed_count += 1
        progress = (processed_count * 100) // total_count
        print(f"\033[2K\r{progress}% [{row['name']}]", file=sys.stderr, end="")
        return hltb_entry

    task_list = [asyncio.create_task(crawl_row(row)) for row in grouvee_rows]
    try:
        # gather() returns results in the order of its arguments, so each
        # result lines up with its row. (The set returned by asyncio.wait()
        # is unordered and must not be zipped against the rows.)
        hltb_entries = await asyncio.gather(*task_list)
    except BaseException:
        # Do not leave the other crawlers running once one of them failed.
        for task in task_list:
            task.cancel()
        raise
    finally:
        print("\033[2K\r", file=sys.stderr, end="")  # end the progress bar

    return [
        HowLongToBeatGrouveeEntry(
            grouvee_id=row["id"],
            hltb_id=hltb_entry.game_id,
            hltb_name=hltb_entry.game_name,
            hltb_time=extract_game_time(hltb_entry),
        )
        for row, hltb_entry in zip(grouvee_rows, hltb_entries)
        if hltb_entry is not None
    ]
def release_date_filter(max_release_date: datetime.date):
    """Builds a predicate that keeps Grouvee rows released by the given date.

    The returned predicate reads the row's "release_date" string and returns:
      * True when the string is empty (no data available);
      * for "YYYY-MM-DD" and "YYYY-MM" values, whether the parsed date is on
        or before `max_release_date` (a month-only value parses as the first
        day of that month);
      * for "YYYY" values, whether the year is strictly before the year of
        `max_release_date`.
    Any other format raises ValueError.
    """

    def predicate(grouvee_row):
        raw_date = grouvee_row["release_date"]
        if raw_date == "":  # no data available
            return True

        # Try the most precise formats first.
        for date_format in ("%Y-%m-%d", "%Y-%m"):
            try:
                parsed = datetime.datetime.strptime(raw_date, date_format)
            except ValueError:
                continue
            return parsed.date() <= max_release_date

        # Last resort: a bare year. A ValueError from here propagates.
        year_only = datetime.datetime.strptime(raw_date, "%Y").date()
        return year_only.year < max_release_date.year

    return predicate
def shelve_filter(shelve_list: List[str]):
    """Builds a predicate that keeps Grouvee rows placed on any of the given shelves.

    Shelf names are compared case-insensitively (via `str.lower`). The
    returned predicate parses the row's "shelves" column as JSON and returns
    True when at least one of its keys names a shelf in `shelve_list`.
    """
    wanted_shelves = {shelf.lower() for shelf in shelve_list}

    def predicate(grouvee_row):
        row_shelves = json.loads(grouvee_row["shelves"])
        for shelf_name in row_shelves.keys():
            if shelf_name.lower() in wanted_shelves:
                return True
        return False

    return predicate
def warn_about_missing_games(grouvee_rows, hltb_grouvee_list):
    """Writes a warning to stderr for each Grouvee row lacking usable HLTB data.

    A row whose "id" matches no entry's `grouvee_id` is reported as not
    found; a row whose matching entry has `hltb_time` of None is reported as
    having no play time data. Rows with a time produce no output.
    """
    entry_by_grouvee_id = {}
    for entry in hltb_grouvee_list:
        entry_by_grouvee_id[entry.grouvee_id] = entry

    for row in grouvee_rows:
        linked_entry = entry_by_grouvee_id.get(row["id"])
        if linked_entry is None:
            message = f"warning: {row['name']} not found in HowLongToBeat"
        elif linked_entry.hltb_time is None:
            message = f"warning: {linked_entry.hltb_name} has no play time data in HowLongToBeat"
        else:
            continue
        print(message, file=sys.stderr)
async def main(shelves: List[str]):
    """Reads a Grouvee CSV from stdin and writes "name,hours" CSV rows to stdout.

    Rows are filtered by release date (and by shelf when `shelves` is not
    None), linked to HowLongToBeat, and written in ascending order of time
    to beat. Games without a HowLongToBeat match or without time data are
    reported on stderr and left out of the output.
    """
    all_rows = list(csv.DictReader(sys.stdin))

    # Filtering unreleased games is necessary because otherwise we'd get a couple
    # of warnings about these games not having registered times in HLTB.
    is_released = release_date_filter(datetime.date.today())
    grouvee_rows = [row for row in all_rows if is_released(row)]

    if shelves is not None:
        is_on_shelf = shelve_filter(shelves)
        grouvee_rows = [row for row in grouvee_rows if is_on_shelf(row)]

    linked_entries = await link_grouvee_and_hltb(grouvee_rows)
    warn_about_missing_games(grouvee_rows, linked_entries)

    # Only entries with a known time can be ranked; list.sort is stable,
    # like sorted().
    timed_entries = [entry for entry in linked_entries if entry.hltb_time is not None]
    timed_entries.sort(key=lambda entry: entry.hltb_time)

    csv_writer = csv.writer(sys.stdout)
    csv_writer.writerows([entry.hltb_name, entry.hltb_time] for entry in timed_entries)
if __name__ == "__main__":
    # Command-line arguments are shelf names; with no arguments, None is
    # passed so that main() skips shelf filtering.
    cli_shelves = sys.argv[1:]
    asyncio.run(main(cli_shelves if cli_shelves else None))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment