Skip to content

Instantly share code, notes, and snippets.

@thelink2012
Created July 27, 2020 23:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thelink2012/bf226e33bd0802e6a77b031c108870d5 to your computer and use it in GitHub Desktop.
Save thelink2012/bf226e33bd0802e6a77b031c108870d5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Transforms a Grouvee CSV file (exported from a Grouvee profile, read from
standard input) into a CSV file (written to standard output) listing each
game and the time required to beat it, sorted by that time in ascending
order.
Usage Examples:
cat grouvee.csv | ./howlongtobeat-grouvee.py
cat grouvee.csv | ./howlongtobeat-grouvee.py "Backlog" "Wish List"
"""
import datetime
import asyncio
import sys
import csv
import json
from dataclasses import dataclass
from typing import List, Tuple, Optional, Iterable
from howlongtobeatpy import HowLongToBeat, HowLongToBeatEntry
# How many simultaneous requests to perform on HowLongToBeat.
# Big values make things slow and prone to rate limiting.
HLTB_CRAWLING_CONCURRENCY = 10
# A few games need their names remapped for the HowLongToBeat search to
# work. Keys are the names as they appear in the Grouvee export; values are
# the names to search for on HowLongToBeat. Names that are not listed here
# are searched as-is.
GROOVE_TO_HLTB_NAME_MAP = {
    # Casing changes
    "GRIS": "Gris",
    "Superhot": "SUPERHOT",
    "ICO": "Ico",
    "inFamous": "inFAMOUS",
    "Ōkami": "ŌKAMI",
    # Symbol changes / removals
    "Test Drive: Le Mans": "Test Drive Le Mans",
    "Split/Second": "Split / Second",
    # Number changes (roman numerals / spelled-out numbers to digits)
    "Red Dead Redemption II": "Red Dead Redemption 2",
    "The Walking Dead: Season Two": "The Walking Dead: Season 2",
    # Other changes
    "Nelson Tethers: Puzzle Agent": "Puzzle Agent",
    "F.E.A.R.: First Encounter Assault Recon": "F.E.A.R.",
    "Half-Life: Counter-Strike": "Counter-Strike",
    "Driver 2: The Wheelman is Back": "Driver 2",
    "GRID": "Race Driver: Grid",
    "Black": "Black (2006)",
}
@dataclass
class HowLongToBeatGrouveeEntry:
    """Associates one Grouvee game with the HowLongToBeat entry found for it."""

    # Identifier of the game in the Grouvee export.
    # NOTE(review): this is filled from a csv.DictReader row ("id" column),
    # which yields strings -- confirm whether the `int` annotation is intended.
    grouvee_id: int
    # Identifier of the matched game on HowLongToBeat.
    hltb_id: int
    # Name of the matched game as reported by HowLongToBeat.
    hltb_name: str
    # Time to beat in hours, or None when HowLongToBeat has no time data.
    hltb_time: Optional[int]
def extract_game_time(hltb_entry: "HowLongToBeatEntry") -> Optional[int]:
    """Extracts the HowLongToBeat time (in hours) from a crawled HLTB entry.

    The first available time is used, in this order of preference:
    'Main + Extra', then 'Completionist', then 'Main Story'. A time field
    equal to -1 means "no data" for that category.

    Returns:
        The time in whole hours (a trailing "½" is dropped), 1 for any
        duration below one hour, or None when the entry has no time data.

    Raises:
        ValueError: if the time unit is neither "Mins" nor "Hours", or the
            time value is not a whole number optionally followed by "½".
    """
    # Prioritize 'Main + Extra' time over 'Completionist' over 'Main Story'.
    for field in ("gameplay_main_extra", "gameplay_completionist", "gameplay_main"):
        time_string = getattr(hltb_entry, field)
        if time_string != -1:
            time_unit = getattr(hltb_entry, field + "_unit")
            break
    else:
        return None

    # Anything measured in minutes is below one hour; count it as one hour.
    if time_unit == "Mins":
        return 1
    # Explicit check instead of `assert`: asserts are stripped under `-O`,
    # which would let an unknown unit be silently interpreted as hours.
    if time_unit != "Hours":
        raise ValueError(f"unexpected HowLongToBeat time unit: {time_unit!r}")

    whole_hours = time_string.rstrip("½")
    if not whole_hours:
        # A bare "½" is half an hour: treat it like the minutes case rather
        # than crashing on int("").
        return 1
    return int(whole_hours)
async def crawl_hltb_entry(name: str) -> Optional[HowLongToBeatEntry]:
    """Searches HowLongToBeat for a game and returns the closest match.

    The Grouvee name is first translated through GROOVE_TO_HLTB_NAME_MAP
    (names without a mapping are searched unchanged). Of all search results,
    the one with the highest `similarity` is returned; None is returned when
    the search yields no results.
    """
    search_name = GROOVE_TO_HLTB_NAME_MAP.get(name, name)
    candidates = await HowLongToBeat().async_search(search_name)
    if candidates is None or len(candidates) == 0:
        return None
    return max(candidates, key=lambda candidate: candidate.similarity)
async def link_grouvee_and_hltb(grouvee_rows) -> List[HowLongToBeatGrouveeEntry]:
    """Computes the HowLongToBeat time for each of the specified games.

    Every row is crawled concurrently (at most HLTB_CRAWLING_CONCURRENCY
    requests at a time) while a progress indicator is written to stderr.

    Args:
        grouvee_rows: sequence of Grouvee CSV rows; each row must provide the
            "id" and "name" keys.

    Returns:
        One entry per row that could be found on HowLongToBeat, in the same
        order as `grouvee_rows`. Rows without a match are omitted.

    Raises:
        Whatever exception a crawl raises; the remaining crawls are cancelled.
    """
    if not grouvee_rows:
        return []

    total_count = len(grouvee_rows)
    # Only run a certain number of crawlers at once to avoid being rate
    # limited by the HowLongToBeat website.
    concurrency_semaphore = asyncio.Semaphore(HLTB_CRAWLING_CONCURRENCY)
    # Hacky progress bar. Assumes a VT100-compatible terminal.
    processed_count = 0

    async def crawl_with_progress(name):
        nonlocal processed_count
        async with concurrency_semaphore:
            hltb_entry = await crawl_hltb_entry(name)
        processed_count += 1
        progress = (processed_count * 100) // total_count
        print(f"\033[2K\r{progress}% [{name}]", file=sys.stderr, end="")
        return hltb_entry

    task_list = [
        asyncio.create_task(crawl_with_progress(row["name"]))
        for row in grouvee_rows
    ]
    try:
        # gather() returns results in the order of `task_list`, so each
        # result can be paired with its row. (Iterating the `done` set of
        # asyncio.wait() has no defined order and would link rows to the
        # wrong HowLongToBeat entries.)
        hltb_entries = await asyncio.gather(*task_list)
    finally:
        # If a crawl failed, do not leave the others running in the
        # background; cancel() is a no-op on tasks that already finished.
        for task in task_list:
            task.cancel()
        print("\033[2K\r", file=sys.stderr, end="")  # end the progress bar

    result = []
    for row, hltb_entry in zip(grouvee_rows, hltb_entries):
        if hltb_entry is None:
            continue
        result.append(
            HowLongToBeatGrouveeEntry(
                grouvee_id=row["id"],
                hltb_id=hltb_entry.game_id,
                hltb_name=hltb_entry.game_name,
                hltb_time=extract_game_time(hltb_entry),
            )
        )
    return result
def release_date_filter(max_release_date: datetime.date):
    """Builds a predicate that keeps Grouvee rows released by `max_release_date`.

    The returned predicate reads the row's "release_date" string and returns:
      * True when the string is empty (no data available);
      * for "YYYY-MM-DD" and "YYYY-MM" values, whether the parsed date (the
        first day of the month for "YYYY-MM") is on or before the maximum;
      * for "YYYY" values, whether the year is strictly before the maximum
        date's year.
    A value in any other format raises ValueError.
    """

    def predicate(grouvee_row):
        raw_date = grouvee_row["release_date"]
        if len(raw_date) == 0:  # no data available
            return True

        # Try the most precise formats first.
        for date_format in ("%Y-%m-%d", "%Y-%m"):
            try:
                parsed = datetime.datetime.strptime(raw_date, date_format).date()
            except ValueError:
                continue
            # For "%Y-%m" this compares against the first day of the month,
            # which is only an approximation of the real release day.
            return parsed <= max_release_date

        # Last resort: year only. Unlike the cases above, a release in the
        # same year as the maximum date is not accepted.
        year_only = datetime.datetime.strptime(raw_date, "%Y").date()
        return year_only.year < max_release_date.year

    return predicate
def shelve_filter(shelve_list: List[str]):
    """Builds a predicate that keeps Grouvee rows placed on any given shelf.

    The row's "shelves" field is parsed as a JSON object whose keys are
    shelf names; names are compared case-insensitively against
    `shelve_list`.
    """
    wanted_shelves = frozenset(shelf.lower() for shelf in shelve_list)

    def predicate(grouvee_row):
        row_shelves = json.loads(grouvee_row["shelves"])
        for shelf_name in row_shelves.keys():
            if shelf_name.lower() in wanted_shelves:
                return True
        return False

    return predicate
def warn_about_missing_games(grouvee_rows, hltb_grouvee_list):
    """Writes a warning to stderr for each Grouvee game lacking usable HLTB data.

    A game is reported either because no linked entry exists for its "id",
    or because the linked entry carries no play time (`hltb_time` is None).
    """
    entry_by_grouvee_id = {}
    for entry in hltb_grouvee_list:
        entry_by_grouvee_id[entry.grouvee_id] = entry

    for row in grouvee_rows:
        linked_entry = entry_by_grouvee_id.get(row["id"])
        if linked_entry is None:
            print(f"warning: {row['name']} not found in HowLongToBeat", file=sys.stderr)
            continue
        if linked_entry.hltb_time is None:
            print(
                f"warning: {linked_entry.hltb_name} has no play time data in HowLongToBeat",
                file=sys.stderr,
            )
async def main(shelves: List[str]):
    """Reads a Grouvee CSV from stdin and writes "name,hours" rows to stdout.

    Rows are restricted to already released games and, when `shelves` is not
    None, to games on at least one of those shelves. Games without a
    HowLongToBeat match or without time data are reported on stderr and left
    out of the output, which is sorted by ascending time.
    """
    all_rows = list(csv.DictReader(sys.stdin))

    # Filtering unreleased games is necessary because otherwise we'd get a
    # couple of warnings about these games not having registered times in HLTB.
    is_released = release_date_filter(datetime.date.today())
    grouvee_rows = [row for row in all_rows if is_released(row)]

    if shelves is not None:
        is_on_shelf = shelve_filter(shelves)
        grouvee_rows = [row for row in grouvee_rows if is_on_shelf(row)]

    linked_entries = await link_grouvee_and_hltb(grouvee_rows)
    warn_about_missing_games(grouvee_rows, linked_entries)

    timed_entries = [entry for entry in linked_entries if entry.hltb_time is not None]
    timed_entries.sort(key=lambda entry: entry.hltb_time)

    csv_writer = csv.writer(sys.stdout)
    csv_writer.writerows([entry.hltb_name, entry.hltb_time] for entry in timed_entries)
if __name__ == "__main__":
    # Command-line arguments, if any, are the shelf names to restrict the
    # listing to; with no arguments no shelf filtering is requested (None).
    cli_shelves = sys.argv[1:]
    asyncio.run(main(cli_shelves if cli_shelves else None))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment