Last active
August 13, 2023 04:30
-
-
Save abluescarab/724bac619c5261e70a6427ee364c98e3 to your computer and use it in GitHub Desktop.
Scrapes tags from Steam for the given app IDs, then compares them to find the most common tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from requests_html import AsyncHTMLSession | |
import json | |
import re | |
import asyncio | |
import argparse | |
from collections import namedtuple | |
# A fetched store page: the requested app ID and the response object for it.
Page = namedtuple("Page", ["appid", "result"])
# A parsed application: its display name and the list of its tag names.
App = namedtuple("App", ["name", "tags"])
async def scrape(session, appid):
    """Fetch the Steam store page for a single application.

    Args:
        session: Object exposing an awaitable ``get(url, cookies=...)``
            (an ``AsyncHTMLSession`` in this script).
        appid: Steam application ID interpolated into the store URL.

    Returns:
        A ``Page`` pairing ``appid`` with the response returned by ``get``.
    """
    store_url = f"https://store.steampowered.com/app/{appid}/"
    # Cookies sent with every request; presumably they get past the store's
    # age / mature-content gate -- TODO confirm.
    gate_cookies = {"birthtime": "283993201", "mature_content": "1"}
    response = await session.get(store_url, cookies=gate_cookies)
    return Page(appid=appid, result=response)
def scrape_all(session, appids):
    """Scrape the store page of every given app ID through ``session``.

    Args:
        session: Object whose ``run(*callables)`` is handed one zero-argument
            callable per app ID; each callable returns the ``scrape``
            coroutine for that ID (an ``AsyncHTMLSession`` in this script).
        appids: Iterable of Steam application IDs.

    Returns:
        Whatever ``session.run`` returns for the scrape jobs, or an empty
        list when ``appids`` is empty.
    """
    appids = list(appids)
    if not appids:
        # Nothing to fetch, so do not bother the session at all.
        # NOTE(review): requests_html's run() appears to pass its tasks to
        # asyncio.wait, which rejects an empty collection -- confirm.
        return []

    # Bind each appid as a default argument so every callable keeps its own
    # ID instead of sharing the loop variable (late-binding closures).
    jobs = [lambda appid=appid: scrape(session, appid) for appid in appids]
    return session.run(*jobs)
async def parse(page):
    """Extract the app name and its tag names from a scraped store page.

    Args:
        page: A ``Page`` whose ``result.html`` supports ``find(selector)``
            and ``find(selector, first=True)``.

    Returns:
        ``App(name, tags)``, or ``None`` when the page has no
        ``#appHubAppName`` element. ``tags`` is an empty list when no script
        mentions ``InitAppTagModal`` or that script holds no bracketed
        payload.

    Raises:
        json.JSONDecodeError: If the bracketed payload is not valid JSON.
    """
    html = page.result.html
    name_element = html.find("#appHubAppName", first=True)
    if not name_element:
        return None

    # Only the first script that mentions InitAppTagModal is inspected.
    tag_script = next(
        (s for s in html.find("script") if "InitAppTagModal" in s.html),
        None,
    )

    tags = []
    if tag_script is not None:
        # Raw string: "\[" inside a normal (or f-) string is an invalid
        # escape sequence. re.search also copes with "no match", where
        # indexing the findall() result would raise IndexError.
        match = re.search(r"\[(.*)\]", tag_script.html)
        if match and match.group(1):
            tag_entries = json.loads(f"[{match.group(1)}]")
            tags = [entry["name"] for entry in tag_entries]

    return App(name_element.text, tags)
async def parse_all(scraped):
    """Parse every usable scraped page and collect the results.

    A page is parsed only when its response has status code 200 and its URL
    contains the substring "app" (presumably this drops requests that were
    redirected away from an app page -- TODO confirm).

    Args:
        scraped: Iterable of ``Page`` objects.

    Returns:
        The list produced by ``asyncio.gather`` over ``parse`` for each
        accepted page, in the order the pages were accepted.
    """
    pending = []
    for page in scraped:
        response = page.result
        if response.status_code != 200:
            continue
        if "app" not in response.url:
            continue
        pending.append(parse(page))
    return await asyncio.gather(*pending)
def count_tags(apps):
    """Tally how often each tag occurs across ``apps``.

    Args:
        apps: Iterable of objects with a ``name`` and an iterable ``tags``.

    Returns:
        A dict mapping each tag to ``{"count": int, "games": [names]}``,
        ordered by descending count. Tags with equal counts keep the order
        in which they were first seen.
    """
    tally = {}
    for app in apps:
        for tag in app.tags:
            entry = tally.setdefault(tag, {"count": 0, "games": []})
            entry["games"].append(app.name)
            entry["count"] += 1

    ranked = sorted(tally.items(), key=lambda item: item[1]["count"], reverse=True)
    return dict(ranked)
def main(appids):
    """Scrape the given Steam apps and print their tags.

    When more than one app was parsed, the tags carried by every parsed app
    are printed first, followed by all tags with their counts and the games
    that carry them. With a single app, only its tag names are listed.

    Args:
        appids: Steam application IDs to scrape.
    """
    session = AsyncHTMLSession()
    scraped = scrape_all(session, appids)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        parsed = loop.run_until_complete(parse_all(scraped))
    except KeyboardInterrupt:
        # Interrupted before parsing finished: there is nothing to report,
        # and falling through would hit an unbound ``parsed``.
        return
    finally:
        loop.close()

    # parse() yields None for pages without an app name; drop those so the
    # sorting and counting below only see real apps.
    parsed = [app for app in parsed if app is not None]
    if not parsed:
        print("No pages scraped.")
        return

    parsed = sorted(parsed, key=lambda app: app.name)
    tags = count_tags(parsed)

    if len(parsed) > 1:
        # Compare against the apps actually parsed, not the IDs requested:
        # a page that failed to scrape must not empty the shared list.
        shared = [tag for tag, info in tags.items() if info["count"] == len(parsed)]
        print("Shared tags:", ", ".join(shared))
        print()

    print("All tags:")
    if len(parsed) == 1:
        for tag in tags:
            print(tag)
    else:
        for tag, info in tags.items():
            print(f"{tag} ({info['count']}) ({', '.join(info['games'])})")
if __name__ == "__main__":
    # Command-line entry point: one or more integer app IDs are collected
    # into a list and forwarded to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "appids",
        metavar="APP_IDS",
        nargs="+",
        type=int,
        help="Steam application IDs (number in a store URL after app/)",
    )
    main(cli.parse_args().appids)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment