Skip to content

Instantly share code, notes, and snippets.

@abluescarab
Last active August 13, 2023 04:30
Show Gist options
  • Save abluescarab/724bac619c5261e70a6427ee364c98e3 to your computer and use it in GitHub Desktop.
Save abluescarab/724bac619c5261e70a6427ee364c98e3 to your computer and use it in GitHub Desktop.
Scrapes tags from Steam for the given app IDs, then compares them to find the most common tags
from requests_html import AsyncHTMLSession
import json
import re
import asyncio
import argparse
from collections import namedtuple
# Raw scrape result: the requested app ID paired with the HTTP response.
Page = namedtuple("Page", ["appid", "result"])
# Parsed store page: the game's display name and the list of its tag names.
App = namedtuple("App", ["name", "tags"])
async def scrape(session, appid):
    """Fetch the Steam store page for a single application.

    Args:
        session: Object exposing an awaitable ``get(url, cookies=...)``.
        appid: Steam application ID inserted into the store URL.

    Returns:
        A ``Page`` pairing ``appid`` with the response from ``session.get``.
    """
    url = f"https://store.steampowered.com/app/{appid}/"
    # NOTE(review): these cookies presumably bypass the store's age check
    # so the real page is returned — confirm against Steam's behaviour.
    cookies = {"birthtime": "283993201", "mature_content": "1"}
    response = await session.get(url, cookies=cookies)
    return Page(appid=appid, result=response)
def scrape_all(session, appids):
    """Scrape the store page of every app ID through ``session.run``.

    Args:
        session: Object providing ``run(*callables)``; each callable takes
            no arguments and produces the ``scrape`` coroutine for one app.
        appids: Iterable of Steam application IDs.

    Returns:
        Whatever ``session.run`` returns for the submitted callables.
    """

    def deferred(appid):
        # Bind this appid now so every callable scrapes its own app rather
        # than whichever ID the loop variable ends up holding.
        return lambda: scrape(session, appid)

    pending = [deferred(appid) for appid in appids]
    return session.run(*pending)
async def parse(page):
    """Extract the game name and tag names from a scraped store page.

    Args:
        page: A ``Page`` whose ``result.html`` supports
            ``find(selector, first=...)`` and yields elements with
            ``text`` and ``html`` attributes.

    Returns:
        An ``App`` holding the text of the ``#appHubAppName`` element and
        the list of tag names, or ``None`` when the page has no
        ``#appHubAppName`` element. The tag list is empty when no tag data
        can be located or decoded.
    """
    name_element = page.result.html.find("#appHubAppName", first=True)
    if not name_element:
        return None

    # The tags are read from the first inline script that mentions
    # InitAppTagModal; they appear there as a bracketed JSON array.
    tag_script = next(
        (
            script
            for script in page.result.html.find("script")
            if "InitAppTagModal" in script.html
        ),
        None,
    )

    tags = []
    if tag_script is not None:
        # Raw string: "\[" in a plain literal is an invalid escape sequence.
        match = re.search(r"\[(.*)\]", tag_script.html)
        if match and match.group(1):
            try:
                tag_entries = json.loads(f"[{match.group(1)}]")
            except json.JSONDecodeError:
                # The greedy capture did not yield valid JSON; report the
                # game without tags instead of aborting the whole run.
                tag_entries = []
            tags = [entry["name"] for entry in tag_entries]

    return App(name_element.text, tags)
async def parse_all(scraped):
    """Parse every successfully fetched store page.

    Args:
        scraped: Iterable of ``Page`` records as produced by scraping.

    Returns:
        A list of ``App`` records. Pages whose response status is not 200
        or whose final URL does not contain ``"app"`` are skipped, and
        pages that ``parse`` could not interpret (``None`` results) are
        dropped so callers only ever receive real ``App`` values.
    """
    # NOTE(review): the URL check presumably rejects requests that were
    # redirected away from an app page — confirm against Steam's behaviour.
    eligible = [
        page
        for page in scraped
        if page.result.status_code == 200 and "app" in page.result.url
    ]
    parsed = await asyncio.gather(*(parse(page) for page in eligible))
    return [app for app in parsed if app is not None]
def count_tags(apps):
    """Tally how often each tag occurs across ``apps``.

    Args:
        apps: Iterable of records exposing ``name`` and ``tags``.

    Returns:
        A dict mapping each tag to ``{"count": int, "games": [names]}``,
        ordered by descending count; tags with equal counts keep the
        order in which they were first seen.
    """
    tally = {}
    for app in apps:
        for tag in app.tags:
            entry = tally.setdefault(tag, {"count": 0, "games": []})
            entry["games"].append(app.name)
            entry["count"] += 1

    ranked = sorted(tally.items(), key=lambda item: item[1]["count"], reverse=True)
    return dict(ranked)
def main(appids):
    """Scrape the given Steam apps and print their shared and combined tags.

    Args:
        appids: Sequence of Steam application IDs to look up.

    Prints "No pages scraped." when nothing could be parsed. With a single
    parsed app, prints its tags one per line; with several, prints the
    tags common to every parsed app followed by each tag with its count
    and the games that carry it.
    """
    session = AsyncHTMLSession()
    scraped = scrape_all(session, appids)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # Pre-bind so an interrupted parse falls through to the "nothing
    # scraped" path instead of failing on an unbound name.
    parsed = []
    try:
        parsed = loop.run_until_complete(parse_all(scraped))
    except KeyboardInterrupt:
        pass

    # parse() yields None for pages without an app name; those entries
    # cannot be sorted or counted, so discard them here.
    parsed = [app for app in parsed if app is not None]
    if not parsed:
        print("No pages scraped.")
        return

    parsed.sort(key=lambda app: app.name)
    tags = count_tags(parsed)

    if len(parsed) > 1:
        # Compare against the number of apps actually parsed: measuring
        # against len(appids) would leave this list empty whenever any
        # requested page failed to scrape.
        shared = [tag for tag, info in tags.items() if info["count"] == len(parsed)]
        print("Shared tags:", ", ".join(shared))
        print()

    print("All tags:")
    if len(parsed) == 1:
        for tag in tags:
            print(tag)
    else:
        for tag, info in tags.items():
            print(f"{tag} ({info['count']}) ({', '.join(info['games'])})")
if __name__ == "__main__":
    # Command-line entry point: one or more integer app IDs are required
    # and handed to main() as a list.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "appids",
        metavar="APP_IDS",
        nargs="+",
        type=int,
        help="Steam application IDs (number in a store URL after app/)",
    )
    main(cli.parse_args().appids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment