Created
December 28, 2022 04:18
-
-
Save nijave/36e1971c78c56463cb2c576506c3aee6 to your computer and use it in GitHub Desktop.
Leekduck Pokemon Go Box Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Page whose Wayback Machine snapshot history we want to enumerate.
search_url = "https://leekduck.com/boxsales/"

# Browser-like headers; some archive endpoints are picky about default
# client user agents.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Referer": "https://web.archive.org/web/*",
}

# Ask the Wayback Machine which years contain captures of the target URL.
sparkline_response = requests.get(
    "https://web.archive.org/__wb/sparkline",
    headers=headers,
    params={
        "output": "json",
        "collection": "web",
        "url": search_url,
    },
)
sparklines = sparkline_response.json()
# Build the list of successful (HTTP 2xx) capture timestamps, formatted
# YYYYMMDDhhmmss as expected by Wayback Machine snapshot URLs.
capture_dates = []
for year in sparklines["years"]:
    # Day-granularity listing for this year; items are (MMDD, status, _)
    # triples.
    day_listing = requests.get(
        "https://web.archive.org/__wb/calendarcaptures/2",
        headers=headers,
        params={
            "date": year,
            "groupby": "day",
            "url": search_url,
        },
    ).json()
    for day, day_status, _ in day_listing["items"]:
        if day_status // 100 != 2:
            continue  # day had no successful capture
        full_date = year + str(day).zfill(4)
        # Time-of-day listing for this date; items are (hhmmss, status, _)
        # triples.
        time_listing = requests.get(
            "https://web.archive.org/__wb/calendarcaptures/2",
            headers=headers,
            params={
                "date": full_date,
                "url": search_url,
            },
        ).json()
        for capture_time, time_status, _ in time_listing["items"]:
            if time_status // 100 != 2:
                continue  # this specific capture was not successful
            capture_dates.append(full_date + str(capture_time).zfill(6))
def analyze_box(html):
    """Parse box-sale breakdowns out of one archived Leek Duck page.

    Scans the page's article content after the "Box Sale Breakdown"
    marker; each box appears as a ``#label-wrapper`` div (whose <h3> is
    the box title) followed by a <table> of item rows plus summary rows.

    Parameters
    ----------
    html : str
        Raw HTML of the captured page.

    Returns
    -------
    list[dict]
        One dict per box table with keys ``title`` (box name or None),
        ``items`` (item rows as lists), ``value`` (the "Total Value:"
        summary cell) and ``cost`` (the "Box Cost:" summary cell).
    """
    # Explicit parser: omitting it emits GuessedAtParserWarning and can
    # pick different parsers (producing different trees) per machine.
    soup = BeautifulSoup(html, "html.parser")
    found_analysis = False
    box_name = None
    summaries = []
    for elem in soup.find("article").find("div", {"class": "page-content"}).find_all(recursive=False):
        # Skip everything up to and including the section marker.
        if not found_analysis:
            if "Box Sale Breakdown" in elem.text:
                found_analysis = True
            continue
        # A label wrapper carries the title for the table that follows it.
        if elem.name == "div" and elem.has_attr("id") and elem["id"] == "label-wrapper" and (box_title := elem.find("h3")):
            box_name = box_title.text
            continue
        if elem.name == "table":
            # StringIO wrapper: passing literal HTML to read_html is
            # deprecated since pandas 2.1.
            df = pd.read_html(StringIO(str(elem)))[0]
            # Rows with an "Item" value are box contents; rows without
            # are summary lines keyed by the "#" column.
            items = df[~df["Item"].isna()].copy()
            # Quantities are rendered like "x3" — strip the leading "x".
            items["#"] = items["#"].apply(lambda x: re.sub("^x", "", x))
            items["#"] = items["#"].astype("int32")
            items["Total Value"] = items["Total Value"].astype("int32")
            summary = df[df["Item"].isna()].drop(["Item"], axis=1).set_index("#")
            summaries.append({
                "title": box_name,
                "items": items.to_dict(orient="split")["data"],
                # .iloc[0]: positional [0] on a label-indexed Series is
                # deprecated in recent pandas.
                "value": summary.loc["Total Value:"].iloc[0],
                "cost": summary.loc["Box Cost:"].iloc[0],
            })
            box_name = None  # title consumed; next table gets its own
    return summaries
# Fetch every enumerated snapshot and collect its box analyses, keyed by
# capture timestamp. A failed parse (changed markup, bad snapshot) is
# printed and skipped so one bad capture doesn't abort the whole run.
data = {}
for capture in capture_dates:
    snapshot_url = f"https://web.archive.org/web/{capture}/{search_url}"
    page_html = requests.get(snapshot_url).text
    try:
        print(capture)
        data[capture] = analyze_box(page_html)
    except Exception as err:
        print(capture, err)
print(json.dumps(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment