@nijave · Created December 28, 2022 04:18
Leekduck Pokemon Go Box Scraper
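
This script reconstructs historical Pokémon GO box pricing from leekduck.com. It queries the Wayback Machine's sparkline and calendar APIs to enumerate every successful capture of the box sales page, parses each snapshot's "Box Sale Breakdown" tables with BeautifulSoup and pandas, and prints the collected data as JSON keyed by capture timestamp.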
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json

search_url = "https://leekduck.com/boxsales/"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Referer": "https://web.archive.org/web/*",
}

# Ask the Wayback Machine which years have captures of the target page
sparklines = requests.get(
    "https://web.archive.org/__wb/sparkline",
    headers=headers,
    params={
        "output": "json",
        "collection": "web",
        "url": search_url,
    },
).json()

# Build a list of full capture timestamps (YYYYMMDDhhmmss) for the page
capture_dates = []
for year in sparklines["years"].keys():
    # List the days within the year that have captures
    items = requests.get(
        "https://web.archive.org/__wb/calendarcaptures/2",
        headers=headers,
        params={
            "date": year,
            "groupby": "day",
            "url": search_url,
        },
    ).json()
    for date, status_code, _ in items["items"]:
        # Keep only captures that returned HTTP 2xx
        if status_code // 100 != 2:
            continue
        date = year + str(date).zfill(4)
        # Expand each day into its individual capture timestamps
        timestamps = requests.get(
            "https://web.archive.org/__wb/calendarcaptures/2",
            headers=headers,
            params={
                "date": date,
                "url": search_url,
            },
        ).json()
        for time, status_code, _ in timestamps["items"]:
            if status_code // 100 != 2:
                continue
            capture_dates.append(date + str(time).zfill(6))

def analyze_box(html):
    """Extract each box's name, items, total value, and cost from a capture."""
    soup = BeautifulSoup(html, "html.parser")
    found_analysis = False
    box_name = None
    summaries = []
    for elem in soup.find("article").find("div", {"class": "page-content"}).find_all(recursive=False):
        # Skip everything above the "Box Sale Breakdown" heading
        if not found_analysis:
            if "Box Sale Breakdown" in elem.text:
                found_analysis = True
            continue
        # A label wrapper carries the name of the box described by the next table
        if elem.name == "div" and elem.has_attr("id") and elem["id"] == "label-wrapper" and (box_title := elem.find("h3")):
            box_name = box_title.text
            continue
        if elem.name == "table":
            table_data = str(elem)
            df = pd.read_html(table_data)[0]
            # Rows with an item name are box contents; the rest are summary rows
            items = df[~df["Item"].isna()].copy()
            items["#"] = items["#"].apply(lambda x: re.sub("^x", "", x))
            items["#"] = items["#"].astype("int32")
            items["Total Value"] = items["Total Value"].astype("int32")
            summary = df[df["Item"].isna()].drop(["Item"], axis=1).set_index("#")
            summaries.append({
                "title": box_name,
                "items": items.to_dict(orient="split")["data"],
                # Cast to plain int so json.dumps can serialize numpy scalars
                "value": int(summary.loc["Total Value:"][0]),
                "cost": int(summary.loc["Box Cost:"][0]),
            })
            box_name = None
    return summaries

# Fetch and parse every capture; some snapshots lack the breakdown tables,
# so log failures and keep going
data = {}
for cd in capture_dates:
    wa_url = f"https://web.archive.org/web/{cd}/{search_url}"
    content = requests.get(wa_url).text
    try:
        print(cd)
        data[cd] = analyze_box(content)
    except Exception as e:
        print(cd, e)
        continue

print(json.dumps(data))
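
The script prints a single JSON object keyed by capture timestamp, so the output can be redirected to a file and analyzed later. A minimal sketch of loading it back into pandas, assuming the output was saved to a hypothetical boxes.json:

import json
import pandas as pd

with open("boxes.json") as f:  # hypothetical file holding the script's stdout
    data = json.load(f)

# Flatten {timestamp: [box, ...]} into one row per box
rows = [
    {"captured": ts, "title": box["title"], "cost": box["cost"], "value": box["value"]}
    for ts, boxes in data.items()
    for box in boxes
]
df = pd.DataFrame(rows)
df["captured"] = pd.to_datetime(df["captured"], format="%Y%m%d%H%M%S")
print(df.sort_values("captured").head())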