@unexceptable
Last active November 12, 2020 02:29
Web scraper for WildStar addons from CurseForge. Requirements: https://gist.github.com/Adrian-Turjak/631241e503b8cf4c814a3d5ca2ce8d5e
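Dependencies, inferred from the script's imports (the linked requirements gist presumably pins exact versions): beautifulsoup4, cloudscraper, markdownify, PyYAML and requests, e.g. pip install beautifulsoup4 cloudscraper markdownify pyyaml requests.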
import argparse
import hashlib
import os

import cloudscraper
import yaml
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from requests import exceptions

CURSE_ROOT_URL = "https://www.curseforge.com"
ADDONS_ROOT_URL = f"{CURSE_ROOT_URL}/wildstar/ws-addons"
ROOT_DIR = "wildstar_addons"
METADATA_DIR = f"{ROOT_DIR}/addon_metadata"
FILES_DIR = f"{ROOT_DIR}/addons"

class YamlDumper(yaml.Dumper):
    """Custom dumper to deal with a weird list indentation issue.

    Remove when https://github.com/yaml/pyyaml/issues/234 is solved.
    """

    def increase_indent(self, flow=False, indentless=False):
        return super(YamlDumper, self).increase_indent(flow, False)
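
# NOTE (added illustration): with the stock yaml.Dumper a list nested under a
# key is emitted flush with that key:
#
#   categories:
#   - id: libraries
#
# whereas dumping with Dumper=YamlDumper indents the list items:
#
#   categories:
#     - id: libraries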

def get_page_range(base_page, start_page=None):
    page_numbers = set()
    pagination_urls = base_page.find_all(class_="pagination-item")
    for page_url in pagination_urls:
        try:
            if page_url.name == "span":
                page_numbers.add(int(page_url.text))
            else:
                page_numbers.add(int(page_url.find("span").text))
        except ValueError:
            # Ignore the '...'
            pass
    page_numbers = sorted(page_numbers)
    # NOTE: End is +1 as we do want to include the last page.
    if start_page:
        return range(start_page, page_numbers[-1] + 1)
    return range(page_numbers[0], page_numbers[-1] + 1)
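
# Illustration (the markup is an assumption about CurseForge's pagination): for
# pagination items whose <span> labels read 1, 2, ..., 12, get_page_range
# returns range(1, 13), or range(start_page, 13) when --start-page is passed.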

def get_addons_on_index_page(scraper, page_number):
    page_url = f"{ADDONS_ROOT_URL}?page={page_number}"
    page_html = scraper.get(page_url, timeout=10)
    addon_index_page = BeautifulSoup(page_html.content, "html.parser")
    addon_listings = addon_index_page.find_all(class_="project-listing-row")
    addon_dicts = []
    for listing in addon_listings:
        mod_page_url = listing.find_all("a")[0].attrs["href"]
        short_description = listing.find_all(class_="leading-snug")[0].text
        addon_dicts.append(
            {
                "detail_url": f"{CURSE_ROOT_URL}{mod_page_url}",
                "short_description": short_description.strip(),
            }
        )
    return addon_dicts
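
# Illustration (values made up): each dict returned above has the shape
#   {
#       "detail_url": "https://www.curseforge.com/wildstar/ws-addons/<addon-id>",
#       "short_description": "One-line blurb from the listing page.",
#   }
# and gains a "popularity_rank" key in the __main__ loop before process_addon runs.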

def process_addon(scraper, addon_dictionary):
    addon_detail_url = addon_dictionary["detail_url"]
    addon_metadata = {
        "id": addon_detail_url.split("/")[-1],
        "short_description": addon_dictionary["short_description"],
        "popularity_rank": addon_dictionary["popularity_rank"],
    }
    print(f"Processing Addon: {addon_metadata['id']}")
    addon_metadata_dir = f"{METADATA_DIR}/{addon_metadata['id']}"
    if not os.path.exists(addon_metadata_dir):
        os.makedirs(addon_metadata_dir)
    page_html = scraper.get(addon_detail_url, timeout=10)
    addon_page = BeautifulSoup(page_html.content, "html.parser")
    addon_metadata.update(
        get_addon_header_details(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(
        get_addon_side_panel_details(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(
        get_addon_description(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(get_addon_source_links(addon_page))
    addon_metadata.update(
        get_addon_file_and_details(scraper, f"{addon_detail_url}/files")
    )
    addon_metadata.update(
        get_addon_images(scraper, f"{addon_detail_url}/screenshots", addon_metadata_dir)
    )
    with open(f"{addon_metadata_dir}/info.yaml", "w") as file:
        yaml.dump(addon_metadata, file, Dumper=YamlDumper, default_flow_style=False)
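
# Illustration (paths assume an addon id of "some-addon"): a successful run of
# process_addon leaves roughly this layout behind:
#   wildstar_addons/addon_metadata/some-addon/info.yaml
#   wildstar_addons/addon_metadata/some-addon/description.md
#   wildstar_addons/addon_metadata/some-addon/LICENSE.md     (when a license page exists)
#   wildstar_addons/addon_metadata/some-addon/icon.<ext>     (when an icon exists)
#   wildstar_addons/addon_metadata/some-addon/desc_images/   (when the description embeds images)
#   wildstar_addons/addon_metadata/some-addon/screenshots/   (when screenshots exist)
#   wildstar_addons/addons/<release filename>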

def get_addon_header_details(scraper, addon_page, addon_metadata_dir):
    addon_metadata = {}
    header = addon_page.find_all(class_="game-header")[0]
    try:
        icon_src = header.find_all("img")[0].attrs["src"]
        # NOTE: Let's not assume they are all png, and preserve the extension:
        icon_ext = icon_src.split(".")[-1]
        icon_file_name = f"{addon_metadata_dir}/icon.{icon_ext}"
        response = scraper.get(icon_src, stream=True, timeout=10)
        if response.status_code == 200:
            with open(icon_file_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except IndexError:
        pass
    addon_metadata["name"] = header.find_all("h2")[0].text.strip()
    total_downloads = header.find_all("span")[0].text
    addon_metadata["total_downloads"] = int(total_downloads.split()[0].replace(",", ""))
    last_updated = header.find_all("span")[1].find_all("abbr")[0]
    addon_metadata["last_updated"] = last_updated.attrs["title"]
    addon_metadata["last_updated_epoch"] = last_updated.attrs["data-epoch"]
    api_version = header.find_all("span")[2].text.split(": ")[-1]
    # NOTE: let's standardise the api version format:
    addon_metadata["api_version"] = api_version.replace(" ", "").replace("API", "API_")
    return addon_metadata
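
# Illustration (the header text is an assumed example): an API string such as
# "API 11" comes out of the replace() calls above as "API_11".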

def get_addon_side_panel_details(scraper, addon_page, addon_metadata_dir):
    addon_metadata = {}
    side_panel = addon_page.find_all("aside")[0]
    created = side_panel.find_all("span")[4].find_all("abbr")[0]
    addon_metadata["created"] = created.attrs["title"]
    addon_metadata["created_epoch"] = created.attrs["data-epoch"]
    addon_metadata["license_short"] = side_panel.find_all("a")[0].text.strip()
    license_ref = side_panel.find_all("a")[0].attrs["href"]
    response = scraper.get(f"{CURSE_ROOT_URL}{license_ref}", timeout=10)
    if response.status_code == 200:
        license_file_name = f"{addon_metadata_dir}/LICENSE.md"
        with open(license_file_name, "w") as f:
            f.write(md(response.content))
    categories = []
    category_links = side_panel.find_all(class_="pb-4")[1].find_all("a")
    for category in category_links:
        categories.append(
            {
                "id": category.attrs["href"].split("/")[-1],
                "name": category.find_all("figure")[0].attrs["title"],
            }
        )
    addon_metadata["categories"] = categories
    contributors = []
    contributor_links = side_panel.find_all(class_="pb-4")[2].find_all(class_="mb-2")
    for contributor in contributor_links:
        contributors.append(
            {
                "name": contributor.find_all("span")[0].text.strip(),
                "role": contributor.find_all(class_="text-xs")[0].text.strip(),
            }
        )
    addon_metadata["contributors"] = contributors
    return addon_metadata

def get_addon_description(scraper, addon_page, addon_metadata_dir):
    addon_metadata = {}
    images_urls = {}
    possible_source_links = []
    description = addon_page.find_all(class_="project-detail__content")[0]
    images = description.find_all("img")
    description_images_dir = f"{addon_metadata_dir}/desc_images"
    if images:
        if not os.path.exists(description_images_dir):
            os.makedirs(description_images_dir)
        for i, image in enumerate(images):
            img_src = image.attrs["src"]
            # NOTE: Let's not assume they are all png, and preserve the extension:
            ext = img_src.split(".")[-1]
            # NOTE: but if the extension looks bogus, skip the image.
            if len(ext) > 5:
                continue
            relative_file_name = f"desc_images/image_{i}.{ext}"
            file_name = f"{addon_metadata_dir}/{relative_file_name}"
            try:
                response = scraper.get(img_src, stream=True, timeout=10)
                if (
                    response.status_code == 200
                    and "text" not in response.headers["Content-Type"]
                ):
                    with open(file_name, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    images_urls[img_src] = relative_file_name
            except (exceptions.Timeout, exceptions.InvalidSchema, exceptions.SSLError):
                # NOTE: we don't care too much about these images
                pass
    if images and not images_urls:
        os.rmdir(description_images_dir)
    md_description = md(description.encode())
    for old, new in images_urls.items():
        md_description = md_description.replace(old, new)
    description_file_name = f"{addon_metadata_dir}/description.md"
    with open(description_file_name, "w") as f:
        f.write(md_description)
    links = description.find_all("a")
    for link in links:
        if "git" in link.attrs["href"]:
            possible_source_links.append(link.attrs["href"])
    if possible_source_links:
        addon_metadata["possible_source_links"] = possible_source_links
    return addon_metadata
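
# Illustration (the CDN host is an assumption): an image embedded in the
# description as <img src="https://some.cdn.example/shot.png"> is saved to
# desc_images/image_0.png, and the markdown written to description.md is
# rewritten to reference that relative path instead of the remote URL.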

def get_addon_source_links(addon_page):
    addon_metadata = {}
    nav = addon_page.find_all("nav")[1]
    issues = nav.find(id="nav-issues-svg-class-icon-icon-offsite-nav-viewbox-0-0")
    if issues:
        addon_metadata["issues"] = issues.find_all("a")[0].attrs["href"]
    source = nav.find(id="nav-source-svg-class-icon-icon-offsite-nav-viewbox-0-0")
    if source:
        addon_metadata["source"] = source.find_all("a")[0].attrs["href"]
    return addon_metadata

def get_addon_file_and_details(scraper, file_page_url):
    addon_metadata = {}
    page_html = scraper.get(file_page_url, timeout=10)
    file_page = BeautifulSoup(page_html.content, "html.parser")
    release = file_page.find_all(class_="box")[1]
    release_name = release.find_all("h3")[0].text.strip()
    download_link = release.find_all("a")[1].attrs["href"] + "/file"
    filename = release.find_all(class_="text-sm")[1].text.strip()
    uploaded_by = release.find_all(class_="text-sm")[3].text.strip()
    uploaded_at = release.find_all(class_="text-sm")[5].find_all("abbr")[0]
    api_version = release.find_all(class_="text-sm")[7].text.strip()
    # NOTE: strip so surrounding whitespace can't break the checksum comparison below.
    md5_checksum = release.find_all(class_="text-sm")[13].text.strip()
    addon_metadata["files"] = [
        {
            "name": release_name,
            "filename": filename,
            "uploaded_by": uploaded_by,
            "uploaded_at": uploaded_at.attrs["title"],
            "uploaded_at_epoch": uploaded_at.attrs["data-epoch"],
            # NOTE: let's standardise the api version format:
            "api_version": api_version.replace(" ", "").replace("API", "API_"),
            "md5_checksum": md5_checksum,
        }
    ]
    file_location = f"{FILES_DIR}/{filename}"
    response = scraper.get(f"{CURSE_ROOT_URL}{download_link}", stream=True, timeout=30)
    if response.status_code == 200:
        with open(file_location, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    with open(file_location, "rb") as f:
        file_hash = hashlib.md5()
        chunk = f.read(8192)
        while chunk:
            file_hash.update(chunk)
            chunk = f.read(8192)
    downloaded_md5 = file_hash.hexdigest()
    if downloaded_md5 != md5_checksum:
        print(f"expected: {md5_checksum} found: {downloaded_md5}")
        raise Exception(f"File {file_location} does not match checksum.")
    return addon_metadata
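
# NOTE (added commentary): the download above is streamed to disk in 8 KiB
# chunks, then the file is re-read in 8 KiB chunks to compute an MD5 digest;
# a mismatch against the checksum scraped from the files page aborts the run.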

def get_addon_images(scraper, addon_images_url, addon_metadata_dir):
    addon_metadata = {}
    page_html = scraper.get(addon_images_url, timeout=10)
    file_page = BeautifulSoup(page_html.content, "html.parser")
    screenshots = []
    screenshot_elems = file_page.find_all(class_="project-screenshot-page")[0].find_all(
        class_="px-2"
    )
    if screenshot_elems:
        screenshots_dir = f"{addon_metadata_dir}/screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)
        for screenshot in screenshot_elems:
            download_link = screenshot.find_all(class_="mb-2")[0].attrs["data-featherlight"]
            filename = download_link.split("/")[-1]
            title = screenshot.find_all("p")[0].text
            description = screenshot.find_all("p")[1].text
            response = scraper.get(download_link, stream=True, timeout=10)
            if response.status_code == 200:
                with open(f"{screenshots_dir}/{filename}", "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                screenshots.append(
                    {
                        "filename": filename,
                        "title": title,
                        "description": description,
                    }
                )
    if screenshots:
        addon_metadata["screenshots"] = screenshots
    return addon_metadata

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape CurseForge for WildStar addons. "
        "Will output into a relative directory 'wildstar_addons'."
    )
    parser.add_argument(
        "--start-page",
        metavar="<start_page>",
        type=int,
        help="Which page to start from.",
    )
    args = parser.parse_args()
    if not os.path.exists(METADATA_DIR):
        os.makedirs(METADATA_DIR)
    if not os.path.exists(FILES_DIR):
        os.makedirs(FILES_DIR)
    scraper = cloudscraper.create_scraper()
    page_html = scraper.get(ADDONS_ROOT_URL, timeout=10)
    root_page = BeautifulSoup(page_html.content, "html.parser")
    popularity_rank = 1
    if args.start_page:
        # NOTE: assumes 20 addons per index page when skipping ahead.
        popularity_rank += (args.start_page - 1) * 20
    for page in get_page_range(root_page, args.start_page):
        print(f"Processing page: {page}")
        for addon in get_addons_on_index_page(scraper, page):
            addon["popularity_rank"] = popularity_rank
            process_addon(scraper, addon)
            popularity_rank += 1
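
# Typical invocation (the script filename is an assumption; use whatever name
# this gist is saved under):
#   python scrape_wildstar_addons.py
#   python scrape_wildstar_addons.py --start-page 5
# Resuming from page 5 keeps popularity_rank roughly consistent because the
# rank is advanced by 20 addons per skipped page before scraping begins.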