"The Gods are Bastards" Scraper
import os
import urllib.request
from urllib.error import HTTPError  # HTTPError lives in urllib.error, not urllib.request

from bs4 import BeautifulSoup
from itertools import count
from textdistance import jaccard

"""
bastard_scraper.py
brought to you by: nokko

NOTE: TGaB is a large work, totalling 2,812,577 words as of Chapter 15-76.
This script downloads the entire HTML page for each chapter. This means that
around 136 MB of HTML is downloaded, and ~20 MB of sanitized HTML is emitted.
"""

# Set to True if you want to keep the original HTML version, with comments.
keep_ingested = False

# Start from the beginning.
current = urllib.request.Request('https://tiraas.wordpress.com/2014/08/22/1-1/')

_break = False  # for ugly unrefactored code reasons, we can't just break when we want to. Hence, _break.


def get_final_size():
    """
    Iterates through all the files in the working directory and sums their sizes.
    Returns the total size in MB.
    """
    files = [f"{os.getcwd()}/{f}" for f in os.listdir('.') if os.path.isfile(f)]
    sizes = [os.path.getsize(f) for f in files]
    return round(sum(sizes) / 1_000_000, 2)


# Page gathering:
for i in count():
    # Try to open the URL provided; break on errors like a 404, etc.
    try:
        with urllib.request.urlopen(current) as response:
            if response.status != 200:
                break
            first_chapter = response.read()
        soup = BeautifulSoup(first_chapter, features="html.parser")
        title = soup.title.get_text()
        # Write the ingest result to a file.
        if keep_ingested:
            with open(f'{os.getcwd()}/input/[{i + 1:03}] {title}.html', 'w') as out:
                out.write(soup.prettify(formatter='html5'))
    except HTTPError as err:
        print(err)
        break

    # The story is everything inside the first div.entry-content we find.
    story = soup.find('div', {'class': 'entry-content'})
    # Remove the "Share this" link garbage.
    story.find(attrs={'id': 'jp-post-flair'}).decompose()
    print(soup.title.get_text())

    # The next link we follow is the second one on the page,
    # unless it is too dissimilar from "Next Chapter >".
    next_url, next_link = story.find_all('a')[1]['href'], story.find_all('a')[1].get_text()

    # Make all the links point to nowhere.
    # TODO: perhaps delete them? Or delete every link except the vote link, using jaccard.
    for anchor in story.find_all('a'):
        anchor['href'] = ""

    # Figure out whether the link we've selected is in fact a "Next Chapter" link
    # by comparing its Jaccard similarity to a template link.
    jaccard_index = jaccard("Next Chapter >".split(), next_link.split())
    if jaccard_index > 0.1:
        current = next_url
        # TODO: log similarity here?
    else:  # End scraping once next_link isn't a "Next Chapter" link.
        print(f"""{'=' * 64}
Scraping ended at {current}.
Reason: "Next Chapter" link too dissimilar from template.
Jaccard index of "Next Chapter >" and "{next_link}" = {jaccard_index}
Scraped {i + 1} pages, totalling {get_final_size()} MB.""")
        _break = True

    # Write the result to a file.
    with open(f'{os.getcwd()}/[{i + 1:03}] {title}.html', 'w') as out:
        out.write(story.prettify(formatter='html5'))

    if _break:
        break
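A quick illustration of the Jaccard check that decides when the scraper stops (a standalone sketch; the sample link texts below are made up for illustration, and only textdistance's jaccard from the script above is assumed):

from textdistance import jaccard

# Token-level Jaccard similarity: |intersection| / |union| of the word sets.
template = "Next Chapter >".split()  # ['Next', 'Chapter', '>']

print(jaccard(template, "Next Chapter >".split()))     # 1.0   -- identical
print(jaccard(template, "Next >".split()))             # ~0.67 -- shares 'Next' and '>'
print(jaccard(template, "Table of Contents".split()))  # 0.0   -- no tokens in common

Any short link text sharing even one token with the template easily clears the 0.1 threshold, so the scraper keeps following it; an unrelated final link scores 0.0 and ends the run. One practical note: with keep_ingested = True, the input/ subdirectory must already exist in the working directory, since open() won't create it.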
For anyone visiting this page in the future: I was looking around for something similar, but I found the code hard to parse, so I created an updated version: https://github.com/curohu/The-Gods-are-Bastards-Scrapper
Very late response, but you’re welcome! Glad people can still use my years-old script. :)