@nok-ko
Last active March 18, 2024 01:03
"The Gods are Bastards" Scraper
import os
import urllib.request
from urllib.error import HTTPError

from bs4 import BeautifulSoup
from itertools import count
from textdistance import jaccard

"""
bastard_scraper.py
brought to you by: nokko
NOTE: TGaB is a large work, totalling 2,812,577 words as of Chapter 15-76.
This script downloads the entire HTML page for each chapter. This means that
around 136 MB of HTML is downloaded, and ~20 MB of sanitized HTML is emitted.
"""

# Set to True if you want to keep the original HTML version, with comments.
keep_ingested = False
# Start from the beginning.
current = urllib.request.Request('https://tiraas.wordpress.com/2014/08/22/1-1/')
_break = False  # For ugly unrefactored-code reasons, we can't just break when we want to. Hence, _break.


def get_final_size():
    """
    Iterates through all the files in the working directory and sums their sizes.
    Returns the total size in MB.
    """
    files = [f"{os.getcwd()}/{f}" for f in os.listdir('.') if os.path.isfile(f)]
    sizes = [os.path.getsize(f) for f in files]
    return round(sum(sizes) / 1_000_000, 2)


# Page gathering:
for i in count():
    # Try to open the URL provided; break on errors like a 404, etc.
    try:
        with urllib.request.urlopen(current) as response:
            if response.code != 200:
                break
            first_chapter = response.read()
            soup = BeautifulSoup(first_chapter, features="html.parser")
            title = soup.title.get_text()
            # Write the ingest result to a file.
            if keep_ingested:
                with open(f'{os.getcwd()}/input/[{i + 1:03}] {title}.html', 'w') as out:
                    out.write(soup.prettify(formatter='html5'))
    except HTTPError as err:
        print(err)
        break
    # The story is everything inside the first div.entry-content we find.
    story = soup.find('div', {'class': 'entry-content'})
    # Remove the share-link garbage.
    story.find(attrs={'id': 'jp-post-flair'}).decompose()
    print(soup.title.get_text())
    # The next link we follow is the second one on the page,
    # unless it is too dissimilar from "Next Chapter >".
    next_url, next_link = story.find_all('a')[1]['href'], story.find_all('a')[1].get_text()
    # Make all the links point to nowhere.
    # TODO: perhaps delete them? Or delete every link except the vote link, using jaccard?
    for anchor in story.find_all('a'):
        anchor['href'] = ""
    # Figure out whether the link we've selected is in fact a "Next Chapter" link
    # by comparing its Jaccard index against a template link.
    jaccard_index = jaccard("Next Chapter >".split(), next_link.split())
    if jaccard_index > 0.1:
        current = next_url
        # TODO: log similarity here?
    else:  # End scraping once next_link isn't a "Next Chapter" link.
        print(f"""{'=' * 64}
Scraping ended at {current}.
Reason: "Next Chapter" link too dissimilar from template.
Jaccard index of "Next Chapter >" and "{next_link}" = {jaccard_index}
Scraped {i + 1} pages, totalling {get_final_size()} MB.""")
        _break = True
    # Write the result to a file.
    with open(f'{os.getcwd()}/[{i + 1:03}] {title}.html', 'w') as out:
        out.write(story.prettify(formatter='html5'))
    if _break:
        break
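
For anyone tuning the stopping condition: textdistance's jaccard, as called above, compares the two link texts as collections of whitespace-separated tokens (shared tokens over total distinct tokens), so 1.0 means identical and 0.0 means nothing in common. A minimal sketch of how the 0.1 threshold behaves; the sample link texts are made-up illustrations, not strings taken from the site:

from textdistance import jaccard

template = "Next Chapter >".split()

print(jaccard(template, "Next Chapter >".split()))     # 1.0: identical token sets
print(jaccard(template, "Next Chapter".split()))       # 2 shared of 3 total tokens, ~0.67
print(jaccard(template, "Table of Contents".split()))  # 0.0: no tokens in common
# The scraper keeps following links while this similarity stays above 0.1.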
@nok-ko (Author) commented Oct 14, 2019

Also, a bookbinding script. This one is bad, and requires both Python 2.7 (yuck) and the pypub library (unmaintained).

import os, pypub

"""
bookbinder.py
if you would like to smite the person who wrote this, their name is: nokko
NOTE: The resulting ePub file will likely need processing afterwards.
Just import it into Calibre and convert it from ePub to ePub.
I know. I'm sorry.
"""

result = pypub.Epub('The Gods are Bastards')
files = [os.getcwd() + "/" + f for f in os.listdir('.')
         if os.path.isfile(f) and os.path.splitext(f)[1] == '.html']
for chapter in sorted(files):
    print(chapter)
    # Filenames look like "[001] 1 - 1 | The Gods are Bastards.html";
    # keep what sits between "] " and "|" as the chapter title.
    title = chapter.split('|')[0].split('] ')[-1].decode('utf-8')  # Python 2 str -> unicode, yay!
    result.add_chapter(pypub.create_chapter_from_file(chapter, title=title))
result.create_epub(os.getcwd())
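
If you'd rather skip Python 2.7 and pypub, here is a rough Python 3 sketch of the same binding step using the maintained ebooklib library instead. It's an untested illustration, not a drop-in replacement: the output filename, the identifier, and the simplified title parsing are my own assumptions, and the result will probably still benefit from a Calibre pass (ebook-convert in.epub out.epub) just like the original.

import os
from ebooklib import epub  # pip install ebooklib

book = epub.EpubBook()
book.set_identifier('tgab-scrape')  # arbitrary identifier (assumption)
book.set_title('The Gods are Bastards')
book.set_language('en')

spine = ['nav']
files = sorted(f for f in os.listdir('.')
               if os.path.isfile(f) and f.endswith('.html'))
for n, name in enumerate(files, start=1):
    # Same title extraction as above: keep what sits between "] " and "|".
    title = name.split('|')[0].split('] ')[-1]
    chapter = epub.EpubHtml(title=title, file_name=f'chap_{n:03}.xhtml')
    with open(name, encoding='utf-8') as f:
        chapter.content = f.read()
    book.add_item(chapter)
    spine.append(chapter)

book.toc = spine[1:]           # flat table of contents, one entry per chapter
book.add_item(epub.EpubNcx())  # legacy NCX index
book.add_item(epub.EpubNav())  # EPUB 3 navigation document
book.spine = spine
epub.write_epub('The Gods are Bastards.epub', book)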

@riley-martine

tysm for this 🙇🙇🙇

@nok-ko (Author) commented Sep 28, 2023

tysm for this 🙇🙇🙇

Very late response, but you're welcome! Glad people can still use my years-old script. :)

@curohu commented Mar 18, 2024

For anyone visiting this page in the future: I was looking for something similar, but I found this code hard to parse, so I created an updated version: https://github.com/curohu/The-Gods-are-Bastards-Scrapper
