Last active
August 29, 2015 14:18
-
-
Save cathalgarvey/1924d16ec5b12cf945e6 to your computer and use it in GitHub Desktop.
A CSS/JS-fetching and HTML-patching script for correctly archiving sites (well, 4chan at least) using wget, as requested on Reddit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# by Cathal Garvey, copyright 2015, released under the AGPL: https://gnu.org/licenses/agpl.txt
# Commissioned by a 4chan user on reddit /r/linux who wanted backups but wget couldn't fetch most JS/CSS
# correctly. Only tested on 4chan in keeping with request.
# Usage e.g. (papercraft sub on 4chan):
# wget --recursive --no-clobber --page-requisites --html-extension --convert-links --no-parent http://boards.4chan.org/po/
# cd boards.4chan.org
# # (Directory contains subdirectory "po/" which contains all HTML)
# # (Provide root domain of crawled site to help resolve relative links, and target folder)
# python3 <this script> boards.4chan.org po
# # (Open ./boards.4chan.org/po/index.html in browser: renders correctly!)
import argparse
import os
import urllib
import urllib.parse

import bs4
import requests
def fnsafe(fname: str) -> str:
    """
    Strip every character that is not filename-safe from ``fname``.

    A character is kept when its lower-cased form is an ASCII letter, a
    digit, or one of `` _-.[]()``; everything else (path separators, ":",
    "?", ...) is dropped. Because "." is kept, the strings "." and ".."
    pass through unchanged; callers that build paths from the result must
    deal with those themselves.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789 _-.[]()"
    # One filtering pass instead of one str.replace() per distinct character.
    return "".join(char for char in fname if char.lower() in allowed)
def extract_script_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect the <script> elements of ``docsoup`` that have a non-empty ``src``.

    Returns a list (not a set: the caller concatenates it with ``+``) of
    ``(element, 'src', url)`` tuples, where ``element`` is the tag itself so
    the caller can rewrite its attribute in place. Inline scripts without a
    ``src`` are skipped.
    """
    return [
        (script, 'src', script.attrs['src'])
        for script in docsoup.find_all('script')  # find_all: findAll is a deprecated alias
        if script.attrs.get('src', '')
    ]
def extract_style_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect the <link rel="stylesheet"> elements of ``docsoup`` that have a
    non-empty ``href``.

    Returns a list (not a set: the caller concatenates it with ``+``) of
    ``(element, 'href', url)`` tuples, where ``element`` is the tag itself so
    the caller can rewrite its attribute in place.
    """
    return [
        (style, 'href', style.attrs['href'])
        for style in docsoup.find_all('link', rel='stylesheet')  # findAll is a deprecated alias
        if style.attrs.get('href', '')
    ]
def localise_srcs(root: str, src: str) -> str:
    """
    Make a local path from a relative or absolute one.
    Local paths, if relative, are placed rooted in current directory.
    Absolute paths (i.e. <schema>://<domain>/<path-to-resource>) are
    converted to <current-directory>/absolute_resources/domain/<path-to-resource>.

    Every path component is passed through ``fnsafe``. Components that end
    up empty, "." or ".." are dropped, so the result can never climb out of
    the current directory. Query string and fragment are ignored.

    ``root`` is accepted for interface compatibility but is not used.

    Returns an empty string when ``src`` has no usable path component
    (e.g. "http://example.com", "/", "?v=1"); such a source has no file
    name to save under.
    """
    parsedsrc = urllib.parse.urlparse(src)
    # URL paths are always "/"-separated, whatever os.path.sep is here.
    cleaned = (fnsafe(component) for component in parsedsrc.path.split("/"))
    parts = [component for component in cleaned if component not in ("", ".", "..")]
    if not parts:
        return ""
    if parsedsrc.netloc:
        # Absolute (or protocol-relative): keep resources of each host apart.
        parts = ['absolute_resources', fnsafe(parsedsrc.netloc)] + parts
    return os.path.join(*parts)
def get_resource(rooturl: str, url: str, localpath: str, timeout: float = 30.0) -> None:
    """
    Download ``url`` and write the response body to ``localpath``.

    Args:
        rooturl: Domain of the crawled site (with or without a scheme), used
            to complete URLs that carry no network location.
        url: Resource URL as found in the document; may be absolute,
            protocol-relative ("//host/path") or host-less ("/path", "path").
        localpath: Destination file; missing parent directories are created.
        timeout: Seconds to wait on the server before giving up, so one
            unresponsive host cannot hang the whole run.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
        OSError: if the destination cannot be created or written.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.netloc:
        has_scheme = bool(parsed.scheme)
    else:
        url = rooturl.rstrip("/") + "/" + url.lstrip("/")
        print("Prefixed url with domain to:", url)
        # After prefixing, any scheme comes from rooturl. Checking the stale
        # parse of the original url here would produce "http://http://...".
        has_scheme = "://" in rooturl
    if not has_scheme:
        url = "http://" + url.lstrip("/")
        print("Prefixed url with schema to:", url)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    localfolder = os.path.dirname(localpath)
    # A bare file name has no folder part, and os.makedirs("") would raise.
    if localfolder:
        os.makedirs(localfolder, exist_ok=True)
    with open(localpath, "wb") as outfile:
        outfile.write(r.content)
def save_scripts_and_styles(rooturl: str, root: str, docsoup: bs4.BeautifulSoup) -> str:
    """
    Fetches src'd scripts and href'd styles and saves to converted local
    paths. Returns prettified HTML for the given docsoup, modified to
    refer in each case to the new local path.

    Args:
        rooturl: Domain of the crawled site, used to complete host-less URLs.
        root: Directory the rewritten links are made relative to.
        docsoup: Parsed document; its script/link elements are modified in
            place.

    Sources that cannot be downloaded as files (``data:``, ``javascript:``
    and other non-HTTP schemes, or URLs with no path) are left untouched.
    Errors raised while downloading or saving a resource propagate to the
    caller.
    """
    print("Parsing scripts and styles from parent document.")
    scripts = extract_script_links(docsoup)
    styles = extract_style_links(docsoup)
    for element, srcattr, elsrc in scripts + styles:
        if urllib.parse.urlparse(elsrc).scheme not in ("", "http", "https"):
            # e.g. data: URIs - nothing to fetch, keep the attribute as is.
            continue
        localised_src = localise_srcs(root, elsrc)
        if not localised_src:
            # No file name could be derived from the URL.
            continue
        if not os.path.isfile(localised_src):
            print("Getting resource", elsrc, "and saving to", localised_src)
            get_resource(rooturl, elsrc, localised_src)
        # Has (desired) side effect of modifying parent docsoup.
        # HTML links always use "/", also where os.path.sep is "\\".
        rel_src = os.path.relpath(localised_src, root).replace(os.path.sep, "/")
        print("Modifying parent document to refer to new file:", rel_src)
        element.attrs[srcattr] = rel_src
    return docsoup.prettify()
if __name__ == '__main__':
    # Walk the target folder and patch every HTML file in place so that its
    # scripts and stylesheets point at locally saved copies.
    P = argparse.ArgumentParser()
    P.add_argument('rooturl', help="root domain of the crawled site, e.g. boards.4chan.org")
    P.add_argument('targetdir', help="folder containing the HTML files saved by wget")
    A = P.parse_args()
    for root, folders, files in os.walk(A.targetdir):
        for f in files:
            if not f.endswith("html"):
                continue
            f = os.path.join(root, f)
            print("Loading file:", f)
            # Read raw bytes and let Beautiful Soup detect the encoding
            # instead of relying on the platform's default text encoding.
            with open(f, "rb") as infile:
                fcontents = infile.read()
            print("Parsing file to soup")
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = bs4.BeautifulSoup(fcontents, "html.parser")
            print("Downloading assets for", f, "and generating new file")
            # Links must be relative to the folder holding this HTML file,
            # which for nested files is `root`, not A.targetdir.
            newdoc = save_scripts_and_styles(A.rooturl, root, soup)
            print("Saving new file:", f)
            # Write back in the detected encoding; characters it cannot
            # represent become numeric character references.
            with open(f, "w", encoding=soup.original_encoding or "utf-8",
                      errors="xmlcharrefreplace") as outfile:
                outfile.write(newdoc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment