Skip to content

Instantly share code, notes, and snippets.

@cathalgarvey
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cathalgarvey/1924d16ec5b12cf945e6 to your computer and use it in GitHub Desktop.
Save cathalgarvey/1924d16ec5b12cf945e6 to your computer and use it in GitHub Desktop.
A CSS/JS-fetching and HTML-patching script for correctly archiving sites (well, 4chan at least) using wget, as requested on Reddit.
#!/usr/bin/env python3
# by Cathal Garvey, copyright 2015, released under the AGPL: https://gnu.org/licenses/agpl.txt
# Commissioned by a 4chan user on reddit /r/linux who wanted backups but wget couldn't fetch most JS/CSS
# correctly. Only tested on 4chan in keeping with request.
# Usage e.g. (papercraft sub on 4chan):
# wget --recursive --no-clobber --page-requisites --html-extension --convert-links --no-parent http://boards.4chan.org/po/
# cd boards.4chan.org
# # (Directory contains subdirectory "po/" which contains all HTML)
# # (Provide root domain of crawled site to help resolve relative links, and target folder)
# python3 <this script> boards.4chan.org po
# # (Open ./boards.4chan.org/po/index.html in browser: renders correctly!)
import argparse
import os
import urllib
import urllib.parse

import bs4
import requests
# Characters (compared case-insensitively) that may appear in a local file name.
_FNSAFE_ALLOWED = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 _-.[]()")


def fnsafe(fname: str) -> str:
    """
    Return fname with every character outside a small whitelist removed.

    A character is kept when its lower-cased form is an ASCII letter, a
    digit, a space, or one of "_-.[]()". Everything else (path separators,
    colons, query punctuation, ...) is dropped, not substituted.
    """
    # Single pass over the string instead of one str.replace() per
    # distinct offending character.
    return "".join(char for char in fname if char.lower() in _FNSAFE_ALLOWED)
def extract_script_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect every <script> element in docsoup that has a non-empty src.

    Returns a list of (element, 'src', src-value) tuples in document
    order. The element itself is included so the caller can rewrite the
    attribute in place after fetching the resource.
    """
    # find_all is the current spelling; findAll is a deprecated alias.
    return [
        (script, 'src', script.attrs['src'])
        for script in docsoup.find_all('script')
        if script.attrs.get('src', '')
    ]
def extract_style_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect every <link rel="stylesheet"> element in docsoup that has a
    non-empty href.

    Returns a list of (element, 'href', href-value) tuples in document
    order. The element itself is included so the caller can rewrite the
    attribute in place after fetching the resource.
    """
    # find_all is the current spelling; findAll is a deprecated alias.
    return [
        (link, 'href', link.attrs['href'])
        for link in docsoup.find_all('link', rel='stylesheet')
        if link.attrs.get('href', '')
    ]
def localise_srcs(root: str, src: str) -> str:
    """
    Make a local path from a relative or absolute one.
    Local paths, if relative, are placed rooted in current directory.
    Absolute paths (i.e. <schema>://<domain>/<path-to-resource>) are
    converted to <current-directory>/absolute_resources/domain/<path-to-resource>.

    Only the path component of src is used; any query string or fragment
    is ignored. The root argument is accepted for interface compatibility
    but is not used here.

    NOTE(review): fnsafe() keeps "." characters, so ".." path segments
    pass through unchanged and the result can point above the current
    directory. src comes from crawled HTML; confirm this is acceptable.
    """
    parsedsrc = urllib.parse.urlparse(src)
    # URL paths are always "/"-separated, whatever the host OS uses for
    # file paths, so split on "/" rather than os.path.sep.
    segments = [fnsafe(segment) for segment in parsedsrc.path.split("/")]
    if parsedsrc.netloc:
        # Absolute
        return os.path.join('absolute_resources', fnsafe(parsedsrc.netloc), *segments)
    # Relative
    return os.path.join(*segments)
def get_resource(rooturl: str, url: str, localpath: str, timeout: float = 30.0) -> None:
    """
    Download url and write the response body to localpath.

    rooturl:   domain (with or without a scheme) prefixed to url when url
               carries no network location of its own.
    url:       resource address as found in the HTML; may be absolute,
               protocol-relative ("//host/path") or site-relative.
    localpath: destination file; missing parent directories are created.
    timeout:   seconds passed to requests.get so a stalled server cannot
               hang the run indefinitely.

    Raises requests.HTTPError (via raise_for_status) on a non-success
    response, other requests exceptions on transport failure, and OSError
    if the file cannot be written.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.netloc:
        has_scheme = bool(parsed.scheme)
    else:
        url = rooturl.rstrip("/") + "/" + url.lstrip("/")
        print("Prefixed url with domain to:", url)
        # The scheme now comes from rooturl, not from the original url,
        # so check rooturl; otherwise "http://" would be added twice.
        has_scheme = "://" in rooturl
    if not has_scheme:
        url = "http://" + url.lstrip("/")
        print("Prefixed url with schema to:", url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    localfolder = os.path.dirname(localpath)
    # os.makedirs("") raises, so only create a folder when there is one.
    if localfolder:
        os.makedirs(localfolder, exist_ok=True)
    with open(localpath, "wb") as outfile:
        outfile.write(response.content)
def save_scripts_and_styles(rooturl: str, root: str, docsoup: bs4.BeautifulSoup) -> str:
    """
    Fetches src'd scripts and href'd styles and saves to converted local
    paths. Returns prettified HTML for the given docsoup, modified to
    refer in each case to the new local path.

    rooturl: domain used to resolve resource URLs that lack one.
    root:    directory the rewritten links are made relative to.
    docsoup: parsed document; its script/link elements are modified in
             place.

    A resource that cannot be downloaded or written is reported and its
    element is left pointing at the original address; processing then
    continues with the remaining resources.
    """
    print("Parsing scripts and styles from parent document.")
    targets = extract_script_links(docsoup) + extract_style_links(docsoup)
    for element, srcattr, elsrc in targets:
        localised_src = localise_srcs(root, elsrc)
        if not os.path.isfile(localised_src):
            print("Getting resource", elsrc, "and saving to", localised_src)
            try:
                get_resource(rooturl, elsrc, localised_src)
            except (requests.RequestException, OSError) as err:
                # One dead link should not abort the whole archive run.
                print("Failed to fetch resource", elsrc, "-", err)
                continue
        # Has (desired) side effect of modifying parent docsoup.
        rel_src = os.path.relpath(localised_src, root)
        print("Modifying parent document to refer to new file:", rel_src)
        element.attrs[srcattr] = rel_src
    return docsoup.prettify()
def main() -> None:
    """
    Walk the target directory and patch every HTML file found.

    Command-line arguments:
      rooturl   - root domain of the crawled site, used to resolve
                  relative resource links.
      targetdir - directory tree containing the crawled HTML files.

    Each file whose name ends in "html" is parsed, its scripts and
    stylesheets are downloaded, and the file is overwritten with the
    rewritten document.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('rooturl')
    parser.add_argument('targetdir')
    args = parser.parse_args()
    for dirpath, _folders, filenames in os.walk(args.targetdir):
        for filename in filenames:
            if not filename.endswith("html"):
                continue
            htmlpath = os.path.join(dirpath, filename)
            print("Loading file:", htmlpath)
            with open(htmlpath) as infile:
                fcontents = infile.read()
            print("Parsing file to soup")
            # Name the parser explicitly so results do not depend on
            # which optional parser libraries happen to be installed.
            soup = bs4.BeautifulSoup(fcontents, "html.parser")
            print("Downloading assets for", htmlpath, "and generating new file")
            # Links are made relative to the directory holding this file,
            # so pages in nested subdirectories resolve correctly too.
            newdoc = save_scripts_and_styles(args.rooturl, dirpath, soup)
            print("Saving new file:", htmlpath)
            with open(htmlpath, "w") as outfile:
                outfile.write(newdoc)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment