cathalgarvey/patch_wgetted_4chan.py

## patch_wgetted_4chan.py
#!/usr/bin/env python3
# by Cathal Garvey, copyright 2015, released under the AGPL: https://gnu.org/licenses/agpl.txt
# Commissioned by a 4chan user on reddit /r/linux who wanted backups but wget couldn't fetch most JS/CSS
# correctly. Only tested on 4chan in keeping with request.
# Usage e.g. (papercraft sub on 4chan):
#    wget  --recursive --no-clobber --page-requisites --html-extension --convert-links --no-parent http://boards.4chan.org/po/
#    cd boards.4chan.org
#    # (Directory contains subdirectory "po/" which contains all HTML)
#    # (Provide root domain of crawled site to help resolve relative links, and target folder)
#    python3 <this script> boards.4chan.org po
#    # (Open ./boards.4chan.org/po/index.html in browser: renders correctly!)
import argparse
import os
import urllib
import urllib.parse

import bs4
import requests

def fnsafe(fname):
    """
    Return fname with every character outside a conservative filename
    whitelist removed.

    Letters (compared case-insensitively), digits, space and the
    punctuation characters _-.[]() are kept in their original order;
    everything else, including path separators, is dropped.
    """
    permitted = "abcdefghijklmnopqrstuvwxyz0123456789 _-.[]()"
    return "".join(ch for ch in fname if ch.lower() in permitted)

def extract_script_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect every <script> element in docsoup that has a non-empty src.

    Returns a list of (element, 'src', url) tuples: the tag object itself,
    the name of the attribute that holds the link, and that attribute's
    current value. Inline scripts (no src, or an empty one) are skipped.
    """
    return [
        (tag, 'src', tag.attrs['src'])
        for tag in docsoup.find_all('script')
        if tag.attrs.get('src', '')
    ]

def extract_style_links(docsoup: bs4.BeautifulSoup) -> list:
    """
    Collect every <link rel="stylesheet"> element in docsoup that has a
    non-empty href.

    Returns a list of (element, 'href', url) tuples: the tag object itself,
    the name of the attribute that holds the link, and that attribute's
    current value.
    """
    return [
        (tag, 'href', tag.attrs['href'])
        for tag in docsoup.find_all('link', rel='stylesheet')
        if tag.attrs.get('href', '')
    ]

def localise_srcs(root: str, src: str) -> str:
    """
    Make a local filesystem path from a relative or absolute resource URL.

    URLs without a domain are placed relative to the current directory.
    URLs with a domain (i.e. <scheme>://<domain>/<path-to-resource>, or
    protocol-relative //<domain>/<path-to-resource>) are converted to
    absolute_resources/<domain>/<path-to-resource>.

    Only the path part of the URL is used (query string and fragment are
    ignored), and every component is passed through fnsafe. The root
    argument is accepted but not used here.
    """
    parsedsrc = urllib.parse.urlparse(src)
    # URL paths are always "/"-separated, whatever os.path.sep is locally.
    # Splitting on os.path.sep would leave the path unsplit on Windows and
    # fnsafe would then squash it into a single filename.
    components = [fnsafe(part) for part in parsedsrc.path.split("/")]
    if parsedsrc.netloc:
        # Absolute: keep resources from different domains apart.
        return os.path.join('absolute_resources', fnsafe(parsedsrc.netloc), *components)
    # Relative
    return os.path.join(*components)

def get_resource(rooturl: str, url: str, localpath: str) -> None:
    """
    Download url and write the response body to localpath.

    A url without a domain is treated as a path on rooturl. A url (or, for
    domain-less links, a rooturl) without a scheme is fetched over plain
    http. Missing parent directories of localpath are created first.

    Raises requests.HTTPError (via raise_for_status) on an error status,
    and whatever else requests.get raises, e.g. on connection failure or
    timeout.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.netloc:
        has_scheme = bool(parsed.scheme)
    else:
        url = rooturl.rstrip("/") + "/" + url.lstrip("/")
        print("Prefixed url with domain to:", url)
        # Judge by rooturl, not by the original relative link: if the caller
        # passed e.g. "http://example.org", adding another "http://" would
        # produce "http://http://example.org/...".
        has_scheme = "://" in rooturl
    if not has_scheme:
        url = "http://" + url.lstrip("/")
        print("Prefixed url with schema to:", url)
    # Without a timeout a stalled server would hang the whole run.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    localfolder, _ = os.path.split(localpath)
    # os.makedirs('') raises, so skip it when localpath has no directory part.
    if localfolder:
        os.makedirs(localfolder, exist_ok=True)
    with open(localpath, "wb") as outfile:
        outfile.write(r.content)

def save_scripts_and_styles(rooturl: str, root: str, docsoup: bs4.BeautifulSoup) -> str:
    """
    Fetches src'd scripts and href'd styles and saves to converted local
    paths. Returns prettified HTML for the given docsoup, modified to
    refer in each case to the new local path.

    rooturl is handed to get_resource to resolve links that have no
    domain; root is the directory the rewritten links are made relative
    to. Resources whose local file already exists are not downloaded
    again. Any exception from get_resource propagates to the caller.
    """
    print("Parsing scripts and styles from parent document.")
    linked = extract_script_links(docsoup) + extract_style_links(docsoup)
    for element, srcattr, elsrc in linked:
        localised_src = localise_srcs(root, elsrc)
        if not os.path.isfile(localised_src):
            print("Getting resource", elsrc, "and saving to", localised_src)
            get_resource(rooturl, elsrc, localised_src)
        # The rewritten link ends up in an HTML attribute, so it must use
        # "/" even where os.path.relpath produces a different separator.
        # NOTE(review): the link is relative to root, not to the directory
        # of the document being patched - confirm this is intended for
        # HTML files that live in subdirectories of root.
        rel_src = os.path.relpath(localised_src, root).replace(os.sep, "/")
        print("Modifying parent document to refer to new file:", rel_src)
        # Has (desired) side effect of modifying parent docsoup.
        element.attrs[srcattr] = rel_src
    return docsoup.prettify()

def main() -> None:
    """
    Patch every HTML file beneath the target directory in place.

    Command-line arguments: rooturl (domain of the crawled site, used to
    resolve links that have no domain) and targetdir (directory tree to
    walk). Each file whose name ends in "html" is parsed, its scripts and
    stylesheets are fetched via save_scripts_and_styles, and the file is
    overwritten with the rewritten document.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('rooturl')
    parser.add_argument('targetdir')
    args = parser.parse_args()

    for root, _folders, files in os.walk(args.targetdir):
        for name in files:
            if not name.endswith("html"):
                continue
            htmlpath = os.path.join(root, name)
            print("Loading file:", htmlpath)
            # Read raw bytes and let BeautifulSoup work out the encoding,
            # instead of decoding with whatever the locale default is.
            with open(htmlpath, "rb") as infile:
                fcontents = infile.read()
            print("Parsing file to soup")
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = bs4.BeautifulSoup(fcontents, "html.parser")
            print("Downloading assets for", htmlpath, "and generating new file")
            newdoc = save_scripts_and_styles(args.rooturl, args.targetdir, soup)
            print("Saving new file:", htmlpath)
            # Write with a fixed encoding rather than the locale default.
            with open(htmlpath, "w", encoding="utf-8") as outfile:
                outfile.write(newdoc)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	# by Cathal Garvey, copyright 2015, released under the AGPL: https://gnu.org/licenses/agpl.txt
	# Commissioned by a 4chan user on reddit /r/linux who wanted backups but wget couldn't fetch most JS/CSS
	# correctly. Only tested on 4chan in keeping with request.
	# Usage e.g. (papercraft sub on 4chan):
	# wget --recursive --no-clobber --page-requisites --html-extension --convert-links --no-parent http://boards.4chan.org/po/
	# cd boards.4chan.org
	# # (Directory contains subdirectory "po/" which contains all HTML)
	# # (Provide root domain of crawled site to help resolve relative links, and target folder)
	# python3 <this script> boards.4chan.org po
	# # (Open ./boards.4chan.org/po/index.html in browser: renders correctly!)
	import requests
	import bs4
	import os
	import urllib
	import argparse

	def fnsafe(fname):
	    """
	    Return fname with every character outside a conservative filename
	    whitelist removed.

	    Letters (compared case-insensitively), digits, space and the
	    punctuation characters _-.[]() are kept in their original order;
	    everything else, including path separators, is dropped.
	    """
	    permitted = "abcdefghijklmnopqrstuvwxyz0123456789 _-.[]()"
	    return "".join(ch for ch in fname if ch.lower() in permitted)

	def extract_script_links(docsoup: bs4.BeautifulSoup) -> list:
	    """
	    Collect every <script> element in docsoup that has a non-empty src.

	    Returns a list of (element, 'src', url) tuples: the tag object itself,
	    the name of the attribute that holds the link, and that attribute's
	    current value. Inline scripts (no src, or an empty one) are skipped.
	    """
	    return [
	        (tag, 'src', tag.attrs['src'])
	        for tag in docsoup.find_all('script')
	        if tag.attrs.get('src', '')
	    ]

	def extract_style_links(docsoup: bs4.BeautifulSoup) -> list:
	    """
	    Collect every <link rel="stylesheet"> element in docsoup that has a
	    non-empty href.

	    Returns a list of (element, 'href', url) tuples: the tag object itself,
	    the name of the attribute that holds the link, and that attribute's
	    current value.
	    """
	    return [
	        (tag, 'href', tag.attrs['href'])
	        for tag in docsoup.find_all('link', rel='stylesheet')
	        if tag.attrs.get('href', '')
	    ]

	def localise_srcs(root: str, src: str) -> str:
	    """
	    Make a local filesystem path from a relative or absolute resource URL.

	    URLs without a domain are placed relative to the current directory.
	    URLs with a domain (i.e. <scheme>://<domain>/<path-to-resource>, or
	    protocol-relative //<domain>/<path-to-resource>) are converted to
	    absolute_resources/<domain>/<path-to-resource>.

	    Only the path part of the URL is used (query string and fragment are
	    ignored), and every component is passed through fnsafe. The root
	    argument is accepted but not used here.
	    """
	    parsedsrc = urllib.parse.urlparse(src)
	    # URL paths are always "/"-separated, whatever os.path.sep is locally.
	    # Splitting on os.path.sep would leave the path unsplit on Windows and
	    # fnsafe would then squash it into a single filename.
	    components = [fnsafe(part) for part in parsedsrc.path.split("/")]
	    if parsedsrc.netloc:
	        # Absolute: keep resources from different domains apart.
	        return os.path.join('absolute_resources', fnsafe(parsedsrc.netloc), *components)
	    # Relative
	    return os.path.join(*components)

	def get_resource(rooturl: str, url: str, localpath: str) -> None:
	    """
	    Download url and write the response body to localpath.

	    A url without a domain is treated as a path on rooturl. A url (or, for
	    domain-less links, a rooturl) without a scheme is fetched over plain
	    http. Missing parent directories of localpath are created first.

	    Raises requests.HTTPError (via raise_for_status) on an error status,
	    and whatever else requests.get raises, e.g. on connection failure or
	    timeout.
	    """
	    parsed = urllib.parse.urlparse(url)
	    if parsed.netloc:
	        has_scheme = bool(parsed.scheme)
	    else:
	        url = rooturl.rstrip("/") + "/" + url.lstrip("/")
	        print("Prefixed url with domain to:", url)
	        # Judge by rooturl, not by the original relative link: if the caller
	        # passed e.g. "http://example.org", adding another "http://" would
	        # produce "http://http://example.org/...".
	        has_scheme = "://" in rooturl
	    if not has_scheme:
	        url = "http://" + url.lstrip("/")
	        print("Prefixed url with schema to:", url)
	    # Without a timeout a stalled server would hang the whole run.
	    r = requests.get(url, timeout=60)
	    r.raise_for_status()
	    localfolder, _ = os.path.split(localpath)
	    # os.makedirs('') raises, so skip it when localpath has no directory part.
	    if localfolder:
	        os.makedirs(localfolder, exist_ok=True)
	    with open(localpath, "wb") as outfile:
	        outfile.write(r.content)

	def save_scripts_and_styles(rooturl: str, root: str, docsoup: bs4.BeautifulSoup) -> str:
	    """
	    Fetches src'd scripts and href'd styles and saves to converted local
	    paths. Returns prettified HTML for the given docsoup, modified to
	    refer in each case to the new local path.

	    rooturl is handed to get_resource to resolve links that have no
	    domain; root is the directory the rewritten links are made relative
	    to. Resources whose local file already exists are not downloaded
	    again. Any exception from get_resource propagates to the caller.
	    """
	    print("Parsing scripts and styles from parent document.")
	    linked = extract_script_links(docsoup) + extract_style_links(docsoup)
	    for element, srcattr, elsrc in linked:
	        localised_src = localise_srcs(root, elsrc)
	        if not os.path.isfile(localised_src):
	            print("Getting resource", elsrc, "and saving to", localised_src)
	            get_resource(rooturl, elsrc, localised_src)
	        # The rewritten link ends up in an HTML attribute, so it must use
	        # "/" even where os.path.relpath produces a different separator.
	        # NOTE(review): the link is relative to root, not to the directory
	        # of the document being patched - confirm this is intended for
	        # HTML files that live in subdirectories of root.
	        rel_src = os.path.relpath(localised_src, root).replace(os.sep, "/")
	        print("Modifying parent document to refer to new file:", rel_src)
	        # Has (desired) side effect of modifying parent docsoup.
	        element.attrs[srcattr] = rel_src
	    return docsoup.prettify()

	def main() -> None:
	    """
	    Patch every HTML file beneath the target directory in place.

	    Command-line arguments: rooturl (domain of the crawled site, used to
	    resolve links that have no domain) and targetdir (directory tree to
	    walk). Each file whose name ends in "html" is parsed, its scripts and
	    stylesheets are fetched via save_scripts_and_styles, and the file is
	    overwritten with the rewritten document.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('rooturl')
	    parser.add_argument('targetdir')
	    args = parser.parse_args()

	    for root, _folders, files in os.walk(args.targetdir):
	        for name in files:
	            if not name.endswith("html"):
	                continue
	            htmlpath = os.path.join(root, name)
	            print("Loading file:", htmlpath)
	            # Read raw bytes and let BeautifulSoup work out the encoding,
	            # instead of decoding with whatever the locale default is.
	            with open(htmlpath, "rb") as infile:
	                fcontents = infile.read()
	            print("Parsing file to soup")
	            # Name the parser explicitly so the result does not depend on
	            # which optional parsers happen to be installed.
	            soup = bs4.BeautifulSoup(fcontents, "html.parser")
	            print("Downloading assets for", htmlpath, "and generating new file")
	            newdoc = save_scripts_and_styles(args.rooturl, args.targetdir, soup)
	            print("Saving new file:", htmlpath)
	            # Write with a fixed encoding rather than the locale default.
	            with open(htmlpath, "w", encoding="utf-8") as outfile:
	                outfile.write(newdoc)


	if __name__ == '__main__':
	    main()