Skip to content

Instantly share code, notes, and snippets.

@mutsune
Last active January 11, 2018 15:41
Show Gist options
  • Save mutsune/a49a8daa6634d63ccabc92c4f3ac4953 to your computer and use it in GitHub Desktop.
Save mutsune/a49a8daa6634d63ccabc92c4f3ac4953 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import urllib.request
import os
import html
def save(path, content):
    """Write the text *content* to the file at *path*, replacing any existing file.

    The file is always written as UTF-8 so that non-ASCII characters in the
    downloaded text do not depend on the platform's default locale encoding.
    """
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(content)
def get_html(url, encoding="iso-8859-1"):
    """Fetch *url* and return the response body as a ``str``.

    The raw bytes are decoded with *encoding* (ISO-8859-1 by default, which
    maps every byte value and therefore never raises on decode). The result
    is an ordinary Python string; no further re-encoding is needed.

    Raises whatever ``urllib.request.urlopen`` raises for unreachable or
    invalid URLs.
    """
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding)
def get_body(url):
    """Download *url* and return the page text from its ``<body`` tag onward.

    The returned slice starts one character past the ``<`` of ``<body``, so
    it begins with ``body``; this matches the original behaviour.

    Raises:
        ValueError: if the page contains no ``<body`` tag.
    """
    page = get_html(url)
    start = page.find("<body")
    if start == -1:
        raise ValueError("no <body tag found in page: {}".format(url))
    return page[start + 1:]
def extract_urls(body):
    """Return the link targets found in *body*, one per line containing ``href="``.

    A page that contains the utree.pl POST form is treated as a leaf and
    yields an empty list (presumably that form only appears on file-view
    pages, not on directory listings -- verify against the site).

    For every other page, each line is scanned for its first ``href="`` and
    the text up to the matching closing quote is returned. Lines without a
    double-quoted href, or with an unterminated one, are skipped instead of
    raising or producing a truncated URL.
    """
    if '<form method="post" action="http://minnie.tuhs.org/cgi-bin/utree.pl"' in body:
        return []

    marker = 'href="'
    urls = []
    for line in body.split("\n"):
        start = line.find(marker)
        if start == -1:
            continue
        start += len(marker)
        # Stop at the closing quote rather than assuming the line ends in '">'.
        end = line.find('"', start)
        if end == -1:
            continue
        urls.append(line[start:end])
    return urls
def make_path(p_path, url):
    """Return the local path for *url* placed under the parent path *p_path*.

    The name is the text after the last ``/`` in *url*. Trailing slashes are
    stripped first so that a URL ending in ``/`` still yields a usable name
    instead of an empty one.
    """
    name = url.rstrip("/").split("/")[-1]
    return p_path + "/" + name
def extract_src(body):
    """Return the unescaped text inside the first ``<pre>...</pre>`` of *body*.

    A single newline directly after ``<pre>`` is dropped, as HTML parsers
    do; any other first character is kept. The closing tag is searched for
    only after the opening tag. HTML character references such as ``&lt;``
    are converted back to their characters.

    Raises:
        ValueError: if *body* has no ``<pre>`` or no following ``</pre>``.
    """
    open_tag = "<pre>"
    start = body.index(open_tag) + len(open_tag)
    # Skip only a real leading line break, never an arbitrary character.
    if body.startswith("\r\n", start):
        start += 2
    elif body.startswith("\n", start):
        start += 1
    end = body.index("</pre>", start)
    return html.unescape(body[start:end])
def mkdir(path):
    """Create the directory *path* and any missing parents; do nothing if it exists.

    *path* uses ``/`` separators. It is normalised with ``os.path.normpath``
    rather than split and re-joined, so an absolute path stays absolute.
    """
    os.makedirs(os.path.normpath(path), exist_ok=True)
def get_url(p_path, url, visited=None):
    """Recursively mirror the page at *url* into the local directory *p_path*.

    The URL is printed, its body is downloaded, and its links are extracted.
    If the page has links, a directory is created for it and every link is
    mirrored into that directory. Otherwise the text inside the page's
    ``<pre>`` element is saved as a file.

    Args:
        p_path: local parent path under which this page's entry is created.
        url: address of the page to fetch.
        visited: set of URLs already handled during this crawl. Callers
            normally omit it; it is threaded through the recursion so that
            pages linking back to each other cannot recurse without end.
    """
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)

    print(url)
    body = get_body(url)
    urls = extract_urls(body)
    path = make_path(p_path, url)

    if urls:
        mkdir(path)
        for child in urls:
            get_url(path, child, visited)
    else:
        save(path, extract_src(body))
if __name__ == '__main__':
    # Root page of the tree to mirror, and the local directory to mirror into.
    base_url = "http://minnie.tuhs.org/cgi-bin/utree.pl?file=V6/usr"
    base_dir = "."
    get_url(base_dir, base_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment