Created
May 26, 2016 12:02
-
-
Save erantapaa/6190563685f4799e9f6853ed4bdd213e to your computer and use it in GitHub Desktop.
1had-fetch.py - recursively fetch 1had modules from paste
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# A script to recursively fetch 1haskelladay modules from lpaste
import re | |
import os | |
import requests | |
import codecs | |
import sys | |
def extract_imports(text):
    """Find Haskell imports annotated with an lpaste link.

    A line qualifies when it starts at column 0 and looks like::

        import Some.Module -- http://lpaste.net/<paste id>

    Args:
        text: Source text to scan; every line is examined.

    Returns:
        A list of ``(module_name, "http://lpaste.net/", paste_id)`` tuples
        in the order the imports appear in ``text``.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    # Module names must be non-empty segments separated by single dots, so a
    # path built by swapping dots for slashes can never start with "/" or
    # contain "//".  The dot in the host name is escaped so that it matches
    # a literal "." only.
    pattern = r"^import\s+(\w+(?:\.\w+)*)\s+--\s+(http://lpaste\.net/)(\S+)"
    return re.findall(pattern, text, re.MULTILINE)
def slurp(path):
    """Read the file at ``path`` as UTF-8 and return its whole contents."""
    with codecs.open(path, 'r', encoding="utf-8") as source:
        return source.read()
def write_file_mkdirs(path, content):
    """Write ``content`` to ``path``, creating missing parent directories.

    Args:
        path: Destination file path; may be a bare file name with no
            directory component.
        content: Text to write; it is handed to ``write_file`` unchanged.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs("") raises, so
    # directories are only created when there is a parent to create.
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    write_file(path, content)
def write_file(path, content):
    """Write ``content`` to ``path`` as UTF-8, truncating any existing file."""
    with codecs.open(path, 'w', encoding="utf-8") as sink:
        sink.write(content)
def path_for_module(modname):
    """Turn a dotted module name into a ``.hs`` source path.

    Every ``.`` in ``modname`` becomes a ``/`` and ``.hs`` is appended,
    e.g. ``Data.Foo`` -> ``Data/Foo.hs``.
    """
    segments = modname.split(".")
    return "/".join(segments) + ".hs"
def raw_lpaste_url(url):
    """Build the raw-content lpaste URL for a paste.

    Despite the parameter name, callers in this file pass only the part that
    follows ``http://lpaste.net/`` (the paste id); it is appended to the
    raw-view prefix unchanged.
    """
    raw_prefix = "http://lpaste.net/raw/"
    return "".join((raw_prefix, url))
def convert_to_raw_url(url):
    """Normalize an lpaste URL to its raw-content form.

    Args:
        url: A paste URL such as ``http://lpaste.net/<id>``, or one that
            already points at the raw view, ``http://lpaste.net/raw/<id>``.

    Returns:
        The raw-view URL, or ``None`` when ``url`` does not start with
        ``http://lpaste.net/``.
    """
    # The dots are escaped so that only the literal host "lpaste.net"
    # matches; an unescaped "." would accept any character there.
    if re.match(r"http://lpaste\.net/raw/", url):
        return url
    m = re.match(r"http://lpaste\.net/(.*)", url)
    if m is None:
        return None
    return raw_lpaste_url(m.group(1))
def get_url(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    One summary line is printed for every attempt.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request itself fails.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print("url: %s error: %s" % (url, err))
        return None
    # .get() tolerates a response that carries no Content-Type header.
    print("url: %s status: %s content-type: %s"
          % (url, r.status_code, r.headers.get('content-type')))
    if r.status_code == 200:
        return r.text
    return None
def check_imports(text, tried):
    """Recursively download the lpaste-annotated imports found in ``text``.

    Each import returned by ``extract_imports`` is skipped when its target
    file already exists, or when its module name or raw URL has been tried
    before. Otherwise it is fetched, saved under the path given by
    ``path_for_module``, and scanned for further imports.

    Args:
        text: Source text to scan for annotated imports.
        tried: Set of module names and raw URLs already attempted. It is
            updated in place and shared by the recursive calls so nothing
            is requested twice.
    """
    for modname, http, url in extract_imports(text):
        path = path_for_module(modname)
        if os.path.isfile(path):
            # Already on disk; nothing to fetch.
            continue
        if modname in tried:
            print("already tried module: %s" % (modname,))
            continue
        # Rebuild the full link so that one which already points at the raw
        # view is not prefixed with "raw/" a second time.
        raw_url = convert_to_raw_url(http + url)
        if not raw_url:
            continue
        if raw_url in tried:
            print("already tried url: %s" % (raw_url,))
            continue
        # Record the attempt before fetching so failures are not retried.
        tried.add(raw_url)
        tried.add(modname)
        print("getting module: %s from: %s" % (modname, raw_url))
        content = get_url(raw_url)
        if content:
            write_file_mkdirs(path, content)
            print("checking %s" % (modname,))
            check_imports(content, tried)
def fetch_url(url):
    """Download one lpaste paste, save it as a module, and follow its imports.

    The file name comes from the paste's own ``module <Name>`` header. A
    paste without a usable header is not saved.

    Args:
        url: An lpaste URL, in either the normal or the raw form.
    """
    raw_url = convert_to_raw_url(url)
    if not raw_url:
        print("bad url: %s" % (url,))
        return
    content = get_url(raw_url)
    if not content:
        return
    # The module name comes from downloaded text and becomes a file path, so
    # only non-empty segments joined by single dots are accepted; the
    # lookahead rejects names that continue with further dots.
    m = re.search(r"^module\s+(\w+(?:\.\w+)*)(?![\w.])", content, re.MULTILINE)
    if m:
        modname = m.group(1)
        write_file_mkdirs(path_for_module(modname), content)
        print("saved module: %s" % (modname,))
    # Imports are followed even when no module header was found, because a
    # header-less paste can still import other modules.
    check_imports(content, set())
def test(path):
    """Debug helper: scan a local file's imports and print what was tried.

    Args:
        path: Path of a UTF-8 source file to read with ``slurp``.
    """
    seen = set()
    check_imports(slurp(path), seen)
    # Single-argument print gives the same output on Python 2 and 3.
    print("seen: %s" % (seen,))
def main():
    """Fetch every lpaste URL given on the command line, in order.

    Example arguments::

        http://lpaste.net/8982640331094753280
        http://lpaste.net/3576182129349885952
    """
    paste_urls = sys.argv[1:]
    for paste_url in paste_urls:
        fetch_url(paste_url)
# Run the fetcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment