Skip to content

Instantly share code, notes, and snippets.

@erantapaa
Created May 26, 2016 12:02
Show Gist options
  • Save erantapaa/6190563685f4799e9f6853ed4bdd213e to your computer and use it in GitHub Desktop.
Save erantapaa/6190563685f4799e9f6853ed4bdd213e to your computer and use it in GitHub Desktop.
1had-fetch.py - recursively fetch 1HaskellADay (1had) modules from lpaste
#!/usr/bin/env python
#
# A script to recursively fetch 1haskelladay modules from lpaste
import re
import os
import requests
import codecs
import sys
def extract_imports(text):
    """Return the lpaste-annotated imports found in ``text``.

    A line is recognised when it has the form
    ``import Some.Module -- http://lpaste.net/<id>``.

    Returns a list of ``(module_name, base_url, paste_id)`` tuples in the
    order the imports appear; the list is empty when nothing matches.
    """
    # Raw string: ``\s`` and ``\w`` are regex escapes, not string escapes.
    # The dot in the host name is escaped so it only matches a literal ".".
    pattern = r"^import\s+([\w.]+)\s+--\s+(http://lpaste\.net/)(\S+)"
    return re.findall(pattern, text, re.MULTILINE)
def slurp(path):
    """Read the whole file at ``path`` as UTF-8 and return its text."""
    with codecs.open(path, 'r', encoding="utf-8") as handle:
        return handle.read()
def write_file_mkdirs(path, content):
    """Write ``content`` to ``path``, creating missing parent directories.

    ``path`` may be a bare file name; in that case there is no parent
    directory to create and the file is written relative to the current
    directory.
    """
    parent = os.path.dirname(path)
    # os.path.dirname() returns "" for a bare file name (e.g. "Main.hs") and
    # os.makedirs("") raises, so only create a directory when there is one.
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    write_file(path, content)
def write_file(path, content):
    """Write ``content`` to ``path`` as UTF-8, truncating any existing file."""
    with codecs.open(path, 'w', encoding="utf-8") as out:
        out.write(content)
def path_for_module(modname):
    """Map a dotted module name to its relative ``.hs`` source path."""
    components = modname.split(".")
    return "/".join(components) + ".hs"
def raw_lpaste_url(url):
    """Return the lpaste raw-download URL for a paste.

    ``url`` is the part of a paste URL that follows ``http://lpaste.net/``.
    """
    return "".join(["http://lpaste.net/raw/", url])
def convert_to_raw_url(url):
    """Return the raw-download form of an lpaste URL, or ``None``.

    A URL that already starts with ``http://lpaste.net/raw/`` is returned
    unchanged; any other ``http://lpaste.net/...`` URL is rewritten to its
    raw form.  ``None`` is returned for anything else.
    """
    # Dots are escaped so that only the literal host "lpaste.net" matches.
    if re.match(r"http://lpaste\.net/raw/", url):
        return url
    m = re.match(r"http://lpaste\.net/(.*)", url)
    if m is None:
        return None
    return raw_lpaste_url(m.group(1))
def get_url(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Prints one status line per request.  Returns ``None`` when the request
    fails or the server answers with anything other than HTTP 200.

    ``timeout`` is the number of seconds passed to ``requests.get``.
    """
    try:
        # Without a timeout, requests can block forever on a stalled server.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure and return None, as for a non-200 response,
        # so one unreachable paste does not abort the whole crawl.
        print("url: %s failed: %s" % (url, exc))
        return None
    # .get() avoids a KeyError when the response has no Content-Type header.
    content_type = r.headers.get('content-type')
    print("url: %s status: %s content-type: %s" % (url, r.status_code, content_type))
    if r.status_code == 200:
        return r.text
    return None
def check_imports(text, tried):
    """Recursively download the lpaste-annotated imports of ``text``.

    For each import returned by ``extract_imports`` whose source file does
    not already exist locally, the paste is downloaded, saved under the path
    given by ``path_for_module`` and then scanned for imports itself.

    ``tried`` is a set of module names and raw URLs already attempted; it is
    updated in place so that nothing is requested twice.
    """
    for modname, _base, paste_id in extract_imports(text):
        path = path_for_module(modname)
        if os.path.isfile(path):
            continue
        if modname in tried:
            print("already tried module: %s" % modname)
            continue
        raw_url = raw_lpaste_url(paste_id)
        if raw_url in tried:
            print("already tried url: %s" % raw_url)
            continue
        # Record the attempt before fetching so that a failed download is
        # not retried when the same import shows up again.
        tried.add(raw_url)
        tried.add(modname)
        print("getting module: %s from: %s" % (modname, raw_url))
        content = get_url(raw_url)
        if content:
            # Save the module, then follow its own imports.
            write_file_mkdirs(path, content)
            print("checking %s" % modname)
            check_imports(content, tried)
def fetch_url(url):
    """Download the paste at ``url`` together with the modules it imports.

    When the paste contains a ``module`` declaration it is saved under the
    path derived from that module name.  Its lpaste-annotated imports are
    then fetched recursively via ``check_imports``.
    """
    raw_url = convert_to_raw_url(url)
    if not raw_url:
        print("bad url: %s" % url)
        return
    content = get_url(raw_url)
    if not content:
        return
    # Raw string so ``\s`` and ``\w`` reach the regex engine untouched.
    m = re.search(r"^module\s+([\w.]+)", content, re.MULTILINE)
    if m:
        modname = m.group(1)
        write_file_mkdirs(path_for_module(modname), content)
        print("saved module: %s" % modname)
    else:
        # Say so instead of silently dropping the paste.
        print("no module declaration found in: %s" % raw_url)
    check_imports(content, set())
def test(path):
    """Run ``check_imports`` on the file at ``path`` and print what was tried."""
    seen = set()
    check_imports(slurp(path), seen)
    # Single-argument print() gives the same output on Python 2 and 3.
    print("seen: %s" % (seen,))
def main():
    """Fetch every paste URL given on the command line.

    Example URLs:
        http://lpaste.net/8982640331094753280
        http://lpaste.net/3576182129349885952
    """
    targets = sys.argv[1:]
    for target in targets:
        fetch_url(target)
# Run the fetcher only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment