Skip to content

Instantly share code, notes, and snippets.

@anandology — forked from a gist by @rajbot
Created Mar 21, 2012

What would you like to do?
wayback-python prototyping
"""Prototype of liveweb proxy.
def get_recent_crawl_location(url):
    """Look in memcache for the location of the most recent crawl of *url*.

    The cache is keyed by the md5 of the URL. Returns a location string of
    the form "filename offset size" (see fetch()), or None when no recent
    crawl is recorded.
    """
    return memcache_client.get(md5sum(url))
def fetch(url):
    """Fetch a URL from the live web.

    If the URL was crawled very recently, returns the stored capture
    instead of fetching it again.
    """
    # Bug fix: the original called a misspelled name
    # ("get_recent_crawl_locaiton") and then unconditionally overwrote the
    # cached content with a fresh warcproxy fetch.
    location = get_recent_crawl_location(url)
    if location:
        filename, offset, size = location.split()
        return read_file(filename, offset, size)

    # warcproxy is a script written by Kenji.
    # We need to modify it to update memcache with offset info.
    return warcproxy.fetch(url)
def fetch_url(url, timestamp):
    """Dispatch a wayback lookup.

    URLs ending in '*' are prefix queries; everything else is an exact
    capture lookup.
    """
    is_prefix_query = url.endswith('*')
    if is_prefix_query:
        return fetch_prefix_matches(url, timestamp)
    return fetch_wayback_content(url, timestamp)
def fetch_prefix_matches(prefix_url, timestamp=None):
    """Return the captures whose URL starts with *prefix_url*.

    prefix_url must end with '*'. When *timestamp* is given, the matches
    are narrowed to that time.
    """
    assert prefix_url.endswith('*')
    matches = query_urldb(prefix_url)
    if timestamp and matches:
        matches = filter_by_time(matches, timestamp)
    return matches
def fetch_wayback_content(url, timestamp):
    """Fetch the archived capture of *url* closest to *timestamp*.

    Returns None when the URL has no capture close enough.
    """
    captures = query_urldb(url)
    closest = get_closest_capture(captures, timestamp)
    if closest is None:
        return None
    return fetch_from_cluster(closest)
"""Robots.txt cache.
Recently used robots.txt files are cached in memcache.
from robotparser import RobotFileParser
import query_urldb as urldb
import liveweb
def get_robotstxt(host):
    """Return the contents of the robots.txt file for *host*.

    Looks in memcache first; on a miss, takes the wayback capture of the
    robots.txt (falling back to a live fetch) and caches it by host.
    """
    txt = memcache.get(host)
    if txt:
        return txt
    url = "http://%s/robots.txt" % host
    # NOTE(review): placeholder — the latest capture timestamp still needs
    # to be resolved here.
    timestamp = "latest-timestamp-here"
    txt = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
    memcache.put(host, txt)
    return txt
def is_allowed(url):
    """Return True if robots.txt allows the wayback machine ("ia_archiver")
    to fetch/display the given URL.
    """
    txt = get_robotstxt(get_host(url))
    parser = RobotFileParser()
    # Bug fix: the fetched robots.txt must be parsed before can_fetch()
    # is meaningful — the original never fed txt to the parser.
    parser.parse(txt.splitlines())
    return parser.can_fetch("ia_archiver", url)
"""Prototype of wayback webapp.
import robots
import query_urldb as urldb
import liveweb
def index():
    """Render the wayback home page."""
    return render_template("index.html")
def calendar(timestamp, url):
    """Render the calendar of captures of *url* matching *timestamp*.

    timestamp must be a wildcard pattern (contains '*').
    """
    assert "*" in timestamp
    # Consistency fix: the robots module defines is_allowed(), not allowed().
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)
    data = urldb.query(url, timestamp)
    if data:
        return render_template("calendar.html", url, data)
    return render_template("not_archived.html", url)
def page(timestamp, url):
    """Render the archived capture of *url* closest to the exact *timestamp*.

    Falls back to a live-web fetch when no capture exists.
    """
    assert "*" not in timestamp
    # Consistency fix: the robots module defines is_allowed(), not allowed().
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)
    data = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
    status, mimetype, body = parse_response(data)
    # Handle non 200 status
    if status != 200:
        return render_template("non200.html", status, url)
    # convert links and insert wayback header for html documents
    if mimetype == "text/html":
        body = rewrite_page(body, timestamp)
    return body
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.