wayback-python prototyping
"""Prototype of liveweb proxy.
"""
def get_recent_crawl_location(url):
"""Looks at redis to find the location of the recent crawl of the given URL.
"""
return redis.get(md5sum(url))
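
# A minimal sketch of the md5sum helper assumed above (not part of the
# original prototype): hash the URL with hashlib and use the hex digest as
# the redis key, so keys stay printable and fixed-length.
import hashlib

def md5sum(url):
    return hashlib.md5(url).hexdigest()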

def fetch(url):
    """Fetches a url from the live web.

    If the url was crawled very recently, returns that capture instead of
    fetching it again.
    """
    location = get_recent_crawl_location(url)
    if location:
        # The cached value points into a WARC file: "<filename> <offset> <size>"
        filename, offset, size = location.split()
        content = read_file(filename, offset, size)
    else:
        # warcproxy is a script written by Kenji.
        # We need to modify it to update redis with offset info.
        content = warcproxy.fetch(url)
    return content
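
# A sketch of the read_file helper used above -- an assumption, not part of
# the prototype: seek to the record offset in the WARC file and read the
# stored number of bytes. offset and size arrive as strings because they
# come out of redis via split().
def read_file(filename, offset, size):
    f = open(filename, "rb")
    try:
        f.seek(int(offset))
        return f.read(int(size))
    finally:
        f.close()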
"""query module prototype
"""
def fetch_url(url, timestamp):
if url.endswith('*'):
return fetch_prefix_matches(url, timestamp)
elif timestamp is None:
return fetch_wayback_captures(url)
else:
return fetch_wayback_content(url, timestamp)
def fetch_prefix_matches(prefix_url, timestamp=None):
assert prefix_url.endswith('*')
match_list = query_urldb(prefix_url)
if match_list and timestamp:
match_list = filter_by_time(match_list, timestamp)
return match_list
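
# A guess at filter_by_time -- purely an assumption: calendar-style queries
# pass a timestamp prefix like "2012*", so keep only captures whose own
# timestamp starts with that prefix. Assumes each capture carries a
# .timestamp attribute (hypothetical).
def filter_by_time(match_list, timestamp):
    prefix = timestamp.rstrip('*')
    return [capture for capture in match_list
            if capture.timestamp.startswith(prefix)]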

def fetch_wayback_content(url, timestamp):
    match_list = query_urldb(url)
    closest_match = get_closest_capture(match_list, timestamp)
    if closest_match is None:
        return None
    data = fetch_from_cluster(closest_match)
    if data.content_type == 'text/html':
        # only html pages need their links rewritten to point back at wayback
        data = rewrite_page(data, timestamp)
    return data
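
# A sketch of get_closest_capture -- an assumption about its behaviour: pick
# the capture whose 14-digit wayback timestamp (YYYYMMDDhhmmss) is
# numerically closest to the requested one. Assumes captures carry a
# .timestamp string (hypothetical).
def get_closest_capture(match_list, timestamp):
    if not match_list:
        return None
    target = int(timestamp)
    return min(match_list,
               key=lambda capture: abs(int(capture.timestamp) - target))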

def fetch_wayback_captures(url):
    """returns a list of timestamps"""
    match_list = query_urldb(url)
    return get_timestamps(match_list)
"""Robots.txt cache.
Recently used robots.txt files are cached in redis.
"""
from robotparser import RobotFileParser
import query_urldb as urldb
import liveweb
def get_robotstxt(host):
"""Returns the contents of robots.txt file for given host from the wayback
machine.
"""
txt = redis.get(host)
if not txt:
url = "http://%s/robots.txt" % host
timestamp = "latest-timestamp-here"
txt = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
redis.set(host, txt)
return txt
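
# If the cache should only hold recently used entries, one option (an
# assumption, not what the prototype does) is to store each value with a TTL
# so stale robots.txt files expire on their own. setex follows the
# StrictRedis argument order (name, time, value).
ROBOTS_TTL = 24 * 60 * 60  # one day, arbitrary

def cache_robotstxt(host, txt):
    redis.setex(host, ROBOTS_TTL, txt)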

def is_allowed(url):
    """Returns True if robots.txt allows the wayback machine to fetch/display
    the given URL.
    """
    txt = get_robotstxt(get_host(url))
    parser = RobotFileParser()
    parser.parse(txt.splitlines())
    # ia_archiver is the user-agent the wayback machine honors
    return parser.can_fetch("ia_archiver", url)
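
# A sketch of the get_host helper assumed above (not in the prototype):
# just the netloc of the parsed URL.
from urlparse import urlparse

def get_host(url):
    return urlparse(url).netloc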
"""Prototype of wayback webapp.
"""
import robots
import query_urldb as urldb
import liveweb
def index():
return render_template("index.html")
def calendar(timestamp, url):
assert "*" in timestamp
if not robots.allowed(url):
return render_template("norobots.html", url)
data = urldb.query(url, timestamp)
if data:
return render_template("calendar.html", url, data)
else:
return render_template("not_archived.html", url)
def page(timestamp, url):
assert "*" not in timestamp
if not robots.allowed(url):
return render_template("norobots.html", url)
data = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
status, mimetype, body = parse_response(data)
# Handle non 200 status
if status != 200:
return render_template("non200.html", status, url)
# convert links and insert wayback header for html documents
if mimetype == "text/html":
body = rewrite_page(body, timestamp)
return body
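
# A rough sketch of how a wayback-style path such as /web/<timestamp>/<url>
# might be dispatched to the handlers above. The path layout and regex are
# assumptions for illustration, not part of the prototype.
import re

_PATH_RE = re.compile(r"^/web/([0-9*]+)/(.+)$")

def dispatch(path):
    m = _PATH_RE.match(path)
    if m is None:
        return index()
    timestamp, url = m.groups()
    if "*" in timestamp:
        return calendar(timestamp, url)  # e.g. /web/2012*/http://example.com/
    return page(timestamp, url)          # e.g. /web/20120321030200/http://example.com/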