@tribela
Created February 3, 2016 06:36
Earth Reader: crawl old feed entries using archive.org
from __future__ import print_function

import datetime
import re
import sys

import requests

from libearth.crawler import crawl, CrawlError
from libearth.parser.autodiscovery import autodiscovery, FeedUrlNotFoundError
from libearth.repository import from_url
from libearth.session import Session
from libearth.stage import Stage


def get_rss_url(url):
    """Resolve a page URL to its feed URL via libearth's autodiscovery."""
    document = requests.get(url).text
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError as e:
        print(e, file=sys.stderr)
        sys.exit(1)
    feed_url = feed_links[0].url
    return feed_url


def importer(stage):
    with stage:
        subs = stage.subscriptions
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in subs.recursive_subscriptions)
        for sub in subs.recursive_subscriptions:
            print(sub.label)
    # import_feed() enters the stage itself, so run it outside the
    # with-block above.
    for feed_uri, feed_id in feed_map.items():
        import_feed(stage, feed_uri, feed_id)


def import_feed(stage, url, feed_id):
    feed_url = get_rss_url(url)
    print(feed_url)
    # Archived snapshot links look like /web/YYYYMMDDhhmmss/<feed_url>;
    # the timestamp is always 14 digits.  Escape the feed URL so any
    # regex metacharacters in it (e.g. '?') are matched literally.
    links = re.compile(r'/web/\d{14}/' + re.escape(feed_url))
    urls = []
    current_year = datetime.datetime.now().year
    for year in range(1996, current_year + 1):
        # web.archive.org lists every snapshot of a URL captured in a
        # given year at /web/<year>*/<url>.
        document = requests.get(
            'http://web.archive.org/web/{}*/{}'.format(
                year, feed_url
            )).text
        urls += links.findall(document)
    urls = ['http://web.archive.org' + u for u in urls]
    print(len(urls))
    # Crawl the archived snapshots with a pool of 20 workers.
    generator = crawl(urls, 20)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)


def main():
    repo_url = sys.argv[1]
    repo = from_url(repo_url)
    session = Session()
    stage = Stage(session, repo)
    importer(stage)


if __name__ == '__main__':
    main()
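
Since main() reads the repository location from sys.argv[1] and passes it to libearth's from_url(), the script takes a single repository URL argument. A hypothetical invocation (the script filename and repository path below are placeholders, not from the original gist) might look like:

    python crawl_old_entries.py file:///home/user/.earthreader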