Skip to content

Instantly share code, notes, and snippets.

@abelsonlive abelsonlive/cachelib.py
Last active Aug 29, 2015

Embed
What would you like to do?
Abstract Cache With Class
import hashlib
import s3plz
import os
import requests
class Cache(object):
    """
    An abstract class for caching the contents of urls.

    Each url is mapped to a file path built from the cache directory, the
    MD5 hex digest of the url and a file extension. Subclasses supply the
    storage backend by implementing `cache`, `exists` and `load`, and the
    fetching strategy by implementing `get_url`.
    """

    def __init__(self, directory='cache', **kw):
        """
        Set up the cache location and hand the options to `configure`.

        :param directory: prefix under which entries are stored; a trailing
            '/' is appended when missing.
        :param kw: optional settings. `extension` (default 'cache') is the
            suffix of every entry. `ttl` is stored on the instance but is
            not enforced anywhere in this class. All keyword arguments are
            also forwarded to `configure`.
        """
        if not directory.endswith('/'):
            directory += '/'
        self.directory = directory
        self.extension = kw.get('extension', 'cache')
        self.ttl = kw.get('ttl')
        self.configure(**kw)

    def url_to_filepath(self, url):
        """
        Return the path of the cache entry that belongs to `url`.

        :param url: the url, either as text or as bytes.
        :returns: "<directory><md5 hex digest of url>.<extension>"
        """
        # hashlib only accepts bytes; text urls are encoded first so that
        # hashing does not raise a TypeError on Python 3.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        digest = hashlib.md5(url).hexdigest()
        return "{}{}.{}".format(self.directory, digest, self.extension)

    def get(self, url, **kw):
        """
        Return the contents for `url`, fetching them only on a cache miss.

        On a miss the contents come from `get_url(url, **kw)` and are stored
        with `cache` before being returned; on a hit they come from `load`.
        """
        filepath = self.url_to_filepath(url)
        if self.exists(filepath):
            return self.load(filepath)
        contents = self.get_url(url, **kw)
        self.cache(contents, filepath)
        return contents

    def get_url(self, url, **kw):
        """Fetch and return the contents of `url`. Must be overridden."""
        raise NotImplementedError

    def configure(self, **kw):
        """Hook called at the end of `__init__`; does nothing by default."""
        pass

    def cache(self, contents, filepath):
        """Store `contents` under `filepath`. Must be overridden."""
        raise NotImplementedError

    def exists(self, filepath):
        """Tell whether an entry exists at `filepath`. Must be overridden."""
        raise NotImplementedError

    def load(self, filepath):
        """Return the contents stored at `filepath`. Must be overridden."""
        raise NotImplementedError
class LocalCache(Cache):
    """
    Local Cache

    Stores every cache entry as a file below `self.directory` on the local
    filesystem. Entries are written and read as bytes.
    """

    def configure(self, **kw):
        """Create the cache directory, including missing parents."""
        try:
            os.makedirs(self.directory)
        except OSError:
            # The directory may already exist (possibly created by another
            # process in the meantime); anything else is a real error.
            if not os.path.isdir(self.directory):
                raise

    def cache(self, contents, filepath):
        """
        Write `contents` to `filepath`.

        Text is encoded as UTF-8 because the file is opened in binary mode;
        bytes are written unchanged.
        """
        if not isinstance(contents, bytes):
            contents = contents.encode('utf-8')
        with open(filepath, 'wb') as f:
            f.write(contents)

    def exists(self, filepath):
        """Return True when a file exists at `filepath`."""
        return os.path.exists(filepath)

    def load(self, filepath):
        """
        Return the bytes stored at `filepath`.

        The file is read in binary mode to mirror how `cache` writes it, and
        is closed as soon as it has been read.
        """
        with open(filepath, 'rb') as f:
            return f.read()
class S3Cache(Cache):
    """
    S3 Cache

    Stores cache entries through the object returned by
    `s3plz.connect(self.directory)`; `self.directory` must be an S3 uri.
    """

    def configure(self, **kw):
        """
        Validate the cache location and open the s3plz connection.

        The keyword arguments are accepted but not read here.

        :raises ValueError: if `self.directory` is not an S3 uri according
            to `s3plz.utils.is_s3_uri`.
        """
        # An explicit raise instead of `assert`, so the check is still
        # performed when Python runs with -O.
        if not s3plz.utils.is_s3_uri(self.directory):
            raise ValueError(
                "S3Cache directory must be an S3 uri, got {!r}".format(
                    self.directory))
        self.s3 = s3plz.connect(self.directory)

    def cache(self, contents, filepath):
        """Store `contents` under `filepath` via `self.s3.put`."""
        self.s3.put(contents, filepath)

    def exists(self, filepath):
        """Return the result of `self.s3.exists` for `filepath`."""
        return self.s3.exists(filepath)

    def load(self, filepath):
        """Return what `self.s3.get` yields for `filepath`."""
        return self.s3.get(filepath)
@abelsonlive

This comment has been minimized.

Copy link
Owner Author

abelsonlive commented Jan 16, 2015

Usage

Inherit from S3Cache or LocalCache and override the get_url method.
This method takes a url and returns the html of its page.

from tor_tools import TorBrowser

class BrowserCache(LocalCache):
    """Local cache whose pages are fetched with a TorBrowser."""

    # Class attribute: a single browser shared by every instance.
    browser = TorBrowser()

    def get_url(self, url, **kw):
        """Open `url` in the browser and return its page source."""
        self.browser.get(url)
        return self.browser.page_source

Now, use it:

cache = BrowserCache('cache')
cache.get('http://www.google.com/')

Examples

class SessionCache(LocalCache):
    """Local cache that downloads pages through a requests session."""

    # Class attribute: one Session object shared by all instances.
    session = requests.Session()

    def get_url(self, url, **kw):
        """
        Request `url` with the shared session and return the response body.

        Extra keyword arguments are accepted but not used.
        """
        response = self.session.get(url)
        return response.content

class RequestS3Cache(S3Cache):
    """S3-backed cache that downloads pages with `requests.get`."""

    def get_url(self, url, **kw):
        """
        Request `url` and return the response body.

        Extra keyword arguments are accepted but not used.
        """
        response = requests.get(url)
        return response.content

url = 'http://www.example.com'

# Fetch the page through the local, session-based cache.
sc = SessionCache('cache')
local_contents = sc.get(url)

# Fetch the same page through the S3-backed cache.
rc = RequestS3Cache('s3://enigma-labs-data/columbia-journalism/equasis/')
contents = rc.get(url)

# Compare the two results; the first one is kept under its own name so the
# check is not a comparison of a value with itself.
assert(local_contents == contents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.