Skip to content

Instantly share code, notes, and snippets.

@nnewey
Created October 8, 2014 21:49
Show Gist options
  • Save nnewey/0c46d1c9fa0fa65132ac to your computer and use it in GitHub Desktop.
Save nnewey/0c46d1c9fa0fa65132ac to your computer and use it in GitHub Desktop.
Simple Disk Based Web-Cache
'''Very Simple Disk Based Web-Cache'''
import os
import sys
import urllib
import base64
# Default directory in which urlopen() keeps its cached pages.
CACHE_DIR = '/data/crawls/CACHE/'
def url2filename(url):
    '''Encode a URL as a cache file name.

    The URL is encoded with the URL-safe base64 alphabet, which uses
    '-' and '_' in place of '+' and '/', so the result never contains a
    path separator.  filename2url() reverses the encoding.

    url: the address to encode, as a byte string or a text string.
        Text is encoded as UTF-8 first so non-ASCII URLs are accepted.

    Returns the encoded name as a native str.

    NOTE: the name grows with the URL (about 4 characters per 3 bytes),
    so very long URLs may exceed the file-name limit of the file system.
    '''
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    encoded = base64.urlsafe_b64encode(url)
    # On Python 3 the encoder returns bytes; callers join the result
    # onto a str path, so hand back text there.
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded
def filename2url(filename):
    '''Decode a cache file name back into the URL it was made from.

    This is the inverse of url2filename().

    filename: the bare, URL-safe base64 encoded file name (without any
        directory part), as a byte string or a text string.

    Returns the URL as a native str.
    '''
    # Normalise to bytes first: the Python 2 decoder cannot handle a
    # unicode argument.
    if not isinstance(filename, bytes):
        filename = filename.encode('ascii')
    decoded = base64.urlsafe_b64decode(filename)
    # On Python 3 the decoder returns bytes; url2filename() encodes text
    # as UTF-8, so decode the same way to round-trip.
    if not isinstance(decoded, str):
        decoded = decoded.decode('utf-8')
    return decoded
def urlopen(url, cache_dir=CACHE_DIR):
    '''Simple disk-caching get.

    The URL is converted to a file name with url2filename() and looked
    up in cache_dir.  On a miss the page is downloaded and stored under
    that name; on a hit a notice is written to stderr and no request is
    made.

    url: the address to fetch.
    cache_dir: directory holding the cached pages.  It is created on a
        cache miss if it does not exist yet.

    Returns a file object, opened for binary reading, on the cached
    copy.  The caller is responsible for closing it.
    '''
    # os.path.join works whether or not cache_dir ends with a separator.
    filename = os.path.join(cache_dir, url2filename(url))

    if os.path.exists(filename):
        sys.stderr.write('Returning cached page at {}.\n'.format(filename))
        return open(filename, 'rb')

    # Make sure the directory exists before downloading, so a missing
    # directory does not cost a wasted request.
    if cache_dir and not os.path.isdir(cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Something else may have created it in the meantime.
            if not os.path.isdir(cache_dir):
                raise

    try:
        fetch = urllib.urlopen  # Python 2
    except AttributeError:
        from urllib.request import urlopen as fetch  # Python 3

    response = fetch(url)
    try:
        data = response.read()
    finally:
        response.close()

    # The payload is raw bytes: write and read it back in binary mode so
    # no newline translation or decoding is applied.
    with open(filename, 'wb') as fh:
        fh.write(data)
    return open(filename, 'rb')
def main(args):
    '''Print a URL and the length of its (possibly cached) content.

    args: object with a url attribute, such as the namespace returned by
        argparse.
    '''
    print(args.url)
    page = urlopen(args.url)
    # Close the handle even if reading fails, so it is not leaked.
    try:
        data = page.read()
    finally:
        page.close()
    print(len(data))
if __name__ == '__main__':
    # Command-line entry point: takes a single positional URL argument.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('url')
    main(cli.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment