Skip to content

Instantly share code, notes, and snippets.

@noman798
Created August 24, 2018 02:38
Show Gist options
  • Save noman798/bb2d8219be7c709e81344ca839d074e9 to your computer and use it in GitHub Desktop.
Save noman798/bb2d8219be7c709e81344ca839d074e9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import rocksdb
import requests
from urllib.parse import urlparse
from os.path import join
from json import loads
# Root directory under which _db() opens one RocksDB database per name.
DB_PATH = "/mnt/data/tmp/spider/"
# Already-opened database handles, keyed by the name passed to _db().
_DB = {}
def _db(name):
    """Return the RocksDB handle for *name*, opening it on first use.

    Handles are stored in the module-level ``_DB`` dict and reused on later
    calls. The database lives at ``join(DB_PATH, name)`` and is created if
    it does not exist yet.
    """
    handle = _DB.get(name)
    if handle:
        return handle

    options = rocksdb.Options(create_if_missing=True)
    handle = rocksdb.DB(join(DB_PATH, name), options)
    _DB[name] = handle
    return handle
def get(url, *args, **kwds):
    """Return the raw body of *url*, served from the on-disk cache if present.

    The cache is a RocksDB database selected by the URL's network location
    (``urlparse(url).netloc``); the key is the UTF-8 encoded URL. On a cache
    miss the URL is fetched with ``requests.get`` and the response body is
    stored before being returned.

    Args:
        url: The URL to fetch.
        *args: Extra positional arguments forwarded to ``requests.get``.
        **kwds: Extra keyword arguments forwarded to ``requests.get``.
            ``timeout`` defaults to 60 seconds when not supplied.

    Returns:
        The response body as bytes.

    Note:
        The HTTP status code is not inspected, so the body of an error
        response is cached and returned like any other.
    """
    db = _db(urlparse(url).netloc)
    key = url.encode('utf-8')

    cached = db.get(key)
    # Compare against None rather than testing truthiness: an empty body
    # (b"") is a legitimate cached value and must not trigger a re-fetch.
    if cached is not None:
        return cached

    # Only needed when we actually hit the network.
    kwds.setdefault('timeout', 60)
    body = requests.get(url, *args, **kwds).content
    db.put(key, body)
    return body
def get_json(url, *args, **kwds):
    """Fetch *url* via ``get`` and return its body decoded with ``json.loads``.

    All arguments are passed through to ``get`` unchanged.
    """
    return loads(get(url, *args, **kwds))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment