@dfdeshom
Created November 17, 2011 19:42
import redis, tldextract, json

def get_spider_info_for_url(url):
    h = redis.Redis()
    extracted = tldextract.extract(url)
    wholedomain = ".".join([extracted.domain, extracted.tld])
    wholesubdomain = ".".join([extracted.subdomain, extracted.domain, extracted.tld])
    # look for the full subdomain first
    info = h.hget('spider_info', wholesubdomain)
    if info:
        return json.loads(info)
    # else, fall back to the registered domain
    info = h.hget('spider_info', wholedomain)
    if info:
        return json.loads(info)
    return None

print 'spiders'
print get_spider_info_for_url('http://decoded.nationaljournal.com')
print get_spider_info_for_url('http://www.nbcsandiego.com')
print get_spider_info_for_url('http://salon.com')
{u'query_params': None, u'domain': u'decoded.nationaljournal.com', u'last_updated': u'2011-11-17 13:33:07.549885'}
{u'query_params': None, u'domain': u'www.nbcsandiego.com', u'last_updated': u'2011-11-17 13:33:07.549885'}
{u'query_params': None, u'domain': u'salon.com', u'last_updated': u'2011-11-17 13:33:07.549885'}
{u'query_params': [u'articleid'], u'domain': u'govexec.com', u'last_updated': u'2011-11-17 13:20:34.119415'}
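For reference, each value in the 'spider_info' hash is a JSON blob shaped like the records printed above. A minimal sketch of how such an entry might be written to Redis; the store_spider_info helper and its field values are illustrative assumptions, not part of the original gist:

import json, datetime
import redis

def store_spider_info(domain, query_params=None):
    # Write one entry into the 'spider_info' hash, keyed by domain,
    # with a JSON value shaped like the records read above.
    h = redis.Redis()
    info = {'domain': domain,
            'query_params': query_params,
            'last_updated': str(datetime.datetime.now())}
    h.hset('spider_info', domain, json.dumps(info))

store_spider_info('salon.com')
store_spider_info('govexec.com', query_params=['articleid'])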