Skip to content

Instantly share code, notes, and snippets.

@nramirezuy
Created September 2, 2014 14:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save nramirezuy/e306c7e849a54e1909a7 to your computer and use it in GitHub Desktop.
Save nramirezuy/e306c7e849a54e1909a7 to your computer and use it in GitHub Desktop.
import collections, json
from urllib import urlretrieve
from urlparse import urljoin
from csv import DictReader, reader as csv_reader
import scrapinghub
from project.settings import SH_APIKEY
# Base location of the feed files; the feed names handed to _download() are
# resolved against this URL with urljoin().
BASE_URL = 'http://files.scrapinghub.com.s3.amazonaws.com/feeds/'
def _download(url, _fname=None):
    """Retrieve a feed file and return the path of the local copy.

    ``url`` is resolved against ``BASE_URL`` with ``urljoin``.  ``_fname`` is
    passed straight to ``urlretrieve`` as the destination file name; when it
    is ``None`` ``urlretrieve`` picks the location itself.
    """
    feed_url = urljoin(BASE_URL, url)
    # urlretrieve returns (local_path, headers); only the path is needed.
    local_path, _headers = urlretrieve(feed_url, _fname)
    return local_path
def _parse(_file, ofmt):
    """Return an iterator over the records of an open feed file.

    The feed format is taken from the extension of ``_file.name``:

    * ``csv`` -- ``ofmt='dict'`` returns a ``csv.DictReader``; ``ofmt='list'``
      returns a plain ``csv.reader`` positioned after the header row.
    * ``jl`` -- one JSON document per line; only ``ofmt='dict'`` is
      supported and every line is decoded lazily with ``json.loads``.

    Args:
        _file: open file object exposing a ``name`` attribute.
        ofmt: requested record format, ``'dict'`` or ``'list'``.

    Raises:
        ValueError: if the extension is neither ``csv`` nor ``jl``, or if
            ``ofmt`` is not supported for the detected format.
    """
    fext = _file.name.split('.')[-1]

    if fext == 'csv':
        if ofmt == 'dict':
            return DictReader(_file)
        if ofmt == 'list':
            rows = csv_reader(_file)
            # Drop the header row.  The builtin next() with a default works
            # on Python 2.6+ and 3, and an empty file yields no rows instead
            # of leaking StopIteration into the calling generator.
            next(rows, None)
            return rows
        raise ValueError('Not supported ofmt')

    if fext == 'jl':
        if ofmt == 'dict':
            return (json.loads(line) for line in _file)
        raise ValueError('Not supported ofmt')

    raise ValueError('Not supported extension')
try:
    from collections.abc import Iterable as _Iterable, Mapping as _Mapping
except ImportError:  # Python 2 keeps the ABCs directly in ``collections``
    from collections import Iterable as _Iterable, Mapping as _Mapping

try:
    _TEXT_TYPE = unicode  # Python 2
except NameError:
    _TEXT_TYPE = str  # Python 3


def _to_unicode(data, encoding='utf-8'):
    """Recursively decode every byte string found in ``data``.

    * Text that is already unicode is returned unchanged.  Decoding it again
      would, on Python 2, implicitly ASCII-encode it first and fail on any
      non-ASCII character.
    * Byte strings are decoded with ``encoding``.
    * Mappings are rebuilt as a plain ``dict`` with keys and values converted.
    * Any other iterable is rebuilt with its own type from the converted
      elements, so that type's constructor must accept an iterable
      (``list``, ``tuple``, ``set`` ...).
    * Everything else is returned as is.

    Args:
        data: value to convert.
        encoding: codec used to decode byte strings.

    Raises:
        UnicodeDecodeError: if a byte string is not valid in ``encoding``.
    """
    # Text must be tested before the Iterable branch: strings are iterable
    # on Python 3 and would otherwise recurse character by character.
    if isinstance(data, _TEXT_TYPE):
        return data
    if isinstance(data, bytes):
        return data.decode(encoding)
    if isinstance(data, _Mapping):
        # Each (key, value) pair is a tuple and is converted by the
        # iterable branch below.
        return dict(_to_unicode(pair, encoding) for pair in data.items())
    if isinstance(data, _Iterable):
        return type(data)(_to_unicode(element, encoding) for element in data)
    return data
def ondemand(ofmt='dict', _url='feed_1.csv', _fname=None):
    """Yield the rows of the feed at ``_url`` (``feed_1.csv`` by default).

    The feed is downloaded with ``_download`` when iteration starts, parsed
    by ``_parse`` in the requested ``ofmt`` and every row is passed through
    ``_to_unicode`` before being yielded.  ``_fname`` is the optional local
    file name handed to ``_download``.
    """
    local_path = _download(_url, _fname)
    with open(local_path) as feed:
        for record in _parse(feed, ofmt):
            yield _to_unicode(record)
def companies(ofmt='dict', _url='feed_2.csv', _fname=None):
    """Yield the rows of the feed at ``_url`` (``feed_2.csv`` by default).

    The feed is downloaded with ``_download`` when iteration starts, parsed
    by ``_parse`` in the requested ``ofmt`` and every row is passed through
    ``_to_unicode`` before being yielded.  ``_fname`` is the optional local
    file name handed to ``_download``.
    """
    local_path = _download(_url, _fname)
    with open(local_path) as feed:
        for record in _parse(feed, ofmt):
            yield _to_unicode(record)
def items(key):
    """Yield the items of the Scrapinghub job identified by ``key``.

    The part of ``key`` before the first ``/`` is used to index the
    connection (presumably the project id -- confirm against the
    scrapinghub client), and the full ``key`` selects the job.
    """
    connection = scrapinghub.Connection(SH_APIKEY)
    project_part = key.split('/')[0]
    job = connection[project_part].job(key)
    for entry in job.items():
        yield entry
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment