-
-
Save bootstraponline/74655e5238c9ab427919 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections, json | |
from urllib import urlretrieve | |
from urlparse import urljoin | |
from csv import DictReader, reader as csv_reader | |
import scrapinghub | |
from project.settings import SH_APIKEY | |
# Root URL that relative feed file names are resolved against (see urljoin
# usage in _download).
BASE_URL = 'http://files.scrapinghub.com.s3.amazonaws.com/feeds/'
def _download(url, _fname=None):
    """Retrieve a feed file and return the path of the local copy.

    ``url`` is resolved against ``BASE_URL`` with ``urljoin``.  ``_fname`` is
    handed to ``urlretrieve`` as the destination file name; when it is
    ``None``, ``urlretrieve`` chooses the location itself.
    """
    source = urljoin(BASE_URL, url)
    # urlretrieve returns (local_filename, headers); only the path is needed.
    local_path, _headers = urlretrieve(source, _fname)
    return local_path
def _parse(_file, ofmt):
    """Return an iterator over the records stored in an open feed file.

    The format is chosen from the text after the last ``.`` in
    ``_file.name``:

    * ``csv`` -- ``ofmt='dict'`` returns a ``DictReader`` over the file;
      ``ofmt='list'`` returns a plain csv reader with the first (header)
      row already consumed.
    * ``jl`` -- one JSON document per line; only ``ofmt='dict'`` is
      supported and each non-blank line is decoded with ``json.loads``.

    Raises:
        ValueError: for an unknown extension, or for an ``ofmt`` the format
            does not support.  ``ValueError`` derives from ``Exception``, so
            handlers written for the former bare ``Exception`` still match.
    """
    extension = _file.name.split('.')[-1]
    if extension == 'csv':
        if ofmt == 'dict':
            return DictReader(_file)
        if ofmt == 'list':
            rows = csv_reader(_file)
            # Skip the header row.  The builtin next() works on Python 2.6+
            # and 3 alike, and the default keeps an empty file from raising
            # StopIteration here.
            next(rows, None)
            return rows
        raise ValueError('Not supported ofmt')
    if extension == 'jl':
        if ofmt == 'dict':
            # Blank lines (e.g. a trailing newline) are not valid JSON, so
            # they are skipped instead of being passed to json.loads.
            return (json.loads(line) for line in _file if line.strip())
        raise ValueError('Not supported ofmt')
    raise ValueError('Not supported extension')
try:  # Python 3.3+: the container ABCs live in collections.abc
    from collections.abc import Iterable as _Iterable, Mapping as _Mapping
except ImportError:  # Python 2
    from collections import Iterable as _Iterable, Mapping as _Mapping

_TEXT_TYPE = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3


def _to_unicode(data, encoding='utf-8'):
    """Recursively decode every byte string inside ``data`` to text.

    * byte strings are decoded with ``encoding``;
    * text that is already unicode is returned unchanged;
    * mappings are rebuilt as a plain ``dict`` with converted keys and
      values;
    * any other iterable is rebuilt by calling its own type on the converted
      elements, so that type must accept an iterable in its constructor;
    * everything else (numbers, ``None``, ...) is returned as is.
    """
    if isinstance(data, bytes):
        return data.decode(encoding)
    if isinstance(data, _TEXT_TYPE):
        # Already text.  On Python 2, calling .decode() on a unicode object
        # first encodes it as ASCII and fails on any non-ASCII character.
        # This check must also precede the iterable branch: text is itself
        # iterable and would otherwise recurse forever.
        return data
    if isinstance(data, _Mapping):
        # Each (key, value) pair is a tuple and goes through the iterable
        # branch below, coming back as a converted 2-tuple.
        return dict(_to_unicode(pair, encoding) for pair in data.items())
    if isinstance(data, _Iterable):
        return type(data)(_to_unicode(element, encoding) for element in data)
    return data
def _iter_feed(url, fname, ofmt):
    """Download one feed file and lazily yield its rows converted to unicode."""
    with open(_download(url, fname)) as feed:
        for row in _parse(feed, ofmt):
            yield _to_unicode(row)


def ondemand(ofmt='dict', _url='feed_1.csv', _fname=None):
    """Iterate over the rows of a feed file, ``feed_1.csv`` by default.

    Args:
        ofmt: row format passed to ``_parse`` (``'dict'`` or ``'list'``).
        _url: feed file name, downloaded through ``_download``.
        _fname: optional local destination passed to ``_download``.

    Returns:
        A generator of rows run through ``_to_unicode``.  Nothing is
        downloaded until the first row is requested.
    """
    return _iter_feed(_url, _fname, ofmt)


def companies(ofmt='dict', _url='feed_2.csv', _fname=None):
    """Iterate over the rows of a feed file, ``feed_2.csv`` by default.

    Args:
        ofmt: row format passed to ``_parse`` (``'dict'`` or ``'list'``).
        _url: feed file name, downloaded through ``_download``.
        _fname: optional local destination passed to ``_download``.

    Returns:
        A generator of rows run through ``_to_unicode``.  Nothing is
        downloaded until the first row is requested.
    """
    return _iter_feed(_url, _fname, ofmt)
def items(key):
    """Yield every item of the Scrapinghub job identified by ``key``.

    The text before the first ``/`` in ``key`` selects the project on the
    connection; the full ``key`` is then passed to that project's ``job()``.
    NOTE(review): presumably ``key`` has the form ``project/spider/job`` --
    confirm against callers.
    """
    connection = scrapinghub.Connection(SH_APIKEY)
    project_id = key.split('/', 1)[0]
    job = connection[project_id].job(key)
    for item in job.items():
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment