Created
September 2, 2014 14:59
-
-
Save nramirezuy/e306c7e849a54e1909a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections, json | |
from urllib import urlretrieve | |
from urlparse import urljoin | |
from csv import DictReader, reader as csv_reader | |
import scrapinghub | |
from project.settings import SH_APIKEY | |
# Root URL that relative feed names (e.g. 'feed_1.csv') are joined onto
# before being downloaded.
BASE_URL = 'http://files.scrapinghub.com.s3.amazonaws.com/feeds/'
def _download(url, _fname=None):
    """Download a feed and return the local path it was saved to.

    ``url`` is resolved against ``BASE_URL`` with ``urljoin`` and fetched
    with ``urlretrieve``. ``_fname`` is handed to ``urlretrieve`` as the
    destination filename; the filename ``urlretrieve`` reports back is
    returned.
    """
    feed_url = urljoin(BASE_URL, url)
    # urlretrieve returns (filename, headers); only the filename is needed.
    local_path, _headers = urlretrieve(feed_url, _fname)
    return local_path
def _parse(_file, ofmt): | |
fext = _file.name.split('.')[-1] | |
if fext == 'csv': | |
if ofmt == 'dict': | |
return DictReader(_file) | |
elif ofmt == 'list': | |
reader = csv_reader(_file) | |
reader.next() # Avoid headers | |
return reader | |
else: | |
raise Exception('Not supported ofmt') | |
elif fext == 'jl': | |
if ofmt == 'dict': | |
return (json.loads(l) for l in _file) | |
else: | |
raise Exception('Not supported ofmt') | |
else: | |
raise Exception('Not supported extension') | |
def _to_unicode(data, encoding='utf-8'): | |
if isinstance(data, basestring): | |
return data.decode(encoding) | |
elif isinstance(data, collections.Mapping): | |
return dict(_to_unicode(d, encoding) for d in data.items()) | |
elif isinstance(data, collections.Iterable): | |
return type(data)(_to_unicode(d, encoding) for d in data) | |
else: | |
return data | |
def ondemand(ofmt='dict', _url='feed_1.csv', _fname=None):
    """Yield the rows of the feed at ``_url`` (default ``'feed_1.csv'``).

    The feed is downloaded with ``_download`` (``_fname`` is the optional
    destination path), parsed with ``_parse`` in the ``ofmt`` output format,
    and every row is passed through ``_to_unicode`` before being yielded.
    Being a generator, nothing is downloaded until iteration starts.
    """
    local_path = _download(_url, _fname)
    with open(local_path) as feed:
        records = _parse(feed, ofmt)
        for record in records:
            yield _to_unicode(record)
def companies(ofmt='dict', _url='feed_2.csv', _fname=None):
    """Yield the rows of the feed at ``_url`` (default ``'feed_2.csv'``).

    The feed is downloaded with ``_download`` (``_fname`` is the optional
    destination path), parsed with ``_parse`` in the ``ofmt`` output format,
    and every row is passed through ``_to_unicode`` before being yielded.
    Being a generator, nothing is downloaded until iteration starts.
    """
    local_path = _download(_url, _fname)
    with open(local_path) as feed:
        records = _parse(feed, ofmt)
        for record in records:
            yield _to_unicode(record)
def items(key):
    """Yield every item of the Scrapinghub job identified by ``key``.

    The text before the first ``'/'`` in ``key`` selects the project on a
    ``scrapinghub.Connection`` built with ``SH_APIKEY``; the full ``key`` is
    then passed to that project's ``job()``.
    NOTE(review): presumably ``key`` looks like ``'<project>/<spider>/<job>'``
    -- confirm against the scrapinghub client documentation.
    """
    connection = scrapinghub.Connection(SH_APIKEY)
    project_id = key.split('/')[0]
    job = connection[project_id].job(key)
    for entry in job.items():
        yield entry
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment