Skip to content

Instantly share code, notes, and snippets.

@PythonicNinja
Last active December 28, 2015 00:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PythonicNinja/9b4952b6cbc17572c7db to your computer and use it in GitHub Desktop.
Save PythonicNinja/9b4952b6cbc17572c7db to your computer and use it in GitHub Desktop.
pydrill
from pydrill.client import PyDrill
# Client for the Drill instance expected at localhost:8047.
drill = PyDrill(host='localhost', port=8047)

# Show whether the client reports Drill as active before querying it.
print(drill.is_active())

# Ask for the first five rows of the Yelp review dataset.
yelp_reviews = drill.query('''
SELECT * FROM
`dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
LIMIT 5
''')

# Each result is indexed by column name.  Formatting a single string keeps the
# printed line identical under both Python 2 and Python 3.
for result in yelp_reviews:
    print('%s %s' % (result['type'], result['date']))
from pydrill.client import PyDrill
from pydrill_dsl import PyDrillDsl, fn
# Wrap a plain PyDrill client with the query-building DSL.
drill = PyDrill(host='localhost', port=8047)
dsl = PyDrillDsl(drill)

# Bind a queryable object to one JSON file in the `dfs.root` workspace.
Review = dsl.search(storage_plugin='dfs',
                    workspace='root',
                    path='/Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

print(Review.fields)
# [u'votes', u'user_id', u'review_id', u'text', u'business_id', u'stars', u'date', u'type']

# SELECT * FROM `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
# Selections are lazy by default; the query only runs once the result is
# iterated.
reviews = Review.select()
for review in reviews:
    # Print the type followed by the date, matching the sample output below.
    print('%s %s' % (review.type, review.date))
#review 2007-05-17
#review 2010-03-22
#review 2012-02-14
#review 2012-03-02
#review 2012-05-15
#...

# SELECT votes, type FROM `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json` LIMIT 5
reviews = Review.select(Review.votes, Review.type).limit(5)
# Run the query explicitly, without needing to iterate.
results = reviews.run()

# Count reviews per (stars, date) pair, ordered by that count descending.
query = (
    Review.select(Review.stars, Review.date, fn.COUNT(Review.review_id).alias('num_reviews'))
    .group_by(Review.stars, Review.date)
    .order_by(fn.COUNT(Review.review_id).alias('num_reviews').desc())
)
# Building on this, a .join method could be added to combine several data
# sources in a single query.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment