Skip to content

Instantly share code, notes, and snippets.

@PythonicNinja PythonicNinja/pydrill.py
Last active Dec 28, 2015

Embed
What would you like to do?
pydrill
from pydrill.client import PyDrill
# Example: run a raw SQL query through the PyDrill client.
# Client for the Drill instance at localhost:8047.
drill = PyDrill(host='localhost', port=8047)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(drill.is_active())

# Ask for the first five records of the Yelp review JSON file via the
# `dfs.root` workspace.
yelp_reviews = drill.query('''
SELECT * FROM
`dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
LIMIT 5
''')

# Rows are indexed by column name. Formatting into one string keeps the
# "type date" output identical on Python 2 and Python 3 (a two-argument
# print(...) would print a tuple on Python 2).
for result in yelp_reviews:
    print('%s %s' % (result['type'], result['date']))
from pydrill.client import PyDrill
from pydrill_dsl import PyDrillDsl, fn
# Example: query the same dataset through the PyDrillDsl wrapper.
drill = PyDrill(host='localhost', port=8047)
dsl = PyDrillDsl(drill)

# Handle for the review JSON file in the `dfs.root` workspace.
Review = dsl.search(
    storage_plugin='dfs',
    workspace='root',
    path='/Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(Review.fields)
# [u'votes', u'user_id', u'review_id', u'text', u'business_id', u'stars', u'date', u'type']

# SELECT * FROM
#   `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
# By default lazy, but when iterated on it will run query.
reviews = Review.select()
for review in reviews:
    # Print the review's type followed by its date, as in the sample output
    # below (the original printed `review.type` twice).
    print('%s %s' % (review.type, review.date))
#review 2007-05-17
#review 2010-03-22
#review 2012-02-14
#review 2012-03-02
#review 2012-05-15
#...
# Example: project two columns and cap the result size.
# SELECT votes, type FROM
#   `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
# LIMIT 5
_votes_and_type = Review.select(Review.votes, Review.type)
reviews = _votes_and_type.limit(5)

# explicitly run query without need to iterate.
results = reviews.run()
# Example: count reviews per (stars, date) pair, ordered by the count,
# descending. The query is only built here; nothing in this snippet runs it.
query = (
    Review
    .select(
        Review.stars,
        Review.date,
        fn.COUNT(Review.review_id).alias('num_reviews'),
    )
    .group_by(Review.stars, Review.date)
    .order_by(fn.COUNT(Review.review_id).alias('num_reviews').desc())
)
# Based on that we could also have join methods for joining several data sources via .join method.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.