Skip to content

Instantly share code, notes, and snippets.

@elliottcordo
Last active August 29, 2015 14:08
Show Gist options
  • Save elliottcordo/f5267a75defff468b757 to your computer and use it in GitHub Desktop.
Save elliottcordo/f5267a75defff468b757 to your computer and use it in GitHub Desktop.
yelp pyspark example
#MASTER=yarn-client /home/hadoop/spark/bin/pyspark
from pyspark.sql import SQLContext, Row
# Spark SQL entry point wrapping the SparkContext `sc`.
# NOTE(review): `sc` is not defined in this script; presumably it is the
# context pre-created by the pyspark shell -- confirm before running elsewhere.
sqlContext = SQLContext(sc)

# ------------------------------------------------
# Users: comma-delimited text file -> Row objects -> temp table "users".
lines = sc.textFile(
    "s3://caserta-bucket1/yelp/in/users/users.txt"
)
# Naive split on "," -- every value stays a string, including `age`.
parts = lines.map(lambda line: line.split(","))
users = parts.map(
    lambda fields: Row(
        id=fields[0],
        name=fields[1],
        gender=fields[2],
        age=fields[3]
    )
)
schemaUsers = sqlContext.inferSchema(users)
schemaUsers.registerTempTable("users")
# Sanity check: row count of the freshly registered table. Left as a bare
# expression so an interactive shell echoes the result.
user_count_query = sqlContext.sql("select count(1) from users")
user_count_query.collect()

# ------------------------------------------------
# Reviews: loaded straight from a JSON file -> temp table "reviews".
reviews = sqlContext.jsonFile(
    "s3://caserta-bucket1/yelp-academic-dataset/yelp_academic_dataset_review.json"
)
reviews.printSchema()
reviews.registerTempTable("reviews")
review_count_query = sqlContext.sql("select count(1) from reviews")
review_count_query.collect()
# Peek at ten rows, pulling `cool` and `useful` out of the `votes` column
# via dotted references.
vote_sample_query = sqlContext.sql(
    "select user_id, votes.cool as cool, votes.useful as useful from reviews"
)
vote_sample_query.take(10)

# ------------------------------------------------
# Join the two temp tables: number of reviews per user gender.
gender_join_sql = """
select gender,
count(1) as cnt
from reviews r
join users u on u.id = r.user_id
group by gender"""
genders_summary = sqlContext.sql(gender_join_sql)
genders_summary.collect()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment