Skip to content

Instantly share code, notes, and snippets.

@walteryu
Last active September 29, 2018 03:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walteryu/9ab3a6e003699c9e331ec4a044b408bb to your computer and use it in GitHub Desktop.
Save walteryu/9ab3a6e003699c9e331ec4a044b408bb to your computer and use it in GitHub Desktop.
hw3_p2.py
'''
Author: Walter Yu
Course: CSCI E-63, Fall 2018
Assignment: HW3, Problem 2
References:
Slide 46 & 47, Lecture 3 Notes
SparkContext Tutorial: https://www.tutorialspoint.com/pyspark/pyspark_sparkcontext.htm
'''
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row, SparkSession
def people_parquet():
    """Load ``people.json``, save its name/age columns as Parquet, and show them.

    Runs inside its own SparkContext named "FirstSession" and performs, in order:

    1. Read ``people.json`` into a DataFrame; print its row count and contents.
    2. Write the ``name`` and ``age`` columns to ``nameage.parquet``.
    3. Read ``nameage.parquet`` back, expose it as the temporary view ``parqf``
       and print the result of ``select * from parqf``.

    The SparkContext is stopped before returning, even if a step raises, so a
    later session can create a fresh context.
    """
    sc = SparkContext(appName="FirstSession")
    try:
        sqlContext = SQLContext(sc)

        # Read in json file as dataframe:
        df = sqlContext.read.load("people.json", format="json")

        # Show contents of dataframe:
        print("First Session - Dataframe count:")
        print(df.count())
        print("")
        print("First Session - Contents of dataframe:")
        df.show()
        print("")

        # Query for name/age and save to parquet file.
        # mode="overwrite": the writer's default mode raises if the target
        # path already exists, which would make every re-run of this script
        # fail once nameage.parquet has been created.
        df.select("name", "age").write.save(
            "nameage.parquet", format="parquet", mode="overwrite"
        )
        parquetFile = sqlContext.read.parquet("nameage.parquet")

        # Register as temporary view (registerTempTable is the deprecated
        # spelling of the same operation):
        parquetFile.createOrReplaceTempView("parqf")
        sp = sqlContext.sql("select * from parqf")

        # Show contents of temporary view:
        print("First Session - Contents of names table:")
        sp.show()
    finally:
        # Always stop the context so the next session can start its own.
        sc.stop()
def view_parquet():
    """Re-open ``nameage.parquet`` in a fresh Spark session and display it.

    Runs inside its own SparkContext named "SecondSession": reads
    ``nameage.parquet``, exposes it as the temporary view ``parqf``, then
    prints the row count and contents of ``select * from parqf``.

    ``nameage.parquet`` must already exist; ``people_parquet`` writes it.
    The SparkContext is stopped before returning, even if a step raises.
    """
    sc = SparkContext(appName="SecondSession")
    try:
        sqlContext = SQLContext(sc)

        # Load the previously saved parquet data and expose it to SQL:
        parquetFile = sqlContext.read.parquet("nameage.parquet")
        parquetFile.createOrReplaceTempView("parqf")
        sp = sqlContext.sql("select * from parqf")

        # Show row count (label fixed: this is the second session):
        print("Second Session - Dataframe count:")
        print(sp.count())
        print("")

        # Show contents of temporary view:
        print("Second Session - Contents of names table:")
        sp.show()
    finally:
        # Always stop the context, even on failure.
        sc.stop()
if __name__ == "__main__":
    # Run the two sessions back to back: the first writes nameage.parquet,
    # the second reads it again from a brand-new SparkContext.
    for run_session in (people_parquet, view_parquet):
        run_session()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment