Skip to content

Instantly share code, notes, and snippets.

@krisalexander200
Created January 13, 2016 18:36
Show Gist options
  • Save krisalexander200/a9e693022b51f638e00a to your computer and use it in GitHub Desktop.
Save krisalexander200/a9e693022b51f638e00a to your computer and use it in GitHub Desktop.
Spark SQL with Python
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
import collections
# Configure Spark step by step: "local" master, app name as shown in the UI.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("RatingsHistogram")

# The SparkContext is the entry point for RDDs; the SQLContext built on top
# of it is the entry point for DataFrames and SQL queries.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
def mapper(line):
    """Parse one comma-separated text line into a ``Row``.

    The line is split on commas and its first four fields are read, in
    order, as ID, name, age and number of friends.

    Args:
        line: One line of text of the form
            ``<ID>,<name>,<age>,<numFriends>``.

    Returns:
        A ``Row`` with fields ``ID`` (int), ``name`` (text), ``age`` (int)
        and ``numFriends`` (int).

    Raises:
        IndexError: If the line has fewer than four comma-separated fields.
        ValueError: If ID, age or numFriends is not a valid integer.
    """
    fields = line.split(',')
    # Keep the name as text. Encoding it to UTF-8 produces ``bytes`` on
    # Python 3, which Spark's schema inference does not treat as a string
    # column; leaving it unencoded yields a string column on both
    # Python 2 and Python 3.
    return Row(
        ID=int(fields[0]),
        name=fields[1],
        age=int(fields[2]),
        numFriends=int(fields[3]),
    )
# Read the CSV as an RDD of text lines and parse every line into a Row.
lines = sc.textFile("/path/to/fakefriends.csv")
people = lines.map(mapper)

# Build a DataFrame from the Rows (the schema is inferred from their
# fields) and expose it to SQL under the table name "people".
schemaPeople = sqlContext.createDataFrame(people)
schemaPeople.registerTempTable("people")

# Select everyone aged 13 through 19, inclusive.
teenagers = sqlContext.sql(
    "SELECT * FROM people WHERE age >= 13 AND age <= 19"
)

# collect() brings the matching rows back to the driver so that they can
# be printed one per line.
for teen in teenagers.collect():
    print(teen)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment