# krisalexander200/spark-with-sql.py

## spark-with-sql.py
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
import collections

# Configure Spark with the "local" master and build the SparkContext and
# SQLContext that the rest of the script uses.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("RatingsHistogram")
)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

def mapper(line):
    """Turn one comma-separated text line into a ``Row``.

    The first four fields are read positionally as ID, name, age and
    numFriends; any further fields are ignored. ID, age and numFriends are
    converted with ``int``; the name is UTF-8 encoded.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if ID, age or numFriends is not a valid integer.
    """
    fields = line.split(',')
    person_id = int(fields[0])
    encoded_name = fields[1].encode("utf-8")
    age = int(fields[2])
    friend_count = int(fields[3])
    return Row(ID=person_id, name=encoded_name, age=age, numFriends=friend_count)

# Read the CSV as plain text lines and convert each line with mapper().
lines = sc.textFile("/path/to/fakefriends.csv")
people = lines.map(mapper)

# Build a DataFrame from the rows (schema inferred) and make it queryable
# from SQL under the table name "people".
schemaPeople = sqlContext.createDataFrame(people)
schemaPeople.registerTempTable("people")

# Keep only the people aged 13 through 19, inclusive.
teenagers = sqlContext.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

# collect() brings the matching rows back to the driver for printing.
for person in teenagers.collect():
    print(person)
	from pyspark import SparkConf, SparkContext
	from pyspark.sql import SQLContext, Row
	import collections

	# Configure Spark with the "local" master and build the SparkContext and
	# SQLContext used by the statements below.
	# NOTE(review): the same setup already appears earlier in this file;
	# PySpark normally allows only one active SparkContext per process, so
	# confirm this repeated section is intended.
	conf = (
		SparkConf()
		.setMaster("local")
		.setAppName("RatingsHistogram")
	)
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)

	def mapper(line):
		"""Turn one comma-separated text line into a ``Row``.

		The first four fields are read positionally as ID, name, age and
		numFriends; any further fields are ignored. ID, age and numFriends
		are converted with ``int``; the name is UTF-8 encoded.

		Raises:
			IndexError: if the line has fewer than four fields.
			ValueError: if ID, age or numFriends is not a valid integer.
		"""
		# The body has to be indented under the ``def``; at the same level
		# as the ``def`` line it is a syntax error.
		li = line.split(',')
		return Row( ID=int(li[0]), name=li[1].encode("utf-8"), age=int(li[2]), numFriends=int(li[3]) )

	# Read the CSV as plain text lines and convert each line with mapper().
	lines = sc.textFile("/path/to/fakefriends.csv")
	people = lines.map(mapper)

	# Infer the schema, and register the DataFrame as a table.
	schemaPeople = sqlContext.createDataFrame(people)
	schemaPeople.registerTempTable("people")

	# SQL can be run over DataFrames that have been registered as a table.
	teenagers = sqlContext.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

	# sql() returns a DataFrame; collect() brings its rows back to the driver.
	for teen in teenagers.collect():
		# The loop body must be indented under the ``for`` to be valid Python.
		print(teen)