Quick Intro to Spark SQL
// data files can be downloaded at https://s3.amazonaws.com/hw-sandbox/tutorial1/infochimps_dataset_4778_download_16677-csv.zip
import org.apache.spark.SparkContext
import org.apache.spark.sql._
val sc = new SparkContext("spark://master:7077", "Spark SQL Intro")
val sqlContext = new SQLContext(sc)
import sqlContext.createSchemaRDD
/* Spark SQL needs case classes (or classes implementing the Product interface) so it can infer a table schema from them */
case class DividendRecord(exchange: String, symbol: String, date: String, dividends: Double)
def parseDividend(row: Array[String]) = new DividendRecord(row(0), row(1), row(2), row(3).toDouble)
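// Illustrative only: parse a single, made-up CSV line with the helper above
// (the values here are not from the dataset, just a shape check)
val sampleDividend = parseDividend("NYSE,AZZ,2009-01-14,0.25".split(","))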
// Create RDD with file contents
val dividends = sc.textFile("hdfs://master:9000/user/hdfs/NYSE_dividends_A.csv")
// Filter the header out of the dataset, split each row on ',', and build an RDD of DividendRecord objects
val div_schema = dividends.filter(!_.startsWith("exchange")).map(_.split(",")).map(parseDividend(_))
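// Optional sanity check: peek at a few parsed records before registering the table
// (take() only pulls a handful of rows back to the driver)
div_schema.take(3).foreach(println)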
//Register the rdd as a table
div_schema.registerAsTable("div")
// Try a query
val result = sqlContext.sql("SELECT * FROM div").collect()
val result = sqlContext.sql("SELECT * FROM div where exchange='NYSE'").collect()
// Read second file
val daily_prices = sc.textFile("hdfs://master:9000/user/hdfs/NYSE_daily_prices_A.csv")
case class DailyPricesRecord(exchange: String, symbol: String, date: String, price_open: Double, price_high: Double, price_low: Double, price_close: Double, stock_volume: Double, price_adj_close: Double)
def parseDailyPrices(row: Array[String]) = new DailyPricesRecord(row(0), row(1), row(2), row(3).toDouble, row(4).toDouble, row(5).toDouble, row(6).toDouble, row(7).toDouble, row(8).toDouble)
val daily_prices_schema = daily_prices.filter(!_.startsWith("exchange")).map(_.split(",")).map(parseDailyPrices(_))
daily_prices_schema.registerAsTable("daily_prices")
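// Optional: print the schema Spark SQL inferred from the case class
// (printSchema is assumed to be reachable here via the implicit SchemaRDD conversion imported above)
daily_prices_schema.printSchema()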
val daily_prices_nyse = sqlContext.sql("select * from daily_prices where exchange = 'NYSE'").collect()
val join = sqlContext.sql("select * from div join daily_prices on div.symbol=daily_prices.symbol LIMIT 10").collect()
val group_by = sqlContext.sql("select dividends, count(*) from div where symbol='AZZ' group by dividends").collect()
val group_by = sqlContext.sql("select exchange, count(*) from div where group by exchange").collect()
val group_by = sqlContext.sql("select symbol, count(*) from div group by symbol").collect()
val join = sqlContext.sql("select * from div join daily_prices on div.symbol=daily_prices.symbol and div.date=daily_prices.date LIMIT 50").collect()