Preparing for Presentation
val shakespeare = sc.textFile("hdfs:///user/root/shakespeare")
val notExecuted = shakespeare.map { x => x.toLowerCase } // transformations are lazy: nothing has run yet
notExecuted.toDebugString // show the lineage of the (still unexecuted) RDD
val firstLine = shakespeare.first
val words = shakespeare.flatMap( x => x.split("\\W+") )
words.takeSample(true, 10)
val cleanedWords = words.filter { x => !x.isEmpty }.map { x => x.toLowerCase }
val paired = cleanedWords.map { x => (x, 1) }
val wordCounts = paired.reduceByKey { (x, y) => x + y }
val cachedCounts = wordCounts.cache() // mark for caching; materialized on the first action
cachedCounts.take(10)
val sortedByNumber = cachedCounts.sortBy(x => x._2, false) // There are other ways
sortedByNumber.saveAsTextFile("hdfs:///user/root/wc_sorted")
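// One of the "other ways" to sort noted above, as a small sketch (not in the
// original): swap each pair to (count, word) and sort by key instead.
val sortedAlt = cachedCounts.map(_.swap).sortByKey(false)
sortedAlt.take(10)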
import org.apache.spark.sql._
val sqlContext = new SQLContext(sc) // named sqlContext here; ssc is kept for the StreamingContext below
import sqlContext._
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true") // treat Parquet binary columns as strings (Impala/Hive compatibility)
val people = sqlContext.jsonFile("hdfs:///tmp/people.json")
people.registerTempTable("people")
sqlContext.registerFunction("l", (x: String) => x.length) // register a UDF callable from SQL
sqlContext.sql("select l(name), name from people").collect
sqlContext.sql("select age, name from people order by l(name) desc").collect
val parquetCustomers = sqlContext.parquetFile("/user/hive/warehouse/tpcds_parquet.db/customer")
parquetCustomers.registerTempTable("customer")
ssc.sql("select count(*) from customer").collect()
ssc.sql("select max(c_birth_year), min(c_birth_year) from customer").collect()
ssc.sql("select count(*), c_birth_year from customer group by c_birth_year order by c_birth_year").collect().foreach(println)
ssc.sql("select count(*), c_birth_year from customer where c_birth_year is not null group by c_birth_year order by c_birth_year").collect().foreach(println)
ssc.sql("select count(*) as users_num, c_birth_year from customer where c_birth_year is not null group by c_birth_year order by users_num desc limit 1").collect()
val points = sc.textFile("hdfs:///user/root/kmeans_data.txt").map { line => line.split(' ').map(_.toDouble) }
import org.apache.spark.mllib.linalg.Vectors
val vectors = points map { x => Vectors.dense(x) }
val examples = vectors.cache() // cache the training set; KMeans makes multiple passes over it
import org.apache.spark.mllib.clustering.KMeans
val clusters = KMeans.train(examples, 2, 20) // k = 2 clusters, 20 iterations
clusters.clusterCenters
examples.map { x => (x, clusters.predict(x)) }.collect.foreach(println)
val costs = (1 to 6) map { n =>
  val cl = KMeans.train(examples, n, 20)
  cl.computeCost(examples) // within-cluster sum of squared distances for k = n
}
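// A small follow-up sketch (not in the original): pair each k with its cost,
// making the "elbow" easier to spot when choosing the number of clusters.
(1 to 6).zip(costs).foreach(println)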
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter.TwitterUtils
val ssc = new StreamingContext(sc, Seconds(1)) // 1-second batches, reusing the shell's SparkContext
val tweets = TwitterUtils.createStream(ssc, None)
val hashTags = tweets.flatMap(status => status.getHashtagEntities.map(_.getText)) // extract hashtag text from each tweet
hashTags.saveAsTextFiles("hdfs://...") // saveAsHadoopFiles needs a key-value DStream; text files suit a DStream[String]
ssc.start() // nothing runs until the streaming context is started
1. What's wrong with MapReduce
2. What is an RDD
- Public API
- Spark details (partitions)
- Implementation details
3. Transformations http://spark-summit.org/wp-content/uploads/2013/10/McDonough-spark-tutorial_spark-summit-2013.pdf
4. Actions http://spark-summit.org/wp-content/uploads/2013/10/McDonough-spark-tutorial_spark-summit-2013.pdf
5. Demo demo01
6. SQL and demo
7. MLlib and demo
8. GraphX demo (see the sketch after this outline)
9. Streaming demo?
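The outline's GraphX item has no code elsewhere in this gist; below is a minimal sketch, assuming a hypothetical edge-list file at hdfs:///user/root/graph_edges.txt with one "srcId dstId" pair per line.
import org.apache.spark.graphx.GraphLoader
val graph = GraphLoader.edgeListFile(sc, "hdfs:///user/root/graph_edges.txt")
graph.vertices.count
graph.edges.count
val ranks = graph.pageRank(0.001).vertices // run PageRank to convergence within the given tolerance
ranks.sortBy(_._2, false).take(10).foreach(println) // top 10 vertices by rank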
ssh root@<CDH>
export HADOOP_CONF_DIR=/etc/hadoop/conf
current/bin/spark-shell --master yarn-client --num-executors 2 --driver-memory 512m --executor-memory 512m --executor-cores 1