# TODO
version: '2.2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.8.12
    container_name: elasticsearch
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      # bootstrap.memory_lock=true requires unlimited memlock
      memlock:
        soft: -1
        hard: -1
//String Manipulation
//orders is an RDD of CSV lines: order_id,order_date,order_customer_id,order_status
val str = orders.first
val a = str.split(",")
val orderId = a(0).toInt
a(1).contains("2013") //check whether the order date falls in 2013
val orderDate = a(1)
orderDate.substring(0, 10) //date part (yyyy-MM-dd)
orderDate.substring(5, 7) //month (MM)
orderDate.substring(11) //time part
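For illustration, a minimal sketch of how those substring calls behave on a hypothetical order record (the sample value below is assumed for demonstration, not taken from the data):

//Hypothetical sample record in the assumed format order_id,order_date,order_customer_id,order_status
val sample = "1,2013-07-25 00:00:00.0,11599,CLOSED"
val sampleDate = sample.split(",")(1)
sampleDate.substring(0, 10) // "2013-07-25" -> date part
sampleDate.substring(5, 7)  // "07"         -> month
sampleDate.substring(11)    // "00:00:00.0" -> time part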
//Use one of the following, depending on the environment (local file system vs. HDFS)
val inputPath = "/Users/itversity/Research/data/wordcount.txt" //local
//val inputPath = "/public/randomtextwriter/part-m-00000" //HDFS
val outputPath = "/Users/itversity/Research/data/wordcount" //local
//val outputPath = "/user/dgadiraju/wordcount" //HDFS
//Make sure outputPath does not exist for this example (see the cleanup sketch after the word count)
sc.textFile(inputPath).
  flatMap(_.split(" ")). //split each line into words
  map((_, 1)). //pair each word with a count of 1
  reduceByKey(_ + _). //add up the counts per word
  take(100).
  foreach(println)
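The note about outputPath matters once the counts are saved with saveAsTextFile(outputPath), which fails if the directory already exists. A minimal cleanup sketch using Hadoop's FileSystem API, assuming it runs in spark-shell where sc and outputPath are in scope:

//Sketch: remove outputPath if it already exists, so saveAsTextFile does not fail
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(sc.hadoopConfiguration)
if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true)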
/**
* Created by itversity on 17/03/17.
* This is primarily to get the word count on the data received from
* nc -lk 19999
* Make sure build.sbt is updated with the dependency -
* libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.6.2"
* Create jar, ship the jar, start nc, and then use spark-submit
* spark-submit --class SparkStreamingWordCount --master yarn --conf spark.ui.port=14562 retail_2.10-1.0.jar
*/
import org.apache.spark.SparkConf
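Only the first import of the program survives above. A minimal sketch of the streaming word count the header describes; the batch interval and the socket host are assumptions, and the object name matches the spark-submit command:

import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamingWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkStreamingWordCount")
    val ssc = new StreamingContext(conf, Seconds(10)) //10-second batches (assumed interval)
    //read lines from the socket opened by: nc -lk 19999 (localhost is an assumption)
    val lines = ssc.socketTextStream("localhost", 19999)
    val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}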
//Use one of the following, depending on the environment (HDFS vs. local file system)
val path = "/public/retail_db" //HDFS
//val path = "/Users/itversity/Research/data/retail_db" //local
val rdd = sc.textFile(path + "/orders")
//reduce to the record with the smallest customer id (field 2)
rdd.reduce((agg, ele) => {
  if (agg.split(",")(2).toInt < ele.split(",")(2).toInt) agg else ele
})
rdd.top(2) //top 2 records by natural (string) ordering
//top 5 records by customer id, descending
rdd.takeOrdered(5)(Ordering[Int].reverse.on(x => x.split(",")(2).toInt)).foreach(println)
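The reduce above can also be written with min and the same custom Ordering idiom used for takeOrdered; a minimal equivalent sketch:

//Equivalent to the reduce: record with the smallest customer id
rdd.min()(Ordering[Int].on(x => x.split(",")(2).toInt))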
//This is just a script, not a program
//Execute these commands as part of Spark Shell
//Writing as a sequence file (key: NullWritable, value: the record text)
import org.apache.hadoop.io._
val products = sc.textFile("/public/retail_db/products")
products.map(rec => (NullWritable.get(), rec)).
  saveAsSequenceFile("/user/dgadiraju/products_seq")
//Reading sequence files
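The read side is cut off above. A minimal sketch of reading the pairs back with sc.sequenceFile, assuming the path written in the previous step; Text is what saveAsSequenceFile uses for string values:

import org.apache.hadoop.io.{NullWritable, Text}
val productsSeq = sc.sequenceFile("/user/dgadiraju/products_seq", classOf[NullWritable], classOf[Text])
//convert the Text value to String right away, since Hadoop reuses Writable objects
productsSeq.map(rec => rec._2.toString).take(10).foreach(println)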