# TODO
version: '2.2'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:6.8.12
    container_name: elasticsearch
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      # bootstrap.memory_lock=true requires unlimited memlock
      memlock:
        soft: -1
        hard: -1
//String Manipulation
//orders is an RDD of CSV lines: order_id,order_date,order_customer_id,order_status
val str = orders.first
val a = str.split(",")
val orderId = a(0).toInt
a(1).contains("2013") //check whether the order date falls in 2013
val orderDate = a(1)
orderDate.substring(0, 10) //date part (yyyy-MM-dd)
orderDate.substring(5, 7) //month (MM)
orderDate.substring(11) //time part
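For illustration, a minimal sketch of how those substring calls behave on a hypothetical order record (the sample value below is assumed for demonstration, not taken from the data):

//Hypothetical sample record in the assumed format order_id,order_date,order_customer_id,order_status
val sample = "1,2013-07-25 00:00:00.0,11599,CLOSED"
val sampleDate = sample.split(",")(1)
sampleDate.substring(0, 10) // "2013-07-25" -> date part
sampleDate.substring(5, 7)  // "07"         -> month
sampleDate.substring(11)    // "00:00:00.0" -> time part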
//Use one of the following, depending on the environment (local file system vs. HDFS)
val inputPath = "/Users/itversity/Research/data/wordcount.txt" //local
//val inputPath = "/public/randomtextwriter/part-m-00000" //HDFS
val outputPath = "/Users/itversity/Research/data/wordcount" //local
//val outputPath = "/user/dgadiraju/wordcount" //HDFS
//Make sure outputPath does not exist for this example (see the cleanup sketch after the word count)
sc.textFile(inputPath).
  flatMap(_.split(" ")). //split each line into words
  map((_, 1)). //pair each word with a count of 1
  reduceByKey(_ + _). //add up the counts per word
  take(100).
  foreach(println)
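The note about outputPath matters once the counts are saved with saveAsTextFile(outputPath), which fails if the directory already exists. A minimal cleanup sketch using Hadoop's FileSystem API, assuming it runs in spark-shell where sc and outputPath are in scope:

//Sketch: remove outputPath if it already exists, so saveAsTextFile does not fail
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(sc.hadoopConfiguration)
if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true)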
/**
* Created by itversity on 17/03/17.
* This is primarily to get the word count on the data received from
* nc -lk 19999
* Make sure build.sbt is updated with the dependency -
* libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.6.2"
* Create jar, ship the jar, start nc, and then use spark-submit
* spark-submit --class SparkStreamingWordCount --master yarn --conf spark.ui.port=14562 retail_2.10-1.0.jar
*/
import org.apache.spark.SparkConf
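Only the first import of the program survives above. A minimal sketch of the streaming word count the header describes; the batch interval and the socket host are assumptions, and the object name matches the spark-submit command:

import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamingWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkStreamingWordCount")
    val ssc = new StreamingContext(conf, Seconds(10)) //10-second batches (assumed interval)
    //read lines from the socket opened by: nc -lk 19999 (localhost is an assumption)
    val lines = ssc.socketTextStream("localhost", 19999)
    val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}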
//Use one of the following, depending on the environment (HDFS vs. local file system)
val path = "/public/retail_db" //HDFS
//val path = "/Users/itversity/Research/data/retail_db" //local
val rdd = sc.textFile(path + "/orders")
//reduce to the record with the smallest customer id (field 2)
rdd.reduce((agg, ele) => {
  if (agg.split(",")(2).toInt < ele.split(",")(2).toInt) agg else ele
})
rdd.top(2) //top 2 records by natural (string) ordering
//top 5 records by customer id, descending
rdd.takeOrdered(5)(Ordering[Int].reverse.on(x => x.split(",")(2).toInt)).foreach(println)
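The reduce above can also be written with min and the same custom Ordering idiom used for takeOrdered; a minimal equivalent sketch:

//Equivalent to the reduce: record with the smallest customer id
rdd.min()(Ordering[Int].on(x => x.split(",")(2).toInt))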
//This is just a script, not a program
//Execute these commands as part of Spark Shell
//Writing as a sequence file (key: NullWritable, value: the record text)
import org.apache.hadoop.io._
val products = sc.textFile("/public/retail_db/products")
products.map(rec => (NullWritable.get(), rec)).
  saveAsSequenceFile("/user/dgadiraju/products_seq")
//Reading sequence files
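The read side is cut off above. A minimal sketch of reading the pairs back with sc.sequenceFile, assuming the path written in the previous step; Text is what saveAsSequenceFile uses for string values:

import org.apache.hadoop.io.{NullWritable, Text}
val productsSeq = sc.sequenceFile("/user/dgadiraju/products_seq", classOf[NullWritable], classOf[Text])
//convert the Text value to String right away, since Hadoop reuses Writable objects
productsSeq.map(rec => rec._2.toString).take(10).foreach(println)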