Skip to content

Instantly share code, notes, and snippets.

View koushikmln's full-sized avatar

Koushik M.L.N koushikmln

View GitHub Profile
@koushikmln
koushikmln / KafkaMaprBuild.sbt
Created May 25, 2018 05:27
Dependencies for Streaming Pipelines Demo on MapR Cluster
name := "KafkaWorkshopMapr"
version := "0.1"
scalaVersion := "2.11.11"
// Single source of truth for the Spark version used by all three modules.
val sparkVersion = "2.2.1"
// Use %% so sbt appends the Scala binary suffix (_2.11) derived from
// scalaVersion, instead of hard-coding it in each artifact id. This keeps
// the dependencies in sync if scalaVersion is ever bumped.
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-streaming" % sparkVersion
@koushikmln
koushikmln / logstash.config
Last active May 26, 2018 09:23
Logstash Config File for Parsing HTTP Logs
# Tail the log-generator's access log and tag events as apache_access.
input {
file {
path => ["/opt/gen_logs/logs/access.log"]
type => "apache_access"
}
}
# Parse each line with the stock combined-Apache-log grok pattern; anything
# after the standard fields is captured into extra_fields.
# NOTE(review): this block is truncated in this excerpt — the grok match
# array and the filter section are not closed here.
filter {
grok {
match => [
"message" , "%{COMBINEDAPACHELOG}+%{GREEDYDATA:extra_fields}",
# Submit the CountryVisitCount streaming job to YARN on the MapR cluster.
#   --conf spark.ui.port=4926  : fixed UI port (avoids collisions on a shared gateway)
#   --jars $(echo ... | tr)    : builds the comma-separated jar list Spark expects
#                                from every jar under /external_jars
#   final argument "prod"      : selects the prod.* block of the application config
/opt/mapr/spark/spark-2.2.1/bin/spark-submit \
--class CountryVisitCount \
--master yarn \
--conf spark.ui.port=4926 \
--jars $(echo /external_jars/*.jar | tr ' ' ',') \
kafkaworkshopmapr_2.11-0.1.jar prod
// Load typesafe-config and select the environment block (e.g. "dev"/"prod")
// named by the first command-line argument.
val conf = ConfigFactory.load
val envProps: Config = conf.getConfig(args(0))
// Streaming context on YARN; the micro-batch interval (seconds) comes from
// the selected environment's "window" key.
val sparkConf = new SparkConf().setMaster("yarn").setAppName("SiteTraffic")
val streamingContext = new StreamingContext(sparkConf, Seconds(envProps.getInt("window")))
// Broadcast read-only values once so every executor gets a cached copy.
val broadcastConfig = streamingContext.sparkContext.broadcast(envProps)
val topicsSet = Set("retail_logs")
// NOTE(review): the timestamp is captured once at driver startup and then
// broadcast — it does NOT update per batch; confirm that is intended.
val now = Calendar.getInstance().getTime()
val timestamp = streamingContext.sparkContext.broadcast(now)
// Kafka consumer settings (this Map literal continues past this excerpt).
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> envProps.getString("bootstrap.server"),
/**
 * Build an HBase [[Connection]] from zookeeper settings in the given config.
 * Reads "zookeeper.quorum" and "zookeeper.port" from `config`; the znode
 * parent is fixed to the unsecured default used by this cluster.
 * The caller owns the returned connection and is responsible for closing it.
 * (Closing brace of this method lies outside this excerpt.)
 */
def getHbaseConnection(config: Config): Connection ={
//Create Hbase Configuration Object
val hBaseConf: Configuration = HBaseConfiguration.create()
hBaseConf.set("hbase.zookeeper.quorum", config.getString("zookeeper.quorum"))
hBaseConf.set("hbase.zookeeper.property.clientPort", config.getString("zookeeper.port"))
// Unsecured-cluster znode root; cluster runs in distributed (non-standalone) mode.
hBaseConf.set("zookeeper.znode.parent","/hbase-unsecure")
hBaseConf.set("hbase.cluster.distributed","true")
//Establish Connection
val connection = ConnectionFactory.createConnection(hBaseConf)
connection
# Local development environment (single-node Kafka/ZK on default ports).
dev.zookeeper = localhost:2181
dev.bootstrap.server = localhost:9092
dev.zookeeper.quorum = localhost
dev.zookeeper.port = 2181
# Streaming micro-batch window in seconds (read via envProps.getInt("window")).
dev.window = 20
# Production MapR cluster — MapR's bundled zookeeper listens on 5181, not 2181.
prod.zookeeper = mapr02.itversity.com:5181,mapr03.itversity.com:5181,mapr04.itversity.com:5181
prod.zookeeper.quorum = mapr02.itversity.com,mapr03.itversity.com,mapr04.itversity.com
prod.zookeeper.port = 5181
prod.bootstrap.server = mapr02.itversity.com:9092,mapr03.itversity.com:9092,mapr04.itversity.com:9092
@koushikmln
koushikmln / lambda.py
Last active July 6, 2018 05:28
Lambda Functions Examples
# A lambda is an anonymous, single-expression function; here bound to a name.
fun = lambda x : x*x
fun(2) # Will return 2*2 i.e. 4

def sumOfInt(n, fun):
    """Return fun(0) + fun(1) + ... + fun(n).

    n: inclusive upper bound; for n < 0 the range is empty, so the result is 0.
    fun: callable applied to each integer in [0, n].
    """
    # Use the builtin sum over a generator instead of a manual accumulator:
    # the original shadowed the builtin name `sum` with a local variable.
    return sum(fun(i) for i in range(n + 1))

sumOfInt(5, lambda x: x*x) # Will return sum of squares till 5 i.e. 55
@koushikmln
koushikmln / examples.py
Last active July 6, 2018 05:42
Python Map Reduce Filter Examples
# map() takes a function and an iterable, applies the function to every
# element, and returns a lazy map object (wrap in list() to materialize).
list1 = [1,2,3,4]
list(map(lambda x:x*x,list1)) # [1, 4, 9, 16]
# reduce() folds the sequence left-to-right, combining two items at a time
# (conceptually an inverted tree of pairwise combinations).
# In Python 3 it lives in functools, not the builtins.
from functools import reduce
list2 = [1,2,3,4,5,6,7,8,9,10]
reduce(lambda x,y: x+y, list2) # 55
# Rebind list2 for the next example (which continues past this excerpt).
list2 = [1,2,3,4,5,6,7,8,9,10]
@koushikmln
koushikmln / OrderItems.py
Last active July 8, 2018 03:52
Process Order Items CSV to get Order Id, Sub-Total Tuples, Total Amount by Order Id and Revenue Per Order Collection
#Problem Statement 1
#Get (order_id, sub_total) tuple from order items csv using map function.
def getOrderItemTuples(order_items):
    """Map each order-item CSV line to an (order_id, sub_total) tuple.

    order_items: iterable of comma-separated lines where field index 1 is
    the order id (int) and field index 4 is the line-item subtotal (float).
    Returns a list of (int, float) tuples, one per input line.
    """
    def to_tuple(line):
        # Split each line once; the original called line.split(",") twice.
        fields = line.split(",")
        return (int(fields[1]), float(fields[4]))
    return [to_tuple(line) for line in order_items]
# Load the order-items extract and show the first 20 rows as tuples.
# NOTE(review): reads a hard-coded local path and never closes the file
# handle explicitly — relies on interpreter cleanup; fine for a demo.
order_items = open("/data/retail_db/order_items/part-00000","r").read().splitlines()
getOrderItemTuples(order_items[:20])
#Problem Statement 2
#Get the total amount for a particular order using map, reduce, filter.
@koushikmln
koushikmln / SparkExample.py
Created July 6, 2018 12:48
Spark Example to Give Count of Orders By Status
# Count of orders by status: map each CSV line to (status, 1), then sum per key.
rdd = sc.textFile("/public/retail_db/orders/part-00000")
# The original put `.reduceByKey(...)` on a new line with no continuation,
# which is a Python SyntaxError; wrapping the chain in parentheses fixes it.
status_count = (
    rdd.map(lambda x: (x.split(",")[3], 1))
       .reduceByKey(lambda a, b: a + b)
       .collect()
)