Sandeep Giri girisandeep

## flume_twitter_2_records
{
   "in_reply_to_status_id_str":null,
   "in_reply_to_status_id":null,
   "created_at":"Tue Jan 09 05:13:54 +0000 2018",
   "in_reply_to_user_id_str":null,
   "source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone<\/a>",
   "retweet_count":0,
   "retweeted":false,
   "geo":null,
   "filter_level":"low",

## saving-loading-sequencefiles.scala
//Save it
// Build a small (String, Double) pair RDD split over two slices and write it
// out as a Hadoop SequenceFile under "pysequencefile1".
var rdd = sc.parallelize(
  Seq("key1" -> 1.0, "key2" -> 2.0, "key3" -> 3.0),
  numSlices = 2
)
rdd.saveAsSequenceFile("pysequencefile1")

//Load it
import org.apache.hadoop.io.DoubleWritable
import org.apache.hadoop.io.Text

val myrdd = sc.sequenceFile(
"pysequencefile1",

## csv-parsing-ex3.scala
//CSV parsing program using opencsv library
//spark-shell --packages net.sf.opencsv:opencsv:2.3
//Or
//Add this to sbt: libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.3"

import au.com.bytecode.opencsv.CSVParser
var linesRdd = sc.textFile("/data/spark/temps.csv");
def parseCSV(itr:Iterator[String]):Iterator[Array[String]] = {
    val parser = new CSVParser(',')
    for(line <- itr)

## csv-parsing-ex2.scala
//CSV parsing program using opencsv library
//spark-shell --packages net.sf.opencsv:opencsv:2.3
//Or
//Add this to sbt: libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.3"

import au.com.bytecode.opencsv.CSVParser

var a = sc.textFile("/data/spark/temps.csv");
var p = a.map(
line => {

## csv-parsing-ex1.scala
// Load the temperatures CSV and split every line into its comma-separated fields.
// Note: this is a plain split on ','; quoted fields that contain commas are not handled.
var lines = sc.textFile("/data/spark/temps.csv")
// A limit of -1 keeps trailing empty fields, so a row such as "a,b," still yields
// three columns instead of being silently shortened to two.
var recordsRDD = lines.map(_.split(",", -1))
// Preview the first ten parsed records.
recordsRDD.take(10)

## custom-accumulator-v2.scala

class MyComplex(var x: Int, var y: Int) extends Serializable{
  def reset(): Unit = {
    x = 0
    y = 0
  }
  def add(p:MyComplex): MyComplex = {
    x = x + p.x
    y = y + p.y
    return this

## broadcast-example.scala
// Stop words to be shared with the executors.
var commonWords = Array(
  "a", "an", "the", "of", "at", "is", "am", "are", "this", "that",
  "at", "in", "or", "and", "or", "not", "be", "for", "to", "it"
)
// Lookup table keyed by stop word; the value 1 is only a presence marker.
val commonWordsMap = collection.mutable.Map[String, Int]()
commonWords.foreach(stopWord => commonWordsMap.update(stopWord, 1))
// Broadcast the lookup table through the SparkContext.
var commonWordsBC = sc.broadcast(commonWordsMap)

// Input text for the word count.
var file = sc.textFile("/data/mr/wordcount/input/big.txt")
def toWords(line:String):Array[String] = {
    var words = line.split(" ")

## custom-accum-v1.scala
class MyComplex(var x: Int, var y: Int) extends Serializable{
  def reset(): Unit = {
    x = 0
    y = 0
  }
  def add(p:MyComplex): MyComplex = {
    x = x + p.x
    y = y + p.y
    return this
  }

## accumulator-example.scala
// Word-splitting job that also counts blank input lines through an accumulator.
sc.setLogLevel("ERROR")
var file = sc.textFile("/data/mr/wordcount/input/")
// longAccumulator is the AccumulatorV2-based replacement for sc.accumulator(0),
// which has been deprecated since Spark 2.0.
var numBlankLines = sc.longAccumulator("numBlankLines")

/**
 * Splits a line into space-separated words and bumps `numBlankLines` for
 * every empty line.
 *
 * @param line one line of input text
 * @return the tokens produced by splitting `line` on a single space
 */
def toWords(line: String): Array[String] = {
  if (line.isEmpty) numBlankLines.add(1)
  line.split(" ")
}

// flatMap is lazy: numBlankLines.value is only populated after an action runs on `words`.
var words = file.flatMap(toWords)

## spark-custom-partitioner.scala

import org.apache.spark.Partitioner
/**
 * Partitioner that routes String keys by their first character: keys whose
 * upper-cased first character sorts after 'J' go to partition 1, every other
 * key goes to partition 0.
 *
 * @param numPartitions number of partitions reported to Spark; only the
 *                      indices 0 and 1 are ever produced.
 */
class TwoPartsPartitioner(override val numPartitions: Int) extends Partitioner {

  /**
   * Picks the partition for a key.
   *
   * @param key the record key; must be a String
   * @return 0 or 1, and never 1 when fewer than two partitions were requested
   * @throws IllegalArgumentException if `key` is null or not a String
   */
  def getPartition(key: Any): Int = key match {
    case s: String =>
      // An empty key has no first character; keep it in partition 0 instead of throwing.
      val afterJ = s.headOption.exists(_.toUpper > 'J')
      // Stay inside [0, numPartitions) even when only one partition was requested.
      if (afterJ && numPartitions > 1) 1 else 0
    case other =>
      // Replace the opaque MatchError with an explicit, descriptive failure.
      throw new IllegalArgumentException(
        s"TwoPartsPartitioner supports only String keys, got: $other")
  }
}
	{
	"in_reply_to_status_id_str":null,
	"in_reply_to_status_id":null,
	"created_at":"Tue Jan 09 05:13:54 +0000 2018",
	"in_reply_to_user_id_str":null,
	"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone<\/a>",
	"retweet_count":0,
	"retweeted":false,
	"geo":null,
	"filter_level":"low",
	//Save it
	// Build a small (String, Double) pair RDD split over two slices and write it
	// out as a Hadoop SequenceFile under "pysequencefile1".
	var rdd = sc.parallelize(
	  Seq("key1" -> 1.0, "key2" -> 2.0, "key3" -> 3.0),
	  numSlices = 2
	)
	rdd.saveAsSequenceFile("pysequencefile1")

	//Load it
	import org.apache.hadoop.io.DoubleWritable
	import org.apache.hadoop.io.Text

	val myrdd = sc.sequenceFile(
	"pysequencefile1",
	//CSV parsing program using opencsv library
	//spark-shell --packages net.sf.opencsv:opencsv:2.3
	//Or
	//Add this to sbt: libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.3"

	import au.com.bytecode.opencsv.CSVParser
	var linesRdd = sc.textFile("/data/spark/temps.csv");
	def parseCSV(itr:Iterator[String]):Iterator[Array[String]] = {
	val parser = new CSVParser(',')
	for(line <- itr)
	// Load the temperatures CSV and split every line into its comma-separated fields.
	// Note: this is a plain split on ','; quoted fields that contain commas are not handled.
	var lines = sc.textFile("/data/spark/temps.csv")
	// A limit of -1 keeps trailing empty fields, so a row such as "a,b," still yields
	// three columns instead of being silently shortened to two.
	var recordsRDD = lines.map(_.split(",", -1))
	// Preview the first ten parsed records.
	recordsRDD.take(10)

	class MyComplex(var x: Int, var y: Int) extends Serializable{
	def reset(): Unit = {
	x = 0
	y = 0
	}
	def add(p:MyComplex): MyComplex = {
	x = x + p.x
	y = y + p.y
	return this
	// Stop words to be shared with the executors.
	var commonWords = Array(
	  "a", "an", "the", "of", "at", "is", "am", "are", "this", "that",
	  "at", "in", "or", "and", "or", "not", "be", "for", "to", "it"
	)
	// Lookup table keyed by stop word; the value 1 is only a presence marker.
	val commonWordsMap = collection.mutable.Map[String, Int]()
	commonWords.foreach(stopWord => commonWordsMap.update(stopWord, 1))
	// Broadcast the lookup table through the SparkContext.
	var commonWordsBC = sc.broadcast(commonWordsMap)

	// Input text for the word count.
	var file = sc.textFile("/data/mr/wordcount/input/big.txt")
	def toWords(line:String):Array[String] = {
	var words = line.split(" ")
	// Word-splitting job that also counts blank input lines through an accumulator.
	sc.setLogLevel("ERROR")
	var file = sc.textFile("/data/mr/wordcount/input/")
	// longAccumulator is the AccumulatorV2-based replacement for sc.accumulator(0),
	// which has been deprecated since Spark 2.0.
	var numBlankLines = sc.longAccumulator("numBlankLines")

	/**
	 * Splits a line into space-separated words and bumps `numBlankLines` for
	 * every empty line.
	 *
	 * @param line one line of input text
	 * @return the tokens produced by splitting `line` on a single space
	 */
	def toWords(line: String): Array[String] = {
	  if (line.isEmpty) numBlankLines.add(1)
	  line.split(" ")
	}

	// flatMap is lazy: numBlankLines.value is only populated after an action runs on `words`.
	var words = file.flatMap(toWords)

	import org.apache.spark.Partitioner
	/**
	 * Partitioner that routes String keys by their first character: keys whose
	 * upper-cased first character sorts after 'J' go to partition 1, every other
	 * key goes to partition 0.
	 *
	 * @param numPartitions number of partitions reported to Spark; only the
	 *                      indices 0 and 1 are ever produced.
	 */
	class TwoPartsPartitioner(override val numPartitions: Int) extends Partitioner {

	  /**
	   * Picks the partition for a key.
	   *
	   * @param key the record key; must be a String
	   * @return 0 or 1, and never 1 when fewer than two partitions were requested
	   * @throws IllegalArgumentException if `key` is null or not a String
	   */
	  def getPartition(key: Any): Int = key match {
	    case s: String =>
	      // An empty key has no first character; keep it in partition 0 instead of throwing.
	      val afterJ = s.headOption.exists(_.toUpper > 'J')
	      // Stay inside [0, numPartitions) even when only one partition was requested.
	      if (afterJ && numPartitions > 1) 1 else 0
	    case other =>
	      // Replace the opaque MatchError with an explicit, descriptive failure.
	      throw new IllegalArgumentException(
	        s"TwoPartsPartitioner supports only String keys, got: $other")
	  }
	}