bistaumanga/loan.scala

## loan.scala
// Load the bank CSV from HDFS; each element of the result is one text line of the file.
val rawData = {
  val bankCsvPath = "hdfs://hdfs-nn-host.com:9000/user/umb/bank/bank.csv"
  sc.textFile(bankCsvPath)
}
// Peek at the first five lines to eyeball the format.
rawData.take(5)

//  30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"

// Implicit String => Int view, so numeric CSV fields can be passed directly where an Int
// is expected (e.g. the DataRow constructor).
// Throws NumberFormatException when `x` is not a valid decimal integer.
// The result type is spelled out (Scala 3 requires it for non-local implicit definitions),
// and the body calls Integer.parseInt directly so that it does not itself depend on an
// implicit view of String.
implicit def str2int(x: String): Int = Integer.parseInt(x)
/**
 * One parsed line of bank.csv. The fields follow, in order, the 17 semicolon-separated
 * columns that the parsing step of this script passes to the constructor.
 *
 * Field names (including the spellings `maritial` and `camoaign`) are kept unchanged
 * because they are part of this class's interface.
 */
case class DataRow(
  age: Int, job: String, maritial: String, education: String,
  default_ : String, balance: Int, housing: String, loan: String,
  contact: String, day: Int, month: String, duration: Int,
  camoaign: Int, pdays: Int, previous: Int, poutcome: String,
  y: String
)

// Parsing: drop every line containing the text "default" (intended to remove the header
// row), split the remaining lines on ';' and build a DataRow from fields 0..16.
// Numeric columns are parsed explicitly with Integer.parseInt rather than through an
// implicit String => Int conversion, so it is visible which fields can throw
// NumberFormatException. String fields are stored exactly as split (quotes are not stripped).
// A line with fewer than 17 fields fails with ArrayIndexOutOfBoundsException.
val filtered = rawData
  .filter(line => !line.contains("default"))
  .map(_.split(";"))
  .map { arr =>
    DataRow(
      age       = Integer.parseInt(arr(0)),
      job       = arr(1),
      maritial  = arr(2),
      education = arr(3),
      default_  = arr(4),
      balance   = Integer.parseInt(arr(5)),
      housing   = arr(6),
      loan      = arr(7),
      contact   = arr(8),
      day       = Integer.parseInt(arr(9)),
      month     = arr(10),
      duration  = Integer.parseInt(arr(11)),
      camoaign  = Integer.parseInt(arr(12)),
      pdays     = Integer.parseInt(arr(13)),
      previous  = Integer.parseInt(arr(14)),
      poutcome  = arr(15),
      y         = arr(16)
    )
  }

// Persist the parsed rows in memory; the counting steps below each reuse them.
filtered.cache()


// Counts for a chi-squared test of independence
// between job and loan outcome (y).

// occurrences of each (job, y) pair
val countsByJobAndY = filtered
  .map(row => ((row.job, row.y), 1))
  .reduceByKey((a, b) => a + b)
// marginal counts per job and per outcome, collected as local maps
val countsByJob = filtered
  .map(row => (row.job, 1))
  .reduceByKey((a, b) => a + b)
  .collectAsMap()
val countsByY = filtered
  .map(row => (row.y, 1))
  .reduceByKey((a, b) => a + b)
  .collectAsMap()
// total number of parsed rows
val countTotal = filtered.count()

// the 10 most frequent (job, y) pairs: ascending sort on the negated count
countsByJobAndY.sortBy { case (_, n) => -n }.take(10)


// Observed and expected frequencies, reduced to the per-cell chi-squared contribution
//   (observed - expected)^2 / expected
// with expected = countsByJob(job) * countsByY(y) / countTotal.
// The expected frequency is computed in Double: Int * Int / Long arithmetic would truncate
// it to a whole number (0 for sparse cells, which makes the contribution Infinity).
import scala.math
val chiSqVals = countsByJobAndY.map { case ((job, y), observed) =>
  val expected = countsByJob(job).toDouble * countsByY(y) / countTotal
  ((job, y), math.pow(observed - expected, 2) / expected)
}

// the 10 (job, y) cells contributing most to the test statistic
chiSqVals.sortBy(-_._2).take(10)
// all per-cell contributions
chiSqVals.collect()

// Critical value: ChiSquared(0.05, 13) = 22.362
// NOTE(review): the degrees of freedom should equal
// (distinct jobs - 1) * (distinct y values - 1); confirm that 13 matches the data.
// Test statistic = sum of the per-cell contributions. reduce requires an explicit
// combining function; it throws on an empty RDD.
chiSqVals.map(_._2).reduce(_ + _)
// chi squared test statistic, which is higher than 22.362, so job and loan outcome are not independent

## wordcount.scala
// Load the text stored under the novels path, one element per line.
val rawTexts = sc.textFile("hdfs://hdfs-nn-host.com:9000/user/umb/novels/")

// Peek at the first five lines.
rawTexts.take(5)

// Split each line on whitespace, remove every character that is not an ASCII letter or a
// space, lower-case the result and keep only words of 5 to 10 characters.
val words = rawTexts
  .flatMap(line => line.split("\\s+"))
  .map(token => token.replaceAll("[^a-zA-Z ]", "").toLowerCase)
  .filter { word =>
    val len = word.length
    len >= 5 && len <= 10
  }

// occurrences per word
val counts = words.map(word => (word, 1)).reduceByKey(_ + _)

// the 10 most frequent words: ascending sort on the negated count
counts.sortBy { case (_, n) => -n }.take(10)
	// Load the bank CSV from HDFS; each element of the result is one text line of the file.
	val rawData = {
		val bankCsvPath = "hdfs://hdfs-nn-host.com:9000/user/umb/bank/bank.csv"
		sc.textFile(bankCsvPath)
	}
	// Peek at the first five lines to eyeball the format.
	rawData.take(5)

	// 30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"

	// Implicit String => Int view, so numeric CSV fields can be passed directly where an Int
	// is expected (e.g. the DataRow constructor).
	// Throws NumberFormatException when `x` is not a valid decimal integer.
	// The result type is spelled out (Scala 3 requires it for non-local implicit definitions),
	// and the body calls Integer.parseInt directly so that it does not itself depend on an
	// implicit view of String.
	implicit def str2int(x: String): Int = Integer.parseInt(x)
	/**
	 * One parsed line of bank.csv. The fields follow, in order, the 17 semicolon-separated
	 * columns that the parsing step of this script passes to the constructor.
	 *
	 * Field names (including the spellings `maritial` and `camoaign`) are kept unchanged
	 * because they are part of this class's interface.
	 */
	case class DataRow(
		age: Int, job: String, maritial: String, education: String,
		default_ : String, balance: Int, housing: String, loan: String,
		contact: String, day: Int, month: String, duration: Int,
		camoaign: Int, pdays: Int, previous: Int, poutcome: String,
		y: String
	)

	// Parsing: drop every line containing the text "default" (intended to remove the header
	// row), split the remaining lines on ';' and build a DataRow from fields 0..16.
	// Numeric columns are parsed explicitly with Integer.parseInt rather than through an
	// implicit String => Int conversion, so it is visible which fields can throw
	// NumberFormatException. String fields are stored exactly as split (quotes are not stripped).
	// A line with fewer than 17 fields fails with ArrayIndexOutOfBoundsException.
	val filtered = rawData
		.filter(line => !line.contains("default"))
		.map(_.split(";"))
		.map { arr =>
			DataRow(
				age       = Integer.parseInt(arr(0)),
				job       = arr(1),
				maritial  = arr(2),
				education = arr(3),
				default_  = arr(4),
				balance   = Integer.parseInt(arr(5)),
				housing   = arr(6),
				loan      = arr(7),
				contact   = arr(8),
				day       = Integer.parseInt(arr(9)),
				month     = arr(10),
				duration  = Integer.parseInt(arr(11)),
				camoaign  = Integer.parseInt(arr(12)),
				pdays     = Integer.parseInt(arr(13)),
				previous  = Integer.parseInt(arr(14)),
				poutcome  = arr(15),
				y         = arr(16)
			)
		}

	// Persist the parsed rows in memory; the counting steps below each reuse them.
	filtered.cache()


	// Counts for a chi-squared test of independence
	// between job and loan outcome (y).

	// occurrences of each (job, y) pair
	val countsByJobAndY = filtered
		.map(row => ((row.job, row.y), 1))
		.reduceByKey((a, b) => a + b)
	// marginal counts per job and per outcome, collected as local maps
	val countsByJob = filtered
		.map(row => (row.job, 1))
		.reduceByKey((a, b) => a + b)
		.collectAsMap()
	val countsByY = filtered
		.map(row => (row.y, 1))
		.reduceByKey((a, b) => a + b)
		.collectAsMap()
	// total number of parsed rows
	val countTotal = filtered.count()

	// the 10 most frequent (job, y) pairs: ascending sort on the negated count
	countsByJobAndY.sortBy { case (_, n) => -n }.take(10)


	// Observed and expected frequencies, reduced to the per-cell chi-squared contribution
	//   (observed - expected)^2 / expected
	// with expected = countsByJob(job) * countsByY(y) / countTotal.
	// The expected frequency is computed in Double: Int * Int / Long arithmetic would truncate
	// it to a whole number (0 for sparse cells, which makes the contribution Infinity).
	import scala.math
	val chiSqVals = countsByJobAndY.map { case ((job, y), observed) =>
		val expected = countsByJob(job).toDouble * countsByY(y) / countTotal
		((job, y), math.pow(observed - expected, 2) / expected)
	}

	// the 10 (job, y) cells contributing most to the test statistic
	chiSqVals.sortBy(-_._2).take(10)
	// all per-cell contributions
	chiSqVals.collect()

	// Critical value: ChiSquared(0.05, 13) = 22.362
	// NOTE(review): the degrees of freedom should equal
	// (distinct jobs - 1) * (distinct y values - 1); confirm that 13 matches the data.
	// Test statistic = sum of the per-cell contributions. reduce requires an explicit
	// combining function; it throws on an empty RDD.
	chiSqVals.map(_._2).reduce(_ + _)
	// chi squared test statistic, which is higher than 22.362, so job and loan outcome are not independent
	// Load the text stored under the novels path, one element per line.
	val rawTexts = sc.textFile("hdfs://hdfs-nn-host.com:9000/user/umb/novels/")

	// Peek at the first five lines.
	rawTexts.take(5)

	// Split each line on whitespace, remove every character that is not an ASCII letter or a
	// space, lower-case the result and keep only words of 5 to 10 characters.
	val words = rawTexts
		.flatMap(line => line.split("\\s+"))
		.map(token => token.replaceAll("[^a-zA-Z ]", "").toLowerCase)
		.filter { word =>
			val len = word.length
			len >= 5 && len <= 10
		}

	// occurrences per word
	val counts = words.map(word => (word, 1)).reduceByKey(_ + _)

	// the 10 most frequent words: ascending sort on the negated count
	counts.sortBy { case (_, n) => -n }.take(10)