Skip to content

Instantly share code, notes, and snippets.

@bistaumanga
Last active February 6, 2016 15:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bistaumanga/139dcd2281c61ab5bf06 to your computer and use it in GitHub Desktop.
Save bistaumanga/139dcd2281c61ab5bf06 to your computer and use it in GitHub Desktop.
pwlktm02
// HDFS location of the semicolon-delimited bank CSV.
val bankCsvPath = "hdfs://hdfs-nn-host.com:9000/user/umb/bank/bank.csv"
// One RDD element per line of the file.
val rawData = sc.textFile(bankCsvPath)
// Peek at the first few lines to check the record layout.
rawData.take(5)
// Sample record:
// 30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"
import scala.language.implicitConversions // acknowledge the implicit-conversion language feature

/**
 * Implicit view from `String` to `Int`, used so raw CSV fields can be passed
 * where an `Int` is expected.
 *
 * The result type is declared explicitly, as recommended for implicit definitions.
 * Parsing goes through `Integer.parseInt` directly so the body does not have to
 * resolve `.toInt` via implicit views on `String` (for which this very
 * conversion would be a candidate once it has an explicit type).
 *
 * @param x decimal integer text, e.g. "42" or "-1"
 * @return the parsed value
 * @throws NumberFormatException if `x` is not a valid integer
 */
implicit def str2int(x: String): Int = Integer.parseInt(x)
/**
 * One parsed record of the bank CSV. Fields are declared in the same order as
 * the columns of a record, e.g.
 * `30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"`.
 *
 * Field names are kept exactly as originally written because callers access
 * them by name.
 */
case class DataRow(
  age: Int,
  job: String,
  maritial: String,  // sic - presumably "marital"; name kept for compatibility
  education: String,
  default_ : String, // trailing underscore kept from the original declaration
  balance: Int,
  housing: String,
  loan: String,
  contact: String,
  day: Int,
  month: String,
  duration: Int,
  camoaign: Int,     // sic - presumably "campaign"; name kept for compatibility
  pdays: Int,
  previous: Int,
  poutcome: String,
  y: String
)
// Parsing: drop every line that mentions "default" (presumably just the header
// row), split the remaining lines on ';' and build one DataRow per line.
// Numeric columns are parsed explicitly with Integer.parseInt, so this step no
// longer depends on an implicit String => Int conversion being in scope.
// String columns are stored exactly as split; the sample record shows them
// wrapped in double quotes, and those quotes are not stripped here.
// A line with fewer than 17 fields or a non-numeric value in a numeric column
// fails the job with an exception rather than being skipped.
val filtered = rawData
  .filter(line => !line.contains("default"))
  .map(_.split(";"))
  .map { cols =>
    def int(i: Int): Int = Integer.parseInt(cols(i))
    DataRow(
      int(0),   // age
      cols(1),  // job
      cols(2),  // maritial
      cols(3),  // education
      cols(4),  // default_
      int(5),   // balance
      cols(6),  // housing
      cols(7),  // loan
      cols(8),  // contact
      int(9),   // day
      cols(10), // month
      int(11),  // duration
      int(12),  // camoaign
      int(13),  // pdays
      int(14),  // previous
      cols(15), // poutcome
      cols(16)  // y
    )
  }
// persisting in memory
filtered.cache()
// Counts feeding the chi-squared test of independence between job and loan
// outcome (column y): joint counts per (job, y) pair, marginal counts per job
// and per y, and the total number of rows.
val countsByJobAndY = filtered
  .map(row => (row.job, row.y) -> 1)
  .reduceByKey((a, b) => a + b)
// The marginals are collected to the driver as maps.
val countsByJob = filtered
  .map(row => row.job -> 1)
  .reduceByKey((a, b) => a + b)
  .collectAsMap()
val countsByY = filtered
  .map(row => row.y -> 1)
  .reduceByKey((a, b) => a + b)
  .collectAsMap()
val countTotal = filtered.count()
// The ten most frequent (job, y) pairs: sorted on the negated count, i.e. largest first.
countsByJobAndY.sortBy { case (_, n) => -n }.take(10)
// Observed and expected frequencies, and each cell's chi-squared contribution
// (observed - expected)^2 / expected, keyed by (job, y).
import scala.math
val chiSqVals = countsByJobAndY
  .map { case ((job, y), observed) =>
    // Convert to Double before dividing: Int * Int / Long is integer division,
    // which truncates the expected frequency (possibly down to 0, making the
    // contribution below infinite).
    val expected = countsByJob(job).toDouble * countsByY(y) / countTotal
    ((job, y), (observed, expected))
  }
  .mapValues { case (observed, expected) => math.pow(observed - expected, 2) / expected }
// NOTE(review): only (job, y) pairs that actually occur are present in
// countsByJobAndY, so a cell with zero observations contributes nothing here.
chiSqVals.sortBy(-_._2).take(10) // ten cells with the largest contribution to the test statistic
// Per-cell contributions to the test statistic.
chiSqVals.collect()
// Critical value quoted by the author: **ChiSquared (0.05,13) ** = 22.362
// The test statistic is the sum of all per-cell contributions. RDD.reduce
// needs the combining function; calling it with no argument does not compile.
chiSqVals.map(_._2).reduce(_ + _)
// If this statistic is higher than 22.362, job and loan outcome are not independent
// at the 0.05 level.
// Word-frequency count over the text files under the novels directory.
val rawTexts = sc.textFile("hdfs://hdfs-nn-host.com:9000/user/umb/novels/")
rawTexts.take(5)
val words = rawTexts
  .flatMap(line => line.split("\\s+")) // tokenize on runs of whitespace
  .map(token => token.replaceAll("[^a-zA-Z ]", "").toLowerCase) // keep letters only, then lower-case
  .filter(word => 5 <= word.length && word.length <= 10) // keep words of 5 to 10 characters
// Occurrences per distinct word.
val counts = words.map(word => (word, 1)).reduceByKey((a, b) => a + b)
// The ten most frequent words: sorted on the negated count, i.e. largest first.
counts.sortBy { case (_, n) => -n }.take(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment