Skip to content

Instantly share code, notes, and snippets.

@jiamingd
Last active September 18, 2017 00:31
Show Gist options
  • Save jiamingd/97cb19cb98ae3698ce66d6cb5d754a27 to your computer and use it in GitHub Desktop.
Save jiamingd/97cb19cb98ae3698ce66d6cb5d754a27 to your computer and use it in GitHub Desktop.
import scala.collection.mutable
import scala.io.Source
import scala.util.Try
/*
Sample csv:
user_id,age,favorite_color
0,5,cyan
1,5,gray
2,22,green
3,22,purple
...
Total number of users processed:
Mean age of all users:
Median age of all users:
Top 5 favorite colors
Total number of users processed with age greater than 21
Mean age of all users with age greater than 21
Median age of all users with age greater than 21
Top 5 favorite colors of user with age greater than 21
*/
// Path of the input CSV. REF_DIR is not defined in this script and must be supplied by the environment.
val csvFile = s"${REF_DIR}/users.csv"

// age -> (favorite color -> number of users with that age and color).
// Looking up a missing age yields a fresh empty color map (and a missing color yields 0);
// withDefault does NOT insert the default, so the loader stores it back explicitly.
val age2Color2Count: mutable.Map[Int, mutable.Map[String, Int]] =
  mutable.Map[Int, mutable.Map[String, Int]]()
    .withDefault(_ => mutable.Map[String, Int]().withDefault(_ => 0))

val csvSource = Source.fromFile(csvFile)
try {
  val itr = csvSource.getLines()
  // Skip the header line (user_id,age,favorite_color); guarded so an empty file does not throw.
  if (itr.hasNext) itr.next()
  itr.foreach { l =>
    val buf = l.split(",")
    // Rows that do not split into exactly three fields are ignored.
    if (buf.size == 3) {
      // Parse the age once and reuse it for both the lookup and the store.
      val age = parseToInt(buf(1))
      val clr2Cnt: mutable.Map[String, Int] = age2Color2Count(age)
      clr2Cnt(buf(2)) += 1
      // For a first-seen age the map above is an unregistered default, so register it.
      age2Color2Count(age) = clr2Cnt
    }
  }
} finally {
  // Release the file handle even when reading fails part-way through.
  csvSource.close()
}
// Reporting starts here: every statistic below is derived from age2Color2Count.
// Total number of users processed.
// `sum` is used instead of `reduce` so that an input without data rows gives 0 rather than throwing.
val totalUsers = age2Color2Count.values.map(_.values.sum).sum
println(s"//Total number of users processed : ${totalUsers}")
// Mean age of all users (age2Headcount is also the input of the median computation).
// age -> number of users of that age, summed over all favorite colors.
val age2Headcount = for {
  (ag, cl2ct) <- age2Color2Count
} yield {
  (ag, cl2ct.values.sum)
}
// Weighted mean computed in floating point: integer division would silently drop the fractional part.
// The weighted sum is accumulated as Long so age * count cannot overflow Int.
// With no users there is no mean, which is reported as NaN instead of an exception.
val avgAge: Double =
  if (totalUsers == 0) Double.NaN
  else age2Headcount.iterator.map { case (ag, ct) => ag.toLong * ct }.sum.toDouble / totalUsers
println(s"Mean age : ${avgAge}")
// Median age of all users.
// Ages in ascending order together with the head count of each age.
val (ageSeq, headCountSeq) = age2Headcount.toSeq.sortBy(_._1).unzip
val medianAge: Double = {
  // (running head count up to and including the age, age), in ascending age order.
  val cumulativeByAge = headCountSeq.scanLeft(0)(_ + _).tail.zip(ageSeq)
  // Age of the user at 1-based position `rank` when all users are lined up by age.
  def ageAtRank(rank: Int): Option[Int] =
    cumulativeByAge.collectFirst { case (cum, ag) if cum >= rank => ag }
  // For an odd count both ranks are the same middle user; for an even count they are the two
  // middle users, whose ages are averaged. No users at all gives NaN instead of throwing.
  val middle = for {
    lower <- ageAtRank((totalUsers + 1) / 2)
    upper <- ageAtRank(totalUsers / 2 + 1)
  } yield (lower + upper) / 2.0
  middle.getOrElse(Double.NaN)
}
println(s"Median age of all users: ${medianAge}")
// Total number of users processed with age greater than 21.
// One entry per age above 21, holding the number of users of that age across all colors.
val headCountByAge21 = age2Color2Count.toSeq.collect {
  case (ag, clr2Cnt) if ag > 21 => clr2Cnt.values.sum
}
println(s"Total number of users processed with age greater than 21: ${headCountByAge21.sum}")
// Top 5 favorite colors.
// color -> number of users who picked it, added up over every age.
val clr2HeadCnt = age2Color2Count.values.flatten.groupBy(_._1).map {
  case (clr, clrCntSeq) => clr -> clrCntSeq.map(_._2).sum
}
println("Top 5 favorite colors (color, count)")
// Highest count first; only the first five entries are printed.
clr2HeadCnt.toSeq
  .sortBy(_._2)(Ordering[Int].reverse)
  .take(5)
  .foreach(println)
// Top 5 favorite colors of users with age greater than 21.
// Discussion: when several colors tie around the 5th position, which of them is printed is arbitrary.
println("Top 5 favorite colors of user with age greater than 21:")
// The ages are selected with a non-mutating collect. The previous `retain` removed every
// age <= 21 from age2Color2Count itself, corrupting the shared data for anything that runs later.
age2Color2Count.toSeq
  .collect { case (ag, clr2Cnt) if ag > 21 => clr2Cnt.toSeq }
  .flatten
  .groupBy(_._1)
  .map { case (c, c2cnttples) => (c, c2cnttples.map(_._2).sum) }
  .toSeq
  .sortWith(_._2 > _._2)
  .slice(0, 5)
  .foreach(println)
// Median age of all users with age greater than 21.
// age -> number of users of that age, restricted to ages above 21.
val age21HeadCount = for ((age, c2c) <- age2Color2Count if age > 21) yield {
  (age, c2c.values.sum)
}
val medianAgeAbove21: Double = {
  // The ages MUST be walked in ascending order; the map's own iteration order is hash-based,
  // so accumulating head counts straight from it lands on an arbitrary age.
  val byAge = age21HeadCount.toSeq.sortBy(_._1)
  val usersAbove21 = byAge.map(_._2).sum
  // (running head count up to and including the age, age), in ascending age order.
  val cumulativeByAge = byAge.map(_._2).scanLeft(0)(_ + _).tail.zip(byAge.map(_._1))
  // Age of the user at 1-based position `rank` when the users above 21 are lined up by age.
  def ageAtRank(rank: Int): Option[Int] =
    cumulativeByAge.collectFirst { case (cum, ag) if cum >= rank => ag }
  // Odd count: both ranks are the single middle user. Even count: the two middle ages are averaged.
  // Nobody above 21 gives NaN instead of an exception.
  val middle = for {
    lower <- ageAtRank((usersAbove21 + 1) / 2)
    upper <- ageAtRank(usersAbove21 / 2 + 1)
  } yield (lower + upper) / 2.0
  middle.getOrElse(Double.NaN)
}
println(s"Median age of all users with age greater than 21: $medianAgeAbove21")
// Mean age of all users with age greater than 21.
/* Equivalent formulation:
val ageTotal2Count = age2Color2Count.toSeq.filter(_._1 > 21).toList.map{ case (ag, cc) =>
(ag, cc.values.reduce(_ + _))
}.map{ case (age, count) =>
(age * count, count)
}
*/
// age -> number of users of that age, restricted to ages above 21.
val age2Count21 = for ((age, clr2cnt) <- age2Color2Count if age > 21) yield {
  (age, clr2cnt.values.sum)
}
// (age * count, count) per age, kept as a sequence on purpose: mapping the Map directly would
// key the result by age * count, and two ages with the same product would overwrite each other.
// The product is computed as Long so it cannot overflow Int.
val ageTotal2Count = age2Count21.toSeq.map { case (age, count) =>
  (age.toLong * count, count)
}
val usersAbove21 = ageTotal2Count.map(_._2).sum
// Floating-point division keeps the fractional part; nobody above 21 gives NaN instead of
// a division-by-zero exception.
val meanAgeAbove21: Double =
  if (usersAbove21 == 0) Double.NaN
  else ageTotal2Count.map(_._1).sum.toDouble / usersAbove21
println(s"Mean age of all users with age greater than 21: ${meanAgeAbove21}")
/** Converts `s` with `toInt`; when that conversion throws (e.g. empty or non-numeric text) the result is 0. */
def parseToInt(s: String): Int =
  Try(s.toInt) match {
    case scala.util.Success(parsed) => parsed
    case scala.util.Failure(_)      => 0
  }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment