Last active
September 9, 2021 12:42
-
-
Save rzykov/2c66b2bfffad3136837ab9dc0c22000c to your computer and use it in GitHub Desktop.
Data analysis using Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//CODE: | |
import org.apache.spark.rdd.RDD | |
import org.apache.spark.sql._ | |
import org.tribbloid.ispark.display.dsl._ | |
import scala.util.Try | |
val sqlContext = new org.apache.spark.sql.SQLContext(sc) | |
import sqlContext.implicits._ | |
// Declare a CASE class; we need it for the dataframe | |
case class Row(categoryId: Long, orderId: String ,cityId: String, osName: String, | |
osFamily: String, uaType: String, uaName: String,aov: Double) | |
// read the file into the val variable using sc (Spark Context), it is declared beforehand | |
val aov = sc.textFile("file:///Users/rzykov/Downloads/AOVC.csv") | |
// let's parse the fields | |
val dataAov = aov.flatMap { line => Try { line.split(",") match { | |
case Array(categoryId, orderId, cityId, osName, osFamily, uaType, uaName, aov) => | |
Row(categoryId.toLong + 100, orderId, cityId, osName, osFamily, osFamily, uaType, aov.toDouble) | |
} }.toOption } | |
//OUT: | |
MapPartitionsRDD[4] at map at <console>:28 | |
//CODE: | |
//The most popular category. | |
dataAov.map { x => x.categoryId } // select the categoryId field | |
.countByValue() // calculate how often each categoryId appears | |
.toSeq | |
.sortBy( - _._2) // sort by frequency in descending order | |
.take(10) //take the top 10 records | |
//OUT: | |
ArrayBuffer((314,3068), (132,2229), (128,1770), (270,1483), (139,1379), (107,1366), (177,1311), (226,1268), (103,1259), (127,1204)) | |
//CODE: | |
val interestedBrowsers = List("Android", "OS X", "iOS", "Linux", "Windows") | |
val osAov = dataAov.filter(x => interestedBrowsers.contains(x.osFamily)) //we leave only the desired OS | |
.filter(_.categoryId == 128) // filter categories | |
.map(x => (x.osFamily, (x.aov, 1.0))) // need to calculate average purchase amount | |
.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)) | |
.map{ case(osFamily, (revenue, orders)) => (osFamily, revenue/orders) } | |
.collect() | |
//OUT | |
//The output is an array of tuples (tuple) in OS format, the average purchase amount: | |
Array((OS X,4859.827586206897), (Linux,3730.4347826086955), (iOS,3964.6153846153848), (Android,3670.8474576271187), (Windows,3261.030993042378)) | |
//CODE of Highcharts | |
import com.quantifind.charts.Highcharts._ | |
import ru.retailrocket.ispark.wisp._ | |
draw(column(osAov.toList)) | |
//CODE | |
import org.ddahl.rscala._ | |
import ru.retailrocket.ispark._ | |
def connect() = RClient("R", false) | |
@transient | |
val r = connect() | |
R.plot(r, "barplot(cs, names.arg=names, col = 'red' )", Map("cs"->os.Aov.map(_._2), "names" -> osAov.map(_._1)) ) | |
//CODE of MIC | |
//CODE | |
import data.VarPairData | |
import mine.core.MineParameters | |
import analysis.Analysis | |
import analysis.results.BriefResult | |
import scala.util.Random | |
//Code a discrete value by randomly changing the order of the "codes | |
def encode(col: Array[String]): Array[Double] = { | |
val ns = scala.util.Random.shuffle(1 to col.toSet.size) | |
val encMap = col.toSet.zip(ns).toMap | |
col.map{encMap(_).toDouble} | |
} | |
// function to calculate MIC | |
def mic(x: Array[Double], y: Array[Double]) = { | |
val data = new VarPairData(x.map(_.toFloat), y.map(_.toFloat)) | |
val params = new MineParameters(0.6.toFloat, 15, 0, null) | |
val res = Analysis.getResult(classOf[BriefResult], data, params) | |
res.getMIC | |
} | |
//in case of a discrete value do a lot of iterations and take the maximum | |
def micMax(x: Array[Double], y: Array[Double], n: Int = 100) = | |
(for{ i <- 1 to 100} yield mic(x, y)).max | |
//Now we're close to the final result, let's perform the calculation: | |
val aov = dataAov.filter(x => interestedBrowsers.contains(x.osFamily)) //we leave only the OSes we want | |
.filter(_.categoryId == 128) // filter categories | |
//osFamily | |
var aovMic = aov.map(x => (x.osFamily, x.aov)).collect() | |
println("osFamily MIC =" + micMax(encode(aovMic.map(_._1)), aovMic.map(_._2)) | |
//orderId | |
aovMic = aov.map(x => (x.orderId, x.aov)).collect() | |
println("orderId MIC =" + micMax(encode(aovMic.map(_._1)), aovMic.map(_._2)) | |
//cityId | |
aovMic = aov.map(x => (x.cityId, x.aov)).collect() | |
println("cityId MIC =" + micMax(encode(aovMic.map(_._1)), aovMic.map(_._2)) | |
//uaName | |
aovMic = aov.map(x => (x.uaName, x.aov)).collect() | |
println("uaName MIC =" + mic(encode(aovMic.map(_._1)), aovMic.map(_._2)) | |
//aov | |
println("aov MIC =" + micMax(aovMic.map(_._2), aovMic.map(_._2)) | |
//random | |
println("random MIC =" + mic(aovMic.map(_ => math.random*100.0), aovMic.map(_._2)) | |
//OUTPUT | |
// osFamily MIC =0.06658 | |
// orderId MIC =0.10074 | |
// cityId MIC =0.07281 | |
// aov MIC =0.99999 | |
// uaName MIC =0.05297 | |
// random MIC =0.10599 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment