Data Analysis with Scala - Medium
//CODE:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.tribbloid.ispark.display.dsl._
import scala.util.Try
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// Declare a case class; we need it to build a DataFrame from the parsed rows
case class Row(categoryId: Long, orderId: String, cityId: String, osName: String,
               osFamily: String, uaType: String, uaName: String, aov: Double)
// Read the file into an RDD via sc (the SparkContext, already available in the notebook)
val aov = sc.textFile("file:///Users/rzykov/Downloads/AOVC.csv")
// Parse each line into a Row; Try(...).toOption silently drops lines that fail to split or convert
val dataAov = aov.flatMap { line => Try { line.split(",") match {
    case Array(categoryId, orderId, cityId, osName, osFamily, uaType, uaName, aov) =>
      Row(categoryId.toLong, orderId, cityId, osName, osFamily, uaType, uaName, aov.toDouble)
  } }.toOption }
//OUT:
MapPartitionsRDD[4] at map at <console>:28
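The Row case class exists so that the parsed RDD can be turned into a DataFrame. A minimal sketch of that next step is below; it is an assumption rather than part of the original gist, relies on the sqlContext implicits imported above, and aovDF is a name introduced here only for illustration.
//CODE:
// Sketch (assumption, not in the original gist): convert the parsed RDD to a DataFrame.
// The schema is inferred from the fields of the Row case class.
val aovDF = dataAov.toDF()
aovDF.printSchema()   // verify the inferred column names and types
aovDF.show(5)         // preview a few parsed records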