Created: February 24, 2021 17:33
Save anthony-cros/4b5d40014dd52d57bc7dd05c35029066 to your computer and use it in GitHub Desktop.
Reproduced example from https://darrenjw.wordpress.com/2015/08/21/data-frames-and-tables-in-scala/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gallia._ // see https://github.com/galliaproject/gallia-core/blob/init/README.md#dependencies

// ===========================================================================
/** Reproduces the data-frame example from
  * https://darrenjw.wordpress.com/2015/08/21/data-frames-and-tables-in-scala/
  * using the Gallia library: stream a CSV of car data, keep rows with
  * EngineSize <= 4.0, derive a WeightKG column from Weight (pounds to
  * kilograms), and write the result back out as CSV.
  *
  * The commented section below quotes the article's equivalent code in R,
  * Saddle, scala-datatable, Framian, and Spark for comparison.
  */
object DataFramesAndTablesInScalaArticle {

  // https://darrenjw.wordpress.com/2015/08/21/data-frames-and-tables-in-scala/
  // ---------------------------------------------------------------------------
  /** Entry point: runs the whole read-filter-derive-write pipeline.
    * Input/output paths are hard-coded, matching the original gist.
    * NOTE(review): assumes Gallia streams rows lazily and infers the CSV
    * schema — confirm against the Gallia README.
    */
  def main(args: Array[String]): Unit = {
    "/data/misc/cars93.csv".stream()
      .filterBy("EngineSize").matches(_ <= 4.0)                // keep engines of 4.0 litres or less
      .generate("WeightKG").from("Weight").using(_ * 0.453592) // 0.453592 kg per lb
      .write("/tmp/out.csv")
  }

  // ---------------------------------------------------------------------------
  /*
  from article (minus the print statements):

  // R:
  df=read.csv("cars93.csv")
  df = df[df$EngineSize<=4.0,]
  df$WeightKG = df$Weight*0.453592
  write.csv(df,"cars93m.csv",row.names=FALSE)

  // ---------------------------------------------------------------------------
  // Saddle:
  val file = CsvFile("cars93.csv")
  val df = CsvParser.parse(file).withColIndex(0)
  val df2 = df.rfilter(_("EngineSize").mapValues(CsvParser.parseDouble).at(0)<=4.0)
  val wkg=df2.col("Weight").mapValues(CsvParser.parseDouble).mapValues(_*0.453592).setColIndex(Index("WeightKG"))
  val df3=df2.joinPreserveColIx(wkg.mapValues(_.toString))
  df3.writeCsvFile("saddle-out.csv")

  // ---------------------------------------------------------------------------
  // Scala-datatable:
  val colTypes=Map("DriveTrain" -> StringCol, "Min.Price" -> Double, [...]) // full schema
  val df=readCsv("Cars93",new FileReader("cars93.csv"),colTypes)
  val df2=df.filter(row=>row.as[Double]("EngineSize")<=4.0).toDataTable
  val oldCol=df2.columns("Weight").as[Int]
  val newCol=new DataColumn[Double]("WeightKG",oldCol.data.map{_.toDouble*0.453592})
  val df3=df2.columns.add(newCol).get
  writeCsv(df3,new File("out.csv"))

  // ---------------------------------------------------------------------------
  // Framian:
  val df=Csv.parseFile(new File("cars93.csv")).labeled.toFrame
  val df2=df.filter(Cols("EngineSize").as[Double])( _ <= 4.0 )
  val df3=df2.map(Cols("Weight").as[Int],"WeightKG")(r=>r.toDouble*0.453592)
  val csv = Csv.fromFrame(new CsvFormat(",", header = true))(df3)
  new PrintWriter("out.csv") { write(csv.toString); close }

  // ---------------------------------------------------------------------------
  // Spark:
  val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true"). option("inferSchema","true").load("cars93.csv")
  val df2=df.filter("EngineSize <= 4.0")
  val col=df2.col("Weight")*0.453592
  val df3=df2.withColumn("WeightKG",col)
  df3.write.format("com.databricks.spark.csv").option("header","true").save("out-csv")
  */
}
// ===========================================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.