Skip to content

Instantly share code, notes, and snippets.

@anthony-cros
Created February 24, 2021 17:33
Show Gist options
  • Save anthony-cros/4b5d40014dd52d57bc7dd05c35029066 to your computer and use it in GitHub Desktop.
Save anthony-cros/4b5d40014dd52d57bc7dd05c35029066 to your computer and use it in GitHub Desktop.
import gallia._ // see https://github.com/galliaproject/gallia-core/blob/init/README.md#dependencies
// ===========================================================================
object DataFramesAndTablesInScalaArticle {

  // Gallia counterpart of the data-frame exercise described in:
  //   https://darrenjw.wordpress.com/2015/08/21/data-frames-and-tables-in-scala/

  // ---------------------------------------------------------------------------
  /**
   * Runs the whole exercise as a three-stage pipeline:
   *
   *  1. stream the rows of `/data/misc/cars93.csv`;
   *  1. keep only the rows whose `EngineSize` is at most 4.0;
   *  1. add a `WeightKG` field computed as `Weight * 0.453592`;
   *
   * and finally write the result to `/tmp/out.csv`.
   *
   * Both paths are fixed; see the reference comment below for the equivalent
   * R / Saddle / scala-datatable / Framian / Spark snippets from the article.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Stage 1: open the input as a stream of rows.
    val cars = "/data/misc/cars93.csv".stream()

    // Stage 2: drop every row with an engine size above 4.0.
    val smallEngined =
      cars
        .filterBy("EngineSize")
        .matches(engineSize => engineSize <= 4.0)

    // Stage 3: derive the new "WeightKG" field from the existing "Weight" field.
    val withWeightKg =
      smallEngined
        .generate("WeightKG")
        .from("Weight")
        .using(weight => weight * 0.453592)

    withWeightKg.write("/tmp/out.csv")
  }

  // ---------------------------------------------------------------------------
  /*
    from article (minus the print statements):

    // R:
      df=read.csv("cars93.csv")
      df = df[df$EngineSize<=4.0,]
      df$WeightKG = df$Weight*0.453592
      write.csv(df,"cars93m.csv",row.names=FALSE)

    // ---------------------------------------------------------------------------
    // Saddle:
      val file = CsvFile("cars93.csv")
      val df = CsvParser.parse(file).withColIndex(0)
      val df2 = df.rfilter(_("EngineSize").mapValues(CsvParser.parseDouble).at(0)<=4.0)
      val wkg=df2.col("Weight").mapValues(CsvParser.parseDouble).mapValues(_*0.453592).setColIndex(Index("WeightKG"))
      val df3=df2.joinPreserveColIx(wkg.mapValues(_.toString))
      df3.writeCsvFile("saddle-out.csv")

    // ---------------------------------------------------------------------------
    // Scala-datatable:
      val colTypes=Map("DriveTrain" -> StringCol, "Min.Price" -> Double, [...]) // full schema
      val df=readCsv("Cars93",new FileReader("cars93.csv"),colTypes)
      val df2=df.filter(row=>row.as[Double]("EngineSize")<=4.0).toDataTable
      val oldCol=df2.columns("Weight").as[Int]
      val newCol=new DataColumn[Double]("WeightKG",oldCol.data.map{_.toDouble*0.453592})
      val df3=df2.columns.add(newCol).get
      writeCsv(df3,new File("out.csv"))

    // ---------------------------------------------------------------------------
    // Framian:
      val df=Csv.parseFile(new File("cars93.csv")).labeled.toFrame
      val df2=df.filter(Cols("EngineSize").as[Double])( _ <= 4.0 )
      val df3=df2.map(Cols("Weight").as[Int],"WeightKG")(r=>r.toDouble*0.453592)
      val csv = Csv.fromFrame(new CsvFormat(",", header = true))(df3)
      new PrintWriter("out.csv") { write(csv.toString); close }

    // ---------------------------------------------------------------------------
    // Spark:
      val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true"). option("inferSchema","true").load("cars93.csv")
      val df2=df.filter("EngineSize <= 4.0")
      val col=df2.col("Weight")*0.453592
      val df3=df2.withColumn("WeightKG",col)
      df3.write.format("com.databricks.spark.csv").option("header","true").save("out-csv")
  */
}
// ===========================================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment