Skip to content

Instantly share code, notes, and snippets.

@bigsnarfdude
Last active March 20, 2016 15:20
Show Gist options
  • Save bigsnarfdude/581b780ce85d7aaecbcb to your computer and use it in GitHub Desktop.
spark-assembly-2.0.0-SNAPSHOT-hadoop2.4.0.jar com.databricks#spark-csv_2.11_1.4 ERROR
// ./bin/spark-shell --packages com.databricks:spark-csv_2.11:1.4.0
//
// version 2.0.0-SNAPSHOT
// Using Scala version 2.11.7 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_80)
// http://stat-computing.org/dataexpo/2009/the-data.html
/** Times the evaluation of `block` and prints the elapsed nanoseconds.
  *
  * @param block by-name expression; evaluated exactly once, inside this method
  * @tparam R result type of the timed expression
  * @return the value produced by `block`
  */
def time[R](block: => R): R = {
  val t0 = System.nanoTime() // nanoTime is monotonic — safe for elapsed-time measurement
  val result = block // call-by-name: evaluation happens here, between the two timestamps
  val t1 = System.nanoTime()
  // s-interpolation instead of `+` concatenation (idiomatic Scala)
  println(s"Elapsed time: ${t1 - t0}ns")
  result
}
// Load the 2008 flight-data CSV with a header row; spark-csv infers no types,
// so every column starts out as String.
val df = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .load("/Users/employee/Downloads/2008.csv")
// Cast Year from String to Int: rename the original out of the way,
// re-create "Year" as an int cast of it, then drop the temporary column.
val df_1 = df.withColumnRenamed("Year", "oldYear")
val df_2 = df_1
  .withColumn("Year", df_1.col("oldYear").cast("int"))
  .drop("oldYear")
/** Returns a copy of `df` with column `name` cast to `newType`.
  *
  * Works around casting in place: the column is renamed to a temporary
  * "swap" column, re-created under its original name with the target type,
  * and the temporary is dropped.
  *
  * @param df      source DataFrame (unchanged; DataFrames are immutable)
  * @param name    column to convert
  * @param newType Spark SQL type name to cast to, e.g. "int"
  * @return a new DataFrame with the converted column
  */
def convertColumn(df: org.apache.spark.sql.DataFrame, name: String, newType: String): org.apache.spark.sql.DataFrame = {
  // NOTE(review): assumes no pre-existing column named "swap" — it would be clobbered.
  val df_1 = df.withColumnRenamed(name, "swap")
  df_1.withColumn(name, df_1.col("swap").cast(newType)).drop("swap")
}
// Apply the int casts cumulatively. BUG FIX: df_4 was previously built from
// df_2, which silently discarded the ArrDelay conversion done in df_3 —
// DepDelay must be converted on top of df_3 so both columns end up as int.
val df_3 = convertColumn(df_2, "ArrDelay", "int")
val df_4 = convertColumn(df_3, "DepDelay", "int")
// test write to parquet is fast
// Baseline: write a two-column projection as Parquet (columnar, compressed).
df_4.select("Year", "Cancelled").write.format("parquet").save("yearAndCancelled.parquet")
// Same projection, timed while written back out as CSV via spark-csv.
val selectedData = df_4.select("Year", "Cancelled")
// howLong is Unit: time() returns whatever the block returns, and save()
// returns Unit — the measurement itself is only visible via the println.
val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))
// Transcript of an actual run (~3488 s, i.e. just under an hour):
//scala> val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))
//Elapsed time: 3488272270000ns
//howLong: Unit = ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment