Skip to content

Instantly share code, notes, and snippets.

@vgiri2015
Created August 19, 2016 03:30
Show Gist options
  • Save vgiri2015/edd5952a6b22dc49c4ac91f2b25aa6a3 to your computer and use it in GitHub Desktop.
Save vgiri2015/edd5952a6b22dc49c4ac91f2b25aa6a3 to your computer and use it in GitHub Desktop.
File Compression in Spark 2.0
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by vgiridatabricks on 8/13/16.
*/
/**
  * Demo application that writes one small DataFrame to disk several times,
  * once per (file format, compression codec) combination, so the resulting
  * output directories under `/tmp` can be compared.
  */
object FileCompression {

  /** One row of the sample data set written by [[main]]. */
  case class DataFrameSample(name: String, actor: String, episodeDebut: String)

  /**
    * Builds a local SparkSession, creates the sample DataFrame and writes it
    * as Parquet (none / gzip / snappy) and ORC (none / snappy / zlib).
    * Existing output directories are overwritten.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark File Compression Handling")
      .master("local")
      .getOrCreate()

    try {
      // Cached because the same rows are written out six times below.
      val df = spark.createDataFrame(
        List(
          DataFrameSample("Homer", "Dan Castellaneta", "Good Night"),
          DataFrameSample("Marge", "Julie Kavner", "Good Night"),
          DataFrameSample("Bart", "Nancy Cartwright", "Good Night"),
          DataFrameSample("Lisa", "Yeardley Smith", "Good Night"),
          DataFrameSample("Maggie", "Liz Georges and more", "Good Night"),
          DataFrameSample("Sideshow Bob", "Kelsey Grammer", "The Telltale Head")
        )
      ).cache()

      // (format, compression codec, output path), written in this order.
      // lzo is not covered here: per the original author it requires a
      // different implementation approach.
      // NOTE(review): on Spark releases before 2.3 the "orc" source is
      // presumably provided by the spark-hive module; confirm it is on the
      // classpath for the Spark version in use.
      val outputs = Seq(
        ("parquet", "none", "/tmp/file_no_compression_parq"),
        ("parquet", "gzip", "/tmp/file_with_gzip_parq"),
        ("parquet", "snappy", "/tmp/file_with_snappy_parq"),
        ("orc", "none", "/tmp/file_no_compression_orc"),
        ("orc", "snappy", "/tmp/file_with_snappy_orc"),
        ("orc", "zlib", "/tmp/file_with_zlib_orc")
      )

      outputs.foreach { case (format, codec, path) =>
        df.write
          .mode("overwrite")
          .format(format)
          .option("compression", codec)
          .save(path)
      }
    } finally {
      // Release the local Spark context (and the cached data with it) even
      // when one of the writes fails.
      spark.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment