@bzamecnik · created September 29, 2016
Converting CSV to Parquet in Spark 2.0
// run e.g. in spark-shell...
// uncompressed CSV without a header
val df = spark.read.csv("input.csv")
df.write.format("parquet").save("output.parquet")
// it produces a directory output.parquet/ with the following content:
//
// ls -l output.parquet/
// -rw-r--r-- 1 bza bza 146346 Sep 29 16:06 part-r-00000-8a89541d-9071-4246-b525-22e894ef3e0b.snappy.parquet
// -rw-r--r-- 1 bza bza 0 Sep 29 16:06 _SUCCESS
//
// the parquet file is snappy-compressed
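// snappy is the default Parquet codec in Spark 2.0; a different codec can
// be requested via the "compression" write option. A sketch, assuming the
// standard DataFrameWriter option; "output_gzip.parquet" is a made-up path:
df.write.option("compression", "gzip").parquet("output_gzip.parquet")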
// a gzip-compressed CSV works the same way; Spark decompresses it transparently:
val df_gz = spark.read.csv("input.csv.gz")
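// when the CSV does contain a header row, it can be consumed at read time
// instead. A sketch, not from the original gist; "input_with_header.csv"
// is a made-up path:
val df_header = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("input_with_header.csv")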
// the parquet file can be read again:
val df_parquet = spark.read.parquet("output.parquet")
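// a quick sanity check on the round-tripped DataFrame (added here as a
// suggestion, not part of the original run):
df_parquet.printSchema()
df_parquet.show(5)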
// add column names when the CSV header is missing
// (spark.read.csv assigns default names _c0, _c1, ...):
val columns = Seq("foo", "bar", "baz")
val df_with_columns = df.toDF(columns: _*)
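// an alternative sketch (not in the original gist): supply an explicit
// schema at read time instead of renaming afterwards; the column names
// and StringType here are assumptions:
import org.apache.spark.sql.types.{StructType, StructField, StringType}
val schema = StructType(Seq(
  StructField("foo", StringType),
  StructField("bar", StringType),
  StructField("baz", StringType)))
val df_typed = spark.read.schema(schema).csv("input.csv")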