Skip to content

Instantly share code, notes, and snippets.

@oluies
Last active October 25, 2016 08:06
Show Gist options
  • Save oluies/fd198ed80a4280f53574bca79b21d834 to your computer and use it in GitHub Desktop.
import org.apache.spark.sql.types.{DoubleType,LongType,ShortType, IntegerType, StructField,TimestampType, StructType,StringType,NumericType,BooleanType}
import org.apache.hadoop.fs.{FileSystem,Path}
// Spark 1.x SQL entry point (pre-2.0; SparkSession replaces this in Spark 2+).
// `sc` is the SparkContext assumed to be provided by the shell/notebook environment.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Bring the context's implicit conversions (toDF, $"col" syntax, encoders) into scope.
import sqlContext.implicits._
/**
 * Loads a CSV file into a DataFrame via the spark-csv data source.
 *
 * @param file      HDFS/filesystem path of the CSV file to read
 * @param delimiter field delimiter string (e.g. "," or ";")
 * @param charset   character encoding of the file (default UTF8)
 * @param useHeader whether the first line of each file is a header row
 * @param schema    optional explicit schema; when None the schema is inferred
 * @return the loaded DataFrame
 */
def csvToDF(file: Path, delimiter : String,charset: String = "UTF8", useHeader: Boolean = true, schema: Option[StructType] = None) = {
  // Build the common reader once instead of duplicating the option chain
  // in both match arms (the two branches previously differed only in .schema).
  val reader = sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", useHeader.toString) // use first line of all files as header
    // Infer types only when no explicit schema is supplied: inference costs an
    // extra pass over the data and could conflict with a caller-provided schema.
    .option("inferSchema", schema.isEmpty.toString)
    .option("delimiter", delimiter)
    .option("charset", charset)
    .option("mode", "DROPMALFORMED") // silently drop rows that fail to parse
    .option("treatEmptyValuesAsNulls", "true")
    .option("nullValue", "NULL") // treat the literal string "NULL" as SQL null
  // Apply the explicit schema when present, otherwise rely on inference.
  schema.fold(reader)(reader.schema).load(file.toString)
}
// Load the "metoder" CSV: comma-delimited, no header row, schema left to inference
// (schema = None). NOTE(review): paths are hard-coded to this cluster's layout.
val dfmetoder = csvToDF(new Path("/data/thepath/metoder.csv"),",",useHeader = false,schema=None)
import org.apache.spark.sql.SaveMode
// Append to the Parquet dataset at the target path (created on first run);
// re-running this script will add duplicate rows rather than overwrite.
dfmetoder.write.mode(SaveMode.Append).parquet("/data/thepath/p/metoder" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment