// Gist by @saisgit, created July 25, 2021 15:20
/* Assume below is our File's Schema Definition
* id integer
* first_name string
* last_name string
* city string
* country string
* phone string
*/
import org.apache.spark.sql.types._
// Defining List of StructFields for struct
val fields: List[StructField] = List(
  StructField("id", IntegerType, nullable = true),
  StructField("first_name", StringType, nullable = true),
  StructField("last_name", StringType, nullable = true),
  StructField("city", StringType, nullable = true),
  StructField("country", StringType, nullable = true),
  StructField("phone", StringType, nullable = true)
)
// Spark StructType for above Schema Definition
val structSchema: StructType = StructType(fields)
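As an aside, the same schema can also be sketched from a DDL-style string via `StructType.fromDDL` (available since Spark 2.3; worth verifying on your version):

```scala
import org.apache.spark.sql.types.StructType

// Sketch: build an equivalent schema from a DDL string instead of StructField objects
val ddlSchema: StructType = StructType.fromDDL(
  "id INT, first_name STRING, last_name STRING, city STRING, country STRING, phone STRING"
)
```

This is often more compact when schemas are stored as configuration strings.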
// View the tree string - this is the same output as the printSchema method on a DataFrame
structSchema.printTreeString()
// We can also get the schema as a string instead of printing it to the console
val treeString = structSchema.treeString
/*
 * Many things can be achieved programmatically with a Spark struct schema.
 *
 * Suppose we have thousands of columns but are interested only in the String-typed
 * ones. Instead of picking them out manually, we can collect them programmatically:
 */
// Get all the FieldNames with Type String
val fieldNames: Array[String] = (
  structSchema.fields                              // all fields
    .filter(field => field.dataType == StringType) // keep only String-typed fields
    .map(field => field.name)                      // extract the field names
)
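A minimal, self-contained sketch of putting those names to use: select only the String-typed columns of a DataFrame (the sample data and local SparkSession here are illustrative, not from the original gist):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._

val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
import spark.implicits._

// Illustrative sample row matching the file schema above
val df = Seq((1, "John", "Doe", "NYC", "US", "555-0100"))
  .toDF("id", "first_name", "last_name", "city", "country", "phone")

// Collect the String-typed column names programmatically from df.schema
val stringCols: Array[String] = df.schema.fields
  .filter(_.dataType == StringType)
  .map(_.name)

// Project down to just those columns
val stringOnlyDF = df.select(stringCols.map(col): _*)
```

The same pattern works for any predicate on `dataType`, e.g. keeping only numeric columns.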
// Note: df.schema returns the Spark StructType of a DataFrame
// CSV File Definition
val filePath: String = "/FileStore/tables/Customer.csv"
val header: String = "true"
val sep: String = ";"
val csvDF2 = (
  spark.read
    .option("header", header)
    .option("sep", sep)
    .schema(structSchema) // pass the StructType schema explicitly when reading the files
    .csv(filePath)
)
/*
 * Advantages of passing the schema explicitly:
 * 1. Spark doesn't need to run extra jobs to infer the schema from the file source.
 * 2. For CSV, we may want some columns to have different names than those in the
 *    file header. Passing a StructType schema applies our names directly, so we
 *    avoid an additional step to rename the columns afterwards.
 */
csvDF2.printSchema()
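To illustrate the renaming point, a sketch: the names `customer_id`, `fname`, and `lname` below are illustrative, not from the original file. With `header` set to `"true"` plus an explicit schema, Spark skips the header row and applies the schema names positionally (behavior worth confirming on your Spark version):

```scala
import org.apache.spark.sql.types._

// Same column order as the file, but with our own names for the first three columns
val renamedSchema = StructType(List(
  StructField("customer_id", IntegerType, nullable = true),
  StructField("fname", StringType, nullable = true),
  StructField("lname", StringType, nullable = true),
  StructField("city", StringType, nullable = true),
  StructField("country", StringType, nullable = true),
  StructField("phone", StringType, nullable = true)
))

// Reading with this schema yields columns named customer_id, fname, lname, ...
// regardless of what the file header says (spark and filePath as defined above)
val renamedDF = spark.read
  .option("header", "true")
  .option("sep", ";")
  .schema(renamedSchema)
  .csv(filePath)
```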