// Implicits provide many shortcuts, including conversion from Row into a specific type
import spark.implicits._
// Case class to use as type for each Row
case class VehicleStopRaw(
  stop_id: String, stop_cause: String, service_area: String, subject_race: String,
  subject_sex: String, subject_age: String, timestamp: String, stop_date: String,
  stop_time: String, sd_resident: String, arrested: String, searched: String,
  obtained_consent: String, contraband_found: String, property_seized: String)
val cvDF = spark.read
  .json("s3a://dvannoy-public/sample_data/vehicle_stops_newline_delimited.json")
  .as[VehicleStopRaw] // convert each row to type VehicleStopRaw
cvDF.show() // print data
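// A minimal sketch of the typed operations that .as[VehicleStopRaw] enables; the
// literal value "Y" for arrested is an assumption about this dataset, not verified.
val arrestStops = cvDF
  .filter(stop => stop.arrested == "Y")              // typed filter: stop is a VehicleStopRaw
  .map(stop => (stop.service_area, stop.stop_cause)) // typed map to a Dataset of tuples
arrestStops.show()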
# Sample data as a list of rows (name, desc, value)
sample_data = [
["TestRecord1", "first entry", 1],
["TestRecord2", "second entry", 2],
["TestRecord3", "third entry", 3]
]
# Create a DataFrame with column names only (types are inferred)
column_names = ['name', 'desc', 'value']
df = spark.createDataFrame(sample_data, column_names)
df.show() # print data
# Create a DataFrame with an explicit Spark schema (specific types)
from pyspark.sql.types import StructType
schema = StructType().add('name', 'string').add('desc', 'string').add('value', 'integer')
df2 = spark.createDataFrame(sample_data, schema)
df2.show() # print data
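# A minimal sketch of applying an explicit schema when reading a file, assuming the
# same newline-delimited JSON path used in the Scala example above; the three fields
# shown are an illustrative subset of the full schema, not verified against the data.
stops_schema = (StructType()
    .add('stop_id', 'string')
    .add('stop_cause', 'string')
    .add('service_area', 'string'))
stops_df = (spark.read
    .schema(stops_schema)
    .json('s3a://dvannoy-public/sample_data/vehicle_stops_newline_delimited.json'))
stops_df.show()  # print data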