// Implicits provide many shortcuts, including conversion from Row into a specific type
import spark.implicits._
// Case class to use as type for each Row
case class VehicleStopRaw(
  stop_id: String, stop_cause: String, service_area: String, subject_race: String,
  subject_sex: String, subject_age: String, timestamp: String, stop_date: String,
  stop_time: String, sd_resident: String, arrested: String, searched: String,
  obtained_consent: String, contraband_found: String, property_seized: String)
val cvDF = spark.read
  .json("s3a://dvannoy-public/sample_data/vehicle_stops_newline_delimited.json")
  .as[VehicleStopRaw] // convert each row to type VehicleStopRaw
cvDF.show() // print data
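// A minimal sketch of the typed operations that .as[VehicleStopRaw] enables; the
// literal value "Y" for arrested is an assumption about this dataset, not verified.
val arrestStops = cvDF
  .filter(stop => stop.arrested == "Y")              // typed filter: stop is a VehicleStopRaw
  .map(stop => (stop.service_area, stop.stop_cause)) // typed map to a Dataset of tuples
arrestStops.show()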
# Sample data as a list of rows (name, desc, value)
sample_data = [
["TestRecord1", "first entry", 1],
["TestRecord2", "second entry", 2],
["TestRecord3", "third entry", 3]
]
# Create a DataFrame with column names only (types are inferred)
column_names = ['name', 'desc', 'value']
df = spark.createDataFrame(sample_data, column_names)
df.show() # print data
# Create a DataFrame with an explicit Spark schema (specific types)
from pyspark.sql.types import StructType
schema = StructType().add('name', 'string').add('desc', 'string').add('value', 'integer')
df2 = spark.createDataFrame(sample_data, schema)
df2.show() # print data
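# A minimal sketch of applying an explicit schema when reading a file, assuming the
# same newline-delimited JSON path used in the Scala example above; the three fields
# shown are an illustrative subset of the full schema, not verified against the data.
stops_schema = (StructType()
    .add('stop_id', 'string')
    .add('stop_cause', 'string')
    .add('service_area', 'string'))
stops_df = (spark.read
    .schema(stops_schema)
    .json('s3a://dvannoy-public/sample_data/vehicle_stops_newline_delimited.json'))
stops_df.show()  # print data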