# oneryalcin/sparkify_3_read_data.py

## sparkify_3_read_data.py
# Read the user-activity event log into a Spark DataFrame.
# Note: ideally the data would be stored in a schema-aware format such as Parquet,
# which also supports partitioning -- something very important when ingesting big data.
# The data could also live on a distributed filesystem like HDFS, or in a cloud
# storage bucket such as AWS S3 / Google Cloud Storage, for faster reads.
# Here we only read from local disk.
# NOTE(review): `spark` is not defined in this snippet -- presumably a SparkSession
# created in an earlier step; confirm before running this file standalone.
data = spark.read.json('mini_sparkify_event_data.json')

# How many user activity rows do we have?
# (The line below the call is recorded output, kept as a comment so the file parses.)
data.count()
#>> 286500

# Have a look at the inferred schema
data.printSchema()
#>>root
# |-- artist: string (nullable = true)
# |-- auth: string (nullable = true)
# |-- firstName: string (nullable = true)
# |-- gender: string (nullable = true)
# |-- itemInSession: long (nullable = true)
# |-- lastName: string (nullable = true)
# |-- length: double (nullable = true)
# |-- level: string (nullable = true)
# |-- location: string (nullable = true)
# |-- method: string (nullable = true)
# |-- page: string (nullable = true)
# |-- registration: long (nullable = true)
# |-- sessionId: long (nullable = true)
# |-- song: string (nullable = true)
# |-- status: long (nullable = true)
# |-- ts: long (nullable = true)
# |-- userAgent: string (nullable = true)
# |-- userId: string (nullable = true)
	# Read the user-activity event log into a Spark DataFrame.
	# Note: ideally the data would be stored in a schema-aware format such as Parquet,
	# which also supports partitioning -- something very important when ingesting big data.
	# The data could also live on a distributed filesystem like HDFS, or in a cloud
	# storage bucket such as AWS S3 / Google Cloud Storage, for faster reads.
	# Here we only read from local disk.
	# NOTE(review): `spark` is not defined in this snippet -- presumably a SparkSession
	# created in an earlier step; confirm before running this file standalone.
	data = spark.read.json('mini_sparkify_event_data.json')

	# How many user activity rows do we have?
	# (The line below the call is recorded output, kept as a comment so the file parses.)
	data.count()
	#>> 286500

	# Have a look at the inferred schema
	data.printSchema()
	#>>root
	# |-- artist: string (nullable = true)
	# |-- auth: string (nullable = true)
	# |-- firstName: string (nullable = true)
	# |-- gender: string (nullable = true)
	# |-- itemInSession: long (nullable = true)
	# |-- lastName: string (nullable = true)
	# |-- length: double (nullable = true)
	# |-- level: string (nullable = true)
	# |-- location: string (nullable = true)
	# |-- method: string (nullable = true)
	# |-- page: string (nullable = true)
	# |-- registration: long (nullable = true)
	# |-- sessionId: long (nullable = true)
	# |-- song: string (nullable = true)
	# |-- status: long (nullable = true)
	# |-- ts: long (nullable = true)
	# |-- userAgent: string (nullable = true)
	# |-- userId: string (nullable = true)