# corneliouzbett/JSONReader.py

## JSONReader.py
# Example: loading JSON data into Spark DataFrames and querying it with SQL.
#
# NOTE(review): `spark` is not defined in this file; presumably it is an
# active SparkSession supplied by the environment (e.g. the pyspark shell)
# -- confirm before running this as a standalone script.
sc = spark.sparkContext

# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path = "examples/src/main/resources/people.json"
peopleDF = spark.read.json(path)

# The inferred schema can be visualized using the printSchema() method.
peopleDF.printSchema()
# root
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)

# Register the DataFrame as a temporary view so it can be referenced by name
# ("people") in SQL statements.
peopleDF.createOrReplaceTempView("people")

# SQL statements can be run by using the sql() method provided by spark.
teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19")
teenagerNamesDF.show()
# +------+
# |  name|
# +------+
# |Justin|
# +------+

# Alternatively, a DataFrame can be created for a JSON dataset represented by
# an RDD[String] storing one JSON object per string.
jsonStrings = ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']
otherPeopleRDD = sc.parallelize(jsonStrings)
otherPeople = spark.read.json(otherPeopleRDD)
otherPeople.show()