# karpanGit/pyspark, read json.py

## pyspark, read json.py
# read single singleline json
import os

# every example in this script writes its sample file into ./junk; create the
# directory up front so the open() calls do not fail with FileNotFoundError
# when the script is run from a fresh working directory
os.makedirs(r'./junk', exist_ok=True)

# one JSON object on one line; read with no options it yields one row
# (see the output below)
ex1 = '''{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }'''
with open(r'./junk/ex1.json', 'wt') as f:
    f.write(ex1)
data = spark.read.json(r'./junk/ex1.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


# read multiple singleline json
# each line of the file holds one complete JSON object; the literal also
# begins and ends with a newline
ex2 = '''
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
'''
ex2_path = r'./junk/ex2.json'
with open(ex2_path, mode='wt') as json_file:
    json_file.write(ex2)
data = spark.read.json(ex2_path)
# output of show(): one row per JSON line
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
# output of printSchema(): the nested object "c" is inferred as a struct
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


# read single multiline json
# the object is spread over several lines, so multiLine=True is passed to
# read the file as one JSON document instead of one document per line
ex3 = '''
{"a": "hello",
"b": "hello 2",
 "c":{"x":[1,2,3], "y": "bye"} }
'''
with open(r'./junk/ex3.json', 'wt') as f:
    f.write(ex3)
data = spark.read.json(r'./junk/ex3.json', multiLine=True)
data.show()
# the file contains a single object, hence a single row
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


# read multiple multiline json <- probably the most generally useful way of reading json
# the file holds one JSON array whose elements are objects spread over
# several lines; multiLine=True is passed as in the single-object case
ex4 = '''
[
{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}},

{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}}
]
'''
ex4_path = r'./junk/ex4.json'
with open(ex4_path, mode='wt') as json_file:
    json_file.write(ex4)
data = spark.read.json(ex4_path, multiLine=True)
# output of show(): one row per element of the array
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
# output of printSchema()
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)
	# read single singleline json
# one JSON object on one line; read with no options it yields one row
# (see the output below)
ex1 = '''{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }'''
with open(r'./junk/ex1.json', 'wt') as f:
    f.write(ex1)
data = spark.read.json(r'./junk/ex1.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


	# read multiple singleline json
# each line of the file holds one complete JSON object; the literal also
# begins and ends with a newline
ex2 = '''
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
'''
with open(r'./junk/ex2.json', 'wt') as f:
    f.write(ex2)
data = spark.read.json(r'./junk/ex2.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


	# read single multiline json
# the object is spread over several lines, so multiLine=True is passed to
# read the file as one JSON document instead of one document per line
ex3 = '''
{"a": "hello",
"b": "hello 2",
 "c":{"x":[1,2,3], "y": "bye"} }
'''
with open(r'./junk/ex3.json', 'wt') as f:
    f.write(ex3)
data = spark.read.json(r'./junk/ex3.json', multiLine=True)
data.show()
# the file contains a single object, hence a single row
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)


	# read multiple multiline json <- this is the most generic useful way of reading json most likely
# the file holds one JSON array whose elements are objects spread over
# several lines; multiLine=True is passed as in the single-object case
ex4 = '''
[
{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}},

{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}}
]
'''
with open(r'./junk/ex4.json', 'wt') as f:
    f.write(ex4)
data = spark.read.json(r'./junk/ex4.json', multiLine=True)
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
#  |-- a: string (nullable = true)
#  |-- b: string (nullable = true)
#  |-- c: struct (nullable = true)
#  |    |-- x: array (nullable = true)
#  |    |    |-- element: long (containsNull = true)
#  |    |-- y: string (nullable = true)