Skip to content

Instantly share code, notes, and snippets.

@karpanGit
Created April 27, 2022 05:48
Show Gist options
  • Save karpanGit/6b5cffe42460fd16b7bed9e2459289e3 to your computer and use it in GitHub Desktop.
Save karpanGit/6b5cffe42460fd16b7bed9e2459289e3 to your computer and use it in GitHub Desktop.
PySpark: reading JSON files (single-line, JSON Lines, and multiline)
# Read a file that holds a single JSON object written on one line.
# NOTE: `spark` is expected to be an already-active SparkSession (e.g. the one
# provided by the pyspark shell) and the ./junk directory must already exist;
# neither is created by this snippet.
ex1 = '''{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }'''
with open(r'./junk/ex1.json', 'wt') as f:
    f.write(ex1)

# Default reader mode: every line of the file is parsed as one JSON record.
data = spark.read.json(r'./junk/ex1.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: struct (nullable = true)
# |    |-- x: array (nullable = true)
# |    |    |-- element: long (containsNull = true)
# |    |-- y: string (nullable = true)
# Read a file that holds several JSON objects, one complete object per line
# (the "JSON Lines" layout).
# NOTE: `spark` is expected to be an already-active SparkSession and the
# ./junk directory must already exist; neither is created by this snippet.
ex2 = '''
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }
'''
with open(r'./junk/ex2.json', 'wt') as f:
    f.write(ex2)

# Default reader mode: every line is parsed as one record, so the two object
# lines become the two rows shown below.
data = spark.read.json(r'./junk/ex2.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: struct (nullable = true)
# |    |-- x: array (nullable = true)
# |    |    |-- element: long (containsNull = true)
# |    |-- y: string (nullable = true)
# Read a file that holds a single JSON object spread over several lines.
# NOTE: `spark` is expected to be an already-active SparkSession and the
# ./junk directory must already exist; neither is created by this snippet.
ex3 = '''
{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"} }
'''
with open(r'./junk/ex3.json', 'wt') as f:
    f.write(ex3)

# multiLine=True makes the reader parse the file as a whole instead of line by
# line, which is required here because the object spans several lines.
data = spark.read.json(r'./junk/ex3.json', multiLine=True)
data.show()
# The file contains exactly one object, so exactly one row is produced.
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
# Same record shape as the other examples, hence the same inferred schema:
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: struct (nullable = true)
# |    |-- x: array (nullable = true)
# |    |    |-- element: long (containsNull = true)
# |    |-- y: string (nullable = true)
# Read a file that holds several multiline JSON objects wrapped in a top-level
# JSON array. This is most likely the most generic and useful way of reading
# JSON, since it accepts ordinary pretty-printed JSON documents.
# NOTE: `spark` is expected to be an already-active SparkSession and the
# ./junk directory must already exist; neither is created by this snippet.
ex4 = '''
[
{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}},
{"a": "hello",
"b": "hello 2",
"c":{"x":[1,2,3], "y": "bye"}}
]
'''
with open(r'./junk/ex4.json', 'wt') as f:
    f.write(ex4)

# multiLine=True parses the file as a whole; each element of the top-level
# array becomes one row, as the output below shows.
data = spark.read.json(r'./junk/ex4.json', multiLine=True)
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
# |hello|hello 2|{[1, 2, 3], bye}|
# +-----+-------+----------------+
data.printSchema()
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: struct (nullable = true)
# |    |-- x: array (nullable = true)
# |    |    |-- element: long (containsNull = true)
# |    |-- y: string (nullable = true)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment