pyspark, from pyspark schema to json and back
# from pyspark schema to json and the other way round
import pyspark.sql.types as T
from pprint import pprint
from pyspark.sql import SparkSession

# a SparkSession is needed for createDataFrame below
spark = SparkSession.builder.getOrCreate()
# -- create simple dataframe and schema
df = [
    [1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')],
    [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')],
]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
    T.StructField('x5', T.StructType([
        T.StructField('x5_1', T.StringType()),
        T.StructField('x5_2', T.StringType())
    ]))
])
df = spark.createDataFrame(df, schema=schema)
df.printSchema()
# root
# |-- x1: long (nullable = true)
# |-- x2: string (nullable = true)
# |-- x3: array (nullable = true)
# | |-- element: string (containsNull = true)
# |-- x4: map (nullable = true)
# | |-- key: string
# | |-- value: string (valueContainsNull = true)
# |-- x5: struct (nullable = true)
# | |-- x5_1: string (nullable = true)
# | |-- x5_2: string (nullable = true)
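# a quick aside: simpleString() gives a compact one-line view of the same schema
# (a standard StructType method); it should print something like:
print(df.schema.simpleString())
# struct<x1:bigint,x2:string,x3:array<string>,x4:map<string,string>,x5:struct<x5_1:string,x5_2:string>>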
pprint(schema.jsonValue())
# {'fields': [{'metadata': {}, 'name': 'x1', 'nullable': True, 'type': 'long'},
# {'metadata': {}, 'name': 'x2', 'nullable': True, 'type': 'string'},
# {'metadata': {},
# 'name': 'x3',
# 'nullable': True,
# 'type': {'containsNull': True,
# 'elementType': 'string',
# 'type': 'array'}},
# {'metadata': {},
# 'name': 'x4',
# 'nullable': True,
# 'type': {'keyType': 'string',
# 'type': 'map',
# 'valueContainsNull': True,
# 'valueType': 'string'}},
# {'metadata': {},
# 'name': 'x5',
# 'nullable': True,
# 'type': {'fields': [{'metadata': {},
# 'name': 'x5_1',
# 'nullable': True,
# 'type': 'string'},
# {'metadata': {},
# 'name': 'x5_2',
# 'nullable': True,
# 'type': 'string'}],
# 'type': 'struct'}}],
# 'type': 'struct'}
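# jsonValue() returns a Python dict; the related json() method returns the same
# structure serialised as a JSON string (a short sketch, both are standard
# StructType methods):
import json
schema_str = schema.json()
print(isinstance(schema_str, str))
# True
print(json.loads(schema_str) == schema.jsonValue())
# True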
pprint(df.schema.jsonValue())
# (identical output to schema.jsonValue() above, since df.schema carries the same schema)
# convert the json back to a pyspark schema
schema2 = T.StructType.fromJson(df.schema.jsonValue())
print(schema == schema2)
# True
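# one practical use of the round trip: persist the schema as JSON and restore it
# later, e.g. to enforce a fixed schema when reading files. A minimal sketch
# (json was imported above; the file name 'schema.json' is just an example):
with open('schema.json', 'w') as f:
    f.write(df.schema.json())
with open('schema.json') as f:
    schema3 = T.StructType.fromJson(json.load(f))
print(schema == schema3)
# True
# the restored schema can then be passed to a reader, e.g.
# spark.read.schema(schema3).json('some_data_dir')  # path is hypothetical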