pyspark, from pyspark schema to json and back
# from pyspark schema to json and the other way round
import pyspark.sql.types as T
from pprint import pprint
from pyspark.sql import SparkSession

# a SparkSession is needed for createDataFrame below
spark = SparkSession.builder.getOrCreate()
# -- create simple dataframe and schema
df = [
    [1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')],
    [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')],
]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
    T.StructField('x5', T.StructType([
        T.StructField('x5_1', T.StringType()),
        T.StructField('x5_2', T.StringType())
    ]))
])
df = spark.createDataFrame(df, schema=schema)
df.printSchema()
# root
# |-- x1: long (nullable = true)
# |-- x2: string (nullable = true)
# |-- x3: array (nullable = true)
# | |-- element: string (containsNull = true)
# |-- x4: map (nullable = true)
# | |-- key: string
# | |-- value: string (valueContainsNull = true)
# |-- x5: struct (nullable = true)
# | |-- x5_1: string (nullable = true)
# | |-- x5_2: string (nullable = true)
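# a quick aside: simpleString() gives a compact one-line view of the same schema
# (a standard StructType method); it should print something like:
print(df.schema.simpleString())
# struct<x1:bigint,x2:string,x3:array<string>,x4:map<string,string>,x5:struct<x5_1:string,x5_2:string>>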
pprint(schema.jsonValue())
# {'fields': [{'metadata': {}, 'name': 'x1', 'nullable': True, 'type': 'long'},
# {'metadata': {}, 'name': 'x2', 'nullable': True, 'type': 'string'},
# {'metadata': {},
# 'name': 'x3',
# 'nullable': True,
# 'type': {'containsNull': True,
# 'elementType': 'string',
# 'type': 'array'}},
# {'metadata': {},
# 'name': 'x4',
# 'nullable': True,
# 'type': {'keyType': 'string',
# 'type': 'map',
# 'valueContainsNull': True,
# 'valueType': 'string'}},
# {'metadata': {},
# 'name': 'x5',
# 'nullable': True,
# 'type': {'fields': [{'metadata': {},
# 'name': 'x5_1',
# 'nullable': True,
# 'type': 'string'},
# {'metadata': {},
# 'name': 'x5_2',
# 'nullable': True,
# 'type': 'string'}],
# 'type': 'struct'}}],
# 'type': 'struct'}
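# jsonValue() returns a Python dict; the related json() method returns the same
# structure serialised as a JSON string (a short sketch, both are standard
# StructType methods):
import json
schema_str = schema.json()
print(isinstance(schema_str, str))
# True
print(json.loads(schema_str) == schema.jsonValue())
# True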
pprint(df.schema.jsonValue())
# (identical output to schema.jsonValue() above, since df.schema carries the same schema)
# convert the json back to a pyspark schema
schema2 = T.StructType.fromJson(df.schema.jsonValue())
print(schema == schema2)
# True
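# one practical use of the round trip: persist the schema as JSON and restore it
# later, e.g. to enforce a fixed schema when reading files. A minimal sketch
# (json was imported above; the file name 'schema.json' is just an example):
with open('schema.json', 'w') as f:
    f.write(df.schema.json())
with open('schema.json') as f:
    schema3 = T.StructType.fromJson(json.load(f))
print(schema == schema3)
# True
# the restored schema can then be passed to a reader, e.g.
# spark.read.schema(schema3).json('some_data_dir')  # path is hypothetical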