# mkaranasou/pyspark_parse_json_and_expand_into_columns.py

## pyspark_parse_json_and_expand_into_columns.py
# Parse a JSON-string column into a structured column and expand its keys into
# top-level DataFrame columns.
#
# Expects the following names to already be in scope:
#   df - a DataFrame holding JSON strings in the column named by `json_col`
#   F  - pyspark.sql.functions
#   T  - pyspark.sql.types
# NOTE(review): `self.spark` suggests this snippet was lifted from a method of
# an object that holds the SparkSession - confirm and adapt when running it
# standalone.
json_col = 'json_col'

# --- Option 1: infer the schema from the data itself -------------------------
# Re-read the raw JSON strings through the JSON reader just to obtain a schema.
schema = self.spark.read.json(
    df.select(json_col).rdd.map(lambda row: row[0])
).schema

# Parse the JSON string with the inferred schema. The result is kept in a
# separate DataFrame so that `df` still holds the raw string for option 2.
inferred_df = df.withColumn(json_col, F.from_json(F.col(json_col), schema))

# access the parsed fields by name
inferred_df.select(F.col(json_col)['some_key']).show()

# --- Option 2: you already know what the JSON looks like ---------------------
# Here it is a flat dict of string keys to float values, so a map type fits.
schema = T.MapType(T.StringType(), T.FloatType())
df = df.withColumn(json_col, F.from_json(F.col(json_col), schema))
df.select(F.col(json_col)['some_key']).show()

# Get all the keys in a list. Only the first row is inspected, so this relies
# on every row sharing the same set of keys.
first_rows = df.select(F.map_keys(json_col)).take(1)
# Guard against an empty DataFrame (no rows) and a null map in the first row.
current_keys = (first_rows[0][0] or []) if first_rows else []

# expand the map entries into columns, one column per key
for k in current_keys:
    df = df.withColumn(k, F.col(json_col)[k])