karpanGit/pyspark, extract data from structs with scalars and structs with arrays.py

## pyspark, extract data from structs with scalars and structs with arrays.py

# Extract one field from every struct stored in an array-of-structs column.
# Indexing an array<struct> column with a field name returns an array that
# holds that field's value for each element of the original array.
import json
from pyspark.sql import SparkSession
from pyspark.sql import functions as f  # provides f.col used in the select below
from pyspark.sql.types import *

# Reuse the session already provided by a pyspark shell / notebook, or create
# one so the example also runs as a standalone script.
spark = SparkSession.builder.getOrCreate()

# 'name' is an array whose elements are (firstname, middlename, lastname) structs.
structureSchema = StructType([
    StructField('name', ArrayType(StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ]))),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# Each row: (list of name structs, id, gender, salary).
# NOTE(review): 'id' is declared as StringType but supplied as an int, and both
# rows use the same id -- confirm this is intentional.
data = [
    ([('James', '', 'Smith'), ('James2', '', 'Smith2')], 1, 'M', 10_000),
    ([('James3', 'mplah', 'Smith3')], 1, 'M', 20_000),
]
df = spark.createDataFrame(data, schema=structureSchema)
df.printSchema()
# root
#  |-- name: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- firstname: string (nullable = true)
#  |    |    |-- middlename: string (nullable = true)
#  |    |    |-- lastname: string (nullable = true)
#  |-- id: string (nullable = true)
#  |-- gender: string (nullable = true)
#  |-- salary: integer (nullable = true)
df.show()

# Selecting name['firstname'] pulls 'firstname' out of every struct in the
# array, so the result column is array<string> with one entry per struct.
res = df.select(f.col('name')['firstname'])
res.printSchema()
# root
#  |-- name.firstname: array (nullable = true)
#  |    |-- element: string (containsNull = true)
res.show()
# +---------------+
# | name.firstname|
# +---------------+
# |[James, James2]|
# |       [James3]|
# +---------------+

	# Extract one field from every struct stored in an array-of-structs column.
	# Indexing an array<struct> column with a field name returns an array that
	# holds that field's value for each element of the original array.
	import json
	from pyspark.sql import SparkSession
	from pyspark.sql import functions as f  # provides f.col used in the select below
	from pyspark.sql.types import *

	# Reuse the session already provided by a pyspark shell / notebook, or create
	# one so the example also runs as a standalone script.
	spark = SparkSession.builder.getOrCreate()

	# 'name' is an array whose elements are (firstname, middlename, lastname) structs.
	structureSchema = StructType([
	    StructField('name', ArrayType(StructType([
	        StructField('firstname', StringType(), True),
	        StructField('middlename', StringType(), True),
	        StructField('lastname', StringType(), True),
	    ]))),
	    StructField('id', StringType(), True),
	    StructField('gender', StringType(), True),
	    StructField('salary', IntegerType(), True),
	])

	# Each row: (list of name structs, id, gender, salary).
	# NOTE(review): 'id' is declared as StringType but supplied as an int, and both
	# rows use the same id -- confirm this is intentional.
	data = [
	    ([('James', '', 'Smith'), ('James2', '', 'Smith2')], 1, 'M', 10_000),
	    ([('James3', 'mplah', 'Smith3')], 1, 'M', 20_000),
	]
	df = spark.createDataFrame(data, schema=structureSchema)
	df.printSchema()
	# root
	#  |-- name: array (nullable = true)
	#  |    |-- element: struct (containsNull = true)
	#  |    |    |-- firstname: string (nullable = true)
	#  |    |    |-- middlename: string (nullable = true)
	#  |    |    |-- lastname: string (nullable = true)
	#  |-- id: string (nullable = true)
	#  |-- gender: string (nullable = true)
	#  |-- salary: integer (nullable = true)
	df.show()

	# Selecting name['firstname'] pulls 'firstname' out of every struct in the
	# array, so the result column is array<string> with one entry per struct.
	res = df.select(f.col('name')['firstname'])
	res.printSchema()
	# root
	#  |-- name.firstname: array (nullable = true)
	#  |    |-- element: string (containsNull = true)
	res.show()
	# +---------------+
	# | name.firstname|
	# +---------------+
	# |[James, James2]|
	# |       [James3]|
	# +---------------+