karpanGit/pyspark, manually create schema containing complex columns, populate dataframe and extract data.py

## pyspark, manually create schema containing complex columns, populate dataframe and extract data.py
# Hand-build a nested schema (a person row that carries an array of child
# structs), load two example rows into a DataFrame, and inspect the result.

# Fields of a single entry of the 'children' array.
child_fields = [
    StructField('child name', StringType(), nullable=False),
    StructField('child age', LongType(), nullable=False),
]
childSchema = StructType(child_fields)

# Neither the array itself nor any of its elements may be null.
children_type = ArrayType(childSchema, containsNull=False)
schema = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('age', LongType(), nullable=False),
    StructField('children', children_type, nullable=False),
])

# Every child is written as a plain (name, age) tuple; the show() output
# further down displays them as {name, age} structs.
data = [
    ('Panos', 30, [('George', 10), ('Bob', 12)]),
    ('Maria', 30, [('George2', 10), ('Bob2', 12)]),
]
df = spark.createDataFrame(data, schema=schema)

# The printed schema mirrors the definitions above, nullability flags included.
df.printSchema()
# root
#  |-- name: string (nullable = false)
#  |-- age: long (nullable = false)
#  |-- children: array (nullable = false)
#  |    |-- element: struct (containsNull = false)
#  |    |    |-- child name: string (nullable = false)
#  |    |    |-- child age: long (nullable = false)

# truncate=False keeps the complete 'children' value visible.
df.show(truncate=False)
# |name |age|children                   |
# +-----+---+---------------------------+
# |Panos|30 |[{George, 10}, {Bob, 12}]  |
# |Maria|30 |[{George2, 10}, {Bob2, 12}]|
# +-----+---+---------------------------+

# Indexing the array-of-structs column by a field name returns, for each row,
# an array holding that field's value from every child.
child_names = f.col('children')['child name']
df.select(child_names).show()
# |children.child name|
# +-------------------+
# |      [George, Bob]|
# |    [George2, Bob2]|
# +-------------------+
	# Define a nested schema by hand, fill a small example DataFrame with it,
	# and look at the schema and the data.

	# Schema of one element of the 'children' array, built field by field.
	childSchema = (
		StructType()
		.add('child name', StringType(), nullable=False)
		.add('child age', LongType(), nullable=False)
	)

	# Row schema: two scalar columns plus a non-nullable array of child structs
	# whose elements may not be null either.
	schema = (
		StructType()
		.add('name', StringType(), nullable=False)
		.add('age', LongType(), nullable=False)
		.add('children', ArrayType(childSchema, containsNull=False), nullable=False)
	)

	# Two example people; each child is given as a (name, age) tuple.
	data = [
		('Panos', 30, [('George', 10), ('Bob', 12)]),
		('Maria', 30, [('George2', 10), ('Bob2', 12)]),
	]
	df = spark.createDataFrame(data, schema=schema)

	df.printSchema()
	# root
	#  |-- name: string (nullable = false)
	#  |-- age: long (nullable = false)
	#  |-- children: array (nullable = false)
	#  |    |-- element: struct (containsNull = false)
	#  |    |    |-- child name: string (nullable = false)
	#  |    |    |-- child age: long (nullable = false)

	# truncate=False so the full 'children' column is printed.
	df.show(truncate=False)
	# |name |age|children                   |
	# +-----+---+---------------------------+
	# |Panos|30 |[{George, 10}, {Bob, 12}]  |
	# |Maria|30 |[{George2, 10}, {Bob2, 12}]|
	# +-----+---+---------------------------+

	# Selecting a struct field through the array column gives one array of
	# that field's values per row.
	df.select(f.col('children')['child name']).show()
	# |children.child name|
	# +-------------------+
	# |      [George, Bob]|
	# |    [George2, Bob2]|
	# +-------------------+