Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karpanGit/b5c981419be75068faf214002b80c51b to your computer and use it in GitHub Desktop.
Save karpanGit/b5c981419be75068faf214002b80c51b to your computer and use it in GitHub Desktop.
PySpark: manually create a schema containing complex (nested) columns, populate a DataFrame, and extract data from it
# Hand-build a nested schema (person -> array of child structs), load two
# example rows into a DataFrame, and pull one struct field out of the array.
# NOTE(review): StructType/StructField/StringType/LongType/ArrayType, `f` and
# `spark` are not defined in this snippet -- presumably the pyspark.sql.types
# classes, pyspark.sql.functions and an active SparkSession; confirm they are
# in scope before running.

# struct describing a single element of the 'children' array
childSchema = StructType(
    fields=[
        StructField(name='child name', dataType=StringType(), nullable=False),
        StructField(name='child age', dataType=LongType(), nullable=False),
    ]
)

# top-level row: two scalar columns plus the non-nullable array of child structs
schema = StructType(
    fields=[
        StructField(name='name', dataType=StringType(), nullable=False),
        StructField(name='age', dataType=LongType(), nullable=False),
        StructField(
            name='children',
            dataType=ArrayType(elementType=childSchema, containsNull=False),
            nullable=False,
        ),
    ]
)

# each child is given as a plain tuple, in the field order of childSchema
data = [
    ('Panos', 30, [('George', 10), ('Bob', 12)]),
    ('Maria', 30, [('George2', 10), ('Bob2', 12)]),
]

df = spark.createDataFrame(data=data, schema=schema)

df.printSchema()
# root
#  |-- name: string (nullable = false)
#  |-- age: long (nullable = false)
#  |-- children: array (nullable = false)
#  |    |-- element: struct (containsNull = false)
#  |    |    |-- child name: string (nullable = false)
#  |    |    |-- child age: long (nullable = false)

df.show(truncate=False)
# +-----+---+---------------------------+
# |name |age|children                   |
# +-----+---+---------------------------+
# |Panos|30 |[{George, 10}, {Bob, 12}]  |
# |Maria|30 |[{George2, 10}, {Bob2, 12}]|
# +-----+---+---------------------------+

# indexing the array column by a struct field name extracts that field from
# every element, giving one array of child names per row
df.select(f.col('children')['child name']).show()
# +-------------------+
# |children.child name|
# +-------------------+
# |      [George, Bob]|
# |    [George2, Bob2]|
# +-------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment