Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karpanGit/29766fadb4188521f7fb1638f3db1caf to your computer and use it in GitHub Desktop.
Save karpanGit/29766fadb4188521f7fb1638f3db1caf to your computer and use it in GitHub Desktop.
pyspark, extract data from structs with scalars and structs with arrays
# extract data from array of structs, nested
import json

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
# Reuse the active session (e.g. inside the pyspark shell) or start one, so
# the example also runs as a standalone script.
spark = SparkSession.builder.getOrCreate()

# Schema: `name` is an array of structs; the remaining columns are scalars.
structureSchema = StructType([
    StructField('name', ArrayType(StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ]))),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# Each row is (name, id, gender, salary); `name` is a list of
# (firstname, middlename, lastname) tuples. `id` is declared as a string in
# the schema, so it is supplied as a string here.
data = [
    ([('James', '', 'Smith'), ('James2', '', 'Smith2')], '1', 'M', 10_000),
    ([('James3', 'mplah', 'Smith3')], '1', 'M', 20_000),
]

df = spark.createDataFrame(data, schema=structureSchema)
df.printSchema()
# root
#  |-- name: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- firstname: string (nullable = true)
#  |    |    |-- middlename: string (nullable = true)
#  |    |    |-- lastname: string (nullable = true)
#  |-- id: string (nullable = true)
#  |-- gender: string (nullable = true)
#  |-- salary: integer (nullable = true)
df.show()

# Indexing an array-of-structs column by a field name pulls that field out of
# every element, giving an array of the field's values per row.
res = df.select(f.col('name')['firstname'])
res.printSchema()
# root
#  |-- name.firstname: array (nullable = true)
#  |    |-- element: string (containsNull = true)
res.show()
# +---------------+
# | name.firstname|
# +---------------+
# |[James, James2]|
# |       [James3]|
# +---------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment