Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karpanGit/1003d96865f520e5997335052d528e2a to your computer and use it in GitHub Desktop.
Save karpanGit/1003d96865f520e5997335052d528e2a to your computer and use it in GitHub Desktop.
pyspark, explode array, explode list, collect_list, collect_set
# simple example: explode array
# NOTE: `spark` is not defined in this snippet; it is assumed to be an existing
# SparkSession (e.g. the one provided by the pyspark shell or a notebook).
import pyspark.sql.functions as F

# Three sample rows: an integer id (x), a label (y) and a list of strings (z).
rows = [
    [1, 'mplah', ['a', 'b', 'c']],
    [2, 'mplah2', ['a2', 'b2', 'c2']],
    [3, 'mplah3', ['a3', 'b3', 'c3', 'd3']],
]
df = spark.createDataFrame(rows, schema=['x', 'y', 'z'])
df.printSchema()

# explode: one output row per element of the array column z.
exploded = F.explode('z').alias('z')
res = df.select('x', exploded)
res.show(truncate=False)

# posexplode: like explode, but emits two columns per element, aliased here
# as z_id (position of the element inside the array) and z (the element).
exploded_with_pos = F.posexplode('z').alias('z_id', 'z')
res = df.select('x', exploded_with_pos)
res.show(truncate=False)
# simple example: explode map
# NOTE: `spark` is not defined in this snippet; it is assumed to be an existing
# SparkSession (e.g. the one provided by the pyspark shell or a notebook).
import pyspark.sql.functions as F

# Three sample rows: an integer id (x) and two strings (y, z).
rows = [
    [1, 'mplah', 'gogo'],
    [2, 'mplah2', 'gogo2'],
    [3, 'mplah3', 'gogo3'],
]
df = spark.createDataFrame(rows, schema=['x', 'y', 'z'])

# Build a single-entry map per row: column y supplies the key, z the value.
single_entry_map = F.create_map('y', 'z').alias('mapped')
res = df.select('x', single_entry_map)
res.printSchema()
# root
#  |-- x: long (nullable = true)
#  |-- mapped: map (nullable = false)
#  |    |-- key: string
#  |    |-- value: string (valueContainsNull = true)
res.show()
# +---+-----------------+
# |  x|           mapped|
# +---+-----------------+
# |  1|  {mplah -> gogo}|
# |  2|{mplah2 -> gogo2}|
# |  3|{mplah3 -> gogo3}|
# +---+-----------------+

# Exploding a map column yields one row per entry, split into the default
# columns `key` and `value`.
map_entries = F.explode('mapped')
res = res.select('x', map_entries)
res.show()
# +---+------+-----+
# |  x|   key|value|
# +---+------+-----+
# |  1| mplah| gogo|
# |  2|mplah2|gogo2|
# |  3|mplah3|gogo3|
# +---+------+-----+
# simple example 2: explode map
# Round trip: two parallel arrays -> map -> exploded (key, value) rows ->
# arrays again -> map again.
# NOTE: `spark` is not defined in this snippet; it is assumed to be an existing
# SparkSession (e.g. the one provided by the pyspark shell or a notebook).
import pyspark.sql.functions as F

# Three sample rows: an integer id (x), an array of keys (y) and an array of
# values (z) of the same length.
rows = [
    [1, ['A', 'B', 'C'], ['a', 'b', 'c']],
    [2, ['A2', 'B2', 'C2'], ['a2', 'b2', 'c2']],
    [3, ['A3', 'B3', 'C3', 'D3'], ['a3', 'b3', 'c3', 'd3']],
]
df = spark.createDataFrame(rows, schema=['x', 'y', 'z'])

# map_from_arrays pairs the i-th element of y (key) with the i-th element of z (value).
res = df.select(F.col('x'), F.map_from_arrays(F.col('y'), F.col('z')).alias('mapped'))
res.show(truncate=False)
# +---+----------------------------------------+
# |x  |mapped                                  |
# +---+----------------------------------------+
# |1  |{A -> a, B -> b, C -> c}                |
# |2  |{A2 -> a2, B2 -> b2, C2 -> c2}          |
# |3  |{A3 -> a3, B3 -> b3, C3 -> c3, D3 -> d3}|
# +---+----------------------------------------+

# Exploding a map column yields one row per entry, in columns `key` and `value`.
res = res.select(F.col('x'), F.explode(F.col('mapped')))
res.show()
# +---+---+-----+
# |  x|key|value|
# +---+---+-----+
# |  1|  A|    a|
# |  1|  B|    b|
# |  1|  C|    c|
# |  2| A2|   a2|
# |  2| B2|   b2|
# |  2| C2|   c2|
# |  3| A3|   a3|
# |  3| B3|   b3|
# |  3| C3|   c3|
# |  3| D3|   d3|
# +---+---+-----+

# collect the results back into an array
# Each (key, value) pair is collected as ONE struct rather than through two
# independent collect_list calls: collect_list drops nulls, so a null map value
# would shorten only the `value` list and shift it against `key`. A struct is
# never null itself, so the pair always stays together.
res = res.groupby(F.col('x')).agg(F.collect_list(F.struct('key', 'value')).alias('entries'))
# Selecting a field of an array-of-structs column gives the array of that field,
# which restores the two parallel array columns.
res = res.select(
    F.col('x'),
    F.col('entries.key').alias('key'),
    F.col('entries.value').alias('value'),
)
res.show()
# (row order after a groupby is not guaranteed)
# +---+----------------+----------------+
# |  x|             key|           value|
# +---+----------------+----------------+
# |  1|       [A, B, C]|       [a, b, c]|
# |  2|    [A2, B2, C2]|    [a2, b2, c2]|
# |  3|[A3, B3, C3, D3]|[a3, b3, c3, d3]|
# +---+----------------+----------------+

# Rebuild the map from the two parallel arrays.
res = res.select(F.col('x'), F.map_from_arrays('key', 'value').alias('mapped'))
res.show(truncate=False)
# +---+----------------------------------------+
# |x  |mapped                                  |
# +---+----------------------------------------+
# |1  |{A -> a, B -> b, C -> c}                |
# |2  |{A2 -> a2, B2 -> b2, C2 -> c2}          |
# |3  |{A3 -> a3, B3 -> b3, C3 -> c3, D3 -> d3}|
# +---+----------------------------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment