# korkridake/PySpark_Wide_to_Long.py

## PySpark_Wide_to_Long.py
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql import DataFrame
from typing import Iterable

def melt(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str="variable", value_name: str="value") -> DataFrame:
    """
    Convert :class:`DataFrame` from wide to long format.
    Source: https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe

    Each input row is expanded into one output row per entry of
    ``value_vars``; the output holds the ``id_vars`` columns followed by a
    column with the source column's name and a column with its value.

    :param df: the wide DataFrame to reshape.
    :param id_vars: names of the columns carried over unchanged into every
        output row. Any iterable of names is accepted (list, tuple, ...).
    :param value_vars: names of the columns to unpivot into rows.
    :param var_name: name of the output column holding the source column name.
    :param value_name: name of the output column holding the source value.
    :return: a DataFrame with columns ``[*id_vars, var_name, value_name]``.
    """

    # -------------------------------------------------------------------------------
    # Materialise id_vars once: the signature promises any iterable, but
    # concatenating a tuple/generator with a list below would raise TypeError.
    # -------------------------------------------------------------------------------
    id_cols = list(id_vars)

    # -------------------------------------------------------------------------------
    # Create array<struct<variable: str, value: ...>>
    # -------------------------------------------------------------------------------
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # -------------------------------------------------------------------------------
    # Add to the DataFrame and explode (one row per struct in the array)
    # -------------------------------------------------------------------------------
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # -------------------------------------------------------------------------------
    # Keep the id columns and pull the two struct fields out as top-level columns
    # -------------------------------------------------------------------------------
    cols = id_cols + [
            col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)

# -------------------------------------------------------------------------------
# Let's Implement Wide to Long in Pyspark!
# -------------------------------------------------------------------------------
# NOTE: df_web_browsing_full_test is not created in this file; it must already
# exist in the session (e.g. a notebook) and contain the columns named below.
melt(df_web_browsing_full_test,
     id_vars=['ID_variable'],
     value_vars=['VALUE_variable_1', 'VALUE_variable_2']).show()