@thangarajan8
Created July 18, 2024 11:43
delete.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, when
# Initialize Spark session (getOrCreate() reuses an existing session, e.g. in a Databricks notebook)
spark = SparkSession.builder \
    .appName("Sales Analysis") \
    .getOrCreate()
# Sample data
data = [
    (100, 'a', 'b', 'c', 's1', 'p1'),
    (200, 'a', 'b', 'c', 's2', 'p1'),
    (300, 'a', 'b', 'c', 's3', 'p2'),
    (100, 'd', 'e', 'f', 's4', 'p2'),
    (100, 'd', 'e', 'f', 's5', 'p3'),
    (100, 'd', 'e', 'f', 's6', 'p4')
]
# Define schema
schema = ["sales", "region", "country", "city", "stage", "pincode"]
# Create DataFrame
df = spark.createDataFrame(data, schema=schema)
def create_new_column(df, filter_cols, filter_vals, agg_col, new_col):
"""
Function to filter, aggregate, and create/update a new column based on conditions in PySpark DataFrame.
Args:
- df (DataFrame): Input PySpark DataFrame.
- filter_cols (list of str): List of column names for filtering.
- filter_vals (list or tuple): List or tuple of values corresponding to filter_cols.
- agg_col (str): Column to aggregate.
- new_col (str): Name of the new column to be created or updated.
Returns:
- DataFrame: DataFrame with the updated/new column added based on the conditions.
"""
# Construct filter conditions
filter_condition = None
for col_name, col_val in zip(filter_cols, filter_vals):
if filter_condition is None:
filter_condition = col(col_name) == col_val
else:
filter_condition = filter_condition & (col(col_name) == col_val)
# Filter the data
filtered_data = df.filter(filter_condition)
filtered_data.show()
# Group by filter columns and aggregate
grouped_data = filtered_data.groupBy(*filter_cols) \
.agg(spark_sum(agg_col).alias('total_sales'))
# Set the new column based on the condition
result = filtered_data.join(grouped_data, filter_cols, 'left_outer') \
.withColumn(new_col, when(col('total_sales') > 500, 'OK').otherwise('Not OK')) \
.drop('total_sales')
return result
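

# An equivalent, more compact way to build the same conjunction of equality checks.
# This is an optional sketch (not part of the original gist); the helper name
# build_filter_condition is hypothetical and the behavior matches the loop above.
from functools import reduce
from operator import and_

def build_filter_condition(filter_cols, filter_vals):
    # Chain the per-column equality Columns with `&` via reduce
    return reduce(and_, [col(c) == v for c, v in zip(filter_cols, filter_vals)])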
# Example usage: filter on region='a', country='b', city='c', pincode='p1',
# aggregate sales, and flag the rows based on whether the total exceeds 500
fil = ['a', 'b', 'c', 'p1']
# fil1 = ['d', 'e', 'f', '']
filtered_result = create_new_column(df, ['region', 'country', 'city', 'pincode'], fil, 'sales', 'new_column')
# Show the final result
filtered_result.show()
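
# Additional example (not in the original gist): filtering on region/country/city only.
# The matching rows total 100 + 200 + 300 = 600 in sales, which is > 500, so new_column is 'OK'.
ok_result = create_new_column(df, ['region', 'country', 'city'], ['a', 'b', 'c'], 'sales', 'new_column')
ok_result.show()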