Skip to content

Instantly share code, notes, and snippets.

@morganmcg1
Last active July 10, 2023 03:45
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save morganmcg1/15a9de711b9c5e8e1bd142b4be80252d to your computer and use it in GitHub Desktop.
Save morganmcg1/15a9de711b9c5e8e1bd142b4be80252d to your computer and use it in GitHub Desktop.
PySpark - Normalize (Standardize) train and test dataframes: [ (x - mean) / std_dev ]
# Based on the solution here: https://stackoverflow.com/questions/44580644/subtract-mean-from-pyspark-dataframe
# Function to normalise (standardise) PySpark dataframes
def standardize_train_test_data(train_df, test_df, columns):
    '''
    Add standardised ("<column>_norm") columns to a train and a test dataframe.

    formula = [(X - mean) / std_dev]

    The mean and standard deviation are computed from train_df only and are
    then applied to both dataframes, so the test data is scaled with the
    training statistics.

    Inputs  : train_df - training dataframe the statistics are computed from
              test_df  - test dataframe, scaled with the training statistics
              columns  - a column name string, or an iterable of column name
                         strings, to be standardised
    Returns : tuple (train_df, test_df, averages, std_devs)
              train_df / test_df - input dataframes with one extra
                                   "<column>_norm" column per requested column
              averages - first collected row of the mean aggregation,
                         indexed by "<column>"
              std_devs - first collected row of the std-dev aggregation,
                         indexed by "<column>_stddev"

    A column whose training standard deviation is exactly 0 (constant column)
    is centred but not scaled (divisor 1.0) instead of being divided by zero.
    The returned std_devs still reports the raw 0 value.

    NOTE(review): `mean` and `stddev` must already be in scope; presumably
    they come from pyspark.sql.functions - confirm against the file's imports.
    '''
    # A bare string would be iterated character by character, and a generator
    # would be exhausted by the first pass below, leaving nothing to normalise.
    columns = [columns] if isinstance(columns, str) else list(columns)

    # Find the Mean and the Standard Deviation for each column (training data only)
    mean_exprs = [mean(train_df[name]).alias(name) for name in columns]
    std_exprs = [stddev(train_df[name]).alias(name + '_stddev') for name in columns]
    averages = train_df.agg(*mean_exprs).collect()[0]
    std_devs = train_df.agg(*std_exprs).collect()[0]

    # Standardise each dataframe, column by column
    for name in columns:
        centre = averages[name]
        scale = std_devs[name + '_stddev']
        # Constant training column: avoid dividing by zero, keep the centred value.
        # A missing (None) or NaN std-dev is deliberately left untouched.
        if scale == 0:
            scale = 1.0
        norm_name = name + '_norm'
        # Standardise the TRAINING data
        train_df = train_df.withColumn(norm_name, (train_df[name] - centre) / scale)
        # Standardise the TEST data (using the training mean and std_dev)
        test_df = test_df.withColumn(norm_name, (test_df[name] - centre) / scale)
    return train_df, test_df, averages, std_devs
# Original Stack Overflow function (see the link above), kept for reference:
# def normalize(df, columns):
#     aggExpr = []
#     for column in columns:
#         aggExpr.append(mean(df[column]).alias(column))
#     averages = df.agg(*aggExpr).collect()[0]
#     selectExpr = []
#     for column in columns:
#         selectExpr.append(df[column] - averages[column])
#     return df.select(selectExpr)
@j-c-cook
Copy link

+1

@ajaverett
Copy link

I love you

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment