Skip to content

Instantly share code, notes, and snippets.

@nmukerje
Last active October 26, 2020 23:52
Show Gist options
  • Save nmukerje/3db710b0279aa068d847daa6b5400a74 to your computer and use it in GitHub Desktop.
Save nmukerje/3db710b0279aa068d847daa6b5400a74 to your computer and use it in GitHub Desktop.
Pyspark Makes Nested Columns Lower Case and Replaces Hyphens with Underscores.
from pyspark.sql import functions as F
def get_column_wise_schema(df_string_schema, df_columns):
    """Map each column name to its schema string, parsed out of a DataFrame repr.

    Parameters
    ----------
    df_string_schema : str
        The DataFrame's string representation, shaped like
        ``"DataFrame[col1: type1, col2: type2]"``.
    df_columns : list[str]
        Column names, in the same order they appear in *df_string_schema*.

    Returns
    -------
    dict
        ``{column_name: schema_string}`` for every column.

    Note: this is positional text parsing — it assumes no column's
    ``"name: "`` pattern appears inside another column's type string.
    """
    column_schema_dict = {}
    last = len(df_columns) - 1  # hoisted loop invariant
    for i, col in enumerate(df_columns):
        # The first column is preceded by '[', later ones by the ', ' separator
        # (whose space belongs to the start key, comma to the previous end key).
        start_key = ('[' if i == 0 else ' ') + col + ': '
        # The last column's type runs to the closing ']'; otherwise it ends
        # where the next column's ', name: ' begins.
        end_key = ']' if i == last else ', ' + df_columns[i + 1] + ': '
        column_schema_dict[col] = df_string_schema.split(start_key)[1].split(end_key)[0]
    return column_schema_dict
def convert_colnames_to_lower(spark_df):
    """Return *spark_df* with column names lower-cased and '-'/'.' replaced by '_'.

    Works by taking the DataFrame's string repr (``"DataFrame[col: type, ...]"``),
    replacing hyphens and dots with underscores, parsing out each column's
    schema string, and casting every column to that normalised schema while
    aliasing the top-level name to lower case.

    NOTE(review): nested struct field renaming relies on the cast to the
    rewritten schema string; top-level names are lowered explicitly via
    ``alias`` — confirm nested lower-casing against an actual nested schema.
    """
    columns = spark_df.columns
    # str(df) is idiomatic for df.__str__(); normalise '-' and '.' so the
    # extracted schema strings are valid cast targets / field names.
    column_wise_schema_dict = get_column_wise_schema(
        str(spark_df).replace('-', '_').replace('.', '_'), columns)
    print(column_wise_schema_dict)  # debug output, kept from the original
    col_exprs = [
        F.col(name).cast(column_wise_schema_dict[name]).alias(name.lower())
        for name in columns
    ]
    return spark_df.select(*col_exprs)
# Driver: `df` is assumed to be an existing Spark DataFrame defined elsewhere
# (e.g. in the calling notebook) — not visible in this snippet.
converted_df = convert_colnames_to_lower(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment