Last active
April 17, 2024 22:07
-
-
Save knu2xs/5e1658d4c374d050db8fcea7ed761510 to your computer and use it in GitHub Desktop.
Get PySpark Schema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from pyspark.sql import DataFrame | |
def _pyspark_data_type_source(data_type) -> str:
    """Return Python source text that recreates ``data_type``.

    Container types are recognised by the attributes they expose and are
    rebuilt recursively; every other type falls back to ``repr(data_type)``.
    """
    # struct-like: exposes a ``fields`` collection of field objects
    if hasattr(data_type, "fields"):
        inner = ", ".join(_pyspark_struct_field_source(fld) for fld in data_type.fields)
        return f"StructType([{inner}])"

    # array-like: exposes ``elementType`` and ``containsNull``
    if hasattr(data_type, "elementType"):
        element = _pyspark_data_type_source(data_type.elementType)
        return f"ArrayType({element}, {bool(data_type.containsNull)})"

    # map-like: exposes ``keyType``, ``valueType`` and ``valueContainsNull``
    if hasattr(data_type, "keyType") and hasattr(data_type, "valueType"):
        key = _pyspark_data_type_source(data_type.keyType)
        value = _pyspark_data_type_source(data_type.valueType)
        return f"MapType({key}, {value}, {bool(data_type.valueContainsNull)})"

    # leaf type: a bare class name (e.g. ``StringType``) needs call parentheses,
    # while a repr that already carries arguments or parentheses is kept as is
    text = repr(data_type)
    return text if "(" in text else f"{text}()"


def _pyspark_struct_field_source(field) -> str:
    """Return Python source text for one ``StructField(name, type, nullable)``."""
    # escape backslashes and double quotes so the emitted literal stays valid
    name = field.name.replace("\\", "\\\\").replace('"', '\\"')
    data_type = _pyspark_data_type_source(field.dataType)
    return f'StructField("{name}", {data_type}, {bool(field.nullable)})'


def get_pyspark_dataframe_schema(df: DataFrame) -> str:
    """Output the DataFrame schema to easily be included when constructing a new PySpark DataFrame.

    The text is built from each field's ``name``, ``dataType`` and ``nullable``
    attributes instead of rewriting the field's string form with global search
    and replace. Column names are therefore never altered, even when they
    contain ``true``/``false``, a data type name, or another column's name as
    a prefix.

    Args:
        df: DataFrame whose ``schema`` is iterated field by field.

    Returns:
        Source text of the form ``StructType([...])`` with one
        ``StructField("name", Type(), True|False),`` entry per line, each
        indented by four spaces. An empty schema yields ``StructType([`` and
        ``])`` on two lines.
    """
    # characters to include for tab, since python, using four spaces
    tab = "    "

    # open the struct, add one line per column, then close the list and struct type
    lines = ["StructType(["]
    lines.extend(f"{tab}{_pyspark_struct_field_source(field)}," for field in df.schema)
    lines.append("])")

    return "\n".join(lines)
def get_pyspark_dataframe_schema_tree(df: DataFrame) -> str:
    """Get the string output from `DataFrame.printSchema()`.

    The tree is read from the JVM-side schema through ``df._jdf`` when the
    DataFrame exposes that handle. If accessing ``_jdf`` raises
    ``AttributeError``, the function falls back to ``df.schema.treeString()``.

    Args:
        df: DataFrame whose schema tree is rendered.

    Returns:
        The schema tree as a single string.

    Raises:
        AttributeError: If ``_jdf`` is unavailable and ``df.schema`` has no
            ``treeString`` method.
    """
    # keep the try body to the attribute access only, so errors raised while
    # rendering the tree are not mistaken for a missing JVM handle
    try:
        jvm_frame = df._jdf
    except AttributeError:
        # NOTE(review): presumably the Spark Connect case; relies on the
        # installed PySpark providing StructType.treeString — confirm
        return df.schema.treeString()

    return jvm_frame.schema().treeString()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment