Skip to content

Instantly share code, notes, and snippets.

@knu2xs
Last active April 17, 2024 22:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save knu2xs/5e1658d4c374d050db8fcea7ed761510 to your computer and use it in GitHub Desktop.
Save knu2xs/5e1658d4c374d050db8fcea7ed761510 to your computer and use it in GitHub Desktop.
Get PySpark Schema
import re
from pyspark.sql import DataFrame
def get_pyspark_dataframe_schema(df: DataFrame) -> str:
    """Render the DataFrame schema as pasteable ``StructType([...])`` source text.

    Each top-level field of ``df.schema`` is converted with ``str()`` and then
    cleaned up so the result can be used when constructing a new PySpark
    DataFrame:

    * unquoted column names are wrapped in double quotes,
    * lowercase ``true`` / ``false`` flags become ``True`` / ``False``,
    * bare type names such as ``StringType`` gain call parentheses.

    The rewrites target the compact text form ``StructField(name,StringType,true)``.
    Field text that already has a quoted name, called types and capitalized
    booleans passes through unchanged.

    Args:
        df: Object exposing an iterable ``schema`` of fields whose ``str()`` is
            the field's text form (normally a PySpark ``DataFrame``).

    Returns:
        A multi-line string: ``StructType([``, one indented field per line
        (each followed by a comma), then ``])``.

    Note:
        Column names containing commas or double quotes cannot be recovered
        reliably from the text form and may still need manual correction.
        NOTE(review): nested struct columns may also need a manual touch-up,
        depending on how the installed PySpark version renders them - confirm.
    """
    # four spaces per indent level, matching Python convention
    indent = '    '

    def _quote_name(match):
        """Wrap the captured column name in double quotes unless already quoted."""
        name = match.group(1)
        if name[:1] in ('"', "'"):
            return match.group(0)
        return f'StructField("{name}",'

    lines = ['StructType([']

    # Fix up every field on its own and in a single regex pass per rule, so a
    # rewrite for one column can never spill into another column (e.g. column
    # "a" being a prefix of column "ab").
    for col in df.schema:
        field_str = str(col)

        # quote column names first so later rules cannot touch their contents
        field_str = re.sub(r'StructField\((.*?),', _quote_name, field_str)

        # only capitalize booleans sitting in a trailing flag position, i.e.
        # ",true)" / ",false)", never text inside a column name
        field_str = re.sub(
            r',(true|false)\)',
            lambda m: f',{m.group(1).capitalize()})',
            field_str,
        )

        # instantiate bare data types (a type name directly followed by a comma)
        field_str = re.sub(r'(\w*Type),', r'\1(),', field_str)

        lines.append(f'{indent}{field_str},')

    # close the list and the struct type
    lines.append('])')

    return '\n'.join(lines)
def get_pyspark_dataframe_schema_tree(df: DataFrame) -> str:
    """Return the schema tree text, i.e. what `DataFrame.printSchema()` outputs.

    Args:
        df: DataFrame whose schema should be rendered.

    Returns:
        The string produced by ``treeString()`` on the schema of the
        DataFrame's underlying ``_jdf`` object.
    """
    # go through the wrapped `_jdf` object to reach its schema
    wrapped_schema = df._jdf.schema()

    # the schema object renders its own tree representation
    return wrapped_schema.treeString()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment