# knu2xs/get_pyspark_schema.py

## get_pyspark_schema.py
import re

from pyspark.sql import DataFrame

def get_pyspark_dataframe_schema(df: DataFrame) -> str:
    """Output the DataFrame schema as source text for constructing a new PySpark DataFrame.

    The text is assembled from each field's ``name``, ``dataType`` and
    ``nullable`` attributes rather than by editing the field's string form
    with whole-string replacements. Column names are therefore emitted
    verbatim: a name containing ``true``/``false``, a name containing a data
    type name, or a name that is a prefix of another column's name cannot
    disturb the rest of the output.

    Args:
        df: Object whose ``schema`` attribute iterates over field objects
            exposing ``name``, ``dataType`` and ``nullable``.

    Returns:
        A string of the form ``StructType([\\n    StructField("name",Type(),True),\\n])``
        with one four-space-indented ``StructField(...)`` line per field.
        An empty schema yields ``StructType([\\n])``.
    """

    # characters to include for tab, since python, using four spaces
    tab = '    '

    def quote_name(name) -> str:
        # escape backslashes first so the escapes added for quotes are not doubled
        escaped = str(name).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    def render_type(data_type) -> str:
        # container types are recognised by class name and rebuilt from their
        # attributes so nested element/field types are rendered recursively
        kind = type(data_type).__name__

        if kind == 'StructType':
            inner = ','.join(render_field(fld) for fld in data_type.fields)
            return f'StructType([{inner}])'

        if kind == 'ArrayType':
            element = render_type(data_type.elementType)
            return f'ArrayType({element},{bool(data_type.containsNull)})'

        if kind == 'MapType':
            key = render_type(data_type.keyType)
            value = render_type(data_type.valueType)
            return f'MapType({key},{value},{bool(data_type.valueContainsNull)})'

        # any other type: its repr is used as-is when it already ends with a
        # call (e.g. "DecimalType(10,0)"); a bare class name gets "()" appended
        # NOTE(review): whether repr includes "()" appears to depend on the
        # PySpark version - confirm against the versions you support
        text = repr(data_type)
        return text if text.endswith(')') else f'{text}()'

    def render_field(field) -> str:
        # bool() gives Python's capitalised True/False regardless of input form
        return (
            f'StructField({quote_name(field.name)},'
            f'{render_type(field.dataType)},{bool(field.nullable)})'
        )

    # one indented line per column, each terminated with a trailing comma
    rows = ''.join(f'\n{tab}{render_field(field)},' for field in df.schema)

    # close the list and struct type
    return f'StructType([{rows}\n])'


def get_pyspark_dataframe_schema_tree(df: DataFrame) -> str:
    """Return the schema tree text, i.e. the string output from `DataFrame.printSchema()`.

    Args:
        df: Object exposing ``_jdf``, whose ``schema()`` result provides
            ``treeString()``.

    Returns:
        Whatever ``df._jdf.schema().treeString()`` returns.
    """

    # fetch the schema object from the underlying ``_jdf`` handle first,
    # then ask it for its tree rendering
    jvm_schema = df._jdf.schema()
    return jvm_schema.treeString()
	import re

	from pyspark.sql import DataFrame

	def get_pyspark_dataframe_schema(df: DataFrame) -> str:
	    """Output the DataFrame schema as source text for constructing a new PySpark DataFrame.

	    The text is assembled from each field's ``name``, ``dataType`` and
	    ``nullable`` attributes rather than by editing the field's string form
	    with whole-string replacements. Column names are therefore emitted
	    verbatim: a name containing ``true``/``false``, a name containing a data
	    type name, or a name that is a prefix of another column's name cannot
	    disturb the rest of the output.

	    Args:
	        df: Object whose ``schema`` attribute iterates over field objects
	            exposing ``name``, ``dataType`` and ``nullable``.

	    Returns:
	        A string of the form ``StructType([\\n    StructField("name",Type(),True),\\n])``
	        with one four-space-indented ``StructField(...)`` line per field.
	        An empty schema yields ``StructType([\\n])``.
	    """

	    # characters to include for tab, since python, using four spaces
	    tab = '    '

	    def quote_name(name) -> str:
	        # escape backslashes first so the escapes added for quotes are not doubled
	        escaped = str(name).replace('\\', '\\\\').replace('"', '\\"')
	        return f'"{escaped}"'

	    def render_type(data_type) -> str:
	        # container types are recognised by class name and rebuilt from their
	        # attributes so nested element/field types are rendered recursively
	        kind = type(data_type).__name__

	        if kind == 'StructType':
	            inner = ','.join(render_field(fld) for fld in data_type.fields)
	            return f'StructType([{inner}])'

	        if kind == 'ArrayType':
	            element = render_type(data_type.elementType)
	            return f'ArrayType({element},{bool(data_type.containsNull)})'

	        if kind == 'MapType':
	            key = render_type(data_type.keyType)
	            value = render_type(data_type.valueType)
	            return f'MapType({key},{value},{bool(data_type.valueContainsNull)})'

	        # any other type: its repr is used as-is when it already ends with a
	        # call (e.g. "DecimalType(10,0)"); a bare class name gets "()" appended
	        # NOTE(review): whether repr includes "()" appears to depend on the
	        # PySpark version - confirm against the versions you support
	        text = repr(data_type)
	        return text if text.endswith(')') else f'{text}()'

	    def render_field(field) -> str:
	        # bool() gives Python's capitalised True/False regardless of input form
	        return (
	            f'StructField({quote_name(field.name)},'
	            f'{render_type(field.dataType)},{bool(field.nullable)})'
	        )

	    # one indented line per column, each terminated with a trailing comma
	    rows = ''.join(f'\n{tab}{render_field(field)},' for field in df.schema)

	    # close the list and struct type
	    return f'StructType([{rows}\n])'


	def get_pyspark_dataframe_schema_tree(df: DataFrame) -> str:
	    """Return the schema tree text, i.e. the string output from `DataFrame.printSchema()`.

	    Args:
	        df: Object exposing ``_jdf``, whose ``schema()`` result provides
	            ``treeString()``.

	    Returns:
	        Whatever ``df._jdf.schema().treeString()`` returns.
	    """

	    # fetch the schema object from the underlying ``_jdf`` handle first,
	    # then ask it for its tree rendering
	    jvm_schema = df._jdf.schema()
	    return jvm_schema.treeString()