Skip to content

Instantly share code, notes, and snippets.

@bmc
Last active October 2, 2018 12:18
Show Gist options
  • Save bmc/5aeab84166596f8b6afb0e2fefc8eb88 to your computer and use it in GitHub Desktop.
Save bmc/5aeab84166596f8b6afb0e2fefc8eb88 to your computer and use it in GitHub Desktop.
Patch a printSchemaAsCode() method into Spark DataFrame, in Python and Scala
from pyspark.sql.types import *
from pyspark.sql import DataFrame
def schema_as_code(schema, indentation=4):
    '''
    Render a StructType as Python source code that rebuilds it.

    schema      -- the StructType to render
    indentation -- number of spaces added per nesting level

    Returns the generated code as a string. Nested structs, including
    structs used as array elements or map keys/values, are expanded
    recursively.
    '''
    def quoted(name):
        '''
        Wrap a field name in double quotes, escaping backslashes and
        embedded double quotes so the result is a valid string literal.
        '''
        return '"{0}"'.format(name.replace('\\', '\\\\').replace('"', '\\"'))

    def type_as_code(data_type, indentationLevel):
        '''
        Convert a data type to the code that constructs it.
        '''
        if isinstance(data_type, StructType):
            return pretty_struct(data_type, indentationLevel)
        if isinstance(data_type, ArrayType):
            return 'ArrayType({0}, {1})'.format(
                type_as_code(data_type.elementType, indentationLevel),
                data_type.containsNull)
        if isinstance(data_type, MapType):
            return 'MapType({0}, {1}, {2})'.format(
                type_as_code(data_type.keyType, indentationLevel),
                type_as_code(data_type.valueType, indentationLevel),
                data_type.valueContainsNull)
        # Only append "()" when the string form is a bare type name. A form
        # that already ends in ")" (for example a parameterized type such as
        # a decimal) is used as is, so we never emit "Type(...)()".
        type_text = str(data_type)
        return type_text if type_text.endswith(')') else type_text + '()'

    def str_field(field, indentationLevel):
        '''
        Convert a field to string representation.
        '''
        field_indent_spaces = " " * ((indentationLevel + 1) * indentation)
        return '{0}StructField({1}, {2}, {3})'.format(
            field_indent_spaces,
            quoted(field.name),
            type_as_code(field.dataType, indentationLevel + 1),
            field.nullable)

    def pretty_struct(st, indentationLevel):
        '''
        Convert a struct to string representation, one field per line.
        '''
        # The opening "StructType([" is not indented: a nested struct is
        # emitted inline, right after its enclosing 'StructField("name", '.
        # Only the closing bracket goes on its own line and needs padding.
        indent_spaces = " " * (indentationLevel * indentation)
        field_strings = [str_field(field, indentationLevel) for field in st.fields]
        fields = ',\n'.join(field_strings)
        return 'StructType([\n{0}\n{1}])'.format(fields, indent_spaces)

    return pretty_struct(schema, 0)
def print_schema_as_code(self):
    '''
    Print the code form of this object's schema, as produced by
    schema_as_code(), to standard output.
    '''
    code = schema_as_code(self.schema)
    print(code)


# Attach the function to DataFrame so it can be called as
# df.printSchemaAsCode().
setattr(DataFrame, 'printSchemaAsCode', print_schema_as_code)
import org.apache.spark.sql.types._
import org.apache.spark.sql._
/** Helpers for rendering a Spark schema as Scala source code, plus an
  * implicit class that adds `printSchemaAsCode()` to `DataFrame`.
  */
object Implicits {

  /** Renders a schema as Scala source code that rebuilds it.
    *
    * Nested structs, including structs used as array elements or map
    * keys/values, are expanded recursively with one field per line.
    *
    * @param schema      the schema to render
    * @param indentation number of spaces added per nesting level
    * @return the generated code
    */
  def schemaAsCode(schema: StructType, indentation: Int = 2): String = {

    // Leading whitespace for a line nested `level` steps deep.
    def pad(level: Int): String = " " * (level * indentation)

    // Double-quoted string literal for a field name. Backslashes are escaped
    // first so the backslashes added for the quotes are not escaped again.
    def quoted(name: String): String =
      "\"" + name.replace("\\", "\\\\").replace("\"", "\\\"") + "\""

    // Code for any data type. Containers are handled explicitly so that a
    // struct inside an array or map is expanded instead of being rendered
    // through toString.
    def typeAsCode(dataType: DataType, level: Int): String = dataType match {
      case st: StructType =>
        prettyStruct(st, level)
      case ArrayType(elementType, containsNull) =>
        s"ArrayType(${typeAsCode(elementType, level)}, $containsNull)"
      case MapType(keyType, valueType, valueContainsNull) =>
        val key = typeAsCode(keyType, level)
        val value = typeAsCode(valueType, level)
        s"MapType($key, $value, $valueContainsNull)"
      case other =>
        // Every remaining type is rendered through toString, as before.
        other.toString
    }

    def prettyStruct(st: StructType, indentationLevel: Int): String = {
      val fieldStrings = st.fields.map { field =>
        val fieldType = typeAsCode(field.dataType, indentationLevel + 1)
        s"${pad(indentationLevel + 1)}StructField(${quoted(field.name)}, $fieldType, ${field.nullable})"
      }
      val fields = fieldStrings.mkString(",\n")
      // The opening "StructType(List(" is not indented: a nested struct is
      // emitted inline, right after its enclosing StructField prefix. Only
      // the closing parentheses go on their own line and need padding.
      s"StructType(List(\n$fields\n${pad(indentationLevel)}))"
    }

    prettyStruct(schema, 0)
  }

  /** Adds `printSchemaAsCode()` to `DataFrame`. */
  implicit class EnrichedDataFrame(val df: DataFrame) {

    /** Prints the result of `schemaAsCode(df.schema)` to standard output. */
    def printSchemaAsCode(): Unit = {
      println(schemaAsCode(df.schema))
    }
  }
}
import Implicits._
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment