Skip to content

Instantly share code, notes, and snippets.

Why is this so hard to remember?

from pyspark.sql.types import StructType, StructField, StringType

# Sample data: two rows, each a pair of plain strings.
# NOTE(review): `sc` is not defined in this file — presumably the SparkContext
# provided by the PySpark shell/notebook; confirm before running standalone.
rdd = sc.parallelize(
    [
        ("moo this has stopwords b", "bat this one does not"),
        ("apple orange banana", "cookie jar bla la"),
    ]
)

schema = StructType([StructField('entity', StringType(), True),
 StructField('cleaned_entity', StringType(), True),