from pyspark.sql.types import StructType, StructField, StringType
# Sample input: two rows, each a pair of whitespace-separated text strings.
# `sc` is defined outside this snippet (presumably the active SparkContext).
_sample_rows = [
    ("moo this has stopwords b", "bat this one does not"),
    ("apple orange banana", "cookie jar bla la"),
]
rdd = sc.parallelize(_sample_rows)
schema = StructType([StructField('entity', StringType(), True),
StructField('cleaned_entity', StringType(), True),