Skip to content

Instantly share code, notes, and snippets.

Created August 6, 2017 14:04
Show Gist options
  • Save anonymous/ed487233c94cdaaca60b3405c804c48c to your computer and use it in GitHub Desktop.
Save anonymous/ed487233c94cdaaca60b3405c804c48c to your computer and use it in GitHub Desktop.
/**
 * Reads the CSV at `testFile` (using the `csvOptions` defined elsewhere) and
 * converts it into a typed `Dataset[Features]`.
 *
 * The resulting rows carry:
 *  - `id`: the `test_id` column cast to int;
 *  - `qid1` / `qid2`: `String.hashCode` of `question1` / `question2`, where a
 *    null question is hashed as the empty string;
 *  - `question1` / `question2`: passed through unchanged;
 *  - `isDuplicate`: the constant `false`.
 *
 * @param spark    session used to read the file and to supply the encoder
 * @param testFile path of the CSV file to load
 * @return the rows of the file as a `Dataset[Features]`
 */
def loadTestFile(spark: SparkSession, testFile: String): Dataset[Features] = {
  import spark.implicits.newProductEncoder

  // Derives an integer id from the question text; null text hashes like "".
  val questionId = udf { (text: String) =>
    Option(text).map(_.hashCode).getOrElse("".hashCode)
  }

  spark.read
    .options(csvOptions)
    .csv(testFile)
    .selectExpr("cast(test_id as int) id", "question1", "question2")
    // not used by the model, but convenient
    .withColumn("isDuplicate", lit(false))
    .withColumn("qid1", questionId(col("question1")))
    .withColumn("qid2", questionId(col("question2")))
    // Fix the column order before converting to the typed view
    // (presumably mirroring the field order of Features; confirm against its definition).
    .select("id", "qid1", "qid2", "question1", "question2", "isDuplicate")
    .as[Features]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment