Skip to content

Instantly share code, notes, and snippets.

@TomLous
Last active April 25, 2017 09:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TomLous/7f0c89c46eaed1b3a35a6c5449a7fc74 to your computer and use it in GitHub Desktop.
Save TomLous/7f0c89c46eaed1b3a35a6c5449a7fc74 to your computer and use it in GitHub Desktop.
// Tabulator: see http://stackoverflow.com/questions/7539831/scala-draw-table-to-console
/** Lists the values produced by `KvKRecord.unapply` for the given record.
  *
  * The extractor result is walked with `productIterator`, so the values come
  * back in the order the extracted product exposes them (presumably the
  * record's fields in declaration order — confirm against `KvKRecord`).
  *
  * @param kvKRecord the record to take apart
  * @return the extracted values, or `Nil` when `KvKRecord.unapply` yields nothing
  */
def propertyList(kvKRecord: KvKRecord): List[Any] =
  KvKRecord.unapply(kvKRecord).fold(List.empty[Any])(_.productIterator.toList)
// Interactive labeling session: sampled record pairs are printed side by side
// and the operator's answer is stored as a label for each pair.
val labeledList: ArrayBuffer[LabeledVector] = ArrayBuffer()

breakable {
  // Sample without replacement and collect the result before prompting starts.
  val sampledPairs = comparableDataset
    .sample(withReplacement = false, Config.sampleFactor)
    .collect()

  sampledPairs.foreach { case (left, right, vector) =>
    // One table row per property position: the left value next to the right value.
    val leftProps = propertyList(left)
    val rightProps = propertyList(right)
    val rows = leftProps.zip(rightProps).map { case (l, r) => List(l, r) }

    println(Tabulator.format(rows))
    println("Same? [y/n]")

    // 'y' labels the pair 1.0 and 'n' labels it 0.0; any other character, or a
    // failed read, leaves the loop and keeps the labels gathered so far.
    val label: Double = Try(scala.io.StdIn.readChar()) match {
      case Success('y') => 1.0
      case Success('n') => 0.0
      case _            => break()
    }

    labeledList += LabeledVector(left.dossierNummer, right.dossierNummer, vector, label)
  }
}

// Append whatever was labeled in this session to the parquet data at `path`.
spark
  .createDataFrame(labeledList.toList)
  .write
  .mode(SaveMode.Append)
  .parquet(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment