Skip to content

Instantly share code, notes, and snippets.

@jesusjavierdediego
Last active February 27, 2019 22:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jesusjavierdediego/b3d43d47a9b108201f55da3f1e9c9041 to your computer and use it in GitHub Desktop.
Save jesusjavierdediego/b3d43d47a9b108201f55da3f1e9c9041 to your computer and use it in GitHub Desktop.
/**
  * Labels pairwise comparison vectors with a rule-based heuristic so they can
  * be used as training/test data.
  *
  * Feature positions, as named by the original inline comments:
  * 0-Title, 1-Forename, 2-Middlename (ignored), 3-Familyname,
  * 4-Secondfamilyname (ignored), 5-DOB, 6-Nationality, 7-Postcode.
  *
  * NOTE(review): the feature values look like distances (0.0 = identical,
  * smaller = more similar) -- confirm against the code producing the vectors.
  */
class MakeTrainingTestDatasets extends Logging {

  /**
    * Decides whether a single feature value is compatible with a match.
    *
    * @param index position of the feature inside the comparison vector
    * @param value feature value at that position
    * @return `true` when the feature passes its rule or is not evaluated at all
    */
  private def featureAccepted(index: Int, value: Double): Boolean = index match {
    // Thresholded features: accepted only when strictly below the threshold.
    // NaN fails the comparison and is therefore rejected.
    case 0 => value < 0.5  // 0-Title
    case 1 => value < 0.2  // 1-Forename
    case 3 => value < 0.15 // 3-Familyname
    // 5-DOB, 6-Nationality, 7-Postcode: only an exact 0.0 is accepted.
    case 5 | 6 | 7 => value == 0.0
    // 2-Middlename, 4-Secondfamilyname and any position above 7 are not evaluated.
    case _ =>
      logger.warn("Ignored feature will not be evaluated")
      true
  }

  /**
    * Evaluates a comparison vector against the per-feature rules.
    *
    * Features are checked in index order and evaluation stops at the first
    * rejected feature, so later positions are neither checked nor logged.
    * An empty array yields `true`.
    *
    * @param features feature values, indexed by feature position
    * @return `true` when no evaluated feature is rejected, `false` otherwise
    */
  def evalVector(features: Array[Double]): Boolean =
    features.indices.forall(i => featureAccepted(i, features(i)))

  /**
    * Builds a labelled data frame from a dataset of compared pairs.
    *
    * Each row becomes a `LabeledVector` carrying both `old_id` values as
    * strings, the untouched comparison vector, and a label of 1.0 when
    * [[evalVector]] accepts the vector or 0.0 when it does not.
    *
    * NOTE(review): `collect()` brings every row back to the caller before
    * labelling; assuming this is a Spark `Dataset`, that means the whole
    * dataset must fit in driver memory -- confirm this is acceptable.
    *
    * @param comparableDataset triples of (left person, right person, comparison vector)
    * @return a data frame created from the labelled rows
    */
  def generateTrainingData(comparableDataset: Dataset[(Person, Person, Vector)]): DataFrame = {
    val labeledRows: Array[LabeledVector] = comparableDataset
      .collect()
      .map { case (left, right, vector) =>
        val label = if (evalVector(vector.toArray)) 1.0 else 0.0
        LabeledVector(left.old_id.toString, right.old_id.toString, vector, label)
      }
    spark.createDataFrame(labeledRows)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment