This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File | |
import java.time.Instant | |
import java.sql.Timestamp | |
import java.util._ | |
import java.security.KeyStore.PasswordProtection | |
import java.security.spec.DSAParameterSpec | |
import java.math.BigInteger | |
import org.slf4j.LoggerFactory | |
import eu.europa.esig.dss.enumerations._ | |
import eu.europa.esig.dss.validation.CommonCertificateVerifier |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def mergePurgeAndMakeListWithoutDuplictes(listOfFoundDuplicates: DataFrame, personList: Dataset[Person]): DataFrame ={ | |
import spark.implicits._ | |
import scala.collection.mutable.ArrayBuffer | |
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any] | |
var unifiedListOfPersons = ArrayBuffer.empty[Person] | |
val multipleDuplicates = listOfFoundDuplicates.filter($"num" > 1) | |
val noDuplicates = listOfFoundDuplicates.filter($"num" === 1) | |
multipleDuplicates.map(row => { | |
try{ | |
unifiedListOfPersons += mergeDuplicatesForAGivenPerson(row.getAs[String]("id"), row.getAs[Set[String]]("duplicates"), personList) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ModelComposer extends Logging{ | |
val operationType = "create" | |
@throws(classOf[ModelComposerException]) | |
def getAllDFs(persons: DataFrame, sourceData: DataFrame): Map[String, DataFrame] = { | |
import spark.implicits._ | |
import com.datastax.driver.core.utils.UUIDs | |
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any] | |
var result: scala.collection.mutable.Map[String, DataFrame] = scala.collection.mutable.Map.empty[String, DataFrame] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def applyModelToAllCombinations(trainedModel: LogisticRegressionModel, allComparableDataset: Dataset[(Person, Person, Vector)]): Dataset[PredictedVector] ={ | |
import spark.implicits._ | |
val getFirst = udf((v: Vector) => v(1)) | |
val predictionsRaw: DataFrame = trainedModel.transform(allComparableDataset) | |
predictionsRaw.select( | |
$"left.old_id".as("id_left"), | |
$"right.old_id".as("id_right"), | |
$"features", | |
getFirst($"probability").as("probability"), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ModelTrainer extends Logging{ | |
def train(trainingDF: DataFrame): (LogisticRegressionModel, Map[String, Double]) ={ | |
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression} | |
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator | |
import spark.implicits._ | |
val SPLIT_FACTOR = configuration.envOrElseConfig("learning.split-factor").toDouble | |
val SPLIT_FEED = configuration.envOrElseConfig("learning.split-seed").toLong | |
val MAX_ITERATIONS = configuration.envOrElseConfig("learning.max-iterations").toInt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MakeTrainingTestDatasets extends Logging{ | |
def evalVector(features: Array[Double]): Boolean ={ | |
import util.control.Breaks._ | |
var result: Boolean = true | |
breakable { | |
for((f,i) <- features.view.zipWithIndex){ | |
i match { | |
// 0-Title |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def createAllCombinationsDataset(personDFFromSource: Dataset[Person]): Dataset[(Person, Person, Vector)] ={ | |
import spark.implicits._ | |
import com.fts.cp.etl.model.Person | |
import org.apache.spark.ml.linalg.{Vector, Vectors} | |
val joinPostcodeCondition = when( | |
$"l.postcode".isNotNull && $"r.postcode".isNotNull, | |
$"l.postcode" === $"r.postcode" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
object TaxCalculator { | |
private val taxWindows = Seq( | |
TaxForWindow(1, 5070, 10), | |
TaxForWindow(5071, 8660, 14), | |
TaxForWindow(8661, 14070, 23), | |
TaxForWindow(14071, 21240, 30), | |
TaxForWindow(21241, 40230, 33), | |
TaxForWindow(40231, Int.MaxValue, 45)) | |
def calculate(salary: Double): Double = { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
object Maps { | |
val colors = Map("red" -> 0xFF0000, | |
"turquoise" -> 0x00FFFF, | |
"black" -> 0x000000, | |
"orange" -> 0xFF8040, | |
"brown" -> 0x804000) | |
def main(args: Array[String]) { | |
for (name <- args) println( | |
colors.get(name) match { | |
case Some(code) => |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fit a logistic regression on `data` and display the model's output for the same rows.
// `sqlContext` and `data` are defined outside this fragment.

// Build a DataFrame from `data` and name its two columns "label" and "features".
val df = sqlContext.createDataFrame(data).toDF("label", "features") | |
// Estimator capped at 10 optimisation iterations; all other settings are left at their defaults.
val lr = new LogisticRegression().setMaxIter(10) | |
// Train on the full DataFrame (no train/test split is made in the visible lines).
val model = lr.fit(df) | |
// Fitted weights; not read again in the visible lines.
// NOTE(review): newer Spark releases expose this as `coefficients` and deprecate `weights` — confirm against the Spark version in use.
val weights = model.weights | |
// Apply the fitted model back to the training DataFrame and show the resulting rows.
model.transform(df).show() |