Skip to content

Instantly share code, notes, and snippets.

View jesusjavierdediego's full-sized avatar

Jesus de Diego jesusjavierdediego

View GitHub Profile
import java.io.File
import java.time.Instant
import java.sql.Timestamp
import java.util._
import java.security.KeyStore.PasswordProtection
import java.security.spec.DSAParameterSpec
import java.math.BigInteger
import org.slf4j.LoggerFactory
import eu.europa.esig.dss.enumerations._
import eu.europa.esig.dss.validation.CommonCertificateVerifier
// Merges each group of duplicate Person records into a single unified record and
// (presumably) unions them with the non-duplicated rows -- the tail of this
// function is truncated in this view, so the final assembly is not visible.
// NOTE(review): the name contains a typo ("Duplictes" -> "Duplicates"); kept as-is
// because external callers may depend on it.
def mergePurgeAndMakeListWithoutDuplictes(listOfFoundDuplicates: DataFrame, personList: Dataset[Person]): DataFrame ={
import spark.implicits._
import scala.collection.mutable.ArrayBuffer
// Kryo fallback encoder for types Spark has no built-in encoder for.
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any]
var unifiedListOfPersons = ArrayBuffer.empty[Person]
// "num" is assumed to be the per-id duplicate count -- TODO confirm upstream schema.
val multipleDuplicates = listOfFoundDuplicates.filter($"num" > 1)
val noDuplicates = listOfFoundDuplicates.filter($"num" === 1)
// NOTE(review): this lambda runs on the executors, so appending to the
// driver-local ArrayBuffer above mutates executor-side copies only -- the
// accumulated results never reach the driver. Almost certainly a bug; a
// collect() + driver-side loop (or a proper map that RETURNS the merged
// person) was likely intended. Confirm before relying on this output.
multipleDuplicates.map(row => {
try{
// NOTE(review): Row.getAs[Set[String]] is unusual -- Spark typically surfaces
// array columns as Seq/WrappedArray; verify the "duplicates" column type.
unifiedListOfPersons += mergeDuplicatesForAGivenPerson(row.getAs[String]("id"), row.getAs[Set[String]]("duplicates"), personList)
// Composes the output data model: derives a set of named DataFrames from the
// deduplicated persons and the raw source data. (Class body is truncated in
// this view; only the method header and initial setup are visible.)
class ModelComposer extends Logging{
// Tag written alongside composed records -- presumably a CRUD marker; confirm consumers.
val operationType = "create"
@throws(classOf[ModelComposerException])
// Builds a map of table-name -> DataFrame from the two inputs.
def getAllDFs(persons: DataFrame, sourceData: DataFrame): Map[String, DataFrame] = {
import spark.implicits._
import com.datastax.driver.core.utils.UUIDs
// Kryo fallback encoder for types Spark has no built-in encoder for.
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any]
// Mutable accumulator for the result map; populated in the (truncated) remainder.
var result: scala.collection.mutable.Map[String, DataFrame] = scala.collection.mutable.Map.empty[String, DataFrame]
// Scores every candidate (Person, Person, feature-Vector) pair with the trained
// logistic-regression model and projects the id pair, the features, and the
// match probability. (The select(...) is truncated in this view.)
def applyModelToAllCombinations(trainedModel: LogisticRegressionModel, allComparableDataset: Dataset[(Person, Person, Vector)]): Dataset[PredictedVector] ={
import spark.implicits._
// NOTE(review): despite the name, this extracts element 1 -- the SECOND
// component of the vector. For a binary model, probability(1) is P(positive
// class), which is probably what is wanted, but the name "getFirst" is
// misleading; consider renaming locally when safe.
val getFirst = udf((v: Vector) => v(1))
// transform() appends prediction/probability columns to the input columns.
val predictionsRaw: DataFrame = trainedModel.transform(allComparableDataset)
predictionsRaw.select(
$"left.old_id".as("id_left"),
$"right.old_id".as("id_right"),
$"features",
getFirst($"probability").as("probability"),
// Trains the duplicate-detection classifier. (Class body is truncated in this
// view; only the configuration loading is visible.)
class ModelTrainer extends Logging{
// Trains a binary logistic-regression model on the labelled training DataFrame
// and returns the model together with a map of evaluation metrics.
def train(trainingDF: DataFrame): (LogisticRegressionModel, Map[String, Double]) ={
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import spark.implicits._
// Fraction used for the train/test split -- TODO confirm semantics in config docs.
val SPLIT_FACTOR = configuration.envOrElseConfig("learning.split-factor").toDouble
// NOTE(review): named SPLIT_FEED but reads "learning.split-seed" -- almost
// certainly a typo for SPLIT_SEED (the randomSplit seed); rename when safe.
val SPLIT_FEED = configuration.envOrElseConfig("learning.split-seed").toLong
val MAX_ITERATIONS = configuration.envOrElseConfig("learning.max-iterations").toInt
// Builds the training/test datasets for the classifier. (Class body is
// truncated in this view; only the start of evalVector is visible.)
class MakeTrainingTestDatasets extends Logging{
// Inspects a feature vector element-by-element and decides whether it passes
// some per-index criterion; the match arms are truncated in this view, so the
// exact criterion per index is not visible -- TODO confirm against full source.
def evalVector(features: Array[Double]): Boolean ={
import util.control.Breaks._
// Assume acceptance until an element fails; break exits early on first failure.
var result: Boolean = true
breakable {
// zipWithIndex over a view avoids materialising an intermediate collection.
for((f,i) <- features.view.zipWithIndex){
// Each index corresponds to a named feature (index 0 = Title, per the
// comment below); remaining arms are outside this view.
i match {
// 0-Title
// Builds the candidate-pair dataset: every comparable (Person, Person) pair
// plus its feature vector, presumably via a self-join on blocking conditions
// such as the postcode one below. (Function body is truncated in this view.)
def createAllCombinationsDataset(personDFFromSource: Dataset[Person]): Dataset[(Person, Person, Vector)] ={
import spark.implicits._
import com.fts.cp.etl.model.Person
import org.apache.spark.ml.linalg.{Vector, Vectors}
// Join predicate: compare postcodes only when BOTH sides are non-null;
// when(...) without .otherwise yields null for rows where either side is
// null -- how that null is treated depends on the (truncated) surrounding
// join expression; confirm the intended null semantics.
val joinPostcodeCondition = when(
$"l.postcode".isNotNull && $"r.postcode".isNotNull,
$"l.postcode" === $"r.postcode"
)
// Progressive income-tax calculator. Each TaxForWindow row is read here as
// (lower bound, upper bound, rate %) -- TODO confirm against the TaxForWindow
// definition, which is outside this view. calculate's body is truncated.
object TaxCalculator {
// Brackets are contiguous: each window starts one unit above the previous
// window's upper bound; the last window is open-ended via Int.MaxValue.
private val taxWindows = Seq(
TaxForWindow(1, 5070, 10),
TaxForWindow(5071, 8660, 14),
TaxForWindow(8661, 14070, 23),
TaxForWindow(14071, 21240, 30),
TaxForWindow(21241, 40230, 33),
TaxForWindow(40231, Int.MaxValue, 45))
// Computes the tax due on the given salary (implementation truncated in this view).
def calculate(salary: Double): Double = {
// Demo: looks up colour names given as command-line arguments and prints the
// result of the lookup. (The match arms after Some(code) are truncated.)
object Maps {
// Colour name -> 24-bit RGB value.
val colors = Map("red" -> 0xFF0000,
"turquoise" -> 0x00FFFF,
"black" -> 0x000000,
"orange" -> 0xFF8040,
"brown" -> 0x804000)
// NOTE(review): procedure syntax (no ": Unit =") is deprecated in modern Scala;
// prefer `def main(args: Array[String]): Unit = {` when this file is touched.
def main(args: Array[String]) {
// println receives the value of the match expression for each argument.
for (name <- args) println(
colors.get(name) match {
case Some(code) =>
// Script fragment: trains a small logistic-regression model on an in-memory
// dataset. `sqlContext` and `data` are defined outside this view -- data is
// presumably a Seq of (label, feature-vector) rows, given the column names.
val df = sqlContext.createDataFrame(data).toDF("label", "features")
// Cap training at 10 iterations.
val lr = new LogisticRegression().setMaxIter(10)
val model = lr.fit(df)
// NOTE(review): `weights` was deprecated in Spark ML in favour of
// `coefficients` (Spark >= 1.6/2.x); and `sqlContext` itself is superseded by
// SparkSession -- update both when the target Spark version allows.
val weights = model.weights
// Appends prediction columns to df and prints the first rows to stdout.
model.transform(df).show()