Skip to content

Instantly share code, notes, and snippets.

Jesus de Diego jesusjavierdediego

Block or report user

Report or block jesusjavierdediego

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View mergePurgeAndMakeListWithoutDup.scala
// Deduplicates a person dataset: rows flagged with more than one duplicate are
// merged via mergeDuplicatesForAGivenPerson; rows with exactly one occurrence
// pass through unmerged. Returns the unified list as a DataFrame.
// NOTE(review): gist preview is truncated here — the remainder of the body
// (including how `noDuplicates` and `unifiedListOfPersons` are combined and
// returned) is not visible in this capture.
def mergePurgeAndMakeListWithoutDuplictes(listOfFoundDuplicates: DataFrame, personList: Dataset[Person]): DataFrame ={
import spark.implicits._
import scala.collection.mutable.ArrayBuffer
// Kryo encoder so arbitrary (non-Product) values can travel through Dataset ops.
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any]
// Driver-side accumulator of merged Person records.
var unifiedListOfPersons = ArrayBuffer.empty[Person]
// "num" counts how many duplicates were found for each id.
val multipleDuplicates = listOfFoundDuplicates.filter($"num" > 1)
val noDuplicates = listOfFoundDuplicates.filter($"num" === 1)
// NOTE(review): appending to the driver-local ArrayBuffer from inside a
// distributed `.map` mutates executor-side copies of the closure, so the
// driver's buffer would likely remain empty — confirm against the full gist
// (a `collect` + local loop, or an aggregation, would be the usual fix).
multipleDuplicates.map(row => {
try{
unifiedListOfPersons += mergeDuplicatesForAGivenPerson(row.getAs[String]("id"), row.getAs[Set[String]]("duplicates"), personList)
View ModelComposer.scala
// Builds the per-entity DataFrames that make up the output model.
// NOTE(review): gist preview is truncated — only the start of getAllDFs is
// visible; the construction and return of `result` are not shown here.
class ModelComposer extends Logging{
// Marker recorded with composed rows; presumably distinguishes create vs.
// update flows — confirm against callers.
val operationType = "create"
@throws(classOf[ModelComposerException])
def getAllDFs(persons: DataFrame, sourceData: DataFrame): Map[String, DataFrame] = {
import spark.implicits._
import com.datastax.driver.core.utils.UUIDs
// Kryo encoder so arbitrary (non-Product) values can travel through Dataset ops.
implicit val anyEncoder = org.apache.spark.sql.Encoders.kryo[Any]
// Mutable map accumulating one DataFrame per logical table name.
var result: scala.collection.mutable.Map[String, DataFrame] = scala.collection.mutable.Map.empty[String, DataFrame]
View applyModelToAllCombinations.scala
// Scores every candidate (left person, right person, feature vector) pair with
// a trained logistic-regression model and projects the result down to
// (id_left, id_right, features, probability) rows.
// NOTE(review): gist preview is truncated — the tail of the select list and
// the conversion to Dataset[PredictedVector] are not visible here.
def applyModelToAllCombinations(trainedModel: LogisticRegressionModel, allComparableDataset: Dataset[(Person, Person, Vector)]): Dataset[PredictedVector] ={
import spark.implicits._
// Extracts element 1 of the probability vector — presumably the
// positive-class ("is a match") probability of the binary classifier;
// the name `getFirst` is misleading since index 1 is the second component.
val getFirst = udf((v: Vector) => v(1))
val predictionsRaw: DataFrame = trainedModel.transform(allComparableDataset)
predictionsRaw.select(
$"left.old_id".as("id_left"),
$"right.old_id".as("id_right"),
$"features",
getFirst($"probability").as("probability"),
View ModelTrainer.scala
// Trains the duplicate-detection logistic-regression model and reports
// evaluation metrics.
// NOTE(review): gist preview is truncated — the split, fit, and metric
// computation that produce the (model, metrics) result are not visible here.
class ModelTrainer extends Logging{
def train(trainingDF: DataFrame): (LogisticRegressionModel, Map[String, Double]) ={
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import spark.implicits._
// Train/test split ratio from configuration.
val SPLIT_FACTOR = configuration.envOrElseConfig("learning.split-factor").toDouble
// Random seed for the split (NOTE(review): `SPLIT_FEED` looks like a typo
// for SPLIT_SEED — the config key is "learning.split-seed").
val SPLIT_FEED = configuration.envOrElseConfig("learning.split-seed").toLong
val MAX_ITERATIONS = configuration.envOrElseConfig("learning.max-iterations").toInt
View MakeTrainingTestDatasets.scala
// Builds the training/test datasets for the matching model.
// NOTE(review): gist preview is truncated — only the head of evalVector's
// per-index match is visible; the match arms and final result are cut off.
class MakeTrainingTestDatasets extends Logging{
// Validates a feature vector element-by-element; each index corresponds to a
// fixed feature position (index 0 is the title similarity, per the comment
// below — the remaining positions are not visible in this capture).
def evalVector(features: Array[Double]): Boolean ={
import util.control.Breaks._
// Assume valid until some element fails its positional check.
var result: Boolean = true
breakable {
for((f,i) <- features.view.zipWithIndex){
i match {
// 0-Title
View createAllCombinationsDataset.scala
// Produces every comparable (left person, right person, feature vector)
// candidate pair from the source persons, joined under blocking conditions
// such as matching postcode.
// NOTE(review): gist preview is truncated — the join itself and the feature
// vector construction are not visible here.
def createAllCombinationsDataset(personDFFromSource: Dataset[Person]): Dataset[(Person, Person, Vector)] ={
import spark.implicits._
import com.fts.cp.etl.model.Person
import org.apache.spark.ml.linalg.{Vector, Vectors}
// Only compare postcodes when both sides have one.
// NOTE(review): `when` without `.otherwise` yields null when either postcode
// is null; in a join condition null behaves as "no match" — confirm that
// dropping null-postcode pairs is the intended blocking behavior.
val joinPostcodeCondition = when(
$"l.postcode".isNotNull && $"r.postcode".isNotNull,
$"l.postcode" === $"r.postcode"
)
View TaxCalculator.scala
// Progressive income-tax calculator: each window taxes the slice of salary
// falling inside [from, to] at its own percentage rate.
// NOTE(review): gist preview is truncated — the body of `calculate` is not
// visible in this capture.
object TaxCalculator {
// Contiguous integer brackets (from, to, ratePercent); the last bracket is
// open-ended via Int.MaxValue.
private val taxWindows = Seq(
TaxForWindow(1, 5070, 10),
TaxForWindow(5071, 8660, 14),
TaxForWindow(8661, 14070, 23),
TaxForWindow(14071, 21240, 30),
TaxForWindow(21241, 40230, 33),
TaxForWindow(40231, Int.MaxValue, 45))
def calculate(salary: Double): Double = {
View map.scala
// Looks up each command-line argument in a name -> RGB-code map and prints
// the result.
// NOTE(review): gist preview is truncated — the Some/None match arms' bodies
// are not visible in this capture.
object Maps {
// Color names mapped to 24-bit RGB values.
val colors = Map("red" -> 0xFF0000,
"turquoise" -> 0x00FFFF,
"black" -> 0x000000,
"orange" -> 0xFF8040,
"brown" -> 0x804000)
def main(args: Array[String]) {
for (name <- args) println(
colors.get(name) match {
case Some(code) =>
View spark3.scala
// Fit a logistic regression (capped at 10 iterations) on (label, features)
// rows and display the model's predictions over the training data.
val trainingDF = sqlContext.createDataFrame(data).toDF("label", "features")
val model = new LogisticRegression()
  .setMaxIter(10)
  .fit(trainingDF)
// Learned weight vector. NOTE(review): `weights` is the pre-Spark-1.6 name;
// later Spark ML versions expose it as `coefficients`.
val weights = model.weights
model.transform(trainingDF).show()
View spark2.scala
// Load the `politicians` table from a remote MySQL server via the Spark JDBC
// data source and print its schema.
// FIX: MySQL Connector/J separates JDBC URL parameters with '&', not ';'.
// With "?user=USERNAME;password=PASSWORD" the driver parses the whole string
// as one malformed property, so the password is never applied and the
// connection fails to authenticate.
// NOTE(review): credentials embedded in the URL end up in logs and plan
// dumps; prefer .option("user", ...)/.option("password", ...) fed from a
// secrets store.
val url = "jdbc:mysql://125.25.25.63:3306/test?user=USERNAME&password=PASSWORD"
val df = sqlContext
  .read
  .format("jdbc")
  .option("url", url)
  .option("dbtable", "politicians")
  .load()
df.printSchema()
You can’t perform that action at this time.