Mostafa Majidpour (mostafam)
mostafam / scoreREPL.scala
Created August 13, 2020 08:58
Read the MLeap bundle and score the LeapFrame
scala> import ml.combust.bundle.BundleFile
scala> import ml.combust.mleap.runtime.MleapSupport._
scala> import resource._
scala> import ml.combust.mleap.core.types._
scala> import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
// creating a LeapFrame equivalent to the Spark DataFrame (see the stringCheckerInstance gist below)
scala> val schema = StructType(StructField("email", ScalarType.String),
| StructField("first_name", ScalarType.String)).get
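The preview stops at the schema. A minimal sketch of the remaining scoring steps, assuming the pipeline was exported to /tmp/string-checker-pipeline.zip (an illustrative path, not from the gist):

scala> val data = Seq(Row("john.doe@gmail.com", "John"))
scala> val leapFrame = DefaultLeapFrame(schema, data)
scala> // load the serialized bundle and pull out the root transformer
scala> val bundle = (for (bf <- managed(BundleFile("jar:file:/tmp/string-checker-pipeline.zip"))) yield {
     |   bf.loadMleapBundle().get
     | }).opt.get
scala> // score the LeapFrame with the deserialized pipeline
scala> val scored = bundle.root.transform(leapFrame).get
scala> scored.dataset.foreach(println)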
mostafam / createSparkPL.scala
Created August 13, 2020 08:56
Create a Spark PipelineModel and export it
scala> import org.apache.spark.ml.mleap.feature.StringChecker
scala> import ml.combust.mleap.core.feature.StringCheckerModel
scala> import org.apache.spark.ml.bundle.SparkBundleContext
scala> import ml.combust.bundle.BundleFile
scala> import ml.combust.mleap.spark.SparkSupport._
scala> import org.apache.spark.ml.{Pipeline, PipelineModel}
scala> import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler}
scala> import org.apache.spark.sql._
scala> import org.apache.spark.sql.functions._
scala> import resource._
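Only the imports survive the preview. A sketch of the fit-and-export steps these imports set up, assuming the stringChecker instance and df from the gist below, with an illustrative bundle path:

scala> val pipeline = new Pipeline().setStages(Array(stringChecker))
scala> val pipelineModel = pipeline.fit(df)
scala> // MLeap infers the bundle schema from a transformed dataset
scala> implicit val sbc = SparkBundleContext().withDataset(pipelineModel.transform(df))
scala> for (bf <- managed(BundleFile("jar:file:/tmp/string-checker-pipeline.zip"))) {
     |   pipelineModel.writeBundle.save(bf).get
     | }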
mostafam / StringCheckerOp.scala
Created August 13, 2020 08:46
StringCheckerOp.scala (Spark side)
package org.apache.spark.ml.bundle.extension.ops.feature
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.feature.StringCheckerModel
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.mleap.feature.StringChecker
/**
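The preview cuts off at the opening doc comment. Following MLeap's custom-transformer pattern, the Spark-side op pairs an OpModel (serializing the model's attributes) with the OpNode plumbing; a sketch, with the attribute name and shape wiring as assumptions:

class StringCheckerOp extends OpNode[SparkBundleContext, StringChecker, StringCheckerModel] {
  override val Model: OpModel[SparkBundleContext, StringCheckerModel] =
    new OpModel[SparkBundleContext, StringCheckerModel] {
      override val klazz: Class[StringCheckerModel] = classOf[StringCheckerModel]
      override def opName: String = "string_checker"

      // write the single model attribute into the bundle
      override def store(model: Model, obj: StringCheckerModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model =
        model.withValue("case_sensitive", Value.boolean(obj.caseSensitive))

      // rebuild the core model from the stored attribute
      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): StringCheckerModel =
        StringCheckerModel(model.value("case_sensitive").getBoolean)
    }

  override val klazz: Class[StringChecker] = classOf[StringChecker]
  override def name(node: StringChecker): String = node.uid
  override def model(node: StringChecker): StringCheckerModel = node.model

  override def load(node: Node, model: StringCheckerModel)
                   (implicit context: BundleContext[SparkBundleContext]): StringChecker =
    new StringChecker(uid = node.name, model = model)
      .setInputCols(node.shape.input("input0").name, node.shape.input("input1").name)
      .setOutputCol(node.shape.standardOutput.name)

  override def shape(node: StringChecker)
                    (implicit context: BundleContext[SparkBundleContext]): NodeShape =
    NodeShape()
      .withInput("input0", node.getInputCols(0))
      .withInput("input1", node.getInputCols(1))
      .withStandardOutput(node.getOutputCol)
}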
mostafam / StringCheckerOp.scala
Last active August 13, 2020 08:41
StringCheckerOp.scala (MLeap side)
package ml.combust.mleap.bundle.ops.feature
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.StringCheckerModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.StringChecker
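The MLeap-side op mirrors the Spark one but extends MleapOp; a sketch following the same pattern (attribute name assumed to match the Spark side):

class StringCheckerOp extends MleapOp[StringChecker, StringCheckerModel] {
  override val Model: OpModel[MleapContext, StringCheckerModel] =
    new OpModel[MleapContext, StringCheckerModel] {
      override val klazz: Class[StringCheckerModel] = classOf[StringCheckerModel]
      override def opName: String = "string_checker"

      override def store(model: Model, obj: StringCheckerModel)
                        (implicit context: BundleContext[MleapContext]): Model =
        model.withValue("case_sensitive", Value.boolean(obj.caseSensitive))

      override def load(model: Model)
                       (implicit context: BundleContext[MleapContext]): StringCheckerModel =
        StringCheckerModel(model.value("case_sensitive").getBoolean)
    }

  // MleapOp derives the runtime wiring; only the model accessor is needed
  override def model(node: StringChecker): StringCheckerModel = node.model
}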
mostafam / StringChecker.scala
Created August 13, 2020 08:34
StringChecker.scala (Spark side)
package org.apache.spark.ml.mleap.feature
import ml.combust.mleap.core.feature.StringCheckerModel
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
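The preview ends at the imports. A sketch of how the Spark Transformer plausibly continues, wrapping the core model in a UDF over the two input columns (extra imports shown inline; the MLWritable support hinted at by the org.apache.spark.ml.util._ and Path imports is omitted, and model.apply is assumed to return a Boolean as sketched below):

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{BooleanType, StructField, StructType}

class StringChecker(override val uid: String, val model: StringCheckerModel)
  extends Transformer with HasInputCols with HasOutputCol {

  def setInputCols(values: String*): this.type = set(inputCols, values.toArray)
  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    // delegate the actual check to the shared core model
    val checkUdf = udf { (text: String, query: String) => model(text, query) }
    dataset.withColumn($(outputCol),
      checkUdf(col($(inputCols)(0)), col($(inputCols)(1))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField($(outputCol), BooleanType))
}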
mostafam / StringChecker.scala
Created August 13, 2020 08:31
StringChecker.scala (MLeap side)
package ml.combust.mleap.runtime.transformer.feature
import ml.combust.mleap.core.feature.StringCheckerModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, Transformer}
import ml.combust.mleap.runtime.function.{StructSelector, UserDefinedFunction}
import scala.util.Try
package ml.combust.mleap.core.feature
import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{ScalarType, StructField, StructType}
/**
* Created by mostafam on 6/30/20.
*/
case class StringCheckerModel(caseSensitive: Boolean) extends Model {
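Both files are truncated. The case class signature plus the instance example below suggest an optionally case-sensitive substring check; a sketch of the core model's completion under that assumption, with inputSchema/outputSchema following MLeap's Model contract:

case class StringCheckerModel(caseSensitive: Boolean) extends Model {
  // assumed semantics: does `query` occur in `text`?
  def apply(text: String, query: String): Boolean =
    if (caseSensitive) text.contains(query)
    else text.toLowerCase.contains(query.toLowerCase)

  override def inputSchema: StructType = StructType(
    StructField("text", ScalarType.String),
    StructField("query", ScalarType.String)).get

  override def outputSchema: StructType =
    StructType(StructField("output", ScalarType.Boolean)).get
}

The runtime StringChecker (imports above) would then wire this model into a UserDefinedFunction over the two input fields, selected with a StructSelector, inside its transform method.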
mostafam / stringCheckerInstance.scala
Last active August 13, 2020 08:08
stringChecker Instance
scala> val stringChecker = new StringChecker(uid = "string_checker", model = new StringCheckerModel(caseSensitive = false)).
     |   setInputCols("text", "query").
     |   setOutputCol("is_it_there?")
scala> val df = spark.createDataFrame(
     |   Seq((0, "john.doe@gmail.com", "John"), (1, "JackieChan234@xyz.com", "jack"), (2, "ping_pong@missed.org", "Al"))
     | ).toDF("id", "email", "first_name")
scala> df.show(false)
+---+---------------------+----------+
|id |email                |first_name|
+---+---------------------+----------+
|0  |john.doe@gmail.com   |John      |
|1  |JackieChan234@xyz.com|jack      |
|2  |ping_pong@missed.org |Al        |
+---+---------------------+----------+
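Applying the transformer appends the boolean result; a sketch of the expected output, assuming the case-insensitive contains semantics above and renaming the columns to the configured inputCols:

scala> val renamed = df.withColumnRenamed("email", "text").withColumnRenamed("first_name", "query")
scala> stringChecker.transform(renamed).show(false)
+---+---------------------+-----+------------+
|id |text                 |query|is_it_there?|
+---+---------------------+-----+------------+
|0  |john.doe@gmail.com   |John |true        |
|1  |JackieChan234@xyz.com|jack |true        |
|2  |ping_pong@missed.org |Al   |false       |
+---+---------------------+-----+------------+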
mostafam / create_sample_df.py
Last active June 3, 2020 01:28
Create a sample Spark DataFrame
sample_df = spark.createDataFrame([(1, 33.704045, -157.754334),
                                   (2, 45.019704, -118.014528),
                                   (3, 21.306754, -119.159649)],
                                  ("id", "latitude", "longitude"))
# +---+---------+-----------+
# | id| latitude|  longitude|
# +---+---------+-----------+
# |  1|33.704045|-157.754334|
# |  2|45.019704|-118.014528|
# |  3|21.306754|-119.159649|
# +---+---------+-----------+