// Toy XGBoost4J-Spark binary-classification example, intended to be pasted into
// spark-shell, where `spark`, `sc` and spark.implicits._ (needed for toDF and
// the $"..." column syntax) are already in scope.
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.VectorAssembler
import scala.util.Random
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostRegressor}
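// The SparkConf / KryoSerializer / SparkSession imports above are not required
// inside spark-shell; they suggest a standalone run. A minimal sketch of how
// that setup would typically look (an assumption, not part of the original
// gist; in spark-shell this block is redundant since `spark` and `sc` exist):
val conf = new SparkConf()
  .setAppName("xgboost4j-spark-demo")
  .set("spark.serializer", classOf[KryoSerializer].getName)
val spark = SparkSession.builder().config(conf).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext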
// Build a toy dataset: n uniform samples in [0, 1), labelled 1 when x > 0.5.
val n = 10000
val data = (1 to n).map(_ => Random.nextFloat())
val df = data.toDF("x").withColumn("label", ($"x" > 0.5).cast("int"))

// XGBoost4J-Spark expects the features packed into a single vector column.
val assembler = new VectorAssembler().setInputCols(Array("x")).setOutputCol("features")
val assembled = assembler.transform(df).drop("x")

// Hold out a validation set and cache both splits.
val Array(trainDF, valDF) = assembled.randomSplit(Array(0.8, 0.2), seed = 42)
val trainSet = trainDF.cache()
val valSet = valDF.cache()

val numIterations = 100
// One XGBoost worker per executor, excluding the driver; at least one in local mode.
val nbExecutors = math.max(sc.statusTracker.getExecutorInfos.length - 1, 1)
// Binary-classification parameters (logistic objective, classification-error metric).
val paramMap = Map("eta" -> 0.05f, "max_depth" -> 10,
  "objective" -> "binary:logistic", "eval_metric" -> "error", "nthread" -> 3)
// Train a distributed XGBoost classifier, one worker per executor.
val model = new XGBoostClassifier(paramMap)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setNumRound(numIterations)
  .setNumWorkers(nbExecutors)
  .fit(trainSet)
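// The cached validation set is not used during training above. As a sketch of
// how it could be scored (an addition, not part of the original gist; the
// accuracy metric is an arbitrary choice), Spark ML's standard evaluator can
// be applied to the model's predictions:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val predictions = model.transform(valSet)
val accuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(predictions)
println(s"Validation accuracy: $accuracy")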