terrisgit/kmeans.scala Secret

## kmeans.scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{Pipeline}
import org.apache.spark.sql.functions.col
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.sql.types._
// Layout of the headerless CSV input, one record per line:
//   age    - int in the file, read here as Double
//   gender - 'M' or 'F'
//   days   - days since prior purchase (int in the file, read as Double)
//   month  - 3-character month abbreviation
//   amount - float in the file, read as Double
val schema = StructType(Array(
  StructField("age", DoubleType, nullable = true),
  StructField("gender", StringType, nullable = true),
  StructField("days", DoubleType, nullable = true),
  StructField("month", StringType, nullable = true),
  StructField("amount", DoubleType, nullable = true)))

// Read the input file. Column types come from the explicit schema above, so
// no "inferSchema" option is set: it has no effect once a schema is supplied.
val df = spark.read.format("csv")
  .option("header", "false")
  .schema(schema)
  .load("/home/jovyan/work/data.csv")

// The two string columns are indexed and then one-hot encoded so that they
// can be assembled into the feature vector alongside the numeric columns.
val gindexer = new StringIndexer()
  .setInputCol("gender")
  .setOutputCol("genderIndex")
val gencoder = new OneHotEncoder()
  .setInputCol("genderIndex")
  .setOutputCol("genderVec")
val mindexer = new StringIndexer()
  .setInputCol("month")
  .setOutputCol("monthIndex")
val mencoder = new OneHotEncoder()
  .setInputCol("monthIndex")
  .setOutputCol("monthVec")

// Specify the fields used for clustering; they are combined into a single
// vector column named "features".
val assembler = new VectorAssembler()
  .setInputCols(Array("age", "genderVec", "days", "monthVec", "amount"))
  .setOutputCol("features")

// k-means model with two clusters; the seed is fixed at 1L.
val kmeans = new KMeans().setK(2).setSeed(1L)

// Create a pipeline. Stage order matters: each indexer must run before its
// encoder, and both encoders before the assembler that consumes their output.
val pipeline = new Pipeline()
  .setStages(Array(gindexer, gencoder, mindexer, mencoder, assembler, kmeans))

// Run the pipeline: fits every stage, ending with k-means, on the input data.
val kMeansPredictionModel = pipeline.fit(df)

// Create a dataframe with the transformed input plus a
// field named 'prediction' containing the cluster number
val predictionResult = kMeansPredictionModel.transform(df)
	import org.apache.spark.ml.clustering.KMeans
	import org.apache.spark.ml.evaluation.ClusteringEvaluator
	import org.apache.spark.ml.feature.VectorAssembler
	import org.apache.spark.ml.linalg.Vectors
	import org.apache.spark.ml.{Pipeline}
	import org.apache.spark.sql.functions.col
	import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
	import org.apache.spark.sql.types._
	// Layout of the headerless CSV input, one record per line:
	//   age    - int in the file, read here as Double
	//   gender - 'M' or 'F'
	//   days   - days since prior purchase (int in the file, read as Double)
	//   month  - 3-character month abbreviation
	//   amount - float in the file, read as Double
	val schema = StructType(Array(
	  StructField("age", DoubleType, nullable = true),
	  StructField("gender", StringType, nullable = true),
	  StructField("days", DoubleType, nullable = true),
	  StructField("month", StringType, nullable = true),
	  StructField("amount", DoubleType, nullable = true)))

	// Read the input file. Column types come from the explicit schema above, so
	// no "inferSchema" option is set: it has no effect once a schema is supplied.
	val df = spark.read.format("csv")
	  .option("header", "false")
	  .schema(schema)
	  .load("/home/jovyan/work/data.csv")

	// The two string columns are indexed and then one-hot encoded so that they
	// can be assembled into the feature vector alongside the numeric columns.
	val gindexer = new StringIndexer()
	  .setInputCol("gender")
	  .setOutputCol("genderIndex")
	val gencoder = new OneHotEncoder()
	  .setInputCol("genderIndex")
	  .setOutputCol("genderVec")
	val mindexer = new StringIndexer()
	  .setInputCol("month")
	  .setOutputCol("monthIndex")
	val mencoder = new OneHotEncoder()
	  .setInputCol("monthIndex")
	  .setOutputCol("monthVec")

	// Specify the fields used for clustering; they are combined into a single
	// vector column named "features".
	val assembler = new VectorAssembler()
	  .setInputCols(Array("age", "genderVec", "days", "monthVec", "amount"))
	  .setOutputCol("features")

	// k-means model with two clusters; the seed is fixed at 1L.
	val kmeans = new KMeans().setK(2).setSeed(1L)

	// Create a pipeline. Stage order matters: each indexer must run before its
	// encoder, and both encoders before the assembler that consumes their output.
	val pipeline = new Pipeline()
	  .setStages(Array(gindexer, gencoder, mindexer, mencoder, assembler, kmeans))

	// Run the pipeline: fits every stage, ending with k-means, on the input data.
	val kMeansPredictionModel = pipeline.fit(df)

	// Create a dataframe with the transformed input plus a
	// field named 'prediction' containing the cluster number
	val predictionResult = kMeansPredictionModel.transform(df)