Skip to content

Instantly share code, notes, and snippets.

@piyo7
Last active August 29, 2015 14:07
Show Gist options
  • Save piyo7/8014c909888e2e912365 to your computer and use it in GitHub Desktop.
Save piyo7/8014c909888e2e912365 to your computer and use it in GitHub Desktop.
Spark / MLlib の K-means を Scala から利用してみる ref: http://qiita.com/piyo7/items/77cc4350bfeab75a29e7
sbt.version=0.13.6
// Build definition for the KMeansIris demo project.
// Each setting is separated by a blank line: sbt releases before 0.13.7
// require blank lines between expressions in .sbt files.
name := "KMeansIris"

version := "1.0"

scalaVersion := "2.10.4"

// `%%` appends the Scala binary version to the artifact name, so the Spark
// artifacts resolved here follow `scalaVersion` above.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.1.0",
  "org.apache.spark" %% "spark-mllib" % "1.1.0"
)
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
/**
 * Clusters the rows of `src/main/resources/iris.data` with MLlib's K-means
 * and prints the cluster centers followed by the cluster assigned to each row.
 *
 * Each non-empty input line is split on commas; the last field is kept as a
 * label and all preceding fields are parsed as `Double` features.
 *
 * An explicit `main` method is used instead of `extends App`: Spark's
 * documentation warns that subclasses of `scala.App` may not work correctly.
 */
object KMeansIris {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val context = new SparkContext("local", "demo")
    try {
      val data = context.
        textFile("src/main/resources/iris.data").
        filter(_.nonEmpty).
        map { line =>
          val elems = line.split(",")
          (elems.last, Vectors.dense(elems.init.map(_.toDouble)))
        }.
        // The RDD is traversed by training and again when printing the
        // results; caching avoids re-reading and re-parsing the file.
        cache()

      val k = 3 // number of clusters
      val maxIterations = 100 // upper bound on K-means iterations
      val clusters = KMeans.train(data.map(_._2), k, maxIterations)

      // Show the center of each cluster.
      println("## クラスタの中心")
      clusters.clusterCenters.foreach { center =>
        println(center.toArray.mkString("[", ", ", "]"))
      }

      // Show which cluster each data point was assigned to.
      // The rows are collected so that printing and prediction happen on the
      // driver rather than inside an executor closure.
      // NOTE(review): assumes the data set fits in driver memory — confirm
      // before pointing this at a larger file.
      println("## 各データのクラスタリング結果")
      data.collect().foreach { case (label, features) =>
        val rendered = features.toArray.mkString("[", ", ", "]")
        println(s"$rendered ($label) : cluster = ${clusters.predict(features)}")
      }
    } finally {
      // Always release the Spark context, even if loading or training fails.
      context.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment