CosineSimilarity DIMSUM Example
/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package org.apache.spark.examples.mllib | |
import scopt.OptionParser | |
import org.apache.spark.SparkContext._ | |
import org.apache.spark.mllib.linalg.Vectors | |
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} | |
import org.apache.spark.{SparkConf, SparkContext} | |
/**
 * Compute the similar columns of a matrix, using cosine similarity.
 *
 * The input matrix must be stored in row-oriented dense format, one line per row with its entries
 * separated by spaces. For example,
 * {{{
 * 0.5 1.0
 * 2.0 3.0
 * 4.0 5.0
 * }}}
 * represents a 3-by-2 matrix, whose first row is (0.5, 1.0).
 *
 * Example invocation:
 * {{{
 * bin/run-example mllib.CosineSimilarity \
 *   --threshold 0.1 data/mllib/sample_svm_data.txt
 * }}}
 */
object CosineSimilarity {

  /**
   * Command-line parameters of the example.
   *
   * @param inputFile path of the text file holding the matrix, one row per line
   * @param threshold similarity threshold handed to `RowMatrix.columnSimilarities`
   */
  case class Params(inputFile: String = null, threshold: Double = 0.1)

  /**
   * Parses the command line and, when it is valid, runs the example.
   * Exits the JVM with status 1 when the arguments cannot be parsed.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
      opt[Double]("threshold")
        .required()
        .text("threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
      arg[String]("<inputFile>")
        .required()
        .text("input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      note(
        """
          |For example, the following command runs this app on a dataset:
          |
          | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt
          |""".stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case None => System.exit(1)
    }
  }

  /**
   * Loads the matrix from `params.inputFile`, computes the column similarities both exactly
   * and with the thresholded estimate, and prints the mean absolute error of the estimate.
   *
   * The SparkContext created here is always stopped, even when a job fails.
   *
   * @param params parsed command-line parameters
   */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    try {
      // Load and parse the data file. Lines are trimmed and split on any run of whitespace,
      // and blank lines are skipped, so stray spaces or empty lines do not abort the job
      // with a NumberFormatException.
      val rows = sc.textFile(params.inputFile)
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(line => Vectors.dense(line.split("\\s+").map(_.toDouble)))
        .cache()
      val mat = new RowMatrix(rows)

      // Compute similar columns perfectly, with brute force.
      val exact = mat.columnSimilarities()

      // Compute similar columns with estimation using DIMSUM.
      val approx = mat.columnSimilarities(params.threshold)

      val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) }
      val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) }

      // Left outer join keeps every exact entry; an entry missing from the estimate
      // contributes its full magnitude to the error.
      val meanAbsoluteError = exactEntries.leftOuterJoin(approxEntries).values.map {
        case (u, Some(v)) => math.abs(u - v)
        case (u, None) => math.abs(u)
      }.mean()

      println(s"Average absolute error in estimate is: $meanAbsoluteError")
    } finally {
      sc.stop()
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment