Skip to content

Instantly share code, notes, and snippets.

@rezazadeh
Last active November 1, 2020 10:11
Show Gist options
  • Save rezazadeh/5a3bb88d9fdf423dd861 to your computer and use it in GitHub Desktop.
Save rezazadeh/5a3bb88d9fdf423dd861 to your computer and use it in GitHub Desktop.
CosineSimilarity DIMSUM Example
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib
import scopt.OptionParser
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Compute the similar columns of a matrix, using cosine similarity.
*
* The input matrix must be stored in row-oriented dense format, one line per row, with the
* entries of each row separated by spaces. For example,
* {{{
* 0.5 1.0
* 2.0 3.0
* 4.0 5.0
* }}}
* represents a 3-by-2 matrix, whose first row is (0.5, 1.0).
*
* Example invocation:
*
* bin/run-example mllib.CosineSimilarity \
* --threshold 0.1 data/mllib/sample_svm_data.txt
*/
object CosineSimilarity {

  /**
   * Command-line parameters of the example.
   *
   * @param inputFile path of the text file holding the matrix; stays `null` only until the
   *                  required `<inputFile>` argument has been parsed
   * @param threshold similarity threshold handed to `columnSimilarities` for the approximate run
   */
  case class Params(inputFile: String = null, threshold: Double = 0.1)

  /**
   * Parses the command line and runs the example.
   *
   * Terminates the JVM with exit status 1 when the parser yields no parameters.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
      opt[Double]("threshold")
        .required()
        .text("threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
      arg[String]("<inputFile>")
        .required()
        .text("input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // The closing margin line keeps the stripped text ending in a single newline.
      note(
        """
          |For example, the following command runs this app on a dataset:
          |
          | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt
          |""".stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case None => sys.exit(1)
    }
  }

  /**
   * Loads the matrix, computes column similarities both exactly and with the thresholded
   * estimate, and prints the mean absolute error between the two results.
   *
   * The SparkContext created here is always stopped, even when the job fails.
   *
   * @param params input path and similarity threshold
   */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    try {
      // Load and parse the data file. Lines are trimmed, blank lines are skipped and entries
      // may be separated by any run of whitespace, so stray spaces or a trailing empty line
      // do not abort the job with a NumberFormatException.
      val rows = sc.textFile(params.inputFile)
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(line => Vectors.dense(line.split("\\s+").map(_.toDouble)))
        .cache()
      val mat = new RowMatrix(rows)

      // Compute similar columns perfectly, with brute force.
      val exact = mat.columnSimilarities()

      // Compute similar columns with estimation using DIMSUM
      val approx = mat.columnSimilarities(params.threshold)

      val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) }
      val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) }

      // Left outer join: every exact entry contributes to the error. An entry that is absent
      // from the approximation is compared against 0, i.e. contributes |u|.
      val meanAbsError = exactEntries.leftOuterJoin(approxEntries).values.map {
        case (u, Some(v)) => math.abs(u - v)
        case (u, None) => math.abs(u)
      }.mean()

      println(s"Average absolute error in estimate is: $meanAbsError")
    } finally {
      // Release the context on both the success and the failure path.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment