Hao Ren (invkrh)

@invkrh
invkrh / gist:05a83be081c1f713e15b
Last active November 8, 2016 14:50
MLlib ALS evaluation using Mean Average Precision (MAP) and Expected Percentile Rank (EPR)
def meanAveragedPrecision(k: Int, model: MatrixFactorizationModel, ratings: RDD[Rating]): Double = {
  def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
    val predK = predicted.take(k)
    var score = 0.0
    var numHits = 0.0
    for ((p, i) <- predK.zipWithIndex) {
      // Count an item only at its first occurrence within the top k.
      if (actual.contains(p) && !predK.take(i).contains(p)) {
        numHits += 1.0
        score += numHits / (i.toDouble + 1.0)
      }
    }
    // Standard AP@k normalization, reconstructed from the truncated preview:
    // divide by min(|actual|, k); empty ground truth scores 0.
    if (actual.isEmpty) 0.0 else score / math.min(actual.size, k).toDouble
  }
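A quick sanity check of avgPrecisionK on literal data, computed by hand; it assumes the min(|actual|, k) normalization reconstructed above and that the helper is hoisted to top level for testing:

// actual = Seq(1, 3), predicted = Seq(1, 2, 3), k = 3
// hits at ranks 1 and 3 contribute precisions 1/1 and 2/3
// AP@3 = (1.0 + 2.0 / 3.0) / min(2, 3) = 5.0 / 6.0
assert(math.abs(avgPrecisionK(Seq(1, 3), Seq(1, 2, 3), 3) - 5.0 / 6.0) < 1e-9)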
@invkrh
invkrh / GroupByTest.scala
Last active August 29, 2015 14:06
groupBy problem
import context.RecoSysContext._
import org.apache.spark.HashPartitioner
import org.apache.spark.SparkContext._

object GroupByTest extends App {
  case class purchaseLog(client_id: String, ticket_id: String, store: String)

  // The input directory was generated by RDD.saveAsTextFile with 27 partitions.
  val input = sc.textFile("/home/spark/workspace/data/pruchaseLog")
    .map(_.split(";", -1))
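The preview stops at the split; a minimal sketch of how the grouping step might continue (the field order and the reuse of 27 partitions are assumptions, not the gist's code):

  val purchases = input.map(f => purchaseLog(f(0), f(1), f(2)))
  // Group ticket ids by client with an explicit partitioner so the
  // shuffle layout is deterministic and easy to inspect.
  val ticketsByClient = purchases
    .map(p => (p.client_id, p.ticket_id))
    .groupByKey(new HashPartitioner(27))
  ticketsByClient.take(5).foreach(println)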
@invkrh
invkrh / ndcg
Created January 26, 2015 17:01
Correction of NDCG (normalized discounted cumulative gain) computation
def ndcgAt(k: Int): Double = {
  require(k > 0, "ranking position k should be positive")
  predictionAndLabels.map { case (pred, lab) =>
    val labSet = lab.toSet
    if (labSet.nonEmpty) {
      val labSetSize = labSet.size
      val predSize = pred.size
      val n = math.min(math.max(pred.length, labSetSize), k)
      var maxDcg = 0.0
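The preview ends at maxDcg; a hedged sketch of how a binary-relevance DCG/IDCG loop typically continues (modeled on Spark's RankingMetrics.ndcgAt, so an assumption about this correction's final form, not its verbatim code):

      var dcg = 0.0
      var i = 0
      while (i < n) {
        // Discount rank i by log(i + 2); the log base cancels in dcg / maxDcg.
        val gain = 1.0 / math.log(i + 2)
        if (i < pred.length && labSet.contains(pred(i))) dcg += gain
        if (i < labSetSize) maxDcg += gain
        i += 1
      }
      dcg / maxDcg
    } else 0.0
  }.mean()
}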
@invkrh
invkrh / mockInput.scala
Last active November 24, 2017 10:15
Mocking standard input for tests
object test extends App {
  import scala.collection.JavaConversions.asJavaEnumeration
  import java.io.{ByteArrayInputStream, SequenceInputStream}

  // One in-memory stream per mocked line, concatenated in order.
  val inputs = Iterator("asdf", "qewr", "wert")
    .map(x => new ByteArrayInputStream((x + "\n").getBytes))
  val in = new SequenceInputStream(asJavaEnumeration(inputs))

  // Redirect stdin so readLine() reads the mocked lines.
  Console.setIn(in)
  val a = readLine()
  val b = readLine()
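With stdin redirected, readLine() consumes the mocked lines in order; a minimal check (an addition, not in the preview):

  assert(a == "asdf" && b == "qewr")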
@invkrh
invkrh / cc.scala
Created July 22, 2015 22:51
Connected Component Problem
package me.invkrh.train

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
 * Created with IntelliJ IDEA.
 * User: invkrh
 * Date: 22/07/15
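The preview stops inside the file header. A minimal sketch of one way to compute connected components over a plain edge RDD (iterated minimum-label propagation); this is an assumption about the problem setup, not the gist's algorithm:

def connectedComponents(edges: RDD[(Long, Long)]): RDD[(Long, Long)] = {
  // Treat the graph as undirected by emitting both edge directions.
  val undirected = edges.flatMap { case (a, b) => Seq((a, b), (b, a)) }.cache()
  // Every vertex starts labeled with its own id.
  var labels = undirected.keys.distinct().map(v => (v, v))
  var changed = true
  while (changed) {
    // Each vertex offers its label to its neighbors; keep the minimum seen.
    val offered = undirected.join(labels).map { case (_, (dst, lbl)) => (dst, lbl) }
    val updated = labels.union(offered).reduceByKey(math.min)
    changed = updated.join(labels).filter { case (_, (n, o)) => n < o }.count() > 0
    labels = updated
  }
  labels // (vertexId, smallest vertex id in its component)
}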
@invkrh
invkrh / Lambda.java
Last active August 29, 2015 14:26
java lambda expression and method reference
rdd.map(x -> b).foreach(x -> System.out.println(x)); // works
rdd.map(x -> b).foreach(System.out::println);        // serialization problem
/*
 * A lambda that calls System.out.println(x) looks up System.out at each
 * invocation, so nothing non-serializable is captured. The method reference
 * System.out::println is bound eagerly to the PrintStream instance, which is
 * not serializable, so Spark fails to serialize the closure.
 * Note: IDEA proposes replacing the lambda with a method reference,
 * which breaks the code.
 */
@invkrh
invkrh / tailrec.scala
Created September 2, 2015 20:10
Show that a recursive method needs to be final or private (so it cannot be overridden)
class C {
  def fact(n: Int, result: Int): Int = {
    println("super")
    if (n == 0) result else fact(n - 1, n * result)
  }
}

class C2 extends C {
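  // A hedged completion (the preview truncates here): the subclass overrides
  // fact, so the superclass's self-call dispatches to this body at runtime.
  override def fact(n: Int, result: Int): Int = {
    println("sub")
    super.fact(n, result)
  }
}

Because fact is neither final nor private, the override turns the recursive call into a polymorphic call, so scalac rejects a @tailrec annotation on it with roughly: "could not optimize @tailrec annotated method fact: it is neither private nor final so can be overridden". Marking fact final or private (or defining it in an object) lets the compiler rewrite the recursion as a loop.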
scala> List(1,2,3,4,5) map {x => println("x"); x} map {y => println("y"); y}
x
x
x
x
x
y
y
y
y
y
res0: List[Int] = List(1, 2, 3, 4, 5)
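Both maps are strict on List: the first materializes a full intermediate list (all the x lines) before the second starts (all the y lines). A lazy view interleaves the two functions per element, a sketch for contrast:

scala> List(1, 2, 3, 4, 5).view.map { x => println("x"); x }.map { y => println("y"); y }.toList
// prints x and y interleaved (x, y, x, y, ...) because the view fuses the maps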
// Express foldRight with foldLeft by folding up a function B => B:
// each element wraps the accumulator function, so f is applied
// right-to-left only when the final function meets z.
def foldRightViaFoldLeft_1[A, B](l: List[A], z: B)(f: (A, B) => B): B =
  foldLeft(l, (x: B) => x)((g, a) => (b: B) => g(f(a, b)))(z)

// The dual trick: express foldLeft with foldRight.
def foldLeftViaFoldRight[A, B](l: List[A], z: B)(f: (B, A) => B): B =
  foldRight(l, (b: B) => b)((a, g) => b => g(f(b, a)))(z)
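A worked trace of foldRightViaFoldLeft_1 by hand expansion (assuming the usual list foldLeft from the same exercise set):

// l = List(1, 2), z = 0, f = (a, b) => a - b
// start:       g0 = identity
// after a = 1: g1 = b => g0(f(1, b)) = b => f(1, b)
// after a = 2: g2 = b => g1(f(2, b)) = b => f(1, f(2, b))
// g2(0) = f(1, f(2, 0)) = 1 - (2 - 0) = -1, exactly foldRight(List(1, 2), 0)(f)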
import java.io.{ByteArrayOutputStream, ObjectOutputStream}

object LazySerializationTest extends App {
  // Serialize any object to a byte array via plain Java serialization.
  def serialize(obj: Any): Array[Byte] = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(obj)
    out.close()
    bytes.toByteArray
  }
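The preview ends at serialize; a natural counterpart for round-trip tests (an addition, not the gist's code):

  def deserialize[T](bytes: Array[Byte]): T = {
    import java.io.{ByteArrayInputStream, ObjectInputStream}
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes))
    try in.readObject().asInstanceOf[T] finally in.close()
  }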