Skip to content

Instantly share code, notes, and snippets.

@mrvege
Created December 14, 2015 12:20
Show Gist options
  • Save mrvege/1d94c98de27aa42f2fa2 to your computer and use it in GitHub Desktop.
Save mrvege/1d94c98de27aa42f2fa2 to your computer and use it in GitHub Desktop.
Uses a NAStatCounter to filter out NaN values when computing statistics over a Spark dataset. To load the class in spark-shell, run `:load <path to your Scala file>`.
/**
* Created by cmy on 12/14/15.
*/
import org.apache.spark.util.StatCounter
/**
 * Accumulates summary statistics over a stream of doubles while counting
 * NaN entries separately instead of feeding them into the statistics.
 *
 * Non-NaN values are forwarded to the wrapped Spark `StatCounter`; NaN values
 * only increment `missing`. Both `add` and `merge` mutate this instance and
 * return it, so calls can be chained.
 */
class NAStatCounter extends Serializable {

  /** Running statistics over every non-NaN value seen so far. */
  val stats: StatCounter = new StatCounter()

  /** Number of NaN values seen so far. */
  var missing: Long = 0

  /**
   * Records a single observation.
   *
   * @param x the value to record; NaN is counted as missing rather than
   *          merged into `stats`
   * @return this counter, updated in place
   */
  def add(x: Double): NAStatCounter = {
    if (java.lang.Double.isNaN(x)) {
      missing += 1
    } else {
      stats.merge(x)
    }
    this
  }

  /**
   * Folds another counter's statistics and missing count into this one.
   *
   * @param other the counter to absorb
   * @return this counter, updated in place
   */
  def merge(other: NAStatCounter): NAStatCounter = {
    stats.merge(other.stats)
    missing += other.missing
    this
  }

  /** Renders the wrapped statistics followed by the NaN count. */
  override def toString: String =
    s"stats:${stats.toString()}NaN:$missing"
}

/**
 * Companion factory for [[NAStatCounter]].
 *
 * Declared at the top level (not inside the class body) so that
 * `NAStatCounter(x)` can be called without an existing instance.
 */
object NAStatCounter extends Serializable {

  /**
   * Creates a counter that has already recorded one observation.
   *
   * @param x the first value to record (NaN is counted as missing)
   * @return a new counter containing only `x`
   */
  def apply(x: Double): NAStatCounter = new NAStatCounter().add(x)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment