Created
December 14, 2015 12:20
-
-
Save mrvege/1d94c98de27aa42f2fa2 to your computer and use it in GitHub Desktop.
Uses a NAStatCounter to pre-filter the NaN values in a Spark dataset. To load the class in spark-shell, use a command like this: `:load [path to your scala file]`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Created by cmy on 12/14/15.
 */
import org.apache.spark.util.StatCounter | |
/**
 * Accumulates statistics over `Double` values while keeping a separate count
 * of NaN inputs.
 *
 * Non-NaN values are forwarded to the wrapped `StatCounter`; NaN values only
 * increment `missing`. Instances are mutable: `add` and `merge` update this
 * object in place and return it so calls can be chained.
 */
class NAStatCounter extends Serializable {
  /** Statistics over every non-NaN value added so far. */
  val stats: StatCounter = new StatCounter()

  /** Number of NaN values seen so far. */
  var missing: Long = 0L

  /**
   * Records a single value.
   *
   * @param x the value to record; NaN is counted in `missing` instead of
   *          being merged into `stats`
   * @return this counter, after the update
   */
  def add(x: Double): NAStatCounter = {
    if (java.lang.Double.isNaN(x)) {
      missing += 1
    } else {
      stats.merge(x)
    }
    this
  }

  /**
   * Folds another counter into this one.
   *
   * @param other the counter whose statistics and NaN count are added to
   *              this counter
   * @return this counter, after the update
   */
  def merge(other: NAStatCounter): NAStatCounter = {
    stats.merge(other.stats)
    missing += other.missing
    this
  }

  /** Renders the wrapped statistics followed by the NaN count. */
  override def toString: String = s"stats:${stats}NaN:$missing"
}

/**
 * Companion factory for [[NAStatCounter]].
 *
 * Declared at top level, next to the class, so that `NAStatCounter(x)`
 * resolves without needing an existing instance.
 */
object NAStatCounter extends Serializable {
  /**
   * Builds a new counter that has already recorded `x`.
   *
   * @param x the first value to record (may be NaN)
   * @return a fresh counter containing only `x`
   */
  def apply(x: Double): NAStatCounter = new NAStatCounter().add(x)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment