Skip to content

Instantly share code, notes, and snippets.

@erikerlandson
Last active July 27, 2017 08:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikerlandson/05db1f15c8d623448ff6 to your computer and use it in GitHub Desktop.
Save erikerlandson/05db1f15c8d623448ff6 to your computer and use it in GitHub Desktop.
Prototype iterators that use efficient gap sampling algorithms
import scala.reflect.ClassTag
import scala.sys.process._
class PoissonDist(p: Double) {
val q = Math.exp(-p)
def next: Int = {
var r = 0
var pp = Math.random()
while (pp > q) {
r += 1
pp *= Math.random()
}
r
}
def nextGE1: Int = {
// simulate that the standard poisson sampling
// gave us at least one iteration, for a sample of >= 1
var pp = q + ((1.0 - q) * Math.random())
var r = 1
// now continue with standard poisson sampling algorithm
pp *= Math.random()
while (pp > q) {
r += 1
pp *= Math.random()
}
r
}
}
// I used this to verify that my poisson works same as cern, so I can
// now consistently use my poisson across testing
/*
import cern.jet.random.Poisson
import cern.jet.random.engine.DRand
def test_poisson(n: Int = 1000, p: Double = 0.5) {
val pd = new PoissonDist(p)
val pdref = new Poisson(p, new DRand())
var t1 = for (j <- 1 to n) yield pd.next
var t2 = for (j <- 1 to n) yield pdref.nextInt()
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
var stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
println(s"p= $p n= $n KS pval= $stat")
val nf = (n.toDouble / (1.0 - Math.exp(-p))).toInt
t1 = for (j <- 1 to n) yield pd.nextGE1
t2 = (for (j <- 1 to nf) yield pdref.nextInt()).filter(_ >= 1)
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
println(s"p= $p n= $n KS pval= $stat")
}
*/
class GapSamplingIterator[T: ClassTag](var data: Iterator[T], p: Double) extends Iterator[T] {
// implement efficient drop until scala includes fix for jira SI-8835
val dd: Int => Unit = {
val arrayClass = Array.empty[T].iterator.getClass
data.getClass match {
case `arrayClass` => {
((n: Int) => { data = data.drop(n) })
}
case _ => {
((n: Int)=> {
var j = 0
while (j < n && data.hasNext) {
data.next
j += 1
}
})
}
}
}
override def hasNext: Boolean = data.hasNext
override def next: T = {
val r = data.next
advance
r
}
val lnq = Math.log(1.0-p)
def advance {
// skip elements with replication factor zero (i.e. elements that won't be sampled)
val u = Math.max(Math.random(), 1e-10)
val rlen = (Math.log(u)/lnq).toInt
dd(rlen)
}
// advance to first sample as part of object construction
advance
}
class GapSamplingReplacementIterator[T: ClassTag](var data: Iterator[T],
p: Double) extends Iterator[T] {
// implement efficient drop until scala includes fix for jira SI-8835
val dd: Int => Unit = {
val arrayClass = Array.empty[T].iterator.getClass
data.getClass match {
case `arrayClass` => {
((n: Int) => { data = data.drop(n) })
}
case _ => {
((n: Int)=> {
var j = 0
while (j < n && data.hasNext) {
data.next
j += 1
}
})
}
}
}
val poisson = new PoissonDist(p)
// current sampling value, and its replication factor, as we are sampling with replacement:
var v: T = _
var rep: Int = 0
override def hasNext: Boolean = data.hasNext || rep > 0
override def next: T = {
val r = v
rep -= 1
if (rep <= 0) advance
r
}
def advance {
// skip elements with replication factor zero (i.e. elements that won't be sampled)
val u = Math.max(Math.random(), 1e-10)
val rlen = (Math.log(u)/(-p)).toInt
dd(rlen)
// set the value and replication factor for the next value
if (data.hasNext) {
v = data.next
rep = poisson.nextGE1
}
}
// advance to first sample as part of object construction
advance
}
def benchmark(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
var data:Seq[Int] = (1 to m).toArray
println(s"\np= $p n= $n m= $m\nArray")
var start = System.currentTimeMillis()
(1 to n).foreach { _ => {
data.iterator.filter(_ => Math.random() < p).length
}}
println(s" Time using filter: ${System.currentTimeMillis() - start}")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
(new GapSamplingIterator(data.iterator, p)).length
}}
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
data = (1 to m).toList
println(s"List")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
data.iterator.filter(_ => Math.random() < p).length
}}
println(s" Time using filter: ${System.currentTimeMillis() - start}")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
(new GapSamplingIterator(data.iterator, p)).length
}}
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
}
def benchmark_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
val poisson = new PoissonDist(p)
var data:Seq[Int] = (1 to m).toArray
println(s"\np= $p n= $n m= $m\nArray (replacement)")
var start = System.currentTimeMillis()
(1 to n).foreach { _ => {
data.iterator.flatMap(element => {
val count = poisson.next
if (count == 0) Iterator.empty else Iterator.fill(count)(element)
}).length
}}
println(s" Time using filter: ${System.currentTimeMillis() - start}")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
(new GapSamplingReplacementIterator(data.iterator, p)).length
}}
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
data = (1 to m).toList
println(s"List (replacement)")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
data.iterator.flatMap(element => {
val count = poisson.next
if (count == 0) Iterator.empty else Iterator.fill(count)(element)
}).length
}}
println(s" Time using filter: ${System.currentTimeMillis() - start}")
start = System.currentTimeMillis()
(1 to n).foreach { _ => {
(new GapSamplingReplacementIterator(data.iterator, p)).length
}}
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
}
def test_sample(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss: Int = 5, RA:Boolean = true,
testName: String="sample length",
test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList
val stat = (for (jj <- 1 to ss) yield {
val t1 = (for (j <- 1 to n) yield {
(new GapSamplingIterator(data.iterator, p))
}).flatMap(test)
val t2 = (for (j <- 1 to n) yield {
data.iterator.filter(_ => Math.random() < p)
}).flatMap(test)
// detour through scipy to get KS test:
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
}).sortWith(_ < _)(ss / 2)
println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat")
}
def test_sample_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss:Int = 5, RA:Boolean = true,
testName: String="sample length (replacement)",
test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
val poisson = new PoissonDist(p)
val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList
val stat = (for (jj <- 1 to ss) yield {
val t1 = (for (j <- 1 to n) yield {
(new GapSamplingReplacementIterator(data.iterator, p))
}).flatMap(test)
val t2 = (for (j <- 1 to n) yield {
data.iterator.flatMap(element => {
val count = poisson.next
if (count == 0) Iterator.empty else Iterator.fill(count)(element)
})
}).flatMap(test)
// detour through scipy to get KS test:
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
}).sortWith(_ < _)(ss / 2)
println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat")
}
// measure gap lengths between sampled values
// (assumes values are in sorted order)
def gaps(data: Iterator[Int]): Iterator[Int] = {
data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2)
}
def run_benchmarks {
benchmark(p= 0.001, n= 1000, m= 100000)
benchmark(p= 0.01, n= 1000, m= 100000)
benchmark(p= 0.1, n= 1000, m= 100000)
benchmark(p= 0.5, n= 1000, m= 100000)
benchmark(p= 0.9, n= 1000, m= 100000)
benchmark_replace(p= 0.001, n= 1000, m= 100000)
benchmark_replace(p= 0.01, n= 1000, m= 100000)
benchmark_replace(p= 0.1, n= 1000, m= 100000)
benchmark_replace(p= 0.5, n= 1000, m= 100000)
benchmark_replace(p= 0.9, n= 1000, m= 100000)
}
def run_ks_tests {
test_sample(p= 0.01, n= 10000, m= 10000)
test_sample(p= 0.1, n= 10000, m= 10000)
test_sample(p= 0.5, n= 10000, m= 10000)
test_sample(p= 0.01, n= 10000, m= 10000, RA= false)
test_sample(p= 0.1, n= 10000, m= 10000, RA= false)
test_sample(p= 0.5, n= 10000, m= 10000, RA= false)
test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps")
test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps")
test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps")
test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
test_sample_replace(p= 0.01, n= 10000, m= 10000)
test_sample_replace(p= 0.1, n= 10000, m= 10000)
test_sample_replace(p= 0.5, n= 10000, m= 10000)
test_sample_replace(p= 0.01, n= 10000, m= 10000, RA= false)
test_sample_replace(p= 0.1, n= 10000, m= 10000, RA= false)
test_sample_replace(p= 0.5, n= 10000, m= 10000, RA= false)
test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
}
#!/bin/env python
import sys
import argparse
from scipy import stats
argparser = argparse.ArgumentParser()
argparser.add_argument('data1', type=argparse.FileType('r'), metavar='<data-file-1>')
argparser.add_argument('data2', type=argparse.FileType('r'), metavar='<data-file-2>')
argparser.add_argument('-mr', default=False, action='store_true', help='machine readable output')
args = argparser.parse_args()
dv1 = args.data1.read().split()
dv2 = args.data2.read().split()
ksres = stats.ks_2samp(dv1, dv2)
if args.mr:
sys.stdout.write("%s %s\n" % ksres)
else:
sys.stdout.write("KS test results: stat=%s pval=%s\n" % ksres)
scala> run_benchmarks
p= 0.001 n= 1000 m= 100000
Array
Time using filter: 2833
Time using gap sampling: 29
List
Time using filter: 2213
Time using gap sampling: 230
p= 0.01 n= 1000 m= 100000
Array
Time using filter: 2825
Time using gap sampling: 76
List
Time using filter: 2220
Time using gap sampling: 265
p= 0.1 n= 1000 m= 100000
Array
Time using filter: 2985
Time using gap sampling: 787
List
Time using filter: 2337
Time using gap sampling: 796
p= 0.5 n= 1000 m= 100000
Array
Time using filter: 3526
Time using gap sampling: 3478
List
Time using filter: 2794
Time using gap sampling: 3151
p= 0.9 n= 1000 m= 100000
Array
Time using filter: 3023
Time using gap sampling: 6081
List
Time using filter: 2513
Time using gap sampling: 4849
p= 0.001 n= 1000 m= 100000
Array (replacement)
Time using filter: 2604
Time using gap sampling: 45
List (replacement)
Time using filter: 2431
Time using gap sampling: 233
p= 0.01 n= 1000 m= 100000
Array (replacement)
Time using filter: 3442
Time using gap sampling: 117
List (replacement)
Time using filter: 2450
Time using gap sampling: 299
p= 0.1 n= 1000 m= 100000
Array (replacement)
Time using filter: 3653
Time using gap sampling: 1044
List (replacement)
Time using filter: 2984
Time using gap sampling: 1330
p= 0.5 n= 1000 m= 100000
Array (replacement)
Time using filter: 5643
Time using gap sampling: 5073
List (replacement)
Time using filter: 5331
Time using gap sampling: 4752
p= 0.9 n= 1000 m= 100000
Array (replacement)
Time using filter: 7668
Time using gap sampling: 8388
List (replacement)
Time using filter: 6744
Time using gap sampling: 7811
scala> run_ks_tests
test= sample length
p= 0.01 n= 10000 m= 10000 RA= true
KS pval= 0.98104407763
test= sample length
p= 0.1 n= 10000 m= 10000 RA= true
KS pval= 0.912879254739
test= sample length
p= 0.5 n= 10000 m= 10000 RA= true
KS pval= 0.732634164255
test= sample length
p= 0.01 n= 10000 m= 10000 RA= false
KS pval= 0.986338712288
test= sample length
p= 0.1 n= 10000 m= 10000 RA= false
KS pval= 0.778352793151
test= sample length
p= 0.5 n= 10000 m= 10000 RA= false
KS pval= 0.242107687641
test= sample gaps
p= 0.01 n= 10000 m= 1000 RA= true
KS pval= 0.398632159825
test= sample gaps
p= 0.1 n= 10000 m= 1000 RA= true
KS pval= 0.918625732614
test= sample gaps
p= 0.5 n= 10000 m= 1000 RA= true
KS pval= 0.937181377828
test= sample gaps
p= 0.01 n= 10000 m= 1000 RA= false
KS pval= 0.512899405585
test= sample gaps
p= 0.1 n= 10000 m= 1000 RA= false
KS pval= 0.667342469032
test= sample gaps
p= 0.5 n= 10000 m= 1000 RA= false
KS pval= 0.945788971072
test= sample length (replacement)
p= 0.01 n= 10000 m= 10000 RA= true
KS pval= 0.673545329328
test= sample length (replacement)
p= 0.1 n= 10000 m= 10000 RA= true
KS pval= 0.364634485794
test= sample length (replacement)
p= 0.5 n= 10000 m= 10000 RA= true
KS pval= 0.637715845192
test= sample length (replacement)
p= 0.01 n= 10000 m= 10000 RA= false
KS pval= 0.34627068807
test= sample length (replacement)
p= 0.1 n= 10000 m= 10000 RA= false
KS pval= 0.190771573151
test= sample length (replacement)
p= 0.5 n= 10000 m= 10000 RA= false
KS pval= 0.487108719852
test= sample gaps (replacement)
p= 0.01 n= 10000 m= 1000 RA= true
KS pval= 0.648436825568
test= sample gaps (replacement)
p= 0.1 n= 10000 m= 1000 RA= true
KS pval= 0.637677942914
test= sample gaps (replacement)
p= 0.5 n= 10000 m= 1000 RA= true
KS pval= 0.953266697608
test= sample gaps (replacement)
p= 0.01 n= 10000 m= 1000 RA= false
KS pval= 0.765060442147
test= sample gaps (replacement)
p= 0.1 n= 10000 m= 1000 RA= false
KS pval= 0.443334023868
test= sample gaps (replacement)
p= 0.5 n= 10000 m= 1000 RA= false
KS pval= 0.848623231604
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment