Last active
July 27, 2017 08:28
-
-
Save erikerlandson/05db1f15c8d623448ff6 to your computer and use it in GitHub Desktop.
Prototype iterators that use efficient gap sampling algorithms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.reflect.ClassTag | |
import scala.sys.process._ | |
class PoissonDist(p: Double) { | |
val q = Math.exp(-p) | |
def next: Int = { | |
var r = 0 | |
var pp = Math.random() | |
while (pp > q) { | |
r += 1 | |
pp *= Math.random() | |
} | |
r | |
} | |
def nextGE1: Int = { | |
// simulate that the standard poisson sampling | |
// gave us at least one iteration, for a sample of >= 1 | |
var pp = q + ((1.0 - q) * Math.random()) | |
var r = 1 | |
// now continue with standard poisson sampling algorithm | |
pp *= Math.random() | |
while (pp > q) { | |
r += 1 | |
pp *= Math.random() | |
} | |
r | |
} | |
} | |
// I used this to verify that my poisson works same as cern, so I can | |
// now consistently use my poisson across testing | |
/* | |
import cern.jet.random.Poisson | |
import cern.jet.random.engine.DRand | |
def test_poisson(n: Int = 1000, p: Double = 0.5) { | |
val pd = new PoissonDist(p) | |
val pdref = new Poisson(p, new DRand()) | |
var t1 = for (j <- 1 to n) yield pd.next | |
var t2 = for (j <- 1 to n) yield pdref.nextInt() | |
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n")) | |
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n")) | |
var stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble | |
println(s"p= $p n= $n KS pval= $stat") | |
val nf = (n.toDouble / (1.0 - Math.exp(-p))).toInt | |
t1 = for (j <- 1 to n) yield pd.nextGE1 | |
t2 = (for (j <- 1 to nf) yield pdref.nextInt()).filter(_ >= 1) | |
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n")) | |
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n")) | |
stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble | |
println(s"p= $p n= $n KS pval= $stat") | |
} | |
*/ | |
class GapSamplingIterator[T: ClassTag](var data: Iterator[T], p: Double) extends Iterator[T] { | |
// implement efficient drop until scala includes fix for jira SI-8835 | |
val dd: Int => Unit = { | |
val arrayClass = Array.empty[T].iterator.getClass | |
data.getClass match { | |
case `arrayClass` => { | |
((n: Int) => { data = data.drop(n) }) | |
} | |
case _ => { | |
((n: Int)=> { | |
var j = 0 | |
while (j < n && data.hasNext) { | |
data.next | |
j += 1 | |
} | |
}) | |
} | |
} | |
} | |
override def hasNext: Boolean = data.hasNext | |
override def next: T = { | |
val r = data.next | |
advance | |
r | |
} | |
val lnq = Math.log(1.0-p) | |
def advance { | |
// skip elements with replication factor zero (i.e. elements that won't be sampled) | |
val u = Math.max(Math.random(), 1e-10) | |
val rlen = (Math.log(u)/lnq).toInt | |
dd(rlen) | |
} | |
// advance to first sample as part of object construction | |
advance | |
} | |
class GapSamplingReplacementIterator[T: ClassTag](var data: Iterator[T], | |
p: Double) extends Iterator[T] { | |
// implement efficient drop until scala includes fix for jira SI-8835 | |
val dd: Int => Unit = { | |
val arrayClass = Array.empty[T].iterator.getClass | |
data.getClass match { | |
case `arrayClass` => { | |
((n: Int) => { data = data.drop(n) }) | |
} | |
case _ => { | |
((n: Int)=> { | |
var j = 0 | |
while (j < n && data.hasNext) { | |
data.next | |
j += 1 | |
} | |
}) | |
} | |
} | |
} | |
val poisson = new PoissonDist(p) | |
// current sampling value, and its replication factor, as we are sampling with replacement: | |
var v: T = _ | |
var rep: Int = 0 | |
override def hasNext: Boolean = data.hasNext || rep > 0 | |
override def next: T = { | |
val r = v | |
rep -= 1 | |
if (rep <= 0) advance | |
r | |
} | |
def advance { | |
// skip elements with replication factor zero (i.e. elements that won't be sampled) | |
val u = Math.max(Math.random(), 1e-10) | |
val rlen = (Math.log(u)/(-p)).toInt | |
dd(rlen) | |
// set the value and replication factor for the next value | |
if (data.hasNext) { | |
v = data.next | |
rep = poisson.nextGE1 | |
} | |
} | |
// advance to first sample as part of object construction | |
advance | |
} | |
def benchmark(n: Int = 1000, m: Int = 1000, p: Double = 0.5) { | |
var data:Seq[Int] = (1 to m).toArray | |
println(s"\np= $p n= $n m= $m\nArray") | |
var start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
data.iterator.filter(_ => Math.random() < p).length | |
}} | |
println(s" Time using filter: ${System.currentTimeMillis() - start}") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
(new GapSamplingIterator(data.iterator, p)).length | |
}} | |
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}") | |
data = (1 to m).toList | |
println(s"List") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
data.iterator.filter(_ => Math.random() < p).length | |
}} | |
println(s" Time using filter: ${System.currentTimeMillis() - start}") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
(new GapSamplingIterator(data.iterator, p)).length | |
}} | |
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}") | |
} | |
def benchmark_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5) { | |
val poisson = new PoissonDist(p) | |
var data:Seq[Int] = (1 to m).toArray | |
println(s"\np= $p n= $n m= $m\nArray (replacement)") | |
var start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
data.iterator.flatMap(element => { | |
val count = poisson.next | |
if (count == 0) Iterator.empty else Iterator.fill(count)(element) | |
}).length | |
}} | |
println(s" Time using filter: ${System.currentTimeMillis() - start}") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
(new GapSamplingReplacementIterator(data.iterator, p)).length | |
}} | |
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}") | |
data = (1 to m).toList | |
println(s"List (replacement)") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
data.iterator.flatMap(element => { | |
val count = poisson.next | |
if (count == 0) Iterator.empty else Iterator.fill(count)(element) | |
}).length | |
}} | |
println(s" Time using filter: ${System.currentTimeMillis() - start}") | |
start = System.currentTimeMillis() | |
(1 to n).foreach { _ => { | |
(new GapSamplingReplacementIterator(data.iterator, p)).length | |
}} | |
println(s" Time using gap sampling: ${System.currentTimeMillis() - start}") | |
} | |
def test_sample(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss: Int = 5, RA:Boolean = true, | |
testName: String="sample length", | |
test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) { | |
val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList | |
val stat = (for (jj <- 1 to ss) yield { | |
val t1 = (for (j <- 1 to n) yield { | |
(new GapSamplingIterator(data.iterator, p)) | |
}).flatMap(test) | |
val t2 = (for (j <- 1 to n) yield { | |
data.iterator.filter(_ => Math.random() < p) | |
}).flatMap(test) | |
// detour through scipy to get KS test: | |
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n")) | |
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n")) | |
Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble | |
}).sortWith(_ < _)(ss / 2) | |
println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat") | |
} | |
def test_sample_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss:Int = 5, RA:Boolean = true, | |
testName: String="sample length (replacement)", | |
test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) { | |
val poisson = new PoissonDist(p) | |
val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList | |
val stat = (for (jj <- 1 to ss) yield { | |
val t1 = (for (j <- 1 to n) yield { | |
(new GapSamplingReplacementIterator(data.iterator, p)) | |
}).flatMap(test) | |
val t2 = (for (j <- 1 to n) yield { | |
data.iterator.flatMap(element => { | |
val count = poisson.next | |
if (count == 0) Iterator.empty else Iterator.fill(count)(element) | |
}) | |
}).flatMap(test) | |
// detour through scipy to get KS test: | |
scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n")) | |
scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n")) | |
Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble | |
}).sortWith(_ < _)(ss / 2) | |
println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat") | |
} | |
// measure gap lengths between sampled values | |
// (assumes values are in sorted order) | |
def gaps(data: Iterator[Int]): Iterator[Int] = { | |
data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2) | |
} | |
def run_benchmarks { | |
benchmark(p= 0.001, n= 1000, m= 100000) | |
benchmark(p= 0.01, n= 1000, m= 100000) | |
benchmark(p= 0.1, n= 1000, m= 100000) | |
benchmark(p= 0.5, n= 1000, m= 100000) | |
benchmark(p= 0.9, n= 1000, m= 100000) | |
benchmark_replace(p= 0.001, n= 1000, m= 100000) | |
benchmark_replace(p= 0.01, n= 1000, m= 100000) | |
benchmark_replace(p= 0.1, n= 1000, m= 100000) | |
benchmark_replace(p= 0.5, n= 1000, m= 100000) | |
benchmark_replace(p= 0.9, n= 1000, m= 100000) | |
} | |
def run_ks_tests { | |
test_sample(p= 0.01, n= 10000, m= 10000) | |
test_sample(p= 0.1, n= 10000, m= 10000) | |
test_sample(p= 0.5, n= 10000, m= 10000) | |
test_sample(p= 0.01, n= 10000, m= 10000, RA= false) | |
test_sample(p= 0.1, n= 10000, m= 10000, RA= false) | |
test_sample(p= 0.5, n= 10000, m= 10000, RA= false) | |
test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps") | |
test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps") | |
test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps") | |
test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false) | |
test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false) | |
test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false) | |
test_sample_replace(p= 0.01, n= 10000, m= 10000) | |
test_sample_replace(p= 0.1, n= 10000, m= 10000) | |
test_sample_replace(p= 0.5, n= 10000, m= 10000) | |
test_sample_replace(p= 0.01, n= 10000, m= 10000, RA= false) | |
test_sample_replace(p= 0.1, n= 10000, m= 10000, RA= false) | |
test_sample_replace(p= 0.5, n= 10000, m= 10000, RA= false) | |
test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)") | |
test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)") | |
test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)") | |
test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false) | |
test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false) | |
test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
import sys | |
import argparse | |
from scipy import stats | |
argparser = argparse.ArgumentParser() | |
argparser.add_argument('data1', type=argparse.FileType('r'), metavar='<data-file-1>') | |
argparser.add_argument('data2', type=argparse.FileType('r'), metavar='<data-file-2>') | |
argparser.add_argument('-mr', default=False, action='store_true', help='machine readable output') | |
args = argparser.parse_args() | |
dv1 = args.data1.read().split() | |
dv2 = args.data2.read().split() | |
ksres = stats.ks_2samp(dv1, dv2) | |
if args.mr: | |
sys.stdout.write("%s %s\n" % ksres) | |
else: | |
sys.stdout.write("KS test results: stat=%s pval=%s\n" % ksres) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scala> run_benchmarks | |
p= 0.001 n= 1000 m= 100000 | |
Array | |
Time using filter: 2833 | |
Time using gap sampling: 29 | |
List | |
Time using filter: 2213 | |
Time using gap sampling: 230 | |
p= 0.01 n= 1000 m= 100000 | |
Array | |
Time using filter: 2825 | |
Time using gap sampling: 76 | |
List | |
Time using filter: 2220 | |
Time using gap sampling: 265 | |
p= 0.1 n= 1000 m= 100000 | |
Array | |
Time using filter: 2985 | |
Time using gap sampling: 787 | |
List | |
Time using filter: 2337 | |
Time using gap sampling: 796 | |
p= 0.5 n= 1000 m= 100000 | |
Array | |
Time using filter: 3526 | |
Time using gap sampling: 3478 | |
List | |
Time using filter: 2794 | |
Time using gap sampling: 3151 | |
p= 0.9 n= 1000 m= 100000 | |
Array | |
Time using filter: 3023 | |
Time using gap sampling: 6081 | |
List | |
Time using filter: 2513 | |
Time using gap sampling: 4849 | |
p= 0.001 n= 1000 m= 100000 | |
Array (replacement) | |
Time using filter: 2604 | |
Time using gap sampling: 45 | |
List (replacement) | |
Time using filter: 2431 | |
Time using gap sampling: 233 | |
p= 0.01 n= 1000 m= 100000 | |
Array (replacement) | |
Time using filter: 3442 | |
Time using gap sampling: 117 | |
List (replacement) | |
Time using filter: 2450 | |
Time using gap sampling: 299 | |
p= 0.1 n= 1000 m= 100000 | |
Array (replacement) | |
Time using filter: 3653 | |
Time using gap sampling: 1044 | |
List (replacement) | |
Time using filter: 2984 | |
Time using gap sampling: 1330 | |
p= 0.5 n= 1000 m= 100000 | |
Array (replacement) | |
Time using filter: 5643 | |
Time using gap sampling: 5073 | |
List (replacement) | |
Time using filter: 5331 | |
Time using gap sampling: 4752 | |
p= 0.9 n= 1000 m= 100000 | |
Array (replacement) | |
Time using filter: 7668 | |
Time using gap sampling: 8388 | |
List (replacement) | |
Time using filter: 6744 | |
Time using gap sampling: 7811 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scala> run_ks_tests | |
test= sample length | |
p= 0.01 n= 10000 m= 10000 RA= true | |
KS pval= 0.98104407763 | |
test= sample length | |
p= 0.1 n= 10000 m= 10000 RA= true | |
KS pval= 0.912879254739 | |
test= sample length | |
p= 0.5 n= 10000 m= 10000 RA= true | |
KS pval= 0.732634164255 | |
test= sample length | |
p= 0.01 n= 10000 m= 10000 RA= false | |
KS pval= 0.986338712288 | |
test= sample length | |
p= 0.1 n= 10000 m= 10000 RA= false | |
KS pval= 0.778352793151 | |
test= sample length | |
p= 0.5 n= 10000 m= 10000 RA= false | |
KS pval= 0.242107687641 | |
test= sample gaps | |
p= 0.01 n= 10000 m= 1000 RA= true | |
KS pval= 0.398632159825 | |
test= sample gaps | |
p= 0.1 n= 10000 m= 1000 RA= true | |
KS pval= 0.918625732614 | |
test= sample gaps | |
p= 0.5 n= 10000 m= 1000 RA= true | |
KS pval= 0.937181377828 | |
test= sample gaps | |
p= 0.01 n= 10000 m= 1000 RA= false | |
KS pval= 0.512899405585 | |
test= sample gaps | |
p= 0.1 n= 10000 m= 1000 RA= false | |
KS pval= 0.667342469032 | |
test= sample gaps | |
p= 0.5 n= 10000 m= 1000 RA= false | |
KS pval= 0.945788971072 | |
test= sample length (replacement) | |
p= 0.01 n= 10000 m= 10000 RA= true | |
KS pval= 0.673545329328 | |
test= sample length (replacement) | |
p= 0.1 n= 10000 m= 10000 RA= true | |
KS pval= 0.364634485794 | |
test= sample length (replacement) | |
p= 0.5 n= 10000 m= 10000 RA= true | |
KS pval= 0.637715845192 | |
test= sample length (replacement) | |
p= 0.01 n= 10000 m= 10000 RA= false | |
KS pval= 0.34627068807 | |
test= sample length (replacement) | |
p= 0.1 n= 10000 m= 10000 RA= false | |
KS pval= 0.190771573151 | |
test= sample length (replacement) | |
p= 0.5 n= 10000 m= 10000 RA= false | |
KS pval= 0.487108719852 | |
test= sample gaps (replacement) | |
p= 0.01 n= 10000 m= 1000 RA= true | |
KS pval= 0.648436825568 | |
test= sample gaps (replacement) | |
p= 0.1 n= 10000 m= 1000 RA= true | |
KS pval= 0.637677942914 | |
test= sample gaps (replacement) | |
p= 0.5 n= 10000 m= 1000 RA= true | |
KS pval= 0.953266697608 | |
test= sample gaps (replacement) | |
p= 0.01 n= 10000 m= 1000 RA= false | |
KS pval= 0.765060442147 | |
test= sample gaps (replacement) | |
p= 0.1 n= 10000 m= 1000 RA= false | |
KS pval= 0.443334023868 | |
test= sample gaps (replacement) | |
p= 0.5 n= 10000 m= 1000 RA= false | |
KS pval= 0.848623231604 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment