erikerlandson/gap_sampling.scala

## gap_sampling.scala
import scala.reflect.ClassTag
import scala.sys.process._


class PoissonDist(p: Double) {
  val q = Math.exp(-p)

  def next: Int = {
    var r = 0
    var pp = Math.random()
    while (pp > q) {
      r += 1
      pp *= Math.random()
    }
    r
  }

  def nextGE1: Int = {
    // simulate that the standard poisson sampling
    // gave us at least one iteration, for a sample of >= 1
    var pp = q + ((1.0 - q) * Math.random())
    var r = 1

    // now continue with standard poisson sampling algorithm
    pp *= Math.random()
    while (pp > q) {
      r += 1
      pp *= Math.random()
    }
    r
  }
}

// I used this to verify that my poisson works same as cern, so I can
// now consistently use my poisson across testing
/*
import cern.jet.random.Poisson
import cern.jet.random.engine.DRand
def test_poisson(n: Int = 1000, p: Double = 0.5) {
  val pd = new PoissonDist(p)
  val pdref = new Poisson(p, new DRand())
  var t1 = for (j <- 1 to n) yield pd.next
  var t2 = for (j <- 1 to n) yield pdref.nextInt()

  scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
  scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
  var stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble

  println(s"p= $p  n= $n    KS pval= $stat")

  val nf = (n.toDouble / (1.0 - Math.exp(-p))).toInt

  t1 = for (j <- 1 to n) yield pd.nextGE1
  t2 = (for (j <- 1 to nf) yield pdref.nextInt()).filter(_ >= 1)

  scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
  scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
  stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble

  println(s"p= $p  n= $n    KS pval= $stat")
}
*/

class GapSamplingIterator[T: ClassTag](var data: Iterator[T], p: Double) extends Iterator[T] {
  // implement efficient drop until scala includes fix for jira SI-8835
  val dd: Int => Unit = {
    val arrayClass = Array.empty[T].iterator.getClass
    data.getClass match {
      case `arrayClass` => {
        ((n: Int) => { data = data.drop(n) })
      }
      case _ => {
        ((n: Int)=> {
          var j = 0
          while (j < n  &&  data.hasNext) {
            data.next
            j += 1
          }
        })
      }
    }
  }

  override def hasNext: Boolean = data.hasNext

  override def next: T = {
    val r = data.next
    advance
    r
  }

  val lnq = Math.log(1.0-p)
  def advance {
    // skip elements with replication factor zero (i.e. elements that won't be sampled)
    val u = Math.max(Math.random(), 1e-10)
    val rlen = (Math.log(u)/lnq).toInt
    dd(rlen)
  }

  // advance to first sample as part of object construction
  advance
}


class GapSamplingReplacementIterator[T: ClassTag](var data: Iterator[T],
                                                  p: Double) extends Iterator[T] {
  // implement efficient drop until scala includes fix for jira SI-8835
  val dd: Int => Unit = {
    val arrayClass = Array.empty[T].iterator.getClass
    data.getClass match {
      case `arrayClass` => {
        ((n: Int) => { data = data.drop(n) })
      }
      case _ => {
        ((n: Int)=> {
          var j = 0
          while (j < n  &&  data.hasNext) {
            data.next
            j += 1
          }
        })
      }
    }
  }

  val poisson = new PoissonDist(p)

  // current sampling value, and its replication factor, as we are sampling with replacement:
  var v: T = _
  var rep: Int = 0

  override def hasNext: Boolean = data.hasNext || rep > 0

  override def next: T = {
    val r = v
    rep -= 1
    if (rep <= 0) advance
    r
  }

  def advance {
    // skip elements with replication factor zero (i.e. elements that won't be sampled)
    val u = Math.max(Math.random(), 1e-10)
    val rlen = (Math.log(u)/(-p)).toInt
    dd(rlen)
    // set the value and replication factor for the next value
    if (data.hasNext) {
      v = data.next
      rep = poisson.nextGE1
    }
  }

  // advance to first sample as part of object construction
  advance
}


def benchmark(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
  var data:Seq[Int] = (1 to m).toArray
  println(s"\np= $p  n= $n  m= $m\nArray")

  var start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    data.iterator.filter(_ => Math.random() < p).length
  }}
  println(s"  Time using filter: ${System.currentTimeMillis() - start}")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    (new GapSamplingIterator(data.iterator, p)).length
  }}
  println(s"  Time using gap sampling: ${System.currentTimeMillis() - start}")


  data = (1 to m).toList
  println(s"List")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    data.iterator.filter(_ => Math.random() < p).length
  }}
  println(s"  Time using filter: ${System.currentTimeMillis() - start}")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    (new GapSamplingIterator(data.iterator, p)).length
  }}
  println(s"  Time using gap sampling: ${System.currentTimeMillis() - start}")
}

def benchmark_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
  val poisson = new PoissonDist(p)

  var data:Seq[Int] = (1 to m).toArray
  println(s"\np= $p  n= $n  m= $m\nArray (replacement)")

  var start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    data.iterator.flatMap(element => {
      val count = poisson.next
      if (count == 0) Iterator.empty else Iterator.fill(count)(element)
    }).length
  }}
  println(s"  Time using filter: ${System.currentTimeMillis() - start}")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    (new GapSamplingReplacementIterator(data.iterator, p)).length
  }}
  println(s"  Time using gap sampling: ${System.currentTimeMillis() - start}")


  data = (1 to m).toList
  println(s"List (replacement)")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    data.iterator.flatMap(element => {
      val count = poisson.next
      if (count == 0) Iterator.empty else Iterator.fill(count)(element)
    }).length
  }}
  println(s"  Time using filter: ${System.currentTimeMillis() - start}")

  start = System.currentTimeMillis()
  (1 to n).foreach { _ => {
    (new GapSamplingReplacementIterator(data.iterator, p)).length
  }}
  println(s"  Time using gap sampling: ${System.currentTimeMillis() - start}")
}


def test_sample(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss: Int = 5, RA:Boolean = true,
                testName: String="sample length",
                test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
  val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList

  val stat = (for (jj <- 1 to ss) yield {
    val t1 = (for (j <- 1 to n) yield {
      (new GapSamplingIterator(data.iterator, p))
    }).flatMap(test)
    val t2 = (for (j <- 1 to n) yield {
      data.iterator.filter(_ => Math.random() < p)
    }).flatMap(test)

    // detour through scipy to get KS test:
    scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
    scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
    Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
  }).sortWith(_ < _)(ss / 2)
  println(s"\ntest= $testName\n    p= $p  n= $n  m= $m  RA= $RA\n    KS pval= $stat")
}

def test_sample_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss:Int = 5, RA:Boolean = true,
                testName: String="sample length (replacement)",
                test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
  val poisson = new PoissonDist(p)
  val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList

  val stat = (for (jj <- 1 to ss) yield {
    val t1 = (for (j <- 1 to n) yield {
      (new GapSamplingReplacementIterator(data.iterator, p))
    }).flatMap(test)
    val t2 = (for (j <- 1 to n) yield {
      data.iterator.flatMap(element => {
        val count = poisson.next
        if (count == 0) Iterator.empty else Iterator.fill(count)(element)
      })
    }).flatMap(test)

    // detour through scipy to get KS test:
    scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
    scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
    Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
  }).sortWith(_ < _)(ss / 2)
  println(s"\ntest= $testName\n    p= $p  n= $n  m= $m  RA= $RA\n    KS pval= $stat")
}

// measure gap lengths between sampled values
// (assumes values are in sorted order)
def gaps(data: Iterator[Int]): Iterator[Int] = {
  data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2)
}

def run_benchmarks {
  benchmark(p= 0.001, n= 1000, m= 100000)
  benchmark(p= 0.01, n= 1000, m= 100000)
  benchmark(p= 0.1, n= 1000, m= 100000)
  benchmark(p= 0.5, n= 1000, m= 100000)
  benchmark(p= 0.9, n= 1000, m= 100000)

  benchmark_replace(p= 0.001, n= 1000, m= 100000)
  benchmark_replace(p= 0.01, n= 1000, m= 100000)
  benchmark_replace(p= 0.1, n= 1000, m= 100000)
  benchmark_replace(p= 0.5, n= 1000, m= 100000)
  benchmark_replace(p= 0.9, n= 1000, m= 100000)
}

def run_ks_tests {
  test_sample(p= 0.01, n= 10000, m= 10000)
  test_sample(p= 0.1, n= 10000, m= 10000)
  test_sample(p= 0.5, n= 10000, m= 10000)

  test_sample(p= 0.01, n= 10000, m= 10000, RA= false)
  test_sample(p= 0.1, n= 10000, m= 10000, RA= false)
  test_sample(p= 0.5, n= 10000, m= 10000, RA= false)

  test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps")
  test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps")
  test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps")

  test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
  test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
  test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)

  test_sample_replace(p= 0.01, n= 10000, m= 10000)
  test_sample_replace(p= 0.1, n= 10000, m= 10000)
  test_sample_replace(p= 0.5, n= 10000, m= 10000)

  test_sample_replace(p= 0.01, n= 10000, m= 10000, RA= false)
  test_sample_replace(p= 0.1, n= 10000, m= 10000, RA= false)
  test_sample_replace(p= 0.5, n= 10000, m= 10000, RA= false)

  test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
  test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
  test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")

  test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
  test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
  test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
}

## kstest.py
#!/bin/env python

import sys
import argparse
from scipy import stats

argparser = argparse.ArgumentParser()
argparser.add_argument('data1', type=argparse.FileType('r'), metavar='<data-file-1>')
argparser.add_argument('data2', type=argparse.FileType('r'), metavar='<data-file-2>')
argparser.add_argument('-mr', default=False, action='store_true', help='machine readable output')

args = argparser.parse_args()

dv1 = args.data1.read().split()
dv2 = args.data2.read().split()

ksres = stats.ks_2samp(dv1, dv2)

if args.mr:
    sys.stdout.write("%s %s\n" % ksres)
else:
    sys.stdout.write("KS test results: stat=%s   pval=%s\n" % ksres)

## test_results_benchmark.txt
scala> run_benchmarks

p= 0.001  n= 1000  m= 100000
Array
  Time using filter: 2833
  Time using gap sampling: 29
List
  Time using filter: 2213
  Time using gap sampling: 230

p= 0.01  n= 1000  m= 100000
Array
  Time using filter: 2825
  Time using gap sampling: 76
List
  Time using filter: 2220
  Time using gap sampling: 265

p= 0.1  n= 1000  m= 100000
Array
  Time using filter: 2985
  Time using gap sampling: 787
List
  Time using filter: 2337
  Time using gap sampling: 796

p= 0.5  n= 1000  m= 100000
Array
  Time using filter: 3526
  Time using gap sampling: 3478
List
  Time using filter: 2794
  Time using gap sampling: 3151

p= 0.9  n= 1000  m= 100000
Array
  Time using filter: 3023
  Time using gap sampling: 6081
List
  Time using filter: 2513
  Time using gap sampling: 4849

p= 0.001  n= 1000  m= 100000
Array (replacement)
  Time using filter: 2604
  Time using gap sampling: 45
List (replacement)
  Time using filter: 2431
  Time using gap sampling: 233

p= 0.01  n= 1000  m= 100000
Array (replacement)
  Time using filter: 3442
  Time using gap sampling: 117
List (replacement)
  Time using filter: 2450
  Time using gap sampling: 299

p= 0.1  n= 1000  m= 100000
Array (replacement)
  Time using filter: 3653
  Time using gap sampling: 1044
List (replacement)
  Time using filter: 2984
  Time using gap sampling: 1330

p= 0.5  n= 1000  m= 100000
Array (replacement)
  Time using filter: 5643
  Time using gap sampling: 5073
List (replacement)
  Time using filter: 5331
  Time using gap sampling: 4752

p= 0.9  n= 1000  m= 100000
Array (replacement)
  Time using filter: 7668
  Time using gap sampling: 8388
List (replacement)
  Time using filter: 6744
  Time using gap sampling: 7811

## test_results_distributions.txt
scala> run_ks_tests

test= sample length
    p= 0.01  n= 10000  m= 10000  RA= true
    KS pval= 0.98104407763

test= sample length
    p= 0.1  n= 10000  m= 10000  RA= true
    KS pval= 0.912879254739

test= sample length
    p= 0.5  n= 10000  m= 10000  RA= true
    KS pval= 0.732634164255

test= sample length
    p= 0.01  n= 10000  m= 10000  RA= false
    KS pval= 0.986338712288

test= sample length
    p= 0.1  n= 10000  m= 10000  RA= false
    KS pval= 0.778352793151

test= sample length
    p= 0.5  n= 10000  m= 10000  RA= false
    KS pval= 0.242107687641

test= sample gaps
    p= 0.01  n= 10000  m= 1000  RA= true
    KS pval= 0.398632159825

test= sample gaps
    p= 0.1  n= 10000  m= 1000  RA= true
    KS pval= 0.918625732614

test= sample gaps
    p= 0.5  n= 10000  m= 1000  RA= true
    KS pval= 0.937181377828

test= sample gaps
    p= 0.01  n= 10000  m= 1000  RA= false
    KS pval= 0.512899405585

test= sample gaps
    p= 0.1  n= 10000  m= 1000  RA= false
    KS pval= 0.667342469032

test= sample gaps
    p= 0.5  n= 10000  m= 1000  RA= false
    KS pval= 0.945788971072

test= sample length (replacement)
    p= 0.01  n= 10000  m= 10000  RA= true
    KS pval= 0.673545329328

test= sample length (replacement)
    p= 0.1  n= 10000  m= 10000  RA= true
    KS pval= 0.364634485794

test= sample length (replacement)
    p= 0.5  n= 10000  m= 10000  RA= true
    KS pval= 0.637715845192

test= sample length (replacement)
    p= 0.01  n= 10000  m= 10000  RA= false
    KS pval= 0.34627068807

test= sample length (replacement)
    p= 0.1  n= 10000  m= 10000  RA= false
    KS pval= 0.190771573151

test= sample length (replacement)
    p= 0.5  n= 10000  m= 10000  RA= false
    KS pval= 0.487108719852

test= sample gaps (replacement)
    p= 0.01  n= 10000  m= 1000  RA= true
    KS pval= 0.648436825568

test= sample gaps (replacement)
    p= 0.1  n= 10000  m= 1000  RA= true
    KS pval= 0.637677942914

test= sample gaps (replacement)
    p= 0.5  n= 10000  m= 1000  RA= true
    KS pval= 0.953266697608

test= sample gaps (replacement)
    p= 0.01  n= 10000  m= 1000  RA= false
    KS pval= 0.765060442147

test= sample gaps (replacement)
    p= 0.1  n= 10000  m= 1000  RA= false
    KS pval= 0.443334023868

test= sample gaps (replacement)
    p= 0.5  n= 10000  m= 1000  RA= false
    KS pval= 0.848623231604
	import scala.reflect.ClassTag
	import scala.sys.process._


	class PoissonDist(p: Double) {
	val q = Math.exp(-p)

	def next: Int = {
	var r = 0
	var pp = Math.random()
	while (pp > q) {
	r += 1
	pp *= Math.random()
	}
	r
	}

	def nextGE1: Int = {
	// simulate that the standard poisson sampling
	// gave us at least one iteration, for a sample of >= 1
	var pp = q + ((1.0 - q) * Math.random())
	var r = 1

	// now continue with standard poisson sampling algorithm
	pp *= Math.random()
	while (pp > q) {
	r += 1
	pp *= Math.random()
	}
	r
	}
	}

	// I used this to verify that my poisson works same as cern, so I can
	// now consistently use my poisson across testing
	/*
	import cern.jet.random.Poisson
	import cern.jet.random.engine.DRand
	def test_poisson(n: Int = 1000, p: Double = 0.5) {
	val pd = new PoissonDist(p)
	val pdref = new Poisson(p, new DRand())
	var t1 = for (j <- 1 to n) yield pd.next
	var t2 = for (j <- 1 to n) yield pdref.nextInt()

	scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
	scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
	var stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble

	println(s"p= $p n= $n KS pval= $stat")

	val nf = (n.toDouble / (1.0 - Math.exp(-p))).toInt

	t1 = for (j <- 1 to n) yield pd.nextGE1
	t2 = (for (j <- 1 to nf) yield pdref.nextInt()).filter(_ >= 1)

	scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
	scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
	stat = Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble

	println(s"p= $p n= $n KS pval= $stat")
	}
	*/

	class GapSamplingIterator[T: ClassTag](var data: Iterator[T], p: Double) extends Iterator[T] {
	// implement efficient drop until scala includes fix for jira SI-8835
	val dd: Int => Unit = {
	val arrayClass = Array.empty[T].iterator.getClass
	data.getClass match {
	case `arrayClass` => {
	((n: Int) => { data = data.drop(n) })
	}
	case _ => {
	((n: Int)=> {
	var j = 0
	while (j < n && data.hasNext) {
	data.next
	j += 1
	}
	})
	}
	}
	}

	override def hasNext: Boolean = data.hasNext

	override def next: T = {
	val r = data.next
	advance
	r
	}

	val lnq = Math.log(1.0-p)
	def advance {
	// skip elements with replication factor zero (i.e. elements that won't be sampled)
	val u = Math.max(Math.random(), 1e-10)
	val rlen = (Math.log(u)/lnq).toInt
	dd(rlen)
	}

	// advance to first sample as part of object construction
	advance
	}


	class GapSamplingReplacementIterator[T: ClassTag](var data: Iterator[T],
	p: Double) extends Iterator[T] {
	// implement efficient drop until scala includes fix for jira SI-8835
	val dd: Int => Unit = {
	val arrayClass = Array.empty[T].iterator.getClass
	data.getClass match {
	case `arrayClass` => {
	((n: Int) => { data = data.drop(n) })
	}
	case _ => {
	((n: Int)=> {
	var j = 0
	while (j < n && data.hasNext) {
	data.next
	j += 1
	}
	})
	}
	}
	}

	val poisson = new PoissonDist(p)

	// current sampling value, and its replication factor, as we are sampling with replacement:
	var v: T = _
	var rep: Int = 0

	override def hasNext: Boolean = data.hasNext \|\| rep > 0

	override def next: T = {
	val r = v
	rep -= 1
	if (rep <= 0) advance
	r
	}

	def advance {
	// skip elements with replication factor zero (i.e. elements that won't be sampled)
	val u = Math.max(Math.random(), 1e-10)
	val rlen = (Math.log(u)/(-p)).toInt
	dd(rlen)
	// set the value and replication factor for the next value
	if (data.hasNext) {
	v = data.next
	rep = poisson.nextGE1
	}
	}

	// advance to first sample as part of object construction
	advance
	}


	def benchmark(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
	var data:Seq[Int] = (1 to m).toArray
	println(s"\np= $p n= $n m= $m\nArray")

	var start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	data.iterator.filter(_ => Math.random() < p).length
	}}
	println(s" Time using filter: ${System.currentTimeMillis() - start}")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	(new GapSamplingIterator(data.iterator, p)).length
	}}
	println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")


	data = (1 to m).toList
	println(s"List")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	data.iterator.filter(_ => Math.random() < p).length
	}}
	println(s" Time using filter: ${System.currentTimeMillis() - start}")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	(new GapSamplingIterator(data.iterator, p)).length
	}}
	println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
	}

	def benchmark_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5) {
	val poisson = new PoissonDist(p)

	var data:Seq[Int] = (1 to m).toArray
	println(s"\np= $p n= $n m= $m\nArray (replacement)")

	var start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	data.iterator.flatMap(element => {
	val count = poisson.next
	if (count == 0) Iterator.empty else Iterator.fill(count)(element)
	}).length
	}}
	println(s" Time using filter: ${System.currentTimeMillis() - start}")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	(new GapSamplingReplacementIterator(data.iterator, p)).length
	}}
	println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")


	data = (1 to m).toList
	println(s"List (replacement)")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	data.iterator.flatMap(element => {
	val count = poisson.next
	if (count == 0) Iterator.empty else Iterator.fill(count)(element)
	}).length
	}}
	println(s" Time using filter: ${System.currentTimeMillis() - start}")

	start = System.currentTimeMillis()
	(1 to n).foreach { _ => {
	(new GapSamplingReplacementIterator(data.iterator, p)).length
	}}
	println(s" Time using gap sampling: ${System.currentTimeMillis() - start}")
	}


	def test_sample(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss: Int = 5, RA:Boolean = true,
	testName: String="sample length",
	test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
	val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList

	val stat = (for (jj <- 1 to ss) yield {
	val t1 = (for (j <- 1 to n) yield {
	(new GapSamplingIterator(data.iterator, p))
	}).flatMap(test)
	val t2 = (for (j <- 1 to n) yield {
	data.iterator.filter(_ => Math.random() < p)
	}).flatMap(test)

	// detour through scipy to get KS test:
	scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
	scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
	Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
	}).sortWith(_ < _)(ss / 2)
	println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat")
	}

	def test_sample_replace(n: Int = 1000, m: Int = 1000, p: Double = 0.5, ss:Int = 5, RA:Boolean = true,
	testName: String="sample length (replacement)",
	test: (Iterator[Int] => Iterator[Int]) = (itr => List(itr.length).iterator)) {
	val poisson = new PoissonDist(p)
	val data:Seq[Int] = if (RA) (0 until m).toArray else (0 until m).toList

	val stat = (for (jj <- 1 to ss) yield {
	val t1 = (for (j <- 1 to n) yield {
	(new GapSamplingReplacementIterator(data.iterator, p))
	}).flatMap(test)
	val t2 = (for (j <- 1 to n) yield {
	data.iterator.flatMap(element => {
	val count = poisson.next
	if (count == 0) Iterator.empty else Iterator.fill(count)(element)
	})
	}).flatMap(test)

	// detour through scipy to get KS test:
	scala.tools.nsc.io.File("/tmp/test_sample_work_1.dat").writeAll(t1.mkString("\n"))
	scala.tools.nsc.io.File("/tmp/test_sample_work_2.dat").writeAll(t2.mkString("\n"))
	Seq("/home/eje/kstest", "/tmp/test_sample_work_1.dat", "/tmp/test_sample_work_2.dat", "-mr").!!.split(' ')(1).toDouble
	}).sortWith(_ < _)(ss / 2)
	println(s"\ntest= $testName\n p= $p n= $n m= $m RA= $RA\n KS pval= $stat")
	}

	// measure gap lengths between sampled values
	// (assumes values are in sorted order)
	def gaps(data: Iterator[Int]): Iterator[Int] = {
	data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2)
	}

	def run_benchmarks {
	benchmark(p= 0.001, n= 1000, m= 100000)
	benchmark(p= 0.01, n= 1000, m= 100000)
	benchmark(p= 0.1, n= 1000, m= 100000)
	benchmark(p= 0.5, n= 1000, m= 100000)
	benchmark(p= 0.9, n= 1000, m= 100000)

	benchmark_replace(p= 0.001, n= 1000, m= 100000)
	benchmark_replace(p= 0.01, n= 1000, m= 100000)
	benchmark_replace(p= 0.1, n= 1000, m= 100000)
	benchmark_replace(p= 0.5, n= 1000, m= 100000)
	benchmark_replace(p= 0.9, n= 1000, m= 100000)
	}

	def run_ks_tests {
	test_sample(p= 0.01, n= 10000, m= 10000)
	test_sample(p= 0.1, n= 10000, m= 10000)
	test_sample(p= 0.5, n= 10000, m= 10000)

	test_sample(p= 0.01, n= 10000, m= 10000, RA= false)
	test_sample(p= 0.1, n= 10000, m= 10000, RA= false)
	test_sample(p= 0.5, n= 10000, m= 10000, RA= false)

	test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps")
	test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps")
	test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps")

	test_sample(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
	test_sample(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)
	test_sample(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps", RA= false)

	test_sample_replace(p= 0.01, n= 10000, m= 10000)
	test_sample_replace(p= 0.1, n= 10000, m= 10000)
	test_sample_replace(p= 0.5, n= 10000, m= 10000)

	test_sample_replace(p= 0.01, n= 10000, m= 10000, RA= false)
	test_sample_replace(p= 0.1, n= 10000, m= 10000, RA= false)
	test_sample_replace(p= 0.5, n= 10000, m= 10000, RA= false)

	test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
	test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")
	test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)")

	test_sample_replace(p= 0.01, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
	test_sample_replace(p= 0.1, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
	test_sample_replace(p= 0.5, n= 10000, m= 1000, test=gaps, testName="sample gaps (replacement)", RA= false)
	}
	#!/bin/env python

	import sys
	import argparse
	from scipy import stats

	argparser = argparse.ArgumentParser()
	argparser.add_argument('data1', type=argparse.FileType('r'), metavar='<data-file-1>')
	argparser.add_argument('data2', type=argparse.FileType('r'), metavar='<data-file-2>')
	argparser.add_argument('-mr', default=False, action='store_true', help='machine readable output')

	args = argparser.parse_args()

	dv1 = args.data1.read().split()
	dv2 = args.data2.read().split()

	ksres = stats.ks_2samp(dv1, dv2)

	if args.mr:
	sys.stdout.write("%s %s\n" % ksres)
	else:
	sys.stdout.write("KS test results: stat=%s pval=%s\n" % ksres)
	scala> run_benchmarks

	p= 0.001 n= 1000 m= 100000
	Array
	Time using filter: 2833
	Time using gap sampling: 29
	List
	Time using filter: 2213
	Time using gap sampling: 230

	p= 0.01 n= 1000 m= 100000
	Array
	Time using filter: 2825
	Time using gap sampling: 76
	List
	Time using filter: 2220
	Time using gap sampling: 265

	p= 0.1 n= 1000 m= 100000
	Array
	Time using filter: 2985
	Time using gap sampling: 787
	List
	Time using filter: 2337
	Time using gap sampling: 796

	p= 0.5 n= 1000 m= 100000
	Array
	Time using filter: 3526
	Time using gap sampling: 3478
	List
	Time using filter: 2794
	Time using gap sampling: 3151

	p= 0.9 n= 1000 m= 100000
	Array
	Time using filter: 3023
	Time using gap sampling: 6081
	List
	Time using filter: 2513
	Time using gap sampling: 4849

	p= 0.001 n= 1000 m= 100000
	Array (replacement)
	Time using filter: 2604
	Time using gap sampling: 45
	List (replacement)
	Time using filter: 2431
	Time using gap sampling: 233

	p= 0.01 n= 1000 m= 100000
	Array (replacement)
	Time using filter: 3442
	Time using gap sampling: 117
	List (replacement)
	Time using filter: 2450
	Time using gap sampling: 299

	p= 0.1 n= 1000 m= 100000
	Array (replacement)
	Time using filter: 3653
	Time using gap sampling: 1044
	List (replacement)
	Time using filter: 2984
	Time using gap sampling: 1330

	p= 0.5 n= 1000 m= 100000
	Array (replacement)
	Time using filter: 5643
	Time using gap sampling: 5073
	List (replacement)
	Time using filter: 5331
	Time using gap sampling: 4752

	p= 0.9 n= 1000 m= 100000
	Array (replacement)
	Time using filter: 7668
	Time using gap sampling: 8388
	List (replacement)
	Time using filter: 6744
	Time using gap sampling: 7811
	scala> run_ks_tests

	test= sample length
	p= 0.01 n= 10000 m= 10000 RA= true
	KS pval= 0.98104407763

	test= sample length
	p= 0.1 n= 10000 m= 10000 RA= true
	KS pval= 0.912879254739

	test= sample length
	p= 0.5 n= 10000 m= 10000 RA= true
	KS pval= 0.732634164255

	test= sample length
	p= 0.01 n= 10000 m= 10000 RA= false
	KS pval= 0.986338712288

	test= sample length
	p= 0.1 n= 10000 m= 10000 RA= false
	KS pval= 0.778352793151

	test= sample length
	p= 0.5 n= 10000 m= 10000 RA= false
	KS pval= 0.242107687641

	test= sample gaps
	p= 0.01 n= 10000 m= 1000 RA= true
	KS pval= 0.398632159825

	test= sample gaps
	p= 0.1 n= 10000 m= 1000 RA= true
	KS pval= 0.918625732614

	test= sample gaps
	p= 0.5 n= 10000 m= 1000 RA= true
	KS pval= 0.937181377828

	test= sample gaps
	p= 0.01 n= 10000 m= 1000 RA= false
	KS pval= 0.512899405585

	test= sample gaps
	p= 0.1 n= 10000 m= 1000 RA= false
	KS pval= 0.667342469032

	test= sample gaps
	p= 0.5 n= 10000 m= 1000 RA= false
	KS pval= 0.945788971072

	test= sample length (replacement)
	p= 0.01 n= 10000 m= 10000 RA= true
	KS pval= 0.673545329328

	test= sample length (replacement)
	p= 0.1 n= 10000 m= 10000 RA= true
	KS pval= 0.364634485794

	test= sample length (replacement)
	p= 0.5 n= 10000 m= 10000 RA= true
	KS pval= 0.637715845192

	test= sample length (replacement)
	p= 0.01 n= 10000 m= 10000 RA= false
	KS pval= 0.34627068807

	test= sample length (replacement)
	p= 0.1 n= 10000 m= 10000 RA= false
	KS pval= 0.190771573151

	test= sample length (replacement)
	p= 0.5 n= 10000 m= 10000 RA= false
	KS pval= 0.487108719852

	test= sample gaps (replacement)
	p= 0.01 n= 10000 m= 1000 RA= true
	KS pval= 0.648436825568

	test= sample gaps (replacement)
	p= 0.1 n= 10000 m= 1000 RA= true
	KS pval= 0.637677942914

	test= sample gaps (replacement)
	p= 0.5 n= 10000 m= 1000 RA= true
	KS pval= 0.953266697608

	test= sample gaps (replacement)
	p= 0.01 n= 10000 m= 1000 RA= false
	KS pval= 0.765060442147

	test= sample gaps (replacement)
	p= 0.1 n= 10000 m= 1000 RA= false
	KS pval= 0.443334023868

	test= sample gaps (replacement)
	p= 0.5 n= 10000 m= 1000 RA= false
	KS pval= 0.848623231604