Skip to content

Instantly share code, notes, and snippets.

@damonfeldman
Created August 30, 2018 14:56
Show Gist options
  • Save damonfeldman/70dcef2083120e6d1761ce97411cb928 to your computer and use it in GitHub Desktop.
Save damonfeldman/70dcef2083120e6d1761ce97411cb928 to your computer and use it in GitHub Desktop.
MarkLogic random sampling functions
'use strict';
/** Random sampling of arrays or sequences.
Limitations: not 100% random. It tries not to always start at 0, or to take items at the exact end of the list, but some biases remain.
Intended to get random blocks of contiguous items. Blocks are used for two reasons:
1. it is sometimes nice to fetch only ranges, then get the data later, to avoid a huge URI or ID pull up front.
2. each chunk is one pass through a loop, including random generation and adjustments; fewer/larger chunks go faster than chunkSize=1.
*/
/* If the randomly chosen starts fall short of, or run past, the end of the list,
   shift every start proportionally (later starts shift more) so the final chunk
   lands inside [0, tot). A random slack keeps the last chunk from always ending
   on the exact last item. Skewing heavily toward the beginning is disallowed
   because there is no symmetric way to skew toward the end. */
function stretchSqueeze(rangeStarts, tot, chunkSize) {
  const count = rangeStarts.length
  const avgGap = tot / count
  const lastEnd = rangeStarts[count - 1] + chunkSize

  // Already ends inside the expected tail window: nothing to adjust.
  if (lastEnd >= tot - avgGap && lastEnd <= tot) {
    return rangeStarts
  }

  let shiftTotal
  if (lastEnd < tot - avgGap) {
    // Sample is skewed heavily toward the beginning of the list: stretch
    // forward, minus random slack so we end somewhere in the last interval.
    shiftTotal = (tot - lastEnd) - random(avgGap)
  } else {
    // Random intervals ended up past the end of the list: squeeze backward,
    // plus random slack, keeping starts in order without overlap.
    shiftTotal = -((lastEnd - tot) + random(avgGap))
  }
  const perStart = shiftTotal / count
  return rangeStarts.map((start, i) => start + (i + 1) * perStart)
}
/* Random decimal in (0, max], uniform to 5 decimal places.
   NOTE(review): assumes xdmp.random(n) yields an integer in [0, n]; the +1
   shift excludes 0 from the result — confirm against MarkLogic docs. */
function random(max) {
  const scaled = xdmp.random(max * 100000 - 1) + 1
  return scaled / 100000
}
/* Compute random range starting points in the interval [0..tot-1], anticipating
   ranges of size chunkSize.
   Returns decimals, not ints: rounding at every step throws off the randomness,
   so truncation to ints is deferred to the callers.
   Anticipating a chunkSize lets callers sample e.g. 1M random-ish values with
   only 10K loop iterations, each iteration producing a chunk of 100.
   @param tot       total number of items available
   @param samples   number of range starts to generate
   @param chunkSize intended size of each range
   @returns array of `samples` fractional start positions, adjusted by
            stretchSqueeze() to fit [0..tot-1]
   (fix: removed `lastStart`, which was assigned every iteration but never read) */
function computeRangeStarts(tot, samples, chunkSize) {
  const rangeStarts = []
  // the average distance between the start of one sample and the start of the next
  const avgIntervalSize = tot / samples
  let at = 0
  for (let i = 0; i < samples; i++) {
    // skip from 0 to double the average, to hit the average overall...
    let skip = random(avgIntervalSize * 2)
    if (skip <= chunkSize + 0.01) {
      skip = chunkSize + 0.01 // ...but always a full chunk (+ epsilon) to avoid overlap
    }
    at = at + skip
    rangeStarts.push(at)
  }
  return stretchSqueeze(rangeStarts, tot, chunkSize)
}
/** Larger of the two arguments (first wins only when strictly greater). */
function max(a, b) {
  if (a > b) return a;
  return b;
}
/** Smaller of the two arguments (first wins only when strictly smaller). */
function min(a, b) {
  if (a < b) return a;
  return b;
}
/* Produce `samples` random, non-overlapping integer ranges of size chunkSize
   over [0..tot-1]. Each element is {"start", "stop"} with stop INCLUSIVE. */
function randomRanges(tot, samples, chunkSize) {
  const fractionalStarts = computeRangeStarts(tot, samples, chunkSize)
  const ranges = []
  // Very dense samples can floor() to the same int twice; force each range to
  // begin at least one past the previous range's stop.
  let nextAllowed = 0
  for (const fracStart of fractionalStarts) {
    const startInt = max(nextAllowed, math.floor(fracStart))
    const stopInt = startInt + chunkSize - 1 // inclusive endpoint
    nextAllowed = stopInt + 1
    ranges.push({"start": startInt, "stop": stopInt})
  }
  return ranges
}
/* Randomly sample `samples` chunks of `chunkSize` contiguous URIs from the
   documents matching `query`, returned as one flat array.
   Throws a bare string (not an Error) when samples*chunkSize exceeds the
   estimated match count.
   NOTE(review): startInt/stopInt only feed the "limit" option, which always
   works out to exactly chunkSize; each chunk's position actually advances via
   lastUri, so the random starts do not influence WHERE a chunk begins — confirm
   this is intended.
   NOTE(review): assumes cts.uris(start, ...) returns URIs ordered >= start —
   verify against the lexicon configuration. */
function randomUris(query, samples, chunkSize) { // todo generalize to any lexicon
let tot = cts.estimate(query) // fast index-based count of matching documents
if (samples * chunkSize > tot) throw "sample size * num chunks is greater than total sequence size"
let rangeStarts = computeRangeStarts(tot, samples, chunkSize)
let minNextInt=0
let lastUri = "" // last URI taken so far; the next chunk resumes just past it
let rangeData = rangeStarts.reduce(
function(collectorArr, start) {
let idealStartInt = math.floor(start)
let startInt = max(minNextInt, idealStartInt)
let stopInt = startInt + chunkSize // exclusive endpoint; only the difference (stopInt-startInt) is used below, as a count
minNextInt++ // sometimes the randoms can produce the same number twice for very dense samples. Ensure we always move +1 at least
let uris = cts.uris(lastUri+" ", Sequence.from(["limit="+(stopInt-startInt)]), query).toArray() // concat a space to move past last item
lastUri=uris[uris.length-1]
return collectorArr.concat(uris) // tricky: stop is non-inclusive so just subtract
},
[])
return rangeData
}
/* Randomly sample `samples` chunks of `chunkSize` contiguous items from
   dataArr, returning one flat array of the sampled items in ascending order.
   @param dataArr   array to sample from
   @param samples   number of contiguous chunks to take
   @param chunkSize items per chunk
   @returns flat array of samples*chunkSize items
   @throws Error when samples*chunkSize exceeds dataArr.length */
function randomSampleChunks(dataArr, samples, chunkSize) {
  let tot = dataArr.length
  if (samples * chunkSize > tot) throw new Error("sample size * num chunks is greater than total data size")
  let rangeStarts = computeRangeStarts(dataArr.length, samples, chunkSize)
  let minNextInt = 0
  let rangeData = rangeStarts.reduce(
    function(collectorArr, start) {
      let idealStartInt = math.floor(start)
      let startInt = max(minNextInt, idealStartInt)
      let stopInt = startInt + chunkSize // exclusive endpoint; slice() below ignores it
      // Fix: advance past the whole chunk just taken, not merely +1 per iteration.
      // Dense samples can floor() to nearby ints, and the old `minNextInt++` (which
      // tracked the iteration count, not the chunk end) still allowed overlapping
      // chunks, i.e. duplicate items. randomRanges() already used this pattern.
      minNextInt = stopInt
      return collectorArr.concat(dataArr.slice(startInt, stopInt)) // slice end index is exclusive, matching stopInt
    },
    [])
  return rangeData
}
///////////// ==================
// Failure messages accumulated by test(); inspected at the end of the script.
const errors = []
/** Record msg in `errors` when cond is falsy; no-op when truthy. */
function test(cond, msg) {
  if (cond) return
  errors.push(msg)
}
// ---- smoke tests: sample from a known 0..tot-1 array so values equal indexes ----
let tot = 1000
let data = []
for (let i = 0; i < tot; i++) data.push(i);
// data = cts.uris().toArray()
let samples = 20
let chunkSize = 4
let res = randomSampleChunks(data, samples, chunkSize)
test(res.length===20*4, "expect 80 items. got "+res.length)
// fix: subtraction parenthesized — unparenthesized, "+" bound first and the message showed NaN
test(res[3]-res[0]===3, "expect first chunk of 4 to be contiguous. diff="+(res[3]-res[0]))
test(res[7]-res[4]===3, "expect second chunk of 4 to be contiguous.")
test(res[11]-res[8]===3, "expect third chunk of 4 to be contiguous")
test(res[15]-res[12]===3, "expect fourth/last chunk of 4 to be contiguous")
test(res[15] <= 999, "last item must be at or below max of 999")
let ranges = randomRanges(tot, samples, chunkSize)
test(ranges.length===samples, "expected 20 ranges")
test(ranges[0].stop - ranges[0].start ===3, "expect first range to be size. size="+ (ranges[0].stop - ranges[0].start))
test(ranges[19].stop <= 999, "last item must be at or below max of 999. was "+ranges[19].stop)
//--- test some very dense samples: 990 items of 1000 ---
samples = 990
chunkSize = 1
res = randomSampleChunks(data, samples, chunkSize)
test(res.length===990, "expect 990 chunks. got "+res.length)
test(res[989] <= 999, "last item must be at or below max of 999")
test(res[989] > 995, "last item extremely unlikely to be other than 998 or 999. was "+ res[989]) // about 4 or 5 gaps should fall randomly within the sequence, not all at the very end
ranges = randomRanges(tot, samples, chunkSize)
test(ranges.length===990, "ranges: expect 990 chunks. got "+ranges.length)
// known bug: samples that include pretty much all of the list can overrun slightly.
// Appears to be because we occasionally advance the next range to avoid overlap/duplicate
// of the last stop point; if this happens too often it pushes the last item past the end.
test(ranges[989].stop <= 999, "known issue w dense ranges: last item must be at or below max of 999. was:"+ranges[989].stop)
// fix: report ranges[989].stop — the value actually under test — not res[989]
test(ranges[989].stop > 997, "ranges: last item extremely unlikely to be other than 998 or 999. was "+ranges[989].stop) // due to density. if all the gaps are at the very end, likely bug
//--- uneven numbers ---
tot = 1234
samples = 77
chunkSize = 11
data = []
for (let i = 0; i < tot; i++) data.push(i);
res = randomSampleChunks(data, samples, chunkSize)
test(res.length===77*11, "expect 847 items. got "+res.length) // fix: 77*11 is 847, not 781
test(res[10]-res[0]===10, "uneven: expect first chunk of 11 to be contiguous")
test(res[21]-res[11]===10, "uneven: expect second chunk of 11 to be contiguous")
// fix: with 77 chunks of 11 the last item is res[846]; the old code checked res[780],
// which is not the last item, and the first message wrongly said "max of 999"
test(res[846] <= 1233, "uneven: last item must be at or below max of 1233")
test(res[846] < 1220, "uneven: last item unlikely to be too close to end was:"+res[846])
test(res[846] > 820, "uneven: last item unlikely to be very close to the theoretical minimum:"+res[846])
res = randomUris(cts.trueQuery(), samples, chunkSize)
test(res.length===77*11, "expect 847 items. got "+res.length)
let uniqueArray = res.filter(function(item, pos) { return res.indexOf(item) == pos; })
test(uniqueArray.length === res.length, "no duplicates")
// script's final value is the error report (MarkLogic returns the last expression)
let x= {"err":errors}
x
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment