Skip to content

Instantly share code, notes, and snippets.

@damonfeldman
Created August 30, 2018 14:56
Show Gist options
  • Save damonfeldman/70dcef2083120e6d1761ce97411cb928 to your computer and use it in GitHub Desktop.
Save damonfeldman/70dcef2083120e6d1761ce97411cb928 to your computer and use it in GitHub Desktop.
MarkLogic random sampling functions
'use strict';
/** Random sampling of arrays or sequences.
Limitations: not 100% random. It tries not to always start at 0, or to take items at the exact end of the list, but some biases remain.
Intended to get random blocks of contiguous items. Blocks are used for two reasons:
1. it is sometimes nice to fetch only ranges, then get the data later, to avoid a huge URI or ID pull up front.
2. each chunk is one pass through a loop, including random generation and adjustments; fewer/larger chunks go faster than chunkSize=1.
*/
/* If the randomly chosen starts fall short of, or run past, the end of the list,
   shift every start proportionally (later starts shift more) so the final chunk
   lands inside [0, tot). A random slack keeps the last chunk from always ending
   on the exact last item. Skewing heavily toward the beginning is disallowed
   because there is no symmetric way to skew toward the end. */
function stretchSqueeze(rangeStarts, tot, chunkSize) {
  const count = rangeStarts.length
  const avgGap = tot / count
  const lastEnd = rangeStarts[count - 1] + chunkSize

  // Already ends inside the expected tail window: nothing to adjust.
  if (lastEnd >= tot - avgGap && lastEnd <= tot) {
    return rangeStarts
  }

  let shiftTotal
  if (lastEnd < tot - avgGap) {
    // Sample is skewed heavily toward the beginning of the list: stretch
    // forward, minus random slack so we end somewhere in the last interval.
    shiftTotal = (tot - lastEnd) - random(avgGap)
  } else {
    // Random intervals ended up past the end of the list: squeeze backward,
    // plus random slack, keeping starts in order without overlap.
    shiftTotal = -((lastEnd - tot) + random(avgGap))
  }
  const perStart = shiftTotal / count
  return rangeStarts.map((start, i) => start + (i + 1) * perStart)
}
/* Random decimal in (0, max], uniform to 5 decimal places.
   NOTE(review): assumes xdmp.random(n) yields an integer in [0, n]; the +1
   shift excludes 0 from the result — confirm against MarkLogic docs. */
function random(max) {
  const scaled = xdmp.random(max * 100000 - 1) + 1
  return scaled / 100000
}
/* Compute random range starting points in the interval [0..tot-1], anticipating
   ranges of size chunkSize.
   Returns decimals, not ints: rounding at every step throws off the randomness,
   so truncation to ints is deferred to the callers.
   Anticipating a chunkSize lets callers sample e.g. 1M random-ish values with
   only 10K loop iterations, each iteration producing a chunk of 100.
   @param tot       total number of items available
   @param samples   number of range starts to generate
   @param chunkSize intended size of each range
   @returns array of `samples` fractional start positions, adjusted by
            stretchSqueeze() to fit [0..tot-1]
   (fix: removed `lastStart`, which was assigned every iteration but never read) */
function computeRangeStarts(tot, samples, chunkSize) {
  const rangeStarts = []
  // the average distance between the start of one sample and the start of the next
  const avgIntervalSize = tot / samples
  let at = 0
  for (let i = 0; i < samples; i++) {
    // skip from 0 to double the average, to hit the average overall...
    let skip = random(avgIntervalSize * 2)
    if (skip <= chunkSize + 0.01) {
      skip = chunkSize + 0.01 // ...but always a full chunk (+ epsilon) to avoid overlap
    }
    at = at + skip
    rangeStarts.push(at)
  }
  return stretchSqueeze(rangeStarts, tot, chunkSize)
}
/** Larger of the two arguments (first wins only when strictly greater). */
function max(a, b) {
  if (a > b) return a;
  return b;
}
/** Smaller of the two arguments (first wins only when strictly smaller). */
function min(a, b) {
  if (a < b) return a;
  return b;
}
/* Produce `samples` random, non-overlapping integer ranges of size chunkSize
   over [0..tot-1]. Each element is {"start", "stop"} with stop INCLUSIVE. */
function randomRanges(tot, samples, chunkSize) {
  const fractionalStarts = computeRangeStarts(tot, samples, chunkSize)
  const ranges = []
  // Very dense samples can floor() to the same int twice; force each range to
  // begin at least one past the previous range's stop.
  let nextAllowed = 0
  for (const fracStart of fractionalStarts) {
    const startInt = max(nextAllowed, math.floor(fracStart))
    const stopInt = startInt + chunkSize - 1 // inclusive endpoint
    nextAllowed = stopInt + 1
    ranges.push({"start": startInt, "stop": stopInt})
  }
  return ranges
}
/* Randomly sample `samples` chunks of `chunkSize` contiguous URIs from the
   documents matching `query`, returned as one flat array.
   Throws a bare string (not an Error) when samples*chunkSize exceeds the
   estimated match count.
   NOTE(review): startInt/stopInt only feed the "limit" option, which always
   works out to exactly chunkSize; each chunk's position actually advances via
   lastUri, so the random starts do not influence WHERE a chunk begins — confirm
   this is intended.
   NOTE(review): assumes cts.uris(start, ...) returns URIs ordered >= start —
   verify against the lexicon configuration. */
function randomUris(query, samples, chunkSize) { // todo generalize to any lexicon
let tot = cts.estimate(query) // fast index-based count of matching documents
if (samples * chunkSize > tot) throw "sample size * num chunks is greater than total sequence size"
let rangeStarts = computeRangeStarts(tot, samples, chunkSize)
let minNextInt=0
let lastUri = "" // last URI taken so far; the next chunk resumes just past it
let rangeData = rangeStarts.reduce(
function(collectorArr, start) {
let idealStartInt = math.floor(start)
let startInt = max(minNextInt, idealStartInt)
let stopInt = startInt + chunkSize // exclusive endpoint; only the difference (stopInt-startInt) is used below, as a count
minNextInt++ // sometimes the randoms can produce the same number twice for very dense samples. Ensure we always move +1 at least
let uris = cts.uris(lastUri+" ", Sequence.from(["limit="+(stopInt-startInt)]), query).toArray() // concat a space to move past last item
lastUri=uris[uris.length-1]
return collectorArr.concat(uris) // tricky: stop is non-inclusive so just subtract
},
[])
return rangeData
}
/* Randomly sample `samples` chunks of `chunkSize` contiguous items from
   dataArr, returning one flat array of the sampled items in ascending order.
   @param dataArr   array to sample from
   @param samples   number of contiguous chunks to take
   @param chunkSize items per chunk
   @returns flat array of samples*chunkSize items
   @throws Error when samples*chunkSize exceeds dataArr.length */
function randomSampleChunks(dataArr, samples, chunkSize) {
  let tot = dataArr.length
  if (samples * chunkSize > tot) throw new Error("sample size * num chunks is greater than total data size")
  let rangeStarts = computeRangeStarts(dataArr.length, samples, chunkSize)
  let minNextInt = 0
  let rangeData = rangeStarts.reduce(
    function(collectorArr, start) {
      let idealStartInt = math.floor(start)
      let startInt = max(minNextInt, idealStartInt)
      let stopInt = startInt + chunkSize // exclusive endpoint; slice() below ignores it
      // Fix: advance past the whole chunk just taken, not merely +1 per iteration.
      // Dense samples can floor() to nearby ints, and the old `minNextInt++` (which
      // tracked the iteration count, not the chunk end) still allowed overlapping
      // chunks, i.e. duplicate items. randomRanges() already used this pattern.
      minNextInt = stopInt
      return collectorArr.concat(dataArr.slice(startInt, stopInt)) // slice end index is exclusive, matching stopInt
    },
    [])
  return rangeData
}
///////////// ==================
// Failure messages accumulated by test(); inspected at the end of the script.
const errors = []
/** Record msg in `errors` when cond is falsy; no-op when truthy. */
function test(cond, msg) {
  if (cond) return
  errors.push(msg)
}
// ---- smoke tests: sample from a known 0..tot-1 array so values equal indexes ----
let tot = 1000
let data = []
for (let i = 0; i < tot; i++) data.push(i);
// data = cts.uris().toArray()
let samples = 20
let chunkSize = 4
let res = randomSampleChunks(data, samples, chunkSize)
test(res.length===20*4, "expect 80 items. got "+res.length)
// fix: subtraction parenthesized — unparenthesized, "+" bound first and the message showed NaN
test(res[3]-res[0]===3, "expect first chunk of 4 to be contiguous. diff="+(res[3]-res[0]))
test(res[7]-res[4]===3, "expect second chunk of 4 to be contiguous.")
test(res[11]-res[8]===3, "expect third chunk of 4 to be contiguous")
test(res[15]-res[12]===3, "expect fourth/last chunk of 4 to be contiguous")
test(res[15] <= 999, "last item must be at or below max of 999")
let ranges = randomRanges(tot, samples, chunkSize)
test(ranges.length===samples, "expected 20 ranges")
test(ranges[0].stop - ranges[0].start ===3, "expect first range to be size. size="+ (ranges[0].stop - ranges[0].start))
test(ranges[19].stop <= 999, "last item must be at or below max of 999. was "+ranges[19].stop)
//--- test some very dense samples: 990 items of 1000 ---
samples = 990
chunkSize = 1
res = randomSampleChunks(data, samples, chunkSize)
test(res.length===990, "expect 990 chunks. got "+res.length)
test(res[989] <= 999, "last item must be at or below max of 999")
test(res[989] > 995, "last item extremely unlikely to be other than 998 or 999. was "+ res[989]) // about 4 or 5 gaps should fall randomly within the sequence, not all at the very end
ranges = randomRanges(tot, samples, chunkSize)
test(ranges.length===990, "ranges: expect 990 chunks. got "+ranges.length)
// known bug: samples that include pretty much all of the list can overrun slightly.
// Appears to be because we occasionally advance the next range to avoid overlap/duplicate
// of the last stop point; if this happens too often it pushes the last item past the end.
test(ranges[989].stop <= 999, "known issue w dense ranges: last item must be at or below max of 999. was:"+ranges[989].stop)
// fix: report ranges[989].stop — the value actually under test — not res[989]
test(ranges[989].stop > 997, "ranges: last item extremely unlikely to be other than 998 or 999. was "+ranges[989].stop) // due to density. if all the gaps are at the very end, likely bug
//--- uneven numbers ---
tot = 1234
samples = 77
chunkSize = 11
data = []
for (let i = 0; i < tot; i++) data.push(i);
res = randomSampleChunks(data, samples, chunkSize)
test(res.length===77*11, "expect 847 items. got "+res.length) // fix: 77*11 is 847, not 781
test(res[10]-res[0]===10, "uneven: expect first chunk of 11 to be contiguous")
test(res[21]-res[11]===10, "uneven: expect second chunk of 11 to be contiguous")
// fix: with 77 chunks of 11 the last item is res[846]; the old code checked res[780],
// which is not the last item, and the first message wrongly said "max of 999"
test(res[846] <= 1233, "uneven: last item must be at or below max of 1233")
test(res[846] < 1220, "uneven: last item unlikely to be too close to end was:"+res[846])
test(res[846] > 820, "uneven: last item unlikely to be very close to the theoretical minimum:"+res[846])
res = randomUris(cts.trueQuery(), samples, chunkSize)
test(res.length===77*11, "expect 847 items. got "+res.length)
let uniqueArray = res.filter(function(item, pos) { return res.indexOf(item) == pos; })
test(uniqueArray.length === res.length, "no duplicates")
// script's final value is the error report (MarkLogic returns the last expression)
let x= {"err":errors}
x
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment