-
-
Save xquery/6831758 to your computer and use it in GitHub Desktop.
xquery version "1.0-ml"; | |
let $data := (1,1,1,1,1,1,1,1,1,20) | |
let $choose := xdmp:random( sum($data) ) | |
let $pos := filter( function($a){if($choose le sum($data[1 to $a])) then true() else false()}, 1 to count($data) )[1] | |
return "weighted choose: " || $choose || " | data position: " || $pos || " value: " || $data[$pos] | |
On large $data sequences the expression sum($data[1 to $a]) in the filter function becomes a bottleneck.
This can be improved by recursively summing up the values and stopping as soon as $choose is reached. Here is the complete code.
(:
Weighted random sampling.
Returns a random integer whose probability is based on a list of weights.
@param $weights: the weights, a sequence of numbers
@return an integer between 1 and count($weights)
:)
declare function weighted-random($weights)
{
let $sum := max((floor(sum($weights)-1), 0))
let $choose := xdmp:random( xs:unsignedLong($sum) ) + 1
return sum-pos($weights, $choose, 0, 0)
};
(: Recursive function that returns the lowest $i such that sum($weights[1 to $i]) >= $choose :)
declare private function sum-pos($weights, $choose, $sum, $pos)
{
if ($choose <= $sum) then $pos
else sum-pos(tail($weights), $choose, $sum+head($weights), $pos+1)
};
It can be called for example as weighted-random((1,2,3,4)).
xdmp:random may return 0, which incorrectly increases the probability for the first position. In my opinion it is more correct to define $choose as follows, to get a value in the range [1,sum($data)]:
let $choose := xdmp:random( sum($data)-1) + 1