Skip to content

Instantly share code, notes, and snippets.

@masyukun
Last active May 29, 2019 15:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masyukun/5d39245ca3202b5e5e2e5ee91b2a540e to your computer and use it in GitHub Desktop.
Save masyukun/5d39245ca3202b5e5e2e5ee91b2a540e to your computer and use it in GitHub Desktop.
xquery version "1.0-ml";
declare namespace p = "http://www.mycompany.com";
declare option xdmp:mapping "false";
(:~
Returns the z-score for the specified confidence value.

Common confidence values (0.5, 0.8, 0.9, 0.95, 0.98, 0.99) are answered from a
lookup table. Any other value in range is answered by solving the cubic
0.0481*z^3 + 0.4007*z^2 + 1.1354*z + (0.098 + $confidence) = 0
that the author generated from a polynomial regression of confidence
percentages to z-scores (R^2 = 1 fit). The real root of that cubic is
negative, so its absolute value is returned.

@author Matthew Royal
@param $confidence confidence level as a fraction; must satisfy 0.5 <= $confidence <= 1.0
@return the z-score (xs:decimal from the table, xs:double from the regression)
@error INVALID-CONFIDENCE when $confidence is outside [0.5, 1.0] (or is NaN)
@see https://matthewroyal.com/blog/wp-content/uploads/2015/05/Screen-Shot-2015-05-24-at-10.57.54-AM.png
:)
declare function local:getZScore($confidence as xs:double) {
  if ($confidence ge 0.5 and $confidence le 1) then
    if ($confidence eq 0.5) then 0.674
    else if ($confidence eq 0.8) then 1.282
    else if ($confidence eq 0.9) then 1.645
    else if ($confidence eq 0.95) then 1.96
    else if ($confidence eq 0.98) then 2.326
    else if ($confidence eq 0.99) then 2.576
    else
      math:fabs(local:solveCubicEquation(0.0481, 0.4007, 1.1354, 0.098 + $confidence))
  else
    (: fn:error takes an xs:QName? error code first and the description second;
       the value is converted with xs:string because fn:string-join only accepts strings. :)
    fn:error(
      xs:QName("INVALID-CONFIDENCE"),
      "ERROR: 0.5 <= $confidence <= 1.0, and input was " || xs:string($confidence)
    )
};
(:~
Solve a cubic polynomial for one real root x, given coefficients a, b, c, d
in standard form a*x^3 + b*x^2 + c*x + d = 0, using Cardano's formula:

  q = -b^3/(27a^3) + bc/(6a^2) - d/(2a)
  s = c/(3a) - b^2/(9a^2)
  x = cbrt(q + sqrt(q^2 + s^3)) + cbrt(q - sqrt(q^2 + s^3)) - b/(3a)

The second cube root uses the homegrown cube root function sqrt3() because
XQuery's math library cannot raise a negative number to a fractional power.
The first cube root is taken with math:pow on the absolute value and the sign
is re-applied, so a negative radicand no longer produces NaN.

When q^2 + s^3 is negative (three distinct real roots) math:sqrt yields NaN and
the result is NaN; complex intermediate values are not supported.
$a must be non-zero, otherwise the divisions by $a do not produce a usable value.

@author Matthew Royal
@param $a 1st coefficient (of x^3)
@param $b 2nd coefficient (of x^2)
@param $c 3rd coefficient (of x)
@param $d 4th coefficient (constant term)
@return one real root of the cubic
@see http://www.math.vanderbilt.edu/~schectex/courses/cubic/
:)
declare function local:solveCubicEquation($a as xs:double, $b as xs:double, $c as xs:double, $d as xs:double) {
  let $big := ((-math:pow($b,3.0)) div (27.0 * math:pow($a,3.0))) + (($b * $c) div (6.0 * math:pow($a,2.0))) - ($d div (2.0 * $a))
  let $small := ($c div (3.0 * $a)) - (math:pow($b,2.0) div (9.0 * math:pow($a,2.0)))
  let $tail := ($b div (3.0 * $a))
  (: Square root of the discriminant, computed once and shared by both cube-root terms. :)
  let $root := math:sqrt(math:pow($big,2.0) + math:pow($small,3.0))
  let $plus := $big + $root
  (: Sign-aware real cube root: math:pow(negative, 1/3) would be NaN. :)
  let $firstTerm :=
    if ($plus lt 0) then -math:pow(-$plus, (1.0 div 3.0))
    else math:pow($plus, (1.0 div 3.0))
  let $x := $firstTerm + local:sqrt3($big - $root) - $tail
  return $x
};
(:~
Estimate the cube root of a number (despite the name "sqrt3", this is a cube
root, not a square root).

The number's decimal string form is split into groups of three digits, and the
root is built one decimal digit per group: the first digit is the largest
single digit whose cube fits in the first group, and each following digit is
the largest digit j for which j * (3*n^2 + 3*n*j + j^2) fits in the running
remainder, where n is ten times the root found so far.
Negative input is handled by taking the root of the absolute value and
negating the result.

WARNINGS: 1) Does not work well with many digits due to xs:double size limitation.
2) Does not generate complex numbers.
3) Any exception raised while extracting digits after the first is silently
swallowed by the try/catch below.
NOTE(review): the parsing relies on the string form of $number being plain
decimal digits with an optional "." - presumably inputs whose string form uses
exponent notation are not handled; confirm against callers.

@author Matthew Royal
@param $number the value whose cube root is wanted
@return the estimated cube root as a number, carrying the sign of $number
@see http://www4.wittenberg.edu/academics/mathcomp/bjsdir/CubeRootTalk.pdf
:)
declare function local:sqrt3($number as xs:double) {
(: Accumulates the digits of the result as one-character strings; mutated via xdmp:set. :)
let $answer := ()
(: Remember the sign, then work on the absolute value. :)
let $negative := $number lt 0
let $number := if ($negative) then $number * -1 else $number
(: Count of digit characters in the string form; used further down to decide how many extra "000" groups to append. :)
let $numLength := fn:string-length(fn:replace(fn:string($number), "[^0-9]", ""))
let $hasDecimal := fn:matches(xs:string($number), "\.")
(: Integer-part digits. When the string has no ".", the pattern does not match and the whole string is kept. :)
let $beforeDecimal := fn:replace(xs:string($number), "^([0-9]+?)\.[0-9]*$", "$1")
(: Fractional-part digits, or the empty sequence when there is no ".". :)
let $afterDecimal := if ($hasDecimal) then fn:replace(xs:string($number), "^[0-9]+?\.([0-9]*)$", "$1") else ()
(: Group the integer part into threes counting from its right-hand end, then reverse so the most significant group comes first.
Each group is cast to xs:double, so leading zeros inside a group are not kept. :)
let $beforeGroups := fn:reverse(
for $i in (1 to xs:integer(math:ceil(fn:string-length($beforeDecimal) div 3)))
return xs:double(fn:replace($beforeDecimal, "^.*?([0-9]{1,3})[0-9]{"||($i - 1) * 3||"}$", "$1"))
)
(: An integer part of exactly "0" contributes no group. :)
let $beforeGroups :=
if (fn:count($beforeGroups) eq 1 and $beforeDecimal eq "0") then () else $beforeGroups
(: Group the fractional part into threes counting from its left-hand end; also cast to xs:double. :)
let $afterGroups :=
for $i in (1 to xs:integer(math:ceil(fn:string-length($afterDecimal) div 3)))
return xs:double(fn:replace($afterDecimal, "^[0-9]{"||($i - 1) * 3||"}([0-9]{1,3}).*?$", "$1"))
let $numPieces := ($beforeGroups, $afterGroups)
(: Right-pad the last piece with zeros when its string form has only one or two characters.
NOTE(review): this is applied to whichever piece is last, including an integer-part group when there is no
fractional part, and to fractional groups whose leading zeros were dropped by the cast - confirm this is intended. :)
let $newLast :=
let $num := xs:string($numPieces[fn:last()])
return
if (fn:string-length($num) eq 1) then xs:double($num || "00")
else if (fn:string-length($num) eq 2) then xs:double($num || "0")
else ()
(: The effective boolean value of $newLast is tested here, so an empty or zero $newLast leaves the pieces unchanged. :)
let $numPieces := if ($newLast) then ($numPieces[1 to (fn:last() - 1)], $newLast) else $numPieces
(: Append one "000" string per unit by which the digit count exceeds the number of pieces;
each extra piece yields one more digit of the result. These are strings, unlike the numeric pieces above. :)
let $numPieces := (
$numPieces
,
let $precisionDeficit := ($numLength - fn:count($numPieces))
return
if ($precisionDeficit gt 0) then
for $i in (1 to $precisionDeficit)
return "000"
else ()
)
(: Find the largest cube of a single digit number less than or equal to the first group.
The search starts from 1, so the first digit is never 0. :)
let $largestCube := 1
let $_ :=
for $j in (2 to 9)
return
if (math:pow($j, 3) le $numPieces[1]) then
xdmp:set($largestCube, $j)
else ()
let $_ := xdmp:set($answer, ($answer, xs:string($largestCube)))
(: Remainder carried into the next group. :)
let $diff := $numPieces[1] - math:pow($largestCube, 3)
(: Scratch variables mutated with xdmp:set inside the loop below. The outer $numAnswer is shadowed by the loop's own binding. :)
let $diffNext := ()
let $numAnswer := ()
let $combo := ()
(: For each remaining group, find the largest single digit whose contribution fits in the running remainder. :)
let $_ :=
try {
for $i in (2 to fn:count($numPieces) )
(: Root found so far, shifted one decimal place to make room for the next digit. :)
let $numAnswer := xs:double(fn:string-join($answer)) * 10
(: "Bring down" the next group by string-concatenating it onto the remainder and re-parsing as a number.
NOTE(review): the remainder and the group are joined in their string forms without zero padding - confirm. :)
let $_ := xdmp:set($diffNext, xs:double(xs:string($diff) || $numPieces[$i]))
(: The search starts from 1, so a digit of 0 is never produced by this loop. :)
let $largestFactor := 1
let $_ :=
for $j in (1 to 9)
(: j * (3n^2 + 3nj + j^2), i.e. (n + j)^3 - n^3. :)
let $_ := xdmp:set($combo, $j * ((3 * math:pow($numAnswer,2)) + ((3 * $numAnswer) * $j) + math:pow($j,2)))
return
if ($combo le $diffNext) then
xdmp:set($largestFactor, $j)
else ()
let $_ := xdmp:set($answer, ($answer, xs:string($largestFactor)))
(: Subtract the chosen digit's contribution to get the remainder for the next group. :)
return xdmp:set($diff, ($diffNext - ($largestFactor * ((3 * math:pow($numAnswer,2)) + ((3 * $numAnswer) * $largestFactor) + math:pow($largestFactor,2)))) )
} catch ($exception) {}
(: Join the digits into a numeric string, placing the decimal point just before the digit whose index is one more than
the number of integer-part groups, then parse it and re-apply the sign.
NOTE(review): the "0." branch appears unreachable, because $beforeGroups was already rebound to the empty sequence
whenever $beforeDecimal is "0" and there was a single group. :)
return
xs:double(fn:string-join(
for $num at $i in $answer
return (
if ($i - 1 eq fn:count($beforeGroups)) then
if (fn:count($beforeGroups) eq 1 and $beforeDecimal eq "0") then "0."
else "."
else ()
,
$num
)
)) * (if ($negative) then -1 else 1)
};
(:~
Compute the sample size needed for a population at a given confidence level
and confidence interval (margin of error).

The unbounded size is z^2 * p * (1 - p) / interval^2 with p fixed at 0.5, and is
then reduced by the finite-population correction ss / (1 + (ss - 1) / population).

@author Matthew Royal
@param $population size of the whole population
@param $confidence confidence level as a fraction, passed to local:getZScore
@param $confidenceInterval margin of error as a fraction (for example 0.05)
@return the (unrounded) sample size
@see http://www.surveysystem.com/sample-size-formula.htm
:)
declare function local:getSampleSize($population as xs:integer, $confidence as xs:double, $confidenceInterval as xs:double) {
  let $zScore := local:getZScore($confidence)
  (: 0.5 is the default proportion used when computing a sample size. :)
  let $proportion := 0.5
  let $unbounded :=
    (math:pow($zScore, 2) * $proportion * (1 - $proportion)) div math:pow($confidenceInterval, 2)
  (: Finite-population correction. :)
  return $unbounded div (1 + (($unbounded - 1) div $population))
};
(:~
Estimate how many documents matching a query satisfy a predicate, by testing
only a random sample of them.

The population size comes from xdmp:estimate on the query, and the sample size
from local:getSampleSize. When $function is supplied, a random sample
("score-random") of that many documents is passed to it in one call, and it
is expected to return one boolean per document, in order.

The reported percentage is computed over the number of documents actually
examined, and is reported as 0 when no documents were examined, so an empty
result set no longer produces NaN.

@param $query the query that defines the population
@param $confidence confidence level as a fraction (for example 0.95)
@param $confidenceInterval margin of error as a fraction (for example 0.05)
@param $function optional predicate applied to the sampled documents via xdmp:apply
@param $numDocuments optional cap on the number of matching URIs listed;
       defaults to the rounded-up sample size when absent or not positive
@return a sequence of report strings; with $function, also the URIs of the
        sampled documents for which it returned true
:)
declare function local:computeSample($query as cts:query, $confidence as xs:double, $confidenceInterval as xs:double, $function as xdmp:function?, $numDocuments as xs:integer?) {
  let $queryEstimate := xdmp:estimate(cts:search(/, $query))
  let $sampleSize := local:getSampleSize($queryEstimate, $confidence, $confidenceInterval)
  (: Report lines shared by both branches. :)
  let $summary := (
    "Query Estimate: " || $queryEstimate,
    "Sample size: " || $sampleSize
      || " " || $confidence * 100
      || "% +/-" || $confidenceInterval * 100 || "% confidence"
  )
  return
    if (fn:empty($function)) then $summary
    else
      let $sampleCount := xs:integer(math:ceil($sampleSize))
      let $documents := cts:search(/, $query, ("score-random"))[1 to $sampleCount]
      let $bools := xdmp:apply($function, $documents)
      (: The search may yield fewer documents than requested, so the actual count is the denominator. :)
      let $examined := fn:count($documents)
      let $results := fn:count(
        for $bool in $bools where $bool return $bool
      )
      let $rate :=
        if ($examined gt 0) then $results div xs:double($examined)
        else xs:double(0)
      let $listLimit :=
        if (fn:exists($numDocuments) and $numDocuments gt 0) then $numDocuments
        else $sampleCount
      let $affectedDocs :=
        for $doc at $pos in $documents
        where $bools[$pos] eq fn:true()
        return fn:base-uri($doc)
      return (
        $summary
        ,
        (: The scaled estimate is cast to xs:integer so large values are not printed in exponent notation. :)
        "XQuery function returned TRUE in " || ($rate * 100.0)
          || "% of the cases (" || $results || " of " || $examined || "), "
          || "scaling to an estimated ~" || xs:integer(math:ceil($rate * $queryEstimate))
          || " documents for the entire population " || $queryEstimate || "."
        ,
        "Matching URIs (Up to "||$listLimit||"):",
        $affectedDocs[1 to $listLimit]
      )
};
(: Example run: among current, non-deleted company-directory documents with hire year 2015
   that contain an employeeName element, estimate how many have an employee name of
   60 or more characters, at 95% confidence +/- 5%, listing up to 10 matching URIs. :)
let $directoryQuery :=
  cts:element-query(
    xs:QName("p:companyDirectory"),
    cts:and-query((
      (: An empty and-query matches everything, so this only requires the element to be present. :)
      cts:element-query(xs:QName("p:employeeName"), cts:and-query(())),
      cts:element-value-query(xs:QName("p:deleted"), "false"),
      cts:element-value-query(xs:QName("p:isCurrentVersion"), "true"),
      cts:element-value-query(xs:QName("p:hireYear"), "2015")
    ))
  )
(: Returns one boolean per document: true when any employee name is at least 60 characters long. :)
let $hasLongEmployeeName := function ($documents as item()*) as xs:boolean* {
  for $doc in $documents
  return (
    some $employeeName in $doc//p:employeeName/fn:string()
    satisfies fn:string-length($employeeName) ge 60
  )
}
return local:computeSample($directoryQuery, 0.95, 0.05, $hasLongEmployeeName, 10)
@masyukun
Copy link
Author

Updated default population from "5" to the rounded up sample size.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment