Skip to content

Instantly share code, notes, and snippets.

@joewiz
Created March 18, 2012 19:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save joewiz/2080400 to your computer and use it in GitHub Desktop.
Save joewiz/2080400 to your computer and use it in GitHub Desktop.
Wrangling plain text with XQuery
<group>
<line level="0">The President left at 8:48 am</line>
<group>
<group>
<line level="1">-Administration recommendations on Capitol Hill</line>
</group>
<group>
<line level="1">-Improvements</line>
</group>
<group>
<line level="1">-Richardson’s trip to New York</line>
</group>
<group>
<line level="1">-Health programs</line>
<group>
<group>
<line level="2">-Goals</line>
</group>
<group>
<line level="2">-Problems in present system</line>
</group>
<group>
<line level="2">-Approach</line>
</group>
<group>
<line level="2">-Emphasis on quality</line>
</group>
</group>
</group>
<group>
<line level="1">-Improvements in United States’ health care</line>
<group>
<group>
<line level="2">-Maternal deaths</line>
<group>
<group>
<line level="3">-Rate</line>
</group>
<group>
<line level="3">-Decline</line>
</group>
<group>
<line level="3">-United States’ rate compared to other nations</line>
<group>
<line level="4">-Reporting system</line>
</group>
</group>
</group>
</group>
<group>
<line level="2">-Data on health</line>
<group>
<line level="3">-Differences in reporting system</line>
</group>
</group>
<group>
<line level="2">-Low-income people</line>
<group>
<group>
<line level="3">-Whites</line>
</group>
<group>
<line level="3">-Non-whites</line>
</group>
</group>
</group>
<group>
<line level="2">-Mortality rates</line>
<group>
<line level="3">-Figures</line>
</group>
</group>
</group>
</group>
<group>
<line level="1">-Resource allocation</line>
<group>
<group>
<line level="2">-Rural areas</line>
<group>
<line level="3">-Availability of care</line>
</group>
</group>
<group>
<line level="2">-Catastrophic care costs</line>
</group>
<group>
<line level="2">-Prevention</line>
</group>
<group>
<line level="2">-Problems</line>
</group>
</group>
</group>
</group>
</group>
(:~
 : Splits a flat run of <line> elements into consecutive <group> elements.
 : A new group starts at every line whose @level equals the @level of the
 : first line in $lines; each group therefore holds one such line followed
 : by all the lines up to (but excluding) the next line at that level.
 :
 : The split point is located by position rather than with index-of():
 : index-of() compares atomized string values, so two lines with the same
 : text would yield several positions (a type error in the arithmetic that
 : followed) or point at the wrong line.
 :
 : @param $lines one or more <line level="..."> elements, in reading order
 : @return one or more <group> elements, each wrapping copies of its lines
 :)
declare function local:group-lines($lines as element(line)+) {
    let $level := $lines[1]/@level
    (: position of the next line (after the first) that sits at the same level :)
    let $next-start :=
        (
            for $i in 2 to count($lines)
            where $lines[$i]/@level eq $level
            return $i
        )[1]
    return
        if (empty($next-start)) then
            (: no sibling at this level: everything belongs to a single group :)
            <group>{$lines}</group>
        else
            (
                <group>{subsequence($lines, 1, $next-start - 1)}</group>
                ,
                (: the remainder starts with the sibling line; group it the same way :)
                local:group-lines(subsequence($lines, $next-start))
            )
};
(:~
 : Entry point for turning a nested <group> structure into a nested list:
 : wraps whatever local:inner-groups-to-list() produces for $group in a
 : single outer <list> element.
 :
 : @param $group the outermost <group> element
 : @return a <list> element
 :)
declare function local:groups-to-list($group as element(group)) {
    element list { local:inner-groups-to-list($group) }
};
(:~
 : Recursively converts a <group> into <item> elements.
 :
 : A group that has <line> children yields one <item> per line, holding the
 : line's text nodes and, when a <group> follows the line as a sibling, a
 : nested <list> built from that sibling. A group without <line> children is
 : only a wrapper: its child groups are converted in turn and their items
 : returned directly.
 :
 : @param $group the <group> element to convert
 : @return zero or more <item> elements
 :)
declare function local:inner-groups-to-list($group as element(group)) {
    let $own-lines := $group/line
    return
        if (empty($own-lines)) then
            (: wrapper group: flatten the results of its child groups :)
            for $child in $group/group
            return
                local:inner-groups-to-list($child)
        else
            for $entry in $own-lines
            let $nested := $entry/following-sibling::group
            return
                element item {
                    $entry/text(),
                    if (exists($nested)) then
                        element list { local:inner-groups-to-list($nested) }
                    else
                        ()
                }
};
The President left at 8:48 am
-Administration recommendations on Capitol Hill
-Improvements
-Richardson’s trip to New York
-Health programs
-Goals
-Problems in present system
-Approach
-Emphasis on quality
-Improvements in United States’ health care
-Maternal deaths
-Rate
-Decline
-United States’ rate compared to other nations
-Reporting system
-Data on health
-Differences in reporting system
-Low-income people
-Whites
-Non-whites
-Mortality rates
-Figures
-Resource allocation
-Rural areas
-Availability of care
-Catastrophic care costs
-Prevention
-Problems
(:~
 : Runs local:apply-levels() over every incoming group. When more than one
 : group is supplied the results are wrapped in a single enclosing <group>;
 : a lone group's result is returned as is.
 :
 : @param $groups one or more <group> elements
 : @return a single <group> element
 :)
declare function local:process-groups($groups as element(group)+) {
    let $leveled :=
        for $g in $groups
        return
            local:apply-levels($g)
    return
        if (count($groups) le 1) then
            $leveled
        else
            element group { $leveled }
};
(:~
 : Rebuilds a flat <group> of lines as a nested one. The first <line> stays
 : at the top of the new <group>; the remaining lines are split with
 : local:group-lines(). If more than one line remains, each resulting
 : sub-group is processed recursively and the results are wrapped in an
 : inner <group>; if exactly one line remains, the single group returned by
 : local:group-lines() is inserted directly.
 :
 : @param $group a <group> whose <line> children are to be nested
 : @return a <group> element
 :)
declare function local:apply-levels($group as element(group)) {
    let $head := $group/line[1]
    let $rest := subsequence($group/line, 2)
    return
        <group>{
            $head
            ,
            if (empty($rest)) then
                ()
            else if (count($rest) eq 1) then
                local:group-lines($rest)
            else
                <group>{
                    for $sub in local:group-lines($rest)
                    return
                        local:apply-levels($sub)
                }</group>
        }</group>
};
<list>
<item>The President left at 8:48 am
<list>
<item>Administration recommendations on Capitol Hill</item>
<item>Improvements</item>
<item>Richardson’s trip to New York</item>
<item>Health programs
<list>
<item>Goals</item>
<item>Problems in present system</item>
<item>Approach</item>
<item>Emphasis on quality</item>
</list>
</item>
<item>Improvements in United States’ health care
<list>
<item>Maternal deaths<list>
<item>Rate</item>
<item>Decline</item>
<item>United States’ rate compared to other nations
<list>
<item>Reporting system</item>
</list>
</item>
</list>
</item>
<item>Data on health
<list>
<item>Differences in reporting system</item>
</list>
</item>
<item>Low-income people
<list>
<item>Whites</item>
<item>Non-whites</item>
</list>
</item>
<item>Mortality rates
<list>
<item>Figures</item>
</list>
</item>
</list>
</item>
<item>Resource allocation
<list>
<item>Rural areas
<list>
<item>Availability of care</item>
</list>
</item>
<item>Catastrophic care costs</item>
<item>Prevention</item>
<item>Problems</item>
</list>
</item>
</list>
</item>
</list>
<line level="0">The President left at 8:48 am</line>
<line level="1">-Administration recommendations on Capitol Hill</line>
<line level="1">-Improvements</line>
<line level="1">-Richardson’s trip to New York</line>
<line level="1">-Health programs</line>
<line level="2">-Goals</line>
<line level="2">-Problems in present system</line>
<line level="2">-Approach</line>
<line level="2">-Emphasis on quality</line>
<line level="1">-Improvements in United States’ health care</line>
<line level="2">-Maternal deaths</line>
<line level="3">-Rate</line>
<line level="3">-Decline</line>
<line level="3">-United States’ rate compared to other nations</line>
<line level="4">-Reporting system</line>
<line level="2">-Data on health</line>
<line level="3">-Differences in reporting system</line>
<line level="2">-Low-income people</line>
<line level="3">-Whites</line>
<line level="3">-Non-whites</line>
<line level="2">-Mortality rates</line>
<line level="3">-Figures</line>
<line level="1">-Resource allocation</line>
<line level="2">-Rural areas</line>
<line level="3">-Availability of care</line>
<line level="2">-Catastrophic care costs</line>
<line level="2">-Prevention</line>
<line level="2">-Problems</line>
(:~
 : Converts indented plain text into a sequence of <line> elements, one per
 : non-blank input line. @level is the number of leading whitespace
 : characters; the element content is the line with that leading whitespace
 : removed (trailing whitespace is kept).
 :
 : Lines are split on LF or CRLF so that Windows line endings do not leave a
 : carriage return in the line. Empty and whitespace-only lines (including
 : the empty token after a trailing newline) are skipped, since they carry
 : no content and would otherwise become spurious <line> elements.
 :
 : @param $text the raw text, one outline entry per line
 : @return zero or more <line level="N"> elements in input order
 :)
declare function local:text-to-lines($text as xs:string) {
    for $line in tokenize($text, '\r?\n')
    (: '^\s+' rather than '^\s*': replace() rejects patterns that match the empty string :)
    let $content := replace($line, '^\s+', '')
    let $level := string-length($line) - string-length($content)
    where $content ne ''
    return
        <line level="{$level}">{$content}</line>
};
(: Pipeline: raw text -> <line> elements -> flat groups -> nested groups -> nested <list>. :)
let $parsed-lines := local:text-to-lines($text)
let $nested-group := local:process-groups(local:group-lines($parsed-lines))
return
    local:groups-to-list($nested-group)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment