Created
March 18, 2012 19:50
-
-
Save joewiz/2080400 to your computer and use it in GitHub Desktop.
Wrangling plain text with XQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<group> | |
<line level="0">The President left at 8:48 am</line> | |
<group> | |
<group> | |
<line level="1">-Administration recommendations on Capitol Hill</line> | |
</group> | |
<group> | |
<line level="1">-Improvements</line> | |
</group> | |
<group> | |
<line level="1">-Richardson’s trip to New York</line> | |
</group> | |
<group> | |
<line level="1">-Health programs</line> | |
<group> | |
<group> | |
<line level="2">-Goals</line> | |
</group> | |
<group> | |
<line level="2">-Problems in present system</line> | |
</group> | |
<group> | |
<line level="2">-Approach</line> | |
</group> | |
<group> | |
<line level="2">-Emphasis on quality</line> | |
</group> | |
</group> | |
</group> | |
<group> | |
<line level="1">-Improvements in United States’ health care</line> | |
<group> | |
<group> | |
<line level="2">-Maternal deaths</line> | |
<group> | |
<group> | |
<line level="3">-Rate</line> | |
</group> | |
<group> | |
<line level="3">-Decline</line> | |
</group> | |
<group> | |
<line level="3">-United States’ rate compared to other nations</line> | |
<group> | |
<line level="4">-Reporting system</line> | |
</group> | |
</group> | |
</group> | |
</group> | |
<group> | |
<line level="2">-Data on health</line> | |
<group> | |
<line level="3">-Differences in reporting system</line> | |
</group> | |
</group> | |
<group> | |
<line level="2">-Low-income people</line> | |
<group> | |
<group> | |
<line level="3">-Whites</line> | |
</group> | |
<group> | |
<line level="3">-Non-whites</line> | |
</group> | |
</group> | |
</group> | |
<group> | |
<line level="2">-Mortality rates</line> | |
<group> | |
<line level="3">-Figures</line> | |
</group> | |
</group> | |
</group> | |
</group> | |
<group> | |
<line level="1">-Resource allocation</line> | |
<group> | |
<group> | |
<line level="2">-Rural areas</line> | |
<group> | |
<line level="3">-Availability of care</line> | |
</group> | |
</group> | |
<group> | |
<line level="2">-Catastrophic care costs</line> | |
</group> | |
<group> | |
<line level="2">-Prevention</line> | |
</group> | |
<group> | |
<line level="2">-Problems</line> | |
</group> | |
</group> | |
</group> | |
</group> | |
</group> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(:~
 : Splits a flat run of line elements into sibling group elements.
 :
 : The @level of the first line is the reference level. A new group starts at
 : every subsequent line whose @level equals that reference level; each group
 : holds its starting line plus all lines up to (not including) the next line
 : at the reference level.
 :
 : @param $lines one or more line elements carrying a @level attribute
 : @return a sequence of group elements, in document order of their lines
 :)
declare function local:group-lines($lines as element(line)+) {
    let $level := $lines[1]/@level
    (: Locate the next same-level line by POSITION. fn:index-of() compares
       atomized (string) values rather than node identity, so two lines with
       the same text would yield several positions (or the wrong one) and
       break the arithmetic below. :)
    let $next-position :=
        (
            for $position in 2 to count($lines)
            where $lines[$position]/@level eq $level
            return $position
        )[1]
    return
        if (exists($next-position)) then
            (
                <group>{subsequence($lines, 1, $next-position - 1)}</group>,
                (: the remainder starts at the next same-level line :)
                local:group-lines(subsequence($lines, $next-position))
            )
        else
            (: no further line at this level: everything belongs to one group :)
            <group>{$lines}</group>
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(:~
 : Entry point for converting a processed group tree into a list element.
 :
 : @param $group the outermost group element
 : @return a list element wrapping whatever local:inner-groups-to-list
 :         produces for $group
 :)
declare function local:groups-to-list($group as element(group)) {
    let $items := local:inner-groups-to-list($group)
    return
        <list>{$items}</list>
};
(:~
 : Recursively turns a group element into item elements.
 :
 : A group with no direct line children is treated as a pure wrapper: its
 : child groups are processed in order and their results are returned flat.
 : Otherwise each direct line becomes an item containing the line's text
 : nodes and, when the line has a following-sibling group, a nested list
 : built from that group.
 :
 : @param $group the group element to convert
 : @return a sequence of item elements (possibly empty)
 :)
declare function local:inner-groups-to-list($group as element(group)) {
    let $lines := $group/line
    return
        if (empty($lines)) then
            (: wrapper group: descend into each child group :)
            for $child in $group/group
            return
                local:inner-groups-to-list($child)
        else
            for $line in $lines
            let $nested := $line/following-sibling::group
            return
                <item>{
                    $line/text()
                    ,
                    if (exists($nested)) then
                        <list>{local:inner-groups-to-list($nested)}</list>
                    else
                        ()
                }</item>
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The President left at 8:48 am | |
-Administration recommendations on Capitol Hill | |
-Improvements | |
-Richardson’s trip to New York | |
-Health programs | |
-Goals | |
-Problems in present system | |
-Approach | |
-Emphasis on quality | |
-Improvements in United States’ health care | |
-Maternal deaths | |
-Rate | |
-Decline | |
-United States’ rate compared to other nations | |
-Reporting system | |
-Data on health | |
-Differences in reporting system | |
-Low-income people | |
-Whites | |
-Non-whites | |
-Mortality rates | |
-Figures | |
-Resource allocation | |
-Rural areas | |
-Availability of care | |
-Catastrophic care costs | |
-Prevention | |
-Problems |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(:~
 : Applies local:apply-levels to every supplied group.
 :
 : A single input group yields the single leveled group directly; several
 : input groups are leveled individually and wrapped together in one
 : enclosing group element.
 :
 : @param $groups one or more group elements
 : @return exactly one group element
 :)
declare function local:process-groups($groups as element(group)+) {
    let $leveled :=
        for $g in $groups
        return
            local:apply-levels($g)
    return
        if (count($groups) eq 1) then
            $leveled
        else
            <group>{$leveled}</group>
};
(:~
 : Rebuilds a flat group as a nested one.
 :
 : The first line of the group is kept as the group's own line. The remaining
 : lines, if any, are regrouped with local:group-lines: a single remaining
 : line is emitted as the group local:group-lines returns for it, while two
 : or more remaining lines are regrouped, each resulting group is processed
 : recursively, and the results are wrapped in one enclosing group.
 :
 : @param $group a group element whose children are line elements
 : @return a group element containing the first line followed by at most one
 :         nested group
 :)
declare function local:apply-levels($group as element(group)) {
    let $head := $group/line[1]
    let $rest := subsequence($group/line, 2)
    let $nested :=
        if (empty($rest)) then
            ()
        else if (count($rest) eq 1) then
            local:group-lines($rest)
        else
            <group>{
                for $sub in local:group-lines($rest)
                return
                    local:apply-levels($sub)
            }</group>
    return
        <group>
            {$head}
            {$nested}
        </group>
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<list> | |
<item>The President left at 8:48 am | |
<list> | |
<item>Administration recommendations on Capitol Hill</item> | |
<item>Improvements</item> | |
<item>Richardson’s trip to New York</item> | |
<item>Health programs | |
<list> | |
<item>Goals</item> | |
<item>Problems in present system</item> | |
<item>Approach</item> | |
<item>Emphasis on quality</item> | |
</list> | |
</item> | |
<item>Improvements in United States’ health care | |
<list> | |
<item>Maternal deaths<list> | |
<item>Rate</item> | |
<item>Decline</item> | |
<item>United States’ rate compared to other nations | |
<list> | |
<item>Reporting system</item> | |
</list> | |
</item> | |
</list> | |
</item> | |
<item>Data on health | |
<list> | |
<item>Differences in reporting system</item> | |
</list> | |
</item> | |
<item>Low-income people | |
<list> | |
<item>Whites</item> | |
<item>Non-whites</item> | |
</list> | |
</item> | |
<item>Mortality rates | |
<list> | |
<item>Figures</item> | |
</list> | |
</item> | |
</list> | |
</item> | |
<item>Resource allocation | |
<list> | |
<item>Rural areas | |
<list> | |
<item>Availability of care</item> | |
</list> | |
</item> | |
<item>Catastrophic care costs</item> | |
<item>Prevention</item> | |
<item>Problems</item> | |
</list> | |
</item> | |
</list> | |
</item> | |
</list> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<line level="0">The President left at 8:48 am</line> | |
<line level="1">-Administration recommendations on Capitol Hill</line> | |
<line level="1">-Improvements</line> | |
<line level="1">-Richardson’s trip to New York</line> | |
<line level="1">-Health programs</line> | |
<line level="2">-Goals</line> | |
<line level="2">-Problems in present system</line> | |
<line level="2">-Approach</line> | |
<line level="2">-Emphasis on quality</line> | |
<line level="1">-Improvements in United States’ health care</line> | |
<line level="2">-Maternal deaths</line> | |
<line level="3">-Rate</line> | |
<line level="3">-Decline</line> | |
<line level="3">-United States’ rate compared to other nations</line> | |
<line level="4">-Reporting system</line> | |
<line level="2">-Data on health</line> | |
<line level="3">-Differences in reporting system</line> | |
<line level="2">-Low-income people</line> | |
<line level="3">-Whites</line> | |
<line level="3">-Non-whites</line> | |
<line level="2">-Mortality rates</line> | |
<line level="3">-Figures</line> | |
<line level="1">-Resource allocation</line> | |
<line level="2">-Rural areas</line> | |
<line level="3">-Availability of care</line> | |
<line level="2">-Catastrophic care costs</line> | |
<line level="2">-Prevention</line> | |
<line level="2">-Problems</line> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(:~
 : Converts indented plain text into a flat sequence of line elements.
 :
 : Each non-blank input line becomes one line element whose @level is the
 : number of leading whitespace characters and whose content is the line with
 : that leading whitespace removed.
 :
 : Lines are split on LF, CRLF or bare CR so that text with Windows line
 : endings does not keep a stray carriage return. Blank or whitespace-only
 : lines (including the empty token after a trailing newline) are skipped;
 : emitting them would create spurious level-0 lines that corrupt grouping.
 :
 : @param $text the raw text, one outline entry per line
 : @return zero or more line elements in input order
 :)
declare function local:text-to-lines($text as xs:string) {
    for $line in tokenize($text, '\r\n?|\n')
    where normalize-space($line) ne ''
    return
        (: the line is known to hold a non-whitespace character, so stripping
           the leading run leaves a non-empty remainder :)
        let $content := replace($line, '^\s+', '')
        let $level := string-length($line) - string-length($content)
        return
            <line level="{$level}">{$content}</line>
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(: Pipeline: raw text -> flat line elements -> top-level groups ->
   nested groups -> list markup. $text is bound outside this snippet. :)
let $list :=
    local:groups-to-list(
        local:process-groups(
            local:group-lines(
                local:text-to-lines($text)
            )
        )
    )
return
    $list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment