Last active
March 3, 2016 04:54
-
-
Save joewiz/6312943 to your computer and use it in GitHub Desktop.
Dehyphenate text suffering from improper hyphenation, using XQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0"; | |
(: Functions to dehyphenate a word or a paragraph suffering from improper hyphenation. | |
Uses a dictionary (a list of known words), such as those available at: | |
https://github.com/marklogic/dictionaries/tree/master/dictionaries | |
:) | |
declare namespace fn="http://www.w3.org/2005/xpath-functions"; | |
declare namespace spell="http://marklogic.com/xdmp/spell"; | |
(: Joins a hyphenated candidate into one word when the joined form is a known word.
   $candidate   the word to examine; every '-' in it is removed to build the joined form
   $dictionary  the known words, supplied as spell:word elements
   Returns the joined form if it equals at least one dictionary entry;
   otherwise returns $candidate exactly as it was passed in.
:)
declare function local:dehyphenate-word($candidate as xs:string, $dictionary as element(spell:word)+) as xs:string {
    let $joined := replace($candidate, '-', '')
    (: general comparison: true as soon as any dictionary entry equals the joined form :)
    let $is-known-word := ($joined = $dictionary)
    return
        if (not($is-known-word)) then
            $candidate
        else
            $joined
};
(: Dehyphenates every hyphenated word found in a paragraph.
   $paragraph   the text to scan
   $dictionary  the known words, passed through to local:dehyphenate-word
   Returns the paragraph rebuilt from its pieces: text outside a match is copied
   verbatim, and each matched hyphenated word is replaced by whatever
   local:dehyphenate-word returns for it. If nothing matches, the paragraph is
   returned untouched.
:)
declare function local:dehyphenate-paragraph($paragraph as xs:string, $dictionary as element(spell:word)+) as xs:string* {
    (: letters ending in a lowercase letter, then a hyphen, then lowercase letters and/or more hyphens.
       NOTE(review): \b is not part of the XPath/XQuery regular expression grammar, so this
       pattern presumably relies on a processor with a more permissive regex engine;
       confirm against the target processor. :)
    let $hyphenated-word := '\b[A-Za-z]+[a-z]-[a-z-]+\b'
    (: analyze-string yields fn:match and fn:non-match elements in document order :)
    let $segments := analyze-string($paragraph, $hyphenated-word)/*
    return
        if (empty($segments/self::fn:match)) then
            $paragraph
        else
            string-join(
                for $segment in $segments
                return
                    if ($segment instance of element(fn:match)) then
                        local:dehyphenate-word($segment, $dictionary)
                    else
                        $segment/string()
                , '')
};
(: | |
let $paragraph := | |
'Kissinger said that in connection with our message President Nixon had, from the | |
very out-set, been prepared to endorse a de facto halt to the escalation of | |
military operations. He had con-vinced the Israeli Government to do so as well. | |
The President presumed that our message was accom-panied by a concurrent | |
proposal to renew a confi-dential, bilateral exchange of views on a political | |
settlement, i.e., the cease-fire would, as it were, es-tablish a favorable | |
atmosphere for further efforts to reach a political settlement. However, he | |
contin-ued, in the view of President Nixon and the Israeli Government, the | |
information regarding shipments of Soviet missiles of a more advanced type to | |
Cairo creates a new situation. That situation could well be interpreted as | |
follows: Israel now agrees to a de facto halt to its air raids, while Nasser | |
uses this time to make all kinds of improvements in his mili-tary machine, in | |
particular his air defenses, with no interference whatsoever. Then at some point | |
chosen by the UAR President himself, military op-erations will again resume, but | |
now under condi-tions far worse for Israel, because by that time the UAR will | |
have an air defense system with SAM–3 missiles, which U.S. experts describe as | |
“fairly ef-fective.” Given this prospect, it is hard to persuade Israel to | |
totally stop the air raids at this time.' | |
:) | |
(: Demonstration query: runs local:dehyphenate-word over a fixed list of sample
   words and reports each input next to its result. :)
let $words :=
    (
        'face-to-face',
        'vis-a-vis',
        'accom-panied',
        'con-vinced',
        'confi-dential',
        'es-tablish',
        'mili-tary',
        'condi-tions',
        'ef-fective',
        'cease-fire'
    )
(: every spell:word element of the dictionary document stored in the database :)
let $dictionary := doc('/db/apps/dictionaries/data/large-dictionary.xml')//spell:word
return
    element results {
        for $word in $words
        let $dehyphenated := local:dehyphenate-word($word, $dictionary)
        return
            (: source holds the input word; this previously referenced $source,
               which is not declared anywhere and made the query fail to compile :)
            element result {
                element source {$word},
                element dehyph {$dehyphenated}
            }
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- | |
An eXist-db collection.xconf file that defines a range index on the spell:word elements of
a dictionary file from https://github.com/marklogic/dictionaries/tree/master/dictionaries
--> | |
<collection xmlns="http://exist-db.org/collection-config/1.0"> | |
<index xmlns:spell="http://marklogic.com/xdmp/spell"> | |
<fulltext default="none" attributes="false"/> | |
<!-- Range index configuration --> | |
<create qname="spell:word" type="xs:string"/> | |
</index> | |
</collection> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<results> | |
<result> | |
<source>face-to-face</source> | |
<dehyph>face-to-face</dehyph> | |
</result> | |
<result> | |
<source>vis-a-vis</source> | |
<dehyph>vis-a-vis</dehyph> | |
</result> | |
<result> | |
<source>accom-panied</source> | |
<dehyph>accompanied</dehyph> | |
</result> | |
<result> | |
<source>con-vinced</source> | |
<dehyph>convinced</dehyph> | |
</result> | |
<result> | |
<source>confi-dential</source> | |
<dehyph>confidential</dehyph> | |
</result> | |
<result> | |
<source>es-tablish</source> | |
<dehyph>establish</dehyph> | |
</result> | |
<result> | |
<source>mili-tary</source> | |
<dehyph>military</dehyph> | |
</result> | |
<result> | |
<source>condi-tions</source> | |
<dehyph>conditions</dehyph> | |
</result> | |
<result> | |
<source>ef-fective</source> | |
<dehyph>effective</dehyph> | |
</result> | |
<result> | |
<source>cease-fire</source> | |
<dehyph>ceasefire</dehyph> | |
</result> | |
</results> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment