Last active
April 3, 2024 18:46
-
-
Save bertsky/63c7edb69daa8e5e7eb5cc203c3d44c2 to your computer and use it in GitHub Desktop.
wrapper around dta-tools tei2txt.pl covering dehyphenation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Wrapper around dta-tools' tei2txt.pl that additionally strips non-running
# text from the TEI input, undoes end-of-line hyphenation and normalises
# unusual Unicode codepoints.

# First pipeline stage: an `xmlstarlet ed` command line that deletes (-d)
# every TEI element which is not part of the running text. The `tei` prefix
# is bound to the TEI namespace via -N; xmlstarlet reads the document from
# stdin and writes the edited document to stdout.
nontext_opts=(
    xmlstarlet ed -N tei=http://www.tei-c.org/ns/1.0
    -d //tei:note
    -d //tei:fw
    -d //tei:table
    -d //tei:figure
    -d //tei:formula
    -d //tei:titlePage
    -d '//tei:div[@type="index"]'
    -d '//tei:div[@type="imprint"]'
    -d '//tei:div[@type="frontispiece"]'
    -d '//tei:div[@type="advertisement"]'
    -d '//tei:div[@type="copyright"]'
    # historical (misspelled) value, kept so existing behaviour is unchanged
    -d '//tei:div[@type="abbrevations"]'
    # correctly spelled value
    # NOTE(review): presumably the value actually used in the corpus -- confirm
    -d '//tei:div[@type="abbreviations"]'
)
# Second pipeline stage: the tei2txt.pl command line. Every optional
# rendering feature (page/line numbers, form feeds, sheet signatures,
# running headers, catchwords) is switched off via -p name=0, and gaps in
# the transcription are rendered as "_".
tei2txt_opts=(
    tei2txt.pl
    -p show_page_numbers=0
    -p show_form_feed=0
    -p show_line_numbers=0
    -p show_bogensignatur=0
    -p show_kolumnentitel=0
    -p show_catchword=0
    -p gap_char=_
)
# Print, on stdout, a sed script that removes end-of-line hyphenation
# marked with "¬". Takes no arguments. The script is emitted by a function
# so that it can be handed to sed as a file, e.g. `sed -f <(dehyphenate)`.
#
# What the emitted script does:
#   * "¬" followed by a space is turned into a plain "- ".
#   * A line ending in "¬" is joined with the next line (N): the first word
#     of the next line is pulled up, and the rest stays on its own line.
#     An opening „ directly at the start of the next line is not pulled up
#     but left at the start of what remains of that line.
#   * If the pulled-up word starts with an ASCII capital, the hyphen is
#     kept ("-Word"); otherwise "¬" is dropped and the word parts are fused.
#   * After each join the script loops, since the remaining line may itself
#     end in "¬".
# The regexes use \? and \+, which are GNU sed extensions.
# NOTE(review): [A-Z] only covers ASCII capitals, so a continuation starting
# with e.g. "Ä" is handled by the "normal continuation" rule -- confirm that
# this is intended.
dehyphenate() {
    # (original author's note) this should be done at runtime, too!
    # The delimiter is quoted, so the shell expands nothing in the script.
    cat <<"EOF"
: start
s/¬ /- /g
# unwrap (and move any „ to new start of line)
/¬$/{N
# rule for upper case continuation
s/¬\
\(„\)\?\([A-Z][^ ]\+\) \?/-\2\
\1/
t start
# rule for normal continuation
s/¬\
\(„\)\?\([^ ]\+\) \?/\2\
\1/
# next line could be hyphenated itself
b start
}
EOF
}
normalize_opts=( | |
# decompose | |
perl -CS -MUnicode::Normalize=NFKD -Mutf8 -p | |
-e '$_ = NFKD($_);' | |
# map musical symbols to gap | |
-e 's/[\x{1d100}-\x{1d1ff}]/_/g;' | |
# map mathematical alphanumeric symbols to gap | |
-e 's/[\x{1d400}-\x{1d7ff}]/_/g;' | |
# map box drawings to gap | |
-e 's/[\x{2500}-\x{257f}]/_/g;' | |
# map Canadian syllabics to gap | |
-e 's/[\x{1400}-\x{167f}]/_/g;' | |
# map Coptic letters to gap | |
-e 's/[\x{2c80}-\x{2cff}]/_/g;' | |
# replace tab with space | |
-e 's/\t/ /g;' | |
# normalise some unexpected codepoints | |
-e 's/[⋅✕☓]/×/g;' | |
-e 's/∗/*/g;' | |
-e 's/∼/~/g;' | |
-e 's/⁎/*/g;' | |
# just (wrongly transcribed) print errors | |
-e 's/[ǝə]/e/g;' | |
-e 's/[ꟺɯ]/m/g;' | |
-e 's/ɹ/r/g;' | |
# cedilla instead of low comma | |
-e 's/\x{326}/\x{327}/g;' | |
# mathematical, normal, and pointing angle brackets | |
-e 's/[⟨〈〈]/(/g;' | |
-e 's/[⟩〉〉]/)/g;' | |
# approximations to mathematical symbols | |
-e 's/[⪙≦≤]/</g;' | |
-e 's/[⪚≧≥]/>/g;' | |
-e 's/≡/=/g;' | |
-e 's/∷/~/g;' | |
-e 's/∓/±/g;' | |
# roman numerals | |
-e 's/Ↄ/C/g;' | |
-e 's/ↀ/CD/g;' | |
# list items | |
-e 's/∙/•/g;' | |
-e 's/[⁑∸]/•/g;' | |
# geometric shapes | |
-e 's/[⚬⚪●]/○/g;' | |
-e 's/[◻☐■]/□/g;' | |
-e 's/∆/△/g;' | |
# crosses without distinction | |
-e 's/[♰♱✝☩]/✠/g;' | |
# variants of pointing finger | |
-e 's/[👉☛]/☞/g;' | |
-e 's/[👈☚]/☜/g;' | |
# Fraktur hyphen (double-oblique) is supposed to be hyphen-minus | |
-e 's/⸗/-/g;' | |
# Unicode hyphen is supposed to be hyphen-minus | |
-e 's/‐/-/g;' | |
-e 's/‧/,/g;' | |
# variants of dash and minus | |
-e 's/[⸺‒—─−―]/–/g;' | |
# variants of vertical line | |
-e 's/[⏐│‖⏐]/|/g;' | |
# consonant ligatures (Fraktur) | |
-e 's/ffi/ffi/g;' | |
-e 's/fi/fi/g;' | |
-e 's/fl/fl/g;' | |
# other potential MUFI/PUA codepoints | |
-e 's//ſſ/g;' | |
-e 's//ſſi/g;' # MUFI: LATIN SMALL LIGATURE LONG S LONG S I, U+EBA7 | |
-e 's//ch/g;' # Latin small letter c ligated with latin small letter h, U+F502 | |
-e 's//ck/g;' # Latin small ligature ck, U+EEC4 | |
-e 's/ſt/ſt/g;' | |
-e 's/fi/fi/g;' | |
-e 's/ff/ff/g;' | |
-e 's/fl/fl/g;' | |
-e 's/ffi/ffi/g;' | |
-e 's//ſk/g;' | |
-e 's//tz/g;' # MUFI: LATIN SMALL LIGATURE TZ | |
-e 's//as/g;' # eMOP: Latin small ligature as, U+f532 | |
-e 's//is/g;' # eMOP: Latin small ligature is, U+f533 | |
-e 's//us/g;' # eMOP: Latin small ligature us, U+f534 | |
-e 's//Qu/g;' # eMOP: Latin ligature capital Q small u, U+f535 | |
-e 's/ij/ij/g;' # U+0133 LATIN SMALL LIGATURE IJ | |
-e 's//q&/g;' # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET, U+E8BF | |
-e 's//ſp/g;' # MUFI: LATIN SMALL LIGATURE LONG S P, U+EBA5 | |
-e 's/st/st/g;' # U+FB06 LATIN SMALL LIGATURE ST | |
-e 's/[̉ᷣ]/᷎/g;' # combining ogonek above (U+1DCE, ᷎) instead of combining r rotunda (U+1DE3, ᷣ) or combining hook above (U+0309, ̉) | |
-e 's//ſt/g;' # PUA EADA -> ſt | |
-e 's//ſi/g;' # PUA EBA2 -> ſi | |
-e 's//ſl/g;' # PUA EBA3 -> ſl | |
-e 's//ſſ/g;' # PUA EBA6 -> ſſ | |
-e 's//ſſi/g;' # PUA EBA7 -> ſſi | |
-e 's//ſſt/g;' # PUA F4FF -> ſſt | |
-e 's//ſp/g;' # PUA F52C -> ſp | |
-e 's//ct/g;' # PUA EEC5 -> ct | |
-e 's//ft/g;' # PUA EECB -> ft | |
-e 's//tʒ/g;' # PUA EEDC -> tʒ | |
-e 's//m̃/g;' # PUA E5D2 -> m̃ | |
-e 's//ñ/g;' # PUA E5DC -> ñ | |
-e 's//p̃/g;' # PUA E665 -> p + ... | |
-e 's//qʒ/g;' # PUA E8BF -> q; (or to qʒ, or to que, as you like) | |
-e 's//aͤ/g;' # PUA E42C -> a + U+0364, combining e above | |
-e 's//oͤ/g;' # PUA E644 -> o + U+0364 | |
-e 's//uͤ/g;' # PUA E72B -> u + U+0364 | |
-e 's//ů/g;' # PUA E72D -> U+016F | |
-e 's//ß/g;' # PUA EBAC -> ß (check for correct meaning) | |
-e 's//ß/g;' # PUA E8B7 -> ß (proper replacement in some German printings) | |
#-e 's//ſᷣ/g;' # PUA E8B7 -> ſ with combining r rotunda (in some Latin printings) | |
-e 's//ꝰ/g;' # PUA F1A6 -> U+A770, modifier letter us | |
-e 's//m/g;' # PUA F223 -> m | |
-e 's//⁊/g;' # PUA F158 -> U+204A (Tironian et) | |
-e 's//ð/g;' # PUA F159 -> eth, U+00F0 | |
-e 's//:/g;' # PUA F160 -> : | |
-e 's/q/qͥ/g;' # PUA F02F -> small letter i above (U+0365) | |
-e 's/t/t᷑/g;' # t + PUA F1CC -> t + combining ur above (U+1DD1) | |
-e 's//ll/g;' # PUA F4F9 -> ll | |
-e 's//q́/g;' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT | |
# variants of quotation marks | |
-e 's/〞/〟/g;' | |
-e 's/‟/„/g;' | |
# “ ? | |
# " ? | |
-e 's/‛/‚/g;' | |
# ‘ ’ ? | |
# rare currencies | |
-e 's/[₽₤¥₰ℳ\$]/£/g;' | |
# metric symbols | |
-e 's/[⍘⏝‿⏓⏖]/⏑/g;' | |
-e 's/─́/–́/g;' | |
# remaining ¬ ? | |
# mere decorum, remove | |
-e 's/〰//g;' | |
# only in formulae (mostly schroeder_logik), remove | |
-e 's/[ɟ⋹∈∉∪⊃⊂⊆⊄≶≠≗‥∘÷∶∴∵⌣⌢∽⦂Ↄ⊽∨∧⋁⋀∥═∺≺⊥◡◠⁀∑∟∢∠∝∾◦]/ /g;' | |
# not well defined (frequent but single use-case/book, not usual semantics), replace with gap character | |
-e 's/\x{fffc}/_/g;' # object replacement character | |
-e 's/\x{feff}//g;' # byte-order mark / zero-width non-breaking space | |
-e 's/\x{ad}//g;' # soft hyphen | |
-e 's/[▭⊙✽℣☘卍↙⟞⟴ʃ※⊕؛©®€]/_/g;' | |
# conflate multiple gaps/spaces generated by above substitutions | |
-e 's/ +/ /g;' | |
-e 's/__+/_/g;' | |
) | |
# Usage: SCRIPT [INPUT [OUTPUT]]
# With one argument, stdin is redirected from that file; with two, stdout
# is additionally redirected to the second file. Without arguments the
# script acts as a plain stdin-to-stdout filter.
if (($# > 0)); then
    exec < "$1"
fi
if (($# > 1)); then
    exec > "$2"
fi

# Report a failure of any stage, not just of the final perl filter.
set -o pipefail

# strip non-text elements | TEI -> plain text | dehyphenate | normalise
"${nontext_opts[@]}" | "${tei2txt_opts[@]}" | sed -f <(dehyphenate) | "${normalize_opts[@]}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment