Skip to content

Instantly share code, notes, and snippets.

@bertsky
Last active April 3, 2024 18:46
Show Gist options
  • Save bertsky/63c7edb69daa8e5e7eb5cc203c3d44c2 to your computer and use it in GitHub Desktop.
Save bertsky/63c7edb69daa8e5e7eb5cc203c3d44c2 to your computer and use it in GitHub Desktop.
wrapper around dta-tools tei2txt.pl covering dehyphenation
#!/bin/bash
# Command line (program name followed by its arguments) of the first pipeline
# stage: an `xmlstarlet ed` call with one delete (-d) operation per XPath
# below, evaluated with the prefix "tei" bound to the TEI namespace. No input
# file is named here; the script runs this command on its standard input.
nontext_opts=(
xmlstarlet ed -N tei=http://www.tei-c.org/ns/1.0
# elements dropped wherever they occur
-d //tei:note
-d //tei:fw
-d //tei:table
-d //tei:figure
-d //tei:formula
-d //tei:titlePage
# whole divisions dropped according to their @type
-d '//tei:div[@type="index"]'
-d '//tei:div[@type="imprint"]'
-d '//tei:div[@type="frontispiece"]'
-d '//tei:div[@type="advertisement"]'
-d '//tei:div[@type="copyright"]'
# "abbrevations" is the spelling this script has always used; it is kept so
# that any data relying on it behaves as before. The correctly spelled value
# is matched as well.
# NOTE(review): presumably "abbreviations" is the value actually used in the
# corpus -- confirm against the encoding guidelines.
-d '//tei:div[@type="abbrevations"]'
-d '//tei:div[@type="abbreviations"]'
)
# Command line of the second pipeline stage: tei2txt.pl followed by a list of
# "-p name=value" pairs. Every show_* parameter is set to 0 and gap_char is
# set to "_".
tei2txt_opts=(tei2txt.pl)
# page and line level parameters
tei2txt_opts+=(-p show_page_numbers=0 -p show_form_feed=0 -p show_line_numbers=0)
# print-layout parameters (sheet signature, running title, catchword)
tei2txt_opts+=(-p show_bogensignatur=0 -p show_kolumnentitel=0 -p show_catchword=0)
# character written for gaps
tei2txt_opts+=(-p gap_char=_)
#######################################
# Print the sed script used for dehyphenation, one script line per printf
# argument (the text is emitted literally, nothing is expanded).
#
# What the emitted script does:
#   - "¬ " inside a line becomes "- ";
#   - a line ending in "¬" gets the following line appended (N); the "¬" and
#     the line break are removed, the first word of the following line is
#     pulled up, and an optional leading "„" stays at the start of what
#     remains of that line;
#   - if the pulled-up word starts with A-Z, a "-" is kept between the parts;
#   - the script then loops back to its "start" label.
# The script relies on \? and \+ in basic regular expressions.
#
# TODO (original author's note): "this should be done at runtime, too!"
#
# Arguments: none
# Outputs:   the sed script on stdout
#######################################
dehyphenate() {
  printf '%s\n' \
    ': start' \
    's/¬ /- /g' \
    '# unwrap (and move any „ to new start of line)' \
    '/¬$/{N' \
    '# rule for upper case continuation' \
    's/¬\' \
    '\(„\)\?\([A-Z][^ ]\+\) \?/-\2\' \
    '\1/' \
    't start' \
    '# rule for normal continuation' \
    's/¬\' \
    '\(„\)\?\([^ ]\+\) \?/\2\' \
    '\1/' \
    '# next line could be hyphenated itself' \
    'b start' \
    '}'
}
# Command line of the last pipeline stage: a `perl -p` filter (UTF-8 on the
# standard streams via -CS, UTF-8 source via -Mutf8) that first applies NFKD
# to every line and then runs the substitutions below in the order written.
#
# Characters that are invisible or easily lost when this file is copied
# (Private Use Area code points, precomposed ligatures, bare combining marks)
# are written as \x{...} escapes in the patterns. A rule whose pattern has
# lost its character is dangerous: with an empty pattern Perl reuses the last
# successfully matched regex, or matches the empty string everywhere.
normalize_opts=(
# decompose
perl -CS -MUnicode::Normalize=NFKD -Mutf8 -p
-e '$_ = NFKD($_);'
# map musical symbols to gap
-e 's/[\x{1d100}-\x{1d1ff}]/_/g;'
# map mathematical alphanumeric symbols to gap
-e 's/[\x{1d400}-\x{1d7ff}]/_/g;'
# map box drawings to gap
-e 's/[\x{2500}-\x{257f}]/_/g;'
# map Canadian syllabics to gap
-e 's/[\x{1400}-\x{167f}]/_/g;'
# map Coptic letters to gap
-e 's/[\x{2c80}-\x{2cff}]/_/g;'
# replace tab with space
-e 's/\t/ /g;'
# normalise some unexpected codepoints
-e 's/[⋅✕☓]/×/g;'
-e 's/∗/*/g;'
-e 's/∼/~/g;'
-e 's/⁎/*/g;'
# just (wrongly transcribed) print errors
-e 's/[ǝə]/e/g;'
-e 's/[ꟺɯ]/m/g;'
-e 's/ɹ/r/g;'
# cedilla instead of low comma
-e 's/\x{326}/\x{327}/g;'
# mathematical, normal, and pointing angle brackets
-e 's/[⟨〈〈]/(/g;'
-e 's/[⟩〉〉]/)/g;'
# approximations to mathematical symbols
-e 's/[⪙≦≤]/</g;'
-e 's/[⪚≧≥]/>/g;'
-e 's/≡/=/g;'
-e 's/∷/~/g;'
-e 's/∓/±/g;'
# roman numerals
-e 's/Ↄ/C/g;'
-e 's/ↀ/CD/g;'
# list items
-e 's/∙/•/g;'
-e 's/[⁑∸]/•/g;'
# geometric shapes
-e 's/[⚬⚪●]/○/g;'
-e 's/[◻☐■]/□/g;'
-e 's/∆/△/g;'
# crosses without distinction
-e 's/[♰♱✝☩]/✠/g;'
# variants of pointing finger
-e 's/[👉☛]/☞/g;'
-e 's/[👈☚]/☜/g;'
# Fraktur hyphen (double-oblique) is supposed to be hyphen-minus
-e 's/⸗/-/g;'
# Unicode hyphen is supposed to be hyphen-minus
-e 's/‐/-/g;'
-e 's/‧/,/g;'
# variants of dash and minus
-e 's/[⸺‒—─−―]/–/g;'
# variants of vertical line
-e 's/[⏐│‖⏐]/|/g;'
# consonant ligatures (Fraktur)
# NOTE(review): the rules without a code point in their original comment had
# identical pattern and replacement; presumably the pattern was the single
# precomposed ligature character, which is what the escapes below restore.
# The NFKD step above already decomposes these code points, so these rules
# only act as a safety net.
-e 's/\x{FB03}/ffi/g;' # U+FB03 LATIN SMALL LIGATURE FFI
-e 's/\x{FB01}/fi/g;' # U+FB01 LATIN SMALL LIGATURE FI
-e 's/\x{FB02}/fl/g;' # U+FB02 LATIN SMALL LIGATURE FL
# other potential MUFI/PUA codepoints
# NOTE(review): the source character of the next rule is not documented, so it
# cannot be restored here; it stays disabled instead of running with an empty
# pattern. U+EBA6 is mapped to the same replacement further down.
#-e 's/\x{????}/ſſ/g;'
-e 's/\x{EBA7}/ſſi/g;' # MUFI: LATIN SMALL LIGATURE LONG S LONG S I, U+EBA7
-e 's/\x{F502}/ch/g;' # Latin small letter c ligated with latin small letter h, U+F502
-e 's/\x{EEC4}/ck/g;' # Latin small ligature ck, U+EEC4
-e 's/\x{FB05}/ſt/g;' # U+FB05 LATIN SMALL LIGATURE LONG S T
-e 's/\x{FB01}/fi/g;' # U+FB01 LATIN SMALL LIGATURE FI
-e 's/\x{FB00}/ff/g;' # U+FB00 LATIN SMALL LIGATURE FF
-e 's/\x{FB02}/fl/g;' # U+FB02 LATIN SMALL LIGATURE FL
-e 's/\x{FB03}/ffi/g;' # U+FB03 LATIN SMALL LIGATURE FFI
# NOTE(review): code points of the next two rules are not documented; restore
# them from the MUFI character list before re-enabling.
#-e 's/\x{????}/ſk/g;'
#-e 's/\x{????}/tz/g;' # MUFI: LATIN SMALL LIGATURE TZ
-e 's/\x{F532}/as/g;' # eMOP: Latin small ligature as, U+f532
-e 's/\x{F533}/is/g;' # eMOP: Latin small ligature is, U+f533
-e 's/\x{F534}/us/g;' # eMOP: Latin small ligature us, U+f534
-e 's/\x{F535}/Qu/g;' # eMOP: Latin ligature capital Q small u, U+f535
-e 's/\x{133}/ij/g;' # U+0133 LATIN SMALL LIGATURE IJ
-e 's/\x{E8BF}/q&/g;' # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET, U+E8BF
-e 's/\x{EBA5}/ſp/g;' # MUFI: LATIN SMALL LIGATURE LONG S P, U+EBA5
-e 's/\x{FB06}/st/g;' # U+FB06 LATIN SMALL LIGATURE ST
-e 's/[\x{309}\x{1DE3}]/\x{1DCE}/g;' # combining ogonek above (U+1DCE) instead of combining r rotunda (U+1DE3) or combining hook above (U+0309)
-e 's/\x{EADA}/ſt/g;' # PUA EADA -> ſt
-e 's/\x{EBA2}/ſi/g;' # PUA EBA2 -> ſi
-e 's/\x{EBA3}/ſl/g;' # PUA EBA3 -> ſl
-e 's/\x{EBA6}/ſſ/g;' # PUA EBA6 -> ſſ
-e 's/\x{EBA7}/ſſi/g;' # PUA EBA7 -> ſſi (already handled by the earlier U+EBA7 rule)
-e 's/\x{F4FF}/ſſt/g;' # PUA F4FF -> ſſt
-e 's/\x{F52C}/ſp/g;' # PUA F52C -> ſp
-e 's/\x{EEC5}/ct/g;' # PUA EEC5 -> ct
-e 's/\x{EECB}/ft/g;' # PUA EECB -> ft
-e 's/\x{EEDC}/tʒ/g;' # PUA EEDC -> tʒ
-e 's/\x{E5D2}/m̃/g;' # PUA E5D2 -> m̃
-e 's/\x{E5DC}/ñ/g;' # PUA E5DC -> ñ
-e 's/\x{E665}/p̃/g;' # PUA E665 -> p + ...
-e 's/\x{E8BF}/qʒ/g;' # PUA E8BF -> q; (or to qʒ, or to que, as you like); never matches while the earlier U+E8BF rule (-> q&) is active
-e 's/\x{E42C}/aͤ/g;' # PUA E42C -> a + U+0364, combining e above
-e 's/\x{E644}/oͤ/g;' # PUA E644 -> o + U+0364
-e 's/\x{E72B}/uͤ/g;' # PUA E72B -> u + U+0364
-e 's/\x{E72D}/ů/g;' # PUA E72D -> U+016F
-e 's/\x{EBAC}/ß/g;' # PUA EBAC -> ß (check for correct meaning)
-e 's/\x{E8B7}/ß/g;' # PUA E8B7 -> ß (proper replacement in some German printings)
#-e 's/\x{E8B7}/ſᷣ/g;' # PUA E8B7 -> ſ with combining r rotunda (in some Latin printings)
-e 's/\x{F1A6}/ꝰ/g;' # PUA F1A6 -> U+A770, modifier letter us
-e 's/\x{F223}/m/g;' # PUA F223 -> m
-e 's/\x{F158}/⁊/g;' # PUA F158 -> U+204A (Tironian et)
-e 's/\x{F159}/ð/g;' # PUA F159 -> eth, U+00F0
-e 's/\x{F160}/:/g;' # PUA F160 -> :
-e 's/q\x{F02F}/qͥ/g;' # q + PUA F02F -> q + small letter i above (U+0365)
-e 's/t\x{F1CC}/t᷑/g;' # t + PUA F1CC -> t + combining ur above (U+1DD1)
-e 's/\x{F4F9}/ll/g;' # PUA F4F9 -> ll
-e 's/\x{F50E}/q́/g;' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
# variants of quotation marks
-e 's/〞/〟/g;'
-e 's/‟/„/g;'
# “ ?
# " ?
-e 's/‛/‚/g;'
# ‘ ’ ?
# rare currencies
-e 's/[₽₤¥₰ℳ\$]/£/g;'
# metric symbols
-e 's/[⍘⏝‿⏓⏖]/⏑/g;'
-e 's/─́/–́/g;'
# remaining ¬ ?
# mere decorum, remove
-e 's/〰//g;'
# only in formulae (mostly schroeder_logik), remove
-e 's/[ɟ⋹∈∉∪⊃⊂⊆⊄≶≠≗‥∘÷∶∴∵⌣⌢∽⦂Ↄ⊽∨∧⋁⋀∥═∺≺⊥◡◠⁀∑∟∢∠∝∾◦]/ /g;'
# not well defined (frequent but single use-case/book, not usual semantics), replace with gap character
-e 's/\x{fffc}/_/g;' # object replacement character
-e 's/\x{feff}//g;' # byte-order mark / zero-width non-breaking space
-e 's/\x{ad}//g;' # soft hyphen
-e 's/[▭⊙✽℣☘卍↙⟞⟴ʃ※⊕؛©®€]/_/g;'
# conflate multiple gaps/spaces generated by above substitutions
-e 's/ +/ /g;'
-e 's/__+/_/g;'
)
# Entry point.
#   $1 (optional): input file, attached to stdin; without it stdin is used.
#   $2 (optional): output file, attached to stdout; without it stdout is used.
#
# pipefail: without it the exit status is only that of the final perl stage,
# so a failure in any earlier stage would be reported as success.
set -o pipefail
(($#>0)) && exec < "$1"
(($#>1)) && exec > "$2"
# remove non-text markup | TEI -> plain text | dehyphenate | normalise characters
"${nontext_opts[@]}" | "${tei2txt_opts[@]}" | sed -f <(dehyphenate) | "${normalize_opts[@]}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment