bertsky/tei2txt.sh

## tei2txt.sh
#!/bin/bash

# Command (stored as an array, run as the first pipeline stage) that strips
# non-running-text material from a TEI document read on stdin:
#   xmlstarlet ed   edit mode; the result is written to stdout
#   -N tei=...      binds the prefix "tei" to the TEI namespace for the XPaths
#   -d XPATH        deletes every node matched by XPATH
nontext_opts=(
    xmlstarlet ed -N tei=http://www.tei-c.org/ns/1.0
    # elements removed wherever they occur
    -d //tei:note
    -d //tei:fw
    -d //tei:table
    -d //tei:figure
    -d //tei:formula
    -d //tei:titlePage
    # whole divisions removed according to their @type
    -d '//tei:div[@type="index"]'
    -d '//tei:div[@type="imprint"]'
    -d '//tei:div[@type="frontispiece"]'
    -d '//tei:div[@type="advertisement"]'
    -d '//tei:div[@type="copyright"]'
    # historical misspelling, kept so that existing behaviour is unchanged
    -d '//tei:div[@type="abbrevations"]'
    # correctly spelled value; the misspelled rule above can never match it
    -d '//tei:div[@type="abbreviations"]'
)

# Command line (as an array) for the TEI-to-plain-text stage of the pipeline.
# Every "-p name=value" pair is passed through to tei2txt.pl unchanged; the
# meaning of each parameter is defined by that tool, not by this script.
tei2txt_opts=(tei2txt.pl)
# page / form-feed / line-number switches, all set to 0
tei2txt_opts+=(-p show_page_numbers=0 -p show_form_feed=0 -p show_line_numbers=0)
# Bogensignatur = sheet signature, Kolumnentitel = running head; with the
# catchword switch, all set to 0
tei2txt_opts+=(-p show_bogensignatur=0 -p show_kolumnentitel=0 -p show_catchword=0)
# "_" as gap character (the same character normalize_opts uses for gaps)
tei2txt_opts+=(-p gap_char=_)

# Print the sed script that undoes end-of-line hyphenation marked with "¬".
# Takes no arguments and writes the script to stdout (the caller feeds it to
# `sed -f`). The script uses \? and \+, i.e. GNU sed syntax.
#
# What the emitted script does:
#   * "¬ " inside a line becomes "- ";
#   * for a line ending in "¬", the next line is appended (N) and its first
#     word is pulled up in place of the "¬"; a hyphen is kept when that word
#     starts with A-Z; an opening „ stays at the start of the following line;
#   * it then loops, because the line that was pulled from may itself end
#     in "¬".
dehyphenate() {
    # Note carried over from the original: this should be done at runtime, too!
    # One array element per script line; single quotes keep every backslash
    # literal, and printf '%s\n' emits the elements verbatim.
    local -a script_lines=(
        ': start'
        's/¬ /- /g'
        '# unwrap (and move any „ to new start of line)'
        '/¬$/{N'
        '# rule for upper case continuation'
        's/¬\'
        '\(„\)\?\([A-Z][^ ]\+\) \?/-\2\'
        '\1/'
        't start'
        '# rule for normal continuation'
        's/¬\'
        '\(„\)\?\([^ ]\+\) \?/\2\'
        '\1/'
        '# next line could be hyphenated itself'
        'b start'
        '}'
    )
    printf '%s\n' "${script_lines[@]}"
}

# Character normalisation stage, stored as an array and run as the last
# pipeline stage: a perl one-liner executed once per input line (-p).
#   -CS                       stdin/stdout/stderr are UTF-8
#   -MUnicode::Normalize=NFKD provides NFKD()
#   -Mutf8                    the literal characters in the -e snippets are UTF-8
# The first snippet applies NFKD; all following substitutions run in the order
# listed, so when two rules share a pattern only the first can ever match.
#
# Private-use (PUA) codepoints are written as \x{...} escapes rather than as
# literal characters: they are invisible in most editors and easily lost in
# copy/paste, and a rule whose pattern has become empty ("s//X/g") makes perl
# re-apply the last successfully matched pattern instead.
normalize_opts=(
    # decompose
    perl -CS -MUnicode::Normalize=NFKD -Mutf8 -p
    -e '$_ = NFKD($_);'
    # map musical symbols to gap
    -e 's/[\x{1d100}-\x{1d1ff}]/_/g;'
    # map mathematical alphanumeric symbols to gap
    -e 's/[\x{1d400}-\x{1d7ff}]/_/g;'
    # map box drawings to gap
    -e 's/[\x{2500}-\x{257f}]/_/g;'
    # map Canadian syllabics to gap
    -e 's/[\x{1400}-\x{167f}]/_/g;'
    # map Coptic letters to gap
    -e 's/[\x{2c80}-\x{2cff}]/_/g;'
    # replace tab with space
    -e 's/\t/ /g;'
    # normalise some unexpected codepoints
    -e 's/[⋅✕☓]/×/g;'
    -e 's/∗/*/g;'
    -e 's/∼/~/g;'
    -e 's/⁎/*/g;'
    # just (wrongly transcribed) print errors
    -e 's/[ǝə]/e/g;'
    -e 's/[ꟺɯ]/m/g;'
    -e 's/ɹ/r/g;'
    # cedilla instead of low comma
    -e 's/\x{326}/\x{327}/g;'
    # mathematical, normal, and pointing angle brackets
    -e 's/[⟨〈〈]/(/g;'
    -e 's/[⟩〉〉]/)/g;'
    # approximations to mathematical symbols
    -e 's/[⪙≦≤]/</g;'
    -e 's/[⪚≧≥]/>/g;'
    -e 's/≡/=/g;'
    -e 's/∷/~/g;'
    -e 's/∓/±/g;'
    # roman numerals
    -e 's/Ↄ/C/g;'
    -e 's/ↀ/CD/g;'
    # list items
    -e 's/∙/•/g;'
    -e 's/[⁑∸]/•/g;'
    # geometric shapes
    -e 's/[⚬⚪●]/○/g;'
    -e 's/[◻☐■]/□/g;'
    -e 's/∆/△/g;'
    # crosses without distinction
    -e 's/[♰♱✝☩]/✠/g;'
    # variants of pointing finger
    -e 's/[👉☛]/☞/g;'
    -e 's/[👈☚]/☜/g;'
    # Fraktur hyphen (double-oblique) is supposed to be hyphen-minus
    -e 's/⸗/-/g;'
    # Unicode hyphen is supposed to be hyphen-minus
    -e 's/‐/-/g;'
    -e 's/‧/,/g;'
    # variants of dash and minus
    -e 's/[⸺‒—─−―]/–/g;'
    # variants of vertical line
    -e 's/[⏐│‖⏐]/|/g;'
    # consonant ligatures (Fraktur)
    -e 's/ﬃ/ffi/g;'
    -e 's/ﬁ/fi/g;'
    -e 's/ﬂ/fl/g;'
    # other potential MUFI/PUA codepoints
    -e 's/\x{eba6}/ſſ/g;' # presumably U+EBA6, the long s long s ligature also handled further down; TODO confirm
    -e 's/\x{eba7}/ſſi/g;'  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I, U+EBA7
    -e 's/\x{f502}/ch/g;' # Latin small letter c ligated with latin small letter h, U+F502
    -e 's/\x{eec4}/ck/g;' # Latin small ligature ck, U+EEC4
    -e 's/ﬅ/ſt/g;'
    -e 's/ﬁ/fi/g;'
    -e 's/ﬀ/ff/g;'
    -e 's/ﬂ/fl/g;'
    -e 's/ﬃ/ffi/g;'
    -e 's/\x{f4fc}/ſk/g;' # presumably MUFI LATIN SMALL LIGATURE LONG S K, U+F4FC; TODO confirm
    -e 's/\x{eedc}/tz/g;'       # MUFI: LATIN SMALL LIGATURE TZ (presumably U+EEDC; TODO confirm)
    -e 's/\x{f532}/as/g;'  # eMOP: Latin small ligature as, U+f532
    -e 's/\x{f533}/is/g;'  # eMOP: Latin small ligature is, U+f533
    -e 's/\x{f534}/us/g;'  # eMOP: Latin small ligature us, U+f534
    -e 's/\x{f535}/Qu/g;'  # eMOP: Latin ligature capital Q small u, U+f535
    -e 's/ĳ/ij/g;'       # U+0133 LATIN SMALL LIGATURE IJ
    -e 's/\x{e8bf}/q&/g;'  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET, U+E8BF
    -e 's/\x{eba5}/ſp/g;'  # MUFI: LATIN SMALL LIGATURE LONG S P, U+EBA5
    -e 's/ﬆ/st/g;'      # U+FB06 LATIN SMALL LIGATURE ST
    -e 's/[\x{309}\x{1de3}]/\x{1dce}/g;' # combining ogonek above (U+1DCE) instead of combining r rotunda (U+1DE3) or combining hook above (U+0309)
    -e 's/\x{eada}/ſt/g;' # PUA EADA -> ſt
    -e 's/\x{eba2}/ſi/g;' # PUA EBA2 -> ſi
    -e 's/\x{eba3}/ſl/g;' # PUA EBA3 -> ſl
    -e 's/\x{eba6}/ſſ/g;' # PUA EBA6 -> ſſ
    -e 's/\x{eba7}/ſſi/g;' # PUA EBA7 -> ſſi (already handled by the U+EBA7 rule further up)
    -e 's/\x{f4ff}/ſſt/g;' # PUA F4FF -> ſſt
    -e 's/\x{f52c}/ſp/g;' # PUA F52C -> ſp
    -e 's/\x{eec5}/ct/g;' # PUA EEC5 -> ct
    -e 's/\x{eecb}/ft/g;' # PUA EECB -> ft
    -e 's/\x{eedc}/tʒ/g;' # PUA EEDC -> tʒ
    -e 's/\x{e5d2}/m̃/g;' # PUA E5D2 -> m̃
    -e 's/\x{e5dc}/ñ/g;' # PUA E5DC -> ñ
    -e 's/\x{e665}/p̃/g;' # PUA E665 -> p + ...
    -e 's/\x{e8bf}/qʒ/g;' # PUA E8BF -> q; (or to qʒ, or to que, as you like); never matches, the q& rule for U+E8BF above runs first
    -e 's/\x{e42c}/aͤ/g;' # PUA E42C -> a + U+0364, combining e above
    -e 's/\x{e644}/oͤ/g;' # PUA E644 -> o + U+0364
    -e 's/\x{e72b}/uͤ/g;' # PUA E72B -> u + U+0364
    -e 's/\x{e72d}/ů/g;' # PUA E72D -> U+016F
    -e 's/\x{ebac}/ß/g;' # PUA EBAC -> ß (check for correct meaning)
    -e 's/\x{e8b7}/ß/g;' # PUA E8B7 -> ß (proper replacement in some German printings)
    #-e 's/\x{e8b7}/ſᷣ/g;' # PUA E8B7 -> ſ with combining r rotunda (in some Latin printings)
    -e 's/\x{f1a6}/ꝰ/g;' # PUA F1A6 -> U+A770, modifier letter us
    -e 's/\x{f223}/m/g;' # PUA F223 -> m
    -e 's/\x{f158}/⁊/g;' # PUA F158 -> U+204A (Tironian et)
    -e 's/\x{f159}/ð/g;' # PUA F159 -> eth, U+00F0
    -e 's/\x{f160}/:/g;' # PUA F160 -> :
    -e 's/q\x{f02f}/qͥ/g;' # q + PUA F02F -> q + small letter i above (U+0365)
    -e 's/t\x{f1cc}/t᷑/g;' # t + PUA F1CC -> t + combining ur above (U+1DD1)
    -e 's/\x{f4f9}/ll/g;' # PUA F4F9 -> ll
    -e 's/\x{f50e}/q́/g;' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    # variants of quotation marks
    -e 's/〞/〟/g;'
    -e 's/‟/„/g;'
    # “ ?
    # " ?
    -e 's/‛/‚/g;'
    # ‘ ’ ?
    # rare currencies
    -e 's/[₽₤¥₰ℳ\$]/£/g;'
    # metric symbols
    -e 's/[⍘⏝‿⏓⏖]/⏑/g;'
    -e 's/─́/–́/g;'
    # remaining ¬ ?
    # mere decorum, remove
    -e 's/〰//g;'
    # only in formulae (mostly schroeder_logik), remove
    -e 's/[ɟ⋹∈∉∪⊃⊂⊆⊄≶≠≗‥∘÷∶∴∵⌣⌢∽⦂Ↄ⊽∨∧⋁⋀∥═∺≺⊥◡◠⁀∑∟∢∠∝∾◦]/ /g;'
    # not well defined (frequent but single use-case/book, not usual semantics), replace with gap character
    -e 's/\x{fffc}/_/g;' # object replacement character
    -e 's/\x{feff}//g;' # byte-order mark / zero-width non-breaking space
    -e 's/\x{ad}//g;' # soft hyphen
    -e 's/[▭⊙✽℣☘卍↙⟞⟴ʃ※⊕؛©®€]/_/g;'
    # conflate multiple gaps/spaces generated by above substitutions
    -e 's/  +/ /g;'
    -e 's/__+/_/g;'
)

# Usage: tei2txt.sh [INPUT [OUTPUT]]
# Without INPUT the TEI document is read from stdin; without OUTPUT the text
# goes to stdout.
#
# In its default (non-POSIX) mode bash only returns a non-zero status when an
# `exec` redirection fails; it does not abort. Without the explicit exits the
# script would carry on with the inherited stdin/stdout after a bad path.
if (($# > 0)); then
    exec < "$1" || exit 1
fi
if (($# > 1)); then
    exec > "$2" || exit 1
fi
# Make the exit status reflect a failure in any stage, not only the last one.
set -o pipefail
# strip non-text elements | TEI -> plain text | undo hyphenation | normalise characters
"${nontext_opts[@]}" | "${tei2txt_opts[@]}" | sed -f <(dehyphenate) | "${normalize_opts[@]}"
	#!/bin/bash

	# Command (stored as an array, run as the first pipeline stage) that strips
	# non-running-text material from a TEI document read on stdin:
	#   xmlstarlet ed   edit mode; the result is written to stdout
	#   -N tei=...      binds the prefix "tei" to the TEI namespace for the XPaths
	#   -d XPATH        deletes every node matched by XPATH
	nontext_opts=(
		xmlstarlet ed -N tei=http://www.tei-c.org/ns/1.0
		# elements removed wherever they occur
		-d //tei:note
		-d //tei:fw
		-d //tei:table
		-d //tei:figure
		-d //tei:formula
		-d //tei:titlePage
		# whole divisions removed according to their @type
		-d '//tei:div[@type="index"]'
		-d '//tei:div[@type="imprint"]'
		-d '//tei:div[@type="frontispiece"]'
		-d '//tei:div[@type="advertisement"]'
		-d '//tei:div[@type="copyright"]'
		# historical misspelling, kept so that existing behaviour is unchanged
		-d '//tei:div[@type="abbrevations"]'
		# correctly spelled value; the misspelled rule above can never match it
		-d '//tei:div[@type="abbreviations"]'
	)

	# Command line (as an array) for the TEI-to-plain-text stage of the pipeline.
	# Every "-p name=value" pair is passed through to tei2txt.pl unchanged; the
	# meaning of each parameter is defined by that tool, not by this script.
	tei2txt_opts=(tei2txt.pl)
	# page / form-feed / line-number switches, all set to 0
	tei2txt_opts+=(-p show_page_numbers=0 -p show_form_feed=0 -p show_line_numbers=0)
	# Bogensignatur = sheet signature, Kolumnentitel = running head; with the
	# catchword switch, all set to 0
	tei2txt_opts+=(-p show_bogensignatur=0 -p show_kolumnentitel=0 -p show_catchword=0)
	# "_" as gap character (the same character normalize_opts uses for gaps)
	tei2txt_opts+=(-p gap_char=_)

	# Print the sed script that undoes end-of-line hyphenation marked with "¬".
	# Takes no arguments and writes the script to stdout (the caller feeds it to
	# `sed -f`). The script uses \? and \+, i.e. GNU sed syntax.
	#
	# What the emitted script does:
	#   * "¬ " inside a line becomes "- ";
	#   * for a line ending in "¬", the next line is appended (N) and its first
	#     word is pulled up in place of the "¬"; a hyphen is kept when that word
	#     starts with A-Z; an opening „ stays at the start of the following line;
	#   * it then loops, because the line that was pulled from may itself end
	#     in "¬".
	#
	# The script is emitted with printf from an array rather than from a
	# here-document: a plain `<<` here-document must not be indented (neither
	# its body nor its terminator), which does not survive re-indentation of
	# this file.
	dehyphenate() {
		# Note carried over from the original: this should be done at runtime, too!
		# One array element per script line; single quotes keep every backslash
		# literal, and printf '%s\n' emits the elements verbatim.
		local -a script_lines=(
			': start'
			's/¬ /- /g'
			'# unwrap (and move any „ to new start of line)'
			'/¬$/{N'
			'# rule for upper case continuation'
			's/¬\'
			'\(„\)\?\([A-Z][^ ]\+\) \?/-\2\'
			'\1/'
			't start'
			'# rule for normal continuation'
			's/¬\'
			'\(„\)\?\([^ ]\+\) \?/\2\'
			'\1/'
			'# next line could be hyphenated itself'
			'b start'
			'}'
		)
		printf '%s\n' "${script_lines[@]}"
	}

	# Character normalisation stage, stored as an array and run as the last
	# pipeline stage: a perl one-liner executed once per input line (-p).
	#   -CS                       stdin/stdout/stderr are UTF-8
	#   -MUnicode::Normalize=NFKD provides NFKD()
	#   -Mutf8                    the literal characters in the -e snippets are UTF-8
	# The first snippet applies NFKD; all following substitutions run in the order
	# listed, so when two rules share a pattern only the first can ever match.
	#
	# Private-use (PUA) codepoints are written as \x{...} escapes rather than as
	# literal characters: they are invisible in most editors and easily lost in
	# copy/paste, and a rule whose pattern has become empty ("s//X/g") makes perl
	# re-apply the last successfully matched pattern instead.
	normalize_opts=(
		# decompose
		perl -CS -MUnicode::Normalize=NFKD -Mutf8 -p
		-e '$_ = NFKD($_);'
		# map musical symbols to gap
		-e 's/[\x{1d100}-\x{1d1ff}]/_/g;'
		# map mathematical alphanumeric symbols to gap
		-e 's/[\x{1d400}-\x{1d7ff}]/_/g;'
		# map box drawings to gap
		-e 's/[\x{2500}-\x{257f}]/_/g;'
		# map Canadian syllabics to gap
		-e 's/[\x{1400}-\x{167f}]/_/g;'
		# map Coptic letters to gap
		-e 's/[\x{2c80}-\x{2cff}]/_/g;'
		# replace tab with space
		-e 's/\t/ /g;'
		# normalise some unexpected codepoints
		-e 's/[⋅✕☓]/×/g;'
		-e 's/∗/*/g;'
		-e 's/∼/~/g;'
		-e 's/⁎/*/g;'
		# just (wrongly transcribed) print errors
		-e 's/[ǝə]/e/g;'
		-e 's/[ꟺɯ]/m/g;'
		-e 's/ɹ/r/g;'
		# cedilla instead of low comma
		-e 's/\x{326}/\x{327}/g;'
		# mathematical, normal, and pointing angle brackets
		-e 's/[⟨〈〈]/(/g;'
		-e 's/[⟩〉〉]/)/g;'
		# approximations to mathematical symbols
		-e 's/[⪙≦≤]/</g;'
		-e 's/[⪚≧≥]/>/g;'
		-e 's/≡/=/g;'
		-e 's/∷/~/g;'
		-e 's/∓/±/g;'
		# roman numerals
		-e 's/Ↄ/C/g;'
		-e 's/ↀ/CD/g;'
		# list items
		-e 's/∙/•/g;'
		-e 's/[⁑∸]/•/g;'
		# geometric shapes
		-e 's/[⚬⚪●]/○/g;'
		-e 's/[◻☐■]/□/g;'
		-e 's/∆/△/g;'
		# crosses without distinction
		-e 's/[♰♱✝☩]/✠/g;'
		# variants of pointing finger
		-e 's/[👉☛]/☞/g;'
		-e 's/[👈☚]/☜/g;'
		# Fraktur hyphen (double-oblique) is supposed to be hyphen-minus
		-e 's/⸗/-/g;'
		# Unicode hyphen is supposed to be hyphen-minus
		-e 's/‐/-/g;'
		-e 's/‧/,/g;'
		# variants of dash and minus
		-e 's/[⸺‒—─−―]/–/g;'
		# variants of vertical line
		-e 's/[⏐│‖⏐]/|/g;'
		# consonant ligatures (Fraktur)
		-e 's/ﬃ/ffi/g;'
		-e 's/ﬁ/fi/g;'
		-e 's/ﬂ/fl/g;'
		# other potential MUFI/PUA codepoints
		-e 's/\x{eba6}/ſſ/g;' # presumably U+EBA6, the long s long s ligature also handled further down; TODO confirm
		-e 's/\x{eba7}/ſſi/g;' # MUFI: LATIN SMALL LIGATURE LONG S LONG S I, U+EBA7
		-e 's/\x{f502}/ch/g;' # Latin small letter c ligated with latin small letter h, U+F502
		-e 's/\x{eec4}/ck/g;' # Latin small ligature ck, U+EEC4
		-e 's/ﬅ/ſt/g;'
		-e 's/ﬁ/fi/g;'
		-e 's/ﬀ/ff/g;'
		-e 's/ﬂ/fl/g;'
		-e 's/ﬃ/ffi/g;'
		-e 's/\x{f4fc}/ſk/g;' # presumably MUFI LATIN SMALL LIGATURE LONG S K, U+F4FC; TODO confirm
		-e 's/\x{eedc}/tz/g;' # MUFI: LATIN SMALL LIGATURE TZ (presumably U+EEDC; TODO confirm)
		-e 's/\x{f532}/as/g;' # eMOP: Latin small ligature as, U+f532
		-e 's/\x{f533}/is/g;' # eMOP: Latin small ligature is, U+f533
		-e 's/\x{f534}/us/g;' # eMOP: Latin small ligature us, U+f534
		-e 's/\x{f535}/Qu/g;' # eMOP: Latin ligature capital Q small u, U+f535
		-e 's/ĳ/ij/g;' # U+0133 LATIN SMALL LIGATURE IJ
		-e 's/\x{e8bf}/q&/g;' # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET, U+E8BF
		-e 's/\x{eba5}/ſp/g;' # MUFI: LATIN SMALL LIGATURE LONG S P, U+EBA5
		-e 's/ﬆ/st/g;' # U+FB06 LATIN SMALL LIGATURE ST
		-e 's/[\x{309}\x{1de3}]/\x{1dce}/g;' # combining ogonek above (U+1DCE) instead of combining r rotunda (U+1DE3) or combining hook above (U+0309)
		-e 's/\x{eada}/ſt/g;' # PUA EADA -> ſt
		-e 's/\x{eba2}/ſi/g;' # PUA EBA2 -> ſi
		-e 's/\x{eba3}/ſl/g;' # PUA EBA3 -> ſl
		-e 's/\x{eba6}/ſſ/g;' # PUA EBA6 -> ſſ
		-e 's/\x{eba7}/ſſi/g;' # PUA EBA7 -> ſſi (already handled by the U+EBA7 rule further up)
		-e 's/\x{f4ff}/ſſt/g;' # PUA F4FF -> ſſt
		-e 's/\x{f52c}/ſp/g;' # PUA F52C -> ſp
		-e 's/\x{eec5}/ct/g;' # PUA EEC5 -> ct
		-e 's/\x{eecb}/ft/g;' # PUA EECB -> ft
		-e 's/\x{eedc}/tʒ/g;' # PUA EEDC -> tʒ
		-e 's/\x{e5d2}/m̃/g;' # PUA E5D2 -> m̃
		-e 's/\x{e5dc}/ñ/g;' # PUA E5DC -> ñ
		-e 's/\x{e665}/p̃/g;' # PUA E665 -> p + ...
		-e 's/\x{e8bf}/qʒ/g;' # PUA E8BF -> q; (or to qʒ, or to que, as you like); never matches, the q& rule for U+E8BF above runs first
		-e 's/\x{e42c}/aͤ/g;' # PUA E42C -> a + U+0364, combining e above
		-e 's/\x{e644}/oͤ/g;' # PUA E644 -> o + U+0364
		-e 's/\x{e72b}/uͤ/g;' # PUA E72B -> u + U+0364
		-e 's/\x{e72d}/ů/g;' # PUA E72D -> U+016F
		-e 's/\x{ebac}/ß/g;' # PUA EBAC -> ß (check for correct meaning)
		-e 's/\x{e8b7}/ß/g;' # PUA E8B7 -> ß (proper replacement in some German printings)
		#-e 's/\x{e8b7}/ſᷣ/g;' # PUA E8B7 -> ſ with combining r rotunda (in some Latin printings)
		-e 's/\x{f1a6}/ꝰ/g;' # PUA F1A6 -> U+A770, modifier letter us
		-e 's/\x{f223}/m/g;' # PUA F223 -> m
		-e 's/\x{f158}/⁊/g;' # PUA F158 -> U+204A (Tironian et)
		-e 's/\x{f159}/ð/g;' # PUA F159 -> eth, U+00F0
		-e 's/\x{f160}/:/g;' # PUA F160 -> :
		-e 's/q\x{f02f}/qͥ/g;' # q + PUA F02F -> q + small letter i above (U+0365)
		-e 's/t\x{f1cc}/t᷑/g;' # t + PUA F1CC -> t + combining ur above (U+1DD1)
		-e 's/\x{f4f9}/ll/g;' # PUA F4F9 -> ll
		-e 's/\x{f50e}/q́/g;' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
		# variants of quotation marks
		-e 's/〞/〟/g;'
		-e 's/‟/„/g;'
		# “ ?
		# " ?
		-e 's/‛/‚/g;'
		# ‘ ’ ?
		# rare currencies
		-e 's/[₽₤¥₰ℳ\$]/£/g;'
		# metric symbols
		-e 's/[⍘⏝‿⏓⏖]/⏑/g;'
		-e 's/─́/–́/g;'
		# remaining ¬ ?
		# mere decorum, remove
		-e 's/〰//g;'
		# only in formulae (mostly schroeder_logik), remove
		-e 's/[ɟ⋹∈∉∪⊃⊂⊆⊄≶≠≗‥∘÷∶∴∵⌣⌢∽⦂Ↄ⊽∨∧⋁⋀∥═∺≺⊥◡◠⁀∑∟∢∠∝∾◦]/ /g;'
		# not well defined (frequent but single use-case/book, not usual semantics), replace with gap character
		-e 's/\x{fffc}/_/g;' # object replacement character
		-e 's/\x{feff}//g;' # byte-order mark / zero-width non-breaking space
		-e 's/\x{ad}//g;' # soft hyphen
		-e 's/[▭⊙✽℣☘卍↙⟞⟴ʃ※⊕؛©®€]/_/g;'
		# conflate multiple gaps/spaces generated by above substitutions
		-e 's/  +/ /g;'
		-e 's/__+/_/g;'
	)

	# Usage: tei2txt.sh [INPUT [OUTPUT]]
	# Without INPUT the TEI document is read from stdin; without OUTPUT the text
	# goes to stdout.
	#
	# In its default (non-POSIX) mode bash only returns a non-zero status when an
	# `exec` redirection fails; it does not abort. Without the explicit exits the
	# script would carry on with the inherited stdin/stdout after a bad path.
	if (($# > 0)); then
		exec < "$1" || exit 1
	fi
	if (($# > 1)); then
		exec > "$2" || exit 1
	fi
	# Make the exit status reflect a failure in any stage, not only the last one.
	set -o pipefail
	# The stages must be joined by real pipes: a backslash-escaped "\|" is passed
	# to the first command as a literal "|" argument and no pipeline is built.
	# strip non-text elements | TEI -> plain text | undo hyphenation | normalise characters
	"${nontext_opts[@]}" | "${tei2txt_opts[@]}" | sed -f <(dehyphenate) | "${normalize_opts[@]}"