Skip to content

Instantly share code, notes, and snippets.

@benosteen
Created March 3, 2017 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benosteen/482b0fb66855e8185bce6f18920b6e6b to your computer and use it in GitHub Desktop.
Save benosteen/482b0fb66855e8185bce6f18920b6e6b to your computer and use it in GitHub Desktop.
Quick and dirty ALTO XML to txt
#python3
import html
import os
import re
# Captures the value of an element's CONTENT attribute. The leading \b stops
# the pattern from matching inside SUBS_CONTENT="...", so the real CONTENT is
# found regardless of the order in which the attributes are written.
text_p = re.compile(r"\bCONTENT=\"([^\"]*)\"", re.U)
# A closing </TextLine> tag ends one line of output text.
line_p = re.compile(r"</TextLine>", re.U)
# A physical line that starts with a <HYP .../> element (optionally with a
# namespace prefix).
_hyp_p = re.compile(r"\s*<(?:\w+:)?HYP\b", re.U)


def get_text(alto_filepath):
    """Extract the plain text of one ALTO XML file.

    The file is scanned line by line rather than parsed as XML, so only the
    first CONTENT attribute on each physical line is read; the input is
    expected to carry one element per line.

    Hyphen handling:

    * A word ending in "-" that is immediately followed, before the end of
      the TextLine, by a word starting with "-" is merged into one word
      with both hyphens dropped. If the following word does not start with
      "-", both words are kept as they are.
    * ALTO may mark an end-of-line hyphen as its own element:

          <String CONTENT="shap" SUBS_TYPE="HypPart1" SUBS_CONTENT="shaping"/>
          <HYP CONTENT="-"/>

      The HYP content is glued onto the preceding word ("shap-") instead of
      being emitted as a separate word. The two halves of the word are not
      re-joined across TextLines.

    XML character and entity references inside CONTENT (``&amp;``,
    ``&quot;``, ``&#8212;`` ...) are decoded.

    Args:
        alto_filepath: Path of the UTF-8 encoded ALTO XML file.

    Returns:
        The text as a single string: one line per TextLine, each terminated
        by a newline, with words separated by single spaces. Words found
        after the last ``</TextLine>`` are appended without a trailing
        newline.
    """
    finished_lines = []
    words = []
    current = ""  # word ending in a hyphen, waiting to see what follows it
    with open(alto_filepath, "r", encoding="utf-8") as alto:
        for line in alto:
            if line_p.search(line) is not None:
                # End of a text line: flush the pending word and the line.
                if current:
                    words.append(current)
                finished_lines.append(" ".join(words))
                words = []
                current = ""
                continue

            found = text_p.search(line)
            if found is None:
                continue
            txt = html.unescape(found.group(1))

            if _hyp_p.match(line) is not None:
                # The hyphenation mark belongs to the word before it; make
                # that word the pending one so the rules below (and the
                # end-of-line flush) treat it like any "word-" token.
                if not current and words:
                    current = words.pop()
                current += txt
                continue

            if current:
                if txt.startswith("-"):
                    # Hyphen reciprocated on both sides: merge the halves.
                    words.append(current[:-1] + txt[1:])
                else:
                    words.append(current)
                    words.append(txt)
                current = ""
            elif txt.endswith("-"):
                current = txt
            else:
                words.append(txt)

    if current:
        words.append(current)
    finished_lines.append(" ".join(words))
    return "\n".join(finished_lines)
def get_book(book_id, directorypath, txtfilepath):
    """Write the text of every page file of a book into one text file.

    Every entry of ``directorypath`` whose name starts with ``book_id`` is
    passed to ``get_text`` in sorted name order, and each result is written
    to ``txtfilepath`` (UTF-8) followed by a newline.

    Args:
        book_id: Filename prefix selecting the files that belong to the book.
        directorypath: Directory holding the ALTO XML files.
        txtfilepath: Path of the text file to create or overwrite.
    """
    with open(txtfilepath, "w", encoding="utf-8") as txtout:
        # The listing is taken after the output file has been opened, as
        # before, so the set of files seen is unchanged.
        matching = (
            name for name in os.listdir(directorypath)
            if name.startswith(book_id)
        )
        for page_name in sorted(matching):
            page_path = os.path.join(directorypath, page_name)
            txtout.write(get_text(page_path))
            txtout.write("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment