Skip to content

Instantly share code, notes, and snippets.

@bertsky
Created June 11, 2024 14:47
Show Gist options
  • Save bertsky/b82f0400b842761e0e3e92917fd19cfc to your computer and use it in GitHub Desktop.
Save bertsky/b82f0400b842761e0e3e92917fd19cfc to your computer and use it in GitHub Desktop.
DFG METS: overwrite PAGE-XML @imageFilename from image fileGrp
import os
import click
from ocrd_utils import MIMETYPE_PAGE
from ocrd_models.ocrd_mets import OcrdMets
from ocrd_models.ocrd_page import to_xml
from ocrd_modelfactory import page_from_file
@click.command()
@click.option('-m', '--mets-file', default="mets.xml", help="path to METS of workspace")
@click.option('-I', '--input-file-grp', required=True, help="fileGrp to pick image files from")
@click.option('-O', '--output-file-grp', required=True, help="fileGrp to modify PAGE files from")
def cli(mets_file, input_file_grp, output_file_grp):
    """
    Overwrite PAGE-XML `@imageFilename` from the image fileGrp of a METS.

    Open the METS file and, for each page that has a PAGE-XML file in the
    output fileGrp, get the local filename of the image file for the same
    page in the input fileGrp, open the PAGE-XML file and overwrite its
    `@imageFilename` with the local path of the image file in the METS.
    Finally serialise the modified PAGE-XML file in place.

    The process changes its working directory to the directory of the
    METS file, so that the local filenames from the METS can be opened
    as they are.

    If any PAGE-XML page has no image file in the input fileGrp, abort
    with an error before modifying any file.
    """
    # Resolve to an absolute path first: for a bare file name (such as the
    # default "mets.xml") os.path.dirname() returns '' and os.chdir('')
    # raises FileNotFoundError.
    mets_path = os.path.abspath(mets_file)
    os.chdir(os.path.dirname(mets_path))
    mets = OcrdMets(filename=os.path.basename(mets_path))

    # page ID -> local filename, restricted by MIME type in each fileGrp
    page_to_image = {
        ocrd_file.pageId: ocrd_file.local_filename
        for ocrd_file in mets.find_all_files(fileGrp=input_file_grp)
        if ocrd_file.mimetype.startswith('image/')
    }
    page_to_page_xml = {
        ocrd_file.pageId: ocrd_file.local_filename
        for ocrd_file in mets.find_all_files(fileGrp=output_file_grp)
        if ocrd_file.mimetype == MIMETYPE_PAGE
    }

    # Validate up front, so that a missing image does not surface as a bare
    # KeyError after some PAGE-XML files have already been rewritten.
    missing = [page_id for page_id in page_to_page_xml
               if page_id not in page_to_image]
    if missing:
        raise click.ClickException(
            "no image file in fileGrp '{}' for page(s): {}".format(
                input_file_grp, ', '.join(map(str, missing))))

    for page_id, page_xml_path in page_to_page_xml.items():
        pcgts = page_from_file(page_xml_path)
        pcgts.get_Page().set_imageFilename(page_to_image[page_id])
        with open(page_xml_path, 'w', encoding='utf-8') as f:
            f.write(to_xml(pcgts))
# Script entry point: run the click command only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment