Skip to content

Instantly share code, notes, and snippets.

@bertsky
Last active January 10, 2024 16:20
Show Gist options
  • Save bertsky/76365fc92d7476218a5d12549c83a840 to your computer and use it in GitHub Desktop.
Save bertsky/76365fc92d7476218a5d12549c83a840 to your computer and use it in GitHub Desktop.
dump METS files from an OAI harvest (metha-cat output after running metha-sync), with recursive METS downloads for multipart works
#!/usr/bin/env python3
import sys
from lxml import etree as ET
from ocrd_models.constants import NAMESPACES
# extend the OCR-D namespace map with the OAI-PMH namespace
NAMESPACES['oai'] = "http://www.openarchives.org/OAI/2.0/"
for curie, namespace in NAMESPACES.items():
    ET.register_namespace(curie, namespace)
# A full harvest dump can be too large to parse into memory at once
# (e.g. via ElementTree.parse on stdin), so the script below uses
# incremental (streaming) parsing via ET.iterparse instead:
def fast_iter(context, func, *args, **kwargs):
    """
    Stream through an ``iterparse`` *context*, calling ``func(elem, *args,
    **kwargs)`` on each matched element, and aggressively free memory as we go
    so arbitrarily large documents can be processed with bounded RAM.

    http://lxml.de/parsing.html#modifying-the-tree
    Based on Liza Daly's fast_iter
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm
    """
    for _, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        # (delete every already-processed preceding sibling up the tree,
        # otherwise the root keeps the whole document alive)
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
    del context
def write_mets(mets, name):
    """Serialize the lxml tree *mets* to the file ``<name>.xml``.

    Output is UTF-8 encoded and pretty-printed.
    """
    with open(name + '.xml', 'wb') as output:
        output.write(ET.tostring(mets, pretty_print=True, encoding='utf-8'))
def download_mets(url):
    """Fetch a METS document from *url* and return it as a parsed lxml Element.

    Transient HTTP failures are retried up to 5 times; a final non-success
    status raises ``requests.HTTPError`` (via ``raise_for_status``).
    """
    # BUG FIX: Session / Retry / HTTPAdapter were used but never imported
    # anywhere in this file, so this function raised NameError on first call.
    # Import locally to keep the fix self-contained.
    from requests import Session
    from requests.adapters import HTTPAdapter, Retry
    session = Session()
    retries = Retry(total=5, status_forcelist=[
        # only transient failures (probably too wide):
        408, 409, 412, 417, 423, 424, 425, 426, 428, 429, 440, 500, 503, 504, 509, 529, 598, 599])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    response = session.get(url, timeout=3)
    response.raise_for_status()
    # parse the raw bytes; lxml handles the XML encoding declaration itself
    return ET.fromstring(response.content)
def dive_mets(mets, oai_id, level):
    """Write *mets* to disk under *oai_id*, recursing into multipart works.

    *mets* is an ElementTree on the top-level call (level 0) and an Element
    on recursive calls.  If the deepest ``mets:div`` entries of the LOGICAL
    structMap carry ``mets:mptr`` links, each linked METS is downloaded and
    processed recursively (at most 2 levels deep, per the METS application
    profile); otherwise the current METS is written out.
    """
    # BUG FIX: the original `if not mets:` is always False for an ElementTree
    # (plain object truthiness) and deprecated for lxml Elements; check the
    # actual root element explicitly instead.
    root = mets.getroot() if hasattr(mets, 'getroot') else mets
    if root is None or not len(root):
        print("empty METS result for %s" % oai_id, file=sys.stderr)
        return
    if level > 2:
        # BUG FIX: oai_id was passed as a second Exception argument
        # (logging-style) instead of being interpolated into the message
        raise Exception("unexpected depth of METS recursion for %s" % oai_id)
    # assert oai_id == mets.find('./mets:dmdSec/mets:mdWrap/mets:xmlData/mods:mods/mods:recordInfo/mods:recordIdentifier[@source="http://digital.slub-dresden.de/oai/"]', namespaces=NAMESPACES).text
    logmap = mets.find('./mets:structMap[@TYPE="LOGICAL"]', namespaces=NAMESPACES)
    # as long as the deepest mets:div have mets:mptr, go recursive on them (up to 2 times)
    mptrs = logmap.xpath('.//mets:div[not(mets:div)]/mets:mptr/@xlink:href', namespaces=NAMESPACES)
    if len(mptrs):
        # a multipart anchor record is not expected to carry its own images
        assert mets.find('./mets:fileSec/mets:fileGrp[@USE="DEFAULT"]', namespaces=NAMESPACES) is None, mptrs
        for url in mptrs:
            mets = download_mets(url)
            if not len(mets):
                print("empty METS result for %s" % url, file=sys.stderr)
                continue
            # use the OAI identifier recorded in the MODS of the downloaded
            # METS, not the identifier of the retrieval record
            oai_id = mets.find('./mets:dmdSec/mets:mdWrap[@MDTYPE="MODS"]/mets:xmlData/mods:mods/mods:recordInfo/mods:recordIdentifier[@source="http://digital.slub-dresden.de/oai/"]', namespaces=NAMESPACES).text
            print("recursive %s" % oai_id)
            dive_mets(mets, oai_id, level + 1)
    else:
        write_mets(mets, oai_id)
def process_record(record):
    """Handle one harvested OAI record: pull out its METS payload and dump it."""
    identifier = record.find('oai:header/oai:identifier', NAMESPACES).text
    print("processing %s" % identifier)
    payload = record.find('oai:metadata/mets:mets', NAMESPACES)
    # wrap the Element in a tree so dive_mets can use document-rooted paths
    dive_mets(ET.ElementTree(payload), identifier, 0)
# process harvest XML from the files named on the command line,
# falling back to (binary) stdin when no arguments are given
inputs = sys.argv[1:] or [sys.stdin.buffer]
for source in inputs:
    context = ET.iterparse(source,
                           encoding='utf-8',
                           recover=True,
                           tag='{%s}record' % NAMESPACES['oai'])
    fast_iter(context, process_record)
@bertsky
Copy link
Author

bertsky commented Dec 20, 2023

  1. install metha
  2. harvest using metha-sync -format mets ...
  3. retrieve using metha-cat -format mets ... | python metha-dump.py

@bertsky
Copy link
Author

bertsky commented Jan 10, 2024

now correctly handles recursive case:

  • if in the logical structMap, there are terminal mets:div with mets:mptr, then instead of writing the current METS, download each of the @href METS and continue with that
  • in this case, use the OAI identifier from the MODS instead of the retrieval record
  • stop recursion after 2 levels, because (due to METS application profile, sections 2.1.2.2 and 2.1.3) we only expect
    • multivolume_workmultivolume_work/volume,
    • periodicalperiodical/volume,
    • newspapernewspaper/yearnewspaper/month/day/issue

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment