Skip to content

Instantly share code, notes, and snippets.

@epoz
Created December 8, 2023 14:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save epoz/7b4c99efbee161d6d95dfa4df8af4513 to your computer and use it in GitHub Desktop.
Save epoz/7b4c99efbee161d6d95dfa4df8af4513 to your computer and use it in GitHub Desktop.
Test converting the UVA Library Beeldbank MARC XML data to simple RDF
from xml.etree import ElementTree as ET
import marcalyx
import pyoxigraph as px
import os
from rich.progress import track
# Path of the MARC XML input file; can be overridden through the
# MARCXML_FILENAME environment variable.
MARCXML_FILENAME = os.getenv(
    "MARCXML_FILENAME",
    "uva_alma_beeldbank_marc_new.xml",
)
def main():
    """Convert the MARC XML file named by MARCXML_FILENAME to N-Triples.

    Parses the file, wraps it in a ``marcalyx.Collection`` and, for every
    record, adds triples to an in-memory ``pyoxigraph`` store: an rdf:type,
    a prov:wasDerivedFrom link built from field 001, an rdfs:label built
    from 245 a/b/c, and Dublin Core identifier, description, references,
    creator, subject and spatial statements. The store is then dumped to
    ``ap.nt`` in the current working directory.

    Raises whatever the underlying libraries raise on unreadable input; a
    record lacking field 001 or 024$a is not handled specially here.
    """
    print("Parsing", MARCXML_FILENAME)
    doc = ET.parse(MARCXML_FILENAME).getroot()
    coll = marcalyx.Collection(doc)
    G = px.Store()

    def tu(uid, field, val):
        """Add ``uid field val`` where ``val`` is a resource.

        ``val`` is either an IRI string (wrapped in a NamedNode) or an
        existing blank node. Falsy values are skipped.
        """
        if not val:
            return
        if isinstance(val, px.BlankNode):
            G.add(px.Quad(uid, px.NamedNode(field), val))
        else:
            G.add(px.Quad(uid, px.NamedNode(field), px.NamedNode(val)))

    def tl(uid, field, val, language="nl", datatype=None):
        """Add ``uid field "val"`` as a literal.

        ``<p>`` tags are dropped and ``</p>`` becomes a newline. When
        ``datatype`` (an IRI string) is given the literal is typed,
        otherwise it is tagged with ``language`` (``None`` for no tag).
        Falsy values are skipped.
        """
        if not val:
            return
        val = val.replace("<p>", "").replace("</p>", "\n")
        if datatype:
            literal = px.Literal(val, datatype=px.NamedNode(datatype))
        else:
            literal = px.Literal(val, language=language)
        # Quad takes subject, predicate and object as separate arguments
        # (the original typed-literal branch wrapped them in one tuple).
        G.add(px.Quad(uid, px.NamedNode(field), literal))

    # Cache of place name -> blank node so each place is labelled once.
    places = {}
    for marc in track(coll.records()):
        mid = marc["001"][0].value
        uid = px.NamedNode(marc["024"][0]["a"][0].value)
        tu(
            uid,
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
            "http://www.cidoc-crm.org/cidoc-crm/E22_Human-Made_Object",
        )
        tu(
            uid,
            "http://www.w3.org/ns/prov#wasDerivedFrom",
            f"https://lib.uva.nl/permalink/31UKB_UAM1_INST/2nbb0e/alma{mid}",
        )

        # Join only the 245 subfields that are present, so a missing b or c
        # leaves no stray spaces and an absent 245 yields no label at all.
        title_parts = (
            "\n".join(x.value for x in marc[("245", sub)])
            for sub in ("a", "b", "c")
        )
        tl(
            uid,
            "http://www.w3.org/2000/01/rdf-schema#label",
            " ".join(part for part in title_parts if part),
        )
        tl(uid, "http://purl.org/dc/terms/identifier", mid, language=None)

        for code in ("500", "520"):
            tl(
                uid,
                "http://purl.org/dc/terms/description",
                "\n".join(x.value for x in marc[(code, "a")]),
            )

        for code in ("510", "581"):
            for x in marc[(code, "a")]:
                tl(uid, "http://purl.org/dc/terms/references", x.value)

        # Subfield 0 values that look like URLs become resources; anything
        # else is kept as a plain literal.
        for x in marc[("600", "0")]:
            if x.value.startswith("http"):
                tu(uid, "http://purl.org/dc/terms/creator", x.value)
            else:
                tl(uid, "http://purl.org/dc/terms/creator", x.value, language=None)

        for code in ("650", "651", "655"):
            for x in marc[(code, "0")]:
                if x.value.startswith("http"):
                    tu(uid, "http://purl.org/dc/terms/subject", x.value)
                else:
                    tl(uid, "http://purl.org/dc/terms/subject", x.value, language=None)

        # Add the place-of-origin in 694f with a blank node and a label
        for x in marc[("694", "f")]:
            place = places.get(x.value)
            if not place:
                place = px.BlankNode()
                tl(place, "http://www.w3.org/2000/01/rdf-schema#label", x.value)
                places[x.value] = place
            tu(uid, "http://purl.org/dc/terms/spatial", place)

        # NOTE: combining 100a, 100d and 100e would be interesting for seeing
        # how a person is referenced and related, see:
        # https://hdl.handle.net/11245/3.3707

    # Close the output file deterministically once the dump has finished.
    with open("ap.nt", "wb") as out:
        G.dump(out, "application/n-triples")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment