Skip to content

Instantly share code, notes, and snippets.

@phette23
Created June 27, 2024 18:47
Show Gist options
  • Save phette23/b127eda2b95560043cb2f5b3d205ea51 to your computer and use it in GitHub Desktop.
Save phette23/b127eda2b95560043cb2f5b3d205ea51 to your computer and use it in GitHub Desktop.
find mods/subject/name values in our metadata but not in our names taxonomy
#!/usr/bin/env python
# Run from the root of the cca/vault_migration project over _all_ VAULT metadata JSON files, e.g.
# `poetry run python missing-names.py vm/*.json`
import csv
import json
import os
import sys
import xmltodict
# add migrate directory to python import path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
migrate_dir = os.path.join(parent_dir, "migrate")
sys.path.append(migrate_dir)
from utils import find_items, mklist # type: ignore
from subjects import Subject, find_subjects # type: ignore
# Load the names taxonomy and keep only each entry's "term" value.
# A set is used because `names` is only ever tested for membership (once per
# subject of every item), which is O(1) on a set versus O(n) on a list.
# The encoding is explicit so non-ASCII names decode the same on every platform.
with open("taxos/subject-name-complete.json", "r", encoding="utf-8") as f:
    data = json.load(f)
names = {n["term"] for n in data}
# check for name subjects not in our names list
# `missing` maps each unknown name to the list of "view" links of the items
# whose metadata uses it (one link appended per item occurrence)
missing = {}
# sys.argv[0] is this script's own path, so only the arguments after it are
# metadata files to scan
for file in sys.argv[1:]:
    items = find_items(file)
    for item in items:
        # item["metadata"] is an XML string whose root element is <xml>
        xml = xmltodict.parse(item["metadata"])["xml"]
        subjects: set[Subject] = find_subjects(xml)
        for subject in subjects:
            if subject.type == "Name" and subject.value not in names:
                # create the link list on first sight of a name, then append
                missing.setdefault(subject.value, []).append(item["links"]["view"])
# Write the report: a header row, then one row per missing name holding the
# name followed by one column for each item link that uses it.
# newline="" hands line-ending control to the csv module (without it, extra
# blank rows appear on platforms that translate "\n" to "\r\n"); the encoding
# is explicit so non-ASCII names are written consistently.
with open("taxos/subject-name-missing.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Links"])
    for name, links in missing.items():
        writer.writerow([name, *links])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment