Skip to content

Instantly share code, notes, and snippets.

@ebenenglish
Created August 19, 2021 18:18
Show Gist options
  • Save ebenenglish/0ea5ce0b7728dcbd9f6da1e939903ed1 to your computer and use it in GitHub Desktop.
Save ebenenglish/0ea5ce0b7728dcbd9f6da1e939903ed1 to your computer and use it in GitHub Desktop.
A set of scripts to query the Preservica REST API and create an OAI-PMH response XML file
# Create a static OAI XML file containing every record found in the given
# collections, plus a companion text file listing the ids of objects that
# had no MODS metadata. Both files share the same timestamp suffix so they
# can be matched up afterwards.
# @param sets [Array] collection ids, e.g. ['c1ee8010-fb87-40d6-ac23-be547344c4f2', '2000d071-a976-4536-b173-8772a11c3588', ...]
# @param units_to_ignore [Array] ids of objects to ignore
# @param file_path [String] directory where the output files should be written
# @param credentials [Hash] Preservica REST API credentials: {un: 'foo', pw: 'bar'}
def generate_static_oai_xml(sets, units_to_ignore, file_path, credentials)
  @oai_doc = initialize_oai_xml
  @units_to_ignore = units_to_ignore
  @preservica_rest_api_base = 'https://us.preservica.com/api/entity'
  @credentials = credentials
  @missing_mods = []

  sets.each do |col_id|
    col_resp_xml = preservica_rest_to_xml(col_id, 'structural-objects')
    process_resp_xml(col_resp_xml, col_id)
  end

  # Take the timestamp once: two separate Time.now calls can straddle a
  # second boundary and give the paired files different suffixes.
  timestamp = Time.now.to_i
  File.write("#{file_path}/cba_preservica_mods-harvest_#{timestamp}.xml", @oai_doc.to_s)

  # One id per line, rather than the Ruby inspect form of the array.
  missing_report = @missing_mods.map { |id| "#{id}\n" }.join
  File.write("#{file_path}/cba_preservica_missing-mods_#{timestamp}.txt", missing_report)
end
# Fetch an entity from the Preservica REST API and parse the response body.
# The request URL is built from @preservica_rest_api_base and authenticated
# with the :un / :pw values in @credentials.
# @param id [String] identifier of the object (may include a sub-path)
# @param obj_type [String] the type of object ('structural-objects', 'information-objects')
# @param include_children [Boolean] when truthy, '/children' is appended to the URL
# @return [Nokogiri::XML::Document]
def preservica_rest_to_xml(id, obj_type, include_children = true)
  suffix = include_children ? '/children' : ''
  endpoint = "#{@preservica_rest_api_base}/#{obj_type}/#{id}#{suffix}"
  basic_auth = "#{@credentials[:un]}:#{@credentials[:pw]}"
  response = Typhoeus::Request.get(endpoint, userpwd: basic_auth)
  Nokogiri::XML(response.body)
end
# Walk the children listed in a Preservica REST API response. Information
# objects are handed to process_info_object; structural objects are fetched
# and walked recursively with the same collection id.
# @param xml_doc [Nokogiri::XML::Document]
# @param col_id [String] collection id
# @raise [StandardError] if a child has any other type
def process_resp_xml(xml_doc, col_id)
  get_preservica_children(xml_doc).each do |child|
    kind = child[:type]
    if kind == 'information-objects'
      process_info_object(child[:id], col_id)
    elsif kind == 'structural-objects'
      nested_xml = preservica_rest_to_xml(child[:id], 'structural-objects')
      process_resp_xml(nested_xml, col_id)
    else
      raise StandardError,
            "Unprocessable entry type '#{kind}' found in: \n#{xml_doc}"
    end
  end
end
# Fetch a single information-object and add its MODS record to @oai_doc.
# Ids listed in @units_to_ignore are skipped without any request; ids with
# no MODS record are appended to @missing_mods instead.
# @param info_obj_id [String] information-object id
# @param col_id [String] collection id
def process_info_object(info_obj_id, col_id)
  return if @units_to_ignore.include?(info_obj_id)

  response_xml = preservica_rest_to_xml(info_obj_id, 'information-objects', false)
  mods = get_info_obj_mods(response_xml, info_obj_id)
  return @missing_mods << info_obj_id unless mods.present?

  insert_record(@oai_doc, info_obj_id, col_id, mods)
end
# Extract the child entries listed in a REST API response.
# @param xml_doc [Nokogiri::XML::Document]
# @return [Array<Hash>] one hash per <Child> node with :id (the 'ref'
#   attribute), :title (the 'title' attribute) and :type keys; :id and
#   :title are nil when the attribute is absent
def get_preservica_children(xml_doc)
  xml_doc.xpath('//xmlns:Children/xmlns:Child').map do |node|
    attrs = node.attributes
    {
      id: attrs['ref']&.value,
      title: attrs['title']&.value,
      type: parse_for_child_type(node)
    }
  end
end
# Determine the object type from a node whose text is an entity URL.
# The API base URL (plus trailing slash) is removed from the text, then the
# leading run of letters and hyphens is taken as the type.
# @param node [Nokogiri::XML::Element]
# @return [String] e.g. 'information-objects'; empty when the remaining
#   text does not start with a letter or hyphen
def parse_for_child_type(node)
  base_prefix = "#{@preservica_rest_api_base}/"
  relative_path = node.text.gsub(base_prefix, '')
  relative_path.slice(/[a-zA-Z-]*/)
end
# Get the MODS record for an information-object. Looks for the metadata
# Fragment whose 'schema' attribute is the MODS v3 namespace, takes the
# trailing id from that fragment's URL text, fetches that metadata document
# and returns its mods:mods nodes.
# @param xml_doc [Nokogiri::XML::Document] information-object response
# @param info_obj_id [String]
# @return [Nokogiri::XML::NodeSet, nil] nil when there are no fragments or
#   none of them uses the MODS schema
def get_info_obj_mods(xml_doc, info_obj_id)
  fragment_nodes = xml_doc.xpath('//xmlns:AdditionalInformation/xmlns:Metadata/xmlns:Fragment')
  return if fragment_nodes.blank?

  mods_fragment = fragment_nodes.find do |fragment|
    fragment.attributes['schema']&.value == 'http://www.loc.gov/mods/v3'
  end
  mods_url = mods_fragment&.text
  return unless mods_url

  # the metadata id is the final run of [0-9a-z-] characters in the URL
  metadata_id = mods_url[/[0-9a-z-]*\z/].to_s
  mods_doc = preservica_rest_to_xml("#{info_obj_id}/metadata/#{metadata_id}",
                                    'information-objects', false)
  mods_doc ? mods_doc.xpath('//mods:mods', 'mods' => 'http://www.loc.gov/mods/v3') : nil
end
# Create an empty OAI ListRecords XML document: an <OAI-PMH> root holding
# <responseDate>, <request> and an empty <ListRecords> element.
# @return [Nokogiri::XML::Document]
def initialize_oai_xml
  # OAI-PMH expresses responseDate in UTC (YYYY-MM-DDThh:mm:ssZ); strftime is
  # core Ruby, so this does not depend on the 'time' library being loaded.
  response_date = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
  # Every element is closed explicitly so the document is well-formed and
  # does not rely on the parser recovering from unclosed tags.
  oai_starter = <<~XML
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
    <responseDate>#{response_date}</responseDate>
    <request verb="ListRecords" metadataPrefix="mods">http://static.digitalcommonwealth.org/cba/oai.xml</request>
    <ListRecords></ListRecords>
    </OAI-PMH>
  XML
  Nokogiri::XML(oai_starter)
end
# Insert a <record> into <ListRecords> in the OAI XML document. The record
# holds a <header> (identifier, datestamp, setSpec) and a <metadata> element
# wrapping the MODS nodes. The datestamp is the time of insertion.
# @param xml_doc [Nokogiri::XML::Document]
# @param file_id [String] object id
# @param col_id [String] collection id
# @param mods_record [Nokogiri::XML::NodeSet]
def insert_record(xml_doc, file_id, col_id, mods_record)
  records_list = xml_doc.at_xpath('//xmlns:ListRecords')
  new_record = Nokogiri::XML::DocumentFragment.parse ''
  # OAI-PMH datestamps are UTC (YYYY-MM-DDThh:mm:ssZ), not local-offset times.
  datestamp = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
  Nokogiri::XML::Builder.with(new_record) do |xml|
    xml.record do
      xml.header do
        xml.identifier(file_id)
        xml.datestamp(datestamp)
        xml.setSpec(col_id)
      end
      xml.metadata do
        # xml.parent is the <metadata> node currently being built
        xml.parent << mods_record
      end
    end
  end
  records_list.add_child(new_record)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment