Created
March 19, 2014 20:44
-
-
Save peaeater/9650778 to your computer and use it in GitHub Desktop.
Converts Internet Archive XML metadata about a digitized publication to an XML manifest prepped for Solr ingest. Elements are mapped to Andi fields, and sometimes need transformation (e.g. dates to decades).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert IA metadata XML to a Solr-ready manifest XML
<#
metadata.imagecount - 2 => pagecount
metadata.identifier => WebSafe($1) => id
metadata.title => title, freetext
metadata.date => toDecade($1) => date, date_free, freetext
metadata.creator => name, name_free, freetext
metadata.publisher => name, name_free, freetext
metadata.year => date_free, freetext
metadata.volume => identifier_free, freetext
metadata.issue => identifier_free, freetext
metadata.month => identifier_free, freetext
metadata.call_number => identifier_free, freetext
metadata.editor => name, name_free, freetext
metadata.language => expand($1) (e.g. 'eng' => 'English') => language, freetext
#>
param(
    # Path (or URL) of the Internet Archive metadata XML document to convert.
    [Parameter(Mandatory=$true,ValueFromPipeline=$true,Position=0)]
    [string]$in,

    # Path of the manifest XML file to create (overwritten if it exists).
    [Parameter(Position=1)]
    [string]$out = ".\manifest.xml",

    # Written verbatim to the 'src' field, because the IA metadata does not
    # include the name of the serial.
    [Parameter(Position=2)]
    [string]$source = "NO SOURCE PROVIDED"
)

begin {
    # Lower-cases the string and replaces spaces with hyphens.
    function WebSafe([string]$s) {
        return $s.ToLowerInvariant().Replace(" ", "-")
    }

    # Turns a PowerShell path into a full provider path WITHOUT requiring the
    # target to exist. .NET APIs resolve relative paths against the process
    # working directory, which is not kept in sync with the PowerShell
    # location, so relative paths must be resolved before they are handed over.
    # Strings that look like URLs (scheme://...) are returned untouched because
    # XmlDocument.Load accepts them directly.
    function Resolve-FullPath([string]$path) {
        if ($path -match '^[a-z][a-z0-9+.\-]+://') {
            return $path
        }
        return $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($path)
    }

    # Writes <field name="$name"><![CDATA[$value]]></field> to the writer.
    function Write-ManifestField($writer, [string]$name, [string]$value) {
        $writer.WriteStartElement('field')
        $writer.WriteAttributeString('name', $name)
        $writer.WriteCData($value)
        $writer.WriteEndElement()
    }

    # Writes the name / name_free / freetext trio shared by creator,
    # publisher and editor; $label becomes the freetext label attribute.
    function Write-NameFields($writer, [string]$label, [string]$value) {
        Write-ManifestField $writer 'name' $value
        Write-ManifestField $writer 'name_free' $value
        Write-ManifestField $writer 'freetext' ("<name label='" + $label + "'>" + $value + "</name>")
    }

    # Writes an identifier_free field plus its freetext counterpart.
    # $display is the identifier_free text; $value is the raw metadata value.
    function Write-IdentifierFields($writer, [string]$label, [string]$display, [string]$value) {
        Write-ManifestField $writer 'identifier_free' $display
        Write-ManifestField $writer 'freetext' ("<identifier label='" + $label + "'>" + $value + "</identifier>")
    }
}

process {
    # Load the IA metadata first, so a missing or malformed input does not
    # leave behind a truncated manifest file.
    $metadata = New-Object -TypeName XML
    $metadata.Load((Resolve-FullPath $in))
    $meta = $metadata.metadata

    # get an XmlTextWriter to create the manifest
    $outPath = Resolve-FullPath $out
    $manifest = New-Object System.Xml.XmlTextWriter($outPath, $null)
    try {
        $manifest.Formatting = 'Indented'
        $manifest.Indentation = 1
        $manifest.IndentChar = "`t"

        # write the header and the root elements
        $manifest.WriteStartDocument()
        $manifest.WriteStartElement('add')
        $manifest.WriteStartElement('doc')

        # page count total
        # NOTE(review): the 2 subtracted images are presumably non-page scans - confirm.
        Write-ManifestField $manifest 'pagecount' (([int]$meta.imagecount) - 2)
        # base id
        Write-ManifestField $manifest 'id' (WebSafe $meta.identifier)
        # identifier => srcid
        Write-ManifestField $manifest 'srcid' $meta.identifier
        # title
        Write-ManifestField $manifest 'title' $meta.title

        if ($meta.date) {
            # date => to be converted to a decade in DIH
            Write-ManifestField $manifest 'date_ignored' $meta.date
            Write-ManifestField $manifest 'date_free' $meta.date
            Write-ManifestField $manifest 'freetext' ("<date label='Date'>" + $meta.date + "</date>")
        }

        if ($meta.creator) {
            Write-NameFields $manifest 'Creator' $meta.creator
        }
        if ($meta.publisher) {
            Write-NameFields $manifest 'Publisher' $meta.publisher
        }
        if ($meta.editor) {
            Write-NameFields $manifest 'Editor' $meta.editor
        }

        if ($meta.year) {
            Write-ManifestField $manifest 'date_free' $meta.year
            Write-ManifestField $manifest 'freetext' ("<date label='Year'>" + $meta.year + "</date>")
        }

        if ($meta.volume) {
            Write-IdentifierFields $manifest 'Volume' ("Volume " + $meta.volume) $meta.volume
        }
        if ($meta.issue) {
            Write-IdentifierFields $manifest 'Issue' ("Issue " + $meta.issue) $meta.issue
        }
        if ($meta.month) {
            Write-IdentifierFields $manifest 'Month' $meta.month $meta.month
        }
        if ($meta.call_number) {
            Write-IdentifierFields $manifest 'Call Number' $meta.call_number $meta.call_number
        }

        if ($meta.language) {
            # language => expanded in DIH (i.e. eng => English)
            Write-ManifestField $manifest 'language_ignored' $meta.language
            Write-ManifestField $manifest 'language_free' $meta.language
            Write-ManifestField $manifest 'freetext' ("<language label='Language'>" + $meta.language + "</language>")
        }

        <# STATIC VALUES #>
        Write-ManifestField $manifest 'ocrtype' "djvu"
        Write-ManifestField $manifest 'objectType' "textual record (electronic)"
        Write-ManifestField $manifest 'freetext' "<objectType label='Format'>textual record (electronic)</objectType>"
        Write-ManifestField $manifest 'onlineMediaRights' "No Restrictions"
        Write-ManifestField $manifest 'onlineMediaType' "Scanned Page"
        Write-ManifestField $manifest 'usageFlag' "Internet Archive"

        # source param => src because IA metadata does not include the name of the serial
        Write-ManifestField $manifest 'src' $source

        # close the doc/add nodes and finalize the document
        $manifest.WriteEndElement()
        $manifest.WriteEndElement()
        $manifest.WriteEndDocument()
        $manifest.Flush()
    }
    finally {
        # Always release the file handle, even when writing fails part-way.
        $manifest.Close()
    }

    # debug in notepad
    #notepad $out
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment