Skip to content

Instantly share code, notes, and snippets.

@peaeater
Created March 19, 2014 20:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peaeater/9650778 to your computer and use it in GitHub Desktop.
Save peaeater/9650778 to your computer and use it in GitHub Desktop.
Converts Internet Archive XML metadata about a digitized publication to an XML manifest prepped for Solr ingest. Elements are mapped to Andi fields, and sometimes need transformation (e.g. dates to decades).
# convert IA metadata XML to Solr-ready manifest XML
<#
metadata.imagecount - 2 => pagecount
metadata.identifier => WebSafe($1) => id
metadata.title => title, freetext
metadata.date => toDecade($1) => date, date_free, freetext
metadata.creator => name, name_free, freetext
metadata.publisher => name, name_free, freetext
metadata.year => date_free, freetext
metadata.volume => identifier_free, freetext
metadata.issue => identifier_free, freetext
metadata.month => identifier_free, freetext
metadata.call_number => identifier_free, freetext
metadata.editor => name, name_free, freetext
metadata.language => expand($1) (e.g. 'eng' => 'English') => language, freetext
#>
param(
    # Location of the Internet Archive metadata XML; handed to XmlDocument.Load.
    # Mandatory, and may be supplied through the pipeline.
    [Parameter(Mandatory = $true, ValueFromPipeline = $true, Position = 0)]
    [string] $in,

    # File the Solr manifest is written to.
    [Parameter(Position = 1)]
    [string] $out = ".\manifest.xml",

    # Value emitted as the 'src' field of the manifest.
    [Parameter(Position = 2)]
    [string] $source = "NO SOURCE PROVIDED"
)
process {
    # Builds a Solr <add><doc> manifest at $out from the Internet Archive
    # metadata XML at $in. Every value is written as
    #   <field name="..."><![CDATA[...]]></field>
    # Optional metadata elements are only emitted when present and non-empty.

    # Lower-cases a string and replaces spaces with hyphens (used for the 'id' field).
    function WebSafe([string]$s) {
        return $s.ToLowerInvariant().Replace(" ", "-")
    }

    # Writes a single <field name="$name"><![CDATA[$value]]></field> element.
    # $value is deliberately untyped so it reaches WriteCData exactly as the
    # caller supplied it (PowerShell converts it to a string at the method call).
    function WriteField([System.Xml.XmlWriter]$writer, [string]$name, $value) {
        $writer.WriteStartElement('field')
        $writer.WriteAttributeString('name', $name)
        $writer.WriteCData($value)
        $writer.WriteEndElement()
    }

    $manifest = $null
    try {
        # .NET resolves relative paths against the process working directory,
        # which is not necessarily the current PowerShell location, so turn the
        # paths into absolute provider paths first.
        $outPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($out)
        $inPath = $in
        if (Test-Path -LiteralPath $in) {
            # Only rewrite $in when it names an existing item; anything else
            # (e.g. a URL) is passed to XmlDocument.Load untouched.
            $inPath = (Resolve-Path -LiteralPath $in).ProviderPath
        }

        # grab content from IA metadata; done before the output file is
        # created so an unreadable input does not clobber an existing manifest
        $metadata = New-Object -TypeName XML
        $metadata.Load($inPath)
        $md = $metadata.metadata

        # get an XmlTextWriter to create the manifest (tab-indented)
        $manifest = New-Object System.Xml.XmlTextWriter($outPath, $null)
        $manifest.Formatting = 'Indented'
        $manifest.Indentation = 1
        $manifest.IndentChar = "`t"

        # write the header and the root elements
        $manifest.WriteStartDocument()
        $manifest.WriteStartElement('add')
        $manifest.WriteStartElement('doc')

        # NOTE(review): the 'freetext' values below embed metadata text inside
        # hand-built markup without escaping '<' or '&' - confirm the consumer
        # tolerates that.

        # page count total (imagecount minus 2)
        WriteField $manifest 'pagecount' (([int]$md.imagecount) - 2)
        # base id
        WriteField $manifest 'id' (WebSafe $md.identifier)
        # identifier => srcid
        WriteField $manifest 'srcid' $md.identifier
        # title
        WriteField $manifest 'title' $md.title

        if ($md.date) {
            # date => to be converted to a decade in the DIH
            WriteField $manifest 'date_ignored' $md.date
            WriteField $manifest 'date_free' $md.date
            WriteField $manifest 'freetext' ("<date label='Date'>" + $md.date + "</date>")
        }

        if ($md.creator) {
            # creator => name, name_free, freetext
            WriteField $manifest 'name' $md.creator
            WriteField $manifest 'name_free' $md.creator
            WriteField $manifest 'freetext' ("<name label='Creator'>" + $md.creator + "</name>")
        }

        if ($md.publisher) {
            # publisher => name, name_free, freetext
            WriteField $manifest 'name' $md.publisher
            WriteField $manifest 'name_free' $md.publisher
            WriteField $manifest 'freetext' ("<name label='Publisher'>" + $md.publisher + "</name>")
        }

        if ($md.editor) {
            # editor => name, name_free, freetext
            WriteField $manifest 'name' $md.editor
            WriteField $manifest 'name_free' $md.editor
            WriteField $manifest 'freetext' ("<name label='Editor'>" + $md.editor + "</name>")
        }

        if ($md.year) {
            # year => date_free, freetext
            WriteField $manifest 'date_free' $md.year
            WriteField $manifest 'freetext' ("<date label='Year'>" + $md.year + "</date>")
        }

        if ($md.volume) {
            # volume => identifier_free (prefixed with "Volume "), freetext
            WriteField $manifest 'identifier_free' ("Volume " + $md.volume)
            WriteField $manifest 'freetext' ("<identifier label='Volume'>" + $md.volume + "</identifier>")
        }

        if ($md.issue) {
            # issue => identifier_free (prefixed with "Issue "), freetext
            WriteField $manifest 'identifier_free' ("Issue " + $md.issue)
            WriteField $manifest 'freetext' ("<identifier label='Issue'>" + $md.issue + "</identifier>")
        }

        if ($md.month) {
            # month => identifier_free, freetext
            WriteField $manifest 'identifier_free' $md.month
            WriteField $manifest 'freetext' ("<identifier label='Month'>" + $md.month + "</identifier>")
        }

        if ($md.call_number) {
            # call_number => identifier_free, freetext
            WriteField $manifest 'identifier_free' $md.call_number
            WriteField $manifest 'freetext' ("<identifier label='Call Number'>" + $md.call_number + "</identifier>")
        }

        if ($md.language) {
            # language => expanded in the DIH (i.e. eng => English)
            WriteField $manifest 'language_ignored' $md.language
            WriteField $manifest 'language_free' $md.language
            WriteField $manifest 'freetext' ("<language label='Language'>" + $md.language + "</language>")
        }

        <# STATIC VALUES #>
        WriteField $manifest 'ocrtype' "djvu"
        WriteField $manifest 'objectType' "textual record (electronic)"
        WriteField $manifest 'freetext' "<objectType label='Format'>textual record (electronic)</objectType>"
        WriteField $manifest 'onlineMediaRights' "No Restrictions"
        WriteField $manifest 'onlineMediaType' "Scanned Page"
        WriteField $manifest 'usageFlag' "Internet Archive"

        # source param => src because IA metadata does not include the name of the serial
        WriteField $manifest 'src' $source

        # close the doc/add nodes and finalize the document
        $manifest.WriteEndElement()
        $manifest.WriteEndElement()
        $manifest.WriteEndDocument()
        $manifest.Flush()
    }
    finally {
        # Always release the output file handle, even when something above failed.
        if ($null -ne $manifest) {
            $manifest.Close()
        }
    }

    # debug in notepad
    #notepad $out
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment