Skip to content

Instantly share code, notes, and snippets.

@djjuhasz
Created September 3, 2014 21:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save djjuhasz/2768a2eb929a7fe1ae5d to your computer and use it in GitHub Desktop.
Save djjuhasz/2768a2eb929a7fe1ae5d to your computer and use it in GitHub Desktop.
Add validation for MODS XML
<?php
/*
* This file is part of the Access to Memory (AtoM) software.
*
* Access to Memory (AtoM) is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Access to Memory (AtoM) is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Access to Memory (AtoM). If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Import an XML document into Qubit.
*
* @package AccesstoMemory
* @subpackage library
* @author MJ Suhonos <mj@suhonos.ca>
* @author Peter Van Garderen <peter@artefactual.com>
* @author Mike Cantelon <mike@artefactual.com>
*/
class QubitXmlImport
{
protected
$errors = null,
$rootObject = null,
$parent = null;
public function import($xmlFile, $options = array())
{
// load the XML document into a DOMXML object
$importDOM = $this->loadXML($xmlFile, $options);
// if we were unable to parse the XML file at all
if (empty($importDOM->documentElement))
{
$errorMsg = sfContext::getInstance()->i18n->__('Unable to parse XML file: malformed or unresolvable entities');
throw new Exception($errorMsg);
}
// if libxml threw errors, populate them to show in the template
if ($importDOM->libxmlerrors)
{
// warning condition, XML file has errors (perhaps not well-formed or invalid?)
foreach ($importDOM->libxmlerrors as $libxmlerror)
{
$xmlerrors[] = sfContext::getInstance()->i18n->__('libxml error %code% on line %line% in input file: %message%', array('%code%' => $libxmlerror->code, '%message%' => $libxmlerror->message, '%line%' => $libxmlerror->line));
}
$this->errors = array_merge((array) $this->errors, $xmlerrors);
}
if ('eac-cpf' == $importDOM->documentElement->tagName)
{
$this->rootObject = new QubitActor;
$this->rootObject->parentId = QubitActor::ROOT_ID;
$eac = new sfEacPlugin($this->rootObject);
$eac->parse($importDOM);
$this->rootObject->save();
if (isset($eac->itemsSubjectOf))
{
foreach ($eac->itemsSubjectOf as $item)
{
$relation = new QubitRelation;
$relation->object = $this->rootObject;
$relation->typeId = QubitTerm::NAME_ACCESS_POINT_ID;
$item->relationsRelatedBysubjectId[] = $relation;
$item->save();
}
}
return $this;
}
// FIXME hardcoded until we decide how these will be developed
$validSchemas = array(
// document type declarations
'+//ISBN 1-931666-00-8//DTD ead.dtd Encoded Archival Description (EAD) Version 2002//EN' => 'ead',
'-//Society of American Archivists//DTD ead.dtd (Encoded Archival Description (EAD) Version 1.0)//EN' => 'ead1',
// namespaces
'http://www.loc.gov/METS/' => 'mets',
'http://www.loc.gov/mods/' => 'mods',
'http://www.loc.gov/MARC21/slim' => 'marc',
// root element names
//'collection' => 'marc',
//'record' => 'marc',
'record' => 'oai_dc_record',
'dc' => 'dc',
'oai_dc:dc' => 'dc',
'dublinCore' => 'dc',
'metadata' => 'dc',
//'mets' => 'mets',
'mods' => 'mods',
'ead' => 'ead',
'add' => 'alouette',
'http://www.w3.org/2004/02/skos/core#' => 'skos'
);
// determine what kind of schema we're trying to import
$schemaDescriptors = array($importDOM->documentElement->tagName);
if (!empty($importDOM->namespaces))
{
krsort($importDOM->namespaces);
$schemaDescriptors = array_merge($schemaDescriptors, $importDOM->namespaces);
}
if (!empty($importDOM->doctype))
{
$schemaDescriptors = array_merge($schemaDescriptors, array($importDOM->doctype->name, $importDOM->doctype->systemId, $importDOM->doctype->publicId));
}
foreach ($schemaDescriptors as $descriptor)
{
if (array_key_exists($descriptor, $validSchemas))
{
$importSchema = $validSchemas[$descriptor];
}
}
switch ($importSchema)
{
case 'ead':
// just validate EAD import for now until we can get StrictXMLParsing working for all schemas in the self::LoadXML function. Having problems right now loading schemas.
$importDOM->validate();
// if libxml threw errors, populate them to show in the template
foreach (libxml_get_errors() as $libxmlerror)
{
$this->errors[] = sfContext::getInstance()->i18n->__('libxml error %code% on line %line% in input file: %message%', array('%code%' => $libxmlerror->code, '%message%' => $libxmlerror->message, '%line%' => $libxmlerror->line));
}
break;
case 'mods':
// Validate against local XSD
$importDOM->schemaValidate('data/schemas/mods.xsd');
// if libxml threw errors, populate them to show in the template
foreach (libxml_get_errors() as $libxmlerror)
{
$this->errors[] = sfContext::getInstance()->i18n->__('libxml error %code% on line %line% in input file: %message%', array('%code%' => $libxmlerror->code, '%message%' => $libxmlerror->message, '%line%' => $libxmlerror->line));
}
break;
case 'skos':
$criteria = new Criteria;
$criteria->add(QubitSetting::NAME, 'plugins');
$setting = QubitSetting::getOne($criteria);
if (null === $setting || !in_array('sfSkosPlugin', unserialize($setting->getValue(array('sourceCulture' => true)))))
{
throw new sfException(sfContext::getInstance()->i18n->__('The SKOS plugin is not enabled'));
}
$importTerms = sfSkosPlugin::parse($importDOM, $options);
$this->rootObject = QubitTaxonomy::getById(QubitTaxonomy::SUBJECT_ID);
$this->count = count($importTerms);
return $this;
break;
}
$importMap = sfConfig::get('sf_app_module_dir').DIRECTORY_SEPARATOR.'object'.DIRECTORY_SEPARATOR.'config'.DIRECTORY_SEPARATOR.'import'.DIRECTORY_SEPARATOR.$importSchema.'.yml';
if (!file_exists($importMap))
{
// error condition, unknown schema or no import filter
$errorMsg = sfContext::getInstance()->i18n->__('Unknown schema or import format: "%format%"', array('%format%' => $importSchema));
throw new Exception($errorMsg);
}
$this->schemaMap = sfYaml::load($importMap);
// if XSLs are specified in the mapping, process them
if (!empty($this->schemaMap['processXSLT']))
{
// pre-filter through XSLs in order
foreach ((array) $this->schemaMap['processXSLT'] as $importXSL)
{
$importXSL = sfConfig::get('sf_data_dir').DIRECTORY_SEPARATOR.'xslt'.DIRECTORY_SEPARATOR.$importXSL;
if (file_exists($importXSL))
{
// instantiate an XSLT parser
$xslDOM = new DOMDocument;
$xslDOM->load($importXSL);
// Configure the transformer
$xsltProc = new XSLTProcessor;
$xsltProc->registerPHPFunctions();
$xsltProc->importStyleSheet($xslDOM);
$importDOM->loadXML($xsltProc->transformToXML($importDOM));
unset($xslDOM);
unset($xsltProc);
}
else
{
$this->errors[] = sfContext::getInstance()->i18n->__('Unable to load import XSL filter: "%importXSL%"', array('%importXSL%' => $importXSL));
}
}
// re-initialize xpath on the new XML
$importDOM->xpath = new DOMXPath($importDOM);
}
// switch source culture if language is set in an EAD document
if ($importSchema == 'ead')
{
if (is_object($langusage = $importDOM->xpath->query('//eadheader/profiledesc/langusage/language/@langcode')))
{
$sf_user = sfContext::getInstance()->user;
$currentCulture = $sf_user->getCulture();
$langCodeConvertor = new fbISO639_Map;
foreach ($langusage as $language)
{
$isocode = trim(preg_replace('/[\n\r\s]+/', ' ', $language->nodeValue));
// convert to Symfony culture code
if (!$twoCharCode = strtolower($langCodeConvertor->getID1($isocode, false)))
{
$twoCharCode = $isocode;
}
// Check to make sure that the selected language is supported with a Symfony i18n data file.
// If not it will cause a fatal error in the Language List component on every response.
ProjectConfiguration::getActive()->loadHelpers('I18N');
try
{
format_language($twoCharCode, $twoCharCode);
}
catch (Exception $e)
{
$this->errors[] = sfContext::getInstance()->i18n->__('EAD "langmaterial" is set to').': "'.$isocode.'". '.sfContext::getInstance()->i18n->__('This language is currently not supported.');
continue;
}
if ($currentCulture !== $twoCharCode)
{
$this->errors[] = sfContext::getInstance()->i18n->__('EAD "langmaterial" is set to').': "'.$isocode.'" ('.format_language($twoCharCode, 'en').'). '.sfContext::getInstance()->i18n->__('Your XML document has been saved in this language and your user interface has just been switched to this language.');
}
$sf_user->setCulture($twoCharCode);
// can only set to one language, so have to break once the first valid language is encountered
break;
}
}
}
unset($this->schemaMap['processXSLT']);
// go through schema map and populate objects/properties
foreach ($this->schemaMap as $name => $mapping)
{
// if object is not defined or a valid class, we can't process this mapping
if (empty($mapping['Object']) || !class_exists('Qubit'.$mapping['Object']))
{
$this->errors[] = sfContext::getInstance()->i18n->__('Non-existent class defined in import mapping: "%class%"', array('%class%' => 'Qubit'.$mapping['Object']));
continue;
}
// get a list of XML nodes to process
$nodeList = $importDOM->xpath->query($mapping['XPath']);
foreach ($nodeList as $domNode)
{
// create a new object
$class = 'Qubit'.$mapping['Object'];
$currentObject = new $class;
// set the rootObject to use for initial display in successful import
if (!$this->rootObject)
{
$this->rootObject = $currentObject;
}
// use DOM to populate object
$this->populateObject($domNode, $importDOM, $mapping, $currentObject, $importSchema);
}
}
return $this;
}
private function populateObject(&$domNode, &$importDOM, &$mapping, &$currentObject, $importSchema)
{
// if a parent path is specified, try to parent the node
if (empty($mapping['Parent']))
{
$parentNodes = new DOMNodeList;
}
else
{
$parentNodes = $importDOM->xpath->query('('.$mapping['Parent'].')', $domNode);
}
if ($parentNodes->length > 0)
{
// parent ID comes from last node in the list because XPath forces forward document order
$parentId = $parentNodes->item($parentNodes->length - 1)->getAttribute('xml:id');
unset($parentNodes);
if (!empty($parentId) && is_callable(array($currentObject, 'setParentId')))
{
$currentObject->parentId = $parentId;
}
}
else
{
// orphaned object, set root if possible
if (isset($this->parent))
{
$currentObject->parentId = $this->parent->id;
}
else if (is_callable(array($currentObject, 'setRoot')))
{
$currentObject->setRoot();
}
}
// go through methods and populate properties
$this->processMethods($domNode, $importDOM, $mapping['Methods'], $currentObject, $importSchema);
// make sure we have a publication status set before indexing
if ($currentObject instanceof QubitInformationObject && count($currentObject->statuss) == 0)
{
$currentObject->setPublicationStatus(sfConfig::get('app_defaultPubStatus', QubitTerm::PUBLICATION_STATUS_DRAFT_ID));
}
// save the object after it's fully-populated
$currentObject->save();
// write the ID onto the current XML node for tracking
$domNode->setAttribute('xml:id', $currentObject->id);
}
/*
* Cycle through methods and populate object based on relevant data
*
* @return null
*/
private function processMethods(&$domNode, &$importDOM, $methods, &$currentObject, $importSchema)
{
// go through methods and populate properties
foreach ($methods as $name => $methodMap)
{
// if method is not defined, we can't process this mapping
if (empty($methodMap['Method']) || !is_callable(array($currentObject, $methodMap['Method'])))
{
$this->errors[] = sfContext::getInstance()->i18n->__('Non-existent method defined in import mapping: "%method%"', array('%method%' => $methodMap['Method']));
continue;
}
// Get a list of XML nodes to process
// This condition mitigates a problem where the XPath query wasn't working
// as expected, see #4302 for more details
if ($importSchema == 'dc' && $methodMap['XPath'] != '.')
{
$nodeList2 = $importDOM->getElementsByTagName($methodMap['XPath']);
}
else
{
$nodeList2 = $importDOM->xpath->query($methodMap['XPath'], $domNode);
}
if (is_object($nodeList2))
{
switch($name)
{
// hack: some multi-value elements (e.g. 'languages') need to get passed as one array instead of individual nodes values
case 'languages':
case 'language':
$langCodeConvertor = new fbISO639_Map;
$isID3 = ($importSchhema == 'dc') ? true : false;
$value = array();
foreach ($nodeList2 as $item)
{
if ($twoCharCode = $langCodeConvertor->getID1($item->nodeValue, $isID3))
{
$value[] = strtolower($twoCharCode);
}
else
{
$value[] = $item->nodeValue;
}
}
$currentObject->language = $value;
break;
case 'processinfo':
foreach ($nodeList2 as $item)
{
if (($childNode = $importDOM->xpath->query('p/date', $item)) !== null)
{
$currentObject->revisionHistory = $childNode->item(0)->nodeValue;
}
if (($childNode = $importDOM->xpath->query('p', $item)) !== null)
{
$note = '';
foreach ($childNode as $pNode)
{
// A <p> node inside <processinfo> with no other children,
// this is part of an archivist's note.
if ($pNode->childNodes->length === 1 && $pNode->firstChild->nodeType === XML_TEXT_NODE)
{
// If this isn't our first <p> in the note, add newlines
// to simulate paragraph.
if (strlen($note) > 0)
{
$note .= "\n\n";
}
$note .= $pNode->nodeValue;
}
}
if (strlen($note) > 0)
{
$currentObject->importEadNote(array('note' => $note, 'noteTypeId' => QubitTerm::ARCHIVIST_NOTE_ID));
}
}
// TODO: Add more child node processing, for <note> <head> etc.
}
break;
case 'flocat':
case 'digital_object':
$resources = array();
foreach ($nodeList2 as $item)
{
$resources[] = $item->nodeValue;
}
if (0 < count($resources))
{
$currentObject->importDigitalObjectFromUri($resources, $this->errors);
}
break;
case 'container':
foreach ($nodeList2 as $item)
{
$container = $item->nodeValue;
$type = $importDOM->xpath->query('@type', $item)->item(0)->nodeValue;
$label = $importDOM->xpath->query('@label', $item)->item(0)->nodeValue;
$parent = $importDOM->xpath->query('@parent', $item)->item(0)->nodeValue;
$location = $importDOM->xpath->query('did/physloc[@id="'.$parent.'"]', $domNode)->item(0)->nodeValue;
$currentObject->importPhysicalObject($location, $container, $type, $label);
}
break;
case 'relatedunitsofdescription':
$i = 0;
$nodeValue = '';
foreach ($nodeList2 as $item)
{
if ($i++ == 0)
{
$nodeValue .= self::normalizeNodeValue($item);
}
else
{
$nodeValue .= "\n\n" . self::normalizeNodeValue($item);
}
}
$currentObject->setRelatedUnitsOfDescription($nodeValue);
break;
default:
foreach ($nodeList2 as $key => $domNode2)
{
// normalize the node text; NB: this will strip any child elements, eg. HTML tags
$nodeValue = self::normalizeNodeValue($domNode2);
// if you want the full XML from the node, use this
$nodeXML = $domNode2->ownerDocument->saveXML($domNode2);
// set the parameters for the method call
if (empty($methodMap['Parameters']))
{
$parameters = array($nodeValue);
}
else
{
$parameters = array();
foreach ((array) $methodMap['Parameters'] as $parameter)
{
// if the parameter begins with %, evaluate it as an XPath expression relative to the current node
if ('%' == substr($parameter, 0, 1))
{
// evaluate the XPath expression
$xPath = substr($parameter, 1);
$result = $importDOM->xpath->query($xPath, $domNode2);
if ($result->length > 1)
{
// convert nodelist into an array
foreach ($result as $element)
{
$resultArray[] = $element->nodeValue;
}
$parameters[] = $resultArray;
}
else
{
// pass the node value unaltered; this provides an alternative to $nodeValue above
$parameters[] = $result->item(0)->nodeValue;
}
}
else
{
// Confirm DOMXML node exists to avoid warnings at run-time
if (false !== preg_match_all('/\$importDOM->xpath->query\(\'@\w+\', \$domNode2\)->item\(0\)->nodeValue/', $parameter, $matches))
{
foreach ($matches[0] as $match)
{
$str = str_replace('->nodeValue', '', $match);
if (null !== ($node = eval('return '.$str.';')))
{
// Substitute node value for search string
$parameter = str_replace($match, '\''.$node->nodeValue.'\'', $parameter);
}
else
{
// Replace empty nodes with null in parameter string
$parameter = str_replace($match, 'null', $parameter);
}
}
}
eval('$parameters[] = '.$parameter.';');
}
}
}
// Load taxonomies into variables to avoid use of magic numbers
$termData = QubitFlatfileImport::loadTermsFromTaxonomies(array(
QubitTaxonomy::NOTE_TYPE_ID => 'noteTypes',
QubitTaxonomy::RAD_NOTE_ID => 'radNoteTypes',
QubitTaxonomy::RAD_TITLE_NOTE_ID => 'titleNoteTypes'
));
$titleVariationNoteTypeId = array_search('Variations in title', $termData['titleNoteTypes']);
$titleAttributionsNoteTypeId = array_search('Attributions and conjectures', $termData['titleNoteTypes']);
$titleContinuationNoteTypeId = array_search('Continuation of title', $termData['titleNoteTypes']);
$titleStatRepNoteTypeId = array_search('Statements of responsibility', $termData['titleNoteTypes']);
$titleParallelNoteTypeId = array_search('Parallel titles and other title information', $termData['titleNoteTypes']);
$titleSourceNoteTypeId = array_search('Source of title proper', $termData['titleNoteTypes']);
$alphaNumericaDesignationsNoteTypeId = array_search('Alpha-numeric designations', $termData['radNoteTypes']);
$physDescNoteTypeId = array_search('Physical description', $termData['radNoteTypes']);
$editionNoteTypeId = array_search('Edition', $termData['radNoteTypes']);
$conservationNoteTypeId = array_search('Conservation', $termData['radNoteTypes']);
$pubSeriesNoteTypeId = array_search("Publisher's series", $termData['radNoteTypes']);
$rightsNoteTypeId = array_search("Rights", $termData['radNoteTypes']);
$materialNoteTypeId = array_search("Accompanying material", $termData['radNoteTypes']);
$generalNoteTypeId = array_search("General note", $termData['radNoteTypes']);
// invoke the object and method defined in the schema map
call_user_func_array(array( & $currentObject, $methodMap['Method']), $parameters);
}
}
unset($nodeList2);
}
}
}
/**
* modified helper methods from (http://www.php.net/manual/en/ref.dom.php):
*
* - create a DOMDocument from a file
* - parse the namespaces in it
* - create a XPath object with all the namespaces registered
* - load the schema locations
* - validate the file on the main schema (the one without prefix)
*
* @param string $xmlFile XML document file
* @param array $options optional parameters
* @return DOMDocument an object representation of the XML document
*/
protected function loadXML($xmlFile, $options = array())
{
libxml_use_internal_errors(true);
// FIXME: trap possible load validation errors (just suppress for now)
$err_level = error_reporting(0);
$doc = new DOMDocument('1.0', 'UTF-8');
// Default $strictXmlParsing to false
$strictXmlParsing = (isset($options['strictXmlParsing'])) ? $options['strictXmlParsing'] : false;
if ($strictXmlParsing)
{
// enforce all XML parsing rules and validation
$doc->validateOnParse = true;
$doc->resolveExternals = true;
}
else
{
// try to load whatever we've got, even if it's malformed or invalid
$doc->recover = true;
$doc->strictErrorChecking = false;
}
$doc->formatOutput = false;
$doc->preserveWhitespace = false;
$doc->substituteEntities = true;
$doc->load($xmlFile);
$xsi = false;
$doc->namespaces = array();
$doc->xpath = new DOMXPath($doc);
// pass along any XML errors that have been generated
$doc->libxmlerrors = libxml_get_errors();
// if the document didn't parse correctly, stop right here
if (empty($doc->documentElement))
{
return $doc;
}
error_reporting($err_level);
// look through the entire document for namespaces
// FIXME: #2787
// https://projects.artefactual.com/issues/2787
//
// THIS SHOULD ONLY INSPECT THE ROOT NODE NAMESPACES
// Consider: http://www.php.net/manual/en/book.dom.php#73793
$re = '/xmlns:([^=]+)="([^"]+)"/';
preg_match_all($re, $doc->saveXML(), $mat, PREG_SET_ORDER);
foreach ($mat as $xmlns)
{
$pre = $xmlns[1];
$uri = $xmlns[2];
$doc->namespaces[$pre] = $uri;
if ($pre == '')
{
$pre = 'noname';
}
$doc->xpath->registerNamespace($pre, $uri);
}
/*
if (!isset($doc->namespaces['']))
{
$doc->namespaces[''] = $doc->documentElement->lookupnamespaceURI(null);
}
if ($xsi)
{
$doc->schemaLocations = array();
$lst = $doc->xpath->query('//@$xsi:schemaLocation');
foreach ($lst as $el)
{
$re = "{[\\s\n\r]*([^\\s\n\r]+)[\\s\n\r]*([^\\s\n\r]+)}";
preg_match_all($re, $el->nodeValue, $mat);
for ($i = 0; $i < count($mat[0]); $i++)
{
$value = $mat[2][$i];
$doc->schemaLocations[$mat[1][$i]] = $value;
}
}
// validate document against default namespace schema
$doc->schemaValidate($doc->schemaLocations[$doc->namespaces['']]);
}
*/
return $doc;
}
/**
*
*
* @return DOMNodeList
*/
public static function queryDomNode($node, $xpathQuery)
{
$doc = new DOMDocument();
$doc->loadXML('<xml></xml>');
$doc->documentElement->appendChild($doc->importNode($node, true));
$xpath = new DOMXPath($doc);
return $xpath->query($xpathQuery);
}
/**
* Return true if import had errors
*
* @return boolean
*/
public function hasErrors()
{
return $this->errors != null;
}
/**
* Return array of error messages
*
* @return unknown
*/
public function getErrors()
{
return $this->errors;
}
/**
* Get the root object for the import
*
* @return mixed the root object (object type depends on import type)
*/
public function getRootObject()
{
return $this->rootObject;
}
/**
* Get the root object for the import
*
* @return mixed the root object (object type depends on import type)
*/
public function setParent($parent)
{
return $this->parent = $parent;
}
/**
* Replace </lb> tags for '\n'
*
* @return node value without linebreaks tags
*/
public static function replaceLineBreaks($node)
{
$nodeValue = '';
foreach ($node->childNodes as $child)
{
if ($child->nodeName == 'lb')
{
$nodeValue .= "\n";
}
else
{
$nodeValue .= preg_replace('/[\n\r\s]+/', ' ', $child->nodeValue);
}
}
return $nodeValue;
}
/**
* Normalize node, replaces <p> and <lb/>
*
* @return node value normalized
*/
public static function normalizeNodeValue($node)
{
$nodeValue = '';
if (!($node instanceof DOMAttr))
{
$nodeList = $node->getElementsByTagName('p');
if (0 < $nodeList->length)
{
$i = 0;
foreach ($nodeList as $pNode)
{
if ($i++ == 0)
{
$nodeValue .= self::replaceLineBreaks($pNode);
}
else
{
$nodeValue .= "\n\n" . self::replaceLineBreaks($pNode);
}
}
}
else
{
$nodeValue .= self::replaceLineBreaks($node);
}
}
else
{
$nodeValue .= $node->nodeValue;
}
return $nodeValue;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment