Skip to content

Instantly share code, notes, and snippets.

@hubgit
Created April 24, 2012 16:54
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hubgit/2481431 to your computer and use it in GitHub Desktop.
Save hubgit/2481431 to your computer and use it in GitHub Desktop.
Convert Harvard Library Bibliographic Dataset (MARC21) to MODS XML
<?php
/**
 * Convert every MARC21 record in /marc/*.mrc into its own MODS XML file.
 *
 * Each record is serialised to MARCXML, run through the Library of Congress
 * MARC21slim -> MODS 3.4 stylesheet (fetched over HTTP at start-up) and
 * written to /mods/<first 3 chars of id>/<next 3 chars of id>/<id>.xml.
 * Records without a MODS recordIdentifier are skipped.
 *
 * Requires the PEAR File_MARC package and the DOM and XSL extensions.
 */
require 'File/MARC.php';

$stylesheetUrl = 'http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-4.xsl';

// Fail fast if the stylesheet is unavailable: without it every transform
// below would fail, one record at a time, with a much less obvious error.
$xsl = new DOMDocument;
if (!$xsl->load($stylesheetUrl)) {
    throw new RuntimeException("Unable to load stylesheet $stylesheetUrl");
}

$xsltproc = new XSLTProcessor;
if (!$xsltproc->importStylesheet($xsl)) {
    throw new RuntimeException("Unable to import stylesheet $stylesheetUrl");
}

// Reused for every record; loadXML() replaces its content each time.
$marcxml = new DOMDocument;
$marcxml->preserveWhiteSpace = false;

// Counts written records; used only for the progress output below.
$i = 0;

// glob() returns false on error, which foreach cannot iterate.
$sources = glob('/marc/*.mrc');
if ($sources === false) {
    $sources = array();
}

foreach ($sources as $source) {
    $items = new File_MARC($source);

    while ($record = $items->next()) {
        // loadXML() and transformToDoc() emit their own warnings on failure;
        // skip the record rather than abort the whole run.
        if (!$marcxml->loadXML($record->toXML(), LIBXML_NOCDATA)) {
            continue;
        }

        $mods = $xsltproc->transformToDoc($marcxml);
        if (!($mods instanceof DOMDocument) || !$mods->documentElement) {
            continue;
        }

        $xpath = new DOMXPath($mods);
        $xpath->registerNamespace('mods', 'http://www.loc.gov/mods/v3');

        // With no context node the query is evaluated relative to the root
        // element of the transformed document.
        $nodes = $xpath->query('mods:mods/mods:recordInfo/mods:recordIdentifier');
        if (!$nodes || !$nodes->length) {
            continue; // no identifier
        }

        $identifier = $nodes->item(0);
        $id = $identifier->textContent;

        // Kept separate from $source so the input file name is not clobbered.
        $target = sprintf('/mods/%s/%s/%s.xml', substr($id, 0, 3), substr($id, 3, 3), $id);

        if (($i++ % 1000) === 0) {
            print "$target\n";
        }

        // is_dir() is re-checked after mkdir() so that a directory created
        // concurrently by another process is not treated as a failure.
        $dir = dirname($target);
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            continue; // mkdir() has already emitted a warning
        }

        // Serialise the mods element that owns the matched identifier
        // (recordIdentifier -> recordInfo -> mods) instead of assuming it is
        // the first child node of the root element.
        $modsElement = $identifier->parentNode->parentNode;

        $mods->formatOutput = true;
        file_put_contents($target, $mods->saveXML($modsElement));
    }
}
<?php
/**
 * Transform every MODS file in /mods/000/000/ with mods-to-cloudsearch.xsl
 * and concatenate the results, wrapped in a single <batch> element, into
 * /mods/cloud-000-000.xml.
 *
 * Requires the DOM and XSL extensions.
 */
$xsl = new DOMDocument;
if (!$xsl->load('mods-to-cloudsearch.xsl')) {
    throw new RuntimeException('Unable to load mods-to-cloudsearch.xsl');
}

$xsltproc = new XSLTProcessor;
if (!$xsltproc->importStylesheet($xsl)) {
    throw new RuntimeException('Unable to import mods-to-cloudsearch.xsl');
}

// Reused for every input file; load() replaces its content each time.
$mods = new DOMDocument;
$mods->preserveWhiteSpace = false;

// glob() returns false on error, which foreach cannot iterate.
$files = glob('/mods/000/000/*.xml');
if ($files === false) {
    $files = array();
}

$output = fopen('/mods/cloud-000-000.xml', 'w');
if ($output === false) {
    throw new RuntimeException('Unable to open /mods/cloud-000-000.xml for writing');
}

fwrite($output, "<batch>\n");

foreach ($files as $file) {
    // Skip files that fail to parse instead of transforming whatever the
    // shared document object happens to hold; load() emits its own warning.
    if (!$mods->load($file, LIBXML_NOCDATA)) {
        continue;
    }

    // transformToXML() yields a non-string value when the transform fails.
    $xml = $xsltproc->transformToXML($mods);
    if (!is_string($xml)) {
        continue;
    }

    fwrite($output, $xml);
}

fwrite($output, "</batch>\n");

// Flush and release the handle explicitly rather than relying on shutdown.
fclose($output);
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Turn one MODS record into one <add> document for a search batch:
    * the add/@id is taken from recordInfo/recordIdentifier,
    * every titleInfo/title becomes a "title" field,
    * every topic of an LCSH-authority subject becomes a "subject" field.

  MODS elements live in the http://www.loc.gov/mods/v3 namespace, so every
  pattern and select uses the "mods" prefix; un-prefixed names in XPath 1.0
  only match elements in no namespace. exclude-result-prefixes keeps the
  namespace declaration out of the generated <add> elements.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:mods="http://www.loc.gov/mods/v3"
                exclude-result-prefixes="mods"
                version="1.0">
  <!-- The XML declaration is omitted because the output is embedded in a larger document. -->
  <xsl:output method="xml" encoding="utf-8" omit-xml-declaration="yes" standalone="no" indent="yes"/>

  <xsl:template match="/mods:mods">
    <add id="{mods:recordInfo/mods:recordIdentifier}" version="1" lang="en">
      <xsl:for-each select="mods:titleInfo/mods:title">
        <field name="title"><xsl:value-of select="."/></field>
      </xsl:for-each>
      <xsl:for-each select="mods:subject[@authority='lcsh']/mods:topic">
        <field name="subject"><xsl:value-of select="."/></field>
      </xsl:for-each>
    </add>
  </xsl:template>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment