Skip to content

Instantly share code, notes, and snippets.

@sailist
Created March 28, 2020 11:36
Show Gist options
  • Save sailist/a19ed267aabdad9cd4f9f3bbfd46c42a to your computer and use it in GitHub Desktop.
Save sailist/a19ed267aabdad9cd4f9f3bbfd46c42a to your computer and use it in GitHub Desktop.
读取MARC数据?
<xsl:stylesheet xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="1.0" exclude-result-prefixes="marc">
<xsl:import href="MARC21slimUtils.xsl"/>
<xsl:output method="xml" indent="yes"/>
<xsl:template match="/">
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="marc:record">
<xsl:variable name="leader" select="marc:leader"/>
<xsl:variable name="leader6" select="substring($leader,7,1)"/>
<xsl:variable name="leader7" select="substring($leader,8,1)"/>
<xsl:variable name="controlField008" select="marc:controlfield[@tag=008]"/>
<rdf:Description>
<xsl:for-each select="marc:datafield[@tag=245]">
<dc:title>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abfghk</xsl:with-param>
</xsl:call-template>
</dc:title>
</xsl:for-each>
<xsl:for-each
select="marc:datafield[@tag=100]|marc:datafield[@tag=110]|marc:datafield[@tag=111]|marc:datafield[@tag=700]|marc:datafield[@tag=710]|marc:datafield[@tag=711]|marc:datafield[@tag=720]">
<dc:creator>
<xsl:value-of select="."/>
</dc:creator>
</xsl:for-each>
<dc:type>
<xsl:if test="$leader7='c'">
<xsl:text>collection</xsl:text>
<!-- Removed attributes 6/04 jer -->
<!--
<xsl:attribute name="collection">yes</xsl:attribute>
-->
</xsl:if>
<xsl:if test="$leader6='d' or $leader6='f' or $leader6='p' or $leader6='t'">
<!-- Removed attributes 6/04 jer -->
<!--
<xsl:attribute name="manuscript">yes</xsl:attribute>
-->
<xsl:text>manuscript</xsl:text>
</xsl:if>
<xsl:choose>
<xsl:when test="$leader6='a' or $leader6='t'">text</xsl:when>
<xsl:when test="$leader6='e' or $leader6='f'">cartographic</xsl:when>
<xsl:when test="$leader6='c' or $leader6='d'">notated music</xsl:when>
<xsl:when test="$leader6='i' or $leader6='j'">sound recording</xsl:when>
<xsl:when test="$leader6='k'">still image</xsl:when>
<xsl:when test="$leader6='g'">moving image</xsl:when>
<xsl:when test="$leader6='r'">three dimensional object</xsl:when>
<xsl:when test="$leader6='m'">software, multimedia</xsl:when>
<xsl:when test="$leader6='p'">mixed material</xsl:when>
</xsl:choose>
</dc:type>
<xsl:for-each select="marc:datafield[@tag=655]">
<dc:type>
<xsl:value-of select="."/>
</dc:type>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=260]">
<dc:publisher>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">ab</xsl:with-param>
</xsl:call-template>
</dc:publisher>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=260]/marc:subfield[@code='c']">
<dc:date>
<xsl:value-of select="."/>
</dc:date>
</xsl:for-each>
<dc:language>
<xsl:value-of select="substring($controlField008,36,3)"/>
</dc:language>
<xsl:for-each select="marc:datafield[500 &lt;= @tag and @tag &lt;= 599][not(@tag=506 or @tag=530 or @tag=540 or @tag=546)]">
<dc:description>
<xsl:value-of select="marc:subfield[@code='a']"/>
</dc:description>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=600]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=610]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=611]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=630]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=650]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=653]">
<dc:subject>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdq</xsl:with-param>
</xsl:call-template>
</dc:subject>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=662]">
<dc:coverage>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdefgh</xsl:with-param>
</xsl:call-template>
</dc:coverage>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=752]">
<dc:coverage>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">adcdfgh</xsl:with-param>
</xsl:call-template>
</dc:coverage>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=530]">
<dc:relation type="original">
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">abcdu</xsl:with-param>
</xsl:call-template>
</dc:relation>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=760]|marc:datafield[@tag=762]|marc:datafield[@tag=765]|marc:datafield[@tag=767]|marc:datafield[@tag=770]|marc:datafield[@tag=772]|marc:datafield[@tag=773]|marc:datafield[@tag=774]|marc:datafield[@tag=775]|marc:datafield[@tag=776]|marc:datafield[@tag=777]|marc:datafield[@tag=780]|marc:datafield[@tag=785]|marc:datafield[@tag=786]|marc:datafield[@tag=787]">
<dc:relation>
<xsl:call-template name="subfieldSelect">
<xsl:with-param name="codes">ot</xsl:with-param>
</xsl:call-template>
</dc:relation>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=856]">
<dc:identifier>
<xsl:value-of select="marc:subfield[@code='u']"/>
</dc:identifier>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=020]">
<dc:identifier>
<xsl:text>URN:ISBN:</xsl:text>
<xsl:value-of select="marc:subfield[@code='a']"/>
</dc:identifier>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=506]">
<dc:rights>
<xsl:value-of select="marc:subfield[@code='a']"/>
</dc:rights>
</xsl:for-each>
<xsl:for-each select="marc:datafield[@tag=540]">
<dc:rights>
<xsl:value-of select="marc:subfield[@code='a']"/>
</dc:rights>
</xsl:for-each>
</rdf:Description>
</xsl:template>
</xsl:stylesheet>
<?xml version='1.0'?>
<xsl:stylesheet version="1.0" xmlns:marc="http://www.loc.gov/MARC21/slim"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- 08/08/08: tmee added corrected chopPunctuation templates for 260c -->
<!-- 08/19/04: ntra added "marc:" prefix to datafield element -->
<!-- 12/14/07: ntra added url encoding template -->
<!-- url encoding -->
<xsl:variable name="ascii">
<xsl:text> !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~</xsl:text>
</xsl:variable>
<xsl:variable name="latin1">
<xsl:text> ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ</xsl:text>
</xsl:variable>
<!-- Characters that usually don't need to be escaped -->
<xsl:variable name="safe">
<xsl:text>!'()*-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~</xsl:text>
</xsl:variable>
<xsl:variable name="hex">0123456789ABCDEF</xsl:variable>
<xsl:template name="datafield">
<xsl:param name="tag"/>
<xsl:param name="ind1">
<xsl:text> </xsl:text>
</xsl:param>
<xsl:param name="ind2">
<xsl:text> </xsl:text>
</xsl:param>
<xsl:param name="subfields"/>
<xsl:element name="marc:datafield">
<xsl:attribute name="tag">
<xsl:value-of select="$tag"/>
</xsl:attribute>
<xsl:attribute name="ind1">
<xsl:value-of select="$ind1"/>
</xsl:attribute>
<xsl:attribute name="ind2">
<xsl:value-of select="$ind2"/>
</xsl:attribute>
<xsl:copy-of select="$subfields"/>
</xsl:element>
</xsl:template>
<xsl:template name="subfieldSelect">
<xsl:param name="codes">abcdefghijklmnopqrstuvwxyz</xsl:param>
<xsl:param name="delimeter">
<xsl:text> </xsl:text>
</xsl:param>
<xsl:variable name="str">
<xsl:for-each select="marc:subfield">
<xsl:if test="contains($codes, @code)">
<xsl:value-of select="text()"/>
<xsl:value-of select="$delimeter"/>
</xsl:if>
</xsl:for-each>
</xsl:variable>
<xsl:value-of select="substring($str,1,string-length($str)-string-length($delimeter))"/>
</xsl:template>
<xsl:template name="buildSpaces">
<xsl:param name="spaces"/>
<xsl:param name="char">
<xsl:text> </xsl:text>
</xsl:param>
<xsl:if test="$spaces>0">
<xsl:value-of select="$char"/>
<xsl:call-template name="buildSpaces">
<xsl:with-param name="spaces" select="$spaces - 1"/>
<xsl:with-param name="char" select="$char"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
<xsl:template name="chopPunctuation">
<xsl:param name="chopString"/>
<xsl:param name="punctuation">
<xsl:text>.:,;/ </xsl:text>
</xsl:param>
<xsl:variable name="length" select="string-length($chopString)"/>
<xsl:choose>
<xsl:when test="$length=0"/>
<xsl:when test="contains($punctuation, substring($chopString,$length,1))">
<xsl:call-template name="chopPunctuation">
<xsl:with-param name="chopString" select="substring($chopString,1,$length - 1)"/>
<xsl:with-param name="punctuation" select="$punctuation"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="not($chopString)"/>
<xsl:otherwise>
<xsl:value-of select="$chopString"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template name="chopPunctuationFront">
<xsl:param name="chopString"/>
<xsl:variable name="length" select="string-length($chopString)"/>
<xsl:choose>
<xsl:when test="$length=0"/>
<xsl:when test="contains('.:,;/[ ', substring($chopString,1,1))">
<xsl:call-template name="chopPunctuationFront">
<xsl:with-param name="chopString" select="substring($chopString,2,$length - 1)"
/>
</xsl:call-template>
</xsl:when>
<xsl:when test="not($chopString)"/>
<xsl:otherwise>
<xsl:value-of select="$chopString"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template name="chopPunctuationBack">
<xsl:param name="chopString"/>
<xsl:param name="punctuation">
<xsl:text>.:,;/] </xsl:text>
</xsl:param>
<xsl:variable name="length" select="string-length($chopString)"/>
<xsl:choose>
<xsl:when test="$length=0"/>
<xsl:when test="contains($punctuation, substring($chopString,$length,1))">
<xsl:call-template name="chopPunctuation">
<xsl:with-param name="chopString" select="substring($chopString,1,$length - 1)"/>
<xsl:with-param name="punctuation" select="$punctuation"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="not($chopString)"/>
<xsl:otherwise>
<xsl:value-of select="$chopString"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- nate added 12/14/2007 for lccn.loc.gov: url encode ampersand, etc. -->
<xsl:template name="url-encode">
<xsl:param name="str"/>
<xsl:if test="$str">
<xsl:variable name="first-char" select="substring($str,1,1)"/>
<xsl:choose>
<xsl:when test="contains($safe,$first-char)">
<xsl:value-of select="$first-char"/>
</xsl:when>
<xsl:otherwise>
<xsl:variable name="codepoint">
<xsl:choose>
<xsl:when test="contains($ascii,$first-char)">
<xsl:value-of
select="string-length(substring-before($ascii,$first-char)) + 32"
/>
</xsl:when>
<xsl:when test="contains($latin1,$first-char)">
<xsl:value-of
select="string-length(substring-before($latin1,$first-char)) + 160"/>
<!-- was 160 -->
</xsl:when>
<xsl:otherwise>
<xsl:message terminate="no">Warning: string contains a character
that is out of range! Substituting "?".</xsl:message>
<xsl:text>63</xsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:variable>
<xsl:variable name="hex-digit1"
select="substring($hex,floor($codepoint div 16) + 1,1)"/>
<xsl:variable name="hex-digit2" select="substring($hex,$codepoint mod 16 + 1,1)"/>
<!-- <xsl:value-of select="concat('%',$hex-digit2)"/> -->
<xsl:value-of select="concat('%',$hex-digit1,$hex-digit2)"/>
</xsl:otherwise>
</xsl:choose>
<xsl:if test="string-length($str) &gt; 1">
<xsl:call-template name="url-encode">
<xsl:with-param name="str" select="substring($str,2)"/>
</xsl:call-template>
</xsl:if>
</xsl:if>
</xsl:template>
</xsl:stylesheet>
<!-- Stylus Studio meta-information - (c)1998-2002 eXcelon Corp.
<metaInformation>
<scenarios/><MapperInfo srcSchemaPath="" srcSchemaRoot="" srcSchemaPathIsRelative="yes" srcSchemaInterpretAsXML="no" destSchemaPath="" destSchemaRoot="" destSchemaPathIsRelative="yes" destSchemaInterpretAsXML="no"/>
</metaInformation>
-->
"""Python script for converting MARC21 to Dublin Core XML
Usage:
Command-line
$ mkdir output_xml
$ cp marc2dc.py output_xml/.
$ cd output_xml
$ python marc2dc.py --marc E:\Research\test-marc-file.mrc
"""
___author__ = "Jeremy Nelson"
___modify__ = "Danke wangchunqu"
__license__ = "GPLv3"
import argparse
import datetime
import pymarc
import urllib.request
from lxml import etree
# MARC_DC_URL = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slim2RDFDC.xsl'
class FileResolver(etree.Resolver):
"""Copied from Stackoverflow: http://stackoverflow.com/questions/8831941/lxml-and-xsl-document-function"""
def resolve(self, url, pubid, context):
return self.resolve_filename(url, context)
i = 0
def run(line):
"""Function takes a marc21 filepath, converts each record to MARCXML,
run LOC MARC2DC XLST, and saves resulting DC XML to disk"""
reader = pymarc.MARCReader(line,to_unicode=True,file_encoding="gbk")
parser = etree.XMLParser()
parser.resolvers.add(FileResolver())
marc2dc_xslt = etree.parse("k.xml", parser=parser)
transform = etree.XSLT(marc2dc_xslt)
record = next(reader)
# for i, record in enumerate():
# Convert to MARC XML
record_xml = etree.XML(
pymarc.record_to_xml(record, namespace=True),
parser=parser)
# Transform to Dublin Core RDF XML
dc_xml = transform(record_xml)
# Save to DC XML local directory
if '001' in record:
dc_filename = "dc-{}.xml".format(record['001'].data)
else:
dc_filename = "dc-marc-{}.xml".format(i)
with open(dc_filename, 'w+') as dc_file:
dc_file.write(etree.tostring(dc_xml).decode())
# https://www.freeformatter.com/xml-escape.html
# https://www.google.com/search?q=MARC21slimUtils.xsl&oq=MARC21slimUtils.xsl&aqs=chrome..69i57.3448j0j7&sourceid=chrome&ie=UTF-8
if __name__ == '__main__':
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('--marc', help='Full Path to MARC21 file')
# args = arg_parser.parse_args()
start = datetime.datetime.utcnow()
print("Starting MARC21 to Dublin Core at {}".format(start.isoformat()))
f = open("2020汉唐第一期采访数据2020.1.6-科技.iso", 'rb')
for line in f:
i+=1
run(line)
# run(f.readline())
end = datetime.datetime.utcnow()
print("Finished MARC21 to Dublin Core at {}, total time={} seconds".format(
end.isoformat(),
(end-start).seconds))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment