Created
March 28, 2020 11:36
-
-
Save sailist/a19ed267aabdad9cd4f9f3bbfd46c42a to your computer and use it in GitHub Desktop.
读取MARC数据?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<xsl:stylesheet xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" | |
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
version="1.0" exclude-result-prefixes="marc"> | |
<xsl:import href="MARC21slimUtils.xsl"/> | |
<xsl:output method="xml" indent="yes"/> | |
<!-- Entry point: pass every child of the document root on to the matching templates. -->
<xsl:template match="/">
    <xsl:apply-templates select="node()"/>
</xsl:template>
<!--
    Convert one MARCXML record into an rdf:Description of Dublin Core elements.
    Each block below maps one MARC field (or a group of fields) to one DC
    element. Blocks are emitted in the order written here; within a block,
    fields are emitted in document order.
    Tag predicates such as @tag=245 are numeric comparisons in XPath 1.0,
    which is why leading zeros (008, 020) still match.
-->
<xsl:template match="marc:record">
    <xsl:variable name="leader" select="marc:leader"/>
    <!-- substring() is 1-based: these pick the 7th and 8th leader characters. -->
    <xsl:variable name="leader6" select="substring($leader,7,1)"/>
    <xsl:variable name="leader7" select="substring($leader,8,1)"/>
    <xsl:variable name="controlField008" select="marc:controlfield[@tag=008]"/>
    <rdf:Description>
        <!-- 245 to dc:title -->
        <xsl:for-each select="marc:datafield[@tag=245]">
            <dc:title>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abfghk</xsl:with-param>
                </xsl:call-template>
            </dc:title>
        </xsl:for-each>
        <!-- 1XX / 7XX name entries to dc:creator (whole field text, all subfields) -->
        <xsl:for-each
            select="marc:datafield[@tag=100]|marc:datafield[@tag=110]|marc:datafield[@tag=111]|marc:datafield[@tag=700]|marc:datafield[@tag=710]|marc:datafield[@tag=711]|marc:datafield[@tag=720]">
            <dc:creator>
                <xsl:value-of select="."/>
            </dc:creator>
        </xsl:for-each>
        <!--
            dc:type is assembled from the two leader characters. The pieces are
            written back to back with no separator, so more than one may appear
            in the same element.
            The former collection="yes" / manuscript="yes" attributes were
            removed 6/04 (jer); the values are emitted as text instead.
        -->
        <dc:type>
            <xsl:if test="$leader7='c'">
                <xsl:text>collection</xsl:text>
            </xsl:if>
            <xsl:if test="$leader6='d' or $leader6='f' or $leader6='p' or $leader6='t'">
                <xsl:text>manuscript</xsl:text>
            </xsl:if>
            <xsl:choose>
                <xsl:when test="$leader6='a' or $leader6='t'">text</xsl:when>
                <xsl:when test="$leader6='e' or $leader6='f'">cartographic</xsl:when>
                <xsl:when test="$leader6='c' or $leader6='d'">notated music</xsl:when>
                <xsl:when test="$leader6='i' or $leader6='j'">sound recording</xsl:when>
                <xsl:when test="$leader6='k'">still image</xsl:when>
                <xsl:when test="$leader6='g'">moving image</xsl:when>
                <xsl:when test="$leader6='r'">three dimensional object</xsl:when>
                <xsl:when test="$leader6='m'">software, multimedia</xsl:when>
                <xsl:when test="$leader6='p'">mixed material</xsl:when>
            </xsl:choose>
        </dc:type>
        <!-- 655 to an additional dc:type per field -->
        <xsl:for-each select="marc:datafield[@tag=655]">
            <dc:type>
                <xsl:value-of select="."/>
            </dc:type>
        </xsl:for-each>
        <!-- 260 $a $b to dc:publisher -->
        <xsl:for-each select="marc:datafield[@tag=260]">
            <dc:publisher>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">ab</xsl:with-param>
                </xsl:call-template>
            </dc:publisher>
        </xsl:for-each>
        <!-- 260 $c to dc:date -->
        <xsl:for-each select="marc:datafield[@tag=260]/marc:subfield[@code='c']">
            <dc:date>
                <xsl:value-of select="."/>
            </dc:date>
        </xsl:for-each>
        <!-- Three characters of the 008 field starting at 1-based offset 36;
             the element is written even when the record has no 008. -->
        <dc:language>
            <xsl:value-of select="substring($controlField008,36,3)"/>
        </dc:language>
        <!-- 5XX notes to dc:description, except 506/530/540/546.
             The comparison operators must be escaped inside the attribute. -->
        <xsl:for-each select="marc:datafield[500 &lt;= @tag and @tag &lt;= 599][not(@tag=506 or @tag=530 or @tag=540 or @tag=546)]">
            <dc:description>
                <xsl:value-of select="marc:subfield[@code='a']"/>
            </dc:description>
        </xsl:for-each>
        <!-- 600, 610, 611, 630, 650, 653 to dc:subject. Kept as separate loops
             so that subjects stay grouped by tag in this order. -->
        <xsl:for-each select="marc:datafield[@tag=600]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=610]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=611]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=630]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=650]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=653]">
            <dc:subject>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdq</xsl:with-param>
                </xsl:call-template>
            </dc:subject>
        </xsl:for-each>
        <!-- 662 and 752 (place names) to dc:coverage -->
        <xsl:for-each select="marc:datafield[@tag=662]">
            <dc:coverage>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdefgh</xsl:with-param>
                </xsl:call-template>
            </dc:coverage>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=752]">
            <dc:coverage>
                <xsl:call-template name="subfieldSelect">
                    <!-- Was "adcdfgh": "d" listed twice and "b" missing, so
                         subfield b was silently dropped from the output. -->
                    <xsl:with-param name="codes">abcdfgh</xsl:with-param>
                </xsl:call-template>
            </dc:coverage>
        </xsl:for-each>
        <!-- 530 to dc:relation type="original" -->
        <xsl:for-each select="marc:datafield[@tag=530]">
            <dc:relation type="original">
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">abcdu</xsl:with-param>
                </xsl:call-template>
            </dc:relation>
        </xsl:for-each>
        <!-- 76X to 78X linking entries: $o $t to dc:relation -->
        <xsl:for-each select="marc:datafield[@tag=760]|marc:datafield[@tag=762]|marc:datafield[@tag=765]|marc:datafield[@tag=767]|marc:datafield[@tag=770]|marc:datafield[@tag=772]|marc:datafield[@tag=773]|marc:datafield[@tag=774]|marc:datafield[@tag=775]|marc:datafield[@tag=776]|marc:datafield[@tag=777]|marc:datafield[@tag=780]|marc:datafield[@tag=785]|marc:datafield[@tag=786]|marc:datafield[@tag=787]">
            <dc:relation>
                <xsl:call-template name="subfieldSelect">
                    <xsl:with-param name="codes">ot</xsl:with-param>
                </xsl:call-template>
            </dc:relation>
        </xsl:for-each>
        <!-- 856 $u and 020 $a to dc:identifier -->
        <xsl:for-each select="marc:datafield[@tag=856]">
            <dc:identifier>
                <xsl:value-of select="marc:subfield[@code='u']"/>
            </dc:identifier>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=020]">
            <dc:identifier>
                <xsl:text>URN:ISBN:</xsl:text>
                <xsl:value-of select="marc:subfield[@code='a']"/>
            </dc:identifier>
        </xsl:for-each>
        <!-- 506 $a and 540 $a to dc:rights -->
        <xsl:for-each select="marc:datafield[@tag=506]">
            <dc:rights>
                <xsl:value-of select="marc:subfield[@code='a']"/>
            </dc:rights>
        </xsl:for-each>
        <xsl:for-each select="marc:datafield[@tag=540]">
            <dc:rights>
                <xsl:value-of select="marc:subfield[@code='a']"/>
            </dc:rights>
        </xsl:for-each>
    </rdf:Description>
</xsl:template>
</xsl:stylesheet> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version='1.0'?> | |
<xsl:stylesheet version="1.0" xmlns:marc="http://www.loc.gov/MARC21/slim" | |
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> | |
<!-- 08/08/08: tmee added corrected chopPunctuation templates for 260c --> | |
<!-- 08/19/04: ntra added "marc:" prefix to datafield element --> | |
<!-- 12/14/07: ntra added url encoding template --> | |
<!-- url encoding --> | |
<!--
    Lookup tables for the url-encode template. A character's code point is
    computed as (number of characters before it in the table) + the table's
    base offset: 32 for $ascii, 160 for $latin1. Every position therefore
    matters, including the invisible characters.
-->
<xsl:variable name="ascii">
    <!-- U+0020 to U+007E. "&", "<" and ">" are escaped so the stylesheet stays well-formed. -->
    <xsl:text> !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~</xsl:text>
</xsl:variable>
<xsl:variable name="latin1">
    <!-- U+00A0 to U+00FF. The no-break space (160) and the soft hyphen (173) are
         written as character references: they are invisible in most editors and
         losing either one would shift every later character's computed code. -->
    <xsl:text>&#160;¡¢£¤¥¦§¨©ª«¬&#173;®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ</xsl:text>
</xsl:variable>
<!-- Characters that usually don't need to be escaped -->
<xsl:variable name="safe">
    <xsl:text>!'()*-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~</xsl:text>
</xsl:variable>
<!-- Hex digits indexed by value + 1 (substring() is 1-based). -->
<xsl:variable name="hex">0123456789ABCDEF</xsl:variable>
<!--
    Emit a marc:datafield element.
    tag       : value of the tag attribute
    ind1/ind2 : indicator values; each defaults to a single space
    subfields : content copied into the new element as-is
-->
<xsl:template name="datafield">
    <xsl:param name="tag"/>
    <xsl:param name="ind1" select="' '"/>
    <xsl:param name="ind2" select="' '"/>
    <xsl:param name="subfields"/>
    <xsl:element name="marc:datafield">
        <xsl:attribute name="tag"><xsl:value-of select="$tag"/></xsl:attribute>
        <xsl:attribute name="ind1"><xsl:value-of select="$ind1"/></xsl:attribute>
        <xsl:attribute name="ind2"><xsl:value-of select="$ind2"/></xsl:attribute>
        <xsl:copy-of select="$subfields"/>
    </xsl:element>
</xsl:template>
<!--
    Join the text of the context node's marc:subfield children whose code
    occurs in $codes, separated by $delimeter (no leading or trailing
    delimiter). The parameter name keeps its historical spelling because
    callers pass it by name.
    codes     : string of accepted subfield codes; defaults to a through z
    delimeter : separator placed between values; defaults to one space
-->
<xsl:template name="subfieldSelect">
    <xsl:param name="codes">abcdefghijklmnopqrstuvwxyz</xsl:param>
    <xsl:param name="delimeter" select="' '"/>
    <xsl:for-each select="marc:subfield[contains($codes, @code)]">
        <!-- Separator goes before every value except the first. -->
        <xsl:if test="position() != 1">
            <xsl:value-of select="$delimeter"/>
        </xsl:if>
        <xsl:value-of select="text()"/>
    </xsl:for-each>
</xsl:template>
<!--
    Write $char $spaces times by recursing with a decremented count.
    spaces : how many copies to emit; nothing is written unless it is above 0
    char   : the text to repeat; defaults to one space
-->
<xsl:template name="buildSpaces">
    <xsl:param name="spaces"/>
    <xsl:param name="char" select="' '"/>
    <xsl:if test="0 &lt; $spaces">
        <xsl:value-of select="$char"/>
        <xsl:call-template name="buildSpaces">
            <xsl:with-param name="char" select="$char"/>
            <xsl:with-param name="spaces" select="$spaces - 1"/>
        </xsl:call-template>
    </xsl:if>
</xsl:template>
<!--
    Strip trailing punctuation from $chopString, one character per recursion,
    and write what remains.
    chopString  : the text to trim
    punctuation : set of characters treated as punctuation; defaults to ".:,;/ "
-->
<xsl:template name="chopPunctuation">
    <xsl:param name="chopString"/>
    <xsl:param name="punctuation" select="'.:,;/ '"/>
    <xsl:variable name="len" select="string-length($chopString)"/>
    <xsl:variable name="lastChar" select="substring($chopString, $len, 1)"/>
    <xsl:choose>
        <!-- Nothing left: write nothing. -->
        <xsl:when test="$len = 0"/>
        <xsl:when test="contains($punctuation, $lastChar)">
            <!-- Drop the last character and look again. -->
            <xsl:call-template name="chopPunctuation">
                <xsl:with-param name="punctuation" select="$punctuation"/>
                <xsl:with-param name="chopString" select="substring($chopString, 1, $len - 1)"/>
            </xsl:call-template>
        </xsl:when>
        <xsl:when test="not($chopString)"/>
        <xsl:otherwise>
            <xsl:value-of select="$chopString"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>
<!--
    Strip leading punctuation from $chopString, one character per recursion,
    and write what remains.
    chopString  : the text to trim
    punctuation : set of characters treated as punctuation; defaults to
                  ".:,;/[ ", the set that used to be hard-coded here. Exposed
                  as a parameter to match chopPunctuation and
                  chopPunctuationBack; existing callers are unaffected.
-->
<xsl:template name="chopPunctuationFront">
    <xsl:param name="chopString"/>
    <xsl:param name="punctuation">
        <xsl:text>.:,;/[ </xsl:text>
    </xsl:param>
    <xsl:variable name="length" select="string-length($chopString)"/>
    <xsl:choose>
        <xsl:when test="$length=0"/>
        <xsl:when test="contains($punctuation, substring($chopString,1,1))">
            <xsl:call-template name="chopPunctuationFront">
                <!-- Two-argument substring() returns everything from the 2nd character on. -->
                <xsl:with-param name="chopString" select="substring($chopString,2)"/>
                <xsl:with-param name="punctuation" select="$punctuation"/>
            </xsl:call-template>
        </xsl:when>
        <xsl:when test="not($chopString)"/>
        <xsl:otherwise>
            <xsl:value-of select="$chopString"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>
<!--
    Strip trailing punctuation from $chopString, one character per recursion,
    and write what remains. Differs from chopPunctuation only in its default
    set, which also contains "]".
    chopString  : the text to trim
    punctuation : set of characters treated as punctuation; defaults to ".:,;/] "
-->
<xsl:template name="chopPunctuationBack">
    <xsl:param name="chopString"/>
    <xsl:param name="punctuation">
        <xsl:text>.:,;/] </xsl:text>
    </xsl:param>
    <xsl:variable name="length" select="string-length($chopString)"/>
    <xsl:choose>
        <xsl:when test="$length=0"/>
        <xsl:when test="contains($punctuation, substring($chopString,$length,1))">
            <!-- Recurse into this template. It used to call chopPunctuation,
                 which gave the same result only because $punctuation was
                 forwarded explicitly. -->
            <xsl:call-template name="chopPunctuationBack">
                <xsl:with-param name="chopString" select="substring($chopString,1,$length - 1)"/>
                <xsl:with-param name="punctuation" select="$punctuation"/>
            </xsl:call-template>
        </xsl:when>
        <xsl:when test="not($chopString)"/>
        <xsl:otherwise>
            <xsl:value-of select="$chopString"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>
<!-- nate added 12/14/2007 for lccn.loc.gov: url encode ampersand, etc. --> | |
<!--
    Percent-encode $str one character per recursion.
    Characters found in $safe are copied unchanged. Any other character is
    written as "%" plus two hex digits of its code, which is looked up by
    position in $ascii (base 32) or $latin1 (base 160). A character found in
    neither table triggers a non-terminating message and is encoded as
    code 63.
-->
<xsl:template name="url-encode">
    <xsl:param name="str"/>
    <xsl:if test="$str">
        <xsl:variable name="head" select="substring($str,1,1)"/>
        <xsl:choose>
            <xsl:when test="contains($safe,$head)">
                <xsl:value-of select="$head"/>
            </xsl:when>
            <xsl:otherwise>
                <xsl:variable name="code">
                    <xsl:choose>
                        <xsl:when test="contains($ascii,$head)">
                            <xsl:value-of select="32 + string-length(substring-before($ascii,$head))"/>
                        </xsl:when>
                        <xsl:when test="contains($latin1,$head)">
                            <xsl:value-of select="160 + string-length(substring-before($latin1,$head))"/>
                        </xsl:when>
                        <xsl:otherwise>
                            <xsl:message terminate="no">Warning: string contains a character
that is out of range! Substituting "?".</xsl:message>
                            <xsl:text>63</xsl:text>
                        </xsl:otherwise>
                    </xsl:choose>
                </xsl:variable>
                <!-- High nibble, then low nibble; the + 1 is for 1-based substring(). -->
                <xsl:value-of select="concat('%', substring($hex, floor($code div 16) + 1, 1), substring($hex, $code mod 16 + 1, 1))"/>
            </xsl:otherwise>
        </xsl:choose>
        <!-- Continue with the rest of the string, if any. -->
        <xsl:if test="string-length($str) &gt; 1">
            <xsl:call-template name="url-encode">
                <xsl:with-param name="str" select="substring($str,2)"/>
            </xsl:call-template>
        </xsl:if>
    </xsl:if>
</xsl:template>
</xsl:stylesheet> | |
<!-- Stylus Studio meta-information - (c)1998-2002 eXcelon Corp. | |
<metaInformation> | |
<scenarios/><MapperInfo srcSchemaPath="" srcSchemaRoot="" srcSchemaPathIsRelative="yes" srcSchemaInterpretAsXML="no" destSchemaPath="" destSchemaRoot="" destSchemaPathIsRelative="yes" destSchemaInterpretAsXML="no"/> | |
</metaInformation> | |
--> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
r"""Python script for converting MARC21 to Dublin Core XML.

Usage:
    Command-line
    $ mkdir output_xml
    $ cp marc2dc.py output_xml/.
    $ cd output_xml
    $ python marc2dc.py --marc E:\Research\test-marc-file.mrc
"""
___author__ = "Jeremy Nelson" | |
___modify__ = "Danke wangchunqu" | |
__license__ = "GPLv3" | |
import argparse | |
import datetime | |
import pymarc | |
import urllib.request | |
from lxml import etree | |
# MARC_DC_URL = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slim2RDFDC.xsl' | |
class FileResolver(etree.Resolver):
    """Resolver that answers every lookup by treating the URL as a filename.

    Copied from Stack Overflow:
    http://stackoverflow.com/questions/8831941/lxml-and-xsl-document-function
    """

    def resolve(self, url, pubid, context):
        """Resolve ``url`` through ``resolve_filename``; ``pubid`` is not used."""
        resolved = self.resolve_filename(url, context)
        return resolved
i = 0 | |
# Compiled stylesheets keyed by path, so the XSLT is parsed once per process
# instead of once per record.
_TRANSFORM_CACHE = {}


def _load_transform(xslt_path):
    """Return the compiled XSLT for ``xslt_path``, compiling it on first use."""
    if xslt_path not in _TRANSFORM_CACHE:
        parser = etree.XMLParser()
        # FileResolver lets lxml resolve hrefs referenced by the stylesheet
        # (e.g. an xsl:import) as local files.
        parser.resolvers.add(FileResolver())
        stylesheet = etree.parse(xslt_path, parser=parser)
        _TRANSFORM_CACHE[xslt_path] = etree.XSLT(stylesheet)
    return _TRANSFORM_CACHE[xslt_path]


def run(line, xslt_path="k.xml"):
    """Convert one raw MARC21 record to Dublin Core XML and save it to disk.

    The record is decoded as GBK, converted to MARCXML, run through the
    MARC-to-DC XSLT and written to the current directory as
    ``dc-<001 value>.xml``, or ``dc-marc-<i>.xml`` (``i`` being the
    module-level record counter) when the record has no 001 field.

    Args:
        line: Bytes (or a binary file object) holding a MARC21 record; only
            the first record found is processed.
        xslt_path: Path of the stylesheet to apply. Defaults to ``"k.xml"``,
            the path that used to be hard-coded.

    Returns:
        The name of the file written, or ``None`` when ``line`` held no
        readable record and was skipped.
    """
    reader = pymarc.MARCReader(line, to_unicode=True, file_encoding="gbk")
    # next() with a default covers exhausted input; the None check also covers
    # readers that yield None for a record they could not parse.
    record = next(reader, None)
    if record is None:
        print("Skipping unreadable MARC record #{}".format(i))
        return None

    transform = _load_transform(xslt_path)
    # Convert to MARC XML
    record_xml = etree.XML(pymarc.record_to_xml(record, namespace=True))
    # Transform to Dublin Core RDF XML
    dc_xml = transform(record_xml)

    # Save to DC XML local directory
    if '001' in record:
        dc_filename = "dc-{}.xml".format(record['001'].data)
    else:
        dc_filename = "dc-marc-{}.xml".format(i)
    # Explicit encoding keeps the output independent of the platform default.
    with open(dc_filename, 'w', encoding='utf-8') as dc_file:
        dc_file.write(etree.tostring(dc_xml).decode())
    return dc_filename
# https://www.freeformatter.com/xml-escape.html | |
# https://www.google.com/search?q=MARC21slimUtils.xsl&oq=MARC21slimUtils.xsl&aqs=chrome..69i57.3448j0j7&sourceid=chrome&ie=UTF-8 | |
if __name__ == '__main__':
    # --marc is optional; without it the script reads the same input file
    # that used to be hard-coded here.
    arg_parser = argparse.ArgumentParser(
        description="Convert MARC21 records to Dublin Core XML files")
    arg_parser.add_argument(
        '--marc',
        default="2020汉唐第一期采访数据2020.1.6-科技.iso",
        help='Full Path to MARC21 file')
    args = arg_parser.parse_args()

    start = datetime.datetime.utcnow()
    print("Starting MARC21 to Dublin Core at {}".format(start.isoformat()))
    # The input is read in binary mode, one record per line. `i` is the
    # module-level counter that run() uses to name records without a 001
    # field; it is 1 for the first line.
    with open(args.marc, 'rb') as f:
        for i, line in enumerate(f, start=1):
            run(line)
    end = datetime.datetime.utcnow()
    # total_seconds() keeps whole days, which timedelta.seconds drops.
    print("Finished MARC21 to Dublin Core at {}, total time={} seconds".format(
        end.isoformat(),
        int((end - start).total_seconds())))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment