Created
May 20, 2013 16:34
-
-
Save jeff/5613442 to your computer and use it in GitHub Desktop.
Found via the archive.org Wayback Machine: generate.py, the Python script used to create marcdoc.xml by crawling the Library of Congress MARC documentation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This is a scraper for the LoC website that will parse the human | |
readable HTML documentation for MARC tags, and generate machine | |
readable (XML) documentation. | |
You'll need to have python, elementtree [1] and elementtidy [2] installed | |
before running this script. | |
Questions or comments welcome at ehs@pobox.com or in #code4lib on | |
irc.freenode.net. | |
[1] http://effbot.org/downloads/#elementtree | |
[2] http://effbot.org/downloads/#elementtidy | |
""" | |
from urllib2 import urlopen | |
from elementtidy import TidyHTMLTreeBuilder | |
from elementtree import ElementTree | |
from elementtree.ElementTree import Element, SubElement | |
from xml.sax import parseString, ContentHandler | |
from sys import stderr | |
import re | |
class Field:
    """Represents a field definition.

    Attributes:
        tag: the field tag as a string (e.g. '245').
        name: the human readable field name.
        repeatable: True when the field may repeat; the scraper sets this
            from the (R) / (NR) marker in the field heading.
        description: descriptive text for the field.
        subfields: list of objects providing to_element() and __str__
            (Subfield instances when filled by the scraper).
        indicators: list of objects providing to_element() and __str__
            (Indicator instances when filled by the scraper).
    """

    def __init__(self):
        self.tag = ''
        self.name = ''
        self.repeatable = False
        self.description = ''
        self.subfields = []
        self.indicators = []

    def __str__(self):
        """Convert a field definition to a multi-line string.

        The first line is "<tag> - <name> (REPEATABLE|NOT REPEATABLE)",
        followed by the description, one line per indicator, a
        "Subfields" heading and one line per subfield.
        """
        # The flag records repeatability, so label it as such.
        if self.repeatable:
            flag = "(REPEATABLE)"
        else:
            flag = "(NOT REPEATABLE)"

        parts = ["%s - %s %s" % (self.tag, self.name, flag)]
        parts.append("\n%s\n" % self.description)

        # Indicators are objects, not strings: format them through %s
        # instead of concatenating, which raised TypeError.
        for indicator in self.indicators:
            parts.append("%s\n" % (indicator,))

        parts.append("Subfields\n")
        for subfield in self.subfields:
            parts.append("%s\n" % (subfield,))

        return "".join(parts)

    def to_element(self):
        """Convert a Field to an elementtree Element.

        Returns a <field> element carrying 'tag' and 'repeatable'
        ('true' / 'false') attributes, a <description> child, then the
        elements of every indicator followed by those of every subfield.
        """
        field = Element('field')
        field.attrib['tag'] = self.tag
        if self.repeatable:
            field.attrib['repeatable'] = 'true'
        else:
            field.attrib['repeatable'] = 'false'
        SubElement(field, 'description').text = self.description

        for indicator in self.indicators:
            field.append(indicator.to_element())
        for subfield in self.subfields:
            field.append(subfield.to_element())
        return field
class Subfield:
    """Represents a subfield definition inside a Field.

    Attributes:
        code: the subfield code character.
        description: descriptive text for the subfield.
        repeatable: True when the subfield may repeat; defaults to False
            and is switched on by the scraper for entries marked (R).
    """

    def __init__(self, code='', description=''):
        self.code = code
        self.description = description
        self.repeatable = False

    def __str__(self):
        """Convert a subfield to a one-line string.

        The result is "<code> - <description> (REPEATABLE)" or
        "... (NOT REPEATABLE)".
        """
        # The flag records repeatability, so label it as such.
        if self.repeatable:
            flag = "(REPEATABLE)"
        else:
            flag = "(NOT REPEATABLE)"
        return "%s - %s %s" % (self.code, self.description, flag)

    def to_element(self):
        """Convert a subfield to an elementtree Element.

        Returns a <subfield> element with 'code' and 'repeatable'
        ('true' / 'false') attributes and a <description> child.
        """
        subfield = Element('subfield')
        subfield.attrib['code'] = self.code
        if self.repeatable:
            subfield.attrib['repeatable'] = 'true'
        else:
            subfield.attrib['repeatable'] = 'false'
        SubElement(subfield, 'description').text = self.description
        return subfield
class Indicator:
    """Represents one allowed value of a field indicator.

    Attributes:
        position: which indicator the value belongs to; the scraper
            passes the strings '1' or '2'.
        value: the indicator value; the scraper passes the text matched
            by [0-9\\-]+, so it is a string and need not be a number.
        description: descriptive text for this value.
    """

    def __init__(self, position, value, description):
        self.position = position
        self.value = value
        self.description = description

    def __str__(self):
        """Convert an indicator value to a one-line string."""
        # %s rather than %i: position and value arrive as strings from
        # the scraper, and %i raised TypeError for them. Integers still
        # format identically.
        return "indicator %s value %s - %s" % (self.position, self.value,
                                               self.description)

    def to_element(self):
        """Convert an indicator to an elementtree Element.

        Returns an <indicator> element with 'position' and 'value'
        attributes and a <description> child.
        """
        indicator = Element('indicator')
        indicator.attrib['position'] = self.position
        indicator.attrib['value'] = self.value
        SubElement(indicator, 'description').text = self.description
        return indicator
class HTMLHandler(ContentHandler):
    """SAX handler that collects Field definitions from one page.

    The handler is a small state machine driven by element names with an
    'html:' prefix.  Character data is buffered in self.text while any of
    the inside* flags is set, and inspected when an element ends:

      * <html:a name=...> text matching "NNN - Name (R|NR)" starts a
        new Field.
      * the first <html:p> after that supplies the field description.
      * an <html:h3> reading "Indicators" or "Subfield Codes" switches
        the handler into the corresponding section.
      * <html:li> items in those sections become Indicator / Subfield
        objects on the current field.
      * <html:hr> finishes the current field and resets all state.

    Finished fields are available in self.fields.
    """

    def __init__(self):
        ContentHandler.__init__(self)
        self.fields = []    # completed Field objects, in document order
        self.seen = []      # stack of the names of currently open elements
        self.field = None   # Field being assembled, or None between fields
        self._reset_state()

    def _reset_state(self):
        """Clear the text buffer and every section flag."""
        self.text = ''
        self.insideName = False
        self.insideDescription = False
        self.insideH3 = False
        self.insideSubfields = False
        self.insideIndicators = False
        self.insideIndicator1 = False
        self.insideIndicator2 = False

    def _add_indicator(self, position):
        """Parse the buffered "<value> - <description>" list item and add
        it to the current field as an Indicator for the given position."""
        match = re.match(r'([0-9\-]+) - (.*)', self.text)
        if match:
            self.field.indicators.append(
                Indicator(position, match.group(1), match.group(2)))
        self.text = ''

    def inside(self, tag):
        """Return True if an element named tag is currently open."""
        return tag in self.seen

    def startElement(self, name, attrs):
        """Track the open element and arm the matching capture flag."""
        self.seen.append(name)
        # 'in' instead of has_key(): works for SAX attribute objects and
        # plain mappings alike.
        if name == 'html:a' and 'name' in attrs:
            self.text = ''
            self.insideName = True
        elif name == 'html:p' and self.field and not self.field.description:
            self.text = ''
            self.insideDescription = True
        elif name == 'html:h3' and self.field:
            self.text = ''
            self.insideH3 = True
        elif name == 'html:ul' and self.insideIndicators:
            # The text buffered just before the list says which indicator
            # the following items describe.
            match = re.match(r'(First|Second) - (.+)', self.text)
            if match:
                if match.group(1) == 'First':
                    self.insideIndicator1 = True
                    self.text = ''
                elif match.group(1) == 'Second':
                    self.insideIndicator1 = False
                    self.insideIndicator2 = True
                    self.text = ''

    def endElement(self, name):
        """Act on the text buffered for the element that just closed."""
        self.seen.pop()
        # if we are ending the name section
        if self.insideName and name == 'html:a':
            match = re.match(r'(\d\d\d+) - (.*)(\(N?R\))', self.text)
            if match:
                self.field = Field()
                self.field.tag = match.group(1)
                self.field.name = match.group(2)
                self.field.repeatable = (match.group(3) == '(R)')
            self.text = ''
            self.insideName = False
        # ending the description
        elif self.insideDescription and name == 'html:p':
            self.field.description = self.text
            self.text = ''
            self.insideDescription = False
        # inside indicators
        elif self.insideH3 and self.text == 'Indicators':
            self.insideIndicators = True
            self.insideH3 = False
            self.text = ''
        # add indicator 1 value
        elif self.insideIndicator1 and name == 'html:li':
            self._add_indicator('1')
        # add indicator 2 value
        elif self.insideIndicator2 and name == 'html:li':
            self._add_indicator('2')
        # start of subfields
        elif self.insideH3 and self.text == 'Subfield Codes':
            self.insideSubfields = True
            # This was written as a comparison (==) and so never cleared
            # the flag; assign instead.
            self.insideH3 = False
            self.insideIndicator1 = False
            self.insideIndicator2 = False
            self.insideIndicators = False
            self.text = ''
        # add subfield
        elif self.insideSubfields and name == 'html:li':
            match = re.match(r'\$(.) - (.*)\((N?R)\)(.*)', self.text)
            if match:
                subfield = Subfield()
                subfield.code = match.group(1)
                # keep any text that follows the (R)/(NR) marker
                subfield.description = match.group(2) + match.group(4)
                if match.group(3) == 'R':
                    subfield.repeatable = True
                self.field.subfields.append(subfield)
            self.text = ''
        # end of field definition
        elif name == 'html:hr':
            if self.field:
                self.fields.append(self.field)
            self.field = None
            self._reset_state()

    def characters(self, chars):
        """Buffer character data while any capture flag is set."""
        # Unicode input is stored UTF-8 encoded as before; text that is
        # already a native str is used as is so strip() below gets a
        # matching type.
        if not isinstance(chars, str):
            chars = chars.encode('utf8')
        chars = chars.strip("\n")
        if self.insideName or self.insideDescription or self.insideH3 \
                or self.insideSubfields or self.insideIndicators \
                or self.insideIndicator1 or self.insideIndicator2:
            self.text += chars
def findall(node, pattern):
    """Return every descendant of node whose tag is pattern in the XHTML
    namespace, as reported by node.findall().
    """
    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    qualified = '%s%s' % (xhtml_ns, pattern)
    # './/' searches all levels below node
    return node.findall('.//' + qualified)
def find(node, pattern):
    """Return the first node that findall() reports for pattern, or None
    when there is no match.
    """
    for candidate in findall(node, pattern):
        # the first hit is the answer
        return candidate
    return None
def get_bibliographic_urls():
    """Fetch the bibliographic documentation home page and return the
    href of every link on it that contains 'bibliographic'.

    The hrefs are returned exactly as they appear in the page, in
    document order.
    """
    url = 'http://www.loc.gov/marc/bibliographic/ecbdhome.html'
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # release the connection once the page has been parsed
        response.close()

    urls = []
    for a in findall(tree, 'a'):
        # Anchors without an href used to raise KeyError here; skip them.
        href = a.attrib.get('href')
        if href and 'bibliographic' in href:
            urls.append(href)
    return urls
def extract_fields(url):
    """Looks for field definitions at a URL, and returns all
    found definitions as a list of Field objects.

    The page is parsed into a tree by TidyHTMLTreeBuilder, serialised
    back to an XML string, and that string is run through HTMLHandler
    with the SAX parser.
    """
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # release the connection once the page has been parsed
        response.close()

    xml = ElementTree.tostring(tree.getroot())
    handler = HTMLHandler()
    parseString(xml, handler)
    return handler.fields
# Script entry point: scrape every bibliographic page and dump the result.
if __name__ == '__main__':
    root = Element('fields')
    for page_url in get_bibliographic_urls():
        # progress goes to stderr
        stderr.write("fetching %s\n" % page_url)
        for definition in extract_fields(page_url):
            root.append(definition.to_element())
    # write the collected <fields> tree out
    ElementTree.dump(root)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment