Skip to content

Instantly share code, notes, and snippets.

@jeff
Created May 20, 2013 16:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeff/5613442 to your computer and use it in GitHub Desktop.
Save jeff/5613442 to your computer and use it in GitHub Desktop.
Found via the archive.org Wayback Machine: the generate.py Python script used to create marcdoc.xml by crawling the Library of Congress MARC documentation.
#!/usr/bin/env python
"""
This is a scraper for the LoC website that will parse the human
readable HTML documentation for MARC tags, and generate machine
readable (XML) documentation.
You'll need to have python, elementtree [1] and elementtidy [2] installed
before running this script.
Questions or comments welcome at ehs@pobox.com or in #code4lib on
irc.freenode.net.
[1] http://effbot.org/downloads/#elementtree
[2] http://effbot.org/downloads/#elementtidy
"""
from urllib2 import urlopen
from elementtidy import TidyHTMLTreeBuilder
from elementtree import ElementTree
from elementtree.ElementTree import Element, SubElement
from xml.sax import parseString, ContentHandler
from sys import stderr
import re
class Field:
    """Represents a field definition.

    Attributes:
        tag: the field tag as a string (e.g. '245').
        name: the field name taken from the heading text.
        repeatable: True when the heading was marked (R), False for (NR).
        description: free-text description of the field.
        subfields: list of objects providing to_element() (Subfield).
        indicators: list of objects providing to_element() (Indicator).
    """

    def __init__(self):
        self.tag = ''
        self.name = ''
        self.repeatable = False
        self.description = ''
        self.subfields = []
        self.indicators = []

    def __str__(self):
        """Convert a field definition to a string.

        The layout is: a heading line with tag, name and repeatability,
        the description, one line per indicator, a "Subfields" line and
        one line per subfield.  Every line ends with a newline.
        """
        # The flag records repeatability -- (R) vs (NR) -- not whether
        # the field is required, so label it accordingly.
        if self.repeatable:
            flag = "(REPEATABLE)"
        else:
            flag = "(NOT REPEATABLE)"
        lines = ["%s - %s %s" % (self.tag, self.name, flag),
                 self.description]
        # Indicators and subfields are objects, not strings; format them
        # with %s so their own __str__ is used instead of concatenating.
        lines.extend(["%s" % indicator for indicator in self.indicators])
        lines.append("Subfields")
        lines.extend(["%s" % subfield for subfield in self.subfields])
        return "\n".join(lines) + "\n"

    def to_element(self):
        """Convert a Field to an elementtree Element.

        Returns a <field> element carrying 'tag' and 'repeatable'
        attributes, a <description> child, and then one child per
        indicator followed by one child per subfield.
        """
        field = Element('field')
        field.attrib['tag'] = self.tag
        if self.repeatable:
            field.attrib['repeatable'] = 'true'
        else:
            field.attrib['repeatable'] = 'false'
        SubElement(field, 'description').text = self.description
        for indicator in self.indicators:
            field.append(indicator.to_element())
        for subfield in self.subfields:
            field.append(subfield.to_element())
        return field
class Subfield:
    """Represents a subfield definition inside a Field.

    Attributes:
        code: the subfield code (the character following '$').
        description: free-text description of the subfield.
        repeatable: True when the subfield was marked (R), False for (NR).
    """

    def __init__(self,code='',description=''):
        self.code = code
        self.description = description
        self.repeatable = False

    def __str__(self):
        """Convert a subfield to a string: code, description and flag."""
        # The flag records repeatability, not whether the subfield is
        # required, so label it accordingly.
        if self.repeatable:
            flag = "(REPEATABLE)"
        else:
            flag = "(NOT REPEATABLE)"
        return "%s - %s %s" % (self.code, self.description, flag)

    def to_element(self):
        """Convert a subfield to an elementtree Element.

        Returns a <subfield> element carrying 'code' and 'repeatable'
        attributes and a single <description> child.
        """
        subfield = Element('subfield')
        subfield.attrib['code'] = self.code
        if self.repeatable:
            subfield.attrib['repeatable'] = 'true'
        else:
            subfield.attrib['repeatable'] = 'false'
        SubElement(subfield, 'description').text = self.description
        return subfield
class Indicator:
    """Represents one allowed value of an indicator.

    Attributes:
        position: which indicator the value belongs to ('1' or '2' as
            passed by the parser).
        value: the indicator value text (digits and/or '-').
        description: free-text meaning of the value.
    """

    def __init__(self, position, value, description):
        self.position = position
        self.value = value
        self.description = description

    def __str__(self):
        """Convert an indicator value to a string."""
        # position and value are stored as strings, so they must be
        # formatted with %s; %i raises TypeError for non-numeric types.
        return "indicator %s value %s - %s" % (self.position, self.value,
                                               self.description)

    def to_element(self):
        """Convert an indicator to an elementtree Element.

        Returns an <indicator> element carrying 'position' and 'value'
        attributes and a single <description> child.
        """
        indicator = Element('indicator')
        indicator.attrib['position'] = self.position
        indicator.attrib['value'] = self.value
        SubElement(indicator, 'description').text = self.description
        return indicator
class HTMLHandler(ContentHandler):
    """SAX handler that turns a documentation page into Field objects.

    The handler is a small state machine driven by element names of the
    form 'html:<tag>' (presumably the prefix the serialized XHTML tree
    carries -- confirm against the serializer in use).  Completed field
    definitions are collected in self.fields.
    """

    # Patterns are compiled once; raw strings keep the backslashes
    # literal without relying on unrecognised string escapes.
    _FIELD_HEADING_RE = re.compile(r'(\d\d\d+) - (.*)(\(N?R\))')
    _INDICATOR_HEADING_RE = re.compile(r'(First|Second) - (.+)')
    _INDICATOR_VALUE_RE = re.compile(r'([0-9\-]+) - (.*)')
    _SUBFIELD_RE = re.compile(r'\$(.) - (.*)\((N?R)\)(.*)')

    def __init__(self):
        ContentHandler.__init__(self)
        self.fields = []                # completed Field objects
        self.seen = []                  # stack of currently open elements
        self.text = ''                  # character data collected so far
        self.insideName = False         # within <a name=...>
        self.insideDescription = False  # within the field's first <p>
        self.insideH3 = False           # within a section heading
        self.insideSubfields = False    # after the "Subfield Codes" heading
        self.insideIndicators = False   # after the "Indicators" heading
        self.insideIndicator1 = False   # within the "First" indicator list
        self.insideIndicator2 = False   # within the "Second" indicator list
        self.field = None               # Field currently being built

    def inside(self, tag):
        """Return True if an element named tag is currently open."""
        return tag in self.seen

    def startElement(self, name, attrs):
        """Track the open element and switch parser state on key tags."""
        self.seen.append(name)
        # get() is available on SAX attribute objects and plain mappings
        # alike, unlike has_key().
        if name == 'html:a' and attrs.get('name') is not None:
            self.text = ''
            self.insideName = True
        elif name == 'html:p' and self.field and not self.field.description:
            self.text = ''
            self.insideDescription = True
        elif name == 'html:h3' and self.field:
            self.text = ''
            self.insideH3 = True
        elif name == 'html:ul' and self.insideIndicators:
            # The text collected before a nested list names the
            # indicator ("First - ..." / "Second - ...") it belongs to.
            match = self._INDICATOR_HEADING_RE.match(self.text)
            if match:
                if match.group(1) == 'First':
                    self.insideIndicator1 = True
                    self.text = ''
                elif match.group(1) == 'Second':
                    self.insideIndicator1 = False
                    self.insideIndicator2 = True
                    self.text = ''

    def endElement(self, name):
        """Consume the collected text according to the current state.

        The branches are tested in a fixed order and only the first
        matching one runs, so their order is significant.
        """
        self.seen.pop()
        # if we are ending the name section
        if self.insideName and name == 'html:a':
            match = self._FIELD_HEADING_RE.match(self.text)
            if match:
                self.field = Field()
                self.field.tag = match.group(1)
                self.field.name = match.group(2)
                self.field.repeatable = (match.group(3) == '(R)')
            self.text = ''
            self.insideName = False
        # ending the description
        elif self.insideDescription and name == 'html:p':
            self.field.description = self.text
            self.text = ''
            self.insideDescription = False
        # inside indicators
        elif self.insideH3 and self.text == 'Indicators':
            self.insideIndicators = True
            self.insideH3 = False
            self.text = ''
        # add indicator 1 value
        elif self.insideIndicator1 and name == 'html:li':
            self._add_indicator('1')
        # add indicator 2 value
        elif self.insideIndicator2 and name == 'html:li':
            self._add_indicator('2')
        # start of subfields
        elif self.insideH3 and self.text == 'Subfield Codes':
            self.insideSubfields = True
            # Assignment, not comparison: the heading state must
            # actually be cleared once the heading has been handled.
            self.insideH3 = False
            self.insideIndicator1 = False
            self.insideIndicator2 = False
            self.insideIndicators = False
            self.text = ''
        # add subfield
        elif self.insideSubfields and name == 'html:li':
            match = self._SUBFIELD_RE.match(self.text)
            if match:
                subfield = Subfield()
                subfield.code = match.group(1)
                # the description is whatever surrounds the (R)/(NR) marker
                subfield.description = match.group(2) + match.group(4)
                if match.group(3) == 'R':
                    subfield.repeatable = True
                self.field.subfields.append(subfield)
            self.text = ''
        # end of field definition
        elif name == 'html:hr':
            if self.field:
                self.fields.append(self.field)
            self.field = None
            self.insideName = False
            self.insideDescription = False
            self.insideH3 = False
            self.insideSubfields = False
            self.insideIndicators = False
            self.insideIndicator1 = False
            self.insideIndicator2 = False
            self.text = ''

    def characters(self, chars):
        """Append character data to self.text while any state is active."""
        # Keep the text in the interpreter's native str type: encode
        # only when handed a non-native (unicode) string, so already
        # native strings are never re-encoded.
        if not isinstance(chars, str):
            chars = chars.encode('utf8')
        chars = chars.strip("\n")
        if self.insideName or self.insideDescription or self.insideH3 \
                or self.insideSubfields or self.insideIndicators \
                or self.insideIndicator1 or self.insideIndicator2:
            self.text += chars

    def _add_indicator(self, position):
        """Record the collected "<value> - <description>" text, if any,
        as a value of the indicator at position, then clear the text."""
        match = self._INDICATOR_VALUE_RE.match(self.text)
        if match:
            self.field.indicators.append(
                Indicator(position, match.group(1), match.group(2)))
        self.text = ''
def findall(node, pattern):
    """Return every descendant of node whose tag is pattern in the
    XHTML namespace, searching the whole subtree.
    """
    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    path = './/%s%s' % (xhtml_ns, pattern)
    return node.findall(path)
def find(node, pattern):
    """Return the first node that findall() yields for pattern beneath
    node, or None when there is no such node.
    """
    matches = findall(node, pattern)
    if not matches:
        return None
    return matches[0]
def get_bibliographic_urls ():
    """Collect the links on the bibliographic index page.

    Fetches the index page, parses it, and returns a list with the href
    of every <a> element whose href contains 'bibliographic', in
    document order.
    """
    url = 'http://www.loc.gov/marc/bibliographic/ecbdhome.html'
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # release the connection even if parsing fails
        response.close()
    urls = []
    for a in findall(tree,'a'):
        # Anchors without an href (e.g. named anchors) are skipped
        # instead of raising KeyError.
        href = a.attrib.get('href')
        if href and 'bibliographic' in href:
            urls.append(href)
    return urls
def extract_fields (url):
    """Looks for field definitions at a URL, and returns all
    found definitions as a list of Field objects.

    The page is fetched and parsed into a tree, the tree is serialized
    to XML, and that XML is run through HTMLHandler with a SAX parser.
    """
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # release the connection even if parsing fails
        response.close()
    xml = ElementTree.tostring(tree.getroot())
    handler = HTMLHandler()
    parseString(xml,handler)
    return handler.fields
# Script entry point: crawl every linked page, gather the field
# definitions under one <fields> root, and dump the result.
if __name__ == '__main__':
    root = Element('fields')
    for page_url in get_bibliographic_urls():
        # progress goes to stderr
        stderr.write("fetching %s\n" % page_url)
        elements = [definition.to_element()
                    for definition in extract_fields(page_url)]
        for element in elements:
            root.append(element)
    ElementTree.dump(root)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment