Created
December 22, 2009 02:46
-
-
Save anarchivist/261457 to your computer and use it in GitHub Desktop.
Grab records from a CSV file to do lookups against the Virtual International Authority File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
| import re | |
| import time | |
| import string | |
| import sys | |
| import urllib | |
| import urllib2 | |
| import pymarc | |
# Matches any single ASCII punctuation character.  string.punctuation is
# passed through re.escape() because it contains characters that are special
# inside a character class: unescaped, its "\]" pair is read as an escaped
# "]" and the backslash itself is silently left out of the set.
PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Base URL that the search and authority-record URLs below are built from.
VIAF_URL = 'http://orlabs.oclc.org/viaf/'

# Search URL prefix; the quoted, URL-encoded name is appended directly after
# the trailing "+".
QUERY_BASE = VIAF_URL + 'search/VIAF?query=local.personalName+all+'

# Fixed query-string tail appended after the quoted name.
QUERY_PARAMS = (
    '+&version=1.1'
    '&maximumRecords=100'
    '&operation=searchRetrieve'
    '&stylesheet=/viaf/xsl/Results.xsl'
    '&sortKeys=holdingscount'
    '&recordSchema=BriefMarcXML'
)
def normalize(instr):
    """Return *instr* prepared for use inside a query URL.

    Every character matched by PUNC_RE is replaced with a single space,
    leading and trailing whitespace is stripped, and the result is
    URL-encoded with urllib.quote_plus (spaces become "+").
    """
    depunctuated = PUNC_RE.sub(' ', instr)
    trimmed = depunctuated.strip()
    return urllib.quote_plus(trimmed)
def grab_response(name):
    """Open the search URL for *name* and return the response object.

    The name is passed through normalize(), wrapped in URL-encoded double
    quotes (%22) and placed between QUERY_BASE and QUERY_PARAMS.  The
    file-like object returned by urllib.urlopen is handed back unread;
    this function does not close it.
    """
    quoted_name = '%%22%s%%22' % normalize(name)
    search_url = '%s%s%s' % (QUERY_BASE, quoted_name, QUERY_PARAMS)
    return urllib.urlopen(search_url)
def do_it(r):
    """Print the string form of *r* to stdout as UTF-8 bytes.

    The value of r.__str__() is converted to unicode and then encoded as
    UTF-8 with the 'ignore' error handler before being printed.
    """
    text = unicode(r.__str__())
    # Parenthesised single-argument form prints the same thing as the
    # Python 2 print statement.
    print(text.encode("utf-8", 'ignore'))
def is_not_none(n):
    """Return True unless *n* is the None singleton.

    Falsy values such as 0, '' and [] still count as "not None".
    """
    return not (n is None)
def grab_auth(identifier):
    """Open the URL formed by VIAF_URL + *identifier* and return the response.

    The file-like object returned by urllib.urlopen is handed back unread;
    this function does not close it.
    """
    return urllib.urlopen(VIAF_URL + identifier)
# Command-line driver.
#
#   sys.argv[1]: input CSV; the 'Uniform_Author_Name' and 'LOC' columns are read.
#   sys.argv[2]: output CSV; receives the header row plus one row per match
#                or per failed record.
#
# For every input row the name is searched via grab_response(), the result is
# parsed with pymarc, and the first record whose 001 field starts with 'LC'
# is written out together with its formatted 100 field.
#
# Both files are managed by ``with`` so they are flushed and closed even if a
# lookup raises; csv.writer objects have no close() method of their own.
# The statements are nested (not combined) to stay valid on Python 2.6.
# Input keeps 'rU' (universal newlines); output uses 'wb' because the
# Python 2 csv module expects binary mode for the file it writes to.
with open(sys.argv[1], 'rU') as infile:
    with open(sys.argv[2], 'wb') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.writer(outfile)
        writer.writerow(('InName', 'LOC', 'ParsedName', 'ParsedId'))
        for row in reader:
            xh = pymarc.XmlHandler()
            inname = row['Uniform_Author_Name']
            lc = row['LOC']

            # Close the HTTP response as soon as parsing is done instead of
            # leaking one open connection per input row.
            response = grab_response(inname)
            try:
                pymarc.parse_xml(response, xh)
            finally:
                response.close()

            # Drop None entries before iterating over the parsed records.
            xh.records = filter(is_not_none, xh.records)
            for rec in xh.records:
                try:
                    identifier = rec['001'].data
                    if identifier.startswith('LC'):
                        pn = rec['100'].format_field()
                        writer.writerow((inname, lc, pn, identifier))
                        print("%s, %s, %s, %s" % (inname, lc, pn, identifier))
                        # Only the first 'LC' record per name is kept.
                        break
                except Exception:
                    # Best effort: a record that cannot be read or written is
                    # logged as a 'Fail' row and the scan moves on to the
                    # next record.  Catching Exception rather than using a
                    # bare except lets Ctrl-C and SystemExit stop the run.
                    identifier = 'Fail'
                    pn = 'Fail'
                    writer.writerow((inname, lc, pn, identifier))
                    print("%s, %s, %s, %s" % (inname, lc, pn, identifier))

            # Pause between lookups so the remote service is not hammered.
            time.sleep(1)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is an old script that predates the production service - but I will try to find time to factor in your changes. Thanks!