Created
December 22, 2009 02:46
-
-
Save anarchivist/261457 to your computer and use it in GitHub Desktop.
Grab records from a CSV file to do lookups against the Virtual International Authority File (VIAF).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
import time | |
import string | |
import sys | |
import urllib | |
import urllib2 | |
import pymarc | |
# Character class matching any single ASCII punctuation character.  The
# punctuation set goes through re.escape() so that ']', '\', '^' and '-' are
# always literals instead of relying on how they happen to sit in the class.
PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Base URL used for both name searches and authority-record fetches.
# NOTE(review): the VIAF maintainers describe orlabs.oclc.org as their
# experimental server and recommend the production host viaf.org; confirm the
# production URL layout before switching hosts.
VIAF_URL = 'http://orlabs.oclc.org/viaf/'

# Search prefix; the quoted, URL-encoded name is appended directly after it.
QUERY_BASE = VIAF_URL + 'search/VIAF?query=local.personalName+all+'

# Query-string tail appended after the name.  The stylesheet is referenced as
# "results.xsl" (lower case): the maintainers reported that "Results.xsl"
# does not exist and caused an internal 404 during server-side rendering.
QUERY_PARAMS = (
    '+&version=1.1'
    '&maximumRecords=100'
    '&operation=searchRetrieve'
    '&stylesheet=/viaf/xsl/results.xsl'
    '&sortKeys=holdingscount'
    '&recordSchema=BriefMarcXML'
)
def normalize(instr):
    """Return *instr* as a URL-encoded query term.

    Every punctuation character matched by PUNC_RE is replaced with a space,
    leading and trailing whitespace is stripped, and the result is encoded
    with ``urllib.quote_plus``.
    """
    cleaned = PUNC_RE.sub(' ', instr).strip()
    return urllib.quote_plus(cleaned)
def grab_response(name):
    """Run a VIAF name search for *name* and return the open response.

    The name is passed through ``normalize``, wrapped in URL-encoded double
    quotes (``%22``) and placed between QUERY_BASE and QUERY_PARAMS.

    Returns the file-like object produced by ``urllib.urlopen``; the caller
    is responsible for reading it.
    """
    url = '%s%%22%s%%22%s' % (QUERY_BASE, normalize(name), QUERY_PARAMS)
    return urllib.urlopen(url)
def do_it(r):
    """Print the string form of *r* to stdout, encoded as UTF-8.

    A byte-string result is decoded as UTF-8 (undecodable bytes are dropped)
    rather than through the implicit ASCII codec, so records containing
    non-ASCII text no longer raise UnicodeDecodeError.
    """
    text = r.__str__()
    if isinstance(text, bytes):
        # On Python 2 ``bytes`` is ``str``: decode explicitly, not as ASCII.
        text = text.decode('utf-8', 'ignore')
    print(text.encode("utf-8", 'ignore'))
def is_not_none(n):
    """Return True unless *n* is ``None``.

    Falsy values such as ``0``, ``''`` and ``[]`` still count as present;
    only the ``None`` singleton is rejected.
    """
    return n is not None
def grab_auth(identifier):
    """Fetch the VIAF authority record for *identifier*.

    *identifier* is appended verbatim to VIAF_URL.  Returns the file-like
    object produced by ``urllib.urlopen``; the caller is responsible for
    reading it.
    """
    return urllib.urlopen(VIAF_URL + identifier)
def _emit(writer, fields):
    """Write the 4-tuple *fields* as one CSV row and echo it to stdout."""
    writer.writerow(fields)
    print("%s, %s, %s, %s" % fields)


def _process(reader, writer):
    """Look up every name from *reader* in VIAF and record the LC match.

    *reader* yields dicts with the keys ``Uniform_Author_Name`` and ``LOC``;
    *writer* is a ``csv.writer``.  For each input row the search results are
    scanned in order and the first record whose 001 field starts with ``LC``
    is written out.  A record that cannot be inspected or written produces a
    ``Fail`` row and the scan moves on to the next record.
    """
    writer.writerow(('InName', 'LOC', 'ParsedName', 'ParsedId'))
    for row in reader:
        xh = pymarc.XmlHandler()
        inname = row['Uniform_Author_Name']
        lc = row['LOC']
        pymarc.parse_xml(grab_response(inname), xh)
        xh.records = filter(is_not_none, xh.records)
        for rec in xh.records:
            try:
                identifier = rec['001'].data
                if identifier.startswith('LC'):
                    pn = rec['100'].format_field()
                    _emit(writer, (inname, lc, pn, identifier))
                    break
            except Exception:
                # Best effort: note the failure and keep scanning.  Catching
                # Exception (not a bare except) lets Ctrl-C stop the run.
                _emit(writer, (inname, lc, 'Fail', 'Fail'))
        # NOTE(review): indentation was lost in the source; the pause is
        # assumed to apply once per input row (one search request per row).
        time.sleep(1)


def main():
    """Read names from the CSV in argv[1] and write matches to argv[2]."""
    if len(sys.argv) < 3:
        sys.exit('usage: %s INPUT_CSV OUTPUT_CSV' % sys.argv[0])
    # The files are closed by the with-blocks; csv.writer objects themselves
    # have no close() method.
    with open(sys.argv[1], 'rU') as infile:
        with open(sys.argv[2], 'wt') as outfile:
            _process(csv.DictReader(infile), csv.writer(outfile))


if __name__ == '__main__':
    main()
In the meantime, I've made a softlink for Results.xsl, so your code should stop failing.
Ralph
This is an old script that predates the production service - but I will try to find time to factor in your changes. Thanks!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Would you mind making a couple of changes?
You point at orlabs.oclc.org. That's our experimental server. It would be better if you pointed at viaf.org, which is our production service. test.viaf.org is available, but is not as reliable and we do sometimes put up small experimental versions of the viaf database there.
You reference the stylesheet Results.xsl. There's no such stylesheet, and server-side rendering results in an internal 404 on that stylesheet and probably no response to your request. (It was in tracking down our internal 404 that I ran across your code.) If you change the reference to results.xsl, you should get better results.
Let me know if I can do anything to help.
Ralph