Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Created March 20, 2012 20:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lawlesst/2141226 to your computer and use it in GitHub Desktop.
Save lawlesst/2141226 to your computer and use it in GitHub Desktop.
Attempts to map OpenURLs to BibJSON
"""
Attempts to parse raw OpenURLs to the BibJSON convention.
"""
import urllib
import urlparse
import sys
import json
from pprint import pprint
def pull_oclc(odict):
    """
    Pull the OCLC number from an incoming FirstSearch/WorldCat OpenURL.

    odict -- parsed query mapping each key to a list of values (the shape
             produced by parse_qs).

    Returns the first run of digits found in the first 'rfe_dat' value as
    a string. Returns None when the first 'rfr_id' value does not contain
    'firstsearch', or when 'rfe_dat' is missing or holds no digits.
    """
    import re

    # `or ['']` also covers a key that is present with an empty value list.
    referrer = (odict.get('rfr_id') or [''])[0]
    if 'firstsearch' not in referrer:
        return None

    # Only a real digit match counts. A placeholder or the raw,
    # unparsed rfe_dat blob must never be handed back as an OCLC number.
    raw = (odict.get('rfe_dat') or [''])[0]
    match = re.search(r'\d+', raw)
    if match:
        return match.group()
    return None
def initialize_id():
    """
    Build a blank identifier record with the two keys BibJSON expects.

    Returns a new dict on every call with 'id' and 'type' both set to None.
    """
    return dict.fromkeys(('id', 'type'))
def pull_and_map(key_list, cite):
    """
    Look up each key of key_list in cite, in order, and return the first hit.

    cite maps keys to lists of strings. The result is the first element of
    the first non-empty list found, with surrounding whitespace stripped.
    Returns None when none of the keys has a non-empty value list.
    """
    candidates = (cite.get(key) for key in key_list)
    for values in candidates:
        if values:
            return values[0].strip()
    return None
def _identifier_for(key, value):
    """
    Return a BibJSON identifier dict for one OpenURL key/value pair, or
    None when the pair does not carry a usable identifier.
    """
    id_type = None
    id_value = None
    if key in ('rft.issn', 'issn'):
        id_type, id_value = 'issn', value
    elif key in ('rft.isbn', 'isbn'):
        id_type, id_value = 'isbn', value
    elif key in ('rft.id', 'rft_id', 'id'):
        # Prefixes are sliced off. str.lstrip() strips a *set of
        # characters*, not a prefix, and can eat into the identifier.
        if value.startswith('info:doi/'):
            doi = value[len('info:doi/'):]
            if doi:
                id_type, id_value = 'doi', "doi:%s" % doi
        elif value.startswith('info:pmid/'):
            # Stored bare, like every other pmid form handled here.
            id_type, id_value = 'pmid', value[len('info:pmid/'):]
        #From the wild: id=pmid:21080734&sid=Entrez:PubMed
        elif value.startswith('pmid:'):
            id_type, id_value = 'pmid', value[len('pmid:'):]
    #Other ids from the wild
    elif key == 'pmid':
        id_type, id_value = 'pmid', value
    elif key == 'doi':
        id_type, id_value = 'doi', "doi:%s" % value

    if not (id_type and id_value):
        return None
    identifier = initialize_id()
    identifier['type'] = id_type
    identifier['id'] = id_value
    return identifier


def _author_for(key, value, cite):
    """
    Build one author dict from an author key/value pair, filling in the
    last and first name parts found elsewhere in the parsed query.
    """
    author = {}
    #If it's a full name, set here.
    if key in ('rft.au', 'au'):
        author['name'] = value
    lastname = pull_and_map(['rft.aulast', 'aulast'], cite)
    if lastname:
        author['lastname'] = lastname
    firstname = pull_and_map(['rft.aufirst', 'aufirst'], cite)
    if firstname:
        author['firstname'] = firstname
    #Put the full name together now if we can.
    if 'name' not in author and lastname and firstname:
        author['name'] = "%s %s" % (firstname, lastname)
    return author


def openurl(query):
    """
    An attempt to map raw OpenURLs to BibJSON convention.
    Loosely based on this nice PHP openurl parser from Rod Page:
    http://code.google.com/p/bioguid/source/search?q=openurl&origq=openurl&btnG=Search+Trunk

    query -- URL-encoded OpenURL query string (key=value pairs joined
             with '&').

    Returns a dict that always contains:
      'author'     -- list of author dicts ('name', 'lastname', 'firstname'
                      as available)
      'identifier' -- list of {'type': ..., 'id': ...} dicts, no duplicates
      'type'       -- 'book', 'article' or 'unknown'
    plus whichever of 'title', 'booktitle', 'journal', 'publisher',
    'volume', 'issue', 'year', 'pages', 'start_page', 'end_page',
    'sub_type' and 'bul:rfr' the query supplied.
    """
    # Function-scope import so the parser runs on both Python 2 (urlparse)
    # and Python 3 (urllib.parse) without touching the module imports.
    try:
        from urlparse import parse_qs
    except ImportError:
        from urllib.parse import parse_qs

    #Load query into dictionary
    cite = parse_qs(query)
    #Some fields will have multiple values
    referent = {'author': [], 'identifier': []}
    # Evidence for the record type is collected as flags and resolved once
    # after the loop. Assigning the type while iterating made the result
    # depend on dict ordering (e.g. a 'title' key seen after 'rft.btitle'
    # turned a book into an article).
    saw_book = False
    saw_article = False

    for k, value_list in cite.items():
        for v in value_list:
            identifier = _identifier_for(k, v)
            if identifier is not None:
                #Make sure this pair isn't already there.
                if identifier not in referent['identifier']:
                    referent['identifier'].append(identifier)
                continue

            #Get type
            if k == 'rft_val_fmt':
                if v == 'info:ofi/fmt:kev:mtx:journal':
                    saw_article = True
                elif v == 'info:ofi/fmt:kev:mtx:book':
                    saw_book = True
            #Article title
            elif k in ('rft.atitle', 'atitle'):
                referent['title'] = v
                saw_article = True
            #Book title
            elif k in ('rft.btitle', 'btitle'):
                referent['booktitle'] = v
                saw_book = True
            #Journal title
            elif k in ('rft.jtitle', 'rft.title', 'title'):
                journal = {'name': v}
                #Try to pull short title code.
                stitle = cite.get('rft.stitle') or cite.get('stitle')
                if stitle:
                    journal['shortcode'] = stitle[0]
                referent['journal'] = journal
                saw_article = True
            #Publisher - doesn't seem to be standard but found in the wild.
            elif k in ('rft.pub', 'pub'):
                referent['publisher'] = v
            #Authors
            elif k in ('rft.au', 'au', 'rft.aulast', 'aulast'):
                author = _author_for(k, v, cite)
                # 'rft.aulast' and 'aulast' together would otherwise add
                # the same person twice.
                if author not in referent['author']:
                    referent['author'].append(author)
            #Volume
            elif k in ('rft.volume', 'volume'):
                referent['volume'] = v
            #Issue
            elif k in ('rft.issue', 'issue'):
                referent['issue'] = v
            #Date/Year
            elif k in ('rft.date', 'date'):
                referent['year'] = v[:4]
            #Pages are gross
            elif k in ('rft.pages', 'pages'):
                referent['pages'] = v
            elif k in ('rft.spage', 'spage'):
                referent['start_page'] = v
            elif k in ('rft.epage', 'epage'):
                referent['end_page'] = v
            #Look at the type term
            elif k == 'type':
                if v == 'book':
                    saw_book = True
                elif v == 'article':
                    saw_article = True
                else:
                    referent['sub_type'] = v
            #Referers or sids
            elif k in ('rfr_id', 'sid'):
                referent['bul:rfr'] = v

    # A book signal outranks an article signal, matching the precedence the
    # explicit 'type' key has always had.
    if saw_book:
        referent['type'] = 'book'
    elif saw_article:
        referent['type'] = 'article'
    else:
        referent['type'] = 'unknown'

    #Add oclc ids - non standard so handle here
    oclc = pull_oclc(cite)
    if oclc:
        referent['identifier'].append({'type': 'oclc',
                                       'id': oclc})
    return referent
import unittest
class TestBibJSON(unittest.TestCase):
    """
    Exercises openurl() with OpenURL query strings captured from real
    referrers (WorldCat/FirstSearch, EBSCO, Web of Knowledge, MetaLib).
    """

    def test_book_from_worldcat(self):
        q = 'rft.pub=W+H+Freeman+%26+Co&rft.btitle=Introduction+to+Genetic+Analysis.&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&isbn=9781429233231&req_dat=%3Csessionid%3E0%3C%2Fsessionid%3E&title=Introduction+to+Genetic+Analysis.&pid=%3Caccession+number%3E277200522%3C%2Faccession+number%3E%3Cfssessid%3E0%3C%2Ffssessid%3E&rft.date=2008&genre=book&rft_id=urn%3AISBN%3A9781429233231&openurl=sid&rfe_dat=%3Caccessionnumber%3E277200522%3C%2Faccessionnumber%3E&rft.isbn=9781429233231&url_ver=Z39.88-2004&date=2008&rfr_id=info%3Asid%2Ffirstsearch.oclc.org%3AWorldCat&id=doi%3A&rft.genre=book'
        bib = openurl(q)
        self.assertEqual(bib['type'], 'book')
        self.assertEqual(bib['booktitle'],
                         'Introduction to Genetic Analysis.')
        self.assertEqual(bib['year'], '2008')
        self.assertTrue({'type': 'oclc',
                         'id': '277200522'} in bib['identifier'])

    def test_article(self):
        q = 'volume=16&genre=article&spage=538&sid=EBSCO:aph&title=Current+Pharmaceutical+Design&date=20100211&issue=5&issn=13816128&pid=&atitle=Targeting+%ce%b17+Nicotinic+Acetylcholine+Receptors+in+the+Treatment+of+Schizophrenia.'
        bib = openurl(q)
        self.assertEqual(bib['journal']['name'],
                         'Current Pharmaceutical Design')
        self.assertEqual(bib['year'],
                         '2010')
        self.assertTrue({'type': 'issn',
                         'id': '13816128'} in bib['identifier'])

    def test_article_stitle(self):
        q = 'rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/www.isinet.com:WoK:UA&rft.spage=30&rft.issue=1&rft.epage=42&rft.title=INTEGRATIVE%20BIOLOGY&rft.aulast=Castillo&url_ctx_fmt=info:ofi/fmt:kev:mtx:ctx&rft.date=2009&rft.volume=1&url_ver=Z39.88-2004&rft.stitle=INTEGR%20BIOL&rft.atitle=Manipulation%20of%20biological%20samples%20using%20micro%20and%20nano%20techniques&rft.au=Svendsen%2C%20W&rft_id=info:doi/10%2E1039%2Fb814549k&rft.auinit=J&rft.issn=1757-9694&rft.genre=article'
        bib = openurl(q)
        self.assertEqual(bib['title'],
                         'Manipulation of biological samples using micro and nano techniques')
        self.assertEqual(bib['journal']['shortcode'],
                         'INTEGR BIOL')

    def test_article_no_full_author(self):
        # Only aulast/aufirst are supplied (no 'au'), so the full name has
        # to be assembled from the parts. aufirst arrives with a leading
        # space that must be stripped.
        q = 'issn=1040676X&aulast=Wallace&title=Chronicle%20of%20Philanthropy&pid=<metalib_doc_number>000117190</metalib_doc_number><metalib_base_url>http://sfx.brown.edu:8331</metalib_base_url><opid></opid>&sid=metalib:EBSCO_APH&__service_type=&volume=17&genre=&sici=&epage=23&atitle=Where%20Should%20the%20Money%20Go%3F&date=2005&isbn=&spage=9&issue=24&id=doi:&auinit=&aufirst=%20Nicole'
        bib = openurl(q)
        self.assertEqual(len(bib['author']), 1)
        author = bib['author'][0]
        self.assertEqual(author['lastname'], 'Wallace')
        self.assertEqual(author['firstname'], 'Nicole')
        self.assertEqual(author['name'], 'Nicole Wallace')
        self.assertEqual(bib['journal']['name'],
                         'Chronicle of Philanthropy')
        self.assertEqual(bib['year'], '2005')
        self.assertTrue({'type': 'issn',
                         'id': '1040676X'} in bib['identifier'])
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment