# lawlesst/openurl_to_bibjson.py

## openurl_to_bibjson.py
"""
Attempts to parse raw OpenURLs to the BibJSON convention.
"""

import json
import sys
import urllib
from pprint import pprint

try:
    import urlparse
except ImportError:
    # Python 3 moved this module; alias it so urlparse.* calls keep working.
    import urllib.parse as urlparse


def pull_oclc(odict):
    """
    Pull the OCLC number from an incoming FirstSearch/WorldCat OpenURL.

    Args:
        odict: mapping of OpenURL keys to lists of values (the shape
            produced by parse_qs).

    Returns:
        The first run of digits in the first 'rfe_dat' value, as a string,
        when the first 'rfr_id' value contains 'firstsearch'; otherwise
        None.
    """
    import re

    referrer = (odict.get('rfr_id') or [''])[0]
    if 'firstsearch' not in referrer:
        return None
    accession = (odict.get('rfe_dat') or [''])[0]
    # Only a run of digits counts: a missing or digit-free rfe_dat yields
    # None rather than placeholder text that would look like an identifier.
    match = re.search(r'\d+', accession)
    if match:
        return match.group()
    return None

def initialize_id():
    """
    Build an empty identifier record in the shape BibJSON expects.

    Returns a new dict on every call, with 'id' and 'type' both None.
    """
    return dict.fromkeys(('id', 'type'))

def pull_and_map(key_list, cite):
    """
    Return the first available value for any of the given keys.

    Keys are tried in order.  For the first key whose entry in ``cite`` is
    non-empty, the first element of that entry is returned with surrounding
    whitespace stripped.  Returns None when no key yields a value.
    """
    for name in key_list:
        values = cite.get(name)
        if not values:
            continue
        return values[0].strip()
    return None


def _strip_prefix(value, prefix):
    """
    Return value with a leading prefix removed, or unchanged if absent.
    """
    # Slice rather than str.lstrip(): lstrip() removes any leading characters
    # drawn from its argument, so it can also eat the start of the identifier.
    if value.startswith(prefix):
        return value[len(prefix):]
    return value


def _add_identifier(identifiers, id_type, id_value):
    """
    Append a {'type', 'id'} record to identifiers unless it is empty or
    already present.
    """
    if not (id_type and id_value):
        return
    record = initialize_id()
    record['type'] = id_type
    record['id'] = id_value
    if record not in identifiers:
        identifiers.append(record)


def openurl(query):
    """
    Map a raw OpenURL query string onto the BibJSON convention.

    Loosely based on this PHP OpenURL parser from Rod Page:
    http://code.google.com/p/bioguid/source/search?q=openurl&origq=openurl&btnG=Search+Trunk

    Both the ``rft.``-prefixed keys and their unprefixed forms are read.

    Args:
        query: URL-encoded query string, without the leading '?'.

    Returns:
        A dict that always contains:
          'author'     - list of dicts with 'name', 'lastname' and/or
                         'firstname'
          'identifier' - list of unique {'type': ..., 'id': ...} dicts
          'type'       - 'article', 'book' or 'unknown'
        and, when present in the query: 'title', 'booktitle', 'journal',
        'publisher', 'volume', 'issue', 'year', 'pages', 'start_page',
        'end_page', 'sub_type' and 'bul:rfr'.
    """
    # Load query into a dictionary of key -> list of values.
    cite = urlparse.parse_qs(query)
    # Some fields will have multiple values.
    referent = {'author': [], 'identifier': []}
    identifiers = referent['identifier']

    # Evidence about the citation type is collected here and resolved after
    # the loop, so the answer does not depend on the order the keys are
    # visited in.
    fmt_type = None
    saw_book = False
    saw_article = False

    for k, value_list in cite.items():
        for v in value_list:
            # Declared metadata format.
            if k == 'rft_val_fmt':
                if v == 'info:ofi/fmt:kev:mtx:journal':
                    fmt_type = 'article'
                elif v == 'info:ofi/fmt:kev:mtx:book':
                    fmt_type = 'book'
            # Article title
            elif k in ('rft.atitle', 'atitle'):
                referent['title'] = v
                saw_article = True
            # Book title
            elif k in ('rft.btitle', 'btitle'):
                referent['booktitle'] = v
                saw_book = True
            # Journal title
            elif k in ('rft.jtitle', 'rft.title', 'title'):
                journal = {'name': v}
                # Try to pull the short title code.
                stitle = cite.get('rft.stitle') or cite.get('stitle')
                if stitle:
                    journal['shortcode'] = stitle[0]
                referent['journal'] = journal
                saw_article = True
            # ISSN
            elif k in ('rft.issn', 'issn'):
                _add_identifier(identifiers, 'issn', v)
            # ISBN
            elif k in ('rft.isbn', 'isbn'):
                _add_identifier(identifiers, 'isbn', v)
            # Publisher - doesn't seem to be standard but found in the wild.
            elif k in ('rft.pub', 'pub'):
                referent['publisher'] = v
            # Identifiers; 'rft_id' is the spelling the bundled sample
            # OpenURLs actually use.
            elif k in ('rft_id', 'rft.id', 'id'):
                if v.startswith('info:doi/'):
                    doi = _strip_prefix(v, 'info:doi/')
                    # A bare 'info:doi/' carries no identifier.
                    if doi:
                        _add_identifier(identifiers, 'doi', "doi:%s" % doi)
                elif v.startswith('info:pmid/'):
                    # NOTE(review): this form is stored with its prefix,
                    # unlike the 'pmid:' form below - confirm whether
                    # callers rely on that before normalising.
                    _add_identifier(identifiers, 'pmid', v)
                # From the wild: id=pmid:21080734&sid=Entrez:PubMed
                elif v.startswith('pmid:'):
                    _add_identifier(identifiers, 'pmid',
                                    _strip_prefix(v, 'pmid:'))
            # Other ids from the wild
            elif k == 'pmid':
                _add_identifier(identifiers, 'pmid', v)
            elif k == 'doi':
                _add_identifier(identifiers, 'doi', "doi:%s" % v)
            # Authors
            elif k in ('rft.au', 'au', 'rft.aulast', 'aulast'):
                # Only the au keys carry a full name.
                if k in ('rft.au', 'au'):
                    author = {'name': v}
                else:
                    author = {}
                last = pull_and_map(['rft.aulast', 'aulast'], cite)
                if last:
                    author['lastname'] = last
                first = pull_and_map(['rft.aufirst', 'aufirst'], cite)
                if first:
                    author['firstname'] = first
                # Put the full name together now if we can.
                if ('name' not in author and 'lastname' in author
                        and 'firstname' in author):
                    author['name'] = "%s %s" % (author['firstname'],
                                                author['lastname'])
                referent['author'].append(author)
            # Volume
            elif k in ('rft.volume', 'volume'):
                referent['volume'] = v
            # Issue
            elif k in ('rft.issue', 'issue'):
                referent['issue'] = v
            # Date/Year - keep only the leading four characters.
            elif k in ('rft.date', 'date'):
                referent['year'] = v[:4]
            # Pages are gross
            elif k in ('rft.pages', 'pages'):
                referent['pages'] = v
            elif k in ('rft.spage', 'spage'):
                referent['start_page'] = v
            elif k in ('rft.epage', 'epage'):
                referent['end_page'] = v
            # Look at the type term
            elif k == 'type':
                if v == 'book':
                    saw_book = True
                elif v == 'article':
                    saw_article = True
                else:
                    referent['sub_type'] = v
            # Referers or sids
            elif k in ('rfr_id', 'sid'):
                referent['bul:rfr'] = v

    # The declared format wins; otherwise any book evidence beats article
    # evidence (a book request may also carry a plain 'title' key).
    if fmt_type:
        referent['type'] = fmt_type
    elif saw_book:
        referent['type'] = 'book'
    elif saw_article:
        referent['type'] = 'article'
    else:
        referent['type'] = 'unknown'

    # Add oclc ids - non standard so handle here
    oclc = pull_oclc(cite)
    if oclc:
        identifiers.append({'type': 'oclc', 'id': oclc})
    return referent


import unittest
class TestBibJSON(unittest.TestCase):
    """
    Checks openurl() against sample OpenURL query strings.
    """

    def test_book_from_worldcat(self):
        # Book request from FirstSearch/WorldCat; the OCLC number is carried
        # in rfe_dat.
        q = 'rft.pub=W+H+Freeman+%26+Co&rft.btitle=Introduction+to+Genetic+Analysis.&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&isbn=9781429233231&req_dat=%3Csessionid%3E0%3C%2Fsessionid%3E&title=Introduction+to+Genetic+Analysis.&pid=%3Caccession+number%3E277200522%3C%2Faccession+number%3E%3Cfssessid%3E0%3C%2Ffssessid%3E&rft.date=2008&genre=book&rft_id=urn%3AISBN%3A9781429233231&openurl=sid&rfe_dat=%3Caccessionnumber%3E277200522%3C%2Faccessionnumber%3E&rft.isbn=9781429233231&url_ver=Z39.88-2004&date=2008&rfr_id=info%3Asid%2Ffirstsearch.oclc.org%3AWorldCat&id=doi%3A&rft.genre=book'
        bib = openurl(q)
        self.assertEqual(bib['type'], 'book')
        self.assertEqual(bib['booktitle'],
                         'Introduction to Genetic Analysis.')
        self.assertEqual(bib['year'], '2008')
        self.assertTrue({'type': 'oclc',
                         'id': '277200522'} in bib['identifier'])

    def test_article(self):
        # Unprefixed keys; the date is given as YYYYMMDD.
        q = 'volume=16&genre=article&spage=538&sid=EBSCO:aph&title=Current+Pharmaceutical+Design&date=20100211&issue=5&issn=13816128&pid=&atitle=Targeting+%ce%b17+Nicotinic+Acetylcholine+Receptors+in+the+Treatment+of+Schizophrenia.'
        bib = openurl(q)
        self.assertEqual(bib['journal']['name'],
                         'Current Pharmaceutical Design')
        self.assertEqual(bib['year'],
                         '2010')
        self.assertTrue({'type': 'issn',
                         'id': '13816128'} in bib['identifier'])

    def test_article_stitle(self):
        # rft.-prefixed keys with an abbreviated journal title.
        q = 'rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/www.isinet.com:WoK:UA&rft.spage=30&rft.issue=1&rft.epage=42&rft.title=INTEGRATIVE%20BIOLOGY&rft.aulast=Castillo&url_ctx_fmt=info:ofi/fmt:kev:mtx:ctx&rft.date=2009&rft.volume=1&url_ver=Z39.88-2004&rft.stitle=INTEGR%20BIOL&rft.atitle=Manipulation%20of%20biological%20samples%20using%20micro%20and%20nano%20techniques&rft.au=Svendsen%2C%20W&rft_id=info:doi/10%2E1039%2Fb814549k&rft.auinit=J&rft.issn=1757-9694&rft.genre=article'
        bib = openurl(q)
        self.assertEqual(bib['title'],
                         'Manipulation of biological samples using micro and nano techniques')
        self.assertEqual(bib['journal']['shortcode'],
                         'INTEGR BIOL')

    def test_article_no_full_author(self):
        # Only aulast/aufirst are supplied, so the full name has to be
        # assembled; aufirst arrives with a leading space.
        q = 'issn=1040676X&aulast=Wallace&title=Chronicle%20of%20Philanthropy&pid=<metalib_doc_number>000117190</metalib_doc_number><metalib_base_url>http://sfx.brown.edu:8331</metalib_base_url><opid></opid>&sid=metalib:EBSCO_APH&__service_type=&volume=17&genre=&sici=&epage=23&atitle=Where%20Should%20the%20Money%20Go%3F&date=2005&isbn=&spage=9&issue=24&id=doi:&auinit=&aufirst=%20Nicole'
        bib = openurl(q)
        self.assertEqual(bib['author'],
                         [{'lastname': 'Wallace',
                           'firstname': 'Nicole',
                           'name': 'Nicole Wallace'}])
        self.assertEqual(bib['journal']['name'],
                         'Chronicle of Philanthropy')
        self.assertTrue({'type': 'issn',
                         'id': '1040676X'} in bib['identifier'])


# Run the bundled unit tests when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
	"""
	Attempts to parse raw OpenURLs to the BibJSON convention.
	"""

	import urllib
	import urlparse
	import sys
	import json
	from pprint import pprint


	def pull_oclc(odict):
	    """
	    Pull the OCLC number from an incoming FirstSearch/WorldCat OpenURL.

	    Args:
	        odict: mapping of OpenURL keys to lists of values (the shape
	            produced by parse_qs).

	    Returns:
	        The first run of digits in the first 'rfe_dat' value, as a string,
	        when the first 'rfr_id' value contains 'firstsearch'; otherwise
	        None.
	    """
	    import re

	    referrer = (odict.get('rfr_id') or [''])[0]
	    if 'firstsearch' not in referrer:
	        return None
	    accession = (odict.get('rfe_dat') or [''])[0]
	    # Only a run of digits counts: a missing or digit-free rfe_dat yields
	    # None rather than placeholder text that would look like an identifier.
	    match = re.search(r'\d+', accession)
	    if match:
	        return match.group()
	    return None

	def initialize_id():
	    """
	    Build an empty identifier record in the shape BibJSON expects.

	    Returns a new dict on every call, with 'id' and 'type' both None.
	    """
	    return {'id': None, 'type': None}

	def pull_and_map(key_list, cite):
	    """
	    Return the first available value for any of the given keys.

	    Keys are tried in order.  For the first key whose entry in ``cite`` is
	    non-empty, the first element of that entry is returned with surrounding
	    whitespace stripped.  Returns None when no key yields a value.
	    """
	    for name in key_list:
	        values = cite.get(name)
	        if values:
	            return values[0].strip()
	    return None


	def _strip_prefix(value, prefix):
	    """
	    Return value with a leading prefix removed, or unchanged if absent.
	    """
	    # Slice rather than str.lstrip(): lstrip() removes any leading characters
	    # drawn from its argument, so it can also eat the start of the identifier.
	    if value.startswith(prefix):
	        return value[len(prefix):]
	    return value


	def _add_identifier(identifiers, id_type, id_value):
	    """
	    Append a {'type', 'id'} record to identifiers unless it is empty or
	    already present.
	    """
	    if not (id_type and id_value):
	        return
	    record = initialize_id()
	    record['type'] = id_type
	    record['id'] = id_value
	    if record not in identifiers:
	        identifiers.append(record)


	def openurl(query):
	    """
	    Map a raw OpenURL query string onto the BibJSON convention.

	    Loosely based on this PHP OpenURL parser from Rod Page:
	    http://code.google.com/p/bioguid/source/search?q=openurl&origq=openurl&btnG=Search+Trunk

	    Both the ``rft.``-prefixed keys and their unprefixed forms are read.

	    Args:
	        query: URL-encoded query string, without the leading '?'.

	    Returns:
	        A dict that always contains:
	          'author'     - list of dicts with 'name', 'lastname' and/or
	                         'firstname'
	          'identifier' - list of unique {'type': ..., 'id': ...} dicts
	          'type'       - 'article', 'book' or 'unknown'
	        and, when present in the query: 'title', 'booktitle', 'journal',
	        'publisher', 'volume', 'issue', 'year', 'pages', 'start_page',
	        'end_page', 'sub_type' and 'bul:rfr'.
	    """
	    # Load query into a dictionary of key -> list of values.
	    cite = urlparse.parse_qs(query)
	    # Some fields will have multiple values.
	    referent = {'author': [], 'identifier': []}
	    identifiers = referent['identifier']

	    # Evidence about the citation type is collected here and resolved after
	    # the loop, so the answer does not depend on the order the keys are
	    # visited in.
	    fmt_type = None
	    saw_book = False
	    saw_article = False

	    for k, value_list in cite.items():
	        for v in value_list:
	            # Declared metadata format.
	            if k == 'rft_val_fmt':
	                if v == 'info:ofi/fmt:kev:mtx:journal':
	                    fmt_type = 'article'
	                elif v == 'info:ofi/fmt:kev:mtx:book':
	                    fmt_type = 'book'
	            # Article title
	            elif k in ('rft.atitle', 'atitle'):
	                referent['title'] = v
	                saw_article = True
	            # Book title
	            elif k in ('rft.btitle', 'btitle'):
	                referent['booktitle'] = v
	                saw_book = True
	            # Journal title
	            elif k in ('rft.jtitle', 'rft.title', 'title'):
	                journal = {'name': v}
	                # Try to pull the short title code.
	                stitle = cite.get('rft.stitle') or cite.get('stitle')
	                if stitle:
	                    journal['shortcode'] = stitle[0]
	                referent['journal'] = journal
	                saw_article = True
	            # ISSN
	            elif k in ('rft.issn', 'issn'):
	                _add_identifier(identifiers, 'issn', v)
	            # ISBN
	            elif k in ('rft.isbn', 'isbn'):
	                _add_identifier(identifiers, 'isbn', v)
	            # Publisher - doesn't seem to be standard but found in the wild.
	            elif k in ('rft.pub', 'pub'):
	                referent['publisher'] = v
	            # Identifiers; 'rft_id' is the spelling the bundled sample
	            # OpenURLs actually use.
	            elif k in ('rft_id', 'rft.id', 'id'):
	                if v.startswith('info:doi/'):
	                    doi = _strip_prefix(v, 'info:doi/')
	                    # A bare 'info:doi/' carries no identifier.
	                    if doi:
	                        _add_identifier(identifiers, 'doi', "doi:%s" % doi)
	                elif v.startswith('info:pmid/'):
	                    # NOTE(review): this form is stored with its prefix,
	                    # unlike the 'pmid:' form below - confirm whether
	                    # callers rely on that before normalising.
	                    _add_identifier(identifiers, 'pmid', v)
	                # From the wild: id=pmid:21080734&sid=Entrez:PubMed
	                elif v.startswith('pmid:'):
	                    _add_identifier(identifiers, 'pmid',
	                                    _strip_prefix(v, 'pmid:'))
	            # Other ids from the wild
	            elif k == 'pmid':
	                _add_identifier(identifiers, 'pmid', v)
	            elif k == 'doi':
	                _add_identifier(identifiers, 'doi', "doi:%s" % v)
	            # Authors
	            elif k in ('rft.au', 'au', 'rft.aulast', 'aulast'):
	                # Only the au keys carry a full name.
	                if k in ('rft.au', 'au'):
	                    author = {'name': v}
	                else:
	                    author = {}
	                last = pull_and_map(['rft.aulast', 'aulast'], cite)
	                if last:
	                    author['lastname'] = last
	                first = pull_and_map(['rft.aufirst', 'aufirst'], cite)
	                if first:
	                    author['firstname'] = first
	                # Put the full name together now if we can.
	                if ('name' not in author and 'lastname' in author
	                        and 'firstname' in author):
	                    author['name'] = "%s %s" % (author['firstname'],
	                                                author['lastname'])
	                referent['author'].append(author)
	            # Volume
	            elif k in ('rft.volume', 'volume'):
	                referent['volume'] = v
	            # Issue
	            elif k in ('rft.issue', 'issue'):
	                referent['issue'] = v
	            # Date/Year - keep only the leading four characters.
	            elif k in ('rft.date', 'date'):
	                referent['year'] = v[:4]
	            # Pages are gross
	            elif k in ('rft.pages', 'pages'):
	                referent['pages'] = v
	            elif k in ('rft.spage', 'spage'):
	                referent['start_page'] = v
	            elif k in ('rft.epage', 'epage'):
	                referent['end_page'] = v
	            # Look at the type term
	            elif k == 'type':
	                if v == 'book':
	                    saw_book = True
	                elif v == 'article':
	                    saw_article = True
	                else:
	                    referent['sub_type'] = v
	            # Referers or sids
	            elif k in ('rfr_id', 'sid'):
	                referent['bul:rfr'] = v

	    # The declared format wins; otherwise any book evidence beats article
	    # evidence (a book request may also carry a plain 'title' key).
	    if fmt_type:
	        referent['type'] = fmt_type
	    elif saw_book:
	        referent['type'] = 'book'
	    elif saw_article:
	        referent['type'] = 'article'
	    else:
	        referent['type'] = 'unknown'

	    # Add oclc ids - non standard so handle here
	    oclc = pull_oclc(cite)
	    if oclc:
	        identifiers.append({'type': 'oclc', 'id': oclc})
	    return referent


	import unittest
	class TestBibJSON(unittest.TestCase):
	    """
	    Checks openurl() against sample OpenURL query strings.
	    """

	    def test_book_from_worldcat(self):
	        # Book request from FirstSearch/WorldCat; the OCLC number is carried
	        # in rfe_dat.
	        q = 'rft.pub=W+H+Freeman+%26+Co&rft.btitle=Introduction+to+Genetic+Analysis.&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&isbn=9781429233231&req_dat=%3Csessionid%3E0%3C%2Fsessionid%3E&title=Introduction+to+Genetic+Analysis.&pid=%3Caccession+number%3E277200522%3C%2Faccession+number%3E%3Cfssessid%3E0%3C%2Ffssessid%3E&rft.date=2008&genre=book&rft_id=urn%3AISBN%3A9781429233231&openurl=sid&rfe_dat=%3Caccessionnumber%3E277200522%3C%2Faccessionnumber%3E&rft.isbn=9781429233231&url_ver=Z39.88-2004&date=2008&rfr_id=info%3Asid%2Ffirstsearch.oclc.org%3AWorldCat&id=doi%3A&rft.genre=book'
	        bib = openurl(q)
	        self.assertEqual(bib['type'], 'book')
	        self.assertEqual(bib['booktitle'],
	                         'Introduction to Genetic Analysis.')
	        self.assertEqual(bib['year'], '2008')
	        self.assertTrue({'type': 'oclc',
	                         'id': '277200522'} in bib['identifier'])

	    def test_article(self):
	        # Unprefixed keys; the date is given as YYYYMMDD.
	        q = 'volume=16&genre=article&spage=538&sid=EBSCO:aph&title=Current+Pharmaceutical+Design&date=20100211&issue=5&issn=13816128&pid=&atitle=Targeting+%ce%b17+Nicotinic+Acetylcholine+Receptors+in+the+Treatment+of+Schizophrenia.'
	        bib = openurl(q)
	        self.assertEqual(bib['journal']['name'],
	                         'Current Pharmaceutical Design')
	        self.assertEqual(bib['year'],
	                         '2010')
	        self.assertTrue({'type': 'issn',
	                         'id': '13816128'} in bib['identifier'])

	    def test_article_stitle(self):
	        # rft.-prefixed keys with an abbreviated journal title.
	        q = 'rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/www.isinet.com:WoK:UA&rft.spage=30&rft.issue=1&rft.epage=42&rft.title=INTEGRATIVE%20BIOLOGY&rft.aulast=Castillo&url_ctx_fmt=info:ofi/fmt:kev:mtx:ctx&rft.date=2009&rft.volume=1&url_ver=Z39.88-2004&rft.stitle=INTEGR%20BIOL&rft.atitle=Manipulation%20of%20biological%20samples%20using%20micro%20and%20nano%20techniques&rft.au=Svendsen%2C%20W&rft_id=info:doi/10%2E1039%2Fb814549k&rft.auinit=J&rft.issn=1757-9694&rft.genre=article'
	        bib = openurl(q)
	        self.assertEqual(bib['title'],
	                         'Manipulation of biological samples using micro and nano techniques')
	        self.assertEqual(bib['journal']['shortcode'],
	                         'INTEGR BIOL')

	    def test_article_no_full_author(self):
	        # Only aulast/aufirst are supplied, so the full name has to be
	        # assembled; aufirst arrives with a leading space.
	        q = 'issn=1040676X&aulast=Wallace&title=Chronicle%20of%20Philanthropy&pid=<metalib_doc_number>000117190</metalib_doc_number><metalib_base_url>http://sfx.brown.edu:8331</metalib_base_url><opid></opid>&sid=metalib:EBSCO_APH&__service_type=&volume=17&genre=&sici=&epage=23&atitle=Where%20Should%20the%20Money%20Go%3F&date=2005&isbn=&spage=9&issue=24&id=doi:&auinit=&aufirst=%20Nicole'
	        bib = openurl(q)
	        self.assertEqual(bib['author'],
	                         [{'lastname': 'Wallace',
	                           'firstname': 'Nicole',
	                           'name': 'Nicole Wallace'}])
	        self.assertEqual(bib['journal']['name'],
	                         'Chronicle of Philanthropy')
	        self.assertTrue({'type': 'issn',
	                         'id': '1040676X'} in bib['identifier'])




	# Run the bundled unit tests when this file is executed as a script.
	if __name__ == '__main__':
	    unittest.main()