#!/usr/bin/env python
# Source: GitHub gist kaplun/96e5b143f70b04ecc4b6776027a9059e
# (last active September 2, 2016 20:58).
# NOTE: the gist page warned that this file contains bidirectional Unicode
# text that may be interpreted or compiled differently than what appears
# below; review it in an editor that reveals hidden Unicode characters.
import csv
import shelve
import time
from contextlib import closing
from urllib import quote

import click
import requests

from invenio.bibrecord import record_get_field_value, record_get_field_values
from invenio.dbquery import run_sql
from invenio.search_engine import perform_request_search, get_record
# Journals to process.  Each key is the journal title (it is sent to Crossref
# as the ``container-title`` filter and used to derive the output file names);
# each value is the INSPIRE search pattern selecting that journal's records.
JOURNAL_QUERY = {
    'Physical Review D': '773__p:"Phys.Rev." 773__v:"D*"',
    'Physics Letters B': '773__p:"Phys.Lett." 773__v:"B*"',
    'Nuclear Physics B': '773__p:"Nucl.Phys." 773__v:"B*"',
    'The European Physical Journal C': '773__p:"Eur.Phys.J." 773__v:"C*"',
    'Journal of High Energy Physics': '773__p:"JHEP"',
}
def _fetch_crossref_page(journal, quoted_cursor, retries=1, delay=3):
    """Return the ``message`` payload of one Crossref works page for *journal*.

    *quoted_cursor* is inserted into the URL verbatim, so it must already be
    URL-quoted (``'*'`` requests the first page).  A failed request is retried
    *retries* times, sleeping *delay* seconds before each retry; the last
    failure is re-raised.
    """
    url = ("http://api.crossref.org/works?filter=container-title:{0}"
           "&cursor={1}&rows=1000").format(quote(journal), quoted_cursor)
    for attempt in range(retries + 1):
        try:
            return requests.get(url).json()['message']
        except (requests.exceptions.RequestException, ValueError, KeyError):
            # Network error, undecodable JSON or a payload without 'message'.
            if attempt == retries:
                raise
            time.sleep(delay)


def get_doi_date(journal):
    """Build (or reuse) the DOI -> issue date mapping of *journal* from Crossref.

    The mapping is persisted in a shelve file named after the journal
    (lower-cased, spaces replaced by underscores, ``.bin`` suffix).  If the
    shelf already holds as many entries as Crossref reports for the journal,
    nothing is downloaded again.

    Keys are lower-cased DOIs; values are ``'YYYY-MM-DD'`` strings built from
    the item's ``issued`` date parts, with a missing month or day set to 1.

    Returns the open shelf; the caller is responsible for closing it.
    Raises whatever the last failed Crossref request raised.
    """
    click.echo("Querying Crossref for {0}...".format(journal))
    filename = journal.replace(' ', '_').lower() + '.bin'
    ret = shelve.open(filename)
    try:
        result = _fetch_crossref_page(journal, '*')
        total = result['total-results']
        if len(ret) == total:
            click.echo("Already successfully downloaded")
            return ret
        with click.progressbar(length=total) as bar:
            # Follow the cursor until Crossref hands back an empty page.
            while result['items']:
                for item in result['items']:
                    # Pad with [1, 1] so year-only or year-month dates still
                    # provide the three fields the format string needs.
                    # NOTE(review): if Crossref reports no issue date at all
                    # the year would be formatted as 'None' - confirm whether
                    # such items should be skipped.
                    date_parts = item['issued']['date-parts'][0] + [1, 1]
                    ret[str(item['DOI'].lower())] = \
                        '{0}-{1:02d}-{2:02d}'.format(*date_parts)
                bar.update(len(result['items']))
                result = _fetch_crossref_page(
                    journal, quote(result['next-cursor']))
        click.echo("DONE.")
    finally:
        # Flush what has been collected so far, so that the message below
        # holds even when the download was interrupted by an error.
        ret.sync()
        click.echo("Crossref info saved into {0}".format(filename))
    return ret
def get_info_from_inspire(journal):
    """Write the arXiv/DOI date comparison CSV for *journal*.

    Searches INSPIRE for the journal's records that have both an arXiv
    identifier and a DOI, and writes one CSV row per record:
    ``(arXiv eprint, INSPIRE earliest date, DOI, Crossref date)``.
    When a record has several DOIs known to Crossref, the one with the
    earliest Crossref date is used.  Records with no DOI known to Crossref
    are reported on stderr and skipped; records without an arXiv eprint in
    field 037 are reported on stderr and written with an empty eprint.

    The CSV is only opened once the INSPIRE and Crossref lookups succeeded,
    so a failed lookup does not truncate an existing file.

    Returns the name of the CSV file (journal title lower-cased, spaces
    replaced by underscores, ``.csv`` suffix).
    """
    filename = journal.replace(' ', '_').lower() + '.csv'
    recids = perform_request_search(
        p=JOURNAL_QUERY[journal] + ' 035:arXiv 0247_2:DOI', wl=0)
    click.echo("{0} records found in INSPIRE for {1}".format(len(recids), journal))
    # get_doi_date() returns an open shelf; make sure it is closed afterwards.
    with closing(get_doi_date(journal)) as doi_date_mapping:
        click.echo("{0} records found in Crossref for {1}".format(
            len(doi_date_mapping), journal))
        with open(filename, 'w') as out, click.progressbar(recids) as bar:
            csv_writer = csv.writer(out)
            for recid in bar:
                record = get_record(recid)
                earliest_date = run_sql(
                    "SELECT earliest_date FROM bibrec WHERE id=%s",
                    (recid, ))[0][0].strftime('%Y-%m-%d')
                arxiv_eprints = record_get_field_values(
                    record, '037', code='a',
                    filter_subfield_code='9', filter_subfield_value='arXiv')
                # Guard the lookup: indexing an empty list would raise
                # before the warning below could ever be emitted.
                arxiv_eprint = arxiv_eprints[0] if arxiv_eprints else ''
                if not arxiv_eprint:
                    click.echo("WARNING: record {0} does not have a valid arXiv eprint".format(recid), err=True)
                dois = record_get_field_values(
                    record, '024', '7', code='a',
                    filter_subfield_code='2', filter_subfield_value='DOI')
                doi_dates = [
                    (doi_date_mapping[doi.lower()], doi.lower())
                    for doi in dois if doi.lower() in doi_date_mapping
                ]
                if not doi_dates:
                    click.echo("Can't find valid DOI for {0}".format(recid), err=True)
                    continue
                # Tuples compare by date first, so min() picks the earliest.
                doi_date, doi = min(doi_dates)
                csv_writer.writerow((arxiv_eprint, earliest_date, doi, doi_date))
    return filename
def main():
    """Produce the CSV report for every journal listed in JOURNAL_QUERY."""
    start_msg = "Processing {0}..."
    done_msg = "... {0} DONE."
    for title in JOURNAL_QUERY:
        click.echo(start_msg.format(title))
        csv_path = get_info_from_inspire(title)
        click.echo(done_msg.format(csv_path))
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.")