Skip to content

Instantly share code, notes, and snippets.

@kaplun
Last active September 2, 2016 20:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaplun/96e5b143f70b04ecc4b6776027a9059e to your computer and use it in GitHub Desktop.
Save kaplun/96e5b143f70b04ecc4b6776027a9059e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import requests
import csv
import time
import click
import shelve
from urllib import quote
from invenio.dbquery import run_sql
from invenio.bibrecord import record_get_field_value, record_get_field_values
from invenio.search_engine import perform_request_search, get_record
# Journals to process. Each key is the journal title, used both for the
# Crossref ``container-title`` filter and for naming the output files; each
# value is the INSPIRE search pattern (on the 773__p / 773__v fields) that
# selects the journal's records.
JOURNAL_QUERY = {
    'Physical Review D': '773__p:"Phys.Rev." 773__v:"D*"',
    'Physics Letters B': '773__p:"Phys.Lett." 773__v:"B*"',
    'Nuclear Physics B': '773__p:"Nucl.Phys." 773__v:"B*"',
    'The European Physical Journal C': '773__p:"Eur.Phys.J." 773__v:"C*"',
    'Journal of High Energy Physics': '773__p:"JHEP"',
}
# Crossref "works" endpoint, 1000 rows per page. {0} is the URL-quoted
# journal title, {1} the cursor value as it must appear in the URL.
_CROSSREF_WORKS_URL = ("http://api.crossref.org/works"
                       "?filter=container-title:{0}&cursor={1}&rows=1000")


def _fetch_crossref_page(journal, cursor=None):
    """Return the ``message`` part of one Crossref result page.

    :param journal: journal title used for the ``container-title`` filter.
    :param cursor: ``next-cursor`` value of the previous page, or ``None``
        to request the first page (sent as the literal ``*``).
    :raises Exception: whatever the second attempt raises, if it fails too.
    """
    cursor_param = '*' if cursor is None else quote(cursor)
    url = _CROSSREF_WORKS_URL.format(quote(journal), cursor_param)
    try:
        return requests.get(url).json()['message']
    except Exception as err:
        # Deliberately broad: any failure of the request, of JSON decoding
        # or a missing 'message' key is retried exactly once after a pause.
        click.echo("Crossref request failed ({0!r}), retrying in 3s...".format(err), err=True)
        time.sleep(3)
        return requests.get(url).json()['message']


def get_doi_date(journal):
    """Return a persistent mapping of lowercase DOI -> issue date for a journal.

    The mapping is a ``shelve`` shelf stored in ``<journal>.bin`` (spaces
    replaced by underscores, lowercased). If the shelf already holds as many
    entries as Crossref reports for the journal, it is returned as is;
    otherwise every result page is fetched and stored in the shelf.

    Dates are formatted ``YYYY-MM-DD``; when Crossref provides only a year or
    a year and month, the missing month/day default to ``01``.

    :param journal: journal title, as used in Crossref's ``container-title``.
    :returns: the open shelf; the caller is responsible for closing it.
    """
    click.echo("Querying Crossref for {0}...".format(journal))
    filename = journal.replace(' ', '_').lower() + '.bin'
    ret = shelve.open(filename)
    try:
        result = _fetch_crossref_page(journal)
        total = result['total-results']
        if len(ret) == total:
            click.echo("Already successfully downloaded")
            return ret
        with click.progressbar(length=total) as bar:
            while result['items']:
                for item in result['items']:
                    # Pad partial dates so the format always gets 3 parts.
                    date_parts = item['issued']['date-parts'][0] + [1, 1]
                    ret[str(item['DOI'].lower())] = '{0}-{1:02d}-{2:02d}'.format(*date_parts)
                bar.update(len(result['items']))
                result = _fetch_crossref_page(journal, result['next-cursor'])
        click.echo("DONE.")
    finally:
        # Flush what was collected so far before claiming it is saved; the
        # shelf stays open because it is handed back to the caller.
        ret.sync()
        click.echo("Crossref info saved into {0}".format(filename))
    return ret
def get_info_from_inspire(journal):
    """Write a CSV comparing INSPIRE and Crossref dates for a journal.

    Searches INSPIRE for the journal's records matching ``035:arXiv`` and
    ``0247_2:DOI``, and for each record writes one row
    ``(arXiv eprint, earliest_date, DOI, Crossref date)`` where:

    * ``earliest_date`` comes from the ``bibrec`` table,
    * the DOI is the record's DOI with the smallest Crossref date among those
      known to the mapping returned by ``get_doi_date``.

    Records with no DOI known to Crossref are reported on stderr and skipped.
    Records without an arXiv eprint in field 037 are reported on stderr and
    written with an empty eprint column.

    :param journal: a key of ``JOURNAL_QUERY``.
    :returns: name of the CSV file written (``<journal>.csv``, spaces
        replaced by underscores, lowercased).
    """
    filename = journal.replace(' ', '_').lower() + '.csv'
    recids = perform_request_search(p=JOURNAL_QUERY[journal] + ' 035:arXiv 0247_2:DOI', wl=0)
    click.echo("{0} records found in INSPIRE for {1}".format(len(recids), journal))
    doi_date_mapping = get_doi_date(journal)
    try:
        click.echo("{0} records found in Crossref for {1}".format(len(doi_date_mapping), journal))
        # The context manager guarantees the CSV is flushed and closed even
        # if processing a record fails half-way.
        with open(filename, 'w') as out:
            csv_writer = csv.writer(out)
            with click.progressbar(recids) as bar:
                for recid in bar:
                    record = get_record(recid)
                    earliest_date = run_sql("SELECT earliest_date FROM bibrec WHERE id=%s", (recid, ))[0][0].strftime('%Y-%m-%d')
                    arxiv_eprints = record_get_field_values(record, '037', code='a', filter_subfield_code='9', filter_subfield_value='arXiv')
                    # Indexing [0] directly would raise IndexError on an
                    # empty result and make the warning below unreachable.
                    arxiv_eprint = arxiv_eprints[0] if arxiv_eprints else ''
                    if not arxiv_eprint:
                        click.echo("WARNING: record {0} does not have a valid arXiv eprint".format(recid), err=True)
                    dois = record_get_field_values(record, '024', '7', code='a', filter_subfield_code='2', filter_subfield_value='DOI')
                    lowered_dois = [doi.lower() for doi in dois]
                    doi_dates = [(doi_date_mapping[doi], doi) for doi in lowered_dois if doi in doi_date_mapping]
                    if not doi_dates:
                        click.echo("Can't find valid DOI for {0}".format(recid), err=True)
                        continue
                    # Tuples compare by date first, so min() picks the DOI
                    # with the earliest Crossref date.
                    doi_date, doi = min(doi_dates)
                    csv_writer.writerow((arxiv_eprint, earliest_date, doi, doi_date))
    finally:
        # The mapping is a shelf opened on our behalf; release it.
        doi_date_mapping.close()
    return filename
def main():
    """Generate the CSV report for every journal listed in JOURNAL_QUERY."""
    for title in JOURNAL_QUERY:
        click.echo("Processing {0}...".format(title))
        csv_name = get_info_from_inspire(title)
        click.echo("... {0} DONE.".format(csv_name))
# Run the harvest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment