Last active
November 14, 2017 19:23
-
-
Save eseiver/ebf951514e68aba2c1e4bb70e1dc4bf3 to your computer and use it in GitHub Desktop.
How to get every PLOS article DOI using PLOS's search API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Fetch a list of every PLOS article DOI from the PLOS search API.

The request uses a regular expression so that only DOIs of full articles
(and annotations) are returned.
For more information on text and data-mining PLOS content, see
http://api.plos.org/text-and-data-mining/
"""
import requests | |
def get_all_solr_dois(timeout=(10, 300)):
    """
    Get every article published by PLOS, up to 500,000, as indexed by Solr on api.plos.org.

    The request asks the ``/terms`` endpoint for the ``id`` field
    (``terms.fl=id``, ``terms.limit=500000``, ``wt=json``) and passes a
    URL-encoded ``terms.regex`` to exclude sub-DOIs and image DOIs.
    Decoded, the regex is::

        10\\.1371\\/(journal\\.p[a-zA-Z]{3}\\.[\\d]{7}$
                   |annotation\\/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$)

    :param timeout: value forwarded to ``requests.get(..., timeout=...)``;
        a ``(connect, read)`` tuple in seconds, or ``None`` to wait
        indefinitely (the previous behaviour)
    :return: list of DOIs for all PLOS articles
    :raises requests.exceptions.RequestException: on connection failure,
        timeout, or a non-success HTTP status
    """
    solr_magic_url = ('http://api.plos.org/terms?terms.fl=id&terms.limit=500000&wt=json&indent=true&terms.regex='
                      '10%5C.1371%5C/(journal%5C.p%5Ba-zA-Z%5D%7B3%7D%5C.%5B%5Cd%5D%7B7%7D$%7Cannotation%5C/'
                      '%5Ba-zA-Z0-9%5D%7B8%7D-%5Ba-zA-Z0-9%5D%7B4%7D-%5Ba-zA-Z0-9%5D%7B4%7D-%5Ba-zA-Z0-9%5D'
                      '%7B4%7D-%5Ba-zA-Z0-9%5D%7B12%7D$)')
    # Without a timeout, a stalled connection would block the caller forever.
    response = requests.get(solr_magic_url, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later on.
    response.raise_for_status()
    results = response.json()
    # Keep only the string entries of the 'id' list; presumably Solr
    # interleaves each term with its integer count -- confirm against the API.
    solr_dois = [term for term in results['terms']['id'] if isinstance(term, str)]
    return solr_dois
if __name__ == "__main__":
    # When run as a script, fetch the full DOI list and bind it to
    # ``plos_dois``; nothing is printed or written to disk here.
    plos_dois = get_all_solr_dois()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment