Skip to content

Instantly share code, notes, and snippets.

@eseiver
Last active August 28, 2017 22:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eseiver/4748b11547b638ef550a6d9eb28faebe to your computer and use it in GitHub Desktop.
Save eseiver/4748b11547b638ef550a6d9eb28faebe to your computer and use it in GitHub Desktop.
Get all PLOS PMCIDs from PMC using Entrez
import requests
import time
def get_all_pmc_dois(retstart=0, retmax=80000, count=None):
    """Return the sorted, de-duplicated PMCIDs that PMC lists for the PLOS journals.

    Despite the function name, the values returned are PMCIDs (the string
    'PMC' followed by the numeric UID from ESearch), not DOIs.

    The Entrez ESearch endpoint is queried with every PLOS journal named
    individually in the search term, and the result set is paged through
    ``retmax`` records at a time.
    See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch for more
    info on search parameters.

    :param retstart: index of the first record to request
    :param retmax: number of UIDs requested per call; must be positive
    :param count: total number of records to page through; when None it is
        read from the ``count`` field of an initial ESearch response
    :return: the full list of PMCIDs in PMC for PLOS articles
    :raises ValueError: if ``retmax`` is not positive
    """
    # A non-positive page size would never advance retstart, so the paging
    # loop below would hit the server forever. Fail before any network I/O.
    if retmax <= 0:
        raise ValueError("retmax must be a positive integer, got {!r}".format(retmax))

    pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term='
                             '(((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])'
                             'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])'
                             'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal]+OR+"plos+currents"[Journal]'
                             '&retmode=json&tool=corpustest&email=name@domain.com')

    def fetch_json(url):
        # A timeout keeps a stalled connection from hanging the harvest, and
        # raise_for_status reports HTTP errors instead of a confusing JSON failure.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return response.json()

    # Only ask the server for the total when the caller did not supply one.
    if count is None:
        count = int(fetch_json(pmc_allplos_query_url)['esearchresult']['count'])
        print(count, "articles found in PMC")

    pmcids = set()
    while retstart < count:
        query = pmc_allplos_query_url + '&retstart={}&retmax={}'.format(retstart, retmax)
        idlist = fetch_json(query)['esearchresult']['idlist']
        if not idlist:
            # The server has nothing at or beyond this offset; further pages
            # would be empty as well, so stop requesting them.
            break
        pmcids.update('PMC' + uid for uid in idlist)
        retstart += retmax
        # Pause between consecutive requests to avoid hammering the service.
        time.sleep(1)

    pmcidlist = sorted(pmcids)
    print(len(pmcidlist), "articles retrieved")
    return pmcidlist
if __name__ == '__main__':
    # Script entry point: run the harvest with the default paging arguments
    # and keep the result bound at module level.
    pmcidlist = get_all_pmc_dois()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment