Last active
May 19, 2024 08:45
-
-
Save bonzanini/5a4c39e4c02502a8451d to your computer and use it in GitHub Desktop.
Searching PubMed with Biopython
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code uses Biopython to retrieve lists of articles from PubMed.
# You need to install Biopython first.
# If you use Anaconda:
# conda install biopython
# If you use pip/venv:
# pip install biopython
# Full discussion:
# https://marcobonzanini.wordpress.com/2015/01/12/searching-pubmed-with-python/
from Bio import Entrez | |
def search(query, retmax=20):
    """Search PubMed for *query* and return the parsed ESearch response.

    Args:
        query: Search term, sent to Entrez as ``term``.
        retmax: Maximum number of ids to request. Defaults to 20, the
            value that was previously hard-coded.

    Returns:
        The structure produced by ``Entrez.read`` for the XML response.
        The script in this file reads the matching ids from its
        ``'IdList'`` entry.
    """
    # Placeholder address; replace it with your own before running.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(retmax),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()
def fetch_details(id_list):
    """Fetch the PubMed records for the ids in *id_list*.

    Args:
        id_list: Iterable of PubMed id strings.

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch XML
        response. When *id_list* is empty no request is made and
        ``{'PubmedArticle': []}`` is returned, so code that iterates over
        the ``'PubmedArticle'`` entry simply sees no papers.
    """
    ids = ','.join(id_list)
    if not ids:
        # Nothing to look up: skip the request instead of sending an empty id.
        return {'PubmedArticle': []}
    # Placeholder address; replace it with your own before running.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()
if __name__ == '__main__':
    # Demo: search for 'fever' and print a numbered list of article titles.
    results = search('fever')
    id_list = results['IdList']
    papers = fetch_details(id_list)
    for number, paper in enumerate(papers['PubmedArticle'], start=1):
        title = paper['MedlineCitation']['Article']['ArticleTitle']
        print("{}) {}".format(number, title))
Thanks this is very helpful. Somewhat similar to @echorule, I'm having trouble obtaining the full abstract of a paper as one string. What I get looks like this:
StringElement(' ... ', attributes={'Label': 'BACKGROUND', 'NlmCategory': 'BACKGROUND'}), StringElement(' ... ', attributes={'Label': 'METHODS', 'NlmCategory': 'METHODS'}), StringElement('...' ...)
Is there a more reasonable way to go about this?
@rsgoncalves did you manage to resolve this? You can fetch the abstract using: paper['MedlineCitation']['Article']['Abstract']['AbstractText']
So your full code for retrieving the authors, title and abstract could look something like:
from Bio import Entrez
def search(query, retmax=20):
    """Search PubMed for *query* and return the parsed ESearch response.

    Args:
        query: Search term, sent to Entrez as ``term``.
        retmax: Maximum number of ids to request. Defaults to 20, the
            value that was previously hard-coded.

    Returns:
        The structure produced by ``Entrez.read`` for the XML response.
        The script in this file reads the matching ids from its
        ``'IdList'`` entry.
    """
    # Placeholder address; replace it with your own before running.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(retmax),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()
def fetch_details(id_list):
    """Fetch the PubMed records for the ids in *id_list*.

    Args:
        id_list: Iterable of PubMed id strings.

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch XML
        response. When *id_list* is empty no request is made and
        ``{'PubmedArticle': []}`` is returned, so code that iterates over
        the ``'PubmedArticle'`` entry simply sees no papers.
    """
    ids = ','.join(id_list)
    if not ids:
        # Nothing to look up: skip the request instead of sending an empty id.
        return {'PubmedArticle': []}
    # Placeholder address; replace it with your own before running.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()
def get_abstract(paper):
    """Return the abstract of *paper* as a single string.

    Returns ``''`` when the article has no ``'Abstract'`` entry. When the
    abstract text is a list of sections, the sections are joined with a
    single space; any other value is returned unchanged.
    """
    article = paper['MedlineCitation']['Article']
    if 'Abstract' not in article:
        return ''
    text = article['Abstract']['AbstractText']
    # A list means one element per section; collapse it into one string.
    return ' '.join(text) if isinstance(text, list) else text
if __name__ == '__main__':
    # Demo: search for 'fever' and print title, authors and abstract
    # for each returned article.
    results = search('fever')
    id_list = results['IdList']
    papers = fetch_details(id_list)
    for i, paper in enumerate(papers['PubmedArticle'], start=1):
        article = paper['MedlineCitation']['Article']
        title = article['ArticleTitle']
        # Not every record carries an AuthorList; fall back to no authors
        # instead of raising KeyError, mirroring how get_abstract treats a
        # missing Abstract.
        author_list = article.get('AuthorList', [])
        authors = ', '.join(author.get('LastName', '') for author in author_list)
        abstract = get_abstract(paper)
        print("{}) Title: {}".format(i, title))
        print(" Authors: {}".format(authors))
        print(" Abstract: {}".format(abstract))
        print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks this is very helpful. Somewhat similar to @echorule, I'm having trouble obtaining the full abstract of a paper as one string. What I get looks like this:
StringElement(' ... ', attributes={'Label': 'BACKGROUND', 'NlmCategory': 'BACKGROUND'}), StringElement(' ... ', attributes={'Label': 'METHODS', 'NlmCategory': 'METHODS'}), StringElement('...' ...)
Is there a more reasonable way to go about this?