Skip to content

Instantly share code, notes, and snippets.

@nutjob4life
Created June 14, 2023 16:47
Show Gist options
  • Save nutjob4life/1aa843b6a6540bbac51b4f53e6b74c1c to your computer and use it in GitHub Desktop.
Save nutjob4life/1aa843b6a6540bbac51b4f53e6b74c1c to your computer and use it in GitHub Desktop.
Extract abstracts for EDRN's publications
# encoding: utf-8
'''Extract abstracts used by EDRN publications.
To use::
python3.10 -m venv .venv
.venv/bin/pip install --quiet --upgrade setuptools pip wheel build
.venv/bin/pip install biopython~=1.81 rdflib~=6.3.2
.venv/bin/python extract.py > all-abstracts.txt
'''
from collections.abc import Iterator

from Bio import Entrez
import logging, rdflib, time, contextlib
_logger = logging.getLogger(__name__)  # Module-level logger, named after this module

# Entrez settings; note these assignments take effect as soon as this module is loaded
Entrez.tool = 'edrn-pubs'  # Name of our "tool" for Entrez API identification
Entrez.email = 'sean.kelly@nih.gov'  # Owner of the "tool"

# Batching parameters used by ``main``
_batch_size = 20  # How many abstracts to get at a time
_wait_betwixt = 5  # How long in seconds to let the API rest between batches

# RDF predicate that contains PubMed IDs
_pubmed_predicate = rdflib.URIRef('http://edrn.nci.nih.gov/rdf/schema.rdf#pmid')

# Sources of EDRN publications: URLs that are read as RDF by ``_read_rdf``
_rdf_sources = [
    'https://bmdb.jpl.nasa.gov/rdf/publications',  # Biomarker database publications
    'https://edrn.jpl.nasa.gov/cancerdataexpo/rdf-data/publications/@@rdf'  # DMCC-tracked publications
]
def _read_rdf(url: str) -> dict[rdflib.URIRef, dict[rdflib.URIRef, list[rdflib.URIRef | rdflib.Literal]]]:
    '''Parse the RDF at ``url`` and index its statements by subject.

    The result maps each subject URI to a dictionary of that subject's predicates; each
    predicate URI in turn maps to the list of objects (URI references or literals) stated
    for that subject/predicate pair.
    '''
    _logger.debug('Parsing RDF at %s', url)
    triples = rdflib.Graph()
    triples.parse(url)
    by_subject = {}
    for subject, predicate, obj in triples:
        # Create the per-subject dict and per-predicate list on first sight, then record the object
        by_subject.setdefault(subject, {}).setdefault(predicate, []).append(obj)
    return by_subject
def _get_pub_med_ids(rdf_sources: list[str]) -> set[str]:
    '''Given a list of possible sources of RDF-based information, determine the unique set of
    PubMed IDs represented.

    Each entry in ``rdf_sources`` is read with ``_read_rdf``. For every subject found, the
    objects of the ``_pubmed_predicate`` predicate (if any) are converted to strings and
    collected. Subjects without that predicate contribute nothing. The return value is the
    union of the IDs across all sources.
    '''
    _logger.debug('Getting RDF from %r', rdf_sources)
    ids = set()
    for rdf_source in rdf_sources:
        # Only the predicate dictionaries matter here; the subject URIs are not needed
        for predicates in _read_rdf(rdf_source).values():
            ids.update(str(pubmedid) for pubmedid in predicates.get(_pubmed_predicate, []))
    _logger.debug('Found %d unique pubmed IDs', len(ids))
    return ids
def _divvy(ids: list[str], batch_size: int) -> Iterator[list[str]]:
    '''Divvy up a list of string ``ids`` into consecutive batches of at most ``batch_size``.

    This is a generator. It yields the batches in order; every batch holds exactly
    ``batch_size`` items except possibly the last, which holds whatever remains. An empty
    ``ids`` yields nothing, and ``ids`` itself is never modified.

    Raises ``ValueError`` (when iteration starts) if ``batch_size`` is not positive, since a
    batch size of zero could never make progress through the list.
    '''
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, not {batch_size!r}')
    # Step through by index so each item is copied once, instead of re-slicing the remainder
    for start in range(0, len(ids), batch_size):
        yield ids[start:start + batch_size]
def _retrieve_abstracts(ids: list[str]) -> list[str]:
    '''Retrieve the abstracts for pubmed ``ids`` and return them as a sequence of strings.

    The IDs are fetched from the ``pubmed`` database in a single Entrez ``efetch`` request.
    For each ``PubmedArticle`` record in the response, the pieces of its ``AbstractText``
    are converted to strings and joined with newlines into one abstract. Records that have
    no abstract are logged at the info level and skipped, so the returned list may be
    shorter than ``ids``. An empty ``ids`` returns an empty list without contacting Entrez.
    '''
    _logger.debug('Retrieving abstracts for %d pubmed IDs', len(ids))
    if not ids:
        # A fetch with no IDs has nothing to return, so skip the network round trip
        return []
    with contextlib.closing(Entrez.efetch(db='pubmed', retmode='xml', rettype='medline', id=ids)) as ef:
        records = Entrez.read(ef)
    abstracts = []
    for record in records['PubmedArticle']:
        citation = record['MedlineCitation']
        pubmedid = str(citation['PMID'])
        try:
            # Any of these keys may be absent when the article has no abstract
            sections = citation['Article']['Abstract']['AbstractText']
        except KeyError:
            _logger.info('No abstract available for pubmed ID %s; skipping it', pubmedid)
        else:
            abstracts.append('\n'.join(str(section) for section in sections))
    return abstracts
def main():
    '''Write all known EDRN abstracts to the standard output.

    The PubMed IDs found in ``_rdf_sources`` are fetched in batches of ``_batch_size``,
    pausing ``_wait_betwixt`` seconds between consecutive batches. Each abstract retrieved
    is printed as it becomes available.
    '''
    # Sort so the output order is the same from run to run; set iteration order is not
    pub_med_ids = sorted(_get_pub_med_ids(_rdf_sources))
    for batch_number, batch in enumerate(_divvy(pub_med_ids, _batch_size)):
        if batch_number > 0:
            # Rest only between requests; there is no need to wait after the final batch
            time.sleep(_wait_betwixt)
        for abstract in _retrieve_abstracts(batch):
            print(abstract)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment