nutjob4life/extract.py

## extract.py
# encoding: utf-8

'''Extract abstracts used by EDRN publications.

To use::

    python3.10 -m venv .venv
    .venv/bin/pip install --quiet --upgrade setuptools pip wheel build
    .venv/bin/pip install biopython~=1.81 rdflib~=6.3.2
    .venv/bin/python extract.py > all-abstracts.txt
'''

from collections.abc import Iterator

from Bio import Entrez
import logging, rdflib, time, contextlib

# Module-level logger
_logger = logging.getLogger(__name__)

# How our "tool" identifies itself to the Entrez API, and who owns it
Entrez.tool = 'edrn-pubs'
Entrez.email = 'sean.kelly@nih.gov'

# Number of abstracts to get at a time
_batch_size = 20

# Seconds to let the API rest between batches
_wait_betwixt = 5

# RDF predicate that contains PubMed IDs
_pubmed_predicate = rdflib.URIRef('http://edrn.nci.nih.gov/rdf/schema.rdf#pmid')

# Sources of EDRN publications
_rdf_sources = [
    # Biomarker database publications
    'https://bmdb.jpl.nasa.gov/rdf/publications',
    # DMCC-tracked publications
    'https://edrn.jpl.nasa.gov/cancerdataexpo/rdf-data/publications/@@rdf',
]


def _read_rdf(url: str) -> dict[rdflib.URIRef, dict[rdflib.URIRef, list[rdflib.URIRef | rdflib.Literal]]]:
    '''Parse the RDF found at ``url`` and index its statements by subject.

    The result maps each subject to a dictionary of that subject's predicates, and each
    predicate to the list of objects stated for that subject and predicate, in the order
    the parsed graph yields them.
    '''
    _logger.debug('Parsing RDF at %s', url)
    triples = rdflib.Graph()
    triples.parse(url)
    index = {}
    for subject, predicate, obj in triples:
        # Create the per-subject and per-predicate containers on first sight
        index.setdefault(subject, {}).setdefault(predicate, []).append(obj)
    return index


def _get_pub_med_ids(rdf_sources: list[str]) -> set[str]:
    '''Collect the unique PubMed IDs mentioned by the given RDF sources.

    Each URL in ``rdf_sources`` is read with ``_read_rdf``; every object of the
    ``_pubmed_predicate`` on any subject is converted to a string and added to the
    returned set, so an ID that appears more than once is reported a single time.
    '''
    _logger.debug('Getting RDF from %r', rdf_sources)
    ids = set()
    for rdf_source in rdf_sources:
        # Only the predicate dictionaries matter here; the subjects are not needed
        for predicates in _read_rdf(rdf_source).values():
            ids.update(str(pubmedid) for pubmedid in predicates.get(_pubmed_predicate, []))
    _logger.debug('Found %d unique pubmed IDs', len(ids))
    return ids


def _divvy(ids: list[str], batch_size: int) -> list[str]:
    '''Divvy up a list of string ``ids`` into ``batch_size`` parts.'''
    while len(ids) > 0:
        batch, ids = ids[:batch_size], ids[batch_size:]
        yield batch


def _retrieve_abstracts(ids: list[str]) -> list[str]:
    '''Retrieve the abstracts for pubmed ``ids`` and return them as a sequence of strings.'''
    _logger.debug('Retrieving abstracts for %d pubmed IDs', len(ids))
    abstracts = []
    with contextlib.closing(Entrez.efetch(db='pubmed', retmode='xml', rettype='medline', id=ids)) as ef:
        records = Entrez.read(ef)
        for record in records['PubmedArticle']:
            pubmedid = str(record['MedlineCitation']['PMID'])
            try:
                abstract = '\n'.join([str(i) for i in record['MedlineCitation']['Article']['Abstract']['AbstractText']])
                abstracts.append(abstract)
            except KeyError:
                _logger.info('No abstract available for pubmed ID %s; skipping it', pubmedid)
    return abstracts


def main():
    '''Write all known EDRN abstracts to the standard output.

    The PubMed IDs gathered from ``_rdf_sources`` are fetched ``_batch_size`` at a time,
    pausing ``_wait_betwixt`` seconds between consecutive batches to let the API rest.
    '''
    # Sets iterate in no dependable order; sorting makes the output repeatable between runs
    pub_med_ids = sorted(_get_pub_med_ids(_rdf_sources))
    for index, batch in enumerate(_divvy(pub_med_ids, _batch_size)):
        if index > 0:
            # Pause only between requests, never after the final one
            time.sleep(_wait_betwixt)
        for abstract in _retrieve_abstracts(batch):
            print(abstract)


# Run the extraction only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
	# encoding: utf-8

	'''Extract abstracts used by EDRN publications.

	To use::

	    python3.10 -m venv .venv
	    .venv/bin/pip install --quiet --upgrade setuptools pip wheel build
	    .venv/bin/pip install biopython~=1.81 rdflib~=6.3.2
	    .venv/bin/python extract.py > all-abstracts.txt
	'''

	from Bio import Entrez
	import logging, rdflib, time, contextlib

	# Module-level logger
	_logger = logging.getLogger(__name__)

	# How our "tool" identifies itself to the Entrez API, and who owns it
	Entrez.tool = 'edrn-pubs'
	Entrez.email = 'sean.kelly@nih.gov'

	# Number of abstracts to get at a time
	_batch_size = 20

	# Seconds to let the API rest between batches
	_wait_betwixt = 5

	# RDF predicate that contains PubMed IDs
	_pubmed_predicate = rdflib.URIRef('http://edrn.nci.nih.gov/rdf/schema.rdf#pmid')

	# Sources of EDRN publications
	_rdf_sources = [
	    # Biomarker database publications
	    'https://bmdb.jpl.nasa.gov/rdf/publications',
	    # DMCC-tracked publications
	    'https://edrn.jpl.nasa.gov/cancerdataexpo/rdf-data/publications/@@rdf',
	]


	def _read_rdf(url: str) -> dict[rdflib.URIRef, dict[rdflib.URIRef, list[rdflib.URIRef | rdflib.Literal]]]:
	    '''Parse the RDF found at ``url`` and index its statements by subject.

	    The result maps each subject to a dictionary of that subject's predicates, and each
	    predicate to the list of objects stated for that subject and predicate, in the order
	    the parsed graph yields them.
	    '''
	    graph = rdflib.Graph()
	    _logger.debug('Parsing RDF at %s', url)
	    graph.parse(url)
	    statements = {}
	    for s, p, o in graph:
	        # Create the per-subject and per-predicate containers on first sight
	        statements.setdefault(s, {}).setdefault(p, []).append(o)
	    return statements


	def _get_pub_med_ids(rdf_sources: list[str]) -> set[str]:
	    '''Collect the unique PubMed IDs mentioned by the given RDF sources.

	    Each URL in ``rdf_sources`` is read with ``_read_rdf``; every object of the
	    ``_pubmed_predicate`` on any subject is converted to a string and added to the
	    returned set, so an ID that appears more than once is reported a single time.
	    '''
	    _logger.debug('Getting RDF from %r', rdf_sources)
	    ids = set()
	    for rdf_source in rdf_sources:
	        # Only the predicate dictionaries matter here; the subjects are not needed
	        for predicates in _read_rdf(rdf_source).values():
	            ids.update(str(pubmedid) for pubmedid in predicates.get(_pubmed_predicate, []))
	    _logger.debug('Found %d unique pubmed IDs', len(ids))
	    return ids


	def _divvy(ids: list[str], batch_size: int) -> list[str]:
	'''Divvy up a list of string ``ids`` into ``batch_size`` parts.'''
	while len(ids) > 0:
	batch, ids = ids[:batch_size], ids[batch_size:]
	yield batch


	def _retrieve_abstracts(ids: list[str]) -> list[str]:
	'''Retrieve the abstracts for pubmed ``ids`` and return them as a sequence of strings.'''
	_logger.debug('Retrieving abstracts for %d pubmed IDs', len(ids))
	abstracts = []
	with contextlib.closing(Entrez.efetch(db='pubmed', retmode='xml', rettype='medline', id=ids)) as ef:
	records = Entrez.read(ef)
	for record in records['PubmedArticle']:
	pubmedid = str(record['MedlineCitation']['PMID'])
	try:
	abstract = '\n'.join([str(i) for i in record['MedlineCitation']['Article']['Abstract']['AbstractText']])
	abstracts.append(abstract)
	except KeyError:
	_logger.info('No abstract available for pubmed ID %s; skipping it', pubmedid)
	return abstracts


	def main():
	    '''Write all known EDRN abstracts to the standard output.

	    The PubMed IDs gathered from ``_rdf_sources`` are fetched ``_batch_size`` at a time,
	    pausing ``_wait_betwixt`` seconds between consecutive batches to let the API rest.
	    '''
	    # Sets iterate in no dependable order; sorting makes the output repeatable between runs
	    pub_med_ids = sorted(_get_pub_med_ids(_rdf_sources))
	    for index, batch in enumerate(_divvy(pub_med_ids, _batch_size)):
	        if index > 0:
	            # Pause only between requests, never after the final one
	            time.sleep(_wait_betwixt)
	        for abstract in _retrieve_abstracts(batch):
	            print(abstract)


	# Run the extraction only when this file is executed as a script, not when it is imported
	if __name__ == '__main__':
	    main()