# pafonta/nlm_mesh.py

## nlm_mesh.py
# For an example of use, see https://gist.github.com/pafonta/162c1b9ec0380e95a017297a707a4d66#gistcomment-3935739.

"""Find & Rank MeSH terms associated with an author."""

from __future__ import annotations

import json
from collections import Counter
from collections.abc import Iterator
from xml.etree.ElementTree import Element  # nosec

import requests
from defusedxml import ElementTree


def articles_search(author: str, limit: int = 200) -> list[str]:
    """Find articles from an author.

    The search is sent to the NCBI E-utilities ``esearch`` endpoint for the
    ``pubmed`` database, restricted to the ``[Author]`` field. The total number
    of matches reported by the service is printed.

    Parameters
    ----------
    author
        Name of the author.
    limit
        Maximum number of returned articles.

    Returns
    -------
    list of str
        Identifiers of the articles from the given author.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If the service does not answer within the timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Let requests build the query string so that spaces, '&', '#', ... in the
    # author name are percent-encoded instead of corrupting the URL.
    params = {
        "db": "pubmed",
        "term": f"{author}[Author]",
        "retmax": limit,
        "retmode": "json",
    }
    # Without a timeout, an unresponsive server would block the caller forever.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP failures directly rather than as a confusing JSON/KeyError.
    response.raise_for_status()
    result = json.loads(response.content)["esearchresult"]
    print(f"{result['count']} articles found")
    return result["idlist"]


def articles_fetch(identifiers: list[str]) -> Element:
    """Retrieve articles for the given article identifiers.

    The identifiers are sent, comma-separated, to the NCBI E-utilities
    ``efetch`` endpoint for the ``pubmed`` database and the XML answer is
    parsed with ``defusedxml``.

    Parameters
    ----------
    identifiers
        Identifiers of the articles.

    Returns
    -------
    Element
        Metadata of the articles as a parsed XML tree.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If the service does not answer within the timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Let requests build and percent-encode the query string.
    params = {"db": "pubmed", "id": ",".join(identifiers), "retmode": "xml"}
    # Without a timeout, an unresponsive server would block the caller forever.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP failures directly rather than as an XML parsing error.
    response.raise_for_status()
    # Hand the raw bytes to the parser so that it applies the encoding declared
    # in the XML prolog instead of assuming UTF-8.
    return ElementTree.fromstring(response.content)


def articles_mesh(articles: Element, only_major: bool = False) -> Iterator[str]:
    """Iterate over the MeSH descriptor names attached to the articles.

    Parameters
    ----------
    articles
        Metadata of the articles as a parsed XML tree.
    only_major
        If True, skip every descriptor whose ``MajorTopicYN`` attribute is
        not ``"Y"``.

    Yields
    ------
    str
        Text of each selected ``DescriptorName`` element, in document order.
    """
    descriptor_path = (
        "PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName"
    )
    for descriptor in articles.iterfind(descriptor_path):
        # Guard clause: drop non-major descriptors only when filtering is on.
        if only_major and descriptor.get("MajorTopicYN") != "Y":
            continue
        yield descriptor.text


def top(data: Iterator[str], k: int) -> list[tuple[int, str]]:
    """Rank the elements of the data by number of occurrences.

    Parameters
    ----------
    data
        Elements to count and order.
    k
        Number of most repeated elements to keep.

    Returns
    -------
    list of tuple
        ``(count, element)`` pairs for the ``k`` most repeated elements, most
        frequent first.
    """
    ranking = Counter(data).most_common(k)
    # most_common() gives (element, count); reverse each pair to (count, element).
    return [entry[::-1] for entry in ranking]


def mesh_numbers(tree: ElementTree, names: set[str]) -> Iterator[str]:
    """Iterate over the tree numbers of the descriptors with the given names.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    names
        MeSH names.

    Yields
    ------
    str
        Each tree number listed by a ``DescriptorRecord`` whose
        ``DescriptorName/String`` text is one of the given names.
    """
    for record in tree.iter("DescriptorRecord"):
        # Skip the records whose descriptor name was not asked for.
        if record.find("DescriptorName/String").text not in names:
            continue
        yield from (
            node.text for node in record.iterfind("TreeNumberList/TreeNumber")
        )


def mesh_names(tree: ElementTree, numbers: set[str]) -> Iterator[tuple[str, str]]:
    """Iterate over the descriptor names matching the given tree numbers.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    numbers
        MeSH tree numbers.

    Yields
    ------
    tuple
        ``(tree number, name)`` for every requested tree number listed by a
        ``DescriptorRecord``, where the name is the text of its
        ``DescriptorName/String``.
    """
    for record in tree.iter("DescriptorRecord"):
        record_numbers = {
            node.text for node in record.iterfind("TreeNumberList/TreeNumber")
        }
        label = record.find("DescriptorName/String").text
        # Only the requested numbers that this record actually carries.
        yield from ((number, label) for number in numbers & record_numbers)
	# For an example of use, see https://gist.github.com/pafonta/162c1b9ec0380e95a017297a707a4d66#gistcomment-3935739.

	"""Find & Rank MeSH terms associated with an author."""

	from __future__ import annotations

	import json
	from collections import Counter
	from collections.abc import Iterator
	from xml.etree.ElementTree import Element # nosec

	import requests
	from defusedxml import ElementTree


	def articles_search(author: str, limit: int = 200) -> list[str]:
		"""Find articles from an author.

		The search is sent to the NCBI E-utilities ``esearch`` endpoint for the
		``pubmed`` database, restricted to the ``[Author]`` field. The total
		number of matches reported by the service is printed.

		Parameters
		----------
		author
			Name of the author.
		limit
			Maximum number of returned articles.

		Returns
		-------
		list of str
			Identifiers of the articles from the given author.

		Raises
		------
		requests.HTTPError
			If the service answers with an HTTP error status.
		requests.Timeout
			If the service does not answer within the timeout.
		"""
		url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
		# Let requests build the query string so that spaces, '&', '#', ... in
		# the author name are percent-encoded instead of corrupting the URL.
		params = {
			"db": "pubmed",
			"term": f"{author}[Author]",
			"retmax": limit,
			"retmode": "json",
		}
		# Without a timeout, an unresponsive server would block the caller forever.
		response = requests.get(url, params=params, timeout=30)
		# Surface HTTP failures directly rather than as a confusing JSON/KeyError.
		response.raise_for_status()
		result = json.loads(response.content)["esearchresult"]
		print(f"{result['count']} articles found")
		return result["idlist"]


	def articles_fetch(identifiers: list[str]) -> Element:
		"""Retrieve articles for the given article identifiers.

		The identifiers are sent, comma-separated, to the NCBI E-utilities
		``efetch`` endpoint for the ``pubmed`` database and the XML answer is
		parsed with ``defusedxml``.

		Parameters
		----------
		identifiers
			Identifiers of the articles.

		Returns
		-------
		Element
			Metadata of the articles as a parsed XML tree.

		Raises
		------
		requests.HTTPError
			If the service answers with an HTTP error status.
		requests.Timeout
			If the service does not answer within the timeout.
		"""
		url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
		# Let requests build and percent-encode the query string.
		params = {"db": "pubmed", "id": ",".join(identifiers), "retmode": "xml"}
		# Without a timeout, an unresponsive server would block the caller forever.
		response = requests.get(url, params=params, timeout=30)
		# Surface HTTP failures directly rather than as an XML parsing error.
		response.raise_for_status()
		# Hand the raw bytes to the parser so that it applies the encoding
		# declared in the XML prolog instead of assuming UTF-8.
		return ElementTree.fromstring(response.content)


	def articles_mesh(articles: Element, only_major: bool = False) -> Iterator[str]:
		"""Iterate over the MeSH descriptor names attached to the articles.

		Parameters
		----------
		articles
			Metadata of the articles as a parsed XML tree.
		only_major
			If True, skip every descriptor whose ``MajorTopicYN`` attribute is
			not ``"Y"``.

		Yields
		------
		str
			Text of each selected ``DescriptorName`` element, in document order.
		"""
		xpath = "PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName"
		for mesh in articles.iterfind(xpath):
			# Drop non-major descriptors only when filtering is requested.
			if only_major and mesh.get("MajorTopicYN") != "Y":
				continue
			yield mesh.text


	def top(data: Iterator[str], k: int) -> list[tuple[int, str]]:
		"""Return the most repeated elements in the given data, in descending order.

		Parameters
		----------
		data
			Elements to count and order.
		k
			Number of most repeated elements to keep.

		Returns
		-------
		list of tuple
			``(count, element)`` pairs for the ``k`` most repeated elements, most
			frequent first.
		"""
		counted = Counter(data)
		# most_common() gives (element, count); callers get (count, element).
		return [(count, element) for element, count in counted.most_common(k)]


	def mesh_numbers(tree: ElementTree, names: set[str]) -> Iterator[str]:
		"""Find the tree numbers of the given MeSH names.

		Parameters
		----------
		tree
			MeSH metadata as a parsed XML tree.
		names
			MeSH names.

		Yields
		------
		str
			Each tree number listed by a ``DescriptorRecord`` whose
			``DescriptorName/String`` text is one of the given names.
		"""
		for mesh in tree.iter("DescriptorRecord"):
			name = mesh.find("DescriptorName/String").text
			if name in names:
				for number in mesh.iterfind("TreeNumberList/TreeNumber"):
					yield number.text


	def mesh_names(tree: ElementTree, numbers: set[str]) -> Iterator[tuple[str, str]]:
		"""Find the names of the given MeSH tree numbers.

		Parameters
		----------
		tree
			MeSH metadata as a parsed XML tree.
		numbers
			MeSH tree numbers.

		Yields
		------
		tuple
			``(tree number, name)`` for every requested tree number listed by a
			``DescriptorRecord``, where the name is the text of its
			``DescriptorName/String``.
		"""
		for mesh in tree.iter("DescriptorRecord"):
			numbers_ = {x.text for x in mesh.findall("TreeNumberList/TreeNumber")}
			name = mesh.find("DescriptorName/String").text
			# Only the requested numbers that this record actually carries.
			for number in numbers & numbers_:
				yield number, name