Skip to content

Instantly share code, notes, and snippets.

@pafonta
Last active November 19, 2021 15:27
Show Gist options
  • Save pafonta/162c1b9ec0380e95a017297a707a4d66 to your computer and use it in GitHub Desktop.
Save pafonta/162c1b9ec0380e95a017297a707a4d66 to your computer and use it in GitHub Desktop.
# For an example of use, see https://gist.github.com/pafonta/162c1b9ec0380e95a017297a707a4d66#gistcomment-3935739.
"""Find & Rank MeSH terms associated with an author."""
from __future__ import annotations
import json
from collections import Counter
from collections.abc import Iterator
from xml.etree.ElementTree import Element # nosec
import requests
from defusedxml import ElementTree
def articles_search(author: str, limit: int = 200) -> list[str]:
    """Find articles from an author.

    The search is run against PubMed through the NCBI E-utilities ``esearch``
    endpoint. The total number of matching articles is printed, which can be
    larger than the number of returned identifiers when ``limit`` is reached.

    Parameters
    ----------
    author
        Name of the author.
    limit
        Maximum number of returned articles.

    Returns
    -------
    list of str
        Identifiers of the articles from the given author.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not answer in time.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Parameters are passed separately so that requests percent-encodes them.
    # Interpolating the author name directly into the URL would break the
    # query for names containing characters such as '&', '#' or '+'.
    params = {
        "db": "pubmed",
        "term": f"{author}[Author]",
        "retmax": limit,
        "retmode": "json",
    }
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    # Fail with an explicit HTTP error instead of an obscure parsing error.
    response.raise_for_status()
    data = json.loads(response.content)
    result = data["esearchresult"]
    identifiers = result["idlist"]
    print(f"{result['count']} articles found")
    return identifiers
def articles_fetch(identifiers: list[str]) -> Element:
    """Retrieve articles for the given article identifiers.

    The metadata is retrieved from PubMed through the NCBI E-utilities
    ``efetch`` endpoint, in XML.

    Parameters
    ----------
    identifiers
        Identifiers of the articles.

    Returns
    -------
    Element
        Metadata of the articles as a parsed XML tree.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not answer in time.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Parameters are passed separately so that requests percent-encodes them.
    params = {
        "db": "pubmed",
        "id": ",".join(identifiers),
        "retmode": "xml",
    }
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    # Fail with an explicit HTTP error instead of an obscure parsing error.
    response.raise_for_status()
    # The raw bytes are parsed so that the XML parser honours the encoding
    # declared in the document, instead of assuming UTF-8 beforehand.
    return ElementTree.fromstring(response.content)
def articles_mesh(articles: Element, only_major: bool = False) -> Iterator[str]:
    """Iterate over the MeSH descriptor names attached to the given articles.

    Parameters
    ----------
    articles
        Metadata of the articles as a parsed XML tree.
    only_major
        If True, keep only the descriptors flagged as major topic, i.e. the
        ones whose ``MajorTopicYN`` attribute is ``"Y"``.

    Yields
    ------
    str
        Text of each selected ``DescriptorName`` element, in document order.
    """
    descriptors = articles.iterfind(
        "PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName"
    )
    for descriptor in descriptors:
        # Every descriptor passes when no filtering is requested; otherwise
        # only the ones marked as major topic do.
        if not only_major or descriptor.get("MajorTopicYN") == "Y":
            yield descriptor.text
def top(data: Iterator[str], k: int) -> list[tuple[int, str]]:
    """Rank the elements of the given data by number of occurrences.

    Parameters
    ----------
    data
        Elements to count and order.
    k
        Number of most repeated elements to keep.

    Returns
    -------
    list of tuple
        Pairs ``(count, element)`` for the ``k`` most repeated elements, from
        the most to the least repeated.
    """
    ranking = []
    # Counter.most_common() gives (element, count) pairs: swap them so that
    # the count comes first.
    for element, occurrences in Counter(data).most_common(k):
        ranking.append((occurrences, element))
    return ranking
def mesh_numbers(tree: ElementTree, names: set[str]) -> Iterator[str]:
    """Look up the tree numbers of the given MeSH names.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    names
        MeSH names.

    Yields
    ------
    str
        Each tree number of every descriptor record whose name is in
        ``names``. A record with several tree numbers yields all of them.
    """
    for record in tree.iter("DescriptorRecord"):
        label = record.find("DescriptorName/String").text
        if label not in names:
            continue
        yield from (
            node.text for node in record.iterfind("TreeNumberList/TreeNumber")
        )
def mesh_names(tree: ElementTree, numbers: set[str]) -> Iterator[tuple[str, str]]:
    """Look up the names of the given MeSH tree numbers.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    numbers
        MeSH tree numbers.

    Yields
    ------
    tuple
        Pairs ``(tree number, name)`` for every requested tree number found
        in a descriptor record. The order of the pairs coming from a same
        record is not defined.
    """
    for record in tree.iter("DescriptorRecord"):
        own_numbers = {
            node.text for node in record.findall("TreeNumberList/TreeNumber")
        }
        label = record.find("DescriptorName/String").text
        # Only the requested tree numbers carried by this record are kept.
        yield from ((number, label) for number in numbers & own_numbers)
@pafonta
Copy link
Author

pafonta commented Oct 22, 2021

Example of use

Import utility functions:

from nlm_mesh import *

Parse the MeSH tree:

# wget https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2021.xml
mesh_tree: ElementTree = ElementTree.parse("desc2021.xml")

Find the articles from author Lastname I, where I is the initial(s):

identifiers = articles_search("Lastname I")

Retrieve the metadata of the articles:

articles = articles_fetch(identifiers)

Find the MeSH names for the articles:

mesh = articles_mesh(articles, only_major=True)

Find the corresponding MeSH tree numbers:

names = set(mesh)
numbers = set(mesh_numbers(mesh_tree, names))

Consider a specific level of the MeSH tree. Find the corresponding MeSH names:

level = 2  # In the MeSH tree, starting at 0.
limit = (3 + 4 * level)
truncated = {x[:limit] for x in numbers}
mapping = dict(mesh_names(mesh_tree, truncated))

Display the selected MeSH tree numbers and names:

for number in sorted(truncated):
    print(f"{number} | {mapping[number]}")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment