Skip to content

Instantly share code, notes, and snippets.

@pafonta
Last active November 19, 2021 15:27
Show Gist options
  • Save pafonta/d33a0d5d849932f8ceab8b711d995497 to your computer and use it in GitHub Desktop.
Save pafonta/d33a0d5d849932f8ceab8b711d995497 to your computer and use it in GitHub Desktop.
# For an example of use, see https://gist.github.com/pafonta/d33a0d5d849932f8ceab8b711d995497#gistcomment-3965575.
"""Find MeSH terms in the MeSH tree simply (i.e. without using a graph)."""
from __future__ import annotations
import json
from collections.abc import Iterator
from xml.etree.ElementTree import Element # nosec
from defusedxml import ElementTree
def is_child(mesh: Element, roots: list[str]) -> bool:
    """Check if a MeSH term is a child of the given MeSH terms in the MeSH tree.

    The check is a plain string-prefix comparison between each tree number of
    the term and each root: a root ``"A11.650"`` matches the tree numbers
    ``"A11.650"`` and ``"A11.650.200"``.

    Parameters
    ----------
    mesh
        The MeSH term. Its tree numbers are read from the
        ``TreeNumberList/TreeNumber`` sub-elements.
    roots
        The tree-number prefixes of the root MeSH terms whose children are
        looked for.

    Returns
    -------
    bool
        True, if the MeSH term is under one of the root MeSH terms or is one of them.
        False, if there is no root or if the term has no (non-empty) tree number.
    """
    # str.startswith accepts a tuple of prefixes, which replaces the inner loop
    # over the roots. An empty tuple never matches.
    prefixes = tuple(roots)
    for node in mesh.iterfind("TreeNumberList/TreeNumber"):
        number = node.text
        # An empty <TreeNumber/> element has its text set to None: skip it
        # instead of failing with an AttributeError.
        if number is not None and number.startswith(prefixes):
            return True
    return False
def collect(mesh_tree: ElementTree, roots: list[str]) -> Iterator[str]:
    """Yield the names of the MeSH terms located under the given roots.

    Parameters
    ----------
    mesh_tree
        The MeSH tree, walked through its ``DescriptorRecord`` elements.
    roots
        The root MeSH terms whose children are looked for. They are passed
        unchanged to ``is_child``.

    Yields
    ------
    str
        The name of each MeSH term accepted by ``is_child``, with every run
        of whitespace collapsed into a single space.
    """
    for descriptor in mesh_tree.iter("DescriptorRecord"):
        if not is_child(descriptor, roots):
            continue
        label = descriptor.find("DescriptorName/String")
        # split() + join() strips the ends and collapses inner whitespace.
        yield " ".join(label.text.split())
@pafonta
Copy link
Author

pafonta commented Nov 17, 2021

Example of use

Import utility functions:

from nlm_mesh_tree import *

Parse the MeSH tree:

# wget https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2021.xml
mesh_tree: ElementTree = ElementTree.parse("desc2021.xml")

Define the MeSH terms to use as roots:

targets = [
    "A08",
    "A11.650",
    "A11.671",
    "E01.370.376",
    "E05.393.332",
    "E05.599.395.642",
    "E05.629",
    "G01.358.500.249.277",
    "G02.111.820.850",
    "G03.493",
    "G04.580",
    "G04.835.850",
    "G05.308",
    "G07.265",
    "G11.561",
    "H01.158.273.180",
    "L01.313.124",
]

Collect the child MeSH terms (the root terms themselves are included too):

mesh_names = set(collect(mesh_tree, targets))

[Optional] Save the collected MeSH terms:

with open("mesh_names.json", "w") as f:
    ordered = sorted(mesh_names)
    json.dump(ordered, f)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment