Last active
November 19, 2021 15:40
-
-
Save pafonta/37762f56e8c1879569bca64901d0a000 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For an example of use, see https://gist.github.com/pafonta/37762f56e8c1879569bca64901d0a000#gistcomment-3968062. | |
"""Collect statistics on PubMed articles.""" | |
from __future__ import annotations | |
from pathlib import Path | |
from defusedxml import ElementTree | |
from tqdm import tqdm | |
def stats_topics(dirpath: str, topics: list[str]) -> dict[str, tuple]:
    """Statistics on topics, i.e. articles with specific MeSH terms.

    Every regular file found directly inside ``dirpath`` is parsed as XML.
    Entries that are not regular files (e.g. sub-directories) are skipped.
    Errors raised by ``ElementTree.parse`` for unreadable or malformed files
    are not caught here.

    Parameters
    ----------
    dirpath
        The directory with all the PubMed XML files.
    topics
        The list of MeSH terms of interest.

    Returns
    -------
    dict
        Per file (keyed by the file name without its last suffix), in order:
        number of articles, number of articles without MeSH terms, number of
        articles with MeSH terms which did not match, number of articles with
        MeSH terms which matched.
    """
    # Build the lookup once instead of re-scanning the list for every article.
    wanted = frozenset(topics)
    stats: dict[str, tuple] = {}
    # Sub-directories cannot be parsed as XML, so keep regular files only.
    # Sorting makes the processing order (and the progress bar) deterministic.
    filepaths = sorted(
        path for path in Path(dirpath).iterdir() if path.is_file()
    )
    for filepath in tqdm(filepaths):
        total = 0
        no_meshes = 0
        not_matched = 0
        matched = 0
        articles = ElementTree.parse(str(filepath))
        for article in articles.iter("PubmedArticle"):
            total += 1
            meshes = article.find("./MedlineCitation/MeshHeadingList")
            if meshes is None:
                no_meshes += 1
                continue
            descriptors = meshes.findall("MeshHeading/DescriptorName")
            names = {descriptor.text for descriptor in descriptors}
            # An article matches as soon as one of its descriptors is a topic.
            if names.isdisjoint(wanted):
                not_matched += 1
            else:
                matched += 1
        stats[filepath.stem] = (total, no_meshes, not_matched, matched)
    return stats
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example of use

Import utility functions:

Load the MeSH terms defining the topics of interest:

Compute statistics about these topics on PubMed:

Notes

- MeSH terms of interest, see here.
- MeSH tree, see here.