Skip to content

Instantly share code, notes, and snippets.

@pafonta
Last active November 19, 2021 15:40
Show Gist options
  • Save pafonta/37762f56e8c1879569bca64901d0a000 to your computer and use it in GitHub Desktop.
Save pafonta/37762f56e8c1879569bca64901d0a000 to your computer and use it in GitHub Desktop.
# For an example of use, see https://gist.github.com/pafonta/37762f56e8c1879569bca64901d0a000#gistcomment-3968062.
"""Collect statistics on PubMed articles."""
from __future__ import annotations
from pathlib import Path
from defusedxml import ElementTree
from tqdm import tqdm
def stats_topics(dirpath: str, topics: list[str]) -> dict[str, tuple]:
    """Count, per PubMed XML file, how many articles match the given MeSH terms.

    Parameters
    ----------
    dirpath
        The directory with all the PubMed XML files. Every file directly inside
        it is parsed as XML; sub-directories are skipped.
    topics
        The MeSH terms of interest. An article matches when the text of at least
        one of its ``MeshHeading/DescriptorName`` elements is equal to one of
        these terms.

    Returns
    -------
    dict
        Keyed by file name without its last suffix. Each value holds, in order:
        number of articles, number of articles without MeSH terms, number of
        articles with MeSH terms which did not match, number of articles with
        MeSH terms which matched.
    """
    # Materialise the terms once: the disjointness check below runs for every
    # article, and a one-shot iterable would otherwise be exhausted after the
    # first article.
    wanted = frozenset(topics)

    stats: dict[str, tuple] = {}
    # Only files can be parsed; iterdir() also yields sub-directories.
    filepaths = sorted(path for path in Path(dirpath).iterdir() if path.is_file())
    for filepath in tqdm(filepaths):
        total = 0
        no_meshes = 0
        not_matched = 0
        matched = 0
        articles = ElementTree.parse(str(filepath))
        for article in articles.iter("PubmedArticle"):
            total += 1
            meshes = article.find("./MedlineCitation/MeshHeadingList")
            if meshes is None:
                no_meshes += 1
                continue
            descriptors = meshes.findall("MeshHeading/DescriptorName")
            names = {descriptor.text for descriptor in descriptors}
            if names.isdisjoint(wanted):
                not_matched += 1
            else:
                matched += 1
        stats[filepath.stem] = (total, no_meshes, not_matched, matched)
    return stats
@pafonta
Copy link
Author

pafonta commented Nov 19, 2021

Example of use

Import utility functions:

import json

from pubmed_statistics import stats_topics

Load the MeSH terms defining the topics of interest:

# 'meshes.json' is a JSON file.
# It contains a list of MeSH names of interest (see notes).
with open("meshes.json") as f:
    meshes = json.load(f)

Compute statistics about these topics on PubMed:

# 'pubmed/xml/' is a directory.
# It contains only the XML files from 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'.
statistics = stats_topics("pubmed/xml/", meshes)

Notes

  • For selecting the MeSH terms of interest, see here.
  • For selecting all their children in the MeSH tree, see here.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment