Last active
November 19, 2021 15:27
-
-
Save pafonta/162c1b9ec0380e95a017297a707a4d66 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For an example of use, see https://gist.github.com/pafonta/162c1b9ec0380e95a017297a707a4d66#gistcomment-3935739. | |
"""Find & Rank MeSH terms associated with an author.""" | |
from __future__ import annotations | |
import json | |
from collections import Counter | |
from collections.abc import Iterator | |
from xml.etree.ElementTree import Element # nosec | |
import requests | |
from defusedxml import ElementTree | |
def articles_search(author: str, limit: int = 200, timeout: float = 30.0) -> list[str]:
    """Find articles from an author.

    The search is sent to the ``esearch`` endpoint of the NCBI E-utilities,
    restricted to the ``pubmed`` database and to the ``[Author]`` field. The
    total number of matching articles reported by the service is printed to
    standard output.

    Parameters
    ----------
    author
        Name of the author.
    limit
        Maximum number of returned articles.
    timeout
        Number of seconds to wait for the service before giving up.

    Returns
    -------
    list of str
        Identifiers of the articles from the given author.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If the service does not answer within ``timeout`` seconds.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{author}[Author]",
        "retmax": limit,
        "retmode": "json",
    }
    # Let requests build the query string: an author name containing reserved
    # characters (e.g. '&' or '#') would otherwise corrupt the request.
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    result = response.json()["esearchresult"]
    identifiers = result["idlist"]
    print(f"{result['count']} articles found")
    return identifiers
def articles_fetch(identifiers: list[str], timeout: float = 30.0) -> Element:
    """Retrieve articles for the given article identifiers.

    The metadata is requested as XML from the ``efetch`` endpoint of the NCBI
    E-utilities, for the ``pubmed`` database.

    Parameters
    ----------
    identifiers
        Identifiers of the articles.
    timeout
        Number of seconds to wait for the service before giving up.

    Returns
    -------
    Element
        Metadata of the articles as a parsed XML tree (root element).

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If the service does not answer within ``timeout`` seconds.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(identifiers),
        "retmode": "xml",
    }
    # Let requests build and encode the query string.
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # Hand the raw bytes to the parser so that it honours the encoding
    # declared by the XML document itself.
    return ElementTree.fromstring(response.content)
def articles_mesh(articles: Element, only_major: bool = False) -> Iterator[str]:
    """Iterate over the MeSH descriptor names attached to the given articles.

    Parameters
    ----------
    articles
        Metadata of the articles as a parsed XML tree.
    only_major
        If True, keep only the descriptors whose ``MajorTopicYN`` attribute
        is ``"Y"``.

    Yields
    ------
    str
        Text of each selected ``DescriptorName`` element, in document order.
    """
    descriptors = articles.iterfind(
        "PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName"
    )
    for descriptor in descriptors:
        # Drop non-major descriptors only when the caller asked for it.
        if only_major and descriptor.get("MajorTopicYN") != "Y":
            continue
        yield descriptor.text
def top(data: Iterator[str], k: int) -> list[tuple[int, str]]:
    """Rank the elements of the data by how often they occur.

    Parameters
    ----------
    data
        Elements to count and order.
    k
        Number of most repeated elements to keep.

    Returns
    -------
    list of tuple
        ``(count, element)`` pairs for the ``k`` most repeated elements,
        from the most to the least repeated.
    """
    # Counter.most_common gives (element, count); callers expect the count first.
    return [(occurrences, item) for item, occurrences in Counter(data).most_common(k)]
def mesh_numbers(tree: ElementTree, names: set[str]) -> Iterator[str]:
    """Find the tree numbers of the given MeSH names.

    Every ``DescriptorRecord`` element of the tree is visited. A record is
    selected when the text of its ``DescriptorName/String`` element is one of
    the requested names. Records without such an element are skipped.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    names
        MeSH names.

    Yields
    ------
    str
        Tree number of the MeSH name. A name with several tree numbers
        yields each of them, in document order.
    """
    for record in tree.iter("DescriptorRecord"):
        label = record.find("DescriptorName/String")
        # A record without a name cannot match any requested name: skip it
        # rather than fail with AttributeError on ``None.text``.
        if label is None or label.text not in names:
            continue
        for number in record.iterfind("TreeNumberList/TreeNumber"):
            yield number.text
def mesh_names(tree: ElementTree, numbers: set[str]) -> Iterator[tuple[str, str]]:
    """Find the names of the given MeSH tree numbers.

    Every ``DescriptorRecord`` element of the tree is visited. A record is
    selected when at least one of its ``TreeNumberList/TreeNumber`` values is
    one of the requested tree numbers. Selected records without a
    ``DescriptorName/String`` element are skipped.

    Parameters
    ----------
    tree
        MeSH metadata as a parsed XML tree.
    numbers
        MeSH tree numbers.

    Yields
    ------
    tuple
        ``(tree number, name)`` pair for each requested tree number found.
        Records are visited in document order; within a record, the tree
        numbers are yielded in sorted order.
    """
    for record in tree.iter("DescriptorRecord"):
        found = {
            node.text
            for node in record.iterfind("TreeNumberList/TreeNumber")
            if node.text is not None
        }
        matched = numbers & found
        if not matched:
            continue
        # Look the name up only for matching records.
        label = record.find("DescriptorName/String")
        if label is None:
            # Nothing to report for a record without a name.
            continue
        # Sort so that the output order does not depend on set iteration
        # order, which can change from one run to the next.
        for number in sorted(matched):
            yield number, label.text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example of use
Import utility functions:
Parse the MeSH tree:
Find the articles from author Lastname I, where I is the initial(s):
Retrieve the metadata of the articles:
Find the MeSH names for the articles:
Find the corresponding MeSH tree numbers:
Consider a specific level of the MeSH tree. Find the corresponding MeSH names:
Display the selected MeSH tree numbers and names: