# victormurcia/fetch_pubmed_data.py

## fetch_pubmed_data.py
from Bio import Entrez
from Bio.Medline import parse
from io import StringIO
import pandas as pd

def fetch_pubmed_data(search_term, email, retmax=100):
    """
    Fetches data from PubMed related to a specific search term.

    Runs an ESearch query for ``search_term``, downloads the matching records
    in MEDLINE text format with EFetch, and flattens each parsed record into
    one DataFrame row.

    Parameters:
        search_term (str): The term to search for in the PubMed database.
        email (str): The email address assigned to ``Entrez.email`` before the
                     requests are made (this sets a module-wide Biopython setting).
        retmax (int, optional): The maximum number of results to retrieve. Defaults to 100.

    Returns:
        pandas.DataFrame: A DataFrame with one row per PubMed entry and the columns
                          PMID, Title, Authors, Abstract, Publication Date, Journal, Volume,
                          Issue, Pages, Affiliation, Article ID, E-Publication Date, Place of
                          Publication, Journal Abbreviation, Language, Publication Type, and MeSH Terms.
                          Fields missing from a record are filled with "N/A". Multi-valued
                          fields are joined into a single string. If the search matches
                          nothing, an empty DataFrame with the same columns is returned.
    """
    columns = [
        "PMID", "Title", "Authors", "Abstract", "Publication Date", "Journal",
        "Volume", "Issue", "Pages", "Affiliation", "Article ID",
        "E-Publication Date", "Place of Publication", "Journal Abbreviation",
        "Language", "Publication Type", "MeSH Terms",
    ]

    def as_text(value, sep=", "):
        # The MEDLINE parser hands back some fields as a single string and
        # others as a list of strings; normalise both to one string per cell.
        return value if isinstance(value, str) else sep.join(value)

    Entrez.email = email

    # try/finally so the handle is released even if parsing the reply fails.
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax)
    try:
        search_result = Entrez.read(handle)
    finally:
        handle.close()

    idlist = search_result["IdList"]
    if not idlist:
        # Nothing matched: skip the EFetch call instead of sending an empty id list.
        return pd.DataFrame(columns=columns)

    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    try:
        medline_text = handle.read()
    finally:
        handle.close()

    # Collect all rows first and build the DataFrame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    rows = [
        {
            "PMID": record.get("PMID", "N/A"),
            "Title": record.get("TI", "N/A"),
            "Authors": as_text(record.get("AU", "N/A")),
            "Abstract": record.get("AB", "N/A"),
            "Publication Date": record.get("DP", "N/A"),
            "Journal": record.get("JT", "N/A"),
            "Volume": record.get("VI", "N/A"),
            "Issue": record.get("IP", "N/A"),
            "Pages": record.get("PG", "N/A"),
            # Affiliation may arrive as a list (one entry per AD line); the
            # entries themselves contain commas, so use "; " between them.
            "Affiliation": as_text(record.get("AD", "N/A"), sep="; "),
            "Article ID": as_text(record.get("AID", "N/A")),
            "E-Publication Date": record.get("DEP", "N/A"),
            "Place of Publication": record.get("PL", "N/A"),
            "Journal Abbreviation": record.get("TA", "N/A"),
            "Language": as_text(record.get("LA", "N/A")),
            "Publication Type": as_text(record.get("PT", "N/A")),
            "MeSH Terms": as_text(record.get("MH", "N/A")),
        }
        for record in parse(StringIO(medline_text))
    ]

    return pd.DataFrame(rows, columns=columns)

# Example usage. Guarded so that importing this module does not trigger
# network requests; the query only runs when the file is executed as a script.
if __name__ == "__main__":
    search_term = "Chronic Inflammatory Demyelinating Polyneuropathy (CIDP)"
    email = "your-email@example.com"  # placeholder address
    df = fetch_pubmed_data(search_term, email)
    print(df.head())
	from Bio import Entrez
	from Bio.Medline import parse
	from io import StringIO
	import pandas as pd

	def fetch_pubmed_data(search_term, email, retmax=100):
		"""
		Fetches data from PubMed related to a specific search term.

		Runs an ESearch query for ``search_term``, downloads the matching records
		in MEDLINE text format with EFetch, and flattens each parsed record into
		one DataFrame row.

		Parameters:
			search_term (str): The term to search for in the PubMed database.
			email (str): The email address assigned to ``Entrez.email`` before the
				requests are made (this sets a module-wide Biopython setting).
			retmax (int, optional): The maximum number of results to retrieve. Defaults to 100.

		Returns:
			pandas.DataFrame: A DataFrame with one row per PubMed entry and the columns
				PMID, Title, Authors, Abstract, Publication Date, Journal, Volume,
				Issue, Pages, Affiliation, Article ID, E-Publication Date, Place of
				Publication, Journal Abbreviation, Language, Publication Type, and MeSH Terms.
				Fields missing from a record are filled with "N/A". Multi-valued
				fields are joined into a single string. If the search matches
				nothing, an empty DataFrame with the same columns is returned.
		"""
		columns = [
			"PMID", "Title", "Authors", "Abstract", "Publication Date", "Journal",
			"Volume", "Issue", "Pages", "Affiliation", "Article ID",
			"E-Publication Date", "Place of Publication", "Journal Abbreviation",
			"Language", "Publication Type", "MeSH Terms",
		]

		def as_text(value, sep=", "):
			# The MEDLINE parser hands back some fields as a single string and
			# others as a list of strings; normalise both to one string per cell.
			return value if isinstance(value, str) else sep.join(value)

		Entrez.email = email

		# try/finally so the handle is released even if parsing the reply fails.
		handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax)
		try:
			search_result = Entrez.read(handle)
		finally:
			handle.close()

		idlist = search_result["IdList"]
		if not idlist:
			# Nothing matched: skip the EFetch call instead of sending an empty id list.
			return pd.DataFrame(columns=columns)

		handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
		try:
			medline_text = handle.read()
		finally:
			handle.close()

		# Collect all rows first and build the DataFrame once; concatenating inside
		# the loop copies the whole frame on every iteration.
		rows = [
			{
				"PMID": record.get("PMID", "N/A"),
				"Title": record.get("TI", "N/A"),
				"Authors": as_text(record.get("AU", "N/A")),
				"Abstract": record.get("AB", "N/A"),
				"Publication Date": record.get("DP", "N/A"),
				"Journal": record.get("JT", "N/A"),
				"Volume": record.get("VI", "N/A"),
				"Issue": record.get("IP", "N/A"),
				"Pages": record.get("PG", "N/A"),
				# Affiliation may arrive as a list (one entry per AD line); the
				# entries themselves contain commas, so use "; " between them.
				"Affiliation": as_text(record.get("AD", "N/A"), sep="; "),
				"Article ID": as_text(record.get("AID", "N/A")),
				"E-Publication Date": record.get("DEP", "N/A"),
				"Place of Publication": record.get("PL", "N/A"),
				"Journal Abbreviation": record.get("TA", "N/A"),
				"Language": as_text(record.get("LA", "N/A")),
				"Publication Type": as_text(record.get("PT", "N/A")),
				"MeSH Terms": as_text(record.get("MH", "N/A")),
			}
			for record in parse(StringIO(medline_text))
		]

		return pd.DataFrame(rows, columns=columns)

	# Example usage. Guarded so that importing this module does not trigger
	# network requests; the query only runs when the file is executed as a script.
	if __name__ == "__main__":
		search_term = "Chronic Inflammatory Demyelinating Polyneuropathy (CIDP)"
		email = "your-email@example.com"  # placeholder address
		df = fetch_pubmed_data(search_term, email)
		print(df.head())