Skip to content

Instantly share code, notes, and snippets.

@ahwagner
Last active September 1, 2019 22:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahwagner/857d40b4416bcf7254ce6b366aaaaaac to your computer and use it in GitHub Desktop.
Save ahwagner/857d40b4416bcf7254ce6b366aaaaaac to your computer and use it in GitHub Desktop.
A Python function for extracting the publication date of a PubMed article as a pandas datetime object
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
__author__ = "Alex H. Wagner"
# English three-letter month abbreviations in calendar order, so that
# (index + 1) is the month number. A fixed table is used instead of
# strptime('%b') because '%b' depends on the process locale.
_MONTH_ABBREVIATIONS = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                        'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def _month_number(text):
    """Convert month text given as digits ('11') or an English name ('Nov') to an int.

    Raises:
        ValueError: if the text is neither all digits nor starts with an
            English month abbreviation.
    """
    text = text.strip()
    if text.isdigit():
        return int(text)
    try:
        # Only the first three letters are compared, so full names match too.
        return _MONTH_ABBREVIATIONS.index(text[:3].lower()) + 1
    except ValueError:
        raise ValueError("Unrecognized month: {!r}".format(text)) from None


def pandas_datetime_from_pmid(pmid):
    """Return the NCBI-reported publication date for a PubMed ID as a pandas timestamp.

    Args:
        pmid: PubMed identifier; it is interpolated into the request URL.

    Returns:
        A ``pd.Timestamp`` (a ``datetime.datetime`` subclass) built from the
        ``Year``/``Month``/``Day`` children of the first ``PubDate`` element.
        A missing ``Month`` or ``Day`` defaults to 1. ``pd.NaT`` is returned
        when the response has no ``<pre>`` element, no ``PubDate`` element,
        or a ``PubDate`` without a ``Year`` child.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the year, month, or day text cannot be parsed.
    """
    # A timeout keeps an unresponsive server from blocking the caller forever.
    resp = requests.get(
        'https://www.ncbi.nlm.nih.gov/pubmed/{0}?report=xml'.format(pmid),
        timeout=30,
    )
    resp.raise_for_status()

    # NOTE(review): this relies on the page embedding the record's XML as the
    # text of a <pre> element -- confirm the endpoint still serves that format.
    soup = BeautifulSoup(resp.content, "xml")
    pre = soup.find('pre')
    if pre is None:
        return pd.NaT

    better_soup = BeautifulSoup(pre.text, "xml")
    date = better_soup.PubDate
    # Without a Year there is nothing to build a date from (presumably the
    # record uses a free-text date form instead -- verify against NLM docs).
    if date is None or date.Year is None:
        return pd.NaT

    year = int(date.Year.text)
    month = _month_number(date.Month.text) if date.Month is not None else 1
    day = int(date.Day.text) if date.Day is not None else 1
    # pd.datetime was removed from pandas; pd.Timestamp is the supported type.
    return pd.Timestamp(year=year, month=month, day=day)
if __name__ == '__main__':
    # Demo run: look up the publication date of one hard-coded PubMed ID
    # and print it as a calendar date.
    pmid = 26531824
    published = pandas_datetime_from_pmid(pmid)
    publication_day = published.date()
    message = "DGIdb (PMID: {}) was published on {}.".format(pmid, publication_day)
    print(message)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment