Last active
August 15, 2016 23:52
-
-
Save myedibleenso/f233359445461a71ad37017393fe921f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*-
try: | |
# python 3.X | |
from urllib.request import urlopen, urlretrieve | |
except: | |
# python 2.7 | |
from urllib2 import urlopen | |
from urllib import urlretrieve | |
import re | |
import argparse | |
import sys | |
import os | |
""" | |
usage: python fetch_nxml.py --pmcids PMC1234 PMC1235 | |
""" | |
# pubmed documentation: http://www.ncbi.nlm.nih.gov/books/NBK25499/ | |
class PubMedResources(object):
    """Static helpers that download nxml files through the NCBI efetch URL."""

    @staticmethod
    def retrieve_nxml_abstract(pmid, outfile=None):
        """
        Download the nxml file of the abstract for ``pmid``.

        The efetch response is saved to ``outfile``; when ``outfile`` is
        falsy the file is written to "<pmid>.nxml" instead.
        """
        destination = outfile if outfile else "{}.nxml".format(pmid)
        url_template = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=abstract"
        urlretrieve(url_template.format(pmid), destination)

    @staticmethod
    def retrieve_nxml_paper(pmcid, outfile=None):
        """
        Download the nxml file for ``pmcid``.

        The efetch response is saved to ``outfile``; when ``outfile`` is
        falsy the file is written to "<pmcid>.nxml" instead.
        """
        destination = outfile if outfile else "{}.nxml".format(pmcid)
        url_template = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={}"
        urlretrieve(url_template.format(pmcid), destination)
class PubMedEntry(object):
    """
    Identifiers of one PubMed record, resolved through the NCBI ID converter.

    Constructing an instance queries the converter for ``someid`` and stores
    the ``pmcid`` and ``pmid`` attribute values found in the response as
    ``self.pmcid`` and ``self.pmid`` (both strings).
    """

    def __init__(self, someid):
        self.convert_ids(someid)

    def convert_ids(self, someid):
        """
        Query the ID converter for ``someid`` and set ``pmcid`` and ``pmid``.

        Raises:
            IndexError: if the response carries no ``pmcid`` or no ``pmid``
                attribute (``pmcid`` is looked up, and assigned, first).
        """
        # Imported here so the method does not rely on a new file-level import.
        from contextlib import closing

        # id conversion api: http://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
        url = "http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={}".format(someid)
        # closing() instead of a bare ``with urlopen(...)``: the object returned
        # by Python 2's urllib2.urlopen is not a context manager, so the bare
        # form only works on Python 3.
        with closing(urlopen(url)) as response:
            html = response.read().decode("utf-8")
        self.pmcid = self._first_attribute("pmcid", html)
        self.pmid = self._first_attribute("pmid", html)

    @staticmethod
    def _first_attribute(name, html):
        """Return the value of the first ` name="..."` attribute in ``html``."""
        # The negative lookbehind skips an attribute whose leading space is
        # directly preceded by the text "version".
        found = re.search(r'(?<!version) {}="(.*?)"'.format(name), html)
        if found is None:
            # Same exception type the previous ``re.findall(...)[0]`` raised,
            # but with a message that says what was missing.
            raise IndexError(
                "no {} attribute found in the ID converter response".format(name)
            )
        return found.group(1)
def parse_args(argv=None):
    """
    Parse the command-line options of the nxml fetching script.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default) argparse falls back to ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        The argparse namespace; ``pmcids`` holds the list of PMCID strings
        given after ``--pmcids`` (at least one is required).

    Raises:
        SystemExit: raised by argparse when ``--pmcids`` is missing or has
            no values.
    """
    parser = argparse.ArgumentParser(description='Retrieve nxml from an PMCID')
    parser.add_argument(
        '--pmcids',
        dest="pmcids",
        nargs='+',
        required=True,
        help='a list of PMCIDs (delimited by whitespace)',
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Download "<pmcid>.nxml" for every PMCID given on the command line and
    # report, per id, whether the download produced a usable file.
    args = parse_args()
    # Marker searched for in the downloaded file to recognise a PMCID that is
    # not available. Kept as bytes so the check does not depend on the
    # platform's default text encoding.
    error_msg = b"error=\"The following PMCID is not available: "
    for pmcid in args.pmcids:
        outfile = "{}.nxml".format(pmcid)
        PubMedResources.retrieve_nxml_paper(pmcid, outfile)
        # Read inside a ``with`` block so the handle is closed before the
        # file may be removed below.
        with open(outfile, 'rb') as downloaded:
            unavailable = error_msg in downloaded.read()
        if unavailable:
            os.remove(outfile)
            print("Unable to retrieve nxml for {}".format(pmcid))
        else:
            print("retrieved {}".format(outfile))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment