ejmurray/scrapeUpdate.py

## scrapeUpdate.py
#!/usr/bin/env python

import urllib
import xml.etree.ElementTree as ET
import cgi
import pyperclip
import sys
# TODO: read the xml file of the article in question to figure out where to add the doi.


def escapeToHTML(text, escapeQuotes=False):
    # unicodeText = unicode(text, 'utf-8') # convert '\xce\xb3' escaping into u'\u03b3' escaping
    # escape html symbols, like <>&
    htmlEscapedText = cgi.escape(text, escapeQuotes)
    # encode non-ascii characters into xhtml entities, like &#947;
    htmlEntityText = htmlEscapedText.encode('utf-8', 'xmlcharrefreplace')
    return htmlEntityText


def checkXML(XML, path):
    if XML.find(path) is not None:
        if XML.find(path).text is not None:
            return XML.find(path).text
        else:
            return ""
    else:
        return ""


def digest_authors(authors):
    author_list = []
    for author in authors:
        author_list.append(checkXML(author, "./Initials") + " " + checkXML(author, "./LastName"))

    return author_list


def digest_issue(issue):
    issue_string = ""
    issue_string += checkXML(issue, "./Volume")
    if issue.find("./Issue") is not None:
        issue_string += "(" + issue.find("./Issue").text + ")"
    return issue_string


def digest_year(XML):
    if XML.find("./Article/Journal/JournalIssue/PubDate/Year") is not None:
        return XML.find("./Article/Journal/JournalIssue/PubDate/Year").text
    elif XML.find("./Article/Journal/JournalIssue/PubDate/MedlineDate") is not None:
        return XML.find("./Article/Journal/JournalIssue/PubDate/MedlineDate").text[0:4]
    else:
        return "1900"


def search_pubmed(term):
    params= {
        'db': 'pubmed',
        'tool': 'test',
        'email':'test@test.com',
        'term': term,
        'usehistory':'y',
        'retmax':20
        }
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode(params)

    tree = ET.fromstring(urllib.urlopen(url).read())

    params['query_key'] = tree.find("./QueryKey").text
    params['WebEnv'] = tree.find("./WebEnv").text
    params['retmode'] = 'xml'

    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.urlencode(params)
    data = urllib.urlopen(url).read()

    return data


def xml_to_papers(data):
    tree = ET.fromstring(data)
    articles = tree.findall("./PubmedArticle/MedlineCitation")

    papers = []
    for article in articles:
        paper = dict()
        paper["journal_name"] = article.find("./Article/Journal/ISOAbbreviation").text
        paper["title"] = article.find("./Article/ArticleTitle").text
        paper["authors"] = digest_authors(article.findall("./Article/AuthorList/Author"))
        paper["issue"] = digest_issue(article.find("./Article/Journal/JournalIssue"))
        paper["year"] = digest_year(article)
        paper["page_num"] = checkXML(article, "./Article/Pagination/MedlinePgn")
        paper["pmid"] = article.find("./PMID").text
        paper["doi"] = checkXML(article, "./ELocationID")
        # paper["doi"] = article.find("./Journal/ELocationID").text

        papers.append(paper)

    return papers


def printCV():
    # Collect papers in dictionary structure

    if len(sys.argv) > 1:
        # Get address from command line.
        address = ' '.join(sys.argv[1:])
    else:
        # Get address from clipboard.
        title = pyperclip.paste()  # use this to get the title from the clipboard.
        # webbrowser.open('https://scholar.google.co.uk/scholar?hl=en&q=' + title)
        # webbrowser.open('http://www.ncbi.nlm.nih.gov/pubmed/?term=' + title)
        papers = xml_to_papers(search_pubmed(title))
    return papers

cache = []

papers = printCV()

for paper in papers:
    authorlist = ""
    num_authors = len(paper["authors"])
    for a in range(num_authors):
        authorlist += escapeToHTML(paper["authors"][a])
        if a < num_authors-1:
            authorlist += ", "
        else:
            authorlist += "."

    print authorlist
    print (paper["title"])
    print "http://www.ncbi.nlm.nih.gov/pubmed/" + paper["pmid"]
    if paper["journal_name"][-1] == ".":
        j_title = paper["journal_name"][0:-1]
    else:
        j_title = paper["journal_name"]
    if len(paper["doi"]) > 0:
        print j_title + " " + paper["year"]+" " + paper["issue"] + ":" + paper["page_num"] + "doi: <a href='http://doi.org/" + paper["doi"] + "'>" + paper["doi"] + " </p>"
    else:
        print j_title + ". " + paper["year"]+", " + paper["issue"] + ":" + paper["page_num"] + paper["doi"] + "."
	#!/usr/bin/env python

	import urllib
	import xml.etree.ElementTree as ET
	import cgi
	import pyperclip
	import sys
	# TODO: read the xml file of the article in question to figure out where to add the doi.


	def escapeToHTML(text, escapeQuotes=False):
	# unicodeText = unicode(text, 'utf-8') # convert '\xce\xb3' escaping into u'\u03b3' escaping
	# escape html symbols, like <>&
	htmlEscapedText = cgi.escape(text, escapeQuotes)
	# encode non-ascii characters into xhtml entities, like γ
	htmlEntityText = htmlEscapedText.encode('utf-8', 'xmlcharrefreplace')
	return htmlEntityText


	def checkXML(XML, path):
	if XML.find(path) is not None:
	if XML.find(path).text is not None:
	return XML.find(path).text
	else:
	return ""
	else:
	return ""


	def digest_authors(authors):
	author_list = []
	for author in authors:
	author_list.append(checkXML(author, "./Initials") + " " + checkXML(author, "./LastName"))

	return author_list


	def digest_issue(issue):
	issue_string = ""
	issue_string += checkXML(issue, "./Volume")
	if issue.find("./Issue") is not None:
	issue_string += "(" + issue.find("./Issue").text + ")"
	return issue_string


	def digest_year(XML):
	if XML.find("./Article/Journal/JournalIssue/PubDate/Year") is not None:
	return XML.find("./Article/Journal/JournalIssue/PubDate/Year").text
	elif XML.find("./Article/Journal/JournalIssue/PubDate/MedlineDate") is not None:
	return XML.find("./Article/Journal/JournalIssue/PubDate/MedlineDate").text[0:4]
	else:
	return "1900"


	def search_pubmed(term):
	params= {
	'db': 'pubmed',
	'tool': 'test',
	'email':'test@test.com',
	'term': term,
	'usehistory':'y',
	'retmax':20
	}
	url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode(params)

	tree = ET.fromstring(urllib.urlopen(url).read())

	params['query_key'] = tree.find("./QueryKey").text
	params['WebEnv'] = tree.find("./WebEnv").text
	params['retmode'] = 'xml'

	url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.urlencode(params)
	data = urllib.urlopen(url).read()

	return data


	def xml_to_papers(data):
	tree = ET.fromstring(data)
	articles = tree.findall("./PubmedArticle/MedlineCitation")

	papers = []
	for article in articles:
	paper = dict()
	paper["journal_name"] = article.find("./Article/Journal/ISOAbbreviation").text
	paper["title"] = article.find("./Article/ArticleTitle").text
	paper["authors"] = digest_authors(article.findall("./Article/AuthorList/Author"))
	paper["issue"] = digest_issue(article.find("./Article/Journal/JournalIssue"))
	paper["year"] = digest_year(article)
	paper["page_num"] = checkXML(article, "./Article/Pagination/MedlinePgn")
	paper["pmid"] = article.find("./PMID").text
	paper["doi"] = checkXML(article, "./ELocationID")
	# paper["doi"] = article.find("./Journal/ELocationID").text

	papers.append(paper)

	return papers


	def printCV():
	# Collect papers in dictionary structure

	if len(sys.argv) > 1:
	# Get address from command line.
	address = ' '.join(sys.argv[1:])
	else:
	# Get address from clipboard.
	title = pyperclip.paste() # use this to get the title from the clipboard.
	# webbrowser.open('https://scholar.google.co.uk/scholar?hl=en&q=' + title)
	# webbrowser.open('http://www.ncbi.nlm.nih.gov/pubmed/?term=' + title)
	papers = xml_to_papers(search_pubmed(title))
	return papers

	cache = []

	papers = printCV()

	for paper in papers:
	authorlist = ""
	num_authors = len(paper["authors"])
	for a in range(num_authors):
	authorlist += escapeToHTML(paper["authors"][a])
	if a < num_authors-1:
	authorlist += ", "
	else:
	authorlist += "."

	print authorlist
	print (paper["title"])
	print "http://www.ncbi.nlm.nih.gov/pubmed/" + paper["pmid"]
	if paper["journal_name"][-1] == ".":
	j_title = paper["journal_name"][0:-1]
	else:
	j_title = paper["journal_name"]
	if len(paper["doi"]) > 0:
	print j_title + " " + paper["year"]+" " + paper["issue"] + ":" + paper["page_num"] + "doi: <a href='http://doi.org/" + paper["doi"] + "'>" + paper["doi"] + " </p>"
	else:
	print j_title + ". " + paper["year"]+", " + paper["issue"] + ":" + paper["page_num"] + paper["doi"] + "."