# egaudrain/journal_report.py

## journal_report.py
#-*- coding: utf-8 -*-

import xml.etree.ElementTree
import urllib2, urllib
import sqlite3
import datetime, time
from numpy import *
from scipy import *
from matplotlib import pylab, cm
from matplotlib.font_manager import FontProperties

#=================================================
# Create Cache database
#=================================================

def get_element_date(e):
	"""Parse the Year/Month/Day children of an ElementTree element.

	The month may be numeric ("03") or an abbreviated month name ("Mar");
	both forms are tried in that order.

	Returns [year, month, day] as integers, or None when one of the three
	children is missing or empty, or when the text is not a valid date.
	"""
	parts = []
	for tag in ('Year', 'Month', 'Day'):
		text = e.findtext(tag)
		# findtext gives None for a missing child and '' for an empty one.
		if text is None or not text.strip():
			return None
		parts.append(text.strip())
	d = " ".join(parts)

	for fmt in ("%Y %m %d", "%Y %b %d"):
		try:
			parsed = datetime.datetime.strptime(d, fmt)
		except ValueError:
			# Text does not match this format (or is an impossible date).
			continue
		return [parsed.year, parsed.month, parsed.day]
	return None

class cache:
	"""SQLite-backed store of PubMed XML records and the dates found in them.

	Each row holds the XML of one article, its 'received' and 'accepted'
	dates (text, "Y-M-D" without zero padding) and the number of days
	between the two.
	"""

	def __init__(self, path='./journal_report.cache'):
		"""Open the cache database at path, creating the table if needed.

		path is handed to sqlite3.connect; the default is the file this
		script has always used.
		"""
		self.conn = sqlite3.connect(path)
		self.cursor = self.conn.cursor()
		# Rollback journal switched off: a write interrupted half-way can
		# leave the cache file damaged; the cache can be rebuilt from PubMed.
		self.cursor.execute('PRAGMA journal_mode=OFF')
		sql = """
			CREATE TABLE IF NOT EXISTS cache (
				pmid INTEGER PRIMARY KEY,
				xml TEXT,
				received_date TEXT,
				accepted_date TEXT,
				review_duration INTEGER
				)
			"""
		self.cursor.execute(sql)
		self.conn.commit()

	def __del__(self):
		"""Commit pending writes and release the cursor and the connection."""
		conn = getattr(self, 'conn', None)
		if conn is None:
			# __init__ failed before the connection existed.
			return
		conn.commit()
		cursor = getattr(self, 'cursor', None)
		if cursor is not None:
			cursor.close()
		conn.close()

	def get(self, pmid):
		"""Look up one article.

		Returns None when pmid is not cached, otherwise the tuple
		(pmid, xml, received_date, accepted_date, review_duration) where the
		dates are [year, month, day] lists or None.
		"""
		self.cursor.execute("SELECT xml, received_date, accepted_date, review_duration FROM cache WHERE pmid=?", (str(pmid),))
		row = self.cursor.fetchone()
		if row is None:
			return None

		xml_str, received_date, accepted_date, review_duration = row
		if received_date is not None:
			received_date = [int(x) for x in received_date.split('-')]
		if accepted_date is not None:
			accepted_date = [int(x) for x in accepted_date.split('-')]
		return (pmid, xml_str, received_date, accepted_date, review_duration)

	def set(self, pmid, xml_str):
		"""Extract the review dates from xml_str and store the record.

		xml_str is the efetch XML of the article, as bytes or as text.
		An existing row for the same pmid is replaced.

		Returns (pmid, xml_str, received_date, accepted_date, review_duration)
		with dates as [year, month, day] lists or None, and review_duration in
		days or None when either date is unavailable.
		"""
		root = xml.etree.ElementTree.fromstring(xml_str)
		received_date = None
		accepted_date = None
		for e in root.findall('PubmedArticle/PubmedData/History/PubMedPubDate'):
			# .get() so an element without a PubStatus attribute is skipped.
			status = e.get('PubStatus')
			if status == 'received':
				received_date = get_element_date(e)
			elif status == 'accepted':
				accepted_date = get_element_date(e)

		review_duration = None
		if received_date is not None and accepted_date is not None:
			d1 = datetime.date(*received_date)
			d2 = datetime.date(*accepted_date)
			review_duration = (d2 - d1).days

		received_date_str = None
		if received_date is not None:
			received_date_str = "-".join([str(x) for x in received_date])
		accepted_date_str = None
		if accepted_date is not None:
			accepted_date_str = "-".join([str(x) for x in accepted_date])

		# Only raw bytes need decoding; already-decoded text is stored as is.
		if isinstance(xml_str, bytes):
			xml_text = xml_str.decode('utf-8')
		else:
			xml_text = xml_str

		# OR REPLACE: caching the same pmid twice must not raise IntegrityError.
		self.cursor.execute("INSERT OR REPLACE INTO cache (pmid, xml, received_date, accepted_date, review_duration) VALUES (?, ?, ?, ?, ?)", (pmid, xml_text, received_date_str, accepted_date_str, review_duration))
		self.conn.commit()

		return (pmid, xml_str, received_date, accepted_date, review_duration)

#=================================================
# Handle PubMed
#=================================================

def get_pubmed_entry(pmid):
	"""Download the PubMed record of one article through NCBI efetch.

	pmid is inserted into the request URL with %s.
	Returns the response body exactly as read from the connection.
	"""
	# NOTE(review): NCBI documents this endpoint with https; confirm that the
	# plain-http URL is still served before relying on it.
	dat = urllib2.urlopen("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml" % pmid)
	try:
		return dat.read()
	finally:
		# Close explicitly so repeated calls do not pile up open connections.
		dat.close()

def get_journal_pmidlist(journal, retmax=100000):
	"""Return the PubMed IDs of the articles of a journal as a list of ints.

	journal -- journal name, sent as an esearch "[journal]" term after
	           URL-quoting.
	retmax  -- value of the esearch retmax parameter (pass a small number
	           for a quick trial run).
	"""
	url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%s[journal]&retmode=xml&retmax=%d" % (urllib.quote_plus(journal), retmax)
	dat = urllib2.urlopen(url)
	try:
		x = xml.etree.ElementTree.fromstring(dat.read())
	finally:
		dat.close()
	pmid = [int(e.text) for e in x.findall('IdList/Id')]
	print("Found %d articles for '%s'" % (len(pmid), journal))
	return pmid

#=================================================
# Plot
#=================================================

def stdErr(x):
	"""Return std(x) divided by the square root of the number of items in x.

	std is called with its default arguments.
	"""
	samples = array(x)
	spread = std(samples)
	return spread / sqrt(len(x))

def _month_bin_edges(tmin, tmax, step):
	"""Return dates step months apart, starting at tmin.

	The list ends with the first edge that is >= tmax, so consecutive pairs
	of edges cover [tmin, tmax) completely. tmin is expected to be the first
	day of a month so that changing the month is always valid.
	"""
	edges = [tmin]
	while edges[-1] < tmax:
		last = edges[-1]
		months = last.month - 1 + step
		edges.append(last.replace(year=last.year + months // 12, month=months % 12 + 1))
	return edges

def plot(dat, journal):
	"""Save a box plot of review durations per 12-month period.

	dat     -- sequence of [received_timestamp, review_duration] pairs; the
	           timestamps are read with datetime.date.fromtimestamp and
	           compared against time.mktime values.
	journal -- journal name; surrounding double quotes are stripped for the
	           title and the file name.

	Writes "jreport - <journal>.png" to the current directory. Durations
	below 30 are ignored, and a period gets a box only when the standard
	deviation of its remaining durations is at least 10. Nothing is written
	when dat is empty.
	"""
	dat = array(dat)
	if dat.size == 0:
		print("No dated articles to plot for %s" % journal.strip('"'))
		return

	tmin = datetime.date.fromtimestamp(min(dat[:,0])).replace(day=1)
	tmax = datetime.date.fromtimestamp(max(dat[:,0]))
	# First day of the month that follows the newest submission.
	tmax = tmax.replace(day=1, month=(tmax.month%12)+1, year=tmax.year+tmax.month//12)
	print("%s %s" % (tmin, tmax))

	step = 12 # months
	# The closing edge is part of the list, so the newest (possibly partial)
	# period gets a bin and every tick has exactly one label.
	edges = _month_bin_edges(tmin, tmax, step)
	tbins = [time.mktime(t.timetuple()) for t in edges]
	tticklabels = ["%d-%02d" % (t.year, t.month) for t in edges[:-1]]
	n_bins = len(tticklabels)

	fig = pylab.figure()
	try:
		ax = fig.add_axes((.1, .12, .8, .8))

		for i in range(n_bins):
			in_bin = (dat[:,0] >= tbins[i]) & (dat[:,0] < tbins[i+1]) & (dat[:,1] >= 30)
			d = dat[in_bin, 1]

			if len(d)==0:
				continue

			if std(d)<10:
				continue

			ax.boxplot(d, positions=[i], notch=0, sym='k+', vert=1, whis=1.5, bootstrap=2000)

		ax.set_xticks(list(range(n_bins)))
		# Keep every period slot fully visible, including skipped ones.
		ax.set_xlim(-0.5, n_bins - 0.5)
		ax.set_xticklabels(tticklabels, rotation='vertical', fontsize=9)
		ax.set_ylabel('Days between submitted and accepted')

		ax.set_title("%s - %d articles" % (journal.strip('"'), dat.shape[0]))

		fig.set_size_inches(8, 6)
		fig.savefig("jreport - %s.png" % journal.strip('"'), dpi=200, format="png")
	finally:
		# plot() is called repeatedly; release the figure each time.
		pylab.close(fig)

#=================================================

#~ journal = '"Brain research"'
journal = '"Hearing Research"'

# Walk through every article of the journal, taking each record from the
# cache when present and from PubMed otherwise, and collect
# [received timestamp, review duration] pairs for plotting.
C = cache()
dat = list()
n_none_cons = 0 # length of the current run of articles without usable dates
pmid_list = get_journal_pmidlist(journal)
for i, pmid in enumerate(pmid_list, 1):
	r = C.get(pmid)
	if r is None:
		print("Retrieving %s" % pmid)
		r = C.set(pmid, get_pubmed_entry(pmid))
	else:
		print("Got %s from the cache" % pmid)
	pmid, xml_str, received_date, accepted_date, review_duration = r

	if received_date is not None and review_duration is not None:
		received_date = time.mktime(datetime.date(*(received_date)).timetuple())
		dat.append([received_date, review_duration])
		n_none_cons = 0
	else:
		# Count every miss, so the loop stops at the 51st consecutive one.
		n_none_cons += 1

	if n_none_cons > 50:
		print("More than 50 consecutive entries with no date. Stopping...")
		break

	# Refresh the figure every 100 articles; plot() needs at least one row.
	if i%100==0 and dat:
		print("Plotting (%d/%d)..." % (i, len(pmid_list)))
		plot(dat, journal)

if dat:
	plot(dat, journal)
	#-- coding: utf-8 --

	import xml.etree.ElementTree
	import urllib2, urllib
	import sqlite3
	import datetime, time
	from numpy import *
	from scipy import *
	from matplotlib import pylab, cm
	from matplotlib.font_manager import FontProperties

	#=================================================
	# Create Cache database
	#=================================================

	def get_element_date(e):
		"""Parse the Year/Month/Day children of an ElementTree element.

		The month may be numeric ("03") or an abbreviated month name ("Mar");
		both forms are tried in that order.

		Returns [year, month, day] as integers, or None when one of the three
		children is missing or empty, or when the text is not a valid date.
		"""
		parts = []
		for tag in ('Year', 'Month', 'Day'):
			text = e.findtext(tag)
			# findtext gives None for a missing child and '' for an empty one.
			if text is None or not text.strip():
				return None
			parts.append(text.strip())
		d = " ".join(parts)

		for fmt in ("%Y %m %d", "%Y %b %d"):
			try:
				parsed = datetime.datetime.strptime(d, fmt)
			except ValueError:
				# Text does not match this format (or is an impossible date).
				continue
			return [parsed.year, parsed.month, parsed.day]
		return None

	class cache:
		"""SQLite-backed store of PubMed XML records and the dates found in them.

		Each row holds the XML of one article, its 'received' and 'accepted'
		dates (text, "Y-M-D" without zero padding) and the number of days
		between the two.
		"""

		def __init__(self, path='./journal_report.cache'):
			"""Open the cache database at path, creating the table if needed.

			path is handed to sqlite3.connect; the default is the file this
			script has always used.
			"""
			self.conn = sqlite3.connect(path)
			self.cursor = self.conn.cursor()
			# Rollback journal switched off: a write interrupted half-way can
			# leave the cache file damaged; the cache can be rebuilt from PubMed.
			self.cursor.execute('PRAGMA journal_mode=OFF')
			sql = """
				CREATE TABLE IF NOT EXISTS cache (
					pmid INTEGER PRIMARY KEY,
					xml TEXT,
					received_date TEXT,
					accepted_date TEXT,
					review_duration INTEGER
					)
				"""
			self.cursor.execute(sql)
			self.conn.commit()

		def __del__(self):
			"""Commit pending writes and release the cursor and the connection."""
			conn = getattr(self, 'conn', None)
			if conn is None:
				# __init__ failed before the connection existed.
				return
			conn.commit()
			cursor = getattr(self, 'cursor', None)
			if cursor is not None:
				cursor.close()
			conn.close()

		def get(self, pmid):
			"""Look up one article.

			Returns None when pmid is not cached, otherwise the tuple
			(pmid, xml, received_date, accepted_date, review_duration) where
			the dates are [year, month, day] lists or None.
			"""
			self.cursor.execute("SELECT xml, received_date, accepted_date, review_duration FROM cache WHERE pmid=?", (str(pmid),))
			row = self.cursor.fetchone()
			if row is None:
				return None

			xml_str, received_date, accepted_date, review_duration = row
			if received_date is not None:
				received_date = [int(x) for x in received_date.split('-')]
			if accepted_date is not None:
				accepted_date = [int(x) for x in accepted_date.split('-')]
			return (pmid, xml_str, received_date, accepted_date, review_duration)

		def set(self, pmid, xml_str):
			"""Extract the review dates from xml_str and store the record.

			xml_str is the efetch XML of the article, as bytes or as text.
			An existing row for the same pmid is replaced.

			Returns (pmid, xml_str, received_date, accepted_date,
			review_duration) with dates as [year, month, day] lists or None,
			and review_duration in days or None when either date is
			unavailable.
			"""
			root = xml.etree.ElementTree.fromstring(xml_str)
			received_date = None
			accepted_date = None
			for e in root.findall('PubmedArticle/PubmedData/History/PubMedPubDate'):
				# .get() so an element without a PubStatus attribute is skipped.
				status = e.get('PubStatus')
				if status == 'received':
					received_date = get_element_date(e)
				elif status == 'accepted':
					accepted_date = get_element_date(e)

			review_duration = None
			if received_date is not None and accepted_date is not None:
				d1 = datetime.date(*received_date)
				d2 = datetime.date(*accepted_date)
				review_duration = (d2 - d1).days

			received_date_str = None
			if received_date is not None:
				received_date_str = "-".join([str(x) for x in received_date])
			accepted_date_str = None
			if accepted_date is not None:
				accepted_date_str = "-".join([str(x) for x in accepted_date])

			# Only raw bytes need decoding; already-decoded text is stored as is.
			if isinstance(xml_str, bytes):
				xml_text = xml_str.decode('utf-8')
			else:
				xml_text = xml_str

			# OR REPLACE: caching the same pmid twice must not raise IntegrityError.
			self.cursor.execute("INSERT OR REPLACE INTO cache (pmid, xml, received_date, accepted_date, review_duration) VALUES (?, ?, ?, ?, ?)", (pmid, xml_text, received_date_str, accepted_date_str, review_duration))
			self.conn.commit()

			return (pmid, xml_str, received_date, accepted_date, review_duration)

	#=================================================
	# Handle PubMed
	#=================================================

	def get_pubmed_entry(pmid):
		"""Download the PubMed record of one article through NCBI efetch.

		pmid is inserted into the request URL with %s.
		Returns the response body exactly as read from the connection.
		"""
		# NOTE(review): NCBI documents this endpoint with https; confirm that
		# the plain-http URL is still served before relying on it.
		dat = urllib2.urlopen("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml" % pmid)
		try:
			return dat.read()
		finally:
			# Close explicitly so repeated calls do not pile up open connections.
			dat.close()

	def get_journal_pmidlist(journal, retmax=100000):
		"""Return the PubMed IDs of the articles of a journal as a list of ints.

		journal -- journal name, sent as an esearch "[journal]" term after
		           URL-quoting.
		retmax  -- value of the esearch retmax parameter (pass a small number
		           for a quick trial run).
		"""
		url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%s[journal]&retmode=xml&retmax=%d" % (urllib.quote_plus(journal), retmax)
		dat = urllib2.urlopen(url)
		try:
			x = xml.etree.ElementTree.fromstring(dat.read())
		finally:
			dat.close()
		pmid = [int(e.text) for e in x.findall('IdList/Id')]
		print("Found %d articles for '%s'" % (len(pmid), journal))
		return pmid

	#=================================================
	# Plot
	#=================================================

	def stdErr(x):
		"""Return std(x) divided by the square root of the number of items in x.

		std is called with its default arguments.
		"""
		return std(array(x))/sqrt(len(x))

	def _month_bin_edges(tmin, tmax, step):
		"""Return dates step months apart, starting at tmin.

		The list ends with the first edge that is >= tmax, so consecutive
		pairs of edges cover [tmin, tmax) completely. tmin is expected to be
		the first day of a month so that changing the month is always valid.
		"""
		edges = [tmin]
		while edges[-1] < tmax:
			last = edges[-1]
			months = last.month - 1 + step
			edges.append(last.replace(year=last.year + months // 12, month=months % 12 + 1))
		return edges

	def plot(dat, journal):
		"""Save a box plot of review durations per 12-month period.

		dat     -- sequence of [received_timestamp, review_duration] pairs;
		           the timestamps are read with datetime.date.fromtimestamp
		           and compared against time.mktime values.
		journal -- journal name; surrounding double quotes are stripped for
		           the title and the file name.

		Writes "jreport - <journal>.png" to the current directory. Durations
		below 30 are ignored, and a period gets a box only when the standard
		deviation of its remaining durations is at least 10. Nothing is
		written when dat is empty.
		"""
		dat = array(dat)
		if dat.size == 0:
			print("No dated articles to plot for %s" % journal.strip('"'))
			return

		tmin = datetime.date.fromtimestamp(min(dat[:,0])).replace(day=1)
		tmax = datetime.date.fromtimestamp(max(dat[:,0]))
		# First day of the month that follows the newest submission.
		tmax = tmax.replace(day=1, month=(tmax.month%12)+1, year=tmax.year+tmax.month//12)
		print("%s %s" % (tmin, tmax))

		step = 12 # months
		# The closing edge is part of the list, so the newest (possibly
		# partial) period gets a bin and every tick has exactly one label.
		edges = _month_bin_edges(tmin, tmax, step)
		tbins = [time.mktime(t.timetuple()) for t in edges]
		tticklabels = ["%d-%02d" % (t.year, t.month) for t in edges[:-1]]
		n_bins = len(tticklabels)

		fig = pylab.figure()
		try:
			ax = fig.add_axes((.1, .12, .8, .8))

			for i in range(n_bins):
				in_bin = (dat[:,0] >= tbins[i]) & (dat[:,0] < tbins[i+1]) & (dat[:,1] >= 30)
				d = dat[in_bin, 1]

				if len(d)==0:
					continue

				if std(d)<10:
					continue

				ax.boxplot(d, positions=[i], notch=0, sym='k+', vert=1, whis=1.5, bootstrap=2000)

			ax.set_xticks(list(range(n_bins)))
			# Keep every period slot fully visible, including skipped ones.
			ax.set_xlim(-0.5, n_bins - 0.5)
			ax.set_xticklabels(tticklabels, rotation='vertical', fontsize=9)
			ax.set_ylabel('Days between submitted and accepted')

			ax.set_title("%s - %d articles" % (journal.strip('"'), dat.shape[0]))

			fig.set_size_inches(8, 6)
			fig.savefig("jreport - %s.png" % journal.strip('"'), dpi=200, format="png")
		finally:
			# plot() is called repeatedly; release the figure each time.
			pylab.close(fig)

	#=================================================

	#~ journal = '"Brain research"'
	journal = '"Hearing Research"'

	# Walk through every article of the journal, taking each record from the
	# cache when present and from PubMed otherwise, and collect
	# [received timestamp, review duration] pairs for plotting.
	C = cache()
	dat = list()
	n_none_cons = 0 # length of the current run of articles without usable dates
	pmid_list = get_journal_pmidlist(journal)
	for i, pmid in enumerate(pmid_list, 1):
		r = C.get(pmid)
		if r is None:
			print("Retrieving %s" % pmid)
			r = C.set(pmid, get_pubmed_entry(pmid))
		else:
			print("Got %s from the cache" % pmid)
		pmid, xml_str, received_date, accepted_date, review_duration = r

		if received_date is not None and review_duration is not None:
			received_date = time.mktime(datetime.date(*(received_date)).timetuple())
			dat.append([received_date, review_duration])
			n_none_cons = 0
		else:
			# Count every miss, so the loop stops at the 51st consecutive one.
			n_none_cons += 1

		if n_none_cons > 50:
			print("More than 50 consecutive entries with no date. Stopping...")
			break

		# Refresh the figure every 100 articles; plot() needs at least one row.
		if i%100==0 and dat:
			print("Plotting (%d/%d)..." % (i, len(pmid_list)))
			plot(dat, journal)

	if dat:
		plot(dat, journal)