Skip to content

Instantly share code, notes, and snippets.

@egaudrain
Created August 17, 2011 13:49
Show Gist options
  • Save egaudrain/1151556 to your computer and use it in GitHub Desktop.
Save egaudrain/1151556 to your computer and use it in GitHub Desktop.
Python Journal Report
#-*- coding: utf-8 -*-
import xml.etree.ElementTree
import urllib2, urllib
import sqlite3
import datetime, time
from numpy import *
from scipy import *
from matplotlib import pylab, cm
from matplotlib.font_manager import FontProperties
#=================================================
# Create Cache database
#=================================================
def get_element_date(e):
    """Extract a calendar date from an element with Year/Month/Day children.

    ``e`` is an ElementTree element (the callers pass ``PubMedPubDate``
    elements). The month may be numeric ("3", "03") or an abbreviated month
    name ("Mar"); both forms are tried in that order.

    Returns ``[year, month, day]`` as a list of ints, or ``None`` when any
    of the three children is missing or empty, or when the text does not
    form a valid date.
    """
    parts = [e.findtext(tag) for tag in ('Year', 'Month', 'Day')]
    # findtext() gives None for a missing child and '' for an empty one;
    # either way there is no usable date.
    if not all(parts):
        return None
    text = " ".join(p.strip() for p in parts)
    # NOTE: "%b" matches month names of the current locale.
    for fmt in ("%Y %m %d", "%Y %b %d"):
        try:
            d = datetime.datetime.strptime(text, fmt)
        except ValueError:
            # Wrong format for this text: try the next one.
            continue
        return [d.year, d.month, d.day]
    return None
class cache:
    """SQLite-backed store of PubMed XML records, keyed by PMID.

    Each row keeps the raw XML together with the received/accepted dates
    found in the record's history and the number of days between them.
    Dates are stored as "Y-M-D" text (no zero padding) and handed back as
    ``[year, month, day]`` lists of ints.
    """

    def __init__(self, path='./journal_report.cache'):
        """Open (or create) the cache database at ``path``.

        ``path`` is passed straight to ``sqlite3.connect``, so ":memory:"
        gives a throw-away cache.
        """
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        # Turns SQLite's rollback journal off (presumably for speed; the
        # content can always be fetched again).
        self.cursor.execute('PRAGMA journal_mode=OFF')
        sql = """
            CREATE TABLE IF NOT EXISTS cache (
                pmid INTEGER PRIMARY KEY,
                xml TEXT,
                received_date TEXT,
                accepted_date TEXT,
                review_duration INTEGER
            )
        """
        self.cursor.execute(sql)
        self.conn.commit()

    def __del__(self):
        """Commit pending work and release the database handles."""
        # __init__ may have failed before the attributes were assigned.
        conn = getattr(self, 'conn', None)
        if conn is None:
            return
        conn.commit()
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        conn.close()

    @staticmethod
    def _encode_date(date):
        """Turn ``[y, m, d]`` into the stored "y-m-d" text (None stays None)."""
        if date is None:
            return None
        return "-".join(str(x) for x in date)

    @staticmethod
    def _decode_date(text):
        """Turn stored "y-m-d" text back into ``[y, m, d]`` (None stays None)."""
        if text is None:
            return None
        return [int(x) for x in text.split('-')]

    def get(self, pmid):
        """Look up ``pmid`` in the cache.

        Returns ``None`` when the PMID is not cached, otherwise the tuple
        ``(pmid, xml_str, received_date, accepted_date, review_duration)``
        where ``pmid`` is the argument as given and the dates are
        ``[year, month, day]`` lists or ``None``.
        """
        self.cursor.execute(
            "SELECT xml, received_date, accepted_date, review_duration "
            "FROM cache WHERE pmid=?",
            (str(pmid),))
        row = self.cursor.fetchone()
        if row is None:
            return None
        xml_str, received_date, accepted_date, review_duration = row
        return (pmid, xml_str,
                self._decode_date(received_date),
                self._decode_date(accepted_date),
                review_duration)

    def set(self, pmid, xml_str):
        """Parse ``xml_str``, store it under ``pmid`` and return the parsed row.

        The received and accepted dates are read from the
        ``PubmedArticle/PubmedData/History/PubMedPubDate`` elements whose
        ``PubStatus`` attribute is "received" / "accepted"; elements with
        another or no ``PubStatus`` are ignored. ``review_duration`` is the
        number of days from received to accepted, or ``None`` unless both
        dates are available. An existing row for ``pmid`` is overwritten.

        Returns ``(pmid, xml_str, received_date, accepted_date,
        review_duration)`` with ``xml_str`` exactly as passed in.
        """
        root = xml.etree.ElementTree.fromstring(xml_str)
        received_date = None
        accepted_date = None
        for e in root.findall('PubmedArticle/PubmedData/History/PubMedPubDate'):
            status = e.attrib.get('PubStatus')
            if status == 'received':
                received_date = get_element_date(e)
            elif status == 'accepted':
                accepted_date = get_element_date(e)

        review_duration = None
        if received_date is not None and accepted_date is not None:
            d1 = datetime.date(*received_date)
            d2 = datetime.date(*accepted_date)
            review_duration = (d2 - d1).days

        # Only byte strings need decoding; text is stored as it is.
        if isinstance(xml_str, bytes):
            xml_text = xml_str.decode('utf-8')
        else:
            xml_text = xml_str

        self.cursor.execute(
            "INSERT OR REPLACE INTO cache "
            "(pmid, xml, received_date, accepted_date, review_duration) "
            "VALUES (?, ?, ?, ?, ?)",
            (pmid, xml_text,
             self._encode_date(received_date),
             self._encode_date(accepted_date),
             review_duration))
        self.conn.commit()
        return (pmid, xml_str, received_date, accepted_date, review_duration)
#=================================================
# Handle PubMed
#=================================================
def get_pubmed_entry(pmid):
    """Download the PubMed record of ``pmid`` through NCBI EFetch.

    Returns the body of the HTTP response (the XML document) exactly as
    read from the connection, without decoding it.
    """
    # NCBI serves the E-utilities over HTTPS.
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
           "?db=pubmed&id=%s&retmode=xml" % pmid)
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
def get_journal_pmidlist(journal, retmax=100000):
    """Return the PMIDs of the articles of a journal as a list of ints.

    ``journal`` is the search term sent to NCBI ESearch with the
    ``[journal]`` field tag (it is URL-quoted here). ``retmax`` is the
    maximum number of ids requested; a small value such as 30 is handy for
    a quick trial run. The number of ids found is printed.
    """
    # NCBI serves the E-utilities over HTTPS.
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=pubmed&term=%s[journal]&retmode=xml&retmax=%s"
           % (urllib.quote_plus(journal), retmax))
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    root = xml.etree.ElementTree.fromstring(payload)
    pmids = [int(e.text) for e in root.findall('IdList/Id')]
    print("Found %d articles for '%s'" % (len(pmids), journal))
    return pmids
#=================================================
# Plot
#=================================================
def stdErr(x):
    """Standard deviation of ``x`` divided by the square root of its length.

    ``std`` is called with its defaults on ``array(x)``.
    """
    spread = std(array(x))
    count = len(x)
    return spread / sqrt(count)
def plot(dat, journal):
    """Draw box plots of the review duration per period and save them as PNG.

    ``dat`` is a sequence of ``[timestamp, duration]`` rows; the script
    fills it with the received date (seconds, from ``time.mktime``) and
    the number of days between received and accepted. ``journal`` is used,
    without surrounding double quotes, in the title and in the file name
    "jreport - <journal>.png".

    Rows are grouped in bins of ``step`` months starting on the first day
    of the month of the earliest timestamp. Only durations of 30 days or
    more are drawn, and bins that are empty or whose durations have a
    standard deviation below 10 are skipped. Nothing is drawn, and no file
    is written, when ``dat`` is empty.
    """
    dat = array(dat)
    if dat.size == 0:
        # Indexing columns of an empty array below would raise.
        print("No dated articles to plot for %s" % journal.strip('"'))
        return

    #-------- Time range: first day of the first month up to the first day
    # of the month following the latest timestamp.
    tmin = datetime.date.fromtimestamp(min(dat[:, 0])).replace(day=1)
    tmax = datetime.date.fromtimestamp(max(dat[:, 0]))
    tmax = tmax.replace(day=1, month=(tmax.month % 12) + 1,
                        year=tmax.year + tmax.month // 12)
    print("%s %s" % (tmin, tmax))

    #-------- Bin edges (as timestamps) and their "YYYY-MM" labels
    step = 12  # months
    tbins = []
    tticklabels = []
    t = tmin
    while t < tmax:
        tbins.append(time.mktime(t.timetuple()))
        tticklabels.append("%d-%02d" % (t.year, t.month))
        years, month0 = divmod(t.month - 1 + step, 12)
        t = t.replace(year=t.year + years, month=month0 + 1)
    # Consecutive edges delimit the bins, hence one bin less than edges.
    # NOTE(review): rows at or after the last edge fall in no bin and are
    # not drawn -- confirm this is intended.
    nbins = len(tbins) - 1

    #-------- Plotting
    fig = pylab.figure()
    try:
        ax = fig.add_axes((.1, .12, .8, .8))
        for i in range(nbins):
            in_bin = ((dat[:, 0] >= tbins[i])
                      & (dat[:, 0] < tbins[i + 1])
                      & (dat[:, 1] >= 30))
            d = dat[in_bin, 1]
            if len(d) == 0 or std(d) < 10:
                continue
            ax.boxplot(d, positions=[i], notch=0, sym='k+', vert=1,
                       whis=1.5, bootstrap=2000)
        ax.set_xticks(range(nbins))
        # One label per tick: the label of the closing edge is not used.
        ax.set_xticklabels(tticklabels[:nbins], rotation='vertical',
                           fontsize=9)
        ax.set_ylabel('Days between submitted and accepted')
        ax.set_title("%s - %d articles" % (journal.strip('"'), dat.shape[0]))
        fig.set_size_inches(8, 6)
        fig.savefig("jreport - %s.png" % journal.strip('"'), dpi=200,
                    format="png")
    finally:
        # plot() is called repeatedly; without this every call would leave
        # its figure registered with pylab.
        pylab.close(fig)
#=================================================
# Journal to report on, as a PubMed search term; keep the double quotes.
#~ journal = '"Brain research"'
journal = '"Hearing Research"'

C = cache()
dat = list()      # rows of [received timestamp, review duration in days]
i = 0             # number of articles handled so far
n_none_cons = 0   # length of the current run of articles without dates
pmid_list = get_journal_pmidlist(journal)

for i, pmid in enumerate(pmid_list, 1):
    r = C.get(pmid)
    if r is None:
        print("Retrieving %s" % pmid)
        xml_str = get_pubmed_entry(pmid)
        r = C.set(pmid, xml_str)
    else:
        print("Got %s from the cache" % pmid)
    pmid, xml_str, received_date, accepted_date, review_duration = r

    if received_date is not None and review_duration is not None:
        received_date = time.mktime(datetime.date(*received_date).timetuple())
        dat.append([received_date, review_duration])
        n_none_cons = 0
    else:
        n_none_cons += 1
        if n_none_cons > 50:
            print("More than 50 consecutive entries with no date. Stopping...")
            break

    # Refresh the figure every 100 articles so progress can be watched.
    if i % 100 == 0:
        print("Plotting (%d/%d)..." % (i, len(pmid_list)))
        plot(dat, journal)

# Final figure; there is nothing to draw when no article had usable dates.
if dat:
    plot(dat, journal)
else:
    print("No article with both dates found for %s" % journal)
@egaudrain
Copy link
Author

To use, replace the journal name on line 175 with the name of your journal (keeping the double quotes), then run the script.
You will need to have numpy, scipy and matplotlib installed.

@egaudrain
Copy link
Author

Now with box plot. Journal name is now on line 177.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment