Created
August 17, 2011 13:49
-
-
Save egaudrain/1151556 to your computer and use it in GitHub Desktop.
Python Journal Report
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
import xml.etree.ElementTree | |
import urllib2, urllib | |
import sqlite3 | |
import datetime, time | |
from numpy import * | |
from scipy import * | |
from matplotlib import pylab, cm | |
from matplotlib.font_manager import FontProperties | |
#================================================= | |
# Create Cache database | |
#================================================= | |
def get_element_date(e):
    """Read the date stored under an XML element as ``[year, month, day]``.

    *e* is an ElementTree element expected to have ``Year``, ``Month`` and
    ``Day`` children. The month may be numeric ("08") or an abbreviated
    month name ("Aug").

    Returns ``None`` when one of the three children is missing or empty, or
    when the values do not form a valid date.
    """
    parts = [e.findtext(tag) for tag in ('Year', 'Month', 'Day')]
    if not all(parts):
        # findtext() gives None for a missing child and '' for an empty one;
        # either way there is no usable date.
        return None

    stamp = " ".join(parts)
    # Try the numeric month first, then the abbreviated month name.
    for fmt in ("%Y %m %d", "%Y %b %d"):
        try:
            d = datetime.datetime.strptime(stamp, fmt)
        except ValueError:
            continue
        return [d.year, d.month, d.day]
    return None
class cache:
    """SQLite-backed store of PubMed XML records keyed by PMID.

    Each row keeps the raw XML together with the "received" and "accepted"
    dates found in the record's publication history and the number of days
    between them. Dates are stored as unpadded ``"Y-M-D"`` text.
    """

    def __init__(self, path='./journal_report.cache'):
        """Open (creating if needed) the cache database at *path*.

        The default path is relative to the current working directory.
        """
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        # Turns off SQLite's rollback journal for this connection.
        self.cursor.execute('PRAGMA journal_mode=OFF')
        sql = """
        CREATE TABLE IF NOT EXISTS cache (
            pmid INTEGER PRIMARY KEY,
            xml TEXT,
            received_date TEXT,
            accepted_date TEXT,
            review_duration INTEGER
        )
        """
        self.cursor.execute(sql)
        self.conn.commit()

    def __del__(self):
        """Flush pending writes and release the cursor and the connection."""
        conn = getattr(self, 'conn', None)
        if conn is None:
            # __init__ failed before the connection existed.
            return
        conn.commit()
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        conn.close()

    @staticmethod
    def _format_date(parts):
        """Turn ``[y, m, d]`` into the stored ``"y-m-d"`` text (None stays None)."""
        if parts is None:
            return None
        return "-".join([str(x) for x in parts])

    @staticmethod
    def _parse_date(text):
        """Turn stored ``"y-m-d"`` text back into ``[y, m, d]`` (None stays None)."""
        if text is None:
            return None
        return [int(x) for x in text.split('-')]

    def get(self, pmid):
        """Look up *pmid* in the cache.

        Returns ``None`` when there is no row for *pmid*, otherwise the tuple
        ``(pmid, xml_str, received_date, accepted_date, review_duration)``
        where each date is ``[year, month, day]`` or ``None``.
        """
        self.cursor.execute("SELECT xml, received_date, accepted_date, review_duration FROM cache WHERE pmid=?", (str(pmid),))
        row = self.cursor.fetchone()
        if row is None:
            return None
        xml_str, received_date, accepted_date, review_duration = row
        return (pmid, xml_str, self._parse_date(received_date), self._parse_date(accepted_date), review_duration)

    def set(self, pmid, xml_str):
        """Parse *xml_str*, store it under *pmid* and return the parsed fields.

        *xml_str* may be UTF-8 bytes or text. The "received" and "accepted"
        dates are read from ``PubMedPubDate`` elements of the record history
        via the module-level ``get_element_date``. An existing row for *pmid*
        is replaced.

        Returns ``(pmid, xml_str, received_date, accepted_date,
        review_duration)`` with *xml_str* exactly as it was passed in.
        """
        root = xml.etree.ElementTree.fromstring(xml_str)
        received_date = None
        accepted_date = None
        for e in root.findall('PubmedArticle/PubmedData/History/PubMedPubDate'):
            # .get() tolerates elements that carry no PubStatus attribute.
            status = e.get('PubStatus')
            if status == 'received':
                received_date = get_element_date(e)
            elif status == 'accepted':
                accepted_date = get_element_date(e)

        review_duration = None
        if received_date is not None and accepted_date is not None:
            d1 = datetime.date(*received_date)
            d2 = datetime.date(*accepted_date)
            review_duration = (d2 - d1).days

        # Store text in the database; only decode when bytes were given.
        if isinstance(xml_str, bytes):
            xml_text = xml_str.decode('utf-8')
        else:
            xml_text = xml_str

        self.cursor.execute(
            "INSERT OR REPLACE INTO cache (pmid, xml, received_date, accepted_date, review_duration) VALUES (?, ?, ?, ?, ?)",
            (pmid, xml_text, self._format_date(received_date), self._format_date(accepted_date), review_duration))
        self.conn.commit()
        return (pmid, xml_str, received_date, accepted_date, review_duration)
#================================================= | |
# Handle PubMed | |
#================================================= | |
def get_pubmed_entry(pmid):
    """Download the PubMed record for *pmid* through the E-utilities efetch
    endpoint and return the raw XML response body.

    Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml" % pmid
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        response.close()
def get_journal_pmidlist(journal):
    """Return the PMIDs that PubMed lists for *journal*, as a list of ints.

    The journal name is URL-quoted and searched with the ``[journal]`` field
    tag through the E-utilities esearch endpoint, asking for up to 100000
    ids. The number of ids found is printed.

    Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%s[journal]&retmode=xml&retmax=100000" % urllib.quote_plus(journal)
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        response.close()

    root = xml.etree.ElementTree.fromstring(payload)
    pmids = [int(e.text) for e in root.findall('IdList/Id')]
    print("Found %d articles for '%s'" % (len(pmids), journal))
    return pmids
#================================================= | |
# Plot | |
#================================================= | |
def stdErr(x):
    """Return the standard deviation of *x* divided by the square root of
    its length.

    *x* is any sequence accepted by ``array``; ``std`` is called with its
    default arguments.
    """
    values = array(x)
    spread = std(values)
    count = len(x)
    return spread / sqrt(count)
def _add_months(d, months):
    """Return date *d* moved forward by *months* calendar months.

    The day of month is kept unchanged, so this is only meant for dates that
    fall on the first of a month.
    """
    index = d.year * 12 + (d.month - 1) + months
    return d.replace(year=index // 12, month=index % 12 + 1)


def plot(dat, journal):
    """Draw one box plot of review durations per 12-month bin and save it.

    *dat* is a sequence of ``[received_timestamp, review_duration]`` pairs,
    the timestamp being seconds as produced by ``time.mktime``. *journal* is
    used (with surrounding double quotes stripped) in the title and in the
    output file name ``"jreport - <journal>.png"``.

    Bins start on the first day of the month of the earliest entry and run
    through the month of the latest entry. Durations under 30 days are left
    out, and a bin is skipped when it is empty or when the standard
    deviation of its durations is under 10.

    Nothing is drawn or saved when *dat* is empty.
    """
    dat = array(dat)
    if dat.size == 0:
        # An empty array is 1-D, so the column indexing below would fail.
        print("Nothing to plot for %s" % journal)
        return

    received = dat[:, 0]
    duration = dat[:, 1]

    # Time span covered: first of the earliest month up to the first of the
    # month following the latest entry.
    tmin = datetime.date.fromtimestamp(received.min()).replace(day=1)
    tmax = _add_months(datetime.date.fromtimestamp(received.max()).replace(day=1), 1)
    print("%s %s" % (tmin, tmax))

    # Bin edges every `step` months. The loop appends one edge at or past
    # tmax so that the last (possibly partial) period is a bin too.
    step = 12  # months
    edges = []
    labels = []
    t = tmin
    while True:
        edges.append(time.mktime(t.timetuple()))
        labels.append("%d-%02d" % (t.year, t.month))
        if t >= tmax:
            break
        t = _add_months(t, step)
    n_bins = len(edges) - 1

    fig = pylab.figure()
    try:
        ax = fig.add_axes((.1, .12, .8, .8))

        for i in range(n_bins):
            in_bin = (received >= edges[i]) & (received < edges[i + 1]) & (duration >= 30)
            d = duration[in_bin]
            if len(d) == 0 or std(d) < 10:
                continue
            ax.boxplot(d, positions=[i], notch=0, sym='k+', vert=1, whis=1.5, bootstrap=2000)

        # One tick per bin, labelled with the month the bin starts in.
        ax.set_xticks(list(range(n_bins)))
        ax.set_xticklabels(labels[:n_bins], rotation='vertical', fontsize=9)
        ax.set_ylabel('Days between submitted and accepted')
        ax.set_title("%s - %d articles" % (journal.strip('"'), dat.shape[0]))

        fig.set_size_inches(8, 6)
        fig.savefig("jreport - %s.png" % journal.strip('"'), dpi=200, format="png")
    finally:
        # plot() is called repeatedly; close the figure so they do not pile up.
        pylab.close(fig)
#================================================= | |
# Driver: collect (received date, review duration) for every article of one
# journal, using the local cache where possible, and plot the result.
#~ journal = '"Brain research"'
journal = '"Hearing Research"'

C = cache()
dat = list()
i = 0
n_none_cons = 0  # number of entries in a row without usable dates

pmid_list = get_journal_pmidlist(journal)
for pmid in pmid_list:
    r = C.get(pmid)
    if r is None:
        print("Retrieving %s" % pmid)
        xml_str = get_pubmed_entry(pmid)
        pmid, xml_str, received_date, accepted_date, review_duration = C.set(pmid, xml_str)
    else:
        print("Got %s from the cache" % pmid)
        pmid, xml_str, received_date, accepted_date, review_duration = r

    if received_date is not None and review_duration is not None:
        # Store the received date as a timestamp, which is what plot() bins on.
        received_date = time.mktime(datetime.date(*(received_date)).timetuple())
        dat.append([received_date, review_duration])
        n_none_cons = 0
    else:
        n_none_cons += 1

    # Give up once a long run of entries carries no dates.
    if n_none_cons > 50:
        print("More than 50 consecutive entries with no date. Stopping...")
        break

    i += 1

    # Refresh the plot every 100 processed entries.
    if i % 100 == 0:
        print("Plotting (%d/%d)..." % (i, len(pmid_list)))
        plot(dat, journal)

# Final plot with everything that was collected.
plot(dat, journal)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Now with box plot. Journal name is now on line 177.