@raggleton
Created September 8, 2016 10:08
Statusboard for thesis: https://raggleton.github.io/status.html
#!/usr/bin/env python
"""
Go through TeX files and count words, plot things.
TODO:
- improve timezone handling
- only count commits where tex file changed?
- add PDF pagecount?
"""
import os
import numpy as np
import matplotlib
matplotlib.use('agg') # Can also use 'tkagg' or 'webagg'
import matplotlib.pyplot as plt
from collections import OrderedDict
from subprocess import check_output, check_call
from itertools import izip
import datetime
# import matplotlib.dates as mdates
from sys import platform as _platform
from scipy import stats
import plotly
from plotly.graph_objs import Scatter, Layout, Histogram2d, Heatmap, Histogram
from math import ceil, floor
# The main TeX file to count. Assumes you've \included all necessary material
MAIN_TEX_FILE = 'thesis.tex'
MAIN_PDF_FILE = MAIN_TEX_FILE.replace(".tex", ".pdf")
# Start plotting/counting from this date
# START_DATE = datetime.datetime(2016, 3, 29)
START_DATE = datetime.datetime.fromtimestamp(1459366339)
# Notable events. Key is commit hash,
# value is the label you want to put on the graph
NOTABLE_HASHES = OrderedDict()
NOTABLE_HASHES['d41da4c52e30ae11ad9c2f651f723152e1956191'] = 'Inv Higgs draft done'
NOTABLE_HASHES['1257bea1296e40d1ce465a3bb83bda5c44872e42'] = '4tau start'
NOTABLE_HASHES['cd08b0009889a5758de199b0b3f6ac1968408212'] = 'Remove \\mynoteline, \\todo from texcount'
NOTABLE_HASHES['4349c54fa46edb9ff0765047b7c6d62035dd7e93'] = 'Start of LHCP'
NOTABLE_HASHES['9d786c7be573e9429c75cfd0a5b5ba884983c2c0'] = 'Back from LHCP'
NOTABLE_HASHES['e62b254d245fc8beb3cc32d1e12a67d51b986ff9'] = 'Post-brexit'
NOTABLE_HASHES['f7695008dddffecab215c677a7c8f68164bd1e27'] = 'Change paragraph spacing'
NOTABLE_HASHES['9fe61a21aee114f2e60e96e71b9482d362acd65c'] = 'Stop double-counting in wordcount'
DELIM = '\t\t'
CSV_DTYPE = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8'), ('pagecount', 'i8'), ('message', 'S200')]
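# NB: the delimiter is a double tab, presumably so that commas (and single tabs)
# in commit messages don't break the file; 'S200' silently truncates longer messages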
def get_wordcount(main_file, include_bib=True):
"""Get wordcount using main TeX file and its included.
include_bib is a flag to include bibliography in word count.
"""
    incbib_opt = '-incbib' if include_bib else ''
    cmds = ['texcount', '-1', '-sum', '-inc', incbib_opt, main_file]
    cmds = [c for c in cmds if c]  # drop the empty placeholder if -incbib isn't wanted
    return int(check_output(cmds).strip())
def get_pdf_pagecount(pdf_filename):
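    # pdfinfo prints a "Pages:  N" line; grep + awk pull out the N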
cmd = "pdfinfo %s | grep Pages | awk '{print $2}'" % pdf_filename
return int(check_output(cmd, shell=True))
def get_git_username():
"""Get git user.name"""
return check_output('git config --get user.name'.split()).strip()
def get_git_current_hash(short=False):
"""Get current commit hash."""
cmd = 'git rev-parse %s HEAD' % ('--short' if short else '')
return check_output(cmd.split()).strip()
def get_git_commit_hashes(author, short=False):
"""Get all commit hashes for given author. short flag for short hasheds.
Returned in chronological order, oldest first.
"""
hash_fmt = r'%h' if short else r'%H'
cmds = ['git', 'log', '--pretty=format:"%s"' % hash_fmt,
'--author=%s' % author, '--reverse']
return [x.strip('"') for x in check_output(cmds).splitlines()]
def get_git_commit_timestamp(commit_hash):
"""Get timestamp for commit hash"""
cmds = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
return check_output(cmds).strip('"')
def get_git_commit_message(commit_hash):
"""Get commit subject for commit hash"""
cmds = ['git', 'show', '-s', '--pretty=format:"%s"', commit_hash]
return check_output(cmds).strip('"')
def get_wordcount_history(main_file, already_stored_hashes=None):
"""Return lists of commit hashes, timestamps, wordcounts, pdf pagecounts, and messages"""
if already_stored_hashes is None:
already_stored_hashes = []
    # parsing `git branch` output is fragile; rev-parse gives the current branch directly
    curr_branch = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
    stashtag = 'wordcounterscript'
    check_call(['git', 'stash', 'save', stashtag])  # stash current changes
try:
hashes = get_git_commit_hashes(get_git_username()) # get commits to checkout
# checkout each, get wordcount & timestamp for that commit
new_hashes, timestamps, wordcounts, pagecounts, messages = [], [], [], [], []
for ghash in hashes:
if ghash in already_stored_hashes:
continue
check_call(['git', 'checkout', '-q', ghash])
new_hashes.append(ghash)
timestamps.append(get_git_commit_timestamp(ghash))
wordcounts.append(get_wordcount(main_file))
if os.path.isfile(MAIN_PDF_FILE):
pagecounts.append(get_pdf_pagecount(MAIN_PDF_FILE))
else:
if len(pagecounts) >= 1:
pagecounts.append(pagecounts[-1])
else:
pagecounts.append(0)
messages.append(get_git_commit_message(ghash))
print 'ADDING hash:', ghash, 'timestamp:', timestamps[-1], 'pdf pagecount', pagecounts[-1], 'wordcount:', wordcounts[-1]
finally:
        # restore the original branch and any stashed changes, even if a checkout failed
check_call(['git', 'checkout', curr_branch])
stash_list = check_output(['git', 'stash', 'list'])
if stashtag in stash_list:
            # only apply the stash if we actually stashed anything, otherwise it fails
print 'Applying stashed changes...'
check_call(['git', 'stash', 'pop'])
return new_hashes, timestamps, wordcounts, pagecounts, messages
def make_recarray(hashes, timestamps, wordcounts, pagecounts, messages):
"""Make numpy recarray from lists"""
objects = [(h, ts, wc, pc, ms) for h, ts, wc, pc, ms in izip(hashes, timestamps, wordcounts, pagecounts, messages)]
return np.rec.array(objects, dtype=CSV_DTYPE)
def update_recarray(store, hashes, timestamps, wordcounts, pagecounts, messages):
"""Update numpy recarray `store` from lists
Over-write older hashes if timestamp/wordcount changes.
"""
# print hashes, timestamps, wordcounts
new_hashes, new_timestamps, new_wordcounts, new_pagecounts, new_messages = [], [], [], [], []
for h, t, w, p, m in izip(hashes, timestamps, wordcounts, pagecounts, messages):
if m in store.message:
ind = np.where(store.message == m)
# if len(ind[0]) != 2:
# print store
# print h, t, w, p, m
# print ind
# raise IndexError('ERROR updating store, cannot get index from %s' % ind)
ind = ind[0][0]
store.hash[ind] = h
store.timestamp[ind] = t
store.wordcount[ind] = w
store.pagecount[ind] = p
# store.message[ind] = m
else:
new_hashes.append(h)
new_timestamps.append(t)
new_wordcounts.append(w)
new_pagecounts.append(p)
new_messages.append(m)
if len(new_hashes) != 0:
new_store = make_recarray(new_hashes, new_timestamps, new_wordcounts, new_pagecounts, new_messages)
concat_recarray = np.rec.array(np.concatenate([store, new_store]), dtype=store.dtype)
concat_recarray.sort(order='timestamp')
return concat_recarray
else:
return store
def write_recarray_to_file(store, csv_filename):
"""Save recarray to file as CSV"""
np.savetxt(csv_filename, store, delimiter=DELIM,
header=','.join(store.dtype.names),
fmt=['%s', '%u', '%u', '%u', '%s'])
def plot_wordcount_vs_time(store, start_date=None, do_fit=True, ax=None):
"""Make plot of wordcount vs time
Can perform fit for word count progression graph.
"""
# Setup generic Axes if the user hasn't provided one
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
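    # express the cutoff as seconds since the Unix epoch to compare with stored timestamps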
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
# Convert to datetime objects, otherwise matplotlib can't handle it
    # TODO: get the right timezone, currently it thinks it's UTC not BST
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
ax.plot_date(timestamps_dt, store.wordcount[mask], 'o-')
ax.set_ylabel('Word count')
ax.grid(True)
for notable_hash, notable_label in NOTABLE_HASHES.iteritems():
notable_mask = store.hash == notable_hash
timestamp_notable = datetime.datetime.utcfromtimestamp(store[notable_mask].timestamp[0])
wordcount_notable = store[notable_mask].wordcount[0]
ax.plot_date(timestamp_notable, wordcount_notable, '*', markersize=10, label=notable_label)
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.wordcount[mask])
fit_x = np.linspace(store.timestamp[mask][0], store.timestamp[mask][-1], 2)
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
ax.plot_date(fit_x_dt, fit_y, 'r', label='Linear fit')
words_per_day = slope * (24 * 60. * 60.)
plot_title = '%.0f words/day (r-value: %.3f, p-value: %.3e)' % (words_per_day, r_value, p_value)
ax.set_title(plot_title, fontsize=12)
ax.legend(loc='best', framealpha=0.8, fontsize=10, numpoints=1)
# custom axis markers ?
# mondays = mdates.WeekdayLocator(byweekday=mdates.MONDAY)
# mondaysFmt = mdates.DateFormatter("%d %b")
# ax.xaxis.set_major_locator(mondays)
# ax.xaxis.set_major_formatter(mondaysFmt)
plt.gcf().autofmt_xdate()
def plot_commit_heatmap(store, start_date=None, ax=None):
"""Plot heatmap of commits on day vs hour plot"""
# Setup generic Axes if the user hasn't provided one
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
# Convert to datetime objects, otherwise matplotlib can't handle it
    # TODO: get the right timezone, currently it thinks it's UTC not BST
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
hours = [x.hour - 0.5 for x in timestamps_dt]
days = [x.weekday() - 0.5 for x in timestamps_dt]
xedges = np.arange(-0.5, 25.5, 1)
yedges = np.arange(-0.5, 7.5, 1)
    # draw on the provided Axes rather than the current pyplot figure
    # ax.hist2d(hours, days, bins=(xedges, yedges), cmin=1, cmap=plt.get_cmap('YlOrBr'), zorder=1)
    ax.hist2d(hours, days, bins=(xedges, yedges), cmin=1, cmap=plt.get_cmap('PuBu'), zorder=1)
ax.set_xticks(np.arange(0, 25, 2))
ylabels = ['', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax.set_yticklabels(ylabels)
ax.set_xlabel('Hour')
# plt.colorbar()
ax.set_axisbelow(True)
ax.grid(zorder=0)
def plot_statusboard(store, start_date=None, do_fit=True):
"""Plot wordcount vs time, and commit heatmap on one plot using matplotlib."""
fig = plt.figure(figsize=(15, 5))
cols = 7
rh_span = 4
ax_l = plt.subplot2grid((1, cols), (0, 0), colspan=cols-rh_span)
plot_wordcount_vs_time(store, start_date, do_fit, ax_l)
ax_r = plt.subplot2grid((1, cols), (0, cols-rh_span), colspan=rh_span)
plot_commit_heatmap(store, start_date, ax_r)
last_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
last_commit = last_commit.strftime(r"%c")
plt.suptitle("Up to last commit on %s, %d words" % (last_commit, store.wordcount[-1]), y=1.02)
plt.tight_layout()
filename = 'status.pdf'
plt.savefig(filename, bbox_inches='tight')
plt.clf()
return filename
def open_pdf(pdf_filename):
"""Open a PDF file using system's default PDF viewer."""
if _platform.startswith("linux"):
# linux
check_call(["xdg-open", pdf_filename])
elif _platform == "darwin":
# OS X
check_call(["open", pdf_filename])
elif _platform == "win32":
# Windows
check_call(["start", pdf_filename])
def plot_statusboard_plotly(store, start_date=None, auto_open=True,
do_fit=True, html_filename='status.html'):
"""Make statusboard plot with plotly, can auto open in browser."""
# filter data, convert to datetime objects
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
# Lefthand commit timeline + fit plot
# -----------------------------------
wordcount_colour = '#1f77b4'
wordcount_data = Scatter(x=timestamps_dt, y=store.wordcount[mask], name='Wordcount',
showlegend=True, mode='lines', yaxis='y1',
text=store.message[mask], marker=dict(color=wordcount_colour))
lh_traces = [wordcount_data]
timeline_title = ''
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.wordcount[mask])
fit_x = store.timestamp[mask]
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
words_per_day = slope * (24 * 60. * 60.)
timeline_title = '%.2f words/day (r-value: %.3f, p-value: %.3e)' % (words_per_day, r_value, p_value)
fit_data = Scatter(x=fit_x_dt, y=fit_y, mode='lines', name='Wordcount Fit',
line=dict(color=wordcount_colour, dash='dash'))
lh_traces.append(fit_data)
for notable_hash, notable_label in NOTABLE_HASHES.iteritems():
notable_mask = store.hash == notable_hash
timestamp_notable = datetime.datetime.utcfromtimestamp(store[notable_mask].timestamp[0])
wordcount_notable = store[notable_mask].wordcount[0]
notable_data = Scatter(x=[timestamp_notable], y=[wordcount_notable],
mode='markers', name=notable_label,
marker=dict(size=15, symbol='star'))
lh_traces.append(notable_data)
# lh_traces.append(pagecount_data)
# Righthand commit heatmap
# ------------------------
hours = np.array([x.hour - 0.5 for x in timestamps_dt])
days = np.array([x.weekday() - 0.5 for x in timestamps_dt])
start_hour = floor(min(hours) * 2) / 2.
end_hour = 1 + ceil(max(hours) * 2) / 2.
hour_edges = np.arange(start_hour-1, end_hour, 1)
day_edges = np.arange(-0.5, 7.5, 1)
z, _, _ = np.histogram2d(hours-1, days, bins=[hour_edges, day_edges])
# hour_labels = np.arange(0, 25, 1)
hour_labels = hour_edges + 1
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# heatmap = Heatmap(z=z, x=day_labels, y=hour_labels, colorscale='YlOrBr')
# custom color scale - want 0 to be white, the rest to follow YlOrRd
# note that the z values must be specified in range 0, 1
# colors taken from http://colorbrewer2.org/
max_z = z.max()
start_z = 1. / max_z
mid_z = (start_z + 1) / 2.0
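    # worked example: with max_z = 4, start_z = 0.25 and mid_z = 0.625, so bins
    # with z = 0 stay white and the colour ramp only starts at z = 1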
    # color_scheme = ['#ffeda0', '#feb24c', '#f03b20']  # YlOrRd
    color_scheme = ['#ece7f2', '#a6bddb', '#2b8cbe']  # PuBu
white = '#FFFFFF'
heatmap = Heatmap(z=z, x=day_labels, y=hour_labels,
colorscale=[[0, white],
[start_z - 0.0001, white],
[start_z, color_scheme[0]],
[mid_z, color_scheme[1]],
[1.0, color_scheme[2]]])
# Add traces to subplots
# --------------------------------------
# setup subplots
rows = 4
cols = 9 # total number of columns
specs = [[{'colspan': 4, 'rowspan':4}, None, None, None, None, {'colspan': 3}, None, None, None],
[None, None, None, None, None, {'colspan': 3, 'rowspan': 3}, None, None, {'rowspan': 3}],
[None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None]]
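    # grid layout: timeline fills all 4 rows of cols 1-4; day histogram at (1, 6)
    # over 3 cols; heatmap at (2, 6) over 3 rows x 3 cols; hour histogram at (2, 9)
    # over 3 rows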
fig = plotly.tools.make_subplots(rows=rows, cols=cols,
specs=specs,
subplot_titles=(timeline_title, None, None, None))
# 'Commit heatmap'))
for thing in lh_traces:
fig.append_trace(thing, 1, 1)
fig.append_trace(heatmap, 2, 6)
fig['layout']['yaxis1'].update(
title='Word count',
titlefont=dict(
color=wordcount_colour
),
tickfont=dict(
color=wordcount_colour
)
)
fig['layout']['xaxis3'].update(title='Day of week')
# for some reason, using nticks = 24 gives 12 ticks...
fig['layout']['yaxis3'].update(title='Hour', nticks=24, tickmode='auto')
fig['layout']['yaxis4'].update(nticks=24, tickmode='auto')
# 9 - 5 highlight on heatmap
fig['layout']['shapes'] = [
{
'type': 'rect',
'xref': 'x3',
'yref': 'y3',
'x0': -0.5,
'y0': 8.5,
'x1': 6.5,
'y1': 17.5,
'line': {
'color': 'rgb(155, 128, 191)',
'width': 2,
},
'fillcolor': 'rgba(155, 128, 191, 0)'
}]
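    # the rectangle spans all 7 days (x: -0.5 to 6.5) and 9am-5pm (y: 8.5 to 17.5)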
# Add pagecount data - need to add axis first
# -------------------------------------------
pagecount_colour = '#9467bd'
    pagecount_data = Scatter(x=timestamps_dt, y=store.pagecount[mask], name='Pagecount',
                             showlegend=True, mode='lines', text=store.message[mask],
                             xaxis='x1', yaxis='y5', marker=dict(color=pagecount_colour, line={"color": pagecount_colour}))
fig['layout']['yaxis5'] = dict(
title='Page count',
overlaying='y1',
anchor='x1',
side='right',
titlefont=dict(
color=pagecount_colour
),
tickfont=dict(
color=pagecount_colour
),
)
fig['data'].append(pagecount_data)
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.pagecount[mask])
fit_x = store.timestamp[mask]
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
pages = slope * (24 * 60. * 60.)
fit_data = Scatter(x=fit_x_dt, y=fit_y, mode='lines', name='Pagecount Fit',
showlegend=True, xaxis='x1', yaxis='y5',
line=dict(color=pagecount_colour, dash='dash'))
fig['data'].append(fit_data)
timeline_title = '%.2f pages/day (r-value: %.3f, p-value: %.3e)' % (pages, r_value, p_value)
fig['layout']['annotations'][0]['text'] += '<br>'
fig['layout']['annotations'][0]['text'] += timeline_title
fig['layout']['annotations'][0]['font']['size'] = 14
day_hist = Histogram(x=days,
text=day_labels,
hoverinfo='y+text',
xbins={
'start': day_edges[0],
'end': day_edges[-1],
'size': day_edges[1]-day_edges[0]
},
# xaxis='x3',
autobinx=False,
marker={'color': pagecount_colour},
showlegend=False,
name="")
hour_hist = Histogram(y=hours,
ybins={
'start': hour_edges[0],
# 'start': 7.5,
'end': hour_edges[-1]+1,
# 'end': 23.5,
'size': 1
},
yaxis='y3',
hoverinfo='x+y+text',
autobiny=False,
marker={'color': wordcount_colour},
showlegend=False,
name="")
fig.append_trace(day_hist, 1, 6)
fig.append_trace(hour_hist, 2, 9)
fig.layout['xaxis2']['showticklabels'] = False
fig.layout['yaxis4']['showticklabels'] = False
# Add overall title & legend
# --------------------------
last_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
last_commit = last_commit.strftime(r"%c")
title = "Up to last commit on %s, %d words, %d pages, %d commits" % (last_commit,
store.wordcount[-1],
store.pagecount[-1],
len(timestamps_dt))
fig['layout'].update(
showlegend=True,
title=title,
legend=dict(
x=0.01,
y=1
),
hovermode='closest'
)
# print fig['layout']
# print fig.to_string()
plotly.offline.plot(fig, auto_open=auto_open, filename=html_filename)
if __name__ == "__main__":
print 'Current word count:', get_wordcount(MAIN_TEX_FILE)
print 'Current page count:', get_pdf_pagecount(MAIN_PDF_FILE)
# Get our data - either from CSV, or go through old commits
csv_filename = 'word_count_history.csv'
generate_data = False
already_stored_hashes = None
if not os.path.isfile(csv_filename) or os.stat(csv_filename).st_size == 0:
generate_data = True
else:
store = np.recfromtxt(csv_filename, delimiter=DELIM, dtype=CSV_DTYPE)
        # check if we already have a word count for this commit;
        # if not, recheck commits - we've probably missed others
if get_git_current_hash() not in store.hash:
generate_data = True
already_stored_hashes = store.hash
if generate_data:
hashes, timestamps, wordcounts, pagecounts, messages = get_wordcount_history(MAIN_TEX_FILE, already_stored_hashes)
if already_stored_hashes is None:
store = make_recarray(hashes, timestamps, wordcounts, pagecounts, messages)
else:
store = update_recarray(store, hashes, timestamps, wordcounts, pagecounts, messages)
write_recarray_to_file(store, csv_filename)
# Now do any analysis and plotting
# Plot with matplotlib:
# pdf_filename = plot_statusboard(store, start_date=START_DATE)
# open_pdf(pdf_filename)
# Plot with plotly
plot_statusboard_plotly(store, start_date=START_DATE, html_filename='status.html')
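    # NB: to keep the statusboard current this could be run from a git
    # post-commit hook or a cron job, e.g. (untested sketch, assuming the
    # script is saved as count_words.py):
    #   echo 'python count_words.py' > .git/hooks/post-commit
    #   chmod +x .git/hooks/post-commit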