@raggleton
Created September 8, 2016 10:08
Statusboard for thesis: https://raggleton.github.io/status.html
#!/usr/bin/env python
"""
Go through TeX files and count words, plot things.
TODO:
- improve timezone handling
- only count commits where tex file changed?
- add PDF pagecount?
"""
import os
import numpy as np
import matplotlib
matplotlib.use('agg') # Can also use 'tkagg' or 'webagg'
import matplotlib.pyplot as plt
from collections import OrderedDict
from subprocess import check_output, check_call
from itertools import izip
import datetime
# import matplotlib.dates as mdates
from sys import platform as _platform
from scipy import stats
import plotly
from plotly.graph_objs import Scatter, Layout, Histogram2d, Heatmap, Histogram
from math import ceil, floor
# The main TeX file to count. Assumes you've \included all necessary material
MAIN_TEX_FILE = 'thesis.tex'
MAIN_PDF_FILE = MAIN_TEX_FILE.replace(".tex", ".pdf")
# Start plotting/counting from this date
# START_DATE = datetime.datetime(2016, 3, 29)
START_DATE = datetime.datetime.fromtimestamp(1459366339)
# Notable events. Key is commit hash,
# value is the label you want to put on the graph
NOTABLE_HASHES = OrderedDict()
NOTABLE_HASHES['d41da4c52e30ae11ad9c2f651f723152e1956191'] = 'Inv Higgs draft done'
NOTABLE_HASHES['1257bea1296e40d1ce465a3bb83bda5c44872e42'] = '4tau start'
NOTABLE_HASHES['cd08b0009889a5758de199b0b3f6ac1968408212'] = 'Remove \\mynoteline, \\todo from texcount'
NOTABLE_HASHES['4349c54fa46edb9ff0765047b7c6d62035dd7e93'] = 'Start of LHCP'
NOTABLE_HASHES['9d786c7be573e9429c75cfd0a5b5ba884983c2c0'] = 'Back from LHCP'
NOTABLE_HASHES['e62b254d245fc8beb3cc32d1e12a67d51b986ff9'] = 'Post-brexit'
NOTABLE_HASHES['f7695008dddffecab215c677a7c8f68164bd1e27'] = 'Change paragraph spacing'
NOTABLE_HASHES['9fe61a21aee114f2e60e96e71b9482d362acd65c'] = 'Stop double-counting in wordcount'
DELIM = '\t\t'
CSV_DTYPE = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8'), ('pagecount', 'i8'), ('message', 'S200')]
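# NB: the delimiter is a double tab, presumably so that commas (and single tabs)
# in commit messages don't break the file; 'S200' silently truncates longer messages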
def get_wordcount(main_file, include_bib=True):
"""Get wordcount using main TeX file and its included.
include_bib is a flag to include bibliography in word count.
"""
    incbib_opt = '-incbib' if include_bib else ''
    cmds = ['texcount', '-1', '-sum', '-inc', incbib_opt, main_file]
    cmds = [c for c in cmds if c]  # drop the empty placeholder if -incbib isn't wanted
    return int(check_output(cmds).strip())
def get_pdf_pagecount(pdf_filename):
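    # pdfinfo prints a "Pages:  N" line; grep + awk pull out the N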
cmd = "pdfinfo %s | grep Pages | awk '{print $2}'" % pdf_filename
return int(check_output(cmd, shell=True))
def get_git_username():
"""Get git user.name"""
return check_output('git config --get user.name'.split()).strip()
def get_git_current_hash(short=False):
"""Get current commit hash."""
cmd = 'git rev-parse %s HEAD' % ('--short' if short else '')
return check_output(cmd.split()).strip()
def get_git_commit_hashes(author, short=False):
"""Get all commit hashes for given author. short flag for short hasheds.
Returned in chronological order, oldest first.
"""
hash_fmt = r'%h' if short else r'%H'
cmds = ['git', 'log', '--pretty=format:"%s"' % hash_fmt,
'--author=%s' % author, '--reverse']
return [x.strip('"') for x in check_output(cmds).splitlines()]
def get_git_commit_timestamp(commit_hash):
"""Get timestamp for commit hash"""
cmds = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
return check_output(cmds).strip('"')
def get_git_commit_message(commit_hash):
"""Get commit subject for commit hash"""
cmds = ['git', 'show', '-s', '--pretty=format:"%s"', commit_hash]
return check_output(cmds).strip('"')
def get_wordcount_history(main_file, already_stored_hashes=None):
"""Return lists of commit hashes, timestamps, wordcounts, pdf pagecounts, and messages"""
if already_stored_hashes is None:
already_stored_hashes = []
    # parsing `git branch` output is fragile; rev-parse gives the current branch directly
    curr_branch = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
    stashtag = 'wordcounterscript'
    check_call(['git', 'stash', 'save', stashtag])  # stash current changes
try:
hashes = get_git_commit_hashes(get_git_username()) # get commits to checkout
# checkout each, get wordcount & timestamp for that commit
new_hashes, timestamps, wordcounts, pagecounts, messages = [], [], [], [], []
for ghash in hashes:
if ghash in already_stored_hashes:
continue
check_call(['git', 'checkout', '-q', ghash])
new_hashes.append(ghash)
timestamps.append(get_git_commit_timestamp(ghash))
wordcounts.append(get_wordcount(main_file))
if os.path.isfile(MAIN_PDF_FILE):
pagecounts.append(get_pdf_pagecount(MAIN_PDF_FILE))
else:
if len(pagecounts) >= 1:
pagecounts.append(pagecounts[-1])
else:
pagecounts.append(0)
messages.append(get_git_commit_message(ghash))
print 'ADDING hash:', ghash, 'timestamp:', timestamps[-1], 'pdf pagecount', pagecounts[-1], 'wordcount:', wordcounts[-1]
finally:
        # restore the original branch and any stashed changes, even if a checkout failed
check_call(['git', 'checkout', curr_branch])
stash_list = check_output(['git', 'stash', 'list'])
if stashtag in stash_list:
            # only apply the stash if we actually stashed anything, otherwise it fails
print 'Applying stashed changes...'
check_call(['git', 'stash', 'pop'])
return new_hashes, timestamps, wordcounts, pagecounts, messages
def make_recarray(hashes, timestamps, wordcounts, pagecounts, messages):
"""Make numpy recarray from lists"""
objects = [(h, ts, wc, pc, ms) for h, ts, wc, pc, ms in izip(hashes, timestamps, wordcounts, pagecounts, messages)]
return np.rec.array(objects, dtype=CSV_DTYPE)
def update_recarray(store, hashes, timestamps, wordcounts, pagecounts, messages):
"""Update numpy recarray `store` from lists
Over-write older hashes if timestamp/wordcount changes.
"""
# print hashes, timestamps, wordcounts
new_hashes, new_timestamps, new_wordcounts, new_pagecounts, new_messages = [], [], [], [], []
for h, t, w, p, m in izip(hashes, timestamps, wordcounts, pagecounts, messages):
if m in store.message:
ind = np.where(store.message == m)
# if len(ind[0]) != 2:
# print store
# print h, t, w, p, m
# print ind
# raise IndexError('ERROR updating store, cannot get index from %s' % ind)
ind = ind[0][0]
store.hash[ind] = h
store.timestamp[ind] = t
store.wordcount[ind] = w
store.pagecount[ind] = p
# store.message[ind] = m
else:
new_hashes.append(h)
new_timestamps.append(t)
new_wordcounts.append(w)
new_pagecounts.append(p)
new_messages.append(m)
if len(new_hashes) != 0:
new_store = make_recarray(new_hashes, new_timestamps, new_wordcounts, new_pagecounts, new_messages)
concat_recarray = np.rec.array(np.concatenate([store, new_store]), dtype=store.dtype)
concat_recarray.sort(order='timestamp')
return concat_recarray
else:
return store
def write_recarray_to_file(store, csv_filename):
"""Save recarray to file as CSV"""
np.savetxt(csv_filename, store, delimiter=DELIM,
header=','.join(store.dtype.names),
fmt=['%s', '%u', '%u', '%u', '%s'])
def plot_wordcount_vs_time(store, start_date=None, do_fit=True, ax=None):
"""Make plot of wordcount vs time
Can perform fit for word count progression graph.
"""
# Setup generic Axes if the user hasn't provided one
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
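    # express the cutoff as seconds since the Unix epoch to compare with stored timestamps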
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
# Convert to datetime objects, otherwise matplotlib can't handle it
    # TODO: get the right timezone, currently it thinks it's UTC not BST
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
ax.plot_date(timestamps_dt, store.wordcount[mask], 'o-')
ax.set_ylabel('Word count')
ax.grid(True)
for notable_hash, notable_label in NOTABLE_HASHES.iteritems():
notable_mask = store.hash == notable_hash
timestamp_notable = datetime.datetime.utcfromtimestamp(store[notable_mask].timestamp[0])
wordcount_notable = store[notable_mask].wordcount[0]
ax.plot_date(timestamp_notable, wordcount_notable, '*', markersize=10, label=notable_label)
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.wordcount[mask])
fit_x = np.linspace(store.timestamp[mask][0], store.timestamp[mask][-1], 2)
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
ax.plot_date(fit_x_dt, fit_y, 'r', label='Linear fit')
words_per_day = slope * (24 * 60. * 60.)
plot_title = '%.0f words/day (r-value: %.3f, p-value: %.3e)' % (words_per_day, r_value, p_value)
ax.set_title(plot_title, fontsize=12)
ax.legend(loc='best', framealpha=0.8, fontsize=10, numpoints=1)
# custom axis markers ?
# mondays = mdates.WeekdayLocator(byweekday=mdates.MONDAY)
# mondaysFmt = mdates.DateFormatter("%d %b")
# ax.xaxis.set_major_locator(mondays)
# ax.xaxis.set_major_formatter(mondaysFmt)
plt.gcf().autofmt_xdate()
def plot_commit_heatmap(store, start_date=None, ax=None):
"""Plot heatmap of commits on day vs hour plot"""
# Setup generic Axes if the user hasn't provided one
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
# Convert to datetime objects, otherwise matplotlib can't handle it
    # TODO: get the right timezone, currently it thinks it's UTC not BST
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
hours = [x.hour - 0.5 for x in timestamps_dt]
days = [x.weekday() - 0.5 for x in timestamps_dt]
xedges = np.arange(-0.5, 25.5, 1)
yedges = np.arange(-0.5, 7.5, 1)
    # draw on the provided Axes rather than the current pyplot figure
    # ax.hist2d(hours, days, bins=(xedges, yedges), cmin=1, cmap=plt.get_cmap('YlOrBr'), zorder=1)
    ax.hist2d(hours, days, bins=(xedges, yedges), cmin=1, cmap=plt.get_cmap('PuBu'), zorder=1)
ax.set_xticks(np.arange(0, 25, 2))
ylabels = ['', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax.set_yticklabels(ylabels)
ax.set_xlabel('Hour')
# plt.colorbar()
ax.set_axisbelow(True)
ax.grid(zorder=0)
def plot_statusboard(store, start_date=None, do_fit=True):
"""Plot wordcount vs time, and commit heatmap on one plot using matplotlib."""
fig = plt.figure(figsize=(15, 5))
cols = 7
rh_span = 4
ax_l = plt.subplot2grid((1, cols), (0, 0), colspan=cols-rh_span)
plot_wordcount_vs_time(store, start_date, do_fit, ax_l)
ax_r = plt.subplot2grid((1, cols), (0, cols-rh_span), colspan=rh_span)
plot_commit_heatmap(store, start_date, ax_r)
last_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
last_commit = last_commit.strftime(r"%c")
plt.suptitle("Up to last commit on %s, %d words" % (last_commit, store.wordcount[-1]), y=1.02)
plt.tight_layout()
filename = 'status.pdf'
plt.savefig(filename, bbox_inches='tight')
plt.clf()
return filename
def open_pdf(pdf_filename):
"""Open a PDF file using system's default PDF viewer."""
if _platform.startswith("linux"):
# linux
check_call(["xdg-open", pdf_filename])
elif _platform == "darwin":
# OS X
check_call(["open", pdf_filename])
elif _platform == "win32":
# Windows
check_call(["start", pdf_filename])
def plot_statusboard_plotly(store, start_date=None, auto_open=True,
do_fit=True, html_filename='status.html'):
"""Make statusboard plot with plotly, can auto open in browser."""
# filter data, convert to datetime objects
if not start_date:
start_date = datetime.datetime.fromtimestamp(store.timestamp[0])
start = (start_date - datetime.datetime.utcfromtimestamp(0)).total_seconds()
mask = store.timestamp >= start
timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]
# Lefthand commit timeline + fit plot
# -----------------------------------
wordcount_colour = '#1f77b4'
wordcount_data = Scatter(x=timestamps_dt, y=store.wordcount[mask], name='Wordcount',
showlegend=True, mode='lines', yaxis='y1',
text=store.message[mask], marker=dict(color=wordcount_colour))
lh_traces = [wordcount_data]
timeline_title = ''
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.wordcount[mask])
fit_x = store.timestamp[mask]
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
words_per_day = slope * (24 * 60. * 60.)
timeline_title = '%.2f words/day (r-value: %.3f, p-value: %.3e)' % (words_per_day, r_value, p_value)
fit_data = Scatter(x=fit_x_dt, y=fit_y, mode='lines', name='Wordcount Fit',
line=dict(color=wordcount_colour, dash='dash'))
lh_traces.append(fit_data)
for notable_hash, notable_label in NOTABLE_HASHES.iteritems():
notable_mask = store.hash == notable_hash
timestamp_notable = datetime.datetime.utcfromtimestamp(store[notable_mask].timestamp[0])
wordcount_notable = store[notable_mask].wordcount[0]
notable_data = Scatter(x=[timestamp_notable], y=[wordcount_notable],
mode='markers', name=notable_label,
marker=dict(size=15, symbol='star'))
lh_traces.append(notable_data)
# lh_traces.append(pagecount_data)
# Righthand commit heatmap
# ------------------------
hours = np.array([x.hour - 0.5 for x in timestamps_dt])
days = np.array([x.weekday() - 0.5 for x in timestamps_dt])
start_hour = floor(min(hours) * 2) / 2.
end_hour = 1 + ceil(max(hours) * 2) / 2.
hour_edges = np.arange(start_hour-1, end_hour, 1)
day_edges = np.arange(-0.5, 7.5, 1)
z, _, _ = np.histogram2d(hours-1, days, bins=[hour_edges, day_edges])
# hour_labels = np.arange(0, 25, 1)
hour_labels = hour_edges + 1
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# heatmap = Heatmap(z=z, x=day_labels, y=hour_labels, colorscale='YlOrBr')
# custom color scale - want 0 to be white, the rest to follow YlOrRd
# note that the z values must be specified in range 0, 1
# colors taken from http://colorbrewer2.org/
max_z = z.max()
start_z = 1. / max_z
mid_z = (start_z + 1) / 2.0
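    # worked example: with max_z = 4, start_z = 0.25 and mid_z = 0.625, so bins
    # with z = 0 stay white and the colour ramp only starts at z = 1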
    # color_scheme = ['#ffeda0', '#feb24c', '#f03b20']  # YlOrRd
    color_scheme = ['#ece7f2', '#a6bddb', '#2b8cbe']  # PuBu
white = '#FFFFFF'
heatmap = Heatmap(z=z, x=day_labels, y=hour_labels,
colorscale=[[0, white],
[start_z - 0.0001, white],
[start_z, color_scheme[0]],
[mid_z, color_scheme[1]],
[1.0, color_scheme[2]]])
# Add traces to subplots
# --------------------------------------
# setup subplots
rows = 4
cols = 9 # total number of columns
specs = [[{'colspan': 4, 'rowspan':4}, None, None, None, None, {'colspan': 3}, None, None, None],
[None, None, None, None, None, {'colspan': 3, 'rowspan': 3}, None, None, {'rowspan': 3}],
[None, None, None, None, None, None, None, None, None],
[None, None, None, None, None, None, None, None, None]]
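    # grid layout: timeline fills all 4 rows of cols 1-4; day histogram at (1, 6)
    # over 3 cols; heatmap at (2, 6) over 3 rows x 3 cols; hour histogram at (2, 9)
    # over 3 rows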
fig = plotly.tools.make_subplots(rows=rows, cols=cols,
specs=specs,
subplot_titles=(timeline_title, None, None, None))
# 'Commit heatmap'))
for thing in lh_traces:
fig.append_trace(thing, 1, 1)
fig.append_trace(heatmap, 2, 6)
fig['layout']['yaxis1'].update(
title='Word count',
titlefont=dict(
color=wordcount_colour
),
tickfont=dict(
color=wordcount_colour
)
)
fig['layout']['xaxis3'].update(title='Day of week')
# for some reason, using nticks = 24 gives 12 ticks...
fig['layout']['yaxis3'].update(title='Hour', nticks=24, tickmode='auto')
fig['layout']['yaxis4'].update(nticks=24, tickmode='auto')
# 9 - 5 highlight on heatmap
fig['layout']['shapes'] = [
{
'type': 'rect',
'xref': 'x3',
'yref': 'y3',
'x0': -0.5,
'y0': 8.5,
'x1': 6.5,
'y1': 17.5,
'line': {
'color': 'rgb(155, 128, 191)',
'width': 2,
},
'fillcolor': 'rgba(155, 128, 191, 0)'
}]
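    # the rectangle spans all 7 days (x: -0.5 to 6.5) and 9am-5pm (y: 8.5 to 17.5)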
# Add pagecount data - need to add axis first
# -------------------------------------------
pagecount_colour = '#9467bd'
    pagecount_data = Scatter(x=timestamps_dt, y=store.pagecount[mask], name='Pagecount',
                             showlegend=True, mode='lines', text=store.message[mask],
                             xaxis='x1', yaxis='y5', marker=dict(color=pagecount_colour, line={"color": pagecount_colour}))
fig['layout']['yaxis5'] = dict(
title='Page count',
overlaying='y1',
anchor='x1',
side='right',
titlefont=dict(
color=pagecount_colour
),
tickfont=dict(
color=pagecount_colour
),
)
fig['data'].append(pagecount_data)
if do_fit:
slope, intercept, r_value, p_value, std_err = stats.linregress(store.timestamp[mask], store.pagecount[mask])
fit_x = store.timestamp[mask]
fit_x_dt = [datetime.datetime.utcfromtimestamp(x) for x in fit_x]
fit_y = fit_x * slope + intercept
pages = slope * (24 * 60. * 60.)
fit_data = Scatter(x=fit_x_dt, y=fit_y, mode='lines', name='Pagecount Fit',
showlegend=True, xaxis='x1', yaxis='y5',
line=dict(color=pagecount_colour, dash='dash'))
fig['data'].append(fit_data)
timeline_title = '%.2f pages/day (r-value: %.3f, p-value: %.3e)' % (pages, r_value, p_value)
fig['layout']['annotations'][0]['text'] += '<br>'
fig['layout']['annotations'][0]['text'] += timeline_title
fig['layout']['annotations'][0]['font']['size'] = 14
day_hist = Histogram(x=days,
text=day_labels,
hoverinfo='y+text',
xbins={
'start': day_edges[0],
'end': day_edges[-1],
'size': day_edges[1]-day_edges[0]
},
# xaxis='x3',
autobinx=False,
marker={'color': pagecount_colour},
showlegend=False,
name="")
hour_hist = Histogram(y=hours,
ybins={
'start': hour_edges[0],
# 'start': 7.5,
'end': hour_edges[-1]+1,
# 'end': 23.5,
'size': 1
},
yaxis='y3',
hoverinfo='x+y+text',
autobiny=False,
marker={'color': wordcount_colour},
showlegend=False,
name="")
fig.append_trace(day_hist, 1, 6)
fig.append_trace(hour_hist, 2, 9)
fig.layout['xaxis2']['showticklabels'] = False
fig.layout['yaxis4']['showticklabels'] = False
# Add overall title & legend
# --------------------------
last_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
last_commit = last_commit.strftime(r"%c")
title = "Up to last commit on %s, %d words, %d pages, %d commits" % (last_commit,
store.wordcount[-1],
store.pagecount[-1],
len(timestamps_dt))
fig['layout'].update(
showlegend=True,
title=title,
legend=dict(
x=0.01,
y=1
),
hovermode='closest'
)
# print fig['layout']
# print fig.to_string()
plotly.offline.plot(fig, auto_open=auto_open, filename=html_filename)
if __name__ == "__main__":
print 'Current word count:', get_wordcount(MAIN_TEX_FILE)
print 'Current page count:', get_pdf_pagecount(MAIN_PDF_FILE)
# Get our data - either from CSV, or go through old commits
csv_filename = 'word_count_history.csv'
generate_data = False
already_stored_hashes = None
if not os.path.isfile(csv_filename) or os.stat(csv_filename).st_size == 0:
generate_data = True
else:
store = np.recfromtxt(csv_filename, delimiter=DELIM, dtype=CSV_DTYPE)
        # check if we already have a word count for this commit;
        # if not, recheck commits - we've probably missed others
if get_git_current_hash() not in store.hash:
generate_data = True
already_stored_hashes = store.hash
if generate_data:
hashes, timestamps, wordcounts, pagecounts, messages = get_wordcount_history(MAIN_TEX_FILE, already_stored_hashes)
if already_stored_hashes is None:
store = make_recarray(hashes, timestamps, wordcounts, pagecounts, messages)
else:
store = update_recarray(store, hashes, timestamps, wordcounts, pagecounts, messages)
write_recarray_to_file(store, csv_filename)
# Now do any analysis and plotting
# Plot with matplotlib:
# pdf_filename = plot_statusboard(store, start_date=START_DATE)
# open_pdf(pdf_filename)
# Plot with plotly
plot_statusboard_plotly(store, start_date=START_DATE, html_filename='status.html')
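    # NB: to keep the statusboard current this could be run from a git
    # post-commit hook or a cron job, e.g. (untested sketch, assuming the
    # script is saved as count_words.py):
    #   echo 'python count_words.py' > .git/hooks/post-commit
    #   chmod +x .git/hooks/post-commit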