mgymrek/CompileFiguresTables.py

## CompileFiguresTables.py
def usage():
    """Print the command-line help text for this script to stdout."""
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("""
Usage: python CompileFiguresTables.py --figlist <FILE> --nb <FILE>[,<FILE>,<file...] --out <STRING>

This script compiles figures and tables with legends for a paper from an ipython notebook.

Main text figures are compiled to A4 sized PDFs, with a specified layout,
giving "A", "B", "C", etc. Figure legends and tables written to a .docx file.

Supplemental figures and tables are compiled to a .docx file, with one
figure/legend per page. Use specified layout for multiple panels.

Arguments:
--figlist: file with list of figures. JSON format with. Best explained by
  the example given in example_fig_list.json. Briefly, it has:
  MainText
   Figures
   Tables
  Supplemental
   Figures
   Tables

  Figures and Tables are lists of figure and table objects.
  Figure format:
    {
      "FigureName": "name",
      "FigureTitle": "title",
      "SubFigures": [
          "fig1",
          "fig2",
          "fig3",
          ...
      ],
      "Layout": "<layout>"
    }

  "SubFigureName" and "Table" is given in the Ipython notebook file:

  Cells with code for figures/tables have a comment "# FIGURE: <$SubFigureName|$Table>".
    Figure cells should add to a pyplot axis called "ax".
    Table cells should output a pandas dataframe.
  To have an empty grid space, specify the empty string for the SubFigureName.
  If a figure is huge when written to PDF, use $SubFigureName:png to make the
  plot body displayed in png rather than pdf.

  Cells with legends are in markdown format and have a title "### LEGEND: <$SubFigureName|$Table> ###".
  If no legend is given, "No legend" is used for figures and the empty string for tables.

  Layout is a format string giving grid: Examples:
    A single figure: (1)
    2x2 grid: (1,2),(3,4)
    2x2 grid, first figure takes up whole top row: (1,1),(2,3)
    3x1 grid: (1),(2),(3)

--nb: ipython notebook file. Can give comma separated list of files to compile multiple notebooks.
   If using multiple notebook files, make sure variables are unique between them since code will be
   loaded for all of them at once.
--out: output prefix. Write:
  <out>.<FigureName>.pdf for each main text figure
  <out>.maintext_legends_and_tables.docx: for main text figure legends
  <out>.supplemental_figures_and_tables.docx: for supplemental figures and legends
  <out>_supp_pdfs: directory pdfs for each supp figure
-h, --help: print this message
-v, --verbose: print helpful status messages

NOTES:
1. This runs by running all cells without "FIGURE" in them first, then producing all the figures.
Code needs to be able to run accordingly.
2. Assume 1 plt.Axes per figure, named "ax".
3. Currently doesn't allow magic functions

e.g.
python CompileFiguresTables.py \
  --nb small-test.ipynb \
  --out test \
  --figlist example_fig_list.json

Wishlist:
deal with magics
set font and table size/styles for docx outputs
""")

import matplotlib
matplotlib.use('Agg') # don't break if not in X forwarding

from docx import *
import getopt
import itertools
import json
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import os
import pandas as pd
import PyPDF2
import random
import re
import sys
import time

######## utils ###########
def MakeTwoDigits(num):
    """
    Pad a value below 10 with one leading "0" (used for clock fields).
    Input:
      num: number to format
    Return:
      string: "0<num>" when num < 10, otherwise str(num)
    """
    return "0%s"%num if num < 10 else str(num)

def GetTime():
    """
    Format the current local time for log lines.
    Return:
      string: "<month>/<day>/<year>:<HH>:<MM>"; only hour and minute are
      padded to two characters
    """
    now = time.localtime()
    hour = MakeTwoDigits(now.tm_hour)
    minute = MakeTwoDigits(now.tm_min)
    return "%s/%s/%s:%s:%s"%(now.tm_mon, now.tm_mday, now.tm_year, hour, minute)

def LOG(scriptName, message, fname=None):
    """
    Write a timestamped status line to stderr and, optionally, append it to a file.
    Input:
      scriptName (string): tag placed in square brackets at the start of the line
      message (string): text to log
      fname (string): if given (and non-empty), the line is also appended to this file
    """
    msg = "[%s] %s %s\n"%(scriptName, GetTime(), message)
    if fname:
        # "with" guarantees the handle is closed even if the write fails
        with open(fname, "a") as f:
            f.write(msg)
    sys.stderr.write(msg)

def CheckFileExists(fname):
    """
    Abort the script unless the given path exists.
    Input:
      fname (string): file or directory path to check
    Logs an error and exits with status 1 when the path is missing.
    """
    if os.path.exists(fname):
        return
    LOG(sname, "File or directory %s does not exist"%fname)
    sys.exit(1)
########################

# Tag placed at the start of every LOG line
sname = "CompileFiguresTables.py"
# Panel letters indexed by 0-based subfigure position; only ten are defined,
# so indexing past "J" raises IndexError
NumberToLetter =["A","B","C","D","E","F","G","H","I","J"]
# Page size in inches (width, height) used for main-text figures.
# NOTE(review): 8.27 x 11.69 in matches A4 (as the usage text says), not US letter.
LETTERSIZE = (8.27, 11.69)

# Parse the command line; on a malformed option print the error and the usage
try:
    opts, args = getopt.getopt(sys.argv[1:], "hv", ["help","verbose","figlist=","nb=","out="])
except getopt.GetoptError as err:
    print(str(err))
    usage()
    sys.exit(2)

# From here on "args" holds just the option names that were supplied
args = [item[0] for item in opts]

# An explicit help request succeeds even when the required options are absent
if "-h" in args or "--help" in args:
    usage()
    sys.exit(0)

if ((not "--figlist" in args) or (not "--nb" in args) or (not "--out" in args)):
    usage()
    sys.exit(2)

# initialize variables
VERBOSE = False
FIGLIST_FILE = ""
NB_FILES = ""
OUT_PREFIX = ""
params = []

# set variables
for o, a in opts:
    params.append("%s=%s"%(o.strip("-"),a))
    if o == "--figlist":
        FIGLIST_FILE = a
        CheckFileExists(FIGLIST_FILE)
    if o == "--out":
        OUT_PREFIX = a
    if o == "--nb":
        # comma separated list of notebooks; each one must exist
        NB_FILES = a.split(",")
        for item in NB_FILES: CheckFileExists(item)
    # Membership test: the previous 'o == "-v" or "--verbose"' was always
    # true, which switched verbose mode on for every option processed.
    if o in ("-v", "--verbose"):
        VERBOSE = True

########################################
# functions
def ParseNB(nbfile):
    """
    Split a notebook into figure/table code, legends and supporting code.

    Both the nbformat 3 layout (cells under "worksheets", code in "input")
    and the nbformat 4 layout (top-level "cells", code in "source") are
    accepted. Only the first worksheet of an nbformat 3 file is read.

    A code cell is a figure/table cell when one of its lines starts with
    "# FIGURE: <name>" or "# DISPLAY: <name>" (the last such line wins).
    A markdown cell is a legend when one of its lines starts with
    "### LEGEND: <name> ###" followed by a newline; the legend text is the
    cell with every line containing "LEGEND" removed.

    Inputs:
      nbfile (string): path to ipython notebook
    Return:
      FigureToCode (dict:string->[string]): SubfigureName or Table ->code lines
      FigureToLegend (dict:string->string): SubfigureName or Table->legend
      SupportingCode [[string]]: list of code for each cell that is not a figure/table
    """
    def as_lines(src):
        # the notebook format stores a cell body either as a list of lines
        # or as one multi-line string; normalise to a list of lines
        if isinstance(src, list):
            return src
        return src.splitlines(True)

    FigureToCode = {}
    FigureToLegend = {}
    SupportingCode = []
    with open(nbfile, "r") as f:
        nb = json.load(f)
    if "worksheets" in nb:
        cells = nb["worksheets"][0]["cells"]
    else:
        cells = nb["cells"]
    for cell in cells:
        if cell["cell_type"] == "code":
            textlines = as_lines(cell["input"] if "input" in cell else cell["source"])
            figname = None
            for item in textlines:
                if re.match(r"#\s?FIGURE: .*", item):
                    figname = item.split("FIGURE:")[1].strip()
                elif re.match(r"#\s?DISPLAY: .*", item):
                    figname = item.split("DISPLAY:")[1].strip()
            if figname:
                FigureToCode[figname] = textlines
            else:
                SupportingCode.append(textlines)
        if cell["cell_type"] == "markdown":
            textlines = as_lines(cell["source"])
            figname = None
            text = ""
            for item in textlines:
                if re.match(r"### LEGEND: .* ###\n", item):
                    figname = item.split("LEGEND:")[1].split("###")[0].strip()
                    text = [line for line in textlines if "LEGEND" not in line]
            if figname: FigureToLegend[figname] = "".join(text)
    return FigureToCode, FigureToLegend, SupportingCode

def GetAllFigureNames(figlist):
    """
    Collect every subfigure name referenced by the figure list, main text
    first and supplemental second, with any ":<suffix>" (e.g. ":png") removed.
    Input:
      figlist: object exposing MainText and Supplemental, each indexable by
        "Figures" (the script passes the result of pandas.read_json)
    Return:
      [string]: list of all SubFigureNames to process
    """
    names = []
    for section in (figlist.MainText, figlist.Supplemental):
        for figure in section["Figures"]:
            for subfig in figure["SubFigures"]:
                names.append(subfig.split(":")[0])
    return names

def GetAllTableNames(figlist):
    """
    Collect the "Table" key of every table entry, main text first and
    supplemental second.
    Input:
      figlist: object exposing MainText and Supplemental, each indexable by
        "Tables" (the script passes the result of pandas.read_json)
    Return:
      [string]: list of all Tables to process
    """
    names = []
    for section in (figlist.MainText, figlist.Supplemental):
        for entry in section["Tables"]:
            names.append(entry["Table"])
    return names

def ScaleToAxis(tick_positions, old_axis, new_axis):
    """
    Linearly map tick positions from one axis range onto another
    (used when a plot body is redrawn as a png via imshow).
    Input:
       tick_positions (np.array or list) from old axis
       old_axis: (min,max) of old axis
       new_axis: (min,max) of new axis
    Return:
       new_ticks (list): each tick placed at the same fractional position
       within new_axis that it had within old_axis
    """
    old_lo, old_hi = old_axis
    old_span = old_hi-old_lo
    new_lo, new_hi = new_axis
    new_span = new_hi-new_lo
    return [new_lo + ((tick-old_lo)*1.0/old_span)*new_span for tick in tick_positions]

def GetFigureSpan(layout, fignum):
    """
    Find the rectangle of grid cells that a figure number occupies.
    Input:
      layout ([[int]]) (list of list of ints): layout format array
      fignum (int): number of the figure we're processing
    Return:
      from_row, to_row, from_col, to_col (int,int,int,int), all 0-based
      and inclusive
    Logs an error and exits with status 1 when the cells holding fignum do
    not form one contiguous rectangle.
    """
    rows = [r for r, cells in enumerate(layout) if fignum in cells]
    from_row, to_row = min(rows), max(rows)
    if to_row-from_row + 1 != len(rows):
        LOG(sname, "ERROR: invalid layout grid. Noncontiguous figure (row)")
        sys.exit(1)
    # the column extent is taken from the first row the figure appears in
    cols = [c for c, val in enumerate(layout[rows[0]]) if val == fignum]
    from_col, to_col = min(cols), max(cols)
    if to_col-from_col + 1 != len(cols):
        LOG(sname, "ERROR: invalid layout grid. Noncontiguous figure (col)")
        sys.exit(1)
    # every occupied row must hold fignum in exactly that column extent
    for r in rows:
        for c, val in enumerate(layout[r]):
            inside = from_col <= c <= to_col
            if inside != (val == fignum):
                LOG(sname, "ERROR: invalid layout grid. Nongrid figure")
                sys.exit(1)
    return from_row, to_row, from_col, to_col

def MakeFigure(figcode, layout, figpath, size=None, gl=None, pngs=None):
    """
    Main function to process figures.
    Make subplots on layout by executing each subfigure's notebook code
    against its own axes, then save the composite to every path in figpath.

    The subfigure code is run with exec: gl is its global namespace and this
    function's local variables (notably "ax", "fig", "w", "h") are its local
    namespace. Lines of the cell containing "fig =", "fig=", "ax =" or
    "set_size_inches" are dropped so the cell draws on the axes created here.

    Inputs:
       figcode ([[string]]): list of list of lines of code to execute for each figure
       layout (string): layout format string, e.g. "(1,2),(3,4)"
       figpath ([string]): list of paths to save the figure to
       size (int,int): width/height in inches. If None, save to LETTERSIZE
       gl: dictionary of global variables (from globals()); defaults to an empty dict
       pngs [int]: 0-based positions in figcode of the subfigures whose plot
         body is rendered as png (because they're too big otherwise)
    Logs an error and exits with status 1 on an invalid layout.
    """
    import tempfile  # local import: only needed for the png intermediate file

    if gl is None: gl = {}
    if pngs is None: pngs = []
    # parse layout: "(1,2),(3,4)" -> [[1,2],[3,4]]
    layout = [[int(_tok) for _tok in item.strip(",").split(",")] for item in layout.replace("(","").split(")")[:-1]]
    # check
    if len(layout) == 0:
        LOG(sname, "ERROR: invalid layout grid")
        sys.exit(1)
    numrows = len(layout)
    numcols = [len(item) for item in layout]
    if not numcols.count(numcols[0]) == len(numcols):
        LOG(sname, "ERROR: invalid layout grid")
        sys.exit(1)
    numcols = numcols[0]
    lf = set(itertools.chain.from_iterable(layout))
    for i in range(1, len(figcode)+1):
        if i not in lf:
            LOG(sname, "ERROR: not enough positions specified in layout")
            sys.exit(1)

    # set up figure
    plt.clf()
    fig = plt.figure(1)
    grid_width = 1.0/numcols
    grid_height = grid_width # make them square
    col_scale = 1
    row_scale = 1
    if numrows == 2 or numcols == 2:
        row_scale = 0.8
        col_scale = 0.8
    if numrows == 3: row_scale = 0.7
    if numcols == 3: col_scale = 0.7
    # fignum is the layout number of the next subfigure; it only advances
    # when a subfigure actually has code to run
    fignum = 1
    for i in range(len(figcode)):
        figletter = NumberToLetter[fignum-1]
        # get span
        from_row, to_row, from_col, to_col = GetFigureSpan(layout, fignum)
        # get letter label
        if len(figcode) > 1 and len(figcode[i]) > 0:
            ax = fig.add_axes([from_col*grid_width, 1-(from_row+1)*grid_height, grid_width, grid_height])
            ax.set_axis_off()
            ax.set_ylim(bottom=0, top=1)
            ax.text(0,0.8,figletter, size=20, weight="bold")
        colspan = (to_col-from_col+1)
        rowspan = (to_row-from_row+1)
        w = grid_width*(colspan-1)+grid_width*col_scale
        h = grid_height*(rowspan-1)+grid_height*row_scale
        ax = fig.add_axes([from_col*grid_width+(1-col_scale)*0.7*grid_width, 1-(to_row+1)*grid_height, w, h])
        newcode = ""
        for codeline in figcode[i]:
            if "fig =" not in codeline and "fig=" not in codeline and \
                    "ax =" not in codeline and \
                    "set_size_inches" not in codeline:
                newcode = newcode + codeline
        # Underscore-prefixed names keep these helper locals from shadowing
        # notebook globals, since locals() is handed to the executed code.
        _png_tmp = None
        if i in pngs:
            # private temp file instead of a guessable /tmp/<random>.png
            _png_fd, fname = tempfile.mkstemp(suffix=".png")
            os.close(_png_fd)
            _png_tmp = fname
            # Make a new figure, which we'll save to png (only the non-axis part)
            addcodelines = []
            aftercodelines = []
            addcodelines.append("ax_old = ax") # keep track of old axes
            addcodelines.append("fig2 = plt.figure(2)") # new figure
            addcodelines.append("ax = fig2.add_axes([0,0,w,h])") # new axes
            aftercodelines.append("xticklabels = [t.get_text() for t in ax.get_xticklabels()]")
            aftercodelines.append("yticklabels = [t.get_text() for t in ax.get_yticklabels()]")
            aftercodelines.append("if xticklabels[0] == \"\": xticklabels = ax.get_xticks()")
            aftercodelines.append("if yticklabels[0] == \"\": yticklabels = ax.get_yticks()")
            aftercodelines.append("ax.set_axis_off()")
            aftercodelines.append("ax.get_xaxis().set_visible(False)")
            aftercodelines.append("ax.get_yaxis().set_visible(False)")
            # %r embeds the path as a correctly escaped string literal
            aftercodelines.append("plt.savefig(%r, bbox_inches=\"tight\", pad_inches=0, dpi=500)"%fname) # save as png
            aftercodelines.append("plt.close(2)")
            aftercodelines.append("plt.figure(1)") # get back to figure 1
            aftercodelines.append("ax_png = ax")
            aftercodelines.append("ax = ax_old") # get back to the axis we want to plot
            aftercodelines.append("img = mpimg.imread(%r)"%fname)
            aftercodelines.append("ax.imshow(img, extent=[0,1.1,0,1.1], interpolation=\"nearest\", aspect=\"equal\")")
            # set the axis to how it should be
            aftercodelines.append("ax.set_xlabel(ax_png.get_xlabel())")
            aftercodelines.append("ax.set_ylabel(ax_png.get_ylabel())")
            aftercodelines.append("ax.set_xticks(ScaleToAxis(ax_png.get_xticks(), ax_png.get_xlim(), ax.get_xlim()))")
            aftercodelines.append("ax.set_yticks(ScaleToAxis(ax_png.get_yticks(), ax_png.get_ylim(), ax.get_ylim()))")
            aftercodelines.append("ax.set_xticklabels(xticklabels, size=12)")
            aftercodelines.append("ax.set_yticklabels(yticklabels, size=12)")
            newcode = "\n".join(addcodelines) + "\n" + newcode + "\n" + "\n".join(aftercodelines)
        if len(newcode) > 0:
            newcode_comp = compile(newcode, "<string>", "exec")
            try:
                exec(newcode_comp, gl, locals())
            finally:
                # the image has been read back into the figure by now
                if _png_tmp is not None and os.path.exists(_png_tmp):
                    os.remove(_png_tmp)
            fignum = fignum + 1
        else: ax.set_axis_off()
    # set size
    if size is None:
        size = LETTERSIZE
        pad = 0.42
        fig.set_size_inches((size[0]-pad, (size[0]-pad)*numcols*1.0/numrows))
        dpi = 500
    else:
        # NOTE(review): an explicit size only determines dpi here; the figure
        # itself is not resized to it. Confirm this is intended.
        xPix = 400
        dpi = xPix/size[0]
    for p in figpath:
        plt.savefig(p, bbox_inches="tight", pad_inches=0, dpi=dpi)
        # if pdf and size is letter, change the paper size
        if ".pdf" in p and size == LETTERSIZE:
            tmp_path = p+".tmp"
            # keep the source open until the writer is done: the page object
            # still reads from the input stream while being written
            with open(p,"rb") as pdf_in:
                pr = PyPDF2.PdfFileReader(pdf_in)
                page1 = pr.pages[0]
                # extend the paper to letter size
                mbox = page1.mediaBox
                newh = (float(mbox[2])*LETTERSIZE[1]/LETTERSIZE[0])
                deltaH = newh - float(mbox[3])
                page1.mediaBox = PyPDF2.generic.RectangleObject([0,-1*deltaH,mbox[2],mbox[3]])
                # write it
                wr = PyPDF2.PdfFileWriter()
                wr.addPage(page1)
                with open(tmp_path,"wb") as pdf_out:
                    wr.write(pdf_out)
            # rename without going through a shell, so paths containing
            # spaces or shell metacharacters work
            os.rename(tmp_path, p)


def ProcessFigure(figdata, figpath, FigureToCode, FigureToLegend, size=None, gl=None):
    """
    Process a figure and return the legend
    Input:
      figdata: item from "Figures" list in figlist; read for its
        "FigureTitle", "SubFigures" and "Layout" keys
      figpath ([string]): list of paths to save the figure to
      FigureToCode (dict:string->[string]): code for each subfigure
      FigureToLegend (dict:string->string): legend for each subfigure
      size: (int,int): width/height of the figure in inches. If None, use letter size
      gl: dictionary of global variables, from calling globals(); defaults
        to an empty dict
    Return:
      legend [(string, format)] formatted using docx style: the bold title,
      then for each subfigure its legend text (or "No legend"), preceded by
      a bold panel letter when there is more than one subfigure
    """
    if gl is None: gl = {}
    LOG(sname, "  %s"%figdata["FigureTitle"])
    subfigs = figdata["SubFigures"]
    layout = figdata["Layout"]
    title = figdata["FigureTitle"] + ". "
    figcode = []
    legends = []
    pngs = []
    # enumerate gives each entry its own position; list.index() would return
    # the first occurrence and miss a repeated "<name>:png" entry
    for position, figname in enumerate(subfigs):
        if ":png" in figname:
            pngs.append(position)
            figname = figname.split(":")[0]
        figcode.append(FigureToCode.get(figname, ""))
        legends.append(FigureToLegend.get(figname, "No legend"))
    legend_text = [(title, 'b')]
    for fignum, item in enumerate(legends):
        if len(legends) > 1:
            legend_text.append((NumberToLetter[fignum]+". ",'b'))
        legend_text.append(item+" ")
    MakeFigure(figcode, layout, figpath, size=size, gl=gl, pngs=pngs)
    return legend_text

def ConvertToString(val):
    """
    Convert values to strings for table
    Anything float() accepts is rendered with two significant digits
    ("{:.2g}"); everything else falls back to str(val).
    Input:
      val (object)
    Return:
      string
    """
    try:
        x = float(val)
    except (TypeError, ValueError, OverflowError):
        # not numeric (or too large for a float): show it as-is.
        # Catching only conversion errors lets KeyboardInterrupt/SystemExit through.
        return str(val)
    return "{:.2g}".format(x)

def MakeTable(tablecode, gl=None):
    """
    Main function to process tables
    The whole cell is executed, then its last line is evaluated again as an
    expression to obtain the DataFrame (so that line runs twice and must be
    a plain expression).
    Input:
      tablecode [string]: lines of code to create table, should return a pandas DataFrame
      gl: global variables from calling globals(); defaults to an empty dict
    Return:
      [[string]]: list of rows for the table, will be processed by docx to make table.
      The first row is the column labels; values go through ConvertToString.
    """
    if gl is None: gl = {}
    # One explicit namespace shared by exec and eval, so names the cell
    # defines are guaranteed to be visible when its last line is evaluated.
    namespace = {}
    comp = compile("".join(tablecode), "<string>", "exec")
    exec(comp, gl, namespace)
    df = eval(tablecode[-1].strip(), gl, namespace)
    df_list = [list(df.columns)]
    for i in range(df.shape[0]):
        df_list.append([ConvertToString(value) for value in df.iloc[i,:]])
    return df_list

def ProcessTable(tabledata, FigureToCode, FigureToLegend, gl={}):
    """
    Build one table and its legend.
    Input:
      tabledata: item from "Tables" list in figlist; read for its
        "TableTitle" and "Table" keys
      FigureToCode (dict:string->[string]): code for each table
      FigureToLegend (dict:string->string): legend for each table
      gl: dictionary of global variables, from calling globals()
    Return:
      table [[string]]: list of rows for the table, will be processed by docx to make table
      legend [(string, format)] formatted using docx style: the bold title
        followed by the legend text (empty string when there is none)
    Raises KeyError when FigureToCode has no entry for the table.
    """
    table_title = tabledata["TableTitle"]
    LOG(sname, " %s"%table_title)
    legend_body = FigureToLegend.get(tabledata["Table"],"")
    legend = [(table_title + ". ", 'b'), legend_body]
    rows = MakeTable(FigureToCode[tabledata["Table"]], gl=gl)
    return rows, legend

########################################

# Set up MS word stuff
# Document metadata and the shared package parts passed to both savedocx()
# calls further down (main text and supplemental documents).
title    = "Figures"
subject  = "Figures"
creator  = 'Melissa Gymrek'
keywords = []
coreprops = coreproperties(title=title, subject=subject, creator=creator,
                           keywords=keywords)
appprops = appproperties()
# NOTE: the next two lines rebind the names "contenttypes" and "websettings"
# from the callables (presumably provided by "from docx import *") to their
# results, so neither can be called again after this point.
contenttypes = contenttypes()
websettings = websettings()

# Read the figure/table manifest
if VERBOSE:
    LOG(sname, "Parsing figlist")
figlist = pd.read_json(FIGLIST_FILE)

# Merge code, legends and supporting cells from every notebook, in the order
# the notebooks were given (later notebooks override earlier names)
if VERBOSE:
    LOG(sname, "Parsing ipython notebokos")
FigureToCode, FigureToLegend, SupportingCode = {}, {}, []
for nbfile in NB_FILES:
    a, b, c = ParseNB(nbfile)
    FigureToCode.update(a)
    FigureToLegend.update(b)
    SupportingCode.extend(c)

# Warn (without aborting) about anything in the manifest that the notebooks
# do not provide code or a legend for
all_figure_names = GetAllFigureNames(figlist)
for fig in all_figure_names:
    if not fig in FigureToCode:
        LOG(sname, "WARNING: Figure %s has no code"%(fig))
    if not fig in FigureToLegend:
        LOG(sname, "WARNING: Figure %s has no legend"%(fig))
all_table_names = GetAllTableNames(figlist)
for tab in all_table_names:
    if not tab in FigureToCode:
        LOG(sname, "WARNING: Table %s has no code"%(tab))
    if not tab in FigureToLegend:
        LOG(sname, "WARNING: Table %s has no legend"%(tab))

# Run supporting code
# Every non-figure cell is executed in this module's global namespace so that
# the figure/table cells (run later with gl=globals()) can see what it defines.
if VERBOSE: LOG(sname, "Executing supporting code")
for cell in SupportingCode:
    # Drop lines starting with "%" (IPython magics cannot be compiled).
    # startswith() also copes with an empty line, where line[0] would raise.
    newcell = [line for line in cell if not line.startswith("%")]
    code_comp = compile("".join(newcell), "<string>", "exec")
    # Function-call form with an explicit namespace parses under Python 2 and 3
    exec(code_comp, globals())

# Process Main figures
# Each main-text figure is saved as <out>.<FigureName>.pdf; only its heading
# and legend go into the main-text document.
if VERBOSE:
    LOG(sname, "Process main figures")
main_figs = figlist.MainText["Figures"]
main_tables = figlist.MainText["Tables"]
relationships = relationshiplist()
document = newdocument()
body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
for mf in main_figs:
    legend_text = ProcessFigure(
        mf, ["%s.%s.pdf"%(OUT_PREFIX, mf["FigureName"])],
        FigureToCode, FigureToLegend, gl=globals())
    body.append(heading(mf["FigureName"],2))
    body.append(paragraph(legend_text))

# Process Main Tables
# Tables start on a fresh page, one table per page; empty tables are skipped
# and do not consume a table number.
if VERBOSE:
    LOG(sname, "Process main tables")
if len(main_tables) > 0:
    body.append(pagebreak(type="page", orient="portrait"))
tablenum = 1
for mt in main_tables:
    tbl, legend = ProcessTable(mt, FigureToCode, FigureToLegend, gl=globals())
    if tbl == []:
        continue
    body.append(heading("Table %s"%tablenum, 1))
    body.append(table(tbl))
    body.append(paragraph(legend))
    if mt != main_tables[-1]:
        body.append(pagebreak(type="page", orient="portrait"))
    tablenum += 1

wr = wordrelationships(relationships)
savedocx(document, coreprops, appprops, contenttypes, websettings,
         wr, "%s.maintext_legends_and_tables.docx"%OUT_PREFIX)

# Process Supplemental figures
# Each supplemental figure is rendered to a temporary png in the working
# directory, embedded in the supplemental document with its legend, and the
# png is then removed.
if VERBOSE: LOG(sname, "Process supplemental figures")
relationships = relationshiplist()
document = newdocument()
body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
try:
    os.mkdir("%s_supp_pdfs"%OUT_PREFIX)
except OSError: pass # best effort: typically the directory already exists
supp_figs = figlist.Supplemental["Figures"]
supp_tables = figlist.Supplemental["Tables"]
fignum = 1
for sf in supp_figs:
    # NOTE(review): figpath_pdf is computed but never passed to ProcessFigure,
    # so nothing is written into <out>_supp_pdfs even though the usage text
    # promises a pdf per supplemental figure. Confirm whether it should be
    # added to the list of output paths below.
    figpath_pdf = "%s_supp_pdfs/%s.pdf"%(OUT_PREFIX, sf["FigureName"])
    figpath_png = "%s.png"%(sf["FigureName"])
    legend_text = ProcessFigure(sf, [figpath_png], FigureToCode, FigureToLegend, size=(8,4), gl=globals())
    relationships, picpara = picture(relationships, figpath_png, sf["FigureName"])
    body.append(heading("Supplemental Figure %s"%fignum, 1))
    body.append(picpara)
    body.append(paragraph(legend_text))
    # page break after every figure except a last one with no tables following
    if (sf != supp_figs[-1]) or len(supp_tables) > 0:
        body.append(pagebreak(type='page', orient='portrait'))
    fignum = fignum + 1
    # Remove the temporary png directly instead of through a shell command,
    # so figure names with spaces or shell metacharacters are handled.
    if os.path.exists(figpath_png):
        os.remove(figpath_png)

# Process Supplemental tables
# One table per page, numbered from 1, appended after the supplemental figures
if VERBOSE:
    LOG(sname, "Process supplemental tables")
tablenum = 1
for st in supp_tables:
    tbl, legend = ProcessTable(st, FigureToCode, FigureToLegend, gl=globals())
    body.append(heading("Supplemental Table %s"%tablenum, 1))
    body.append(table(tbl))
    body.append(paragraph(legend))
    # no trailing page break after the last table
    if st != supp_tables[-1]:
        body.append(pagebreak(type='page', orient='portrait'))
    tablenum += 1

# Write the supplemental document
wr = wordrelationships(relationships)
savedocx(document, coreprops, appprops, contenttypes, websettings,
         wr, "%s.supplemental_figures_and_tables.docx"%OUT_PREFIX)

LOG(sname, "Done!")

## example_fig_list.json
{
    "MainText": {
        "Figures": [
            {
                "FigureName": "Figure1",
		"FigureTitle": "TestFigure1",
                "SubFigures": [
                    "fig1",
		    "fig2",
		    "fig1",
		    "fig2"
                ],
                "Layout": "(1,2),(3,4)"
            }
        ],
	"Tables": []
    },
    "Supplemental": {
         "Figures": [
            {
                "FigureName": "SuppFig1",
		"FigureTitle": "SuppFig1Test",
		"SubFigures": [
	            "fig2",
		    "fig1"
		],
		"Layout": "(1,2)"
            },
	    {
		"FigureName": "SuppFig2",
		"FigureTitle": "SuppFig2Test",
		"SubFigures": [
		    "fig1"
		],
		"Layout": "(1)"
	    }
        ],
	"Tables": [
	    {
	        "TableName": "SuppTable1",
		"TableTitle": "Testing tables",
		"Table": "test-table"
	    }
	]
    }
}

## small-test.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              small-test.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	def usage():
	print """
	Usage: python CompileFiguresTables.py --figlist <FILE> --nb <FILE>[,<FILE>,<file...] --out <STRING>

	This script compiles figures and tables with legends for a paper from an ipython notebook.

	Main text figures are compiled to A4 sized PDFs, with a specified layout,
	giving "A", "B", "C", etc. Figure legends and tables written to a .docx file.

	Supplemental figures and tables are compiled to a .docx file, with one
	figure/legend per page. Use specified layout for multiple panels.

	Arguments:
	--figlist: file with list of figures. JSON format with. Best explained by
	the example given in example_fig_list.json. Briefly, it has:
	MainText
	Figures
	Tables
	Supplemental
	Figures
	Tables

	Figures and Tables are lists of figure and table objects.
	Figure format:
	{
	"FigureName": "name",
	"FigureTitle": "title",
	"SubFigures": [
	"fig1",
	"fig2",
	"fig3",
	...
	],
	"Layout": "<layout>"
	}

	"SubFigureName" and "Table" is given in the Ipython notebook file:

	Cells with code for figures/tables have a comment "# FIGURE: <$SubFigureName\|$Table>".
	Figure cells should add to a pyplot axis called "ax".
	Table cells should output a pandas dataframe.
	To have an empty grid space, specify the empty string for the SubFigureName.
	If a figure is huge when written to PDF, use $SubFigureName:png to make the
	plot body displayed in png rather than pdf.

	Cells with legends are in markdown format and have a title "### Legend: <$SubFigureName\|$Table> ###".
	If no legend is given the empty string is used

	Layout is a format string giving grid: Examples:
	A single figure: (1)
	2x2 grid: (1,2),(3,4)
	2x2 grid, first figure takes up whole top row: (1,1),(2,3)
	3x1 grid: (1),(2),(3)

	--nb: ipython notebook file. Can give comma separated list of files to compile multiple notebooks.
	If using multiple notebook files, make sure variables are unique between them since code will be
	loaded for all of them at once.
	--out: output prefix. Write:
	<out>.<FigureName>.pdf for each main text figure
	<out>.maintext_legends_and_tables.docx: for main text figure legends
	<out>.supplemental_figures_and_tables.docx: for supplemental figures and legends
	<out>_supp_pdfs: directory pdfs for each supp figure
	-h, --help: print this message
	-v, --verbose: print helpful status messages

	NOTES:
	1. This runs by running all cells without "FIGURE" in them first, then producing all the figures.
	Code needs to be able to run accordingly.
	2. Assume 1 plt.Axes per figure, named "ax".
	3. Currently doesn't allow magic functions

	e.g.
	python CompileFiguresTables.py \
	--nb small-test.ipynb \
	--out test \
	--figlist example_fig_list.json

	Wishlist:
	deal with magics
	set font and table size/styles for docx outputs
	"""

	import matplotlib
	matplotlib.use('Agg') # don't break if not in X forwarding

	from docx import *
	import getopt
	import itertools
	import json
	import matplotlib.image as mpimg
	import matplotlib.pyplot as plt
	import os
	import pandas as pd
	import PyPDF2
	import random
	import re
	import sys
	import time

	######## utils ###########
	def MakeTwoDigits(num):
	if num < 10:
	return "0%s"%num
	else: return str(num)

	def GetTime():
	t = time.localtime()
	return "%s/%s/%s:%s:%s"%(t.tm_mon, t.tm_mday, t.tm_year, MakeTwoDigits(t.tm_hour), MakeTwoDigits(t.tm_min))

	def LOG(scriptName, message, fname=None):
	msg = "[%s] %s %s\n"%(scriptName, GetTime(), message)
	if fname:
	f = open(fname, "a")
	f.write(msg)
	f.close()
	sys.stderr.write(msg)

	def CheckFileExists(fname):
	if not os.path.exists(fname):
	LOG(sname, "File or directory %s does not exist"%fname)
	sys.exit(1)
	########################

	sname = "CompileFiguresTables.py"
	NumberToLetter =["A","B","C","D","E","F","G","H","I","J"]
	LETTERSIZE = (8.27, 11.69)

	try:
	opts, args = getopt.getopt(sys.argv[1:], "hv", ["help","verbose","figlist=","nb=","out="])
	except getopt.GetoptError, err:
	print str(err)
	usage()
	sys.exit(2)

	args = [item[0] for item in opts]

	if ((not "--figlist" in args) or (not "--nb" in args) or (not "--out" in args)):
	usage()
	sys.exit(2)

	# initialize variables
	VERBOSE = False
	FIGLIST_FILE = ""
	NB_FILES = ""
	OUT_PREFIX = ""
	params = []

	# set variables
	for o, a in opts:
	params.append("%s=%s"%(o.strip("-"),a))
	if o == "--figlist":
	FIGLIST_FILE = a
	CheckFileExists(FIGLIST_FILE)
	if o == "--out":
	OUT_PREFIX = a
	if o == "--nb":
	NB_FILES = a.split(",")
	for item in NB_FILES: CheckFileExists(item)
	if o == "--help" or o == "-h":
	usage()
	sys.exit(0)
	if o == "-v" or "--verbose":
	VERBOSE = True

	########################################
	# functions
	def ParseNB(nbfile):
	"""
	Inputs:
	nbfile (string): path to ipython notebook
	Return:
	FigureToCode (dict:string->[string]): SubfigureName or Table ->code lines
	FigureToLegend (dict:string->string): SubfigureName or Table->legend
	SupportingCode [[string]]: list of code for each cell that is not a figure/table
	"""
	FigureToCode = {}
	FigureToLegend = {}
	SupportingCode = []
	nb = json.load(open(nbfile, "r"))
	cells = nb["worksheets"][0]["cells"]
	for cell in cells:
	if cell["cell_type"] == "code":
	textlines = cell["input"]
	figname = None
	for item in textlines:
	if re.match("#\s?FIGURE: .*", item):
	figname = item.split("FIGURE:")[1].strip()
	elif re.match("#\s?DISPLAY: .*", item):
	figname = item.split("DISPLAY:")[1].strip()
	if figname:
	FigureToCode[figname] = textlines
	else:
	SupportingCode.append(textlines)
	if cell["cell_type"] == "markdown":
	textlines = cell["source"]
	figname = None
	text = ""
	for item in textlines:
	if re.match("### LEGEND: .* ###\n", item):
	figname = item.split("LEGEND:")[1].split("###")[0].strip()
	text = [item for item in textlines if "LEGEND" not in item]
	if figname: FigureToLegend[figname] = "".join(text)
	return FigureToCode, FigureToLegend, SupportingCode

	def GetAllFigureNames(figlist):
	"""
	Input:
	figlist (pandas.DataFrame returned by pandas.read_jason)
	Return:
	[string]: list of all SubFigureNames to process
	"""
	main_text_figs = list(itertools.chain.from_iterable([item["SubFigures"] for item in figlist.MainText["Figures"]]))
	supp_figs = list(itertools.chain.from_iterable([item["SubFigures"] for item in figlist.Supplemental["Figures"]]))
	return [item.split(":")[0] for item in main_text_figs + supp_figs]

	def GetAllTableNames(figlist):
	"""
	Input:
	figlist (pandas.DataFrame returned by pandas.read_jason)
	Return:
	[string]: list of all Tables to process
	"""
	main_text_tables = [item["Table"] for item in figlist.MainText["Tables"]]
	supp_tables = [item["Table"] for item in figlist.Supplemental["Tables"]]
	return main_text_tables + supp_tables

	def ScaleToAxis(tick_positions, old_axis, new_axis):
	"""
	Scale ticks to new axis when using imshow to display png
	Input:
	tick_positions (np.array or list) from old axis
	old_axis: (min,max) of old axis
	new_axis: (min,max) of new axis
	Return:
	new_ticks (list): new tick positions scaled to new axis
	"""
	min_old, max_old = old_axis
	width_old = max_old-min_old
	min_new, max_new = new_axis
	width_new = max_new-min_new
	new_ticks = []
	for t in tick_positions:
	perc = (t-min_old)*1.0/width_old
	new = min_new + perc*width_new
	new_ticks.append(new)
	return new_ticks

	def GetFigureSpan(layout, fignum):
	    """
	    Find the rectangle of grid cells occupied by one figure.

	    Input:
	    layout ([[int]]) (list of list of ints): layout format array, one
	      inner list per grid row
	    fignum (int): number of the figure we're processing
	    Return:
	    from_row, to_row, from_col, to_col (int,int,int,int): inclusive
	      0-based bounds of the figure in the grid

	    Logs an error and exits with status 1 if the cells holding fignum do
	    not form a filled rectangle.
	    """
	    row_hits = [r for r, cells in enumerate(layout) if fignum in cells]
	    from_row, to_row = min(row_hits), max(row_hits)
	    if to_row - from_row + 1 != len(row_hits):
	        LOG(sname, "ERROR: invalid layout grid. Noncontiguous figure (row)")
	        sys.exit(1)

	    # the column extent is read off the first row that holds the figure
	    col_hits = [c for c, cell in enumerate(layout[row_hits[0]]) if cell == fignum]
	    from_col, to_col = min(col_hits), max(col_hits)
	    if to_col - from_col + 1 != len(col_hits):
	        LOG(sname, "ERROR: invalid layout grid. Noncontiguous figure (col)")
	        sys.exit(1)

	    # every row of the span must hold the figure in exactly those columns
	    for r in row_hits:
	        for c, cell in enumerate(layout[r]):
	            inside = from_col <= c <= to_col
	            if inside != (cell == fignum):
	                LOG(sname, "ERROR: invalid layout grid. Nongrid figure")
	                sys.exit(1)
	    return from_row, to_row, from_col, to_col

	def MakeFigure(figcode, layout, figpath, size=None, gl={}, pngs=[]):
	    """
	    Main function to process figures.
	    Make subplots on layout. Save to figpath
	    Inputs:
	    figcode ([[string]]): list of list of lines of code to execute for each figure
	    layout (string): layout format string, e.g. "(1,1),(2,3)"
	    figpath ([string]): paths to save the figure to; one file is written per path
	    size (int,int): width/height in inches. If None, the figure size is
	      derived from LETTERSIZE and it is saved at 500 dpi; otherwise the
	      figure is not resized and dpi is set to 400/size[0]
	    gl: dictionary of global variables (from globals())
	    pngs [int]: 0-based positions in figcode of the figures to make as
	      pngs (because they're too big otherwise)

	    Logs an error and exits with status 1 if the layout is not a
	    rectangular grid or does not contain every number 1..len(figcode).
	    """
	    # parse layout: "(1,2),(3,4)" -> [[1, 2], [3, 4]]
	    layout = [[int(cell) for cell in item.strip(",").split(",")]
	              for item in layout.replace("(","").split(")")[:-1]]
	    # check
	    numrows = len(layout)
	    numcols = [len(item) for item in layout]
	    if not numcols or numcols.count(numcols[0]) != len(numcols):
	        LOG(sname, "ERROR: invalid layout grid")
	        sys.exit(1)
	    numcols = numcols[0]
	    lf = set(itertools.chain.from_iterable(layout))
	    for i in range(1, len(figcode)+1):
	        if i not in lf:
	            LOG(sname, "ERROR: not enough positions specified in layout")
	            sys.exit(1)

	    # set up figure
	    plt.clf()
	    fig = plt.figure(1)
	    grid_width = 1.0/numcols
	    grid_height = grid_width # make them square
	    # fraction of a grid cell used by the plot itself
	    col_scale = 1
	    row_scale = 1
	    if numrows == 2 or numcols == 2:
	        row_scale = 0.8
	        col_scale = 0.8
	    if numrows == 3: row_scale = 0.7
	    if numcols == 3: col_scale = 0.7
	    fignum = 1
	    for i in range(len(figcode)):
	        figletter = NumberToLetter[fignum-1]
	        # get span
	        from_row, to_row, from_col, to_col = GetFigureSpan(layout, fignum)
	        # get letter label (only for multi-panel figures, in the top-left cell of the span)
	        if len(figcode) > 1 and len(figcode[i]) > 0:
	            ax = fig.add_axes([from_col*grid_width, 1-(from_row+1)*grid_height, grid_width, grid_height])
	            ax.set_axis_off()
	            ax.set_ylim(bottom=0, top=1)
	            ax.text(0,0.8,figletter, size=20, weight="bold")
	        colspan = (to_col-from_col+1)
	        rowspan = (to_row-from_row+1)
	        # full cells for all but one of the spanned cells, a scaled cell for the last
	        w = grid_width*(colspan-1)+grid_width*col_scale
	        h = grid_height*(rowspan-1)+grid_height*row_scale
	        ax = fig.add_axes([from_col*grid_width+(1-col_scale)*0.7*grid_width, 1-(to_row+1)*grid_height, w, h])
	        # drop notebook lines that would create their own figure/axes or
	        # resize the figure; the code must draw on the "ax" set up above
	        newcode = ""
	        for codeline in figcode[i]:
	            if "fig =" not in codeline and "fig=" not in codeline and \
	               "ax =" not in codeline and \
	               "set_size_inches" not in codeline:
	                newcode = newcode + codeline
	        if i in pngs:
	            fname = "/tmp/%s.png"%(random.randint(0,1000000))
	            # Make a new figure, which we'll save to png (only the non-axis part)
	            addcodelines = [
	                "ax_old = ax", # keep track of old axes
	                "fig2 = plt.figure(2)", # new figure
	                "ax = fig2.add_axes([0,0,w,h])", # new axes
	            ]
	            aftercodelines = [
	                "xticklabels = [t.get_text() for t in ax.get_xticklabels()]",
	                "yticklabels = [t.get_text() for t in ax.get_yticklabels()]",
	                "if xticklabels[0] == \"\": xticklabels = ax.get_xticks()",
	                "if yticklabels[0] == \"\": yticklabels = ax.get_yticks()",
	                "ax.set_axis_off()",
	                "ax.get_xaxis().set_visible(False)",
	                "ax.get_yaxis().set_visible(False)",
	                "plt.savefig(\"%s\", bbox_inches=\"tight\", pad_inches=0, dpi=500)"%fname, # save as png
	                "plt.close(2)",
	                "plt.figure(1)", # get back to figure 1
	                "ax_png = ax",
	                "ax = ax_old", # get back to the axis we want to plot
	                "img = mpimg.imread(\"%s\")"%fname,
	                "ax.imshow(img, extent=[0,1.1,0,1.1], interpolation=\"nearest\", aspect=\"equal\")",
	                # set the axis to how it should be
	                "ax.set_xlabel(ax_png.get_xlabel())",
	                "ax.set_ylabel(ax_png.get_ylabel())",
	                "ax.set_xticks(ScaleToAxis(ax_png.get_xticks(), ax_png.get_xlim(), ax.get_xlim()))",
	                "ax.set_yticks(ScaleToAxis(ax_png.get_yticks(), ax_png.get_ylim(), ax.get_ylim()))",
	                "ax.set_xticklabels(xticklabels, size=12)",
	                "ax.set_yticklabels(yticklabels, size=12)",
	            ]
	            newcode = "\n".join(addcodelines) + "\n" + newcode + "\n" + "\n".join(aftercodelines)
	        if len(newcode) > 0:
	            # NOTE: this runs the notebook's code as-is; it sees gl as its
	            # globals and this function's local variables (ax, w, h, ...)
	            newcode_comp = compile(newcode, "<string>", "exec")
	            exec(newcode_comp, gl, locals())
	            # NOTE(review): the layout number only advances after a
	            # non-empty figure, so the figure following an empty entry is
	            # looked up under the same layout number - confirm this is intended
	            fignum = fignum + 1
	        else: ax.set_axis_off()
	    # set size
	    if size is None:
	        size = LETTERSIZE
	        pad = 0.42
	        fig.set_size_inches((size[0]-pad, (size[0]-pad)*numcols*1.0/numrows))
	        dpi = 500
	    else:
	        xPix = 400
	        dpi = xPix/size[0]
	    for p in figpath:
	        plt.savefig(p, bbox_inches="tight", pad_inches=0, dpi=dpi)
	        # if pdf and size is letter, change the paper size
	        if ".pdf" in p and size == LETTERSIZE:
	            tmp_path = p+".tmp"
	            # the input stays open until the page has been written out
	            with open(p,"rb") as pdf_in:
	                pr = PyPDF2.PdfFileReader(pdf_in)
	                page1 = pr.pages[0]
	                # extend the paper to letter size
	                mbox = page1.mediaBox
	                newh = (float(mbox[2])*LETTERSIZE[1]/LETTERSIZE[0])
	                deltaH = newh - float(mbox[3])
	                page1.mediaBox = PyPDF2.generic.RectangleObject([0,-1*deltaH,mbox[2],mbox[3]])
	                # write it
	                wr = PyPDF2.PdfFileWriter()
	                wr.addPage(page1)
	                with open(tmp_path,"wb") as pdf_out:
	                    wr.write(pdf_out)
	            # replace the original file; no shell involved, so paths with
	            # spaces or shell metacharacters are handled correctly
	            os.rename(tmp_path, p)


	def ProcessFigure(figdata, figpath, FigureToCode, FigureToLegend, size=None, gl={}):
	    """
	    Process a figure and return the legend
	    Input:
	    figdata: item from "Figures" list in figlist; its "FigureTitle",
	      "SubFigures" and "Layout" entries are read
	    figpath ([string]): paths to save figure to (passed through to MakeFigure)
	    FigureToCode (dict:string->[string]): code for each subfigure
	    FigureToLegend (dict:string->string): legend for each subfigure
	    size: (int,int): width/height of the figure in inches. If None, use letter size
	    gl: dictionary of global variables, from calling globals()
	    Return:
	    legend [(string, format)] formatted using docx style: the bold title,
	      then for each subfigure a bold "<letter>. " (only when there is more
	      than one subfigure) followed by its legend text
	    """
	    LOG(sname, " %s"%figdata["FigureTitle"])
	    layout = figdata["Layout"]
	    figcode = []
	    legends = []
	    pngs = []
	    for position, figname in enumerate(figdata["SubFigures"]):
	        # record the position itself: looking the name up in the list would
	        # return the first match when a subfigure is used more than once
	        if ":png" in figname:
	            pngs.append(position)
	        figname = figname.split(":")[0]
	        # subfigures without code or legend fall back to "" / "No legend"
	        figcode.append(FigureToCode.get(figname, ""))
	        legends.append(FigureToLegend.get(figname, "No legend"))
	    legend_text = [(figdata["FigureTitle"] + ". ", 'b')]
	    for position, item in enumerate(legends):
	        if len(legends) > 1:
	            legend_text.append((NumberToLetter[position]+". ",'b'))
	        legend_text.append(item+" ")
	    MakeFigure(figcode, layout, figpath, size=size, gl=gl, pngs=pngs)
	    return legend_text

	def ConvertToString(val):
	    """
	    Convert values to strings for table
	    Input:
	    val (object)
	    Return:
	    string: val formatted with 2 significant digits ("{:.2g}") when it can
	      be converted to a float, otherwise str(val)
	    """
	    try:
	        return "{:.2g}".format(float(val))
	    except (TypeError, ValueError, OverflowError):
	        # not a number (or too large for a float): use the plain string form
	        return str(val)

	def MakeTable(tablecode, gl={}):
	    """
	    Main function to process tables
	    Input:
	    tablecode [string]: lines of code to create table. All lines are
	      executed, then the last line is evaluated on its own as an
	      expression, which should give a pandas DataFrame
	    gl: global variables from calling globals()
	    Return:
	    [[string]]: list of rows for the table, will be processed by docx to make table.
	      The first row holds the column names. [] if tablecode is empty.
	    """
	    if not tablecode:
	        return []
	    # NOTE: this runs the notebook's code as-is.
	    # One explicit namespace is shared by exec and eval so that names
	    # assigned by the table code are visible when the last line is evaluated.
	    scope = {}
	    comp = compile("".join(tablecode), "<string>", "exec")
	    exec(comp, gl, scope)
	    df = eval(tablecode[-1].strip(), gl, scope)
	    df_list = [list(df.columns)]
	    for i in range(df.shape[0]):
	        df_list.append([ConvertToString(value) for value in df.iloc[i,:]])
	    return df_list

	def ProcessTable(tabledata, FigureToCode, FigureToLegend, gl={}):
	    """
	    Process a table and return the legend
	    Input:
	    tabledata: item from "Tables" list in figlist; its "TableTitle" and
	      "Table" entries are read
	    FigureToCode (dict:string->[string]): code for each table
	    FigureToLegend (dict:string->string): legend for each table
	    gl: dictionary of global variables, from calling globals()
	    Return:
	    table [[string]]: list of rows for the table, will be processed by docx to make table.
	      [] if there is no code for the table
	    legend [(string, format)] formatted using docx style: the bold title
	      followed by the legend text ("" if the table has no legend)
	    """
	    LOG(sname, " %s"%tabledata["TableTitle"])
	    table_name = tabledata["Table"]
	    legend = [(tabledata["TableTitle"] + ". ", 'b'), (FigureToLegend.get(table_name,""))]
	    # a table without code gives an empty table, not a KeyError
	    tablecode = FigureToCode.get(table_name, [])
	    if tablecode:
	        rows = MakeTable(tablecode, gl=gl)
	    else:
	        rows = []
	    return rows, legend

	########################################

	# Set up MS word stuff
	title = "Figures"
	subject = "Figures"
	creator = 'Melissa Gymrek'
	keywords = []
	coreprops = coreproperties(title=title, subject=subject, creator=creator,
	                           keywords=keywords)
	appprops = appproperties()
	# NOTE: the next two lines rebind the names to the results of the calls
	contenttypes = contenttypes()
	websettings = websettings()

	# Load figlist
	if VERBOSE: LOG(sname, "Parsing figlist")
	figlist = pd.read_json(FIGLIST_FILE)

	# Load code and legend for each figure from Ipython notebooks
	if VERBOSE: LOG(sname, "Parsing ipython notebokos")
	FigureToCode = {}
	FigureToLegend = {}
	SupportingCode = []
	for nbfile in NB_FILES:
	    a,b,c = ParseNB(nbfile)
	    # later notebooks overwrite entries with the same name
	    FigureToCode.update(a)
	    FigureToLegend.update(b)
	    SupportingCode.extend(c)

	# Check that we have everything we need (code and legends for all figures)
	# Missing entries are only reported; processing continues
	all_figure_names = GetAllFigureNames(figlist)
	for fig in all_figure_names:
	    if fig not in FigureToCode:
	        LOG(sname, "WARNING: Figure %s has no code"%(fig))
	    if fig not in FigureToLegend:
	        LOG(sname, "WARNING: Figure %s has no legend"%(fig))
	all_table_names = GetAllTableNames(figlist)
	for tab in all_table_names:
	    if tab not in FigureToCode:
	        LOG(sname, "WARNING: Table %s has no code"%(tab))
	    if tab not in FigureToLegend:
	        LOG(sname, "WARNING: Table %s has no legend"%(tab))

	# Run supporting code
	# NOTE: this executes the notebooks' code as-is in this module's namespace,
	# so the figure/table code run later (with gl=globals()) can see its results
	if VERBOSE: LOG(sname, "Executing supporting code")
	for cell in SupportingCode:
	    # skip lines starting with "%"; startswith also copes with an empty line
	    newcell = [line for line in cell if not line.startswith("%")]
	    code_comp = compile("".join(newcell), "<string>", "exec")
	    exec(code_comp)

	# Process Main figures
	if VERBOSE: LOG(sname, "Process main figures")
	main_figs = figlist.MainText["Figures"]
	main_tables = figlist.MainText["Tables"]
	relationships = relationshiplist()
	document = newdocument()
	body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
	for mf in main_figs:
	    # the figure itself goes to its own pdf; only the legend goes in the docx
	    legend_text = ProcessFigure(mf, ["%s.%s.pdf"%(OUT_PREFIX, mf["FigureName"])], FigureToCode, FigureToLegend, gl=globals())
	    body.append(heading(mf["FigureName"],2))
	    body.append(paragraph(legend_text))

	# Process Main Tables
	if VERBOSE: LOG(sname, "Process main tables")
	if len(main_tables) > 0:
	    body.append(pagebreak(type="page", orient="portrait"))
	tablenum = 1
	for mt in main_tables:
	    tbl, legend = ProcessTable(mt, FigureToCode, FigureToLegend, gl=globals())
	    if tbl != []:
	        body.append(heading("Table %s"%tablenum, 1))
	        body.append(table(tbl))
	        body.append(paragraph(legend))
	        if mt != main_tables[-1]:
	            body.append(pagebreak(type="page", orient="portrait"))
	    tablenum = tablenum + 1

	wr = wordrelationships(relationships)
	savedocx(document, coreprops, appprops, contenttypes, websettings,
	         wr, "%s.maintext_legends_and_tables.docx"%OUT_PREFIX)

	# Process Supplemental figures
	if VERBOSE: LOG(sname, "Process supplemental figures")
	relationships = relationshiplist()
	document = newdocument()
	body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
	try:
	    os.mkdir("%s_supp_pdfs"%OUT_PREFIX)
	except OSError: pass # typically: the directory already exists
	supp_figs = figlist.Supplemental["Figures"]
	supp_tables = figlist.Supplemental["Tables"]
	fignum = 1
	for sf in supp_figs:
	    # NOTE(review): figpath_pdf is computed but not passed to ProcessFigure,
	    # so nothing is written to the _supp_pdfs directory here - confirm intent
	    figpath_pdf = "%s_supp_pdfs/%s.pdf"%(OUT_PREFIX, sf["FigureName"])
	    figpath_png = "%s.png"%(sf["FigureName"])
	    legend_text = ProcessFigure(sf, [figpath_png], FigureToCode, FigureToLegend, size=(8,4), gl=globals())
	    relationships, picpara = picture(relationships, figpath_png, sf["FigureName"])
	    body.append(heading("Supplemental Figure %s"%fignum, 1))
	    body.append(picpara)
	    body.append(paragraph(legend_text))
	    # one figure per page; break after the last figure only if tables follow
	    if (sf != supp_figs[-1]) or len(supp_tables) > 0:
	        body.append(pagebreak(type='page', orient='portrait'))
	    fignum = fignum + 1
	    # remove the intermediate png without going through a shell
	    try:
	        os.remove(figpath_png)
	    except OSError:
	        LOG(sname, "WARNING: could not remove %s"%figpath_png)

	# Process Supplemental tables
	if VERBOSE: LOG(sname, "Process supplemental tables")
	tablenum = 1
	for st in supp_tables:
	    tbl, legend = ProcessTable(st, FigureToCode, FigureToLegend, gl=globals())
	    if tbl != []:
	        body.append(heading("Supplemental Table %s"%tablenum, 1))
	        body.append(table(tbl))
	        body.append(paragraph(legend))
	        if st != supp_tables[-1]:
	            body.append(pagebreak(type='page', orient='portrait'))
	    tablenum = tablenum + 1

	wr = wordrelationships(relationships)
	savedocx(document, coreprops, appprops, contenttypes, websettings,
	         wr, "%s.supplemental_figures_and_tables.docx"%OUT_PREFIX)

	LOG(sname, "Done!")
	{
	"MainText": {
	"Figures": [
	{
	"FigureName": "Figure1",
	"FigureTitle": "TestFigure1",
	"SubFigures": [
	"fig1",
	"fig2",
	"fig1",
	"fig2"
	],
	"Layout": "(1,2),(3,4)"
	}
	],
	"Tables": []
	},
	"Supplemental": {
	"Figures": [
	{
	"FigureName": "SuppFig1",
	"FigureTitle": "SuppFig1Test",
	"SubFigures": [
	"fig2",
	"fig1"
	],
	"Layout": "(1,2)"
	},
	{
	"FigureName": "SuppFig2",
	"FigureTitle": "SuppFig2Test",
	"SubFigures": [
	"fig1"
	],
	"Layout": "(1)"
	}
	],
	"Tables": [
	{
	"TableName": "SuppTable1",
	"TableTitle": "Testing tables",
	"Table": "test-table"
	}
	]
	}
	}