# rldotai/latex_table_maker.py

## latex_table_maker.py
#!python3
"""
A script to read a document containing tables that I left as "verbatim" and
format them properly in LaTeX.

We go from this:

\begin{verbatim}
          sparsity   num_active     accuracy  test_accuracy  train_accuracy
count  4000.000000  4000.000000  4000.000000    4000.000000            4000
mean      0.984240   124.657750     0.957216       0.829739               1
std       0.000943     7.460431     0.020602       0.079698               0
min       0.981290    94.000000     0.864583       0.500000               1
25%       0.983565   119.000000     0.947917       0.772727               1
50%       0.984197   125.000000     0.958333       0.833333               1
75%       0.984956   130.000000     0.968750       0.875000               1
max       0.988116   148.000000     1.000000       1.000000               1
\end{verbatim}

to this:

\begin{tabular}{lrrrrr}
\toprule
{} & sparsity & num\_active & accuracy & test\_accuracy & train\_accuracy \\
\midrule
mean &   0.9842 &   124.6577 &   0.9572 &        0.8297 &         1.0000 \\
std  &   0.0009 &     7.4604 &   0.0206 &        0.0797 &         0.0000 \\
min  &   0.9813 &    94.0000 &   0.8646 &        0.5000 &         1.0000 \\
25\%  &   0.9836 &   119.0000 &   0.9479 &        0.7727 &         1.0000 \\
50\%  &   0.9842 &   125.0000 &   0.9583 &        0.8333 &         1.0000 \\
75\%  &   0.9850 &   130.0000 &   0.9688 &        0.8750 &         1.0000 \\
max  &   0.9881 &   148.0000 &   1.0000 &        1.0000 &         1.0000 \\
\bottomrule
\end{tabular}
"""
import pandas as pd
import sys

def read_table(text, separator=None, linebreak='\n', header=True, index=True):
    """Parse a delimited plain-text table into a DataFrame of strings.

    Args:
        text: The whole table as one string, one row per `linebreak`.
        separator: Passed to `str.split` for every row; `None` splits on
            runs of whitespace.
        linebreak: String that separates rows in `text`.
        header: If true, the first non-blank row supplies the column names.
        index: If true, the first cell of every data row is removed from the
            row and used as that row's index label.

    Returns:
        A `pandas.DataFrame` whose cells are the stripped strings found in
        `text`; no numeric conversion happens here.
    """
    def parse_row(line):
        return [cell.strip() for cell in line.split(separator)]

    # Blank lines (a trailing newline, a spacer between rows) hold no cells.
    # Skip them: an empty row would make the index `pop(0)` below raise
    # IndexError.
    rows = [line for line in text.split(linebreak) if line.strip()]

    # name the columns according to the first row, if desired
    cols = parse_row(rows.pop(0)) if header and rows else None

    # parse each remaining row
    data = [parse_row(line) for line in rows]

    # if we want to keep the index separate, peel it off the front of each row
    indices = [cells.pop(0) for cells in data] if index else None

    # convert to a pandas dataframe
    return pd.DataFrame(data, columns=cols, index=indices)

def my_df_format(df):
    """Quick and dirty formatting of a dataframe. Adapt as necessary."""
    # sparsity num_active accuracy test_accuracy, train_accuracy
    df = df.copy()
    df = df.drop(['count'])
    dtypes = {
        'sparsity': float,
        'num_active': float,
        'accuracy': float,
        'test_accuracy': float,
        'train_accuracy': float,
    }
    for k, v in dtypes.items():
        df[k] = df[k].astype(v)

    formats = {
        'sparsity': '{:.4f}'.format,
        'num_active': '{:.4f}'.format,
        'accuracy': '{:.4f}'.format,
        'test_accuracy': '{:.4f}'.format,
        'train_accuracy': '{:.4f}'.format,
    }
    # print(df.to_latex(formatters=formats))
    return df.to_latex(formatters=formats)

def format_document(filename):
    r"""Process the document; a bit of a kludge.

    The file is copied through line by line, except that the lines between a
    line containing ``\begin{verbatim}`` and the next line containing
    ``\end{verbatim}`` are parsed with `read_table` and replaced (both marker
    lines included) by the output of `my_df_format`.

    A verbatim block that is still open at end of file is written back
    unchanged rather than being dropped.

    Args:
        filename: Path of the LaTeX document to read.

    Returns:
        The converted document as a single string.

    Raises:
        ValueError: If ``\begin{verbatim}`` shows up while a verbatim block
            is already open.
    """
    ret = []
    begin_line = None  # marker line of the block being collected, if any
    tmp = []           # body lines of the block being collected

    with open(filename, 'r') as f:
        for line in f:
            if r'\begin{verbatim}' in line:
                if begin_line is not None:
                    raise ValueError('Looks like something went wrong')
                begin_line = line
                tmp = []
            elif begin_line is not None:
                if r'\end{verbatim}' in line:
                    txt = ''.join(tmp).strip()
                    df = read_table(txt)
                    ret.append(my_df_format(df))
                    begin_line = None
                else:
                    tmp.append(line)
            else:
                ret.append(line)

    # An unterminated block cannot be converted; emit it untouched so that no
    # input text is lost.
    if begin_line is not None:
        ret.append(begin_line)
        ret.extend(tmp)
    return ''.join(ret)


# Get the name of the file to operate on, and optionally an output
if __name__ == "__main__":
    # Usage: latex_table_maker.py INPUT [OUTPUT]
    if len(sys.argv) < 2:
        sys.exit('usage: latex_table_maker.py INPUT [OUTPUT]')
    txt = format_document(sys.argv[1])
    if len(sys.argv) > 2:
        # context manager guarantees the output is flushed and closed
        with open(sys.argv[2], 'w') as out:
            out.write(txt)
    else:
        print(txt)
	#!python3
	"""
	A script to read a document containing tables that I left as "verbatim" and
	format them properly in LaTeX.

	We go from this:

	\begin{verbatim}
	sparsity num_active accuracy test_accuracy train_accuracy
	count 4000.000000 4000.000000 4000.000000 4000.000000 4000
	mean 0.984240 124.657750 0.957216 0.829739 1
	std 0.000943 7.460431 0.020602 0.079698 0
	min 0.981290 94.000000 0.864583 0.500000 1
	25% 0.983565 119.000000 0.947917 0.772727 1
	50% 0.984197 125.000000 0.958333 0.833333 1
	75% 0.984956 130.000000 0.968750 0.875000 1
	max 0.988116 148.000000 1.000000 1.000000 1
	\end{verbatim}

	to this:

	\begin{tabular}{lrrrrr}
	\toprule
	{} & sparsity & num\_active & accuracy & test\_accuracy & train\_accuracy \\
	\midrule
	mean & 0.9842 & 124.6577 & 0.9572 & 0.8297 & 1.0000 \\
	std & 0.0009 & 7.4604 & 0.0206 & 0.0797 & 0.0000 \\
	min & 0.9813 & 94.0000 & 0.8646 & 0.5000 & 1.0000 \\
	25\% & 0.9836 & 119.0000 & 0.9479 & 0.7727 & 1.0000 \\
	50\% & 0.9842 & 125.0000 & 0.9583 & 0.8333 & 1.0000 \\
	75\% & 0.9850 & 130.0000 & 0.9688 & 0.8750 & 1.0000 \\
	max & 0.9881 & 148.0000 & 1.0000 & 1.0000 & 1.0000 \\
	\bottomrule
	\end{tabular}
	"""
	import pandas as pd
	import sys

	def read_table(text, separator=None, linebreak='\n', header=True, index=True):
	    """Parse a delimited plain-text table into a DataFrame of strings.

	    Args:
	        text: The whole table as one string, one row per `linebreak`.
	        separator: Passed to `str.split` for every row; `None` splits on
	            runs of whitespace.
	        linebreak: String that separates rows in `text`.
	        header: If true, the first non-blank row supplies the column names.
	        index: If true, the first cell of every data row is removed from
	            the row and used as that row's index label.

	    Returns:
	        A `pandas.DataFrame` whose cells are the stripped strings found in
	        `text`; no numeric conversion happens here.
	    """
	    def parse_row(line):
	        return [cell.strip() for cell in line.split(separator)]

	    # Blank lines hold no cells. Skip them: an empty row would make the
	    # index `pop(0)` below raise IndexError.
	    rows = [line for line in text.split(linebreak) if line.strip()]

	    # name the columns according to the first row, if desired
	    cols = parse_row(rows.pop(0)) if header and rows else None

	    # parse each remaining row
	    data = [parse_row(line) for line in rows]

	    # if we want to keep the index separate, peel it off each row
	    indices = [cells.pop(0) for cells in data] if index else None

	    # convert to a pandas dataframe
	    return pd.DataFrame(data, columns=cols, index=indices)

	def my_df_format(df):
	"""Quick and dirty formatting of a dataframe. Adapt as necessary."""
	# sparsity num_active accuracy test_accuracy, train_accuracy
	df = df.copy()
	df = df.drop(['count'])
	dtypes = {
	'sparsity': float,
	'num_active': float,
	'accuracy': float,
	'test_accuracy': float,
	'train_accuracy': float,
	}
	for k, v in dtypes.items():
	df[k] = df[k].astype(v)

	formats = {
	'sparsity': '{:.4f}'.format,
	'num_active': '{:.4f}'.format,
	'accuracy': '{:.4f}'.format,
	'test_accuracy': '{:.4f}'.format,
	'train_accuracy': '{:.4f}'.format,
	}
	# print(df.to_latex(formatters=formats))
	return df.to_latex(formatters=formats)

	def format_document(filename):
	    r"""Process the document; a bit of a kludge.

	    The file is copied through line by line, except that the lines between
	    a line containing ``\begin{verbatim}`` and the next line containing
	    ``\end{verbatim}`` are parsed with `read_table` and replaced (both
	    marker lines included) by the output of `my_df_format`.

	    A verbatim block that is still open at end of file is written back
	    unchanged rather than being dropped.

	    Args:
	        filename: Path of the LaTeX document to read.

	    Returns:
	        The converted document as a single string.

	    Raises:
	        ValueError: If ``\begin{verbatim}`` shows up while a verbatim
	            block is already open.
	    """
	    ret = []
	    begin_line = None  # marker line of the block being collected, if any
	    tmp = []           # body lines of the block being collected

	    with open(filename, 'r') as f:
	        for line in f:
	            if r'\begin{verbatim}' in line:
	                if begin_line is not None:
	                    raise ValueError('Looks like something went wrong')
	                begin_line = line
	                tmp = []
	            elif begin_line is not None:
	                if r'\end{verbatim}' in line:
	                    txt = ''.join(tmp).strip()
	                    df = read_table(txt)
	                    ret.append(my_df_format(df))
	                    begin_line = None
	                else:
	                    tmp.append(line)
	            else:
	                ret.append(line)

	    # An unterminated block cannot be converted; emit it untouched so that
	    # no input text is lost.
	    if begin_line is not None:
	        ret.append(begin_line)
	        ret.extend(tmp)
	    return ''.join(ret)


	# Get the name of the file to operate on, and optionally an output
	if __name__ == "__main__":
	    # Usage: latex_table_maker.py INPUT [OUTPUT]
	    if len(sys.argv) < 2:
	        sys.exit('usage: latex_table_maker.py INPUT [OUTPUT]')
	    txt = format_document(sys.argv[1])
	    if len(sys.argv) > 2:
	        # context manager guarantees the output is flushed and closed
	        with open(sys.argv[2], 'w') as out:
	            out.write(txt)
	    else:
	        print(txt)