# windweller/latex_to_csv.py

## latex_to_csv.py
"""
Convert the rows of a LaTeX table into a CSV file.
"""
import csv
import re

# Fragments of a nested single-column tabular wrapper that remain in a cell once
# its backslashes have been removed.  "\x08egin" is what "\begin" becomes when
# the LaTeX is pasted into a regular (non-raw) Python string, because "\b" is
# the backspace escape; the plain "begin" form covers raw / escaped input.
_TABULAR_LEFTOVERS = (
    "\x08egin{tabular}{@{}l@{}}",
    "begin{tabular}{@{}l@{}}",
    "end{tabular}",
)


def _clean_cell(cell):
    """Return one table cell with the LaTeX markup handled here removed."""
    # Removing every backslash also turns "\%" into "%" and drops the row
    # terminator, so no separate handling of those is needed afterwards.
    cell = cell.strip().replace("\\", "").replace("~~", "  ")
    for leftover in _TABULAR_LEFTOVERS:
        cell = cell.replace(leftover, "")
    # A lone " ~ " is the continuation indent inside a nested tabular.
    return cell.replace(" ~ ", "").strip()


def to_csv(latex_text, file_name):
    """Extract the rows of a LaTeX table and write them to a CSV file.

    Every line of the input that contains '&' is treated as one table row and
    split on '&' into cells; lines without '&' are ignored.  Each cell has its
    backslashes, '~~' indents and nested single-column tabular wrappers
    removed and is stripped of surrounding whitespace.

    Known limitations: anything that follows the row terminator on the same
    line (a rule command, a '%' comment) stays in the last cell with its
    backslashes removed, and a trailing '&' produces an empty final cell.

    Arguments:
        latex_text {str} -- LaTeX source of the table
        file_name {str} -- path of the CSV file to write (overwritten if present)
    """
    rows = [
        [_clean_cell(cell) for cell in line.strip().split('&')]
        for line in latex_text.split("\n")
        if '&' in line
    ]

    # The csv module requires newline='' so that it controls line endings
    # itself; otherwise rows end in "\r\r\n" on platforms that translate "\n".
    with open(file_name, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

if __name__ == "__main__":
    # Script entry point: convert the two paper tables below into CSV files in
    # the current working directory.
    #
    # NOTE: both literals are regular (non-raw) strings, so Python turns "\b"
    # into a backspace and "\t" into a tab before to_csv sees the text.

    # Table 1: demographics of ACSC-related vs. non-ACSC-related visits.
    demographics_latex = """{\begin{tabular}{@{}llll@{}}
			\toprule
			& ACSC-related Visits & Non-ACSC-related Visits  \\
			Predictor Variables & (n=823,759) & (n=1,926,289) \\ \colrule % (n=2,711,839)
			Age, mean (SD) & 59.91 (17.99) & 39.66 (23.03)  \\
			Race, num (\%) & & \\
                ~~ White & 615237 (74.69\%) & 1319754 (68.51\%)  \\
                ~~ \begin{tabular}{@{}l@{}}Black or \\ ~ African American \end{tabular} & 61785 (7.5\%) & 110755 (5.75\%) & \\
                ~~ Vietnamese & 2063 (0.25\%) & 2860 (0.15\%) & \\
                ~~ \begin{tabular}{@{}l@{}}American Indian or \\ ~ Alaska Native \end{tabular} & 2054 (0.25\%) & 6148 (0.32\%) & \\
                ~~ Filipino & 1001 (0.12\%) & 1249 (0.06\%) & \\
                ~~ \begin{tabular}{@{}l@{}}Native Hawaiian or \\ ~ Other Pacific Islander \end{tabular} & 949 (0.12\%) & 2804 (0.15\%) & \\
                ~~ Japanese & 678 (0.08\%) & 719 (0.04\%) & \\
                ~~ Other Pacific Islander & 543 (0.07\%) & 1537 (0.08\%) & \\
                ~~ Chinese & 198 (0.02\%) & 780 (0.04\%) & \\
                ~~ Native Hawaiian & 147 (0.02\%) & 592 (0.03\%) & \\
                ~~ Korean & 31 (0.0\%) & 80 (0.0\%) & \\
                ~~ Asian Indian & 24 (0.0\%) & 25 (0.0\%) & \\
                ~~ Other & $<11$ (0.0\%) & $<11$ (0.0\%) & \\
			~~ Missing & 139,045 (16.9\%) & 478,976 (24.9\%) \\
			Ethnicity, num (\%) & & \\
                ~~ Non-Hispanic or Latino & 580099 (70.42\%) & 1218670 (63.27\%) & \\
                ~~ Hispanic or Latino & 63875 (7.75\%) & 195740 (10.16\%) & \\
                ~~ Mexican & 2447 (0.3\%) & 3838 (0.2\%) & \\
                ~~ Central American & 166 (0.02\%) & 242 (0.01\%) & \\
                ~~ Puerto Rican & 71 (0.01\%) & 159 (0.01\%) & \\
                ~~ Cuban & 23 (0.0\%) & 52 (0.0\%) & \\
                ~~ Other & 14 (0.0\%) & 42 (0.0\%) & \\
            ~~ Missing & 177,064 (21.5\%) & 507,546 (26.3\%) \\
            Tobacco, num (\%) & & \\
            ~~ Never Smoked & 311,761 (70.46\%) & 826,680 (76.36\%) & \\
            ~~ Prior History & 78,799 (17.81\%) & 146,505 (13.53\%) & \\
            ~~ Active Smoker & 51,876 (11.73\%) & 109,393 (10.1\%) & \\
            ~~ Missing & 289,938 (35.2\%) & 925,961 (48.1\%) \\
			\botrule
	\end{tabular}}
    """

    # Table 2: socioeconomic-status characteristics, mean (standard deviation).
    ses_latex = """\begin{table}[h]
	\tbl{Socioeconomics Status (SES) Characteristics of the Patients. We report mean (standard deviation) in the table. SD computed over each individual.}
	{\begin{tabular}{@{}lcc@{}}
			\toprule
			& ACSC-related Visits & Non-ACSC-related Visits \\
			Predictor Variables & (n=648,041) & (n=2,063,798) \\ \colrule
\begin{tabular}{@{}l@{}} The median household income \end{tabular} & 54,548.71 (20175.04) & 56,810.59 (21913.58) \\
\begin{tabular}{@{}l@{}} Frac of high needs population \end{tabular} & 0.38 (0.11) & 0.37 (0.12) \\
\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ renter occupied units \end{tabular} & 0.3 (0.15) & 0.3 (0.16) \\
\begin{tabular}{@{}l@{}} Frac of households with children \\ ~~ and a single parent \end{tabular} & 0.16 (0.08) & 0.16 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of families with incomes $<$ 100\% \\ ~~ of the Federal Poverty Level (FPL) \end{tabular} & 0.15 (0.09) & 0.14 (0.09) \\
\begin{tabular}{@{}l@{}} Frac of African American population \end{tabular} & 0.1 (0.15) & 0.08 (0.14) \\
\begin{tabular}{@{}l@{}} Frac of Hispanic population \end{tabular} & 0.15 (0.21) & 0.16 (0.22) \\
\begin{tabular}{@{}l@{}} Frac of households receiving public assistance \end{tabular} & 0.14 (0.08) & 0.13 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of population with no health \\ ~~ insurance coverage \end{tabular} & 0.14 (0.06) & 0.13 (0.06) \\
\begin{tabular}{@{}l@{}} Frac of people age 25 or older \\ ~~ who have no high school degree \end{tabular} & 0.14 (0.1) & 0.13 (0.1) \\
\begin{tabular}{@{}l@{}} Frac of houses that are vacant \end{tabular} & 0.13 (0.08) & 0.12 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of adults who are unemployed \end{tabular} & 0.08 (0.04) & 0.07 (0.04) \\
\begin{tabular}{@{}l@{}} Frac of population that are foreign born \end{tabular} & 0.08 (0.1) & 0.09 (0.1) \\
\begin{tabular}{@{}l@{}} Frac of household with no car \end{tabular} & 0.06 (0.05) & 0.05 (0.04) \\
\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ crowded housing units \end{tabular} & 0.03 (0.04) & 0.03 (0.05) \\
			\botrule
	\end{tabular}}\label{tab:ses-demo}
\end{table}
    """

    # Write the tables out in the same order as before.
    for latex_source, csv_path in (
        (demographics_latex, "Table1Demographics.csv"),
        (ses_latex, "Table2SES.csv"),
    ):
        to_csv(latex_source, csv_path)
	"""
	Convert the rows of a LaTeX table into a CSV file.
	"""
	import csv
	import re

	def to_csv(latex_text, file_name):
	    """Extract the rows of a LaTeX table and write them to a CSV file.

	    Every line of the input that contains '&' is treated as one table row
	    and split on '&' into cells; lines without '&' are ignored.  Each cell
	    has its backslashes, '~~' indents and nested single-column tabular
	    wrappers removed and is stripped of surrounding whitespace.

	    Known limitations: anything that follows the row terminator on the same
	    line (a rule command, a '%' comment) stays in the last cell with its
	    backslashes removed, and a trailing '&' produces an empty final cell.

	    Arguments:
	        latex_text {str} -- LaTeX source of the table
	        file_name {str} -- path of the CSV file to write (overwritten if present)
	    """
	    rows = []
	    for line in latex_text.split("\n"):
	        if '&' not in line:
	            # Rules, wrappers and captions carry no cell separator; skip them.
	            continue
	        row = []
	        for cell in line.strip().split('&'):
	            # Removing every backslash also turns "\%" into "%" and drops
	            # the row terminator.
	            cell = cell.strip().replace("\\", "").replace('~~', "  ")
	            # "\x08egin" is "\begin" after a non-raw string literal turned
	            # "\b" into a backspace; the plain form covers raw/escaped input.
	            cell = cell.replace("\x08egin{tabular}{@{}l@{}}", "")
	            cell = cell.replace("begin{tabular}{@{}l@{}}", "")
	            cell = cell.replace("end{tabular}", "")
	            # A lone " ~ " is the continuation indent inside a nested tabular.
	            cell = cell.replace(" ~ ", "")
	            row.append(cell.strip())
	        rows.append(row)

	    # The csv module requires newline='' so that it controls line endings
	    # itself; otherwise rows end in "\r\r\n" on platforms that translate "\n".
	    with open(file_name, 'w', newline='') as f:
	        csv.writer(f).writerows(rows)

	if __name__ == "__main__":
	    # Script entry point: convert the two paper tables below into CSV files
	    # in the current working directory.
	    #
	    # NOTE: both literals are regular (non-raw) strings, so Python turns
	    # "\b" into a backspace and "\t" into a tab before to_csv sees the text.

	    # Table 1: demographics of ACSC-related vs. non-ACSC-related visits.
	    latex_text = """{\begin{tabular}{@{}llll@{}}
	\toprule
	& ACSC-related Visits & Non-ACSC-related Visits \\
	Predictor Variables & (n=823,759) & (n=1,926,289) \\ \colrule % (n=2,711,839)
	Age, mean (SD) & 59.91 (17.99) & 39.66 (23.03) \\
	Race, num (\%) & & \\
	~~ White & 615237 (74.69\%) & 1319754 (68.51\%) \\
	~~ \begin{tabular}{@{}l@{}}Black or \\ ~ African American \end{tabular} & 61785 (7.5\%) & 110755 (5.75\%) & \\
	~~ Vietnamese & 2063 (0.25\%) & 2860 (0.15\%) & \\
	~~ \begin{tabular}{@{}l@{}}American Indian or \\ ~ Alaska Native \end{tabular} & 2054 (0.25\%) & 6148 (0.32\%) & \\
	~~ Filipino & 1001 (0.12\%) & 1249 (0.06\%) & \\
	~~ \begin{tabular}{@{}l@{}}Native Hawaiian or \\ ~ Other Pacific Islander \end{tabular} & 949 (0.12\%) & 2804 (0.15\%) & \\
	~~ Japanese & 678 (0.08\%) & 719 (0.04\%) & \\
	~~ Other Pacific Islander & 543 (0.07\%) & 1537 (0.08\%) & \\
	~~ Chinese & 198 (0.02\%) & 780 (0.04\%) & \\
	~~ Native Hawaiian & 147 (0.02\%) & 592 (0.03\%) & \\
	~~ Korean & 31 (0.0\%) & 80 (0.0\%) & \\
	~~ Asian Indian & 24 (0.0\%) & 25 (0.0\%) & \\
	~~ Other & $<11$ (0.0\%) & $<11$ (0.0\%) & \\
	~~ Missing & 139,045 (16.9\%) & 478,976 (24.9\%) \\
	Ethnicity, num (\%) & & \\
	~~ Non-Hispanic or Latino & 580099 (70.42\%) & 1218670 (63.27\%) & \\
	~~ Hispanic or Latino & 63875 (7.75\%) & 195740 (10.16\%) & \\
	~~ Mexican & 2447 (0.3\%) & 3838 (0.2\%) & \\
	~~ Central American & 166 (0.02\%) & 242 (0.01\%) & \\
	~~ Puerto Rican & 71 (0.01\%) & 159 (0.01\%) & \\
	~~ Cuban & 23 (0.0\%) & 52 (0.0\%) & \\
	~~ Other & 14 (0.0\%) & 42 (0.0\%) & \\
	~~ Missing & 177,064 (21.5\%) & 507,546 (26.3\%) \\
	Tobacco, num (\%) & & \\
	~~ Never Smoked & 311,761 (70.46\%) & 826,680 (76.36\%) & \\
	~~ Prior History & 78,799 (17.81\%) & 146,505 (13.53\%) & \\
	~~ Active Smoker & 51,876 (11.73\%) & 109,393 (10.1\%) & \\
	~~ Missing & 289,938 (35.2\%) & 925,961 (48.1\%) \\
	\botrule
	\end{tabular}}
	"""
	    to_csv(latex_text, "Table1Demographics.csv")

	    # Table 2: socioeconomic-status characteristics, mean (standard deviation).
	    table2 = """\begin{table}[h]
	\tbl{Socioeconomics Status (SES) Characteristics of the Patients. We report mean (standard deviation) in the table. SD computed over each individual.}
	{\begin{tabular}{@{}lcc@{}}
	\toprule
	& ACSC-related Visits & Non-ACSC-related Visits \\
	Predictor Variables & (n=648,041) & (n=2,063,798) \\ \colrule
	\begin{tabular}{@{}l@{}} The median household income \end{tabular} & 54,548.71 (20175.04) & 56,810.59 (21913.58) \\
	\begin{tabular}{@{}l@{}} Frac of high needs population \end{tabular} & 0.38 (0.11) & 0.37 (0.12) \\
	\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ renter occupied units \end{tabular} & 0.3 (0.15) & 0.3 (0.16) \\
	\begin{tabular}{@{}l@{}} Frac of households with children \\ ~~ and a single parent \end{tabular} & 0.16 (0.08) & 0.16 (0.08) \\
	\begin{tabular}{@{}l@{}} Frac of families with incomes $<$ 100\% \\ ~~ of the Federal Poverty Level (FPL) \end{tabular} & 0.15 (0.09) & 0.14 (0.09) \\
	\begin{tabular}{@{}l@{}} Frac of African American population \end{tabular} & 0.1 (0.15) & 0.08 (0.14) \\
	\begin{tabular}{@{}l@{}} Frac of Hispanic population \end{tabular} & 0.15 (0.21) & 0.16 (0.22) \\
	\begin{tabular}{@{}l@{}} Frac of households receiving public assistance \end{tabular} & 0.14 (0.08) & 0.13 (0.08) \\
	\begin{tabular}{@{}l@{}} Frac of population with no health \\ ~~ insurance coverage \end{tabular} & 0.14 (0.06) & 0.13 (0.06) \\
	\begin{tabular}{@{}l@{}} Frac of people age 25 or older \\ ~~ who have no high school degree \end{tabular} & 0.14 (0.1) & 0.13 (0.1) \\
	\begin{tabular}{@{}l@{}} Frac of houses that are vacant \end{tabular} & 0.13 (0.08) & 0.12 (0.08) \\
	\begin{tabular}{@{}l@{}} Frac of adults who are unemployed \end{tabular} & 0.08 (0.04) & 0.07 (0.04) \\
	\begin{tabular}{@{}l@{}} Frac of population that are foreign born \end{tabular} & 0.08 (0.1) & 0.09 (0.1) \\
	\begin{tabular}{@{}l@{}} Frac of household with no car \end{tabular} & 0.06 (0.05) & 0.05 (0.04) \\
	\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ crowded housing units \end{tabular} & 0.03 (0.04) & 0.03 (0.05) \\
	\botrule
	\end{tabular}}\label{tab:ses-demo}
	\end{table}
	"""

	    to_csv(table2, "Table2SES.csv")