Created
May 13, 2016 18:27
-
-
Save pshriwise/c13472bedaf9e2f7a7da366420da1713 to your computer and use it in GitHub Desktop.
A Python script that scrapes the table of contents (TOC) from a TeX file and colors each entry using status metadata hidden in LaTeX comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
import traceback

from pyPdf import PdfFileReader
# Maps a lower-cased status string to the CSS color used when rendering
# the corresponding table-of-contents entry.
status_colors = {
    "done": "green",
    "in progress": "darkorange",
    "pending": "red",
    "no status": "black",
}
class html_generator(object):
    """Build small HTML fragments as plain strings.

    Every method returns a string; nothing is written anywhere. Element
    contents are inserted verbatim (they may already contain markup), while
    attribute values are quoted and escaped.
    """

    def __init__(self):
        pass

    def open_tag(self, tag_type, attribs=None):
        """Return the opening tag ``<tag_type ...>``.

        ``attribs`` is an optional dict of attribute names to values.
        Raises TypeError if ``tag_type`` is not a str.
        """
        if not isinstance(tag_type, str):
            raise TypeError("tag_type must be a str")
        rendered = "" if attribs is None else self.attribs_to_str(attribs)
        return "<" + tag_type + rendered + ">"

    def close_tag(self, tag_type):
        """Return the closing tag ``</tag_type>``.

        A newline is appended for every tag except ``span``, so inline
        spans do not break the line they sit on.
        Raises TypeError if ``tag_type`` is not a str.
        """
        if not isinstance(tag_type, str):
            raise TypeError("tag_type must be a str")
        newline = "" if tag_type == "span" else "\n"
        return "</" + tag_type + ">" + newline

    def attribs_to_str(self, attribs):
        """Render a dict as `` key="value"`` pairs, each with a leading space.

        Values are converted with str(), then ``&`` and ``"`` are escaped and
        the result is wrapped in double quotes so values containing spaces
        or quotes still form valid HTML.
        Raises TypeError if ``attribs`` is not a dict.
        """
        if not isinstance(attribs, dict):
            raise TypeError("attribs must be a dict")
        parts = []
        for key, value in attribs.items():
            # Escape '&' first so the entities added for '"' are not re-escaped.
            escaped = str(value).replace("&", "&amp;").replace('"', "&quot;")
            parts.append(" " + str(key) + '="' + escaped + '"')
        return "".join(parts)

    def paragraph(self, contents, attribs=None):
        """Wrap ``contents`` in a ``<p>`` element."""
        return self.open_tag("p", attribs) + contents + self.close_tag("p")

    def lineitem(self, contents, attribs=None):
        """Wrap ``contents`` in an ``<li>`` element."""
        return self.open_tag("li", attribs) + contents + self.close_tag("li")

    def span(self, contents, attribs=None):
        """Wrap ``contents`` in a ``<span>`` element (no trailing newline)."""
        return self.open_tag("span", attribs) + contents + self.close_tag("span")

    def unorderedlist(self, contents, attribs=None):
        """Wrap ``contents`` in a ``<ul>`` element.

        A newline follows the opening tag and an extra one follows the
        closing tag (in addition to the one close_tag already adds).
        """
        return self.open_tag("ul", attribs) + "\n" + contents + self.close_tag("ul") + "\n"

    def color_attrib(self, color):
        """Return an attribute dict that sets the CSS text color."""
        return {'style': "color:" + color + ";"}
def generate_html():
    """Refresh the prelim document, convert Markdown files, build the report.

    Steps:
      1. Run ``update_prelim.sh`` through bash (best effort: a failure to
         launch it is reported and otherwise ignored).
      2. For every ``*.md`` file in the current directory, run the
         ``markdown`` command and store its stdout in a sibling ``.html``
         file with the same base name.
      3. Call prelim_status_report().
    """
    try:
        subprocess.call(["bash", "update_prelim.sh"])
    except Exception:
        # Best effort only; unlike a bare ``except`` this still lets
        # KeyboardInterrupt / SystemExit propagate.
        traceback.print_exc()
        print("Could not update prelim doc.")
    for name in os.listdir("./"):
        stem, ext = os.path.splitext(name)
        # Match the real extension so names such as "notes.md.bak" are skipped.
        if ext != ".md":
            continue
        # The context manager closes the output even if the command fails.
        with open(stem + ".html", 'wb') as out:
            subprocess.call(["markdown", name], stdout=out)
    prelim_status_report()
def status_header(f):
    """Write the report title and the color legend to ``f``.

    ``f`` only needs a ``write`` method; the whole header is emitted with a
    single call. Returns None.
    """
    builder = html_generator()
    # (label shown in color, CSS color, explanation following the label)
    legend = (
        ("Green", "green", " - complete (for now)"),
        ("Orange", "darkorange", " - in progress"),
        ("Red", "red", " - haven't begun"),
    )
    blocks = [builder.paragraph("Prelim Table of Contents & Status Report")]
    for label, color, meaning in legend:
        swatch = builder.span(label, attribs=builder.color_attrib(color))
        blocks.append(builder.paragraph(swatch + meaning))
    f.write("".join(blocks))
    return
def status_footer(f):
    """Write the page count of ``prelim.pdf`` to ``f`` as an HTML paragraph.

    If the PDF cannot be opened or parsed, the traceback is printed and a
    fallback paragraph is written instead, so report generation never
    fails because of the footer.
    """
    # special output
    try:
        # Context manager so the PDF handle is always released.
        with open('prelim.pdf', 'rb') as pdf:
            pages = PdfFileReader(pdf).getNumPages()
    except Exception:
        traceback.print_exc()
        f.write("<p>Number of Pages: Could not read pdf file </p>")
    else:
        # Written outside the try so a failing write is not mistaken for an
        # unreadable PDF.
        f.write("<p>Number of Pages: " + str(pages) + "</p>")
def html_from_tex():
    """Read ``prelim.tex`` and return its table of contents as HTML.

    Returns whatever gen_html_toc returns for the "section" level: a tuple
    whose first element is the HTML string.
    """
    # The context manager closes the file even if reading fails.
    with open("prelim.tex", 'rb') as f:
        lines = f.readlines()
    # gen_html_toc treats a None entry as the end-of-file marker.
    lines.append(None)
    return gen_html_toc(lines, "section")
def _braced_text(text):
    """Return the contents of the first balanced ``{...}`` group in text.

    Nested braces are kept intact. If there is no opening brace the whole
    text (right-stripped) is returned; if the group is never closed,
    everything after the opening brace (right-stripped) is returned.
    """
    start = text.find("{")
    if start == -1:
        return text.rstrip()
    depth = 0
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start + 1:pos]
    return text[start + 1:].rstrip()


def parse_line(l):
    """Extract the title and status from a sectioning line.

    The title is the text inside the first brace group. The status is read
    from a marker of the form ``%%Status:<status_val>%%``; surrounding
    whitespace in the value is removed.

    Returns a ``(title, status)`` tuple. ``status`` is "no status" when the
    line has no complete ``%%...%%`` marker containing "Status", or when
    the marker's value is empty.
    """
    # title should be inside the brackets
    title = _braced_text(l)
    status = "no status"
    # Only trust "Status" when it appears inside a complete %%...%% marker,
    # so a title that merely contains the word is not misread.
    marker_start = l.find("%%")
    if marker_start != -1:
        marker_end = l.find("%%", marker_start + 2)
        if marker_end != -1:
            marker = l[marker_start + 2:marker_end]
            if "Status" in marker:
                # format Status:<status_val>; take the text after the last colon
                status = marker.split(":")[-1].strip() or "no status"
    # return the title of this item and status
    return title, status
def gen_html_toc(lines, search_key, return_key=None):
    """Build a nested HTML list for one sectioning level of a TeX document.

    Parameters:
      lines      -- list of source lines, consumed from the front (the list
                    is mutated). A None entry, or running out of lines,
                    marks the end of the document.
      search_key -- level to list: "chapter", "section", "subsection" or
                    "subsubsection".
      return_key -- marker string (for example "\\section{") that ends this
                    list. Recursive calls pass the parent's marker; when it
                    is given, the markers of every level above
                    ``search_key`` end the list as well, so a higher-level
                    heading is handed back up instead of being skipped.

    Returns ``(html, line)`` where ``line`` is the line that ended the list
    (None at end of document) so the caller can process it.

    Raises ValueError if ``search_key`` is not a known level.
    """
    # list of hierarchical keys in document
    search_keys = ["chapter", "section", "subsection", "subsubsection"]
    if search_key not in search_keys:
        raise ValueError("unknown search_key: %r" % (search_key,))
    level = search_keys.index(search_key)
    # marker for the next level down (None at the deepest level)
    if level == len(search_keys) - 1:
        down_key = None
    else:
        down_key = "\\" + search_keys[level + 1] + "{"
    # marker for the level being listed
    own_key = "\\" + search_key + "{"
    # markers that terminate this list
    stop_keys = []
    if return_key is not None:
        stop_keys.append(return_key)
        for name in search_keys[:level]:
            marker = "\\" + name + "{"
            if marker not in stop_keys:
                stop_keys.append(marker)

    h = html_generator()
    # open unorderedlist
    pieces = ["\n" + h.open_tag("ul") + "\n"]
    # True while an <li> at this level is still waiting for its </li>
    item_open = False
    while True:
        # Treat an exhausted list like the None end-of-file marker.
        line = lines.pop(0) if lines else None
        if down_key is not None and line is not None and down_key in line:
            # put this line back so the nested call sees it as its first item
            lines.insert(0, line)
            nested, line = gen_html_toc(lines, down_key[1:-1], return_key=own_key)
            pieces.append(nested)
            # ``line`` is now whatever ended the nested list; it still has
            # to be examined at this level below.
        if line is None or any(key in line for key in stop_keys):
            # Close the pending item (only if one was opened) and the list.
            if item_open:
                pieces.append(h.close_tag("li"))
            pieces.append(h.close_tag("ul"))
            return "".join(pieces), line
        if own_key in line:
            # close the previous item at this level before starting a new one
            if item_open:
                pieces.append(h.close_tag("li"))
            title, status = parse_line(line)
            # Unknown or oddly spaced statuses fall back to the "no status"
            # color instead of raising KeyError.
            color = status_colors.get(status.strip().lower(), status_colors["no status"])
            pieces.append(h.open_tag("li") + h.span(title, attribs=h.color_attrib(color)))
            item_open = True
def prelim_status_report():
    """Write ``prelim_status.html``: legend, table of contents, page count.

    Progress messages are printed to stdout. The report file is closed even
    if one of the generation steps raises.
    """
    print("Opening file for writing...")
    with open("prelim_status.html", 'w') as report_file:
        print("Generating html and writing to file...")
        status_header(report_file)
        # html_from_tex returns (html, last_line); only the html is written.
        html = html_from_tex()
        report_file.write(html[0])
        # write num pages to html
        status_footer(report_file)
    # Reported only once everything has actually been written.
    print("Done")
# Script entry point: build the status report when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    prelim_status_report()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment