Created
October 23, 2018 13:44
-
-
Save korenmiklos/1b55f66d0f7f8100f860da71aa67b9e6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7
import re
import string
import subprocess
import sys
from os import path, remove, system
from sys import argv
class WikiParser:
    """Object that turns Wiki markup into LaTeX (beamer) source.

    All formatting commands can be parsed one line at a time, though
    some state is carried over between lines: the stack of open
    environments (``self.environment``) and the table of hyperlinks seen
    so far (``self.links``).

    The code is written to run unchanged on Python 2.7 and Python 3.
    """

    # header environments ordered from outermost to innermost; a header
    # made of N '=' signs opens _HEADER_ENVS[N - 1]
    _HEADER_ENVS = ('section', 'slide', 'block')
    _LIST_ENVS = ('itemize', 'enumerate')
    # LaTeX item counters, indexed by enumerate nesting depth - 1
    _ENUM_COUNTERS = ('enumi', 'enumii', 'enumiii', 'enumiv')

    _TRIM_RE = re.compile(r'[\[\]]')
    _HEADER_RE = re.compile(r"^\s*(={1,3})(.*={1,3})\s*$")
    # magic strings recognised in ordinary text; each named group <x> is
    # dispatched to the method _<x>_repl by replace().
    # NOTE: 'link' is tried before 'macro', so the macro alternative never
    # wins for text such as [[TitleSearch]].
    _SCAN_RE = re.compile(
        r"(?:(?P<emph>'{2,3})"
        r"|(?P<url>(http|ftp|nntp|news|mailto)\:[^\s'\"]+\S)"
        r"|(?P<email>[-\w._+]+\@[\w.-]+)"
        # '+' is accepted here because _BULLET_RE accepts it as a bullet
        r"|(?P<li>^\s*(\*|\d*\.|\-|\+)\s+)"
        r"|(?P<math>(<math>|</math>))"
        r"|(?P<pre>(\{\{\{|\}\}\}))"
        r"|(?P<header>^\s*={1,3}.*={1,3}\s*$)"
        r"|(?P<link>\[\[(.*?)\]\])"
        r"|(?P<macro>\[\[(TitleSearch|FullSearch|WordIndex"
        r"|TitleIndex|RecentChanges|GoTo)\]\])"
        r")")
    # the only things recognised inside a preformatted environment
    _CLOSER_RE = re.compile(
        r"(?:"
        r"(?P<pre>(\}\}\}))"
        r"|(?P<math>(</math>))"
        r")")
    # a blank line, optionally holding '...' which requests a pause
    _BLANK_RE = re.compile(r"^\s*((\.{3})?)\s*$")
    _INDENT_RE = re.compile(r"^(\s*)\S*")
    _BULLET_RE = re.compile(r"^\s*(\*|\-|\+|\d+\.)\s+")
    _EOL_RE = re.compile(r'\r?\n')

    def __init__(self, raw):
        """Store the wiki text ``raw`` and set up empty parser state."""
        self.raw = raw
        # this is a dictionary of links. each page can only be called once
        # (anchor -> title of the slide the link was written on)
        self.links = {}
        # this is a list of tuples: (environment, option)
        self.environment = []
        # dictionary of environments: name -> (opening text, closing text)
        self.envdict = {'math': ('\\[', '\\]'),
                        'slide': ('\\begin{frame}', '\\end{frame}' + '\n' * 4),
                        'block': ('\\begin{block}', '\\end{block}\n'),
                        'section': ('', '\n' * 4),
                        'emph': ('\\emph{', '}'),
                        'pre': ('', ''),
                        'bold': ('\\alert{', '}'),
                        'document': ('', '\\end{document}'),
                        'itemize': ('\\begin{itemize}', '\\end{itemize}\n'),
                        'enumerate': ('\\begin{enumerate}', '\\end{enumerate}\n')}
        # these environments are preformatted. nothing is parsed within
        # them, except the closing statement
        self.donotparse = ('pre', 'math', 'comment', 'tex')

    ###########################################################
    # environment stack

    def _in_env(self, env):
        """Return True if any environment named in ``env`` is open."""
        return any(name in env for name, _ in self.environment)

    def _last_env(self):
        """Return the innermost open environment name ('' if none)."""
        return self.environment[-1][0] if self.environment else ''

    def _last_option(self):
        """Return the option of the innermost open environment (0 if none)."""
        return self.environment[-1][1] if self.environment else 0

    def _open_env(self, env, option=0):
        """Push ``env`` on the stack and return its opening text.

        Names missing from ``self.envdict`` are ignored and yield "".
        """
        if env not in self.envdict:
            return ""
        self.environment.append((env, option))
        return self.envdict[env][0]

    def _close_last_env(self):
        """Pop the innermost environment and return its name ('' if none).

        Does not issue the closing statement!
        """
        if self.environment:
            return self.environment.pop()[0]
        return ""

    def _close_env(self, which=()):
        """Close environments and return the concatenated closing text.

        With an empty ``which`` only the innermost environment is closed.
        Otherwise everything is closed back to and including the innermost
        environment named in ``which``; if none of them is open nothing is
        closed and '' is returned.
        """
        if not which:
            env = self._close_last_env()
            return self.envdict[env][1] if env else ''
        if not self._in_env(which):
            return ''
        closers = []
        env = ''
        while env not in which:
            env = self._close_last_env()
            closers.append(self.envdict[env][1])
        return ''.join(closers)

    ###########################################################
    # hyperlinks

    def _create_anchor(self, anchor, text):
        """Return a ``\\hypertarget`` named ``anchor`` wrapping ``text``."""
        return '\\hypertarget{%s}{%s}' % (anchor, text)

    def _get_page(self):
        """Return the option (title) of the innermost open slide, or ''."""
        for name, option in reversed(self.environment):
            if name == 'slide':
                return option
        return ''

    def _link_to(self, anchor, text):
        """Return a ``\\hyperlink`` to ``anchor`` and record where it was made."""
        s = '\\hyperlink{%s}{%s}' % (anchor, text)
        self.links[anchor] = self._get_page()
        return s

    def _insert_image(self, anchor, text):
        """Return an ``\\includegraphics`` for ``anchor`` (``text`` is unused)."""
        # implement more sophisticated features later
        return '\\includegraphics{%s}' % (anchor)

    def _what_links_here(self, text):
        """Return ``text``, linked back to the slide that linked to this one.

        If no slide linked to the current slide, or the link was written
        outside any slide, ``text`` is returned unchanged.
        """
        page = self.links.get(self._get_page())
        if not page:
            return text
        return self._link_to(page, text)

    ###########################################################
    # replacement handlers, one per named group of the scanner

    def _emph_repl(self, word):
        """Toggle bold (three quotes) or emphasis (two quotes)."""
        env = 'bold' if len(word) == 3 else 'emph'
        if self._last_env() == env:
            return self._close_env()
        return self._open_env(env)

    def _url_repl(self, word):
        """Typeset a bare URL as a clickable link."""
        # the output is LaTeX, so use hyperref instead of an HTML anchor
        return '\\url{%s}' % word

    def _email_repl(self, word):
        """Typeset an e-mail address as a clickable mailto: link."""
        # \nolinkurl typesets characters such as '_' safely
        return '\\href{mailto:%s}{\\nolinkurl{%s}}' % (word, word)

    def _trim(self, word):
        """Strip surrounding whitespace; drop brackets if ``word`` starts with one."""
        if self._TRIM_RE.match(word):
            return self._TRIM_RE.sub('', word.strip())
        return word.strip()

    def _link_repl(self, word):
        """Translate ``[[target]]`` or ``[[target|text]]``.

        A target of the form ``image:name`` (case-insensitive) inserts a
        graphic; every other target becomes a hyperlink to the part after
        the first ':' (or to the whole target when it has no ':').
        """
        # determine nice vs ugly link
        target, bar, nicetext = self._trim(word).partition('|')
        if not bar:
            nicetext = target
        # now determine type of link
        protocol, colon, link = target.partition(':')
        if not colon:
            protocol, link = '', target
        if protocol.lower() == 'image':
            return self._insert_image(link, nicetext)
        return self._link_to(link, nicetext)

    def _undent(self):
        """Close every open list environment and return the closing text."""
        return self._indent_to(0, 0)

    def _rule_repl(self, word):
        """Close all lists and insert vertical space.

        NOTE: the scanner has no 'rule' group, so nothing calls this yet.
        """
        return self._undent() + "\n\\bigskip\n"

    def _ent_repl(self, s):
        """Map a special character to its output form.

        NOTE: the mapping is currently the identity and the scanner has no
        'ent' group, so nothing calls this yet.
        """
        return {'&': '&',
                '<': '<',
                '>': '>'}[s]

    def _li_repl(self, match):
        """Replace a bullet or number marker with ``\\item``."""
        return '\\item '

    def _header_repl(self, word):
        """Translate a ``= x =``, ``== x ==`` or ``=== x ===`` header line.

        One, two and three '=' signs mean section, slide and block. Every
        open header environment of the same or a deeper level is closed
        first, so a new section also ends a slide that was opened before
        any section. A header with an empty title only closes.
        """
        found = self._HEADER_RE.match(word)
        marks = found.group(1)
        title = self._trim(found.group(2)[0:-len(marks)])
        envtype = self._HEADER_ENVS[len(marks) - 1]
        same_or_deeper = self._HEADER_ENVS[len(marks) - 1:]

        pieces = []
        while self._in_env(same_or_deeper):
            pieces.append(self._close_env(same_or_deeper))
        if title:
            # do not open slides with empty title - just close previous ones.
            # the title is stored as the option so that _get_page() can
            # tell which slide is current
            pieces.append(self._open_env(envtype, title))
            if envtype == "section":
                pieces.append('\\section{%s}' % (title))
            elif envtype == "slide":
                pieces.append("\\frametitle{" + self._what_links_here(title) + "}")
            elif envtype == "block":
                pieces.append('{%s}' % (title))
            # create hypertarget
            pieces.append(self._create_anchor(title, ''))
        return ''.join(pieces)

    def _pre_repl(self, word):
        """Open a preformatted block on ``{{{`` and close it on ``}}}``.

        A marker with nothing to close is returned unchanged, so a ``}}}``
        that belongs to nested LaTeX braces is not lost.
        """
        if word == '{{{' and not self._in_env(['pre']):
            return self._open_env('pre')
        if self._in_env(['pre']):
            return self._close_env(['pre'])
        return word

    def _math_repl(self, word):
        """Open display math on ``<math>`` and close it on ``</math>``.

        A tag with nothing to close is returned unchanged.
        """
        if word == '<math>' and not self._in_env(['math']):
            return self._open_env('math')
        if self._in_env(['math']):
            return self._close_env(['math'])
        return word

    def _macro_repl(self, word):
        """Run the module-level function ``_macro_<name>`` for ``[[name]]``.

        The text is returned unchanged when no such function exists.
        """
        macro_name = word[2:-2]
        # TODO: Somehow get the default value into the search field
        handler = globals().get('_macro_' + macro_name)
        if handler is None:
            return word
        return handler()

    ###########################################################
    # lists

    def _indent_level(self):
        """Return the deepest indent of the open lists (0 if none is open)."""
        levels = [option for env, option in self.environment
                  if env in self._LIST_ENVS]
        if not levels:
            return 0
        return max([1] + levels)

    def _indent_to(self, new_level, ordered, initial=1):
        """Open or close list environments to reach ``new_level``.

        ``ordered`` selects enumerate over itemize. ``initial`` is the
        number (for example ``'3.'``) at which a newly opened ordered list
        starts. Returns the LaTeX text to emit.
        """
        s = ''
        if self._indent_level() > new_level:
            while self._indent_level() > new_level:
                # close all environments, incl the last itemize/enumerate
                s = s + self._close_env(['enumerate', 'itemize'])
            return s

        if self._indent_level() == new_level:
            # same depth but the other kind of list: swap it
            if self._last_env() == 'itemize' and ordered:
                s = s + self._close_env(['itemize'])
                s = s + self._open_env('enumerate', new_level)
            elif self._last_env() == 'enumerate' and not ordered:
                s = s + self._close_env(['enumerate'])
                s = s + self._open_env('itemize', new_level)
        while self._indent_level() < new_level:
            if ordered:
                s = s + self._open_env('enumerate', new_level)
                # special-case new starting number. the counter depends on
                # how deeply enumerates are nested, so an inner list must
                # not reset the outer list's enumi
                depth = sum(1 for env, _ in self.environment
                            if env == 'enumerate')
                counter = self._ENUM_COUNTERS[
                    min(depth, len(self._ENUM_COUNTERS)) - 1]
                s += '\\setcounter{%s}{%d}' % (counter, int(float(initial) - 1))
            else:
                s = s + self._open_env('itemize', new_level)
        return s

    ###########################################################
    # driver

    def replace(self, match):
        """Dispatch a scanner match to the ``_<group>_repl`` handler.

        Inside a preformatted environment only the groups named in
        ``self.donotparse`` are handled; other matches are returned as is.

        Raises:
            ValueError: if no named group of ``match`` matched.
        """
        for kind, hit in match.groupdict().items():
            if hit:
                # check if we're allowed to parse
                if (not self._in_env(self.donotparse)) or (kind in self.donotparse):
                    return getattr(self, '_' + kind + '_repl')(hit)
                # return unparsed
                return hit
        raise ValueError("Can't handle match %r" % (match.group(0),))

    def print_html(self):
        """Translate ``self.raw`` and return the LaTeX text.

        The name is historical: the result is LaTeX, not HTML. Every
        emitted piece is preceded by a newline, and the document
        environment is opened first and closed (with anything still open)
        last.
        """
        # begin with opening the document environment
        chunks = [self._open_env('document')]
        raw = self.raw.expandtabs()
        for line in self._EOL_RE.split(raw):
            if self._in_env(self.donotparse):
                # look for closing statement, otherwise copy the input
                # without checking
                chunks.append(self._CLOSER_RE.sub(self.replace, line))
                continue

            # begin with checking paragraph formatting
            blank = self._BLANK_RE.match(line)
            if blank:
                if blank.group(1):
                    # issue pause
                    chunks.append('\\pause')
                chunks.append('\n')
                continue

            # columns are counted from 1 so that 0 can mean "no list"
            indent = len(self._INDENT_RE.match(line).group(1)) + 1
            bullet = self._BULLET_RE.match(line)
            if indent < self._indent_level():
                # unindent things, even if not bullet
                chunks.append(self._indent_to(indent, 0, 1))
            elif bullet:
                marker = bullet.group(1)
                if marker in ("*", "-", "+"):
                    # bullet
                    chunks.append(self._indent_to(indent, 0, 1))
                else:
                    # number
                    chunks.append(self._indent_to(indent, 1, marker))
            # otherwise this is indented text, not a bullet

            # scan the line for magic strings, copying the rest verbatim
            chunks.append(self._SCAN_RE.sub(self.replace, line))

        # close the document. this closes all open environments
        chunks.append(self._close_env(['document']))
        return ''.join('\n' + chunk for chunk in chunks)
# files pdflatex leaves behind that are removed after the run
JUNK = ('.aux', '.nav', '.snm', '.toc', '.log', '.out')


def find_preamble(firstline):
    """Return the preamble path named in a leading ``% path`` comment.

    ``firstline`` is the first line of the wiki file. Returns None when
    the line does not start with a ``%`` followed by a path.
    """
    # only match forward slashes
    found = re.match(r"^\s*%\s*((\w|/|\.|~|:)+)", firstline)
    return found.group(1) if found else None


def main(args):
    """Convert the wiki file named in ``args[1]`` to LaTeX and run pdflatex.

    The output is written to ``<basename>.tex`` in the current directory,
    starting with the contents of the preamble file named on the first
    line of the input. Auxiliary files left by pdflatex are removed.

    Raises:
        SystemExit: if no input file is given or the input names no
            preamble file.
    """
    if len(args) < 2:
        raise SystemExit("usage: %s WIKIFILE" % (args[0] if args else "script"))
    inputfilename = args[1]
    shortname = path.basename(inputfilename)

    with open(inputfilename) as inputfile:
        prename = find_preamble(inputfile.readline())
        if prename is None:
            raise SystemExit("Please give a preamble file after a % "
                             "in the first line of the file.")
        inputstring = inputfile.read()
    with open(prename) as preamblefile:
        preamble = preamblefile.read()

    # wiki parser instance
    body = WikiParser(inputstring).print_html()
    with open(shortname + '.tex', 'w') as outputfile:
        outputfile.write(preamble)
        outputfile.write(body)

    # an argument list avoids the shell, so file names with spaces or
    # shell metacharacters are passed through intact
    try:
        subprocess.call(['pdflatex', shortname])
    except OSError as err:
        # best effort, as before: the .tex file has already been written
        sys.stderr.write("could not run pdflatex: %s\n" % err)

    for junk in JUNK:
        try:
            remove(shortname + junk)
        except OSError:
            # the file was never produced; nothing to clean up
            pass


if __name__ == '__main__':
    main(argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment