Skip to content

Instantly share code, notes, and snippets.

@korenmiklos
Created October 23, 2018 13:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save korenmiklos/1b55f66d0f7f8100f860da71aa67b9e6 to your computer and use it in GitHub Desktop.
Save korenmiklos/1b55f66d0f7f8100f860da71aa67b9e6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2.7
import re, string
from sys import argv
from os import path, system, remove
class WikiParser(object):
    """Turn wiki-style markup into LaTeX slide source.

    The markup is processed one line at a time by ``print_html``.  State
    that has to survive between lines is kept on the instance: the stack of
    currently open environments and the table of hyperlinks seen so far.
    The emitted commands (frame, frametitle, block, alert, pause,
    hypertarget, hyperlink) are the ones provided by beamer and hyperref.
    """

    # Markup recognised outside preformatted environments.  Every named
    # group <name> is dispatched by replace() to the method _<name>_repl.
    # NOTE: the 'macro' alternative can never win, because the more general
    # 'link' alternative precedes it and matches the same [[...]] text.
    _SCAN_RE = re.compile(
        r"(?:(?P<emph>'{2,3})"
        r"|(?P<url>(https?|ftp|nntp|news|mailto)\:[^\s'\"]+\S)"
        r"|(?P<email>[-\w._+]+\@[\w.-]+)"
        # same bullet markers as _BULLET_RE, so every line that opens a
        # list also receives its \item
        r"|(?P<li>^\s*(\*|\-|\+|\d+\.)\s+)"
        r"|(?P<math>(<math>|</math>))"
        r"|(?P<pre>(\{\{\{|\}\}\}))"
        r"|(?P<header>^\s*={1,3}.*={1,3}\s*$)"
        r"|(?P<link>\[\[(.*?)\]\])"
        r"|(?P<macro>\[\[(TitleSearch|FullSearch|WordIndex"
        r"|TitleIndex|RecentChanges|GoTo)\]\])"
        r")")
    # Inside a preformatted environment only its closing tag is recognised.
    _CLOSER_RE = re.compile(
        r"(?:"
        r"(?P<pre>(\}\}\}))"
        r"|(?P<math>(</math>))"
        r")")
    # A blank line, optionally carrying "..." which requests a pause.
    _BLANK_RE = re.compile(r"^\s*((\.{3})?)\s*$")
    _INDENT_RE = re.compile(r"^(\s*)\S*")
    _BULLET_RE = re.compile(r"^\s*(\*|\-|\+|\d+\.)\s+")
    _EOL_RE = re.compile(r'\r?\n')
    _BRACKET_RE = re.compile(r'[\[\]]')
    _HEADER_RE = re.compile(r"^\s*(={1,3})(.*={1,3})\s*$")
    _LIST_ENVS = ('itemize', 'enumerate')

    def __init__(self, raw):
        """Store the markup text ``raw`` and reset all parser state."""
        self.raw = raw
        # maps a link target to the slide title the link was made from;
        # a later link to the same target overwrites the earlier entry
        self.links = {}
        # stack of (environment name, option) tuples, innermost last
        self.environment = []
        # environment name -> (opening text, closing text)
        self.envdict = {
            'math': ('\\[', '\\]'),
            'slide': ('\\begin{frame}', '\\end{frame}' + '\n' * 4),
            'block': ('\\begin{block}', '\\end{block}\n'),
            'section': ('', '\n' * 4),
            'emph': ('\\emph{', '}'),
            'pre': ('', ''),
            'bold': ('\\alert{', '}'),
            'document': ('', '\\end{document}'),
            'itemize': ('\\begin{itemize}', '\\end{itemize}\n'),
            'enumerate': ('\\begin{enumerate}', '\\end{enumerate}\n'),
        }
        # preformatted environments: nothing is parsed inside them except
        # the closing statement ('comment' and 'tex' have no envdict entry,
        # so _open_env never actually opens them)
        self.donotparse = ('pre', 'math', 'comment', 'tex')

    # ------------------------------------------------------------------
    # environment stack

    def _in_env(self, env):
        """Return true if any open environment is named in ``env``."""
        return any(name in env for name, _ in self.environment)

    def _last_env(self):
        """Return the innermost open environment name ('' if none)."""
        if not self.environment:
            return ''
        return self.environment[-1][0]

    def _last_option(self):
        """Return the option of the innermost environment (0 if none)."""
        if not self.environment:
            return 0
        return self.environment[-1][1]

    def _open_env(self, env, option=0):
        """Push ``env`` and return its opening text.

        Names missing from ``envdict`` are ignored and yield ''.
        """
        if env not in self.envdict:
            return ""
        self.environment.append((env, option))
        return self.envdict[env][0]

    def _close_last_env(self):
        """Pop the innermost environment and return its name.

        No closing text is produced here.  Returns '' if nothing is open.
        """
        if not self.environment:
            return ""
        return self.environment.pop()[0]

    def _close_env(self, which=()):
        """Close environments and return the concatenated closing text.

        With an empty ``which`` only the innermost environment is closed.
        Otherwise, if one of the names in ``which`` is open, everything is
        closed back to and including the innermost such environment; if
        none of them is open nothing happens.
        """
        closing = []
        if which:
            if self._in_env(which):
                env = ''
                while env not in which:
                    env = self._close_last_env()
                    closing.append(self.envdict[env][1])
        else:
            env = self._close_last_env()
            if env != '':
                closing.append(self.envdict[env][1])
        return ''.join(closing)

    # ------------------------------------------------------------------
    # hyperlinks

    def _create_anchor(self, anchor, text):
        return '\\hypertarget{%s}{%s}' % (anchor, text)

    def _get_page(self):
        """Return the option (title) of the innermost open slide, or ''."""
        for env, option in reversed(self.environment):
            if env == 'slide':
                return option
        return ''

    def _link_to(self, anchor, text):
        """Return a hyperlink to ``anchor`` and record where it came from."""
        self.links[anchor] = self._get_page()
        return '\\hyperlink{%s}{%s}' % (anchor, text)

    def _insert_image(self, anchor, text):
        # implement more sophisticated features later; text is unused
        return '\\includegraphics{%s}' % (anchor)

    def _what_links_here(self, text):
        """Return ``text``, linked back to the slide that linked here.

        If an earlier link targeted the current slide, ``text`` becomes a
        hyperlink to the slide that link was made from; otherwise ``text``
        is returned unchanged.
        """
        page = self.links.get(self._get_page())
        if page:
            return self._link_to(page, text)
        return text

    # ------------------------------------------------------------------
    # replacement handlers, one per named group of the scanner patterns

    def _emph_repl(self, word):
        """Toggle bold (three quotes) or emphasis (two quotes)."""
        env = 'bold' if len(word) == 3 else 'emph'
        if self._last_env() == env:
            return self._close_env()
        return self._open_env(env)

    def _url_repl(self, word):
        # output is LaTeX, so emit a hyperref command rather than HTML
        return '\\url{%s}' % (word,)

    def _email_repl(self, word):
        # \nolinkurl typesets characters such as '_' that would otherwise
        # need escaping in the visible text
        return '\\href{mailto:%s}{\\nolinkurl{%s}}' % (word, word)

    def _trim(self, word):
        """Strip whitespace; also drop all brackets if ``word`` starts with one."""
        if self._BRACKET_RE.match(word):
            return self._BRACKET_RE.sub('', word.strip())
        return word.strip()

    def _link_repl(self, word):
        """Handle ``[[target]]``, ``[[target|text]]`` and ``[[image:file]]``."""
        target, bar, nicetext = self._trim(word).partition('|')
        if not bar:
            nicetext = target
        # text before the first colon selects the type of link
        protocol, colon, link = target.partition(':')
        if not colon:
            protocol, link = '', target
        if protocol.lower() == 'image':
            return self._insert_image(link, nicetext)
        return self._link_to(link, nicetext)

    def _rule_repl(self, word):
        # close every open list, then add vertical space.
        # NOTE: the scanner pattern has no 'rule' group, so this handler is
        # currently not reachable through replace().
        return self._indent_to(0, 0) + "\n\\bigskip\n"

    def _ent_repl(self, s):
        # NOTE: returns HTML entities and the scanner pattern has no 'ent'
        # group, so this handler is currently not reachable via replace().
        return {'&': '&amp;',
                '<': '&lt;',
                '>': '&gt;'}[s]

    def _li_repl(self, match):
        return '\\item '

    def _header_repl(self, word):
        """Translate a ``= .. =`` / ``== .. ==`` / ``=== .. ===`` header.

        One, two and three equal signs start a section, a slide and a
        block.  An open environment of the same kind is closed first; a
        header with an empty title only closes and opens nothing.
        """
        found = self._HEADER_RE.match(word)
        htype = found.group(1)
        # drop the closing run of '=' and the surrounding whitespace
        htext = self._trim(found.group(2)[0:-len(htype)])
        envtype = ('', 'section', 'slide', 'block')[len(htype)]
        pieces = [self._close_env([envtype])]
        if htext != "":
            # remember the title as the option so that _get_page() can
            # report which slide is current (needed for back-links)
            pieces.append(self._open_env(envtype, htext))
            if envtype == "section":
                pieces.append('\\section{%s}' % (htext))
            elif envtype == "slide":
                pieces.append("\\frametitle{" + self._what_links_here(htext) + "}")
            elif envtype == "block":
                pieces.append('{%s}' % (htext))
            # create hypertarget
            pieces.append('\\hypertarget{%s}{}' % htext)
        return ''.join(pieces)

    def _pre_repl(self, word):
        if word == '{{{' and not self._in_env(['pre']):
            return self._open_env('pre')
        return self._close_env(['pre'])

    def _math_repl(self, word):
        if word == '<math>' and not self._in_env(['math']):
            return self._open_env('math')
        return self._close_env(['math'])

    def _macro_repl(self, word):
        """Call the module-level function ``_macro_<name>`` for ``[[name]]``.

        Raises KeyError if no such function exists in this module.
        """
        macro_name = word[2:-2]
        # TODO: Somehow get the default value into the search field
        return globals()['_macro_' + macro_name]()

    # ------------------------------------------------------------------
    # lists

    def _indent_level(self):
        """Return the deepest open list level, or 0 outside any list.

        The level of a list is the option it was opened with, but never
        less than 1.
        """
        level = 0
        for env, option in self.environment:
            if env in self._LIST_ENVS:
                level = max(level, 1, option)
        return level

    def _indent_to(self, new_level, ordered, initial=1):
        """Open or close list environments to reach ``new_level``.

        ``ordered`` selects enumerate over itemize.  ``initial`` is the
        number of the first item of a newly opened enumerate; anything
        ``float()`` accepts works, e.g. the bullet text "3.".
        Returns the LaTeX text that performs the change.
        """
        pieces = []
        if self._indent_level() > new_level:
            while self._indent_level() > new_level:
                # close all environments, incl the last itemize/enumerate
                pieces.append(self._close_env(['enumerate', 'itemize']))
            return ''.join(pieces)
        if self._indent_level() == new_level:
            # same depth but the other kind of list: swap the environment
            if self._last_env() == 'itemize' and ordered:
                pieces.append(self._close_env(['itemize']))
                pieces.append(self._open_env('enumerate', new_level))
            elif self._last_env() == 'enumerate' and not ordered:
                pieces.append(self._close_env(['enumerate']))
                pieces.append(self._open_env('itemize', new_level))
        while self._indent_level() < new_level:
            if ordered:
                # special-case new starting number
                pieces.append(self._open_env('enumerate', new_level))
                pieces.append('\\setcounter{enumi}{%d}' % int(float(initial) - 1))
            else:
                pieces.append(self._open_env('itemize', new_level))
        return ''.join(pieces)

    # ------------------------------------------------------------------
    # driver

    def replace(self, match):
        """Return the replacement text for one scanner match.

        The named group that matched selects the ``_<group>_repl`` handler.
        Inside a preformatted environment only the handlers of the
        preformatted environments themselves run; any other hit is
        returned unchanged.  Raises ValueError if no named group matched.
        """
        for name, hit in match.groupdict().items():
            if not hit:
                continue
            if self._in_env(self.donotparse) and name not in self.donotparse:
                # return unparsed
                return hit
            return getattr(self, '_' + name + '_repl')(hit)
        raise ValueError("Can't handle match %r" % (match.group(0),))

    def print_html(self):
        """Return the LaTeX translation of ``self.raw`` as one string.

        Despite the name the result is LaTeX, not HTML.  The 'document'
        environment is opened first and closed last, which also closes
        whatever is still open at the end of the input.
        """
        out = ['\n' + self._open_env('document')]
        for line in self._EOL_RE.split(self.raw.expandtabs()):
            if self._in_env(self.donotparse):
                # preformatted: only look for the closing statement
                out.append('\n' + self._CLOSER_RE.sub(self.replace, line))
                continue
            blank = self._BLANK_RE.match(line)
            if blank:
                if blank.group(1):
                    # "..." on a line of its own issues a pause
                    out.append('\n' + '\\pause')
                out.append('\n' + '\n')
                continue
            # _INDENT_RE matches every string, so .match() is never None
            indent = len(self._INDENT_RE.match(line).group(1)) + 1
            if indent < self._indent_level():
                # unindent things, even if not bullet
                out.append('\n' + self._indent_to(indent, 0, 1))
            else:
                bullet = self._BULLET_RE.match(line)
                if bullet:
                    marker = bullet.group(1)
                    if marker in ('*', '-', '+'):
                        out.append('\n' + self._indent_to(indent, 0, 1))
                    else:
                        # numbered item: the marker gives the start number
                        out.append('\n' + self._indent_to(indent, 1, marker))
                # otherwise this is indented text, not a bullet
            out.append('\n' + self._SCAN_RE.sub(self.replace, line))
        # close the document. this closes all open environments
        out.append('\n' + self._close_env(['document']))
        return ''.join(out)
# auxiliary files left behind by pdflatex that are deleted after the run
JUNK = ('.aux', '.nav', '.snm', '.toc', '.log', '.out')


def main(args):
    """Convert the wiki file named by ``args[1]`` to LaTeX and run pdflatex.

    The first line of the input file must hold a ``%`` followed by the path
    of a preamble file.  The preamble and the translated remainder of the
    input are written to ``<basename>.tex`` in the current directory,
    pdflatex is run on it, and the auxiliary files listed in JUNK are
    removed afterwards.

    Exits with a message if the argument or the preamble line is missing.
    """
    import subprocess
    import sys

    if len(args) < 2:
        sys.exit('usage: %s WIKIFILE' % args[0])
    inputfilename = args[1]
    shortname = path.basename(inputfilename)

    # only match forward slashes
    path_re = re.compile(r"^\s*%\s*((\w|/|\.|~|:)+)")
    with open(inputfilename) as inputfile:
        found = path_re.match(inputfile.readline())
        if not found:
            sys.exit("Please name a preamble file after a % in the first "
                     "line of the file.")
        # the rest of the file, without the preamble line
        inputstring = inputfile.read()

    # the pattern allows '~', so expand it before opening
    prename = path.expanduser(found.group(1))
    with open(prename) as preamblefile:
        preamble = preamblefile.read()

    with open(shortname + '.tex', 'w') as outputfile:
        outputfile.write(preamble)
        outputfile.write(WikiParser(inputstring).print_html())

    # argument list instead of a shell string: safe for names with spaces
    # or shell metacharacters
    subprocess.call(['pdflatex', shortname])

    for junk in JUNK:
        try:
            remove(shortname + junk)
        except OSError:
            # best effort: the file may simply not have been produced
            pass


if __name__ == '__main__':
    main(argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment