# korenmiklos/wiki2pdf.py

## wiki2pdf.py
#!/usr/bin/env python2.7

import re, string
from sys import argv
from os import path, system, remove

class WikiParser(object):
    """Translate wiki markup into LaTeX (Beamer) source.

    All formatting commands can be parsed one line at a time, though
    some state (the stack of open environments and the table of links
    seen so far) is carried over between lines.
    """

    # Patterns are compiled once for the class instead of on every call.
    _TRIM_RE = re.compile(r'[\[\]]')
    _HEADER_RE = re.compile(r"^\s*(={1,3})(.*={1,3})\s*$")
    # Magic strings searched for in every ordinary line.  Alternatives are
    # tried left to right, so <link> always wins over <macro>.  The <li>
    # bullets include "+" so that they agree with _BULLET_RE below.
    _SCAN_RE = re.compile(
        r"(?:(?P<emph>'{2,3})"
        r"|(?P<url>(http|ftp|nntp|news|mailto)\:[^\s'\"]+\S)"
        r"|(?P<email>[-\w._+]+\@[\w.-]+)"
        r"|(?P<li>^\s*(\*|\d*\.|\-|\+)\s+)"
        r"|(?P<math>(<math>|</math>))"
        r"|(?P<pre>(\{\{\{|\}\}\}))"
        r"|(?P<header>^\s*={1,3}.*={1,3}\s*$)"
        r"|(?P<link>\[\[(.*?)\]\])"
        r"|(?P<macro>\[\[(TitleSearch|FullSearch|WordIndex"
        r"|TitleIndex|RecentChanges|GoTo)\]\])"
        r")")
    # Inside a preformatted environment only its closing tag is honoured.
    _CLOSER_RE = re.compile(
        r"(?:"
        r"(?P<pre>(\}\}\}))"
        r"|(?P<math>(</math>))"
        r")")
    # A blank line, optionally holding "..." which requests a pause.
    _BLANK_RE = re.compile(r"^\s*((\.{3})?)\s*$")
    _INDENT_RE = re.compile(r"^(\s*)\S*")
    _BULLET_RE = re.compile(r"^\s*(\*|\-|\+|\d+\.)\s+")
    _EOL_RE = re.compile(r'\r?\n')

    def __init__(self, raw):
        """Store the wiki source *raw* and reset all parser state."""
        self.raw = raw

        # links[anchor] is the slide on which a link to `anchor` was seen;
        # each page can only be called once (a later link overwrites).
        self.links = {}

        # stack of open environments, as tuples: (environment, option)
        self.environment = []

        # environment name -> (opening LaTeX, closing LaTeX)
        self.envdict = {
            'math': ('\\[', '\\]'),
            'slide': ('\\begin{frame}', '\\end{frame}' + '\n' * 4),
            'block': ('\\begin{block}', '\\end{block}\n'),
            'section': ('', '\n' * 4),
            'emph': ('\\emph{', '}'),
            'pre': ('', ''),
            'bold': ('\\alert{', '}'),
            'document': ('', '\\end{document}'),
            'itemize': ('\\begin{itemize}', '\\end{itemize}\n'),
            'enumerate': ('\\begin{enumerate}', '\\end{enumerate}\n'),
        }

        # these environments are preformatted: nothing is parsed within
        # them, except the closing statement
        self.donotparse = ('pre', 'math', 'comment', 'tex')

    ##########################################################
    # environment stack

    def _in_env(self, env):
        """Return whether any open environment is named in the list *env*."""
        return any(name in env for name, _option in self.environment)

    def _last_env(self):
        """Return the innermost open environment, or '' if none is open."""
        return self.environment[-1][0] if self.environment else ''

    def _last_option(self):
        """Return the innermost environment's option, or 0 if none is open."""
        return self.environment[-1][1] if self.environment else 0

    def _open_env(self, env, option=0):
        """Push *env* with *option* and return its opening LaTeX.

        Environments missing from ``envdict`` are ignored and yield "".
        """
        if env not in self.envdict:
            return ""
        self.environment.append((env, option))
        return self.envdict[env][0]

    def _close_last_env(self):
        """Pop the innermost environment and return its name ("" if none).

        Does not issue the closing statement.
        """
        if self.environment:
            return self.environment.pop()[0]
        return ""

    def _close_env(self, which=()):
        """Close environments and return the closing LaTeX.

        With an empty *which* only the innermost environment is closed.
        Otherwise everything is closed back to, and including, the last
        open environment named in *which*; nothing happens when no such
        environment is open.
        """
        closing = ''
        if not which:
            name = self._close_last_env()
            if name:
                closing += self.envdict[name][1]
        elif self._in_env(which):
            name = ''
            while name not in which:
                name = self._close_last_env()
                closing += self.envdict[name][1]
        return closing

    ##########################################################
    # hyperlinks

    def _create_anchor(self, anchor, text):
        """Return a ``\\hypertarget`` named *anchor* wrapping *text*."""
        return '\\hypertarget{%s}{%s}' % (anchor, text)

    def _get_page(self):
        """Return the option (title) of the innermost open slide, or ''."""
        for name, option in reversed(self.environment):
            if name == 'slide':
                return option
        return ''

    def _link_to(self, anchor, text):
        """Return a ``\\hyperlink`` to *anchor* and record where it was made."""
        s = '\\hyperlink{%s}{%s}' % (anchor, text)
        self.links[anchor] = self._get_page()
        return s

    def _insert_image(self, anchor, text):
        """Return an ``\\includegraphics`` for *anchor*; *text* is unused."""
        # implement more sophisticated features later
        return '\\includegraphics{%s}' % (anchor)

    def _what_links_here(self, text):
        """Return *text*, linked back to the slide that linked to this one.

        When no slide linked here, or the link was made outside any
        slide (empty page name), *text* is returned unchanged.
        """
        page = self.links.get(self._get_page())
        if page:
            return self._link_to(page, text)
        return text

    ##########################################################
    # replacement handlers, one per named group of the scan pattern

    def _emph_repl(self, word):
        """Toggle bold (three quotes) or emphasis (two quotes)."""
        env = 'bold' if len(word) == 3 else 'emph'
        if self._last_env() == env:
            return self._close_env()
        return self._open_env(env)

    def _url_repl(self, word):
        """Typeset a bare URL with hyperref's ``\\url``."""
        return '\\url{%s}' % word

    def _email_repl(self, word):
        """Typeset an e-mail address as a ``mailto:`` hyperlink."""
        return '\\href{mailto:%s}{\\nolinkurl{%s}}' % (word, word)

    def _trim(self, word):
        """Strip surrounding whitespace from *word*.

        Square brackets are removed as well, but only when *word* itself
        starts with one (as the ``[[...]]`` link syntax does).
        """
        stripped = word.strip()
        if self._TRIM_RE.match(word):
            return self._TRIM_RE.sub('', stripped)
        return stripped

    def _link_repl(self, word):
        """Translate ``[[target|nice text]]`` into a hyperlink or an image."""
        target = self._trim(word)

        # nice vs ugly link: optional display text after the first '|'
        nicetext = target
        if '|' in target:
            target, nicetext = target.split('|', 1)

        # type of link: optional protocol before the first ':'
        protocol, link = '', target
        if ':' in target:
            protocol, link = target.split(':', 1)

        if protocol.lower() == 'image':
            return self._insert_image(link, nicetext)
        return self._link_to(link, nicetext)

    def _undent(self):
        """Close every open list environment and return the closing LaTeX."""
        return self._indent_to(0, 0)

    def _rule_repl(self, word):
        """Close all lists and insert vertical space.

        No ``rule`` group exists in the scan pattern at present, so this
        handler is only reachable when called directly.
        """
        return self._undent() + "\n\\bigskip\n"

    def _ent_repl(self, s):
        """Return the LaTeX-safe spelling of ``&``, ``<`` or ``>``.

        No ``ent`` group exists in the scan pattern at present, so this
        handler is only reachable when called directly.
        """
        return {'&': '\\&',
                '<': '\\textless{}',
                '>': '\\textgreater{}'}[s]

    def _li_repl(self, match):
        """Replace a bullet marker with ``\\item``."""
        return '\\item '

    def _header_repl(self, word):
        """Translate a ``=``, ``==`` or ``===`` header line.

        One, two and three equal signs map to section, slide and block.
        The previous environment of the same type is closed first; a
        header with an empty title only closes, it opens nothing.
        """
        match = self._HEADER_RE.match(word)
        marks = match.group(1)
        htext = self._trim(match.group(2)[0:-len(marks)])

        envtype = ['', 'section', 'slide', 'block'][len(marks)]
        s = self._close_env([envtype])
        if htext == "":
            return s

        # The title is stored as the environment's option so that
        # _get_page() can tell which slide is currently open.
        s += self._open_env(envtype, htext)
        if envtype == "section":
            s += '\\section{%s}' % (htext)
        elif envtype == "slide":
            s += "\\frametitle{" + self._what_links_here(htext) + "}"
        elif envtype == "block":
            s += '{%s}' % (htext)
        # create hypertarget
        s += '\\hypertarget{%s}{}' % htext
        return s

    def _pre_repl(self, word):
        """Open a preformatted block on ``{{{``; otherwise close it."""
        if word == '{{{' and not self._in_env(['pre']):
            return self._open_env('pre')
        return self._close_env(['pre'])

    def _math_repl(self, word):
        """Open display math on ``<math>``; otherwise close it."""
        if word == '<math>' and not self._in_env(['math']):
            return self._open_env('math')
        return self._close_env(['math'])

    def _macro_repl(self, word):
        """Run the module-level ``_macro_<Name>`` function for ``[[Name]]``.

        When no such function is defined the text is returned unparsed.
        """
        # TODO: Somehow get the default value into the search field
        handler = globals().get('_macro_' + word[2:-2])
        if handler is None:
            return word
        return handler()

    ##########################################################
    # lists

    def _indent_level(self):
        """Return the deepest open list level (at least 1), or 0 outside lists."""
        level = 0
        for env, option in self.environment:
            if env in ('itemize', 'enumerate'):
                level = max(level, 1)
                if option > level:
                    level = option
        return level

    def _indent_to(self, new_level, ordered, initial=1):
        """Open or close list environments to reach *new_level*.

        *ordered* selects enumerate over itemize; *initial* is the number
        (possibly with a trailing dot, e.g. ``"3."``) a newly opened
        enumerate starts counting from.  Returns the LaTeX emitted.
        """
        s = ''
        if self._indent_level() > new_level:
            while self._indent_level() > new_level:
                # close all environments, incl the last itemize/enumerate
                s += self._close_env(['enumerate', 'itemize'])
            return s

        if self._indent_level() == new_level:
            # same depth: switch list type if the bullet kind changed
            last = self._last_env()
            if last == 'itemize' and ordered:
                s += self._close_env(['itemize'])
                s += self._open_env('enumerate', new_level)
            elif last == 'enumerate' and not ordered:
                s += self._close_env(['enumerate'])
                s += self._open_env('itemize', new_level)
        while self._indent_level() < new_level:
            if ordered:
                # special-case new starting number
                s += self._open_env('enumerate', new_level)
                s += '\\setcounter{enumi}{%d}' % int(float(initial) - 1)
            else:
                s += self._open_env('itemize', new_level)
        return s

    ##########################################################
    # driver

    def replace(self, match):
        """Dispatch a regex *match* to the ``_<group>_repl`` handler.

        Inside a preformatted environment only the groups listed in
        ``donotparse`` are handled; any other hit is returned verbatim.

        Raises:
            ValueError: if no named group of *match* took part in it.
        """
        for kind, hit in match.groupdict().items():
            if not hit:
                continue
            if kind in self.donotparse or not self._in_env(self.donotparse):
                return getattr(self, '_' + kind + '_repl')(hit)
            # return unparsed
            return hit
        raise ValueError("Can't handle match %r" % match.group(0))

    def print_html(self):
        """Return the LaTeX translation of ``self.raw``.

        The name is historical; the output is LaTeX, not HTML.  The
        result starts with a newline and ends with ``\\end{document}``.
        """
        # begin with opening the document environment
        pieces = [self._open_env('document')]

        for line in self._EOL_RE.split(self.raw.expandtabs()):
            if self._in_env(self.donotparse):
                # only look for the closing statement
                pieces.append(self._CLOSER_RE.sub(self.replace, line))
                continue

            blank = self._BLANK_RE.match(line)
            if blank:
                if blank.group(1):
                    # "..." on an otherwise blank line issues a pause
                    pieces.append('\\pause')
                pieces.append('\n')
                continue

            indent = len(self._INDENT_RE.match(line).group(1)) + 1
            bullet = self._BULLET_RE.match(line)
            if indent < self._indent_level():
                # unindent things, even if not bullet
                pieces.append(self._indent_to(indent, 0, 1))
            elif bullet:
                if bullet.group(1) in ("*", "-", "+"):
                    pieces.append(self._indent_to(indent, 0, 1))
                else:
                    # numbered item: its number seeds the counter
                    pieces.append(self._indent_to(indent, 1, bullet.group(1)))
            pieces.append(self._SCAN_RE.sub(self.replace, line))

        # close the document. this closes all open environments
        pieces.append(self._close_env(['document']))
        return '\n' + '\n'.join(pieces)


inputfilename = argv[1]
shortname = path.basename(inputfilename)
inputfile = open(inputfilename)

if True:
        firstline = inputfile.readline()
        path_re = re.compile("^\s*%\s*((\w|/|\.|~|:)+)")
        #only match forward slashes

        if path_re.match(firstline):
                prename = path_re.match(firstline).group(1)
        else:
                assert False, "Please give a preamble file either with the -p option or after a % in the first line of the file."

inputstring = inputfile.read()

# wiki parser instance
WP = WikiParser(inputstring)

preamble = open(prename).read()
outputfile = open(shortname+'.tex','w+t')
outputfile.write(preamble)
outputfile.write(WP.print_html())
outputfile.close()

system('pdflatex '+shortname)
JUNK = ('.aux', '.nav', '.snm', '.toc', '.log', '.out')
for junk in JUNK:
    try:
        remove(shortname+junk)
    except:
        pass
	#!/usr/bin/env python2.7

	import re, string
	from sys import argv
	from os import path, system, remove

	class WikiParser(object):
	    """Translate wiki markup into LaTeX (Beamer) source.

	    All formatting commands can be parsed one line at a time, though
	    some state (the stack of open environments and the table of links
	    seen so far) is carried over between lines.
	    """

	    # Patterns are compiled once for the class instead of on every call.
	    _TRIM_RE = re.compile(r'[\[\]]')
	    _HEADER_RE = re.compile(r"^\s*(={1,3})(.*={1,3})\s*$")
	    # Magic strings searched for in every ordinary line.  Alternatives are
	    # tried left to right, so <link> always wins over <macro>.  The <li>
	    # bullets include "+" so that they agree with _BULLET_RE below.
	    _SCAN_RE = re.compile(
	        r"(?:(?P<emph>'{2,3})"
	        r"|(?P<url>(http|ftp|nntp|news|mailto)\:[^\s'\"]+\S)"
	        r"|(?P<email>[-\w._+]+\@[\w.-]+)"
	        r"|(?P<li>^\s*(\*|\d*\.|\-|\+)\s+)"
	        r"|(?P<math>(<math>|</math>))"
	        r"|(?P<pre>(\{\{\{|\}\}\}))"
	        r"|(?P<header>^\s*={1,3}.*={1,3}\s*$)"
	        r"|(?P<link>\[\[(.*?)\]\])"
	        r"|(?P<macro>\[\[(TitleSearch|FullSearch|WordIndex"
	        r"|TitleIndex|RecentChanges|GoTo)\]\])"
	        r")")
	    # Inside a preformatted environment only its closing tag is honoured.
	    _CLOSER_RE = re.compile(
	        r"(?:"
	        r"(?P<pre>(\}\}\}))"
	        r"|(?P<math>(</math>))"
	        r")")
	    # A blank line, optionally holding "..." which requests a pause.
	    _BLANK_RE = re.compile(r"^\s*((\.{3})?)\s*$")
	    _INDENT_RE = re.compile(r"^(\s*)\S*")
	    _BULLET_RE = re.compile(r"^\s*(\*|\-|\+|\d+\.)\s+")
	    _EOL_RE = re.compile(r'\r?\n')

	    def __init__(self, raw):
	        """Store the wiki source *raw* and reset all parser state."""
	        self.raw = raw

	        # links[anchor] is the slide on which a link to `anchor` was seen;
	        # each page can only be called once (a later link overwrites).
	        self.links = {}

	        # stack of open environments, as tuples: (environment, option)
	        self.environment = []

	        # environment name -> (opening LaTeX, closing LaTeX)
	        self.envdict = {
	            'math': ('\\[', '\\]'),
	            'slide': ('\\begin{frame}', '\\end{frame}' + '\n' * 4),
	            'block': ('\\begin{block}', '\\end{block}\n'),
	            'section': ('', '\n' * 4),
	            'emph': ('\\emph{', '}'),
	            'pre': ('', ''),
	            'bold': ('\\alert{', '}'),
	            'document': ('', '\\end{document}'),
	            'itemize': ('\\begin{itemize}', '\\end{itemize}\n'),
	            'enumerate': ('\\begin{enumerate}', '\\end{enumerate}\n'),
	        }

	        # these environments are preformatted: nothing is parsed within
	        # them, except the closing statement
	        self.donotparse = ('pre', 'math', 'comment', 'tex')

	    ##########################################################
	    # environment stack

	    def _in_env(self, env):
	        """Return whether any open environment is named in the list *env*."""
	        return any(name in env for name, _option in self.environment)

	    def _last_env(self):
	        """Return the innermost open environment, or '' if none is open."""
	        return self.environment[-1][0] if self.environment else ''

	    def _last_option(self):
	        """Return the innermost environment's option, or 0 if none is open."""
	        return self.environment[-1][1] if self.environment else 0

	    def _open_env(self, env, option=0):
	        """Push *env* with *option* and return its opening LaTeX.

	        Environments missing from ``envdict`` are ignored and yield "".
	        """
	        if env not in self.envdict:
	            return ""
	        self.environment.append((env, option))
	        return self.envdict[env][0]

	    def _close_last_env(self):
	        """Pop the innermost environment and return its name ("" if none).

	        Does not issue the closing statement.
	        """
	        if self.environment:
	            return self.environment.pop()[0]
	        return ""

	    def _close_env(self, which=()):
	        """Close environments and return the closing LaTeX.

	        With an empty *which* only the innermost environment is closed.
	        Otherwise everything is closed back to, and including, the last
	        open environment named in *which*; nothing happens when no such
	        environment is open.
	        """
	        closing = ''
	        if not which:
	            name = self._close_last_env()
	            if name:
	                closing += self.envdict[name][1]
	        elif self._in_env(which):
	            name = ''
	            while name not in which:
	                name = self._close_last_env()
	                closing += self.envdict[name][1]
	        return closing

	    ##########################################################
	    # hyperlinks

	    def _create_anchor(self, anchor, text):
	        """Return a ``\\hypertarget`` named *anchor* wrapping *text*."""
	        return '\\hypertarget{%s}{%s}' % (anchor, text)

	    def _get_page(self):
	        """Return the option (title) of the innermost open slide, or ''."""
	        for name, option in reversed(self.environment):
	            if name == 'slide':
	                return option
	        return ''

	    def _link_to(self, anchor, text):
	        """Return a ``\\hyperlink`` to *anchor* and record where it was made."""
	        s = '\\hyperlink{%s}{%s}' % (anchor, text)
	        self.links[anchor] = self._get_page()
	        return s

	    def _insert_image(self, anchor, text):
	        """Return an ``\\includegraphics`` for *anchor*; *text* is unused."""
	        # implement more sophisticated features later
	        return '\\includegraphics{%s}' % (anchor)

	    def _what_links_here(self, text):
	        """Return *text*, linked back to the slide that linked to this one.

	        When no slide linked here, or the link was made outside any
	        slide (empty page name), *text* is returned unchanged.
	        """
	        page = self.links.get(self._get_page())
	        if page:
	            return self._link_to(page, text)
	        return text

	    ##########################################################
	    # replacement handlers, one per named group of the scan pattern

	    def _emph_repl(self, word):
	        """Toggle bold (three quotes) or emphasis (two quotes)."""
	        env = 'bold' if len(word) == 3 else 'emph'
	        if self._last_env() == env:
	            return self._close_env()
	        return self._open_env(env)

	    def _url_repl(self, word):
	        """Typeset a bare URL with hyperref's ``\\url``."""
	        return '\\url{%s}' % word

	    def _email_repl(self, word):
	        """Typeset an e-mail address as a ``mailto:`` hyperlink."""
	        return '\\href{mailto:%s}{\\nolinkurl{%s}}' % (word, word)

	    def _trim(self, word):
	        """Strip surrounding whitespace from *word*.

	        Square brackets are removed as well, but only when *word* itself
	        starts with one (as the ``[[...]]`` link syntax does).
	        """
	        stripped = word.strip()
	        if self._TRIM_RE.match(word):
	            return self._TRIM_RE.sub('', stripped)
	        return stripped

	    def _link_repl(self, word):
	        """Translate ``[[target|nice text]]`` into a hyperlink or an image."""
	        target = self._trim(word)

	        # nice vs ugly link: optional display text after the first '|'
	        nicetext = target
	        if '|' in target:
	            target, nicetext = target.split('|', 1)

	        # type of link: optional protocol before the first ':'
	        protocol, link = '', target
	        if ':' in target:
	            protocol, link = target.split(':', 1)

	        if protocol.lower() == 'image':
	            return self._insert_image(link, nicetext)
	        return self._link_to(link, nicetext)

	    def _undent(self):
	        """Close every open list environment and return the closing LaTeX."""
	        return self._indent_to(0, 0)

	    def _rule_repl(self, word):
	        """Close all lists and insert vertical space.

	        No ``rule`` group exists in the scan pattern at present, so this
	        handler is only reachable when called directly.
	        """
	        return self._undent() + "\n\\bigskip\n"

	    def _ent_repl(self, s):
	        """Return the LaTeX-safe spelling of ``&``, ``<`` or ``>``.

	        No ``ent`` group exists in the scan pattern at present, so this
	        handler is only reachable when called directly.
	        """
	        return {'&': '\\&',
	                '<': '\\textless{}',
	                '>': '\\textgreater{}'}[s]

	    def _li_repl(self, match):
	        """Replace a bullet marker with ``\\item``."""
	        return '\\item '

	    def _header_repl(self, word):
	        """Translate a ``=``, ``==`` or ``===`` header line.

	        One, two and three equal signs map to section, slide and block.
	        The previous environment of the same type is closed first; a
	        header with an empty title only closes, it opens nothing.
	        """
	        match = self._HEADER_RE.match(word)
	        marks = match.group(1)
	        htext = self._trim(match.group(2)[0:-len(marks)])

	        envtype = ['', 'section', 'slide', 'block'][len(marks)]
	        s = self._close_env([envtype])
	        if htext == "":
	            return s

	        # The title is stored as the environment's option so that
	        # _get_page() can tell which slide is currently open.
	        s += self._open_env(envtype, htext)
	        if envtype == "section":
	            s += '\\section{%s}' % (htext)
	        elif envtype == "slide":
	            s += "\\frametitle{" + self._what_links_here(htext) + "}"
	        elif envtype == "block":
	            s += '{%s}' % (htext)
	        # create hypertarget
	        s += '\\hypertarget{%s}{}' % htext
	        return s

	    def _pre_repl(self, word):
	        """Open a preformatted block on ``{{{``; otherwise close it."""
	        if word == '{{{' and not self._in_env(['pre']):
	            return self._open_env('pre')
	        return self._close_env(['pre'])

	    def _math_repl(self, word):
	        """Open display math on ``<math>``; otherwise close it."""
	        if word == '<math>' and not self._in_env(['math']):
	            return self._open_env('math')
	        return self._close_env(['math'])

	    def _macro_repl(self, word):
	        """Run the module-level ``_macro_<Name>`` function for ``[[Name]]``.

	        When no such function is defined the text is returned unparsed.
	        """
	        # TODO: Somehow get the default value into the search field
	        handler = globals().get('_macro_' + word[2:-2])
	        if handler is None:
	            return word
	        return handler()

	    ##########################################################
	    # lists

	    def _indent_level(self):
	        """Return the deepest open list level (at least 1), or 0 outside lists."""
	        level = 0
	        for env, option in self.environment:
	            if env in ('itemize', 'enumerate'):
	                level = max(level, 1)
	                if option > level:
	                    level = option
	        return level

	    def _indent_to(self, new_level, ordered, initial=1):
	        """Open or close list environments to reach *new_level*.

	        *ordered* selects enumerate over itemize; *initial* is the number
	        (possibly with a trailing dot, e.g. ``"3."``) a newly opened
	        enumerate starts counting from.  Returns the LaTeX emitted.
	        """
	        s = ''
	        if self._indent_level() > new_level:
	            while self._indent_level() > new_level:
	                # close all environments, incl the last itemize/enumerate
	                s += self._close_env(['enumerate', 'itemize'])
	            return s

	        if self._indent_level() == new_level:
	            # same depth: switch list type if the bullet kind changed
	            last = self._last_env()
	            if last == 'itemize' and ordered:
	                s += self._close_env(['itemize'])
	                s += self._open_env('enumerate', new_level)
	            elif last == 'enumerate' and not ordered:
	                s += self._close_env(['enumerate'])
	                s += self._open_env('itemize', new_level)
	        while self._indent_level() < new_level:
	            if ordered:
	                # special-case new starting number
	                s += self._open_env('enumerate', new_level)
	                s += '\\setcounter{enumi}{%d}' % int(float(initial) - 1)
	            else:
	                s += self._open_env('itemize', new_level)
	        return s

	    ##########################################################
	    # driver

	    def replace(self, match):
	        """Dispatch a regex *match* to the ``_<group>_repl`` handler.

	        Inside a preformatted environment only the groups listed in
	        ``donotparse`` are handled; any other hit is returned verbatim.

	        Raises:
	            ValueError: if no named group of *match* took part in it.
	        """
	        for kind, hit in match.groupdict().items():
	            if not hit:
	                continue
	            if kind in self.donotparse or not self._in_env(self.donotparse):
	                return getattr(self, '_' + kind + '_repl')(hit)
	            # return unparsed
	            return hit
	        raise ValueError("Can't handle match %r" % match.group(0))

	    def print_html(self):
	        """Return the LaTeX translation of ``self.raw``.

	        The name is historical; the output is LaTeX, not HTML.  The
	        result starts with a newline and ends with ``\\end{document}``.
	        """
	        # begin with opening the document environment
	        pieces = [self._open_env('document')]

	        for line in self._EOL_RE.split(self.raw.expandtabs()):
	            if self._in_env(self.donotparse):
	                # only look for the closing statement
	                pieces.append(self._CLOSER_RE.sub(self.replace, line))
	                continue

	            blank = self._BLANK_RE.match(line)
	            if blank:
	                if blank.group(1):
	                    # "..." on an otherwise blank line issues a pause
	                    pieces.append('\\pause')
	                pieces.append('\n')
	                continue

	            indent = len(self._INDENT_RE.match(line).group(1)) + 1
	            bullet = self._BULLET_RE.match(line)
	            if indent < self._indent_level():
	                # unindent things, even if not bullet
	                pieces.append(self._indent_to(indent, 0, 1))
	            elif bullet:
	                if bullet.group(1) in ("*", "-", "+"):
	                    pieces.append(self._indent_to(indent, 0, 1))
	                else:
	                    # numbered item: its number seeds the counter
	                    pieces.append(self._indent_to(indent, 1, bullet.group(1)))
	            pieces.append(self._SCAN_RE.sub(self.replace, line))

	        # close the document. this closes all open environments
	        pieces.append(self._close_env(['document']))
	        return '\n' + '\n'.join(pieces)



	inputfilename = argv[1]
	shortname = path.basename(inputfilename)
	inputfile = open(inputfilename)

	if True:
	firstline = inputfile.readline()
	path_re = re.compile("^\s%\s((\w\|/\|\.\|~\|:)+)")
	#only match forward slashes

	if path_re.match(firstline):
	prename = path_re.match(firstline).group(1)
	else:
	assert False, "Please give a preamble file either with the -p option or after a % in the first line of the file."

	inputstring = inputfile.read()

	# wiki parser instance
	WP = WikiParser(inputstring)

	preamble = open(prename).read()
	outputfile = open(shortname+'.tex','w+t')
	outputfile.write(preamble)
	outputfile.write(WP.print_html())
	outputfile.close()

	system('pdflatex '+shortname)
	JUNK = ('.aux', '.nav', '.snm', '.toc', '.log', '.out')
	for junk in JUNK:
	try:
	remove(shortname+junk)
	except:
	pass