# ounziw/html2rest.py

## html2rest.py
#-----------------------------------------------------------------------------
# Copyright (c) 2006-2009  Gerard Flanagan
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
#    The above copyright notice and this permission notice shall be included
#    in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#-----------------------------------------------------------------------------


import sys
import os
import re
import codecs
from sgmllib import SGMLParser
from StringIO import StringIO
from mbtextwrap import MBTextWrapper

# Directive line written at the top of a verbatim code block by Parser.
CODEBLOCK = '.. sourcecode:: php'
# Block-level tag names; currently unused by live code.
BLOCKTAGS = ['div', 'blockquote']
# Tags whose text content is discarded (see Parser.unknown_starttag).
IGNORETAGS = ['title', 'style', 'script']
# Heading underline characters, indexed by heading level minus one.
UNDERLINES = list('#=~`+;')

# Fredrik Lundh, http://effbot.org/zone/re-sub.html
def unescape(text):
    """Replace character references and named entities in *text*.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;``) and named
    (``&amp;``) references.  Anything that cannot be resolved -- an unknown
    entity name, a malformed number, or a code point outside the supported
    range -- is left in the output exactly as it appeared.

    :param text: string possibly containing ``&...;`` references.
    :returns: the string with every resolvable reference replaced.
    """
    # Function-scope imports keep the module importable on both Python 2
    # (htmlentitydefs / unichr) and Python 3 (html.entities / chr).
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # numeric character reference
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # OverflowError: absurdly large numbers such as
            # &#99999999999999999999; used to escape the ValueError handler.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)

class LineBuffer(object):
    """Ordered collection of output lines with wrap and indent helpers.

    Supports ``len()``, indexing and item assignment on the stored lines.
    """

    def __init__(self):
        self._wrapper = MBTextWrapper(encoding='utf-8')
        self._lines = []

    def __len__(self):
        return len(self._lines)

    def __getitem__(self, i):
        return self._lines[i]

    def __setitem__(self, i, value):
        self._lines[i] = value

    def clear(self):
        """Drop every stored line, keeping the same underlying list."""
        del self._lines[:]

    def read(self):
        """Return all stored lines joined with newlines."""
        newline = '\n'
        return newline.join(self._lines)

    def write(self, s):
        """Collapse whitespace runs in *s*, wrap the result and append it."""
        collapsed = ' '.join(s.split())
        wrapped = self._wrapper.wrap(collapsed)
        self._lines.extend(wrapped)

    def rawwrite(self, s):
        """Append *s* split on line boundaries, without any wrapping."""
        for line in s.splitlines():
            self._lines.append(line)

    def indent(self, numspaces=4, start=0):
        """Prefix the lines from index *start* onward with *numspaces* spaces."""
        lines = self._lines
        if len(lines) <= start:
            return
        pad = ' ' * numspaces
        for idx in range(start, len(lines)):
            lines[idx] = pad + lines[idx]

    def lstrip(self):
        """Strip leading whitespace from every stored line, in place."""
        lines = self._lines
        for idx, line in enumerate(lines):
            lines[idx] = line.lstrip()

class Parser(SGMLParser):
    """SGML event handler that writes reStructuredText to ``writer``.

    Inline text is accumulated in ``stringbuffer``; completed lines wait in
    ``linebuffer``.  Both are emptied into ``writer`` by :meth:`write`.
    """

    def __init__(self, writer=sys.stdout):
        """Create a parser that writes its output to *writer*."""
        SGMLParser.__init__(self)
        self.writer = writer
        self.stringbuffer = StringIO()
        self.linebuffer = LineBuffer()
        self.verbatim = False    # True while inside a <pre> block
        self.lists = []          # stack of item markers ('- ' or '#. ')
        self.ignoredata = False  # True while inside a tag from IGNORETAGS
        self.inblock = 0         # nesting depth of open ul/ol/dl elements
        self.nobreak = False     # set while inside a <dl>
        self.link = None         # href of the currently open <a>, if any

    def close(self):
        """Flush any pending output, then close the underlying parser."""
        self.writeline()
        SGMLParser.close(self)

    def flush(self):
        """Write the buffered lines to ``writer`` and empty the line buffer."""
        if self.linebuffer:
            if self.inblock > 1:
                # nested blocks get four extra spaces per additional level
                indent = 4 * (self.inblock - 1)
                self.linebuffer.indent(indent)
            self.writer.write(unescape(self.linebuffer.read()).encode(sys.getfilesystemencoding()))
            self.linebuffer.clear()

    def flush_stringbuffer(self):
        """Move accumulated inline text into the line buffer."""
        sbuf = self.stringbuffer.getvalue()
        if not sbuf:
            return
        elif self.linebuffer:
            # continue the last buffered line rather than starting a new one
            self.linebuffer[-1] += sbuf
        else:
            self.linebuffer.write(sbuf)
        self.clear_stringbuffer()

    def clear_stringbuffer(self):
        """Empty the inline text buffer."""
        self.stringbuffer.seek(0)
        self.stringbuffer.truncate()

    def data(self, text):
        """Append *text* to the inline text buffer."""
        self.stringbuffer.write(text)

    def pending(self):
        """Return a true value if either buffer holds unwritten content."""
        return self.stringbuffer.tell() or self.linebuffer

    def write(self, text=''):
        """Flush both buffers, then write *text* (entities unescaped)."""
        self.flush_stringbuffer()
        self.flush()
        self.writer.write(unescape(text))

    def writeline(self, text=''):
        """Like :meth:`write`, followed by a newline."""
        self.write(text + '\n')

    def writestartblock(self, text=''):
        """Finish pending output, write a blank line, then *text* on its own line."""
        if self.pending():
            self.writeline()
        self.writeline()
        self.writeline(text)

    def writeendblock(self, text=''):
        """Write *text* on its own line followed by a blank line."""
        self.writeline(text)
        self.writeline()

    def writeblock(self, text=''):
        """Write a block opener via :meth:`writestartblock` plus a blank line."""
        self.writestartblock(text)
        self.writeline()

    def handle_data(self, data):
        """Buffer character data; newlines become spaces unless verbatim."""
        if self.ignoredata:
            return
        elif self.verbatim:
            self.data(data)
        else:
            self.data(' '.join(data.splitlines()))

    def _heading_level(self, tag):
        """Return the heading level for tags that have an underline, else None.

        Only ``h`` followed by a digit that indexes UNDERLINES qualifies, so
        ``hr``, ``h0`` and ``h7``..``h9`` are rejected.
        """
        if len(tag) == 2 and tag[0] == 'h' and tag[1].isdigit():
            level = int(tag[1])
            if 1 <= level <= len(UNDERLINES):
                return level
        return None

    def unknown_starttag(self, tag, attrs):
        """Handle start tags that have no dedicated ``start_*`` method."""
        if tag in IGNORETAGS:
            self.ignoredata = True
        elif len(tag) == 2 and tag[0] == 'h':
            self.writestartblock()
        elif tag == 'br':
            if self.verbatim:
                self.data('\n')
            elif not self.inblock:
                self.writeline()
            else:
                self.data(' ')
        elif not self.verbatim:
            self.data(' ')

    def unknown_endtag(self, tag):
        """Handle end tags without an ``end_*`` method; underline headings."""
        self.ignoredata = False
        # Previously every two-letter "h?" tag was indexed into UNDERLINES,
        # so </hr> raised ValueError and </h7>..</h9> raised IndexError.
        level = self._heading_level(tag)
        if level is not None:
            self.flush_stringbuffer()
            if self.linebuffer:
                linebuf = self.linebuffer
                linebuf[-1] = linebuf[-1].strip()
                char = UNDERLINES[level - 1]
                # the underline is as long as the heading's last line
                linebuf.write(char * len(linebuf[-1]))
                self.writeline()
        elif not self.verbatim:
            self.data(' ')

    def start_a(self, attrs):
        """Open a reST hyperlink unless the href is missing or a fragment."""
        href = dict(attrs).get('href', None)
        if not href or href.startswith('#'):
            return
        self.data('`')
        self.link = href

    def end_a(self):
        """Close the hyperlink opened by :meth:`start_a`, appending its target."""
        if self.link:
            self.data(' <%s>`' % self.link)
            self.link = None

    # The two handlers below were both named start_code/end_code and were
    # silently replaced by the inline-code handlers defined further down, so
    # this block logic never ran.  They are bound to <pre>, which is the
    # element the comment in end_li refers to.
    def start_pre(self, attrs):
        """Begin a verbatim block introduced by the CODEBLOCK directive."""
        if self.lists:
            self.end_li()
            self.writeline()
        self.verbatim = True
        self.writeblock(CODEBLOCK)

    def end_pre(self):
        """Emit the buffered verbatim text indented by four spaces."""
        sbuf = self.stringbuffer.getvalue()
        if sbuf:
            self.linebuffer.rawwrite(sbuf)
            self.linebuffer.indent(4)
        self.clear_stringbuffer()
        self.writeendblock()
        self.verbatim = False

    def start_ul(self, attrs):
        """Open a bullet list, closing the current item of an enclosing list."""
        if self.lists:
            self.end_li()
            self.writeline()
        else:
            self.writeline()
        self.lists.append('- ')
        self.inblock += 1

    def end_ul(self):
        """Close the innermost bullet list."""
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_ol(self, attrs):
        """Open an enumerated list, closing the current item of an enclosing list."""
        if self.lists:
            self.end_li()
            self.writeline()
        else:
            self.writeline()
        self.lists.append('#. ')
        self.inblock += 1

    def end_ol(self):
        """Close the innermost enumerated list."""
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_p(self, attrs):
        """Start a paragraph; inside a non-verbatim block nothing is written."""
        if self.verbatim:
            self.writeline()
        elif not self.inblock:
            self.writeline()

    def end_p(self):
        """End a paragraph, flushing it unless it sits inside a list block."""
        if self.inblock:
            if self.verbatim:
                self.writeline()
            else:
                return
        else:
            self.linebuffer.lstrip()
            self.writeline()

    def start_li(self, attrs):
        """Start a list item with the marker of the innermost open list."""
        self.writeline()
        # An <li> outside any <ul>/<ol> used to raise IndexError on the empty
        # marker stack; treat it as a bullet item instead.
        self.data(self.lists[-1] if self.lists else '- ')

    def end_li(self):
        """Finish a list item, indenting continuation lines under its marker."""
        self.flush_stringbuffer()
        linebuf = self.linebuffer
        if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['- ', '#.']:
            start = 1
        else:
            # the start of the <li> has already been written, perhaps because
            # there was a <pre> block
            start = 0
        # same fallback as start_li when no list is open
        marker = self.lists[-1] if self.lists else '- '
        self.linebuffer.indent(len(marker), start=start)
        self.write()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.writeline()
        self.inblock += 1
        self.nobreak = True

    def end_dl(self):
        """Close a definition list."""
        self.nobreak = False
        self.writeline()
        self.inblock -= 1

    def start_dt(self, attrs):
        """Definition terms are wrapped in colons."""
        self.data(':')

    def end_dt(self):
        self.data(':')

    def start_dd(self, attrs):
        self.data(' ')

    def end_dd(self):
        """Flush a definition, indenting its continuation lines by two spaces."""
        self.flush_stringbuffer()
        self.linebuffer.indent(2, start=1)
        self.writeline()

    def start_em(self, attrs):
        self.data(' *')

    def end_em(self):
        self.data('*')

    def start_b(self, attrs):
        self.data(' **')

    def end_b(self):
        self.data('**')

    def start_code(self, attrs):
        """Inline code is wrapped in single backquotes."""
        self.data(' `')

    def end_code(self):
        self.data('`')

    def start_span(self, attrs):
        pass

    def end_span(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        """Treat the end of <body> like the end of a paragraph."""
        self.end_p()
try:
    from BeautifulSoup import BeautifulSoup, NavigableString

    # don't seem to need this anymore - issue fixed in latest BeautifulSoup presumably
    class ShlurpUpYourShloup(BeautifulSoup):
        '''preserve whitespace in <pre>'''
        def endData(self, containerClass=NavigableString):
            if self.currentData:
                currentData = ''.join(self.currentData)
                if not currentData.strip():
                    # whitespace-only run: collapse to one newline if it
                    # contains one, otherwise keep the same number of spaces
                    if '\n' in currentData:
                        currentData = '\n'
                    else:
                        # just changed the following line
                        # original: currentData = ' '
                        currentData = u' ' * len(currentData)
                self.currentData = []
                if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                    (not self.parseOnlyThese.text or \
                        not self.parseOnlyThese.search(currentData)):
                    return
                o = containerClass(currentData)
                o.setup(self.currentTag, self.previous)
                if self.previous:
                    self.previous.next = o
                self.previous = o
                self.currentTag.contents.append(o)

except ImportError:
    def ShlurpUpYourShloup(text, *args, **kw):
        """Fallback when BeautifulSoup is not installed: return *text* unchanged."""
        return text

    # This used to assign the undefined name ``ShlurpUpYourSoup``, which made
    # the module raise NameError on import whenever BeautifulSoup was missing.
    BeautifulSoup = ShlurpUpYourShloup

def readsoup(fileobj, convert='html', encoding='utf8'):
    """Read HTML from *fileobj* and return it after a pass through BeautifulSoup.

    :param fileobj: an object with a ``read()`` method, or a path that is
        opened in binary mode.
    :param convert: passed to ``BeautifulSoup`` as ``convertEntities``.
    :param encoding: passed to ``BeautifulSoup`` as ``fromEncoding``.
    :returns: ``str()`` of the resulting ``BeautifulSoup`` object.
    """
    if hasattr(fileobj, 'read'):
        text = fileobj.read()
    else:
        # close the file we opened ourselves; it used to be left open
        with open(fileobj, 'rb') as handle:
            text = handle.read()
    return str(BeautifulSoup(text, convertEntities=convert,
                             fromEncoding=encoding))

def html2rest(html, writer=sys.stdout):
    """Feed *html* through a fresh Parser that writes to *writer*."""
    converter = Parser(writer)
    converter.feed(html)
    # closing flushes whatever the parser still has buffered
    converter.close()

if __name__ == '__main__':
    # Eg.
    # python html2rest.py http://sphinx.pocoo.org/intro.html > intro.rst
    #
    # Input is taken from the first argument (URL or file path), or from
    # stdin when no argument is given.
    if sys.argv[1:]:
        arg = sys.argv[1]
        # https:// URLs used to fall through and be opened as local files
        if arg.startswith(('http://', 'https://')):
            import urllib
            fileobj = urllib.urlopen(arg)
        else:
            fileobj = codecs.open(arg, 'rb', 'utf8')
    else:
        fileobj = sys.stdin
    try:
        html2rest(fileobj.read())  # alternatively: readsoup(fileobj)
    finally:
        fileobj.close()
	#-----------------------------------------------------------------------------
	# Copyright (c) 2006-2009 Gerard Flanagan
	#
	# Permission is hereby granted, free of charge, to any person obtaining
	# a copy of this software and associated documentation files (the "Software"),
	# to deal in the Software without restriction, including without limitation
	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	# and/or sell copies of the Software, and to permit persons to whom the
	# Software is furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included
	# in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THE SOFTWARE.
	#-----------------------------------------------------------------------------


	import sys
	import os
	import re
	import codecs
	from sgmllib import SGMLParser
	from StringIO import StringIO
	from mbtextwrap import MBTextWrapper

	# Directive line written at the top of a verbatim code block by Parser.
	CODEBLOCK = '.. sourcecode:: php'
	# Block-level tag names; currently unused by live code.
	BLOCKTAGS = ['div', 'blockquote']
	# Tags whose text content is discarded (see Parser.unknown_starttag).
	IGNORETAGS = ['title', 'style', 'script']
	# Heading underline characters, indexed by heading level minus one.
	UNDERLINES = list('#=~`+;')

	# Fredrik Lundh, http://effbot.org/zone/re-sub.html
	def unescape(text):
	    """Replace character references and named entities in *text*.

	    Handles decimal (``&#65;``), hexadecimal (``&#x41;``) and named
	    (``&amp;``) references.  Anything that cannot be resolved -- an unknown
	    entity name, a malformed number, or a code point outside the supported
	    range -- is left in the output exactly as it appeared.

	    :param text: string possibly containing ``&...;`` references.
	    :returns: the string with every resolvable reference replaced.
	    """
	    # Function-scope imports keep the function usable on both Python 2
	    # (htmlentitydefs / unichr) and Python 3 (html.entities / chr).
	    try:
	        from htmlentitydefs import name2codepoint
	    except ImportError:
	        from html.entities import name2codepoint
	    try:
	        to_char = unichr
	    except NameError:
	        to_char = chr

	    def fixup(m):
	        ref = m.group(0)
	        try:
	            if ref[:2] == "&#":
	                # numeric character reference
	                if ref[:3].lower() == "&#x":
	                    return to_char(int(ref[3:-1], 16))
	                return to_char(int(ref[2:-1]))
	            # named entity
	            return to_char(name2codepoint[ref[1:-1]])
	        except (ValueError, OverflowError, KeyError):
	            # OverflowError covers absurdly large numeric references
	            return ref  # leave as is

	    return re.sub(r"&#?\w+;", fixup, text)

	class LineBuffer(object):
	    """Ordered collection of output lines with wrap and indent helpers.

	    Supports ``len()``, indexing and item assignment on the stored lines.
	    """

	    def __init__(self):
	        self._lines = []
	        self._wrapper = MBTextWrapper(encoding='utf-8')

	    def __len__(self):
	        return len(self._lines)

	    def __getitem__(self, i):
	        return self._lines[i]

	    def __setitem__(self, i, value):
	        self._lines[i] = value

	    def clear(self):
	        """Drop every stored line, keeping the same underlying list."""
	        self._lines[:] = []

	    def read(self):
	        """Return all stored lines joined with newlines."""
	        return '\n'.join(self._lines)

	    def write(self, s):
	        """Collapse whitespace runs in *s*, wrap the result and append it."""
	        # normalise whitespace
	        s = ' '.join(s.split())
	        self._lines.extend(self._wrapper.wrap(s))

	    def rawwrite(self, s):
	        """Append *s* split on line boundaries, without any wrapping."""
	        self._lines.extend(s.splitlines())

	    def indent(self, numspaces=4, start=0):
	        """Prefix the lines from index *start* onward with *numspaces* spaces."""
	        linebuf = self._lines
	        n = len(linebuf)
	        if n > start:
	            indent = ' ' * numspaces
	            for i in range(start, n):
	                linebuf[i] = indent + linebuf[i]

	    def lstrip(self):
	        """Strip leading whitespace from every stored line, in place."""
	        linebuf = self._lines
	        for i, line in enumerate(linebuf):
	            linebuf[i] = line.lstrip()

	class Parser(SGMLParser):
	    """SGML event handler that writes reStructuredText to ``writer``.

	    Inline text is accumulated in ``stringbuffer``; completed lines wait in
	    ``linebuffer``.  Both are emptied into ``writer`` by :meth:`write`.
	    """

	    def __init__(self, writer=sys.stdout):
	        """Create a parser that writes its output to *writer*."""
	        SGMLParser.__init__(self)
	        self.writer = writer
	        self.stringbuffer = StringIO()
	        self.linebuffer = LineBuffer()
	        self.verbatim = False    # True while inside a <pre> block
	        self.lists = []          # stack of item markers ('- ' or '#. ')
	        self.ignoredata = False  # True while inside a tag from IGNORETAGS
	        self.inblock = 0         # nesting depth of open ul/ol/dl elements
	        self.nobreak = False     # set while inside a <dl>
	        self.link = None         # href of the currently open <a>, if any

	    def close(self):
	        """Flush any pending output, then close the underlying parser."""
	        self.writeline()
	        SGMLParser.close(self)

	    def flush(self):
	        """Write the buffered lines to ``writer`` and empty the line buffer."""
	        if self.linebuffer:
	            if self.inblock > 1:
	                # nested blocks get four extra spaces per additional level
	                indent = 4 * (self.inblock - 1)
	                self.linebuffer.indent(indent)
	            self.writer.write(unescape(self.linebuffer.read()).encode(sys.getfilesystemencoding()))
	            self.linebuffer.clear()

	    def flush_stringbuffer(self):
	        """Move accumulated inline text into the line buffer."""
	        sbuf = self.stringbuffer.getvalue()
	        if not sbuf:
	            return
	        elif self.linebuffer:
	            # continue the last buffered line rather than starting a new one
	            self.linebuffer[-1] += sbuf
	        else:
	            self.linebuffer.write(sbuf)
	        self.clear_stringbuffer()

	    def clear_stringbuffer(self):
	        """Empty the inline text buffer."""
	        self.stringbuffer.seek(0)
	        self.stringbuffer.truncate()

	    def data(self, text):
	        """Append *text* to the inline text buffer."""
	        self.stringbuffer.write(text)

	    def pending(self):
	        """Return a true value if either buffer holds unwritten content."""
	        return self.stringbuffer.tell() or self.linebuffer

	    def write(self, text=''):
	        """Flush both buffers, then write *text* (entities unescaped)."""
	        self.flush_stringbuffer()
	        self.flush()
	        self.writer.write(unescape(text))

	    def writeline(self, text=''):
	        """Like :meth:`write`, followed by a newline."""
	        self.write(text + '\n')

	    def writestartblock(self, text=''):
	        """Finish pending output, write a blank line, then *text* on its own line."""
	        if self.pending():
	            self.writeline()
	        self.writeline()
	        self.writeline(text)

	    def writeendblock(self, text=''):
	        """Write *text* on its own line followed by a blank line."""
	        self.writeline(text)
	        self.writeline()

	    def writeblock(self, text=''):
	        """Write a block opener via :meth:`writestartblock` plus a blank line."""
	        self.writestartblock(text)
	        self.writeline()

	    def handle_data(self, data):
	        """Buffer character data; newlines become spaces unless verbatim."""
	        if self.ignoredata:
	            return
	        elif self.verbatim:
	            self.data(data)
	        else:
	            self.data(' '.join(data.splitlines()))

	    def _heading_level(self, tag):
	        """Return the heading level for tags that have an underline, else None.

	        Only ``h`` followed by a digit that indexes UNDERLINES qualifies, so
	        ``hr``, ``h0`` and ``h7``..``h9`` are rejected.
	        """
	        if len(tag) == 2 and tag[0] == 'h' and tag[1].isdigit():
	            level = int(tag[1])
	            if 1 <= level <= len(UNDERLINES):
	                return level
	        return None

	    def unknown_starttag(self, tag, attrs):
	        """Handle start tags that have no dedicated ``start_*`` method."""
	        if tag in IGNORETAGS:
	            self.ignoredata = True
	        elif len(tag) == 2 and tag[0] == 'h':
	            self.writestartblock()
	        elif tag == 'br':
	            if self.verbatim:
	                self.data('\n')
	            elif not self.inblock:
	                self.writeline()
	            else:
	                self.data(' ')
	        elif not self.verbatim:
	            self.data(' ')

	    def unknown_endtag(self, tag):
	        """Handle end tags without an ``end_*`` method; underline headings."""
	        self.ignoredata = False
	        # Guard against </hr> (ValueError) and </h7>..</h9> (IndexError).
	        level = self._heading_level(tag)
	        if level is not None:
	            self.flush_stringbuffer()
	            if self.linebuffer:
	                linebuf = self.linebuffer
	                linebuf[-1] = linebuf[-1].strip()
	                char = UNDERLINES[level - 1]
	                # the underline is as long as the heading's last line
	                linebuf.write(char * len(linebuf[-1]))
	                self.writeline()
	        elif not self.verbatim:
	            self.data(' ')

	    def start_a(self, attrs):
	        """Open a reST hyperlink unless the href is missing or a fragment."""
	        href = dict(attrs).get('href', None)
	        if not href or href.startswith('#'):
	            return
	        self.data('`')
	        self.link = href

	    def end_a(self):
	        """Close the hyperlink opened by :meth:`start_a`, appending its target."""
	        if self.link:
	            self.data(' <%s>`' % self.link)
	            self.link = None

	    # The two handlers below were both named start_code/end_code and were
	    # silently replaced by the inline-code handlers defined further down.
	    # They are bound to <pre>, the element the comment in end_li refers to.
	    def start_pre(self, attrs):
	        """Begin a verbatim block introduced by the CODEBLOCK directive."""
	        if self.lists:
	            self.end_li()
	            self.writeline()
	        self.verbatim = True
	        self.writeblock(CODEBLOCK)

	    def end_pre(self):
	        """Emit the buffered verbatim text indented by four spaces."""
	        sbuf = self.stringbuffer.getvalue()
	        if sbuf:
	            self.linebuffer.rawwrite(sbuf)
	            self.linebuffer.indent(4)
	        self.clear_stringbuffer()
	        self.writeendblock()
	        self.verbatim = False

	    def start_ul(self, attrs):
	        """Open a bullet list, closing the current item of an enclosing list."""
	        if self.lists:
	            self.end_li()
	            self.writeline()
	        else:
	            self.writeline()
	        self.lists.append('- ')
	        self.inblock += 1

	    def end_ul(self):
	        """Close the innermost bullet list."""
	        self.end_li()
	        self.lists.pop()
	        self.inblock -= 1
	        if self.inblock:
	            self.writeline()
	        else:
	            self.writeendblock()

	    def start_ol(self, attrs):
	        """Open an enumerated list, closing the current item of an enclosing list."""
	        if self.lists:
	            self.end_li()
	            self.writeline()
	        else:
	            self.writeline()
	        self.lists.append('#. ')
	        self.inblock += 1

	    def end_ol(self):
	        """Close the innermost enumerated list."""
	        self.end_li()
	        self.lists.pop()
	        self.inblock -= 1
	        if self.inblock:
	            self.writeline()
	        else:
	            self.writeendblock()

	    def start_p(self, attrs):
	        """Start a paragraph; inside a non-verbatim block nothing is written."""
	        if self.verbatim:
	            self.writeline()
	        elif not self.inblock:
	            self.writeline()

	    def end_p(self):
	        """End a paragraph, flushing it unless it sits inside a list block."""
	        if self.inblock:
	            if self.verbatim:
	                self.writeline()
	            else:
	                return
	        else:
	            self.linebuffer.lstrip()
	            self.writeline()

	    def start_li(self, attrs):
	        """Start a list item with the marker of the innermost open list."""
	        self.writeline()
	        # an <li> outside any list falls back to a bullet marker
	        self.data(self.lists[-1] if self.lists else '- ')

	    def end_li(self):
	        """Finish a list item, indenting continuation lines under its marker."""
	        self.flush_stringbuffer()
	        linebuf = self.linebuffer
	        if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['- ', '#.']:
	            start = 1
	        else:
	            # the start of the <li> has already been written, perhaps because
	            # there was a <pre> block
	            start = 0
	        # same fallback as start_li when no list is open
	        marker = self.lists[-1] if self.lists else '- '
	        self.linebuffer.indent(len(marker), start=start)
	        self.write()

	    def start_dl(self, attrs):
	        """Open a definition list."""
	        self.writeline()
	        self.inblock += 1
	        self.nobreak = True

	    def end_dl(self):
	        """Close a definition list."""
	        self.nobreak = False
	        self.writeline()
	        self.inblock -= 1

	    def start_dt(self, attrs):
	        """Definition terms are wrapped in colons."""
	        self.data(':')

	    def end_dt(self):
	        self.data(':')

	    def start_dd(self, attrs):
	        self.data(' ')

	    def end_dd(self):
	        """Flush a definition, indenting its continuation lines by two spaces."""
	        self.flush_stringbuffer()
	        self.linebuffer.indent(2, start=1)
	        self.writeline()

	    def start_em(self, attrs):
	        self.data(' *')

	    def end_em(self):
	        self.data('*')

	    def start_b(self, attrs):
	        self.data(' **')

	    def end_b(self):
	        self.data('**')

	    def start_code(self, attrs):
	        """Inline code is wrapped in single backquotes."""
	        self.data(' `')

	    def end_code(self):
	        self.data('`')

	    def start_span(self, attrs):
	        pass

	    def end_span(self):
	        pass

	    def start_body(self, attrs):
	        pass

	    def end_body(self):
	        """Treat the end of <body> like the end of a paragraph."""
	        self.end_p()
	try:
	    from BeautifulSoup import BeautifulSoup, NavigableString

	    # don't seem to need this anymore - issue fixed in latest BeautifulSoup presumably
	    class ShlurpUpYourShloup(BeautifulSoup):
	        '''preserve whitespace in <pre>'''
	        def endData(self, containerClass=NavigableString):
	            if self.currentData:
	                currentData = ''.join(self.currentData)
	                if not currentData.strip():
	                    # whitespace-only run: collapse to one newline if it
	                    # contains one, otherwise keep the same number of spaces
	                    if '\n' in currentData:
	                        currentData = '\n'
	                    else:
	                        # just changed the following line
	                        # original: currentData = ' '
	                        currentData = u' ' * len(currentData)
	                self.currentData = []
	                if self.parseOnlyThese and len(self.tagStack) <= 1 and \
	                    (not self.parseOnlyThese.text or \
	                        not self.parseOnlyThese.search(currentData)):
	                    return
	                o = containerClass(currentData)
	                o.setup(self.currentTag, self.previous)
	                if self.previous:
	                    self.previous.next = o
	                self.previous = o
	                self.currentTag.contents.append(o)

	except ImportError:
	    # signature restored to (*args, **kw); the asterisks had been lost
	    def ShlurpUpYourShloup(text, *args, **kw):
	        """Fallback when BeautifulSoup is not installed: return *text* unchanged."""
	        return text

	    # was the undefined name ``ShlurpUpYourSoup`` (NameError on import)
	    BeautifulSoup = ShlurpUpYourShloup

	def readsoup(fileobj, convert='html', encoding='utf8'):
	    """Read HTML from *fileobj* and return it after a pass through BeautifulSoup.

	    :param fileobj: an object with a ``read()`` method, or a path that is
	        opened in binary mode.
	    :param convert: passed to ``BeautifulSoup`` as ``convertEntities``.
	    :param encoding: passed to ``BeautifulSoup`` as ``fromEncoding``.
	    :returns: ``str()`` of the resulting ``BeautifulSoup`` object.
	    """
	    if hasattr(fileobj, 'read'):
	        text = fileobj.read()
	    else:
	        # close the file we opened ourselves
	        with open(fileobj, 'rb') as handle:
	            text = handle.read()
	    return str(BeautifulSoup(text, convertEntities=convert,
	                             fromEncoding=encoding))

	def html2rest(html, writer=sys.stdout):
	    """Feed *html* through a fresh Parser that writes to *writer*."""
	    parser = Parser(writer)
	    parser.feed(html)
	    # closing flushes whatever the parser still has buffered
	    parser.close()

	if __name__ == '__main__':
	    # Eg.
	    # python html2rest.py http://sphinx.pocoo.org/intro.html > intro.rst
	    #
	    # Input is taken from the first argument (URL or file path), or from
	    # stdin when no argument is given.
	    if sys.argv[1:]:
	        arg = sys.argv[1]
	        # accept https:// as well as http:// URLs
	        if arg.startswith(('http://', 'https://')):
	            import urllib
	            fileobj = urllib.urlopen(arg)
	        else:
	            fileobj = codecs.open(arg, 'rb', 'utf8')
	    else:
	        fileobj = sys.stdin
	    try:
	        html2rest(fileobj.read())  # alternatively: readsoup(fileobj)
	    finally:
	        fileobj.close()