# html2rest for CodeIgniter User Guide
#
# GitHub gist ounziw/1269027 by @ounziw, forked from kenjis/html2rest.py
# (created October 6, 2011).
#-----------------------------------------------------------------------------
# Copyright (c) 2006-2009 Gerard Flanagan
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#-----------------------------------------------------------------------------
import sys
import os
import re
import codecs
from sgmllib import SGMLParser
from StringIO import StringIO
from mbtextwrap import MBTextWrapper
# reST directive line written in front of a verbatim code block.
CODEBLOCK = '.. sourcecode:: php'
# Block-level tags (only referenced from commented-out code in this file).
BLOCKTAGS = ['div', 'blockquote']
# Tags for which character data is dropped instead of being converted.
IGNORETAGS = ['title', 'style', 'script']
# Underline character per heading level: UNDERLINES[level - 1].
UNDERLINES = ['#', '=', '~', '`', '+', ';']
# Fredrik Lundh, http://effbot.org/zone/re-sub.html
def unescape(text):
    """Replace HTML character references and named entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and known named entities
    (``&amp;``) are replaced by the character they stand for.  Anything that
    cannot be resolved -- an unknown entity name, a malformed number or a
    code point outside the supported range -- is left exactly as written.

    Returns the text with all resolvable references substituted.
    """
    try:
        # Python 2 spelling of the entity table and the code-point converter.
        from htmlentitydefs import name2codepoint
        to_char = unichr
    except ImportError:
        # Python 3 spelling of the same things.
        from html.entities import name2codepoint
        to_char = chr

    def fixup(match):
        ref = match.group(0)
        if ref[:2] == "&#":
            # numeric character reference, decimal or hexadecimal
            try:
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number / invalid code point.
                # OverflowError: number too large for the converter.
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
class LineBuffer(object):
    """A list of output lines with helpers for wrapping and indenting.

    Supports ``len()``, item access and item assignment on the stored
    lines, so an empty buffer is falsy.
    """

    def __init__(self):
        self._lines = []
        # Wrapper used by write() to break normalised text into lines.
        self._wrapper = MBTextWrapper(encoding='utf-8')

    def __len__(self):
        return len(self._lines)

    def __getitem__(self, i):
        return self._lines[i]

    def __setitem__(self, i, value):
        self._lines[i] = value

    def clear(self):
        """Remove all lines, keeping the same underlying list object."""
        del self._lines[:]

    def read(self):
        """Return the buffered lines joined with newlines."""
        return '\n'.join(self._lines)

    def write(self, s):
        """Collapse whitespace in *s*, wrap it and append the lines."""
        normalised = ' '.join(s.split())
        self._lines.extend(self._wrapper.wrap(normalised))

    def rawwrite(self, s):
        """Append the lines of *s* unchanged (no wrapping, no stripping)."""
        self._lines.extend(s.splitlines())

    def indent(self, numspaces=4, start=0):
        """Prefix every line from index *start* on with *numspaces* spaces.

        *start* is expected to be a non-negative index; nothing happens when
        it is past the end of the buffer.
        """
        pad = ' ' * numspaces
        self._lines[start:] = [pad + line for line in self._lines[start:]]

    def lstrip(self):
        """Strip leading whitespace from every buffered line."""
        self._lines[:] = [line.lstrip() for line in self._lines]
class Parser(SGMLParser):
    """HTML parser that writes a reStructuredText rendering to *writer*.

    Character data is collected in ``stringbuffer``, moved into
    ``linebuffer`` when a line is complete, and written to ``writer`` by
    ``flush()``.  ``SGMLParser`` dispatches tags to ``start_<tag>`` /
    ``end_<tag>`` methods when they exist and to ``unknown_starttag`` /
    ``unknown_endtag`` otherwise.
    """

    def __init__(self, writer=sys.stdout):
        SGMLParser.__init__(self)
        self.writer = writer
        self.stringbuffer = StringIO()  # text not yet moved to linebuffer
        self.linebuffer = LineBuffer()  # finished lines awaiting flush()
        self.verbatim = False           # True while inside a <pre> block
        self.lists = []                 # stack of item markers of open lists
        self.ignoredata = False         # True after an IGNORETAGS start tag
        self.inblock = 0                # nesting depth of open ul/ol/dl
        self.nobreak = False            # set inside <dl>; not read in this class
        self.link = None                # href of the <a> being processed

    def close(self):
        """Write out whatever is pending, then close the underlying parser."""
        self.writeline()
        SGMLParser.close(self)

    # -- buffering -----------------------------------------------------

    def flush(self):
        """Write the buffered lines to the writer and empty the buffer."""
        if self.linebuffer:
            if self.inblock > 1:
                # nested blocks are indented four spaces per extra level
                indent = 4 * (self.inblock - 1)
                self.linebuffer.indent(indent)
            self.writer.write(unescape(self.linebuffer.read()).encode(sys.getfilesystemencoding()))
            self.linebuffer.clear()

    def flush_stringbuffer(self):
        """Move pending text from the string buffer into the line buffer."""
        sbuf = self.stringbuffer.getvalue()
        if not sbuf:
            return
        elif self.linebuffer:
            # continue the last buffered line
            self.linebuffer[-1] += sbuf
        else:
            self.linebuffer.write(sbuf)
        self.clear_stringbuffer()

    def clear_stringbuffer(self):
        """Discard the contents of the string buffer."""
        self.stringbuffer.seek(0)
        self.stringbuffer.truncate()

    def data(self, text):
        """Append *text* to the string buffer."""
        self.stringbuffer.write(text)

    def pending(self):
        """Return a true value if any text is buffered but not yet written."""
        return self.stringbuffer.tell() or self.linebuffer

    # -- writing -------------------------------------------------------

    def write(self, text=''):
        """Flush all buffered text, then write *text* (entities unescaped)."""
        self.flush_stringbuffer()
        self.flush()
        self.writer.write(unescape(text))

    def writeline(self, text=''):
        """Like write(), followed by a newline."""
        self.write(text + '\n')

    def writestartblock(self, text=''):
        """Start a new block: separate it from pending text, write *text*."""
        if self.pending():
            self.writeline()
        self.writeline()
        self.writeline(text)

    def writeendblock(self, text=''):
        """Write *text* followed by an empty line."""
        self.writeline(text)
        self.writeline()

    def writeblock(self, text=''):
        """Start a block with *text* and follow it with an empty line."""
        self.writestartblock(text)
        self.writeline()

    # -- generic handlers ----------------------------------------------

    def _heading_level(self, tag):
        """Return the level of an h1..h6 tag name, or None for other tags.

        Only levels that have an entry in UNDERLINES count, so two-letter
        tags such as ``hr`` or ``h7`` are not mistaken for headings.
        """
        if len(tag) == 2 and tag[0] == 'h' and tag[1].isdigit():
            level = int(tag[1])
            if 1 <= level <= len(UNDERLINES):
                return level
        return None

    def handle_data(self, data):
        """Buffer character data; newlines are flattened unless verbatim."""
        if self.ignoredata:
            return
        elif self.verbatim:
            self.data(data)
        else:
            self.data(' '.join(data.splitlines()))

    def unknown_starttag(self, tag, attrs):
        """Handle start tags that have no dedicated ``start_<tag>`` method."""
        if tag in IGNORETAGS:
            self.ignoredata = True
        elif len(tag) == 2 and tag[0] == 'h':
            # headings h1..h6 (and any other two-letter h* tag, e.g. hr)
            # begin a new block
            self.writestartblock()
        elif tag == 'br':
            if self.verbatim:
                self.data('\n')
            elif not self.inblock:
                self.writeline()
            else:
                self.data(' ')
        elif not self.verbatim:
            self.data(' ')

    def unknown_endtag(self, tag):
        """Handle end tags that have no dedicated ``end_<tag>`` method.

        The end of a heading underlines the heading text with the character
        configured for its level in UNDERLINES.
        """
        self.ignoredata = False
        level = self._heading_level(tag)
        if level is not None:
            self.flush_stringbuffer()
            if self.linebuffer:
                linebuf = self.linebuffer
                linebuf[-1] = linebuf[-1].strip()
                char = UNDERLINES[level - 1]
                linebuf.write(char * len(linebuf[-1]))
            self.writeline()
        elif not self.verbatim:
            self.data(' ')

    # -- links ---------------------------------------------------------

    def start_a(self, attrs):
        """Open a link; anchors without href or with '#...' are skipped."""
        href = dict(attrs).get('href', None)
        if not href or href.startswith('#'):
            return
        self.data('`')
        self.link = href

    def end_a(self):
        """Close a link opened by start_a, appending its target."""
        if self.link:
            self.data(' <%s>`' % self.link)
        self.link = None

    # -- verbatim blocks -----------------------------------------------
    # NOTE(review): these two handlers were originally named
    # start_code/end_code and were silently replaced by the inline
    # start_code/end_code pair defined further down, leaving them
    # unreachable.  They are bound to <pre> here so the block-level
    # behaviour is usable again while <code> keeps its inline rendering;
    # confirm that <pre> is the intended tag.

    def start_pre(self, attrs):
        """Begin a verbatim block introduced by the CODEBLOCK directive."""
        if self.lists:
            self.end_li()
            self.writeline()
        self.verbatim = True
        self.writeblock(CODEBLOCK)

    def end_pre(self):
        """Write the collected verbatim text, indented by four spaces."""
        sbuf = self.stringbuffer.getvalue()
        if sbuf:
            self.linebuffer.rawwrite(sbuf)
            self.linebuffer.indent(4)
        self.clear_stringbuffer()
        self.writeendblock()
        self.verbatim = False

    # -- lists ---------------------------------------------------------

    def _start_list(self, marker):
        """Open a list whose items are introduced by *marker*."""
        if self.lists:
            # nested list: finish the text of the enclosing item first
            self.end_li()
        self.writeline()
        self.lists.append(marker)
        self.inblock += 1

    def _end_list(self):
        """Close the innermost open list; a stray end tag is ignored."""
        if not self.lists:
            return
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_ul(self, attrs):
        self._start_list('- ')

    def end_ul(self):
        self._end_list()

    def start_ol(self, attrs):
        self._start_list('#. ')

    def end_ol(self):
        self._end_list()

    def start_li(self, attrs):
        """Begin a list item with the marker of the innermost list."""
        self.writeline()
        # an <li> outside any list falls back to a bullet marker
        self.data(self.lists[-1] if self.lists else '- ')

    def end_li(self):
        """Finish a list item, indenting continuation lines under its text."""
        self.flush_stringbuffer()
        linebuf = self.linebuffer
        if self.lists:
            if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['- ', '#.']:
                # first line carries the marker; indent only the rest
                start = 1
            else:
                # the start of the <li> has already been written, perhaps
                # because there was a <pre> block
                start = 0
            linebuf.indent(len(self.lists[-1]), start=start)
        self.write()

    # -- paragraphs ----------------------------------------------------

    def start_p(self, attrs):
        if self.verbatim or not self.inblock:
            self.writeline()

    def end_p(self):
        if self.inblock:
            if self.verbatim:
                self.writeline()
            return
        self.linebuffer.lstrip()
        self.writeline()

    # -- definition lists ----------------------------------------------

    def start_dl(self, attrs):
        self.writeline()
        self.inblock += 1
        self.nobreak = True

    def end_dl(self):
        self.nobreak = False
        self.writeline()
        self.inblock -= 1

    def start_dt(self, attrs):
        self.data(':')

    def end_dt(self):
        self.data(':')

    def start_dd(self, attrs):
        self.data(' ')

    def end_dd(self):
        self.flush_stringbuffer()
        # continuation lines of the definition are indented two spaces
        self.linebuffer.indent(2, start=1)
        self.writeline()

    # -- inline markup -------------------------------------------------

    def start_em(self, attrs):
        self.data(' *')

    def end_em(self):
        self.data('*')

    def start_b(self, attrs):
        self.data(' **')

    def end_b(self):
        self.data('**')

    def start_code(self, attrs):
        self.data(' `')

    def end_code(self):
        self.data('`')

    def start_span(self, attrs):
        pass

    def end_span(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        self.end_p()
try:
    from BeautifulSoup import BeautifulSoup, NavigableString

    # don't seem to need this anymore - issue fixed in latest BeautifulSoup
    # presumably
    class ShlurpUpYourShloup(BeautifulSoup):
        '''preserve whitespace in <pre>'''

        def endData(self, containerClass=NavigableString):
            if self.currentData:
                currentData = ''.join(self.currentData)
                if not currentData.strip():
                    if '\n' in currentData:
                        currentData = '\n'
                    else:
                        # keep the run of whitespace at its original length
                        # (upstream collapses it to a single space)
                        currentData = u' ' * len(currentData)
                self.currentData = []
                if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                        (not self.parseOnlyThese.text or
                         not self.parseOnlyThese.search(currentData)):
                    return
                o = containerClass(currentData)
                o.setup(self.currentTag, self.previous)
                if self.previous:
                    self.previous.next = o
                self.previous = o
                self.currentTag.contents.append(o)
except ImportError:
    def ShlurpUpYourShloup(text, *args, **kw):
        """Stand-in used when BeautifulSoup is missing: return *text* as is."""
        return text

    # Was `ShlurpUpYourSoup`, a name that is defined nowhere, so this path
    # raised NameError instead of installing the pass-through stand-in.
    BeautifulSoup = ShlurpUpYourShloup
def readsoup(fileobj, convert='html', encoding='utf8'):
    """Read HTML and return it as a string after a pass through BeautifulSoup.

    *fileobj* is either an object with a ``read()`` method or a path that is
    opened in binary mode.  *convert* and *encoding* are handed to
    ``BeautifulSoup`` as ``convertEntities`` and ``fromEncoding``.
    """
    if hasattr(fileobj, 'read'):
        text = fileobj.read()
    else:
        # A path was given: make sure the handle is closed again.
        with open(fileobj, 'rb') as handle:
            text = handle.read()
    return str(BeautifulSoup(text, convertEntities=convert,
                             fromEncoding=encoding))
def html2rest(html, writer=sys.stdout):
    """Convert the *html* text to reStructuredText written to *writer*.

    A fresh ``Parser`` bound to *writer* is fed the whole document and then
    closed so that buffered output is written.
    """
    converter = Parser(writer)
    converter.feed(html)
    converter.close()
if __name__ == '__main__':
    # Eg.
    #   python html2rest.py http://sphinx.pocoo.org/intro.html > intro.rst
    #
    # The single optional argument is a URL or a path to a UTF-8 encoded
    # file; without it the HTML is read from standard input.
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith(('http://', 'https://')):
            # https:// URLs used to fall through and be opened as a file
            import urllib
            fileobj = urllib.urlopen(arg)
        else:
            fileobj = codecs.open(arg, 'rb', 'utf8')
    else:
        fileobj = sys.stdin
    try:
        # readsoup(fileobj) can be used here to pre-clean the markup
        html2rest(fileobj.read())
    finally:
        fileobj.close()
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")