Skip to content

Instantly share code, notes, and snippets.

@btoews
Created September 27, 2011 16:42
Show Gist options
  • Save btoews/1245568 to your computer and use it in GitHub Desktop.
Save btoews/1245568 to your computer and use it in GitHub Desktop.
prettyHTMLizer
import functools

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name
# Maximum number of characters allowed on a single line of formatted output.
PRETTY_LINE_WIDTH = 120

# Line-ending style: "unix" selects "\n"; any other value (conventionally
# "win") selects "\r\n".
LINEBREAK_PREFERENCE = "unix"

# Concrete line terminator derived from LINEBREAK_PREFERENCE.
LINEBREAK = "\n" if LINEBREAK_PREFERENCE == "unix" else "\r\n"
#DECORATORS
def format(fn):
    """Decorate *fn* so that every argument is normalized before the call.

    Each positional and keyword argument is passed through
    ``line_breaks_recurse`` and then ``line_width_recurse``; *fn* is then
    called with the normalized values and its result is returned unchanged.

    NOTE: the name shadows the ``format`` builtin inside this module; it is
    kept because existing code applies the decorator under this name.
    """
    @functools.wraps(fn)  # keep fn's __name__/__doc__ on the wrapper
    def replace(*args, **kwargs):
        pass_args = [
            line_width_recurse(line_breaks_recurse(arg)) for arg in args
        ]
        pass_kwargs = {
            key: line_width_recurse(line_breaks_recurse(value))
            for key, value in kwargs.items()
        }
        return fn(*pass_args, **pass_kwargs)
    return replace
def line_breaks_recurse(obj, preference=None):
    """Return a copy of *obj* with line endings normalized in every string.

    Strings are converted directly; lists and dicts are rebuilt with each
    element / value converted recursively (dict keys are left untouched).
    Any other object is returned as-is.

    obj        -- a string, list, dict, or arbitrary object.
    preference -- "unix" converts "\\r\\n" to "\\n"; "win" converts every
                  bare "\\n" to "\\r\\n" while leaving existing "\\r\\n"
                  pairs alone. Any other value leaves strings unchanged.
                  Defaults to the module-level LINEBREAK_PREFERENCE.
    """
    if preference is None:
        preference = LINEBREAK_PREFERENCE

    if isinstance(obj, str):
        if preference == "win":
            # Collapse to "\n" first so existing "\r\n" pairs are not doubled.
            return obj.replace("\r\n", "\n").replace("\n", "\r\n")
        if preference == "unix":
            return obj.replace("\r\n", "\n")
        return obj
    if isinstance(obj, list):
        return [line_breaks_recurse(item, preference) for item in obj]
    if isinstance(obj, dict):
        return {
            key: line_breaks_recurse(value, preference)
            for key, value in obj.items()
        }
    # Neither a string nor a supported container: pass through unmodified.
    return obj
def line_width_recurse(obj, width=None, linebreak=None):
    """Return a copy of *obj* with every string hard-wrapped to *width*.

    A string is first split on *linebreak*; each resulting line longer than
    *width* is cut into chunks of at most *width* characters, and every
    continuation chunk is prefixed with the line's own leading tabs/spaces
    so wrapped text stays at the same indentation. Lists and dicts are
    rebuilt with each element / value wrapped recursively (dict keys are
    left untouched). Any other object is returned as-is.

    obj       -- a string, list, dict, or arbitrary object.
    width     -- maximum line length; defaults to PRETTY_LINE_WIDTH.
    linebreak -- line terminator used for splitting and joining; defaults
                 to LINEBREAK.

    Raises ValueError if *width* is smaller than 1.
    """
    if width is None:
        width = PRETTY_LINE_WIDTH
    if linebreak is None:
        linebreak = LINEBREAK
    if width < 1:
        raise ValueError("width must be at least 1, got %r" % (width,))

    if isinstance(obj, str):
        # Existing line breaks are honoured: each line is wrapped on its own.
        return linebreak.join(
            _wrap_line(line, width, linebreak) for line in obj.split(linebreak)
        )
    if isinstance(obj, list):
        return [line_width_recurse(item, width, linebreak) for item in obj]
    if isinstance(obj, dict):
        return {
            key: line_width_recurse(value, width, linebreak)
            for key, value in obj.items()
        }
    # Neither a string nor a supported container: pass through unmodified.
    return obj


def _wrap_line(line, width, linebreak):
    """Wrap one linebreak-free *line* to *width*, repeating its indentation."""
    if len(line) <= width:
        return line

    indent = line[:len(line) - len(line.lstrip("\t "))]
    if len(indent) >= width:
        # The indentation alone fills a whole line, so repeating it would
        # leave no room for content and wrapping could never finish.
        indent = ""

    # Continuation lines hold the indent plus `step` characters of content.
    step = width - len(indent)
    pieces = [line[:width]]
    for start in range(width, len(line), step):
        pieces.append(indent + line[start:start + step])
    return linebreak.join(pieces)
class prettyHTMLizer(HTMLParser):
    """HTML parser that re-emits its input one item per line, tab-indented.

    Feed markup with ``feed()``; every start tag, end tag, self-closing tag
    and text run is appended to ``pretty_output`` on its own line, indented
    with one tab per currently open element. ``output()`` returns the
    accumulated text after it has been passed through the ``format``
    decorator.

    Tag and attribute names are lower-cased. Attribute values keep their
    original case, with ``&`` and ``"`` re-escaped so the emitted markup
    stays well formed.
    """

    # Elements that never take an end tag; a start tag for one of these
    # must not deepen the indentation, since nothing would ever undo it.
    _VOID_ELEMENTS = frozenset([
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        self.pretty_output = ""  # formatted markup accumulated so far
        self.tab_count = 0       # current indentation depth, in tabs

    def handle_starttag(self, tag, attrs):
        """Emit the start tag, then indent whatever follows it."""
        self.pretty_output += self.__html_start_tag(tag, attrs)
        if tag.lower() not in self._VOID_ELEMENTS:
            self.tab_count += 1

    def handle_endtag(self, tag):
        """Dedent (never below zero), then emit the end tag."""
        if self.tab_count > 0:
            self.tab_count -= 1
        self.pretty_output += self.__html_end_tag(tag)

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag; the indentation depth is unchanged."""
        self.pretty_output += self.__html_startend_tag(tag, attrs)

    def handle_data(self, data):
        """Emit a text run on its own line at the current indentation."""
        self.pretty_output += self.__html_data(data)

    def output(self):
        """Return everything emitted so far, run through ``format``."""
        return self.__formatting(self.pretty_output)

    @format
    def __formatting(self, string):
        # Identity function: the work is done by the decorator, which
        # normalizes the arguments before this body runs.
        return string

    def __html_start_tag(self, tag, attrs):
        return "\t" * self.tab_count + '<%s%s>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

    def __html_startend_tag(self, tag, attrs):
        return "\t" * self.tab_count + '<%s%s/>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

    def __html_end_tag(self, tag):
        return "\t" * self.tab_count + '</%s>' % (tag.lower()) + LINEBREAK

    def __html_data(self, data):
        return "\t" * self.tab_count + data + LINEBREAK

    def __html_attrs(self, attrs):
        """Render *attrs* as a string with a leading space, or "" if empty.

        *attrs* may be the list of ``(name, value)`` pairs supplied by
        HTMLParser or a dict. Iterating the pairs directly preserves the
        attribute order of the source markup.
        """
        pairs = attrs.items() if isinstance(attrs, dict) else attrs
        rendered = []
        for name, value in pairs:
            name = name.lower()
            if value is None:
                # Valueless attribute such as <input disabled>.
                rendered.append(name)
            else:
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                rendered.append('%s="%s"' % (name, escaped))
        if not rendered:
            return ''
        return ' ' + ' '.join(rendered)
if __name__ == "__main__":
    # Demo: prettify a one-line document containing an over-long text run,
    # mixed-case tag names and a self-closing tag.
    ugly = "<htML><b>hello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello world</b></html><foobar/>"
    parser = prettyHTMLizer()
    parser.feed(ugly)
    pretty = parser.output()
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(pretty)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment