Created
September 27, 2011 16:42
-
-
Save btoews/1245568 to your computer and use it in GitHub Desktop.
prettyHTMLizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from HTMLParser import HTMLParser | |
# Maximum number of characters allowed on a single line of the pretty output.
PRETTY_LINE_WIDTH = 120

# Line-ending convention: "unix" selects LF; any other value (e.g. "win")
# selects CRLF.
LINEBREAK_PREFERENCE = "unix"

# Concrete line terminator derived from LINEBREAK_PREFERENCE.
LINEBREAK = "\n" if LINEBREAK_PREFERENCE == "unix" else "\r\n"
#DECORATORS | |
def format(fn):
    """Decorate ``fn`` so that every argument is normalised before the call.

    Each positional and keyword argument is passed through
    ``line_breaks_recurse`` and then ``line_width_recurse``. ``fn`` is then
    invoked with the processed values and its result is returned unchanged.

    NOTE: the name shadows the built-in ``format``; it is kept because the
    rest of the module refers to the decorator by this name.
    """
    import functools  # local import keeps this block self-contained

    def _normalise(value):
        # Line endings are unified first so the width pass splits on the
        # right terminator.
        return line_width_recurse(line_breaks_recurse(value))

    @functools.wraps(fn)  # preserve fn's __name__/__doc__ on the wrapper
    def replace(*args, **kwargs):
        pass_args = [_normalise(arg) for arg in args]
        pass_kwargs = {key: _normalise(value) for key, value in kwargs.items()}
        return fn(*pass_args, **pass_kwargs)

    return replace
def line_breaks_recurse(obj, preference=None):
    """Return a copy of ``obj`` with line endings converted to one convention.

    Strings are converted directly; lists and dicts are rebuilt with every
    element / value converted recursively (dict keys are left untouched).
    Any other object is returned as-is.

    Args:
        obj: a string, list, dict, or arbitrary object.
        preference: ``"win"`` converts every bare LF to CRLF (existing CRLF
            pairs are kept), ``"unix"`` converts every CRLF to LF, any other
            value leaves strings unchanged. Defaults to the module-level
            ``LINEBREAK_PREFERENCE``.

    Returns:
        The converted string, a new list/dict, or ``obj`` itself.
    """
    if preference is None:
        preference = LINEBREAK_PREFERENCE

    if isinstance(obj, str):
        if preference == "win":
            # Collapse CRLF to LF first so existing pairs are not doubled,
            # then expand every LF; a lone CR is left alone.
            return obj.replace("\r\n", "\n").replace("\n", "\r\n")
        if preference == "unix":
            return obj.replace("\r\n", "\n")
        return obj

    if isinstance(obj, list):
        # Build the result once, outside the loop, so every element is kept
        # and an empty list yields an empty list.
        return [line_breaks_recurse(item, preference) for item in obj]

    if isinstance(obj, dict):
        return {
            key: line_breaks_recurse(value, preference)
            for key, value in obj.items()
        }

    return obj
def _wrap_line(line, width, linebreak):
    """Hard-wrap one line (no embedded breaks) at ``width`` characters.

    Continuation lines are prefixed with the indentation (leading tabs and
    spaces) of the line being broken. If that indentation is itself at least
    ``width`` characters wide no progress is possible, so the remaining text
    is emitted unwrapped.
    """
    pieces = []
    indent = ""  # whitespace carried in front of the not-yet-emitted text
    start = 0    # offset in ``line`` where the not-yet-emitted text begins
    while len(indent) + len(line) - start > width:
        # Indentation of the pending text: the carried prefix plus whatever
        # whitespace the remaining text itself starts with.
        stop = start
        while stop < len(line) and line[stop] in "\t ":
            stop += 1
        next_indent = indent + line[start:stop]
        if len(next_indent) >= width:
            break  # indentation fills the whole line; wrapping cannot shrink it
        take = width - len(indent)
        pieces.append(indent + line[start:start + take])
        start += take
        indent = next_indent
    pieces.append(indent + line[start:])
    return linebreak.join(pieces)


def line_width_recurse(obj, width=None, linebreak=None):
    """Return a copy of ``obj`` in which no text line exceeds ``width``.

    Strings are first split on ``linebreak``; each resulting line longer than
    ``width`` is broken every ``width`` characters, with continuation lines
    keeping the indentation of the line they came from. Lists and dicts are
    rebuilt with every element / value processed recursively (dict keys are
    left untouched). Any other object is returned as-is.

    Args:
        obj: a string, list, dict, or arbitrary object.
        width: maximum line length; defaults to ``PRETTY_LINE_WIDTH``.
        linebreak: line terminator used for splitting and joining; defaults
            to ``LINEBREAK``.

    Returns:
        The wrapped string, a new list/dict, or ``obj`` itself.
    """
    if width is None:
        width = PRETTY_LINE_WIDTH
    if linebreak is None:
        linebreak = LINEBREAK

    if isinstance(obj, str):
        # Respect existing breaks first, then wrap each line on its own.
        if linebreak in obj:
            return linebreak.join(
                _wrap_line(line, width, linebreak)
                for line in obj.split(linebreak)
            )
        return _wrap_line(obj, width, linebreak)

    # TODO maybe support other iterables (anything with __iter__), not just
    # lists and dicts.
    if isinstance(obj, list):
        return [line_width_recurse(item, width, linebreak) for item in obj]

    if isinstance(obj, dict):
        return {
            key: line_width_recurse(value, width, linebreak)
            for key, value in obj.items()
        }

    # Neither text nor a supported container: hand it back unmodified.
    return obj
class prettyHTMLizer(HTMLParser):
    """HTML parser that re-emits its input one tag / text node per line.

    Feed markup with ``feed()``; every start tag, end tag, self-closing tag
    and text node is appended to ``pretty_output`` on its own line, indented
    with one tab per currently open tag. ``output()`` returns the accumulated
    text after it has been passed through the module's ``format`` decorator.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Accumulated pretty-printed markup.
        self.pretty_output = ""
        # Current nesting depth, i.e. number of tabs to indent with.
        self.tab_count = 0

    def handle_starttag(self, tag, attrs):
        """Emit an opening tag, then indent everything that follows it."""
        self.pretty_output += self.__html_start_tag(tag, attrs)
        self.tab_count += 1

    def handle_endtag(self, tag):
        """Dedent (never below zero), then emit the closing tag."""
        self.tab_count = max(0, self.tab_count - 1)
        self.pretty_output += self.__html_end_tag(tag)

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag; the nesting depth is unchanged."""
        self.pretty_output += self.__html_startend_tag(tag, attrs)

    def handle_data(self, data):
        """Emit a text node at the current indentation."""
        self.pretty_output += self.__html_data(data)

    def output(self):
        """Return the accumulated output after the ``format`` post-processing."""
        return self.__formatting(self.pretty_output)

    @format
    def __formatting(self, string):
        # Identity function: the work is done by the decorator, which
        # processes the arguments before this body runs.
        return string

    def __html_start_tag(self, tag, attrs):
        return "\t"*self.tab_count + '<%s%s>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

    def __html_startend_tag(self, tag, attrs):
        return "\t"*self.tab_count + '<%s%s/>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

    def __html_end_tag(self, tag):
        return "\t"*self.tab_count + '</%s>' % (tag.lower()) + LINEBREAK

    def __html_data(self, data):
        return "\t"*self.tab_count + data + LINEBREAK

    def __html_attrs(self, attrs):
        """Render attributes as a string with a leading space, or ``''``.

        ``attrs`` may be the list of ``(name, value)`` pairs that HTMLParser
        passes to its handlers, or a dict. Duplicate names collapse to the
        last occurrence. An attribute whose value is ``None`` (written
        without ``=`` in the source) is rendered as the bare name.
        """
        if not attrs:
            return ''
        rendered = []
        # dict() accepts both a mapping and a list of pairs, so start tags
        # and self-closing tags share one code path.
        for name, value in dict(attrs).items():
            if value is None:
                rendered.append(name.lower())
            else:
                # NOTE(review): values are lower-cased as before; this alters
                # case-sensitive values such as URLs - confirm it is intended.
                rendered.append('%s="%s"' % (name.lower(), value.lower()))
        return ' %s' % ' '.join(rendered)
if __name__ == "__main__":
    # Demo: mixed-case tags, one text node far longer than the line limit,
    # and a self-closing tag.
    ugly = "<htML><b>hello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello world</b></html><foobar/>"
    parser = prettyHTMLizer()
    parser.feed(ugly)
    # Flush anything the parser is still buffering (e.g. trailing text).
    parser.close()
    pretty = parser.output()
    # Call form prints identically on Python 2 and is valid on Python 3.
    print(pretty)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment