# btoews/pretty_htmlizer.py

## pretty_htmlizer.py
from HTMLParser import HTMLParser

# Longest line (in characters) allowed in the output before it is wrapped.
PRETTY_LINE_WIDTH = 120
# Line-ending convention of the output: "unix" (\n) or "win" (\r\n).
LINEBREAK_PREFERENCE = "unix"
# Terminator derived from LINEBREAK_PREFERENCE; anything but "unix" selects \r\n.
LINEBREAK = "\n" if LINEBREAK_PREFERENCE == "unix" else "\r\n"

#DECORATORS
def format(fn):
  """Decorator that normalises every argument of ``fn`` before calling it.

  Each positional and keyword argument is passed through
  ``line_breaks_recurse`` and then ``line_width_recurse``; ``fn`` is called
  with the results and its return value is handed back untouched.

  The name shadows the builtin ``format`` inside this module; it is kept
  because existing code decorates with ``@format``.

  :param fn: the callable to wrap.
  :returns: a wrapper that keeps ``fn``'s name and docstring.
  """
  import functools  # local import so the module's import block stays untouched

  def normalise(value):
    # Line endings first, so the width pass splits on the final terminator.
    return line_width_recurse(line_breaks_recurse(value))

  @functools.wraps(fn)
  def replace(*args, **kwargs):
    pass_args = [normalise(arg) for arg in args]
    pass_kwargs = {key: normalise(value) for key, value in kwargs.items()}
    return fn(*pass_args, **pass_kwargs)
  return replace

def line_breaks_recurse(obj, preference=None):
  """Return ``obj`` with the line endings of every string normalised.

  Strings are converted to the requested convention. Lists and dicts are
  rebuilt with each element / value processed recursively (dict keys are
  left alone). Any other object is returned as is.

  :param obj: a string, list, dict, or any other object.
  :param preference: "win" turns every bare LF into CRLF, "unix" turns every
    CRLF into LF, any other value leaves strings unchanged. Defaults to the
    module-level ``LINEBREAK_PREFERENCE``.
  :returns: the converted string, the rebuilt container, or ``obj`` itself.
  """
  if isinstance(obj, str):
    if preference is None:
      preference = LINEBREAK_PREFERENCE
    if preference == "win":
      # Collapse existing CRLF pairs first so they are not doubled.
      return obj.replace("\r\n", "\n").replace("\n", "\r\n")
    if preference == "unix":
      return obj.replace("\r\n", "\n")
    return obj
  if isinstance(obj, list):
    return [line_breaks_recurse(item, preference) for item in obj]
  if isinstance(obj, dict):
    return {key: line_breaks_recurse(value, preference) for key, value in obj.items()}
  return obj

def _wrap_line(line, width, linebreak):
  """Wrap one line (containing no ``linebreak``) into pieces of at most ``width`` characters.

  Each continuation piece is prefixed with the leading tabs/spaces of the
  text it was cut from, so a broken line keeps its indentation.
  """
  if len(line) <= width:
    return line
  if width < 1:
    raise ValueError("width must be a positive integer")
  pieces = []
  remaining = line
  while len(remaining) > width:
    indent = remaining[:len(remaining) - len(remaining.lstrip("\t "))]
    if len(indent) >= width:
      # An indent as wide as the line leaves no room for content and the
      # text would never get shorter; continue without indentation instead.
      indent = ""
    pieces.append(remaining[:width])
    remaining = indent + remaining[width:]
  pieces.append(remaining)
  return linebreak.join(pieces)

def line_width_recurse(obj, width=None, linebreak=None):
  """Return ``obj`` with every over-long line of every string wrapped.

  A string is split on ``linebreak``, each resulting line is wrapped to
  ``width`` characters, and the lines are joined again. Lists and dicts are
  rebuilt with each element / value processed recursively (dict keys are
  left alone). Any other object is returned as is.

  :param obj: a string, list, dict, or any other object.
  :param width: maximum line length; defaults to ``PRETTY_LINE_WIDTH``.
  :param linebreak: line terminator; defaults to ``LINEBREAK``.
  :returns: the wrapped string, the rebuilt container, or ``obj`` itself.
  :raises ValueError: if a line needs wrapping and ``width`` is below 1.
  """
  if isinstance(obj, str):
    if width is None:
      width = PRETTY_LINE_WIDTH
    if linebreak is None:
      linebreak = LINEBREAK
    # Existing line breaks are honoured first; only then is each line wrapped.
    return linebreak.join(_wrap_line(part, width, linebreak) for part in obj.split(linebreak))
  # TODO maybe check for an __iter__ method to handle iterables other than dicts and lists
  if isinstance(obj, list):
    return [line_width_recurse(item, width, linebreak) for item in obj]
  if isinstance(obj, dict):
    return {key: line_width_recurse(value, width, linebreak) for key, value in obj.items()}
  return obj

class prettyHTMLizer(HTMLParser):
  """HTML parser that re-emits the markup it is fed, one node per line.

  Every start tag, end tag, self-closing tag and text run is appended to
  ``pretty_output`` on its own line, prefixed with one tab per currently
  open element. Tag and attribute names are lower-cased.

  Usage: ``feed()`` the markup, then read the result from ``output()``.
  """

  def __init__(self):
    HTMLParser.__init__(self)
    # Markup emitted so far; output() returns it after formatting.
    self.pretty_output = ""
    # Current nesting depth, i.e. number of tabs in front of each line.
    self.tab_count = 0

  def handle_starttag(self, tag, attrs):
    """Emit an opening tag, then indent what follows one level deeper."""
    self.pretty_output += self.__html_start_tag(tag, attrs)
    self.tab_count += 1

  def handle_endtag(self, tag):
    """Dedent one level and emit the closing tag."""
    # Stray end tags must not push the depth below zero.
    self.tab_count = max(self.tab_count - 1, 0)
    self.pretty_output += self.__html_end_tag(tag)

  def handle_startendtag(self, tag, attrs):
    """Emit a self-closing tag; the nesting depth is unchanged."""
    self.pretty_output += self.__html_startend_tag(tag, attrs)

  def handle_data(self, data):
    """Emit a text run at the current depth."""
    self.pretty_output += self.__html_data(data)

  def output(self):
    """Return the accumulated markup after it went through the ``format`` decorator."""
    return self.__formatting(self.pretty_output)

  @format
  def __formatting(self, string):
    # Identity on purpose: the decorator does the work on the arguments.
    return string

  def __html_start_tag(self, tag, attrs):
    return "\t"*self.tab_count + '<%s%s>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

  def __html_startend_tag(self, tag, attrs):
    return "\t"*self.tab_count + '<%s%s/>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

  def __html_end_tag(self, tag):
    return "\t"*self.tab_count + '</%s>' % (tag.lower()) + LINEBREAK

  def __html_data(self, data):
    return "\t"*self.tab_count + data + LINEBREAK

  def __html_attrs(self, attrs):
    """Render attributes as space-separated ``name="value"`` items with a leading space.

    :param attrs: list of ``(name, value)`` pairs, or a dict of them.
    :returns: the rendered attributes, or '' when there are none.
    """
    if not attrs:
      return ''
    pairs = attrs.items() if isinstance(attrs, dict) else attrs
    rendered = []
    for name, value in pairs:
      if value is None:
        # Valueless attribute (e.g. <input disabled>): emit the bare name.
        rendered.append(name.lower())
      else:
        # NOTE(review): values are lower-cased as before; this alters
        # case-sensitive values such as URLs - confirm it is intended.
        rendered.append('%s="%s"' % (name.lower(), value.lower()))
    return ' ' + ' '.join(rendered)

if __name__ == "__main__":
  ugly = "<htML><b>hello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello world</b></html><foobar/>"
  parser = prettyHTMLizer()
  parser.feed(ugly)
  pretty = parser.output()
  print pretty
	from HTMLParser import HTMLParser

	# Longest line (in characters) allowed in the output before it is wrapped.
	PRETTY_LINE_WIDTH = 120
	# Line-ending convention of the output: "unix" (\n) or "win" (\r\n).
	LINEBREAK_PREFERENCE = "unix"
	# Terminator derived from LINEBREAK_PREFERENCE; anything but "unix" selects \r\n.
	LINEBREAK = "\n" if LINEBREAK_PREFERENCE == "unix" else "\r\n"

	#DECORATORS
	def format(fn):
	  """Decorator that normalises every argument of ``fn`` before calling it.

	  Each positional and keyword argument is passed through
	  ``line_breaks_recurse`` and then ``line_width_recurse``; ``fn`` is called
	  with the results and its return value is handed back untouched.

	  The name shadows the builtin ``format`` inside this module; it is kept
	  because existing code decorates with ``@format``.

	  :param fn: the callable to wrap.
	  :returns: a wrapper that keeps ``fn``'s name and docstring.
	  """
	  import functools  # local import so the module's import block stays untouched

	  def normalise(value):
	    # Line endings first, so the width pass splits on the final terminator.
	    return line_width_recurse(line_breaks_recurse(value))

	  @functools.wraps(fn)
	  def replace(*args, **kwargs):
	    pass_args = [normalise(arg) for arg in args]
	    pass_kwargs = {key: normalise(value) for key, value in kwargs.items()}
	    return fn(*pass_args, **pass_kwargs)
	  return replace

	def line_breaks_recurse(obj, preference=None):
	  """Return ``obj`` with the line endings of every string normalised.

	  Strings are converted to the requested convention. Lists and dicts are
	  rebuilt with each element / value processed recursively (dict keys are
	  left alone). Any other object is returned as is.

	  :param obj: a string, list, dict, or any other object.
	  :param preference: "win" turns every bare LF into CRLF, "unix" turns every
	    CRLF into LF, any other value leaves strings unchanged. Defaults to the
	    module-level ``LINEBREAK_PREFERENCE``.
	  :returns: the converted string, the rebuilt container, or ``obj`` itself.
	  """
	  if isinstance(obj, str):
	    if preference is None:
	      preference = LINEBREAK_PREFERENCE
	    if preference == "win":
	      # Collapse existing CRLF pairs first so they are not doubled.
	      return obj.replace("\r\n", "\n").replace("\n", "\r\n")
	    if preference == "unix":
	      return obj.replace("\r\n", "\n")
	    return obj
	  if isinstance(obj, list):
	    return [line_breaks_recurse(item, preference) for item in obj]
	  if isinstance(obj, dict):
	    return {key: line_breaks_recurse(value, preference) for key, value in obj.items()}
	  return obj

	def _wrap_line(line, width, linebreak):
	  """Wrap one line (containing no ``linebreak``) into pieces of at most ``width`` characters.

	  Each continuation piece is prefixed with the leading tabs/spaces of the
	  text it was cut from, so a broken line keeps its indentation.
	  """
	  if len(line) <= width:
	    return line
	  if width < 1:
	    raise ValueError("width must be a positive integer")
	  pieces = []
	  remaining = line
	  while len(remaining) > width:
	    indent = remaining[:len(remaining) - len(remaining.lstrip("\t "))]
	    if len(indent) >= width:
	      # An indent as wide as the line leaves no room for content and the
	      # text would never get shorter; continue without indentation instead.
	      indent = ""
	    pieces.append(remaining[:width])
	    remaining = indent + remaining[width:]
	  pieces.append(remaining)
	  return linebreak.join(pieces)

	def line_width_recurse(obj, width=None, linebreak=None):
	  """Return ``obj`` with every over-long line of every string wrapped.

	  A string is split on ``linebreak``, each resulting line is wrapped to
	  ``width`` characters, and the lines are joined again. Lists and dicts are
	  rebuilt with each element / value processed recursively (dict keys are
	  left alone). Any other object is returned as is.

	  :param obj: a string, list, dict, or any other object.
	  :param width: maximum line length; defaults to ``PRETTY_LINE_WIDTH``.
	  :param linebreak: line terminator; defaults to ``LINEBREAK``.
	  :returns: the wrapped string, the rebuilt container, or ``obj`` itself.
	  :raises ValueError: if a line needs wrapping and ``width`` is below 1.
	  """
	  if isinstance(obj, str):
	    if width is None:
	      width = PRETTY_LINE_WIDTH
	    if linebreak is None:
	      linebreak = LINEBREAK
	    # Existing line breaks are honoured first; only then is each line wrapped.
	    return linebreak.join(_wrap_line(part, width, linebreak) for part in obj.split(linebreak))
	  # TODO maybe check for an __iter__ method to handle iterables other than dicts and lists
	  if isinstance(obj, list):
	    return [line_width_recurse(item, width, linebreak) for item in obj]
	  if isinstance(obj, dict):
	    return {key: line_width_recurse(value, width, linebreak) for key, value in obj.items()}
	  return obj

	class prettyHTMLizer(HTMLParser):
	  """HTML parser that re-emits the markup it is fed, one node per line.

	  Every start tag, end tag, self-closing tag and text run is appended to
	  ``pretty_output`` on its own line, prefixed with one tab per currently
	  open element. Tag and attribute names are lower-cased.

	  Usage: ``feed()`` the markup, then read the result from ``output()``.
	  """

	  def __init__(self):
	    HTMLParser.__init__(self)
	    # Markup emitted so far; output() returns it after formatting.
	    self.pretty_output = ""
	    # Current nesting depth, i.e. number of tabs in front of each line.
	    self.tab_count = 0

	  def handle_starttag(self, tag, attrs):
	    """Emit an opening tag, then indent what follows one level deeper."""
	    self.pretty_output += self.__html_start_tag(tag, attrs)
	    self.tab_count += 1

	  def handle_endtag(self, tag):
	    """Dedent one level and emit the closing tag."""
	    # Stray end tags must not push the depth below zero.
	    self.tab_count = max(self.tab_count - 1, 0)
	    self.pretty_output += self.__html_end_tag(tag)

	  def handle_startendtag(self, tag, attrs):
	    """Emit a self-closing tag; the nesting depth is unchanged."""
	    self.pretty_output += self.__html_startend_tag(tag, attrs)

	  def handle_data(self, data):
	    """Emit a text run at the current depth."""
	    self.pretty_output += self.__html_data(data)

	  def output(self):
	    """Return the accumulated markup after it went through the ``format`` decorator."""
	    return self.__formatting(self.pretty_output)

	  @format
	  def __formatting(self, string):
	    # Identity on purpose: the decorator does the work on the arguments.
	    return string

	  def __html_start_tag(self, tag, attrs):
	    return "\t"*self.tab_count + '<%s%s>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

	  def __html_startend_tag(self, tag, attrs):
	    return "\t"*self.tab_count + '<%s%s/>' % (tag.lower(), self.__html_attrs(attrs)) + LINEBREAK

	  def __html_end_tag(self, tag):
	    return "\t"*self.tab_count + '</%s>' % (tag.lower()) + LINEBREAK

	  def __html_data(self, data):
	    return "\t"*self.tab_count + data + LINEBREAK

	  def __html_attrs(self, attrs):
	    """Render attributes as space-separated ``name="value"`` items with a leading space.

	    :param attrs: list of ``(name, value)`` pairs, or a dict of them.
	    :returns: the rendered attributes, or '' when there are none.
	    """
	    if not attrs:
	      return ''
	    pairs = attrs.items() if isinstance(attrs, dict) else attrs
	    rendered = []
	    for name, value in pairs:
	      if value is None:
	        # Valueless attribute (e.g. <input disabled>): emit the bare name.
	        rendered.append(name.lower())
	      else:
	        # NOTE(review): values are lower-cased as before; this alters
	        # case-sensitive values such as URLs - confirm it is intended.
	        rendered.append('%s="%s"' % (name.lower(), value.lower()))
	    return ' ' + ' '.join(rendered)

	if __name__ == "__main__":
	ugly = "<htML><b>hello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello world</b></html><foobar/>"
	parser = prettyHTMLizer()
	parser.feed(ugly)
	pretty = parser.output()
	print pretty