# obeattie/fragmentsoup.py

## fragmentsoup.py
from bs4 import BeautifulSoup
from bs4.builder._lxml import LXML as BS_LXML_FEATURE

class FragmentSoup(BeautifulSoup):
    """A BeautifulSoup subclass that represents HTML fragments faithfully under lxml.

    When the lxml HTML builder is in use and the markup is not already a full
    document (it does not start with ``<html`` or ``<!doctype``), the markup is
    wrapped in ``<html>`` -- plus ``<body>`` unless the fragment is itself a
    ``<body>`` -- before parsing.  :meth:`decode` then renders only the
    contents of that synthetic wrapper, so the output is the fragment rather
    than a whole document (internally, the tree is still wrapped).
    """

    # Name of the synthetic wrapper tag ('html' or 'body'); None when the
    # markup was parsed unwrapped.  Deliberately a single-underscore name with
    # a class-level default: a ``__name`` attribute is name-mangled by Python,
    # so string-based checks such as ``hasattr(self, '__name')`` never find it.
    _markup_enclosure = None

    # The wrapped markup object most recently produced by ``_feed``; used to
    # recognise (by identity) markup that has already been wrapped.
    _wrapped_markup = None

    def _feed(self, *args, **kwargs):
        """Wrap fragment markup in a synthetic document, then parse it.

        The wrap decision is made afresh whenever ``self.markup`` is a new
        object, and skipped when it is the very markup this method already
        wrapped, so repeated calls neither double-wrap nor lose the recorded
        enclosure.
        NOTE(review): bs4 appears to assign ``self.markup`` and call ``_feed``
        once per candidate markup during construction -- confirm against the
        installed version.

        All arguments are forwarded unchanged to ``BeautifulSoup._feed`` and
        its result is returned.
        """
        markup = self.markup
        if markup is not None and markup is not self._wrapped_markup:
            enclosure = None
            if BS_LXML_FEATURE in self.builder.features and not self.builder.is_xml:
                enclosure = self._fragment_enclosure(markup)
            self._markup_enclosure = enclosure
            if enclosure is not None:
                self.markup = self._wrap_fragment(markup, enclosure)
                self._wrapped_markup = self.markup

        return super(FragmentSoup, self)._feed(*args, **kwargs)

    @staticmethod
    def _fragment_enclosure(markup):
        """Return the wrapper tag name *markup* needs, or None for a full document."""
        # Only the first few non-blank characters matter.  Compare them
        # case-insensitively so that e.g. ``<!DOCTYPE html>`` or ``<HTML>``
        # preceded by whitespace is still recognised as a complete document.
        head = markup.lstrip()[:9].lower()
        if isinstance(head, bytes):
            # Markup may be bytes; normalise the probe to text so the prefix
            # checks below work on both Python 2 and Python 3.
            head = head.decode('ascii', 'replace')
        if head.startswith(('<html', '<!doctype')):
            return None
        # A <body> fragment only needs <html> around it; anything else also
        # needs a <body>.
        return 'html' if head.startswith('<body') else 'body'

    @staticmethod
    def _wrap_fragment(markup, enclosure):
        """Return *markup* surrounded by the synthetic tags for *enclosure*."""
        if enclosure == 'html':
            prefix, suffix = '<html>', '</html>'
        else:
            prefix, suffix = '<html><body>', '</body></html>'
        if isinstance(markup, bytes):
            # Keep bytes markup as bytes.  The wrapper tags are plain ASCII,
            # so this assumes an ASCII-compatible encoding -- TODO confirm for
            # inputs such as UTF-16.
            prefix, suffix = prefix.encode('ascii'), suffix.encode('ascii')
        return prefix + markup + suffix

    def decode(self, *args, **kwargs):
        """Render the document, omitting any synthetic wrapper added by ``_feed``.

        When a wrapper was added, the arguments are forwarded to the wrapper
        tag's ``decode_contents``.  Otherwise, or if the wrapper tag cannot be
        found in the tree, they are forwarded to ``BeautifulSoup.decode``.
        """
        if self._markup_enclosure is not None:
            wrapper = self.find(self._markup_enclosure)
            if wrapper is not None:
                # decode_contents renders text nodes as well as tags.  Calling
                # decode() on each child breaks for text children: they are
                # str subclasses, and str has no decode() on Python 3.
                return wrapper.decode_contents(*args, **kwargs)
        return super(FragmentSoup, self).decode(*args, **kwargs)
	from bs4 import BeautifulSoup
	from bs4.builder._lxml import LXML as BS_LXML_FEATURE

	class FragmentSoup(BeautifulSoup):
	    """A BeautifulSoup subclass that represents HTML fragments faithfully under lxml.

	    When the lxml HTML builder is in use and the markup is not already a full
	    document (it does not start with ``<html`` or ``<!doctype``), the markup is
	    wrapped in ``<html>`` -- plus ``<body>`` unless the fragment is itself a
	    ``<body>`` -- before parsing.  :meth:`decode` then renders only the
	    contents of that synthetic wrapper, so the output is the fragment rather
	    than a whole document (internally, the tree is still wrapped).
	    """

	    # Name of the synthetic wrapper tag ('html' or 'body'); None when the
	    # markup was parsed unwrapped.  Deliberately a single-underscore name with
	    # a class-level default: a ``__name`` attribute is name-mangled by Python,
	    # so string-based checks such as ``hasattr(self, '__name')`` never find it.
	    _markup_enclosure = None

	    # The wrapped markup object most recently produced by ``_feed``; used to
	    # recognise (by identity) markup that has already been wrapped.
	    _wrapped_markup = None

	    def _feed(self, *args, **kwargs):
	        """Wrap fragment markup in a synthetic document, then parse it.

	        The wrap decision is made afresh whenever ``self.markup`` is a new
	        object, and skipped when it is the very markup this method already
	        wrapped, so repeated calls neither double-wrap nor lose the recorded
	        enclosure.
	        NOTE(review): bs4 appears to assign ``self.markup`` and call ``_feed``
	        once per candidate markup during construction -- confirm against the
	        installed version.

	        All arguments are forwarded unchanged to ``BeautifulSoup._feed`` and
	        its result is returned.
	        """
	        markup = self.markup
	        if markup is not None and markup is not self._wrapped_markup:
	            enclosure = None
	            if BS_LXML_FEATURE in self.builder.features and not self.builder.is_xml:
	                enclosure = self._fragment_enclosure(markup)
	            self._markup_enclosure = enclosure
	            if enclosure is not None:
	                self.markup = self._wrap_fragment(markup, enclosure)
	                self._wrapped_markup = self.markup

	        return super(FragmentSoup, self)._feed(*args, **kwargs)

	    @staticmethod
	    def _fragment_enclosure(markup):
	        """Return the wrapper tag name *markup* needs, or None for a full document."""
	        # Only the first few non-blank characters matter.  Compare them
	        # case-insensitively so that e.g. ``<!DOCTYPE html>`` or ``<HTML>``
	        # preceded by whitespace is still recognised as a complete document.
	        head = markup.lstrip()[:9].lower()
	        if isinstance(head, bytes):
	            # Markup may be bytes; normalise the probe to text so the prefix
	            # checks below work on both Python 2 and Python 3.
	            head = head.decode('ascii', 'replace')
	        if head.startswith(('<html', '<!doctype')):
	            return None
	        # A <body> fragment only needs <html> around it; anything else also
	        # needs a <body>.
	        return 'html' if head.startswith('<body') else 'body'

	    @staticmethod
	    def _wrap_fragment(markup, enclosure):
	        """Return *markup* surrounded by the synthetic tags for *enclosure*."""
	        if enclosure == 'html':
	            prefix, suffix = '<html>', '</html>'
	        else:
	            prefix, suffix = '<html><body>', '</body></html>'
	        if isinstance(markup, bytes):
	            # Keep bytes markup as bytes.  The wrapper tags are plain ASCII,
	            # so this assumes an ASCII-compatible encoding -- TODO confirm for
	            # inputs such as UTF-16.
	            prefix, suffix = prefix.encode('ascii'), suffix.encode('ascii')
	        return prefix + markup + suffix

	    def decode(self, *args, **kwargs):
	        """Render the document, omitting any synthetic wrapper added by ``_feed``.

	        When a wrapper was added, the arguments are forwarded to the wrapper
	        tag's ``decode_contents``.  Otherwise, or if the wrapper tag cannot be
	        found in the tree, they are forwarded to ``BeautifulSoup.decode``.
	        """
	        if self._markup_enclosure is not None:
	            wrapper = self.find(self._markup_enclosure)
	            if wrapper is not None:
	                # decode_contents renders text nodes as well as tags.  Calling
	                # decode() on each child breaks for text children: they are
	                # str subclasses, and str has no decode() on Python 3.
	                return wrapper.decode_contents(*args, **kwargs)
	        return super(FragmentSoup, self).decode(*args, **kwargs)