Skip to content

Instantly share code, notes, and snippets.

@obeattie
Created May 22, 2012 17:41
Show Gist options
  • Save obeattie/2770517 to your computer and use it in GitHub Desktop.
Save obeattie/2770517 to your computer and use it in GitHub Desktop.
A BeautifulSoup subclass that can handle HTML fragments when parsing with lxml
from bs4 import BeautifulSoup
from bs4.builder._lxml import LXML as BS_LXML_FEATURE
class FragmentSoup(BeautifulSoup):
    """A BeautifulSoup that represents HTML fragments faithfully when using lxml.

    When the lxml HTML builder is in use and the markup does not already start
    with ``<html`` or a doctype, ``_feed`` wraps the markup in ``<html>`` (plus
    ``<body>`` unless the fragment itself starts with ``<body``) before handing
    it to the parser. ``decode`` then emits only the contents of that wrapper,
    so the output is the fragment rather than a whole document. Internally the
    tree still contains the wrapper tags.
    """

    # Name of the tag ("html" or "body") that ``_feed`` wrapped around the
    # markup, or None when nothing was wrapped. Declared at class level so that
    # ``decode`` has a well-defined value even if ``_feed`` has not run yet.
    # (Name-mangled to ``_FragmentSoup__markup_enclosure``, same as the
    # instance attribute assigned below.)
    __markup_enclosure = None

    def _feed(self, *args, **kwargs):
        """Wrap fragment markup in an enclosure, then delegate to the parent.

        Reads ``self.markup`` and ``self.builder``; may replace ``self.markup``
        with the wrapped version and records which enclosure was used.
        All arguments are passed through unchanged to ``BeautifulSoup._feed``.
        """
        # Always start from a clean state: each feed decides afresh whether
        # the markup it is about to parse needs an enclosure.
        self.__markup_enclosure = None
        markup = self.markup
        uses_lxml_html = (
            BS_LXML_FEATURE in self.builder.features
            and not self.builder.is_xml
        )
        if uses_lxml_html and markup is not None:
            is_bytes = isinstance(markup, bytes)
            # Sniff only a short, whitespace-stripped, lower-cased prefix so
            # that "<!DOCTYPE html>", "<HTML>" and indented documents are
            # recognised as full documents rather than wrapped a second time.
            head = markup.lstrip()[:16].lower()
            if is_bytes:
                # latin-1 maps every byte to a code point, so this never fails.
                head = head.decode('latin-1')
            if not head.startswith(('<html', '<!doctype')):
                # Wrap in an enclosure -- if this is a <body> then don't
                # include another one.
                if head.startswith('<body'):
                    prefix, suffix, enclosure = '<html>', '</html>', 'html'
                else:
                    prefix, suffix, enclosure = (
                        '<html><body>', '</body></html>', 'body')
                if is_bytes:
                    # Keep the wrapper the same type as the markup.
                    prefix = prefix.encode('ascii')
                    suffix = suffix.encode('ascii')
                self.markup = prefix + markup + suffix
                self.__markup_enclosure = enclosure
        return super(FragmentSoup, self)._feed(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Render the soup as a string, omitting any enclosure added by ``_feed``.

        If the markup was wrapped, only the contents of the enclosure tag are
        rendered (arguments are forwarded to ``Tag.decode_contents``).
        Otherwise, or if the enclosure tag cannot be found in the tree, this
        falls back to ``BeautifulSoup.decode`` with the same arguments.
        """
        enclosure_name = self.__markup_enclosure
        if enclosure_name is not None:
            enclosure = self.find(enclosure_name)
            if enclosure is not None:
                # decode_contents handles both Tag and string children; calling
                # .decode() on each child breaks on top-level text nodes.
                return enclosure.decode_contents(*args, **kwargs)
        return super(FragmentSoup, self).decode(*args, **kwargs)
@ivan-kleshnin
Copy link

You can use the `soup.body.decode_contents()` method (undocumented in v4) instead.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment