Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kevinhendricks/2025df8d7161168d7ecc to your computer and use it in GitHub Desktop.
Save kevinhendricks/2025df8d7161168d7ecc to your computer and use it in GitHub Desktop.
google/gumbo-parser patch to allow gumboc.py to work with both python 2 and 3 and add BeautifulSoup4 support for both
diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py
new file mode 100644
index 0000000..5a8d273
--- /dev/null
+++ b/python/gumbo/bs4_adapter.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, print_function
+
+# Copyright 2012 Google Inc. All Rights Reserved.
+# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Should this be reworked to be a bs4 treebuilder?
+
+"""
+ Adapter between Gumbo and BeautifulSoup4.
+ This parses an HTML document and gives back a BeautifulSoup4 object, which you
+ can then manipulate like a normal BeautifulSoup4 parse tree.
+
+ Groks namespaces on elements and attributes
+"""
+
+__author__ = 'jdtang@google.com (Jonathan Tang)'
+
+import sys
+import gumboc
+
+import bs4
+# uses bs4.element classes:
+# Comment, DocType, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re
+
+# Namespace URLs, indexed by the integer value of an element's
+# gumboc.Namespace enum (_add_element looks up
+# _NAMESPACES[element.tag_namespace.value]), so the order of the entries
+# must match the order of that enum's values.
+
+_NAMESPACES = [
+ 'http://www.w3.org/1999/xhtml',
+ 'http://www.w3.org/2000/svg',
+ 'http://www.w3.org/1998/Math/MathML',
+ ]
+
+
+def _fromutf8(text):
+    """Decode UTF-8 encoded bytes into a unicode string.
+
+    Byte sequences that are not valid UTF-8 are substituted with the
+    Unicode replacement character instead of raising an error.
+    """
+    decoded = text.decode('utf-8', 'replace')
+    return decoded
+
+
+def _add_source_info(obj, original_text, start_pos, end_pos):
+    """Attach the original source text and position information to obj.
+
+    Sets obj.original to the decoded bytes of original_text, and obj.line,
+    obj.col and obj.offset from start_pos. When end_pos is truthy,
+    obj.end_line, obj.end_col and obj.end_offset are set from it as well;
+    otherwise no end attributes are added.
+    """
+    obj.original = _fromutf8(bytes(original_text))
+    obj.line = start_pos.line
+    obj.col = start_pos.column
+    obj.offset = start_pos.offset
+    if not end_pos:
+        # Callers pass None when there is no end position to record.
+        return
+    obj.end_line = end_pos.line
+    obj.end_col = end_pos.column
+    obj.end_offset = end_pos.offset
+
+
+def _convert_attrs(element_attrs):
+    """Convert gumbo attributes into a BeautifulSoup4 attribute dict.
+
+    Keys are unicode attribute names, or bs4 NamespacedAttribute objects
+    for attributes whose namespace is not AttributeNamespace.NONE. Values
+    are unicode strings; a value containing a space is split on whitespace
+    into a list of strings.
+    """
+    def maybe_namespace(attr):
+        if attr.namespace != gumboc.AttributeNamespace.NONE:
+            name = _fromutf8(attr.name)
+            # An attribute named 'xmlns' gets no prefix; every other
+            # namespaced attribute uses the lowercased namespace name.
+            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
+            nsurl = attr.namespace.to_url()
+            return bs4.element.NamespacedAttribute(prefix, name, nsurl)
+        else:
+            return _fromutf8(attr.name)
+
+    def maybe_value_list(attr):
+        value = _fromutf8(attr.value)
+        if " " in value:
+            value = bs4.element.whitespace_re.split(value)
+        return value
+
+    return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs)
+
+
+def _add_document(soup, element):
+    """Add a doctype node to soup for the given gumbo document.
+
+    Nothing is added when the document has no doctype.
+    """
+    # Mimic html5lib behavior: if no doctype token, no doctype node.
+    if element.has_doctype:
+        name = _fromutf8(element.name)
+        public_id = _fromutf8(element.public_identifier)
+        system_id = _fromutf8(element.system_identifier)
+        doctype = bs4.element.Doctype.for_name_and_ids(name, public_id, system_id)
+        soup.object_was_parsed(doctype)
+
+
+def _add_element(soup, element):
+    """Build a bs4 Tag, with all of its converted children, from a gumbo element.
+
+    The returned tag also carries the source information of the start tag
+    and the decoded text of the original end tag (original_end_tag).
+    """
+    tag_name = _fromutf8(element.tag_name)
+    namespace_url = _NAMESPACES[element.tag_namespace.value]
+    attributes = _convert_attrs(element.attributes)
+    tag = bs4.element.Tag(parser=soup, name=tag_name,
+                          namespace=namespace_url, attrs=attributes)
+    for child in element.children:
+        converted = _add_node(soup, child)
+        tag.append(converted)
+    _add_source_info(tag, element.original_tag, element.start_pos, element.end_pos)
+    end_tag_bytes = bytes(element.original_end_tag)
+    tag.original_end_tag = _fromutf8(end_tag_bytes)
+    return tag
+
+
+def _add_text(cls):
+    """Return a handler that converts a gumbo text node into an instance of cls.
+
+    The handler takes (soup, element), builds cls from the decoded element
+    text, and records the source information without an end position.
+    """
+    def add_text_internal(soup, element):
+        decoded = _fromutf8(element.text)
+        node = cls(decoded)
+        _add_source_info(node, element.original_text, element.start_pos, None)
+        return node
+    return add_text_internal
+
+
+# Node converters, indexed by the integer value of a node's type
+# (_add_node looks up _HANDLERS[node.type.value]). The trailing comment on
+# each entry names the node type that the entry handles.
+_HANDLERS = [
+ _add_document, # DOCUMENT
+ _add_element, # ELEMENT
+ _add_text(bs4.element.NavigableString), # TEXT
+ _add_text(bs4.element.CData), # CDATA
+ _add_text(bs4.element.Comment), # COMMENT
+ _add_text(bs4.element.NavigableString), # WHITESPACE
+ _add_element, # TEMPLATE
+ ]
+
+
+def _add_node(soup, node):
+    """Convert a gumbo node by dispatching on the value of its node type."""
+    handler = _HANDLERS[node.type.value]
+    return handler(soup, node.contents)
+
+
+def _add_next_prev_pointers(soup):
+    """Chain soup and all of its descendants via previous_element/next_element.
+
+    The nodes are ordered by their offset attribute. The first node gets
+    previous_element = None, the last node gets next_element = None, and
+    every adjacent pair, including the final one, is linked in both
+    directions.
+    """
+    def _traverse(node):
+        # .findAll requires the .next pointer, which is what we're trying to add
+        # when we call this, and so we manually supply a generator to yield the
+        # nodes in DOM order.
+        yield node
+        try:
+            for child in node.contents:
+                for descendant in _traverse(child):
+                    yield descendant
+        except AttributeError:
+            # Not an element.
+            return
+
+    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
+    if not nodes:
+        return
+    nodes[0].previous_element = None
+    nodes[-1].next_element = None
+    # Pair every node with its successor. Slicing with [1:-1] here would
+    # leave the last node without a previous_element and its predecessor
+    # without a next_element.
+    for previous, current in zip(nodes, nodes[1:]):
+        previous.next_element = current
+        current.previous_element = previous
+
+
+def parse(text, **kwargs):
+    """Parse HTML text with gumbo and return it as a BeautifulSoup4 object.
+
+    Keyword arguments are passed through unchanged to gumboc.parse.
+    """
+    with gumboc.parse(text, **kwargs) as output:
+        soup = bs4.BeautifulSoup('', "html.parser")
+        document = output.contents.document.contents
+        _add_document(soup, document)
+        for child in document.children:
+            converted = _add_node(soup, child)
+            soup.append(converted)
+        _add_next_prev_pointers(soup.html)
+        return soup
+
+
+def main():
+    """Parse a sample document, print it, then print two search results.
+
+    Prints the decoded soup, every "head" element, and every element whose
+    class attribute matches 'second'. Returns 0.
+    """
+    sample_markup = """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US">
+<head><title>testing & entities</title></head>
+<body>
+ <p class="first second">this&nbsp;is&#160;the&#xa0;<i><b>copyright</i></b> symbol "&copy;"</p>
+ <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com">
+ this used to test atribute namespaces
+ </p>
+</body>
+</html>
+"""
+    soup = parse(sample_markup)
+    print(soup.decode())
+    for head_node in soup.findAll("head"):
+        print(head_node)
+    for second_node in soup.find_all(attrs={'class':'second'}):
+        print(second_node)
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py
new file mode 100644
index 0000000..aa25d4b
--- /dev/null
+++ b/python/gumbo/bs4_adapter_test.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals, print_function
+
+# Copyright 2012 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Tests for the Gumbo's BeautifulSoup Python adapter."""
+
+__author__ = 'jdtang@google.com (Jonathan Tang)'
+
+import unittest
+
+import bs4_adapter
+
+
+class SoupAdapterTest(unittest.TestCase):
+    """Checks the tree that bs4_adapter.parse builds for a small HTML list."""
+
+    def testSimpleParse(self):
+        """Parse a four-item list and check structure, search and attributes."""
+        markup = """
+ <ul>
+ <li class=odd><a href="one.html">One</a>
+ <li class="even"><a href="two.html">Two</a>
+ <li class='odd'><a href="three.html">Three</a>
+ <li class="even"><a href="four.html">Four</a>
+ </ul>
+ """
+        soup = bs4_adapter.parse(markup)
+
+        # The markup has no <head>; the test expects an empty one in the tree.
+        head = soup.head
+        self.assertEqual(soup, head.parent.parent)
+        self.assertEqual(u'head', head.name)
+        self.assertEqual(0, len(head))
+
+        body = soup.body
+        self.assertEqual(head, body.previousSibling)
+        self.assertEqual(2, len(body))  # <ul> + trailing whitespace
+        self.assertEqual(u'ul', body.contents[0].name)
+
+        items = body.findAll('li')
+        self.assertEqual(4, len(items))
+
+        even_items = body('li', 'even')
+        self.assertEqual(2, len(even_items))
+
+        second_link = body.find('a', href='two.html')
+        self.assertEqual(u'a', second_link.name)
+        self.assertEqual(u'Two', second_link.contents[0])
+
+        second_item = second_link.parent
+        self.assertEqual(u'li', second_item.name)
+        self.assertEqual(u'even', second_item['class'])
+        self.assertEqual(items[1], second_item)
+        self.assertEqual(even_items[0], second_item)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
index 04da319..887f8e2 100644
--- a/python/gumbo/gumboc.py
+++ b/python/gumbo/gumboc.py
@@ -13,6 +13,8 @@
# limitations under the License.
#
+from __future__ import unicode_literals, print_function
+
"""CTypes bindings for the Gumbo HTML5 parser.
This exports the raw interface of the library as a set of very thin ctypes
@@ -23,6 +25,19 @@ Pythonic API.
__author__ = 'jdtang@google.com (Jonathan Tang)'
import sys
+
+PY3 = sys.version_info[0] == 3
+if PY3:
+ text_type = str
+else:
+ text_type = unicode
+
+# When supporting both Python 2 and 3 from one code base, using str(obj) is confusing
+# at best, since its return type depends on the Python version.
+# Notes:
+# - unicode(obj) does not exist in Python 3.
+# - bytes(obj) exists and works on Python >= 2.6 (it is an alias of str in Python 2.x).
+
import contextlib
import ctypes
import os.path
@@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure):
return self.length
def __str__(self):
+ # Warning: in Python 3, __str__ must **never** return bytes.
+ # Code using gumboc.py that must work under both Python 2 and 3 should call bytes() instead.
+ if PY3:
+ return ctypes.string_at(self.data, self.length).decode('utf-8')
+ return ctypes.string_at(self.data, self.length)
+
+ def __bytes__(self):
return ctypes.string_at(self.data, self.length)
@@ -273,11 +295,11 @@ class Element(ctypes.Structure):
if self.tag_namespace == Namespace.SVG:
svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
if svg_tagname is not None:
- return str(svg_tagname)
+ return bytes(svg_tagname)
if self.tag == Tag.UNKNOWN:
if original_tag.data is None:
return ''
- return str(original_tag).lower()
+ return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8')
return _tagname(self.tag)
def __repr__(self):
@@ -384,7 +406,9 @@ def parse(text, **kwargs):
# outlives the parse output. If we let ctypes do it automatically on function
# call, it creates a temporary buffer which is destroyed when the call
# completes, and then the original_text pointers point into invalid memory.
- text_ptr = ctypes.c_char_p(text.encode('utf-8'))
+ if isinstance(text, text_type):
+ text = text.encode('utf-8')
+ text_ptr = ctypes.c_char_p(text)
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
try:
yield output
diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py
index 3e9c41f..ed170a1 100644
--- a/python/gumbo/gumboc_tags.py
+++ b/python/gumbo/gumboc_tags.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
# Generated via `gentags.py src/tag.in`.
# Do not edit; edit src/tag.in instead.
# clang-format off
diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py
index 1f30d38..b510ca8 100644
--- a/python/gumbo/gumboc_test.py
+++ b/python/gumbo/gumboc_test.py
@@ -13,11 +13,16 @@
# limitations under the License.
#
+from __future__ import unicode_literals, print_function
+
"""Tests for Gumbo CTypes bindings."""
__author__ = 'jdtang@google.com (Jonathan Tang)'
-import StringIO
+try:
+ import StringIO as io
+except ImportError:
+ import io
import unittest
@@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase):
def testWordParse(self):
with gumboc.parse('Test') as output:
doctype_node = output.contents.document.contents
- self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type)
+ self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
document = doctype_node.v.document
- self.assertEquals('', document.name)
- self.assertEquals('', document.public_identifier)
- self.assertEquals('', document.system_identifier)
+ self.assertEqual(b'', document.name)
+ self.assertEqual(b'', document.public_identifier)
+ self.assertEqual(b'', document.system_identifier)
root = output.contents.root.contents
- self.assertEquals(gumboc.NodeType.ELEMENT, root.type)
- self.assertEquals(gumboc.Tag.HTML, root.tag)
- self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace)
- self.assertEquals(2, len(root.children))
+ self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
+ self.assertEqual(gumboc.Tag.HTML, root.tag)
+ self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
+ self.assertEqual(2, len(root.children))
head = root.children[0]
- self.assertEquals(gumboc.NodeType.ELEMENT, head.type)
- self.assertEquals(gumboc.Tag.HEAD, head.tag)
- self.assertEquals('head', head.tag_name)
- self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace)
- self.assertEquals(0, len(head.original_tag))
- self.assertEquals('', str(head.original_end_tag))
- self.assertEquals(0, head.children.length)
+ self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
+ self.assertEqual(gumboc.Tag.HEAD, head.tag)
+ self.assertEqual(b'head', head.tag_name)
+ self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
+ self.assertEqual(0, len(head.original_tag))
+ self.assertEqual(b'', bytes(head.original_end_tag))
+ self.assertEqual(0, head.children.length)
body = root.children[1]
- self.assertNotEquals(body, doctype_node)
- self.assertEquals(gumboc.NodeType.ELEMENT, body.type)
- self.assertEquals(gumboc.Tag.BODY, body.tag)
- self.assertEquals('body', body.tag_name)
- self.assertEquals(1, len(body.children))
+ self.assertNotEqual(body, doctype_node)
+ self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
+ self.assertEqual(gumboc.Tag.BODY, body.tag)
+ self.assertEqual(b'body', body.tag_name)
+ self.assertEqual(1, len(body.children))
text_node = body.children[0]
- self.assertEquals(gumboc.NodeType.TEXT, text_node.type)
- self.assertEquals('Test', text_node.text)
+ self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
+ self.assertEqual(b'Test', text_node.text)
def testBufferThatGoesAway(self):
for i in range(10):
- source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
+ source = io.StringIO('<foo bar=quux>1<p>2</foo>')
parse_tree = gumboc.parse(source.read())
source.close()
with parse_tree as output:
root = output.contents.root.contents
body = root.children[1]
foo = body.children[0]
- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
- self.assertEquals('', str(foo.original_end_tag))
- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
- self.assertEquals('bar', foo.attributes[0].name)
- self.assertEquals('quux', foo.attributes[0].value)
+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
+ self.assertEqual(b'', bytes(foo.original_end_tag))
+ self.assertEqual(b'foo', foo.tag_name)
+ self.assertEqual(b'bar', foo.attributes[0].name)
+ self.assertEqual(b'quux', foo.attributes[0].value)
def testUnknownTag(self):
with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
root = output.contents.root.contents
body = root.children[1]
foo = body.children[0]
- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
- self.assertEquals('', str(foo.original_end_tag))
- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
- self.assertEquals('bar', foo.attributes[0].name)
- self.assertEquals('quux', foo.attributes[0].value)
+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
+ self.assertEqual(b'', bytes(foo.original_end_tag))
+ self.assertEqual(b'foo', foo.tag_name)
+ self.assertEqual(b'bar', foo.attributes[0].name)
+ self.assertEqual(b'quux', foo.attributes[0].value)
def testSarcasm(self):
with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
@@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase):
body = root.children[1]
div = body.children[0]
sarcasm = div.children[0]
- self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type)
- self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag)
- self.assertEquals('<sarcasm>', str(sarcasm.original_tag))
- self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag))
- self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8'))
+ self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
+ self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
+ self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag))
+ self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag))
+ self.assertEqual(b'sarcasm', sarcasm.tag_name)
def testEnums(self):
- self.assertEquals(gumboc.Tag.A, gumboc.Tag.A)
- self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
+ self.assertEqual(gumboc.Tag.A, gumboc.Tag.A)
+ self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
def testFragment(self):
with gumboc.parse(
@@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase):
fragment_context=gumboc.Tag.TITLE,
fragment_namespace=gumboc.Namespace.SVG) as output:
root = output.contents.root.contents
- self.assertEquals(1, len(root.children))
+ self.assertEqual(1, len(root.children))
div = root.children[0]
- self.assertEquals(gumboc.NodeType.ELEMENT, div.type)
- self.assertEquals(gumboc.Tag.DIV, div.tag)
- self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace)
+ self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
+ self.assertEqual(gumboc.Tag.DIV, div.tag)
+ self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment