kevinhendricks/python3_fixes_with_bs4_support.patch

## python3_fixes_with_bs4_support.patch
diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py
new file mode 100644
index 0000000..5a8d273
--- /dev/null
+++ b/python/gumbo/bs4_adapter.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, print_function
+
+# Copyright 2012 Google Inc. All Rights Reserved.
+# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Should this be reworked to be a bs4 treebuilder?
+
+"""
+  Adapter between Gumbo and BeautifulSoup4.
+  This parses an HTML document and gives back a BeautifulSoup4 object, which you
+  can then manipulate like a normal BeautifulSoup4 parse tree.
+
+  Groks namespaces on elements and attributes
+"""
+
+__author__ = 'jdtang@google.com (Jonathan Tang)'
+
+import sys
+import gumboc
+
+import bs4
+# uses bs4.element classes:
+#      Comment, Doctype, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re
+
+# These should be indexed by the enum
+# values of gumboc.Namespace
+
+_NAMESPACES = [
+    'http://www.w3.org/1999/xhtml',
+    'http://www.w3.org/2000/svg',
+    'http://www.w3.org/1998/Math/MathML',
+    ]
+
+
+def _fromutf8(text):
+    return text.decode('utf-8', 'replace')
+
+
+def _add_source_info(obj, original_text, start_pos, end_pos):
+    obj.original = _fromutf8(bytes(original_text))
+    obj.line = start_pos.line
+    obj.col = start_pos.column
+    obj.offset = start_pos.offset
+    if end_pos:
+        obj.end_line = end_pos.line
+        obj.end_col = end_pos.column
+        obj.end_offset = end_pos.offset
+
+
+def _convert_attrs(element_attrs):
+    """Return a dict mapping each attribute's (possibly namespaced) name to its value(s)."""
+    def maybe_namespace(attr):
+        if attr.namespace != gumboc.AttributeNamespace.NONE:
+            name = _fromutf8(attr.name)
+            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
+            return bs4.element.NamespacedAttribute(prefix, name, attr.namespace.to_url())
+        else:
+            return _fromutf8(attr.name)
+    def maybe_value_list(attr):
+        value = _fromutf8(attr.value)
+        if " " in value:  # space-separated values become a list
+            value = bs4.element.whitespace_re.split(value)
+        return value
+    return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs)
+
+
+def _add_document(soup, element):
+    if not element.has_doctype:
+        # Mimic html5lib behavior: if no doctype token, no doctype node.
+        return
+    doctype = bs4.element.Doctype.for_name_and_ids(_fromutf8(element.name),
+                                                   _fromutf8(element.public_identifier),
+                                                   _fromutf8(element.system_identifier))
+    soup.object_was_parsed(doctype)
+
+
+def _add_element(soup, element):
+    tag = bs4.element.Tag(parser=soup,
+                  name=_fromutf8(element.tag_name),
+                  namespace=_NAMESPACES[element.tag_namespace.value],
+                  attrs=_convert_attrs(element.attributes))
+    for child in element.children:
+        tag.append(_add_node(soup, child))
+    _add_source_info(tag, element.original_tag, element.start_pos, element.end_pos)
+    tag.original_end_tag = _fromutf8(bytes(element.original_end_tag))
+    return tag
+
+
+def _add_text(cls):
+    def add_text_internal(soup, element):
+        text = cls(_fromutf8(element.text))
+        _add_source_info(text, element.original_text, element.start_pos, None)
+        return text
+    return add_text_internal
+
+
+_HANDLERS = [
+    _add_document,                              # DOCUMENT
+    _add_element,                               # ELEMENT
+    _add_text(bs4.element.NavigableString),     # TEXT
+    _add_text(bs4.element.CData),               # CDATA
+    _add_text(bs4.element.Comment),             # COMMENT
+    _add_text(bs4.element.NavigableString),     # WHITESPACE
+    _add_element,                               # TEMPLATE
+    ]
+
+
+def _add_node(soup, node):  # dispatch to the handler registered for this node type
+    return _HANDLERS[node.type.value](soup, node.contents)
+
+
+def _add_next_prev_pointers(soup):
+    """Relink next_element/previous_element across soup's subtree, ordered by node.offset."""
+    def _traverse(node):
+        # .findAll requires the .next pointer, which is what we're trying to add
+        # when we call this, and so we manually supply a generator to yield the
+        # nodes in DOM order.
+        yield node
+        try:
+            for child in node.contents:
+                for descendant in _traverse(child):
+                    yield descendant
+        except AttributeError:  # Not an element.
+            return
+    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
+    if nodes:
+        nodes[0].previous_element = None
+        nodes[-1].next_element = None
+    for prev_node, node in zip(nodes, nodes[1:]):  # every adjacent pair, last node included
+        prev_node.next_element = node
+        node.previous_element = prev_node
+
+
+def parse(text, **kwargs):
+    with gumboc.parse(text, **kwargs) as output:
+        soup = bs4.BeautifulSoup('', "html.parser")
+        _add_document(soup, output.contents.document.contents)
+        for node in output.contents.document.contents.children:
+            soup.append(_add_node(soup, node))
+        _add_next_prev_pointers(soup.html)
+        return soup
+
+
+def main():
+    samp = """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US">
+<head><title>testing & entities</title></head>
+<body>
+  <p class="first second">this&nbsp;is&#160;the&#xa0;<i><b>copyright</i></b> symbol "&copy;"</p>
+  <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com">
+     this used to test atribute namespaces
+ </p>
+</body>
+</html>
+"""
+    soup = parse(samp)
+    print(soup.decode())
+    for node in soup.findAll("head"):
+        print(node)
+    for node in soup.find_all(attrs={'class':'second'}):
+        print(node)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py
new file mode 100644
index 0000000..aa25d4b
--- /dev/null
+++ b/python/gumbo/bs4_adapter_test.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals, print_function
+
+# Copyright 2012 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Tests for the Gumbo's BeautifulSoup Python adapter."""
+
+__author__ = 'jdtang@google.com (Jonathan Tang)'
+
+import unittest
+
+import bs4_adapter
+
+
+class SoupAdapterTest(unittest.TestCase):
+
+  def testSimpleParse(self):
+    soup = bs4_adapter.parse(
+        """
+        <ul>
+          <li class=odd><a href="one.html">One</a>
+          <li class="even"><a href="two.html">Two</a>
+          <li class='odd'><a href="three.html">Three</a>
+          <li class="even"><a href="four.html">Four</a>
+        </ul>
+        """)
+
+    head = soup.head
+    self.assertEqual(soup, head.parent.parent)
+    self.assertEqual(u'head', head.name)
+    self.assertEqual(0, len(head))
+
+    body = soup.body
+    self.assertEqual(head, body.previousSibling)
+    self.assertEqual(2, len(body))  # <ul> + trailing whitespace
+    self.assertEqual(u'ul', body.contents[0].name)
+
+    list_items = body.findAll('li')
+    self.assertEqual(4, len(list_items))
+
+    evens = body('li', 'even')
+    self.assertEqual(2, len(evens))
+
+    a2 = body.find('a', href='two.html')
+    self.assertEqual(u'a', a2.name)
+    self.assertEqual(u'Two', a2.contents[0])
+
+    li2 = a2.parent
+    self.assertEqual(u'li', li2.name)
+    self.assertEqual(u'even', li2['class'])
+    self.assertEqual(list_items[1], li2)
+    self.assertEqual(evens[0], li2)
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
index 04da319..887f8e2 100644
--- a/python/gumbo/gumboc.py
+++ b/python/gumbo/gumboc.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 #

+from __future__ import unicode_literals, print_function
+
 """CTypes bindings for the Gumbo HTML5 parser.

 This exports the raw interface of the library as a set of very thin ctypes
@@ -23,6 +25,19 @@ Pythonic API.
 __author__ = 'jdtang@google.com (Jonathan Tang)'

 import sys
+
+PY3 = sys.version_info[0] == 3
+if PY3:
+  text_type = str
+else:
+  text_type = unicode
+
+# When supporting both python 2 and 3 using one code base, using str(obj) is confusing
+# at best since its return type is python version specific
+# Notes:
+#   - The unicode(obj) operator does not exist in PY3
+#   - bytes(obj) exists and works on Python >= 2.6 (where it is an alias of str in Python 2.x)
+
 import contextlib
 import ctypes
 import os.path
@@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure):
     return self.length

   def __str__(self):
+    # Warning: in Python 3 the str() operator method may **never** return bytes
+    #  to write code that employs gumboc.py that will work under both Python 2 and 3 use bytes() instead
+    if PY3:
+      return ctypes.string_at(self.data, self.length).decode('utf-8')
+    return ctypes.string_at(self.data, self.length)
+
+  def __bytes__(self):
     return ctypes.string_at(self.data, self.length)


@@ -273,11 +295,11 @@ class Element(ctypes.Structure):
     if self.tag_namespace == Namespace.SVG:
       svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
       if svg_tagname is not None:
-        return str(svg_tagname)
+        return bytes(svg_tagname)
     if self.tag == Tag.UNKNOWN:
       if original_tag.data is None:
         return ''
-      return str(original_tag).lower()
+      return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8')
     return _tagname(self.tag)

   def __repr__(self):
@@ -384,7 +406,9 @@ def parse(text, **kwargs):
   # outlives the parse output.  If we let ctypes do it automatically on function
   # call, it creates a temporary buffer which is destroyed when the call
   # completes, and then the original_text pointers point into invalid memory.
-  text_ptr = ctypes.c_char_p(text.encode('utf-8'))
+  if isinstance(text, text_type):
+    text = text.encode('utf-8')
+  text_ptr = ctypes.c_char_p(text)
   output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
   try:
     yield output
diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py
index 3e9c41f..ed170a1 100644
--- a/python/gumbo/gumboc_tags.py
+++ b/python/gumbo/gumboc_tags.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 # Generated via `gentags.py src/tag.in`.
 # Do not edit; edit src/tag.in instead.
 # clang-format off
diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py
index 1f30d38..b510ca8 100644
--- a/python/gumbo/gumboc_test.py
+++ b/python/gumbo/gumboc_test.py
@@ -13,11 +13,16 @@
 # limitations under the License.
 #

+from __future__ import unicode_literals, print_function
+
 """Tests for Gumbo CTypes bindings."""

 __author__ = 'jdtang@google.com (Jonathan Tang)'

-import StringIO
+try:
+  import StringIO as io
+except ImportError:
+  import io

 import unittest

@@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase):
   def testWordParse(self):
     with gumboc.parse('Test') as output:
       doctype_node = output.contents.document.contents
-      self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type)
+      self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
       document = doctype_node.v.document
-      self.assertEquals('', document.name)
-      self.assertEquals('', document.public_identifier)
-      self.assertEquals('', document.system_identifier)
+      self.assertEqual(b'', document.name)
+      self.assertEqual(b'', document.public_identifier)
+      self.assertEqual(b'', document.system_identifier)

       root = output.contents.root.contents
-      self.assertEquals(gumboc.NodeType.ELEMENT, root.type)
-      self.assertEquals(gumboc.Tag.HTML, root.tag)
-      self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace)
-      self.assertEquals(2, len(root.children))
+      self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
+      self.assertEqual(gumboc.Tag.HTML, root.tag)
+      self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
+      self.assertEqual(2, len(root.children))

       head = root.children[0]
-      self.assertEquals(gumboc.NodeType.ELEMENT, head.type)
-      self.assertEquals(gumboc.Tag.HEAD, head.tag)
-      self.assertEquals('head', head.tag_name)
-      self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace)
-      self.assertEquals(0, len(head.original_tag))
-      self.assertEquals('', str(head.original_end_tag))
-      self.assertEquals(0, head.children.length)
+      self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
+      self.assertEqual(gumboc.Tag.HEAD, head.tag)
+      self.assertEqual(b'head', head.tag_name)
+      self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
+      self.assertEqual(0, len(head.original_tag))
+      self.assertEqual(b'', bytes(head.original_end_tag))
+      self.assertEqual(0, head.children.length)

       body = root.children[1]
-      self.assertNotEquals(body, doctype_node)
-      self.assertEquals(gumboc.NodeType.ELEMENT, body.type)
-      self.assertEquals(gumboc.Tag.BODY, body.tag)
-      self.assertEquals('body', body.tag_name)
-      self.assertEquals(1, len(body.children))
+      self.assertNotEqual(body, doctype_node)
+      self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
+      self.assertEqual(gumboc.Tag.BODY, body.tag)
+      self.assertEqual(b'body', body.tag_name)
+      self.assertEqual(1, len(body.children))

       text_node = body.children[0]
-      self.assertEquals(gumboc.NodeType.TEXT, text_node.type)
-      self.assertEquals('Test', text_node.text)
+      self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
+      self.assertEqual(b'Test', text_node.text)

   def testBufferThatGoesAway(self):
     for i in range(10):
-      source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
+      source = io.StringIO('<foo bar=quux>1<p>2</foo>')
       parse_tree = gumboc.parse(source.read())
       source.close()
     with parse_tree as output:
       root = output.contents.root.contents
       body = root.children[1]
       foo = body.children[0]
-      self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
-      self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
-      self.assertEquals('<foo bar=quux>', str(foo.original_tag))
-      self.assertEquals('', str(foo.original_end_tag))
-      self.assertEquals('foo', foo.tag_name.decode('utf-8'))
-      self.assertEquals('bar', foo.attributes[0].name)
-      self.assertEquals('quux', foo.attributes[0].value)
+      self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
+      self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
+      self.assertEqual('<foo bar=quux>', str(foo.original_tag))
+      self.assertEqual(b'', bytes(foo.original_end_tag))
+      self.assertEqual(b'foo', foo.tag_name)
+      self.assertEqual(b'bar', foo.attributes[0].name)
+      self.assertEqual(b'quux', foo.attributes[0].value)

   def testUnknownTag(self):
     with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
       root = output.contents.root.contents
       body = root.children[1]
       foo = body.children[0]
-      self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
-      self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
-      self.assertEquals('<foo bar=quux>', str(foo.original_tag))
-      self.assertEquals('', str(foo.original_end_tag))
-      self.assertEquals('foo', foo.tag_name.decode('utf-8'))
-      self.assertEquals('bar', foo.attributes[0].name)
-      self.assertEquals('quux', foo.attributes[0].value)
+      self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
+      self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
+      self.assertEqual('<foo bar=quux>', str(foo.original_tag))
+      self.assertEqual(b'', bytes(foo.original_end_tag))
+      self.assertEqual(b'foo', foo.tag_name)
+      self.assertEqual(b'bar', foo.attributes[0].name)
+      self.assertEqual(b'quux', foo.attributes[0].value)

   def testSarcasm(self):
     with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
@@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase):
       body = root.children[1]
       div = body.children[0]
       sarcasm = div.children[0]
-      self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type)
-      self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag)
-      self.assertEquals('<sarcasm>', str(sarcasm.original_tag))
-      self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag))
-      self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8'))
+      self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
+      self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
+      self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag))
+      self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag))
+      self.assertEqual(b'sarcasm', sarcasm.tag_name)

   def testEnums(self):
-    self.assertEquals(gumboc.Tag.A, gumboc.Tag.A)
-    self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
+    self.assertEqual(gumboc.Tag.A, gumboc.Tag.A)
+    self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))

   def testFragment(self):
     with gumboc.parse(
@@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase):
         fragment_context=gumboc.Tag.TITLE,
         fragment_namespace=gumboc.Namespace.SVG) as output:
       root = output.contents.root.contents
-      self.assertEquals(1, len(root.children))
+      self.assertEqual(1, len(root.children))
       div = root.children[0]
-      self.assertEquals(gumboc.NodeType.ELEMENT, div.type)
-      self.assertEquals(gumboc.Tag.DIV, div.tag)
-      self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace)
+      self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
+      self.assertEqual(gumboc.Tag.DIV, div.tag)
+      self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)
	diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py
	new file mode 100644
	index 0000000..5a8d273
	--- /dev/null
	+++ b/python/gumbo/bs4_adapter.py
	@@ -0,0 +1,183 @@
	+# -- coding: utf-8 --
	+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
	+
	+from __future__ import unicode_literals, print_function
	+
	+# Copyright 2012 Google Inc. All Rights Reserved.
	+# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada
	+#
	+# Licensed under the Apache License, Version 2.0 (the "License");
	+# you may not use this file except in compliance with the License.
	+# You may obtain a copy of the License at
	+#
	+# http://www.apache.org/licenses/LICENSE-2.0
	+#
	+# Unless required by applicable law or agreed to in writing, software
	+# distributed under the License is distributed on an "AS IS" BASIS,
	+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# See the License for the specific language governing permissions and
	+# limitations under the License.
	+#
	+
	+# Should this be reworked to be a bs4 treebuilder?
	+
	+"""
	+ Adapter between Gumbo and BeautifulSoup4.
	+ This parses an HTML document and gives back a BeautifulSoup4 object, which you
	+ can then manipulate like a normal BeautifulSoup4 parse tree.
	+
	+ Groks namespaces on elements and attributes
	+"""
	+
	+__author__ = 'jdtang@google.com (Jonathan Tang)'
	+
	+import sys
	+import gumboc
	+
	+import bs4
	+# uses bs4.element classes:
	+# Comment, DocType, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re
	+
	+# These should be indexed by the enum
	+# values of gumboc.Namespace
	+
	+_NAMESPACES = [
	+ 'http://www.w3.org/1999/xhtml',
	+ 'http://www.w3.org/2000/svg',
	+ 'http://www.w3.org/1998/Math/MathML',
	+ ]
	+
	+
	+def _fromutf8(text):
	+ return text.decode('utf-8', 'replace')
	+
	+
	+def _add_source_info(obj, original_text, start_pos, end_pos):
	+ obj.original = _fromutf8(bytes(original_text))
	+ obj.line = start_pos.line
	+ obj.col = start_pos.column
	+ obj.offset = start_pos.offset
	+ if end_pos:
	+ obj.end_line = end_pos.line
	+ obj.end_col = end_pos.column
	+ obj.end_offset = end_pos.offset
	+
	+
	+def _convert_attrs(element_attrs):
	+ def maybe_namespace(attr):
	+ if attr.namespace != gumboc.AttributeNamespace.NONE:
	+ name = _fromutf8(attr.name)
	+ prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
	+ nsurl = atr.namespace.to_url()
	+ return bs4.element.NamespacedAttributes(prefix, name, nsurl)
	+ else:
	+ return _fromutf8(attr.name)
	+ def maybe_value_list(attr):
	+ value = _fromutf8(attr.value)
	+ if " " in value:
	+ value = bs4.element.whitespace_re.split(value)
	+ return value
	+ return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs)
	+
	+
	+def _add_document(soup, element):
	+ if not element.has_doctype:
	+ # Mimic html5lib behavior: if no doctype token, no doctype node.
	+ return
	+ doctype = bs4.element.Doctype.for_name_and_ids(_fromutf8(element.name),
	+ _fromutf8(element.public_identifier),
	+ _fromutf8(element.system_identifier))
	+ soup.object_was_parsed(doctype)
	+
	+
	+def _add_element(soup, element):
	+ tag = bs4.element.Tag(parser=soup,
	+ name=_fromutf8(element.tag_name),
	+ namespace=_NAMESPACES[element.tag_namespace.value],
	+ attrs=_convert_attrs(element.attributes))
	+ for child in element.children:
	+ tag.append(_add_node(soup, child))
	+ _add_source_info(tag, element.original_tag, element.start_pos, element.end_pos)
	+ tag.original_end_tag = _fromutf8(bytes(element.original_end_tag))
	+ return tag
	+
	+
	+def _add_text(cls):
	+ def add_text_internal(soup, element):
	+ text = cls(_fromutf8(element.text))
	+ _add_source_info(text, element.original_text, element.start_pos, None)
	+ return text
	+ return add_text_internal
	+
	+
	+_HANDLERS = [
	+ _add_document, # DOCUMENT
	+ _add_element, # ELEMENT
	+ _add_text(bs4.element.NavigableString), # TEXT
	+ _add_text(bs4.element.CData), # CDATA
	+ _add_text(bs4.element.Comment), # COMMENT
	+ _add_text(bs4.element.NavigableString), # WHITESPACE
	+ _add_element, # TEMPLATE
	+ ]
	+
	+
	+def _add_node(soup, node):
	+ return _HANDLERS[node.type.value](soup, node.contents)
	+
	+
	+def _add_next_prev_pointers(soup):
	+ def _traverse(node):
	+ # .findAll requires the .next pointer, which is what we're trying to add
	+ # when we call this, and so we manually supply a generator to yield the
	+ # nodes in DOM order.
	+ yield node
	+ try:
	+ for child in node.contents:
	+ for descendant in _traverse(child):
	+ yield descendant
	+ except AttributeError:
	+ # Not an element.
	+ return
	+ nodes = sorted(_traverse(soup), key=lambda node: node.offset)
	+ if nodes:
	+ nodes[0].previous_element = None
	+ nodes[-1].next_element = None
	+ for i, node in enumerate(nodes[1:-1], 1):
	+ nodes[i-1].next_element = node
	+ node.previous_element = nodes[i-1]
	+
	+
	+def parse(text, **kwargs):
	+ with gumboc.parse(text, **kwargs) as output:
	+ soup = bs4.BeautifulSoup('', "html.parser")
	+ _add_document(soup, output.contents.document.contents)
	+ for node in output.contents.document.contents.children:
	+ soup.append(_add_node(soup, node))
	+ _add_next_prev_pointers(soup.html)
	+ return soup
	+
	+
	+def main():
	+ samp = """
	+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
	+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
	+<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US">
	+<head><title>testing & entities</title></head>
	+<body>
	+ <p class="first second">this is the <i><b>copyright</i></b> symbol "©"</p>
	+ <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com">
	+ this used to test atribute namespaces
	+ </p>
	+</body>
	+</html>
	+"""
	+ soup = parse(samp)
	+ print(soup.decode())
	+ for node in soup.findAll("head"):
	+ print(node)
	+ for node in soup.find_all(attrs={'class':'second'}):
	+ print(node)
	+ return 0
	+
	+if __name__ == '__main__':
	+ sys.exit(main())
	diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py
	new file mode 100644
	index 0000000..aa25d4b
	--- /dev/null
	+++ b/python/gumbo/bs4_adapter_test.py
	@@ -0,0 +1,66 @@
	+from __future__ import unicode_literals, print_function
	+
	+# Copyright 2012 Google Inc. All Rights Reserved.
	+#
	+# Licensed under the Apache License, Version 2.0 (the "License");
	+# you may not use this file except in compliance with the License.
	+# You may obtain a copy of the License at
	+#
	+# http://www.apache.org/licenses/LICENSE-2.0
	+#
	+# Unless required by applicable law or agreed to in writing, software
	+# distributed under the License is distributed on an "AS IS" BASIS,
	+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# See the License for the specific language governing permissions and
	+# limitations under the License.
	+#
	+"""Tests for the Gumbo's BeautifulSoup Python adapter."""
	+
	+__author__ = 'jdtang@google.com (Jonathan Tang)'
	+
	+import unittest
	+
	+import bs4_adapter
	+
	+
	+class SoupAdapterTest(unittest.TestCase):
	+
	+ def testSimpleParse(self):
	+ soup = bs4_adapter.parse(
	+ """
	+ <ul>
	+ <li class=odd><a href="one.html">One</a>
	+ <li class="even"><a href="two.html">Two</a>
	+ <li class='odd'><a href="three.html">Three</a>
	+ <li class="even"><a href="four.html">Four</a>
	+ </ul>
	+ """)
	+
	+ head = soup.head
	+ self.assertEqual(soup, head.parent.parent)
	+ self.assertEqual(u'head', head.name)
	+ self.assertEqual(0, len(head))
	+
	+ body = soup.body
	+ self.assertEqual(head, body.previousSibling)
	+ self.assertEqual(2, len(body)) # <ul> + trailing whitespace
	+ self.assertEqual(u'ul', body.contents[0].name)
	+
	+ list_items = body.findAll('li')
	+ self.assertEqual(4, len(list_items))
	+
	+ evens = body('li', 'even')
	+ self.assertEqual(2, len(evens))
	+
	+ a2 = body.find('a', href='two.html')
	+ self.assertEqual(u'a', a2.name)
	+ self.assertEqual(u'Two', a2.contents[0])
	+
	+ li2 = a2.parent
	+ self.assertEqual(u'li', li2.name)
	+ self.assertEqual(u'even', li2['class'])
	+ self.assertEqual(list_items[1], li2)
	+ self.assertEqual(evens[0], li2)
	+
	+if __name__ == '__main__':
	+ unittest.main()
	diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
	index 04da319..887f8e2 100644
	--- a/python/gumbo/gumboc.py
	+++ b/python/gumbo/gumboc.py
	@@ -13,6 +13,8 @@
	# limitations under the License.
	#

	+from __future__ import unicode_literals, print_function
	+
	"""CTypes bindings for the Gumbo HTML5 parser.

	This exports the raw interface of the library as a set of very thin ctypes
	@@ -23,6 +25,19 @@ Pythonic API.
	__author__ = 'jdtang@google.com (Jonathan Tang)'

	import sys
	+
	+PY3 = sys.version_info[0] == 3
	+if PY3:
	+ text_type = str
	+else:
	+ text_type = unicode
	+
	+# When supporting both python 2 and 3 using one code base, using str(obj) is confusing
	+# at best since its return type is python version specific
	+# Notes:
	+# - The unicode(obj) operator does not exist in PY3
	+# - The bytes(obj) exists and works on python >= 2.6 (as it aliased to str in python 2.X)
	+
	import contextlib
	import ctypes
	import os.path
	@@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure):
	return self.length

	def __str__(self):
	+ # Warning: in Python 3 the str() operator method may never return bytes
	+ # to write code that employs gumboc.py that will work under both Python 2 and 3 use bytes() instead
	+ if PY3:
	+ return ctypes.string_at(self.data, self.length).decode('utf-8')
	+ return ctypes.string_at(self.data, self.length)
	+
	+ def __bytes__(self):
	return ctypes.string_at(self.data, self.length)


	@@ -273,11 +295,11 @@ class Element(ctypes.Structure):
	if self.tag_namespace == Namespace.SVG:
	svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
	if svg_tagname is not None:
	- return str(svg_tagname)
	+ return bytes(svg_tagname)
	if self.tag == Tag.UNKNOWN:
	if original_tag.data is None:
	return ''
	- return str(original_tag).lower()
	+ return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8')
	return _tagname(self.tag)

	def __repr__(self):
	@@ -384,7 +406,9 @@ def parse(text, **kwargs):
	# outlives the parse output. If we let ctypes do it automatically on function
	# call, it creates a temporary buffer which is destroyed when the call
	# completes, and then the original_text pointers point into invalid memory.
	- text_ptr = ctypes.c_char_p(text.encode('utf-8'))
	+ if isinstance(text, text_type):
	+ text = text.encode('utf-8')
	+ text_ptr = ctypes.c_char_p(text)
	output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
	try:
	yield output
	diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py
	index 3e9c41f..ed170a1 100644
	--- a/python/gumbo/gumboc_tags.py
	+++ b/python/gumbo/gumboc_tags.py
	@@ -1,3 +1,5 @@
	+from __future__ import unicode_literals
	+
	# Generated via `gentags.py src/tag.in`.
	# Do not edit; edit src/tag.in instead.
	# clang-format off
	diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py
	index 1f30d38..b510ca8 100644
	--- a/python/gumbo/gumboc_test.py
	+++ b/python/gumbo/gumboc_test.py
	@@ -13,11 +13,16 @@
	# limitations under the License.
	#

	+from __future__ import unicode_literals, print_function
	+
	"""Tests for Gumbo CTypes bindings."""

	__author__ = 'jdtang@google.com (Jonathan Tang)'

	-import StringIO
	+try:
	+ import StringIO as io
	+except ImportError:
	+ import io

	import unittest

	@@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase):
	def testWordParse(self):
	with gumboc.parse('Test') as output:
	doctype_node = output.contents.document.contents
	- self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type)
	+ self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
	document = doctype_node.v.document
	- self.assertEquals('', document.name)
	- self.assertEquals('', document.public_identifier)
	- self.assertEquals('', document.system_identifier)
	+ self.assertEqual(b'', document.name)
	+ self.assertEqual(b'', document.public_identifier)
	+ self.assertEqual(b'', document.system_identifier)

	root = output.contents.root.contents
	- self.assertEquals(gumboc.NodeType.ELEMENT, root.type)
	- self.assertEquals(gumboc.Tag.HTML, root.tag)
	- self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace)
	- self.assertEquals(2, len(root.children))
	+ self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
	+ self.assertEqual(gumboc.Tag.HTML, root.tag)
	+ self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
	+ self.assertEqual(2, len(root.children))

	head = root.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, head.type)
	- self.assertEquals(gumboc.Tag.HEAD, head.tag)
	- self.assertEquals('head', head.tag_name)
	- self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace)
	- self.assertEquals(0, len(head.original_tag))
	- self.assertEquals('', str(head.original_end_tag))
	- self.assertEquals(0, head.children.length)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
	+ self.assertEqual(gumboc.Tag.HEAD, head.tag)
	+ self.assertEqual(b'head', head.tag_name)
	+ self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
	+ self.assertEqual(0, len(head.original_tag))
	+ self.assertEqual(b'', bytes(head.original_end_tag))
	+ self.assertEqual(0, head.children.length)

	body = root.children[1]
	- self.assertNotEquals(body, doctype_node)
	- self.assertEquals(gumboc.NodeType.ELEMENT, body.type)
	- self.assertEquals(gumboc.Tag.BODY, body.tag)
	- self.assertEquals('body', body.tag_name)
	- self.assertEquals(1, len(body.children))
	+ self.assertNotEqual(body, doctype_node)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
	+ self.assertEqual(gumboc.Tag.BODY, body.tag)
	+ self.assertEqual(b'body', body.tag_name)
	+ self.assertEqual(1, len(body.children))

	text_node = body.children[0]
	- self.assertEquals(gumboc.NodeType.TEXT, text_node.type)
	- self.assertEquals('Test', text_node.text)
	+ self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
	+ self.assertEqual(b'Test', text_node.text)

	def testBufferThatGoesAway(self):
	for i in range(10):
	- source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
	+ source = io.StringIO('<foo bar=quux>1<p>2</foo>')
	parse_tree = gumboc.parse(source.read())
	source.close()
	with parse_tree as output:
	root = output.contents.root.contents
	body = root.children[1]
	foo = body.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
	- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
	- self.assertEquals('', str(foo.original_end_tag))
	- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
	- self.assertEquals('bar', foo.attributes[0].name)
	- self.assertEquals('quux', foo.attributes[0].value)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
	+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
	+ self.assertEqual(b'', bytes(foo.original_end_tag))
	+ self.assertEqual(b'foo', foo.tag_name)
	+ self.assertEqual(b'bar', foo.attributes[0].name)
	+ self.assertEqual(b'quux', foo.attributes[0].value)

	def testUnknownTag(self):
	with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
	root = output.contents.root.contents
	body = root.children[1]
	foo = body.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
	- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
	- self.assertEquals('', str(foo.original_end_tag))
	- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
	- self.assertEquals('bar', foo.attributes[0].name)
	- self.assertEquals('quux', foo.attributes[0].value)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
	+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
	+ self.assertEqual(b'', bytes(foo.original_end_tag))
	+ self.assertEqual(b'foo', foo.tag_name)
	+ self.assertEqual(b'bar', foo.attributes[0].name)
	+ self.assertEqual(b'quux', foo.attributes[0].value)

	def testSarcasm(self):
	with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
	@@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase):
	body = root.children[1]
	div = body.children[0]
	sarcasm = div.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag)
	- self.assertEquals('<sarcasm>', str(sarcasm.original_tag))
	- self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag))
	- self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8'))
	+ self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
	+ self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag))
	+ self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag))
	+ self.assertEqual(b'sarcasm', sarcasm.tag_name)

	def testEnums(self):
	- self.assertEquals(gumboc.Tag.A, gumboc.Tag.A)
	- self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
	+ self.assertEqual(gumboc.Tag.A, gumboc.Tag.A)
	+ self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))

	def testFragment(self):
	with gumboc.parse(
	@@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase):
	fragment_context=gumboc.Tag.TITLE,
	fragment_namespace=gumboc.Namespace.SVG) as output:
	root = output.contents.root.contents
	- self.assertEquals(1, len(root.children))
	+ self.assertEqual(1, len(root.children))
	div = root.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, div.type)
	- self.assertEquals(gumboc.Tag.DIV, div.tag)
	- self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
	+ self.assertEqual(gumboc.Tag.DIV, div.tag)
	+ self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)