Skip to content

Instantly share code, notes, and snippets.

@aucampia
Created July 12, 2023 22:22
Show Gist options
  • Save aucampia/103bfb1336d6552da3d35063639df5a0 to your computer and use it in GitHub Desktop.
Save aucampia/103bfb1336d6552da3d35063639df5a0 to your computer and use it in GitHub Desktop.
$ deno test html5_test.ts
running 4 tests from ./html5_test.ts
parse <table></table> ...
------- output -------
documentFragment = <ref *1> {
nodeName: "#document-fragment",
childNodes: [
{
nodeName: "table",
tagName: "table",
attrs: [],
namespaceURI: "http://www.w3.org/1999/xhtml",
childNodes: [],
parentNode: [Circular *1],
sourceCodeLocation: {
startLine: 1,
startCol: 1,
startOffset: 0,
endLine: 1,
endCol: 16,
endOffset: 15,
startTag: {
startLine: 1,
startCol: 1,
startOffset: 0,
endLine: 1,
endCol: 8,
endOffset: 7
},
endTag: {
startLine: 1,
startCol: 8,
startOffset: 7,
endLine: 1,
endCol: 16,
endOffset: 15
}
}
}
]
}
re serialize = <table></table>
re serializeOuter =
----- output end -----
parse <table></table> ... ok (10ms)
parse <html><body></body></html> ...
------- output -------
documentFragment = { nodeName: "#document-fragment", childNodes: [] }
re serialize =
re serializeOuter =
----- output end -----
parse <html><body></body></html> ... ok (5ms)
parse <tr><td>THE TEXT IS IN HERE</td></tr> ...
------- output -------
documentFragment = <ref *2> {
nodeName: "#document-fragment",
childNodes: [
<ref *1> {
nodeName: "tr",
tagName: "tr",
attrs: [],
namespaceURI: "http://www.w3.org/1999/xhtml",
childNodes: [
{
nodeName: "td",
tagName: "td",
attrs: [],
namespaceURI: "http://www.w3.org/1999/xhtml",
childNodes: [Array],
parentNode: [Circular *1],
sourceCodeLocation: [Object]
}
],
parentNode: [Circular *2],
sourceCodeLocation: {
startLine: 1,
startCol: 1,
startOffset: 0,
endLine: 1,
endCol: 38,
endOffset: 37,
startTag: {
startLine: 1,
startCol: 1,
startOffset: 0,
endLine: 1,
endCol: 5,
endOffset: 4
},
endTag: {
startLine: 1,
startCol: 33,
startOffset: 32,
endLine: 1,
endCol: 38,
endOffset: 37
}
}
}
]
}
re serialize = <tr><td>THE TEXT IS IN HERE</td></tr>
re serializeOuter =
----- output end -----
parse <tr><td>THE TEXT IS IN HERE</td></tr> ... ok (7ms)
parse <body></body> ...
------- output -------
documentFragment = { nodeName: "#document-fragment", childNodes: [] }
re serialize =
re serializeOuter =
----- output end -----
parse <body></body> ... ok (5ms)
ok | 4 passed | 0 failed (115ms)
import { assertEquals } from "https://deno.land/std@0.193.0/testing/asserts.ts";
import { parseFragment, serialize, serializeOuter } from "npm:parse5";
/**
 * HTML snippets that are each parsed as a fragment by their own Deno test.
 */
const fragmentSources: readonly string[] = [
  "<table></table>",
  "<html><body></body></html>",
  "<tr><td>THE TEXT IS IN HERE</td></tr>",
  "<body></body>",
];

// Register one test per snippet. Each test parses the snippet with a null
// fragment context, then prints the resulting tree and both serializations.
// The tests contain no assertions; they fail only if parsing reports an error
// or one of the calls throws.
for (const html of fragmentSources) {
  Deno.test(`parse ${html}`, () => {
    const documentFragment = parseFragment(null, html, {
      onParseError: (err) => {
        console.log("err = ", err);
        // NOTE(review): parse5 documents its parser error as a location record
        // with a `code` field rather than an Error instance - confirm against
        // the installed version. Wrapping it gives the test failure a readable
        // message and a stack trace instead of a bare thrown object.
        throw new Error(
          `parse5 reported ${err.code} at ${err.startLine}:${err.startCol}`,
        );
      },
    });

    console.log("documentFragment = ", documentFragment);
    console.log("re serialize = ", serialize(documentFragment));
    console.log("re serializeOuter = ", serializeOuter(documentFragment));
  });
}
$ task test -- 'tests/test_html5lib.py::test_parse'
task: [test] poetry run python -m pytest tests/test_html5lib.py::test_parse
============================================================================ test session starts ============================================================================
platform linux -- Python 3.11.4, pytest-7.4.0, pluggy-1.2.0
rootdir: /home/iwana/sw/d/gitlab.com/aucampia/pvt/scratchpad/tech/py3
configfile: pyproject.toml
plugins: mock-3.11.1, cov-4.1.0
collected 4 items
tests/test_html5lib.py::test_parse[<body></body>]
------------------------------------------------------------------------------- live log call -------------------------------------------------------------------------------
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d3ebd50>
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [],
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d410170>}
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 6), 'unexpected-start-tag', {'name': 'body'}),
((1, 13), 'XXX-undefined-error', {})]
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>'
PASSED [ 25%]
tests/test_html5lib.py::test_parse[<html><body></body></html>]
------------------------------------------------------------------------------- live log call -------------------------------------------------------------------------------
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d415a10>
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [],
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d740ad0>}
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 6), 'non-html-root', {}),
((1, 12), 'unexpected-start-tag', {'name': 'body'}),
((1, 19), 'XXX-undefined-error', {})]
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>'
PASSED [ 50%]
tests/test_html5lib.py::test_parse[<tr><td>THE TEXT IS IN HERE</td></tr>]
------------------------------------------------------------------------------- live log call -------------------------------------------------------------------------------
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d417690>
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [<DOM Text node "'THE TEXT I'...">],
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d410230>}
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 4), 'unexpected-start-tag-ignored', {'name': 'tr'}),
((1, 8), 'unexpected-start-tag-ignored', {'name': 'td'}),
((1, 32), 'unexpected-end-tag', {'name': 'td'}),
((1, 37), 'unexpected-end-tag', {'name': 'tr'})]
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>THE TEXT IS IN HERE'
PASSED [ 75%]
tests/test_html5lib.py::test_parse[<table></table>]
------------------------------------------------------------------------------- live log call -------------------------------------------------------------------------------
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d41d710>
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [<DOM Element: table at 0x7f8b5d407750>],
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d4102f0>}
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = []
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?><table/>'
PASSED [100%]
============================================================================= 4 passed in 0.03s =============================================================================
import logging
from pprint import pformat
import html5lib
import pytest
import xml.dom.minidom
@pytest.mark.parametrize(
    "fragment_text",
    [
        "<body></body>",
        "<html><body></body></html>",
        "<tr><td>THE TEXT IS IN HERE</td></tr>",
        "<table></table>",
    ],
)
def test_parse(fragment_text: str) -> None:
    """Parse ``fragment_text`` as an HTML fragment and log what html5lib built.

    The fragment is parsed with html5lib's "dom" tree builder and normalized.
    The fragment object, its instance attributes, and the parser's recorded
    errors are then logged at DEBUG level. Finally the fragment's child nodes
    are spliced into a fresh minidom ``Document`` and its UTF-8 XML
    serialization is logged.

    The test makes no assertions; it passes as long as nothing raises.
    """
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder)

    fragment = parser.parseFragment(fragment_text)
    fragment.normalize()

    # All three dumps deliberately share the same "fragment = " label.
    for subject in (fragment, vars(fragment), parser.errors):
        logging.debug("fragment = %s", pformat(subject))

    # Extend the document's child list in place with the fragment's children.
    document = xml.dom.minidom.Document()
    document.childNodes += fragment.childNodes

    serialized = document.toxml("utf-8")
    logging.debug("string = %s", serialized)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment