-
-
Save aucampia/103bfb1336d6552da3d35063639df5a0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ deno test html5_test.ts | |
running 4 tests from ./html5_test.ts | |
parse <table></table> ... | |
------- output ------- | |
documentFragment = <ref *1> { | |
nodeName: "#document-fragment", | |
childNodes: [ | |
{ | |
nodeName: "table", | |
tagName: "table", | |
attrs: [], | |
namespaceURI: "http://www.w3.org/1999/xhtml", | |
childNodes: [], | |
parentNode: [Circular *1], | |
sourceCodeLocation: { | |
startLine: 1, | |
startCol: 1, | |
startOffset: 0, | |
endLine: 1, | |
endCol: 16, | |
endOffset: 15, | |
startTag: { | |
startLine: 1, | |
startCol: 1, | |
startOffset: 0, | |
endLine: 1, | |
endCol: 8, | |
endOffset: 7 | |
}, | |
endTag: { | |
startLine: 1, | |
startCol: 8, | |
startOffset: 7, | |
endLine: 1, | |
endCol: 16, | |
endOffset: 15 | |
} | |
} | |
} | |
] | |
} | |
re serialize = <table></table> | |
re serializeOuter = | |
----- output end ----- | |
parse <table></table> ... ok (10ms) | |
parse <html><body></body></html> ... | |
------- output ------- | |
documentFragment = { nodeName: "#document-fragment", childNodes: [] } | |
re serialize = | |
re serializeOuter = | |
----- output end ----- | |
parse <html><body></body></html> ... ok (5ms) | |
parse <tr><td>THE TEXT IS IN HERE</td></tr> ... | |
------- output ------- | |
documentFragment = <ref *2> { | |
nodeName: "#document-fragment", | |
childNodes: [ | |
<ref *1> { | |
nodeName: "tr", | |
tagName: "tr", | |
attrs: [], | |
namespaceURI: "http://www.w3.org/1999/xhtml", | |
childNodes: [ | |
{ | |
nodeName: "td", | |
tagName: "td", | |
attrs: [], | |
namespaceURI: "http://www.w3.org/1999/xhtml", | |
childNodes: [Array], | |
parentNode: [Circular *1], | |
sourceCodeLocation: [Object] | |
} | |
], | |
parentNode: [Circular *2], | |
sourceCodeLocation: { | |
startLine: 1, | |
startCol: 1, | |
startOffset: 0, | |
endLine: 1, | |
endCol: 38, | |
endOffset: 37, | |
startTag: { | |
startLine: 1, | |
startCol: 1, | |
startOffset: 0, | |
endLine: 1, | |
endCol: 5, | |
endOffset: 4 | |
}, | |
endTag: { | |
startLine: 1, | |
startCol: 33, | |
startOffset: 32, | |
endLine: 1, | |
endCol: 38, | |
endOffset: 37 | |
} | |
} | |
} | |
] | |
} | |
re serialize = <tr><td>THE TEXT IS IN HERE</td></tr> | |
re serializeOuter = | |
----- output end ----- | |
parse <tr><td>THE TEXT IS IN HERE</td></tr> ... ok (7ms) | |
parse <body></body> ... | |
------- output ------- | |
documentFragment = { nodeName: "#document-fragment", childNodes: [] } | |
re serialize = | |
re serializeOuter = | |
----- output end ----- | |
parse <body></body> ... ok (5ms) | |
ok | 4 passed | 0 failed (115ms) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { assertEquals } from "https://deno.land/std@0.193.0/testing/asserts.ts"; | |
import { parseFragment, serialize, serializeOuter } from "npm:parse5"; | |
// Registers one Deno test per HTML snippet. Each test parses the snippet as a
// fragment with no context element, then logs the resulting tree and what the
// parse5 serializers produce for it. The tests make no assertions; they fail
// only if parse5 reports a parse error.
for (
  const html of [
    "<table></table>",
    "<html><body></body></html>",
    "<tr><td>THE TEXT IS IN HERE</td></tr>",
    "<body></body>",
  ]
) {
  Deno.test(`parse ${html}`, () => {
    const documentFragment = parseFragment(null, html, {
      onParseError: (err) => {
        console.log("err = ", err);
        // parse5 hands over a plain record (error code plus location), not an
        // Error instance. Wrap it so the test failure carries a stack trace
        // and a readable message; the original record stays on `cause`.
        throw new Error(
          `parse5 reported "${err.code}" at ${err.startLine}:${err.startCol}`,
          { cause: err },
        );
      },
    });
    console.log("documentFragment = ", documentFragment);
    console.log("re serialize = ", serialize(documentFragment));
    console.log("re serializeOuter = ", serializeOuter(documentFragment));
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ task test -- 'tests/test_html5lib.py::test_parse' | |
task: [test] poetry run python -m pytest tests/test_html5lib.py::test_parse | |
============================================================================ test session starts ============================================================================ | |
platform linux -- Python 3.11.4, pytest-7.4.0, pluggy-1.2.0 | |
rootdir: /home/iwana/sw/d/gitlab.com/aucampia/pvt/scratchpad/tech/py3 | |
configfile: pyproject.toml | |
plugins: mock-3.11.1, cov-4.1.0 | |
collected 4 items | |
tests/test_html5lib.py::test_parse[<body></body>] | |
------------------------------------------------------------------------------- live log call ------------------------------------------------------------------------------- | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d3ebd50> | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [], | |
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d410170>} | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 6), 'unexpected-start-tag', {'name': 'body'}), | |
((1, 13), 'XXX-undefined-error', {})] | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>' | |
PASSED [ 25%] | |
tests/test_html5lib.py::test_parse[<html><body></body></html>] | |
------------------------------------------------------------------------------- live log call ------------------------------------------------------------------------------- | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d415a10> | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [], | |
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d740ad0>} | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 6), 'non-html-root', {}), | |
((1, 12), 'unexpected-start-tag', {'name': 'body'}), | |
((1, 19), 'XXX-undefined-error', {})] | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>' | |
PASSED [ 50%] | |
tests/test_html5lib.py::test_parse[<tr><td>THE TEXT IS IN HERE</td></tr>] | |
------------------------------------------------------------------------------- live log call ------------------------------------------------------------------------------- | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d417690> | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [<DOM Text node "'THE TEXT I'...">], | |
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d410230>} | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [((1, 4), 'unexpected-start-tag-ignored', {'name': 'tr'}), | |
((1, 8), 'unexpected-start-tag-ignored', {'name': 'td'}), | |
((1, 32), 'unexpected-end-tag', {'name': 'td'}), | |
((1, 37), 'unexpected-end-tag', {'name': 'tr'})] | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?>THE TEXT IS IN HERE' | |
PASSED [ 75%] | |
tests/test_html5lib.py::test_parse[<table></table>] | |
------------------------------------------------------------------------------- live log call ------------------------------------------------------------------------------- | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:22:test_parse fragment = <xml.dom.minidom.DocumentFragment object at 0x7f8b5d41d710> | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:23:test_parse fragment = {'childNodes': [<DOM Element: table at 0x7f8b5d407750>], | |
'ownerDocument': <xml.dom.minidom.Document object at 0x7f8b5d4102f0>} | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:24:test_parse fragment = [] | |
2023-07-13T00:21:10 2606687 140236803430208 010:DEBUG root test_html5lib:30:test_parse string = b'<?xml version="1.0" encoding="utf-8"?><table/>' | |
PASSED [100%] | |
============================================================================= 4 passed in 0.03s ============================================================================= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from pprint import pformat | |
import html5lib | |
import pytest | |
import xml.dom.minidom | |
@pytest.mark.parametrize(
    ("fragment_text",),
    [
        (snippet,)
        for snippet in (
            "<body></body>",
            "<html><body></body></html>",
            "<tr><td>THE TEXT IS IN HERE</td></tr>",
            "<table></table>",
        )
    ],
)
def test_parse(fragment_text: str) -> None:
    """Parse an HTML snippet as a fragment with html5lib and log the outcome.

    Logs the resulting minidom fragment, its attributes, the errors the
    parser recorded, and the XML produced after moving the fragment's child
    nodes into a fresh minidom document. The test makes no assertions.
    """
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    html_parser = html5lib.HTMLParser(tree=dom_builder)
    parsed = html_parser.parseFragment(fragment_text)
    parsed.normalize()

    for detail in (parsed, vars(parsed), html_parser.errors):
        logging.debug("fragment = %s", pformat(detail))

    # Put the fragment's children straight into an empty document's child
    # list so that toxml() can serialise them.
    holder = xml.dom.minidom.Document()
    holder.childNodes.extend(parsed.childNodes)
    serialised = holder.toxml(encoding="utf-8")
    logging.debug("string = %s", serialised)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment