Created
September 17, 2016 18:52
-
-
Save Mortal/8a7f0348685fd68ab63576dfc49ab9df to your computer and use it in GitHub Desktop.
HTML compressor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import sys | |
import html5lib | |
from xml.etree.ElementTree import ElementTree | |
def element_to_html(element): | |
with io.BytesIO() as buf: | |
# We cannot use default_namespace, | |
# since it incorrectly errors on unnamespaced attributes | |
# See: https://bugs.python.org/issue17088 | |
ElementTree(element).write( | |
buf, encoding='utf8', xml_declaration=False, | |
method='xml') | |
body = buf.getvalue().decode('utf8') | |
# Workaround to make it prettier | |
body = body.replace( | |
' xmlns:html="http://www.w3.org/1999/xhtml"', '') | |
body = body.replace('<html:', '<') | |
body = body.replace('</html:', '</') | |
return body | |
def tree_equal(t1, t2): | |
return (t1.tag == t2.tag and | |
t1.text == t2.text and | |
len(t1) == len(t2) and | |
all(c1.tail == c2.tail for c1, c2 in zip(t1, t2)) and | |
all(tree_equal(c1, c2) for c1, c2 in zip(t1, t2))) | |
def main(): | |
input = sys.stdin.read() | |
document = html5lib.parse(input) | |
input2 = element_to_html(document) | |
print("Input size: %s bytes" % len(input)) | |
print("Input size 2: %s bytes" % len(input2)) | |
input3 = input2.replace('<html>', '') | |
document3 = html5lib.parse(input3) | |
print("Input 3: %s bytes" % len(input3)) | |
print(tree_equal(document, document3)) | |
input4 = input3.replace('<head>', '') | |
document4 = html5lib.parse(input4) | |
print("Input 4: %s bytes" % len(input4)) | |
print(tree_equal(document3, document4)) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment