Created
April 17, 2022 09:41
-
-
Save dalf/95d8487c5632bf09da91d055870162be to your computer and use it in GitHub Desktop.
lxml version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
from lxml import html | |
import re | |
import lxml.etree | |
import lxml.html | |
import lxml.html.clean | |
from searx.utils import html_to_text, get_string_replaces_function | |
from searx import logger | |
# Sample input shared by the sanity prints and the benchmarks further down:
# it mixes a comment, a <style> element, nested inline tags and a <br />.
html_string = '<!-- test --><style>.span { color: red; }<i>test</i></style><span>#Exam>ple <i>Italic</i>Tail text</span><br />Addtional text tail'

# Replacement table: every character with code point 0..31 maps to one space.
# NOTE(review): get_string_replaces_function comes from searx.utils; it is
# presumably turning the table into a str -> str callable (html_to_text_2
# calls the result on a string) -- confirm against searx.utils.
replace_below_32_by_space = get_string_replaces_function(
    dict.fromkeys(map(chr, range(32)), ' ')
)
class MyParserTarget:
    """Parser target that accumulates the text content of an HTML document.

    An instance is handed to the lxml parser as ``target=``; the parser then
    calls ``start``/``end``/``data``/``close`` while it walks the document.
    Text that appears inside any element listed in ``BLOCKED_TAG`` is
    discarded, and every ``<br>`` contributes a single space so the text on
    both sides of the line break does not get glued together.
    """

    # Elements whose character data must not end up in the extracted text.
    BLOCKED_TAG = ('script', 'style', 'textarea', 'form', 'iframe', 'object')

    def __init__(self):
        # Text fragments in document order; joined lazily by ``texts``.
        self._texts = []
        # Number of currently open blocked elements (they can be nested).
        self.blocked_tag_count = 0

    def start(self, tag, attrib, nsmap) -> None:
        """Handle an opening tag; ``attrib`` and ``nsmap`` are not used."""
        if tag in MyParserTarget.BLOCKED_TAG:
            self.blocked_tag_count += 1
        if tag == 'br':
            self._texts.append(' ')

    def end(self, tag) -> None:
        """Handle a closing tag.

        The counter is never allowed to drop below zero: a stray closing
        blocked tag without a matching opening one would otherwise leave it
        negative, and ``data`` would then drop all following text.
        """
        if tag in MyParserTarget.BLOCKED_TAG and self.blocked_tag_count > 0:
            self.blocked_tag_count -= 1

    def data(self, data) -> None:
        """Record ``data`` unless a blocked element is currently open."""
        if self.blocked_tag_count == 0:
            self._texts.append(data)

    def close(self):
        """Called when parsing finishes; nothing to release, returns None."""
        pass

    @property
    def texts(self):
        """All recorded fragments concatenated into one string."""
        return ''.join(self._texts)
def html_to_text_2(html_str):
    """Extract the text of ``html_str`` using an lxml target parser.

    The markup is fed to an ``lxml.etree.HTMLPullParser`` whose target is a
    fresh ``MyParserTarget``. A ``lxml.etree.ParseError`` is logged at debug
    level and otherwise ignored, so whatever text was gathered up to that
    point is still used. The gathered text is passed through
    ``replace_below_32_by_space`` and runs of whitespace are collapsed to a
    single space.
    """
    target = MyParserTarget()
    parser = lxml.etree.HTMLPullParser(
        target=target,
        events=('start', 'end'),
        no_network=True,
        collect_ids=False,
        remove_comments=False,
        remove_pis=False,
        strip_cdata=False,
        compact=False,
        huge_tree=False,
    )
    try:
        parser.feed(html_str)
        parser.close()
    except lxml.etree.ParseError:
        logger.debug("html_to_text: invalid HTML\n%s", html_str)

    collected = target.texts
    # Drop the parser and its target explicitly before building the result.
    del parser, target

    words = replace_below_32_by_space(collected).split()
    return ' '.join(words)
def get_memory_rss():
    """Return the ``rss`` field of psutil's memory info for this process."""
    current_process = psutil.Process(os.getpid())
    return current_process.memory_info().rss
import gc
import os
import timeit

import psutil

# Sanity check: print what each implementation produces for the sample input.
print(html_to_text_2(html_string))
print(html_to_text(html_string))

# Memory check: compare the process RSS before and after many conversions.
m_before = get_memory_rss()
iter_count = 1_000
for _ in range(iter_count):
    html_to_text_2(html_string)
gc.collect()
m_diff = get_memory_rss() - m_before
print(m_diff, m_diff / iter_count)
# Observed RSS growth:
#   1_000 iterations     --> 253952 bytes
#   5_000_000 iterations --> 647_168 bytes
# If this is a leak, it does not grow linearly with the iteration count.

# Speed check: time both implementations on the same input.
print(timeit.timeit('html_to_text_2(html_string)', globals=globals(), number=100_000))
print(timeit.timeit('html_to_text(html_string)', globals=globals(), number=100_000))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment