Skip to content

Instantly share code, notes, and snippets.

@dalf
Created April 17, 2022 09:41
Show Gist options
  • Save dalf/95d8487c5632bf09da91d055870162be to your computer and use it in GitHub Desktop.
lxml version
from typing import List
from lxml import html
import re
import lxml.etree
import lxml.html
import lxml.html.clean
from searx.utils import html_to_text, get_string_replaces_function
from searx import logger
# Sample input exercising comments, <style> content, entities, nested tags
# and tail text (the "Addtional" typo is part of the fixture on purpose).
html_string = '<!-- test --><style>.span { color: red; }<i>test</i></style><span>&#35;Exam&gt;ple <i>Italic</i>Tail text</span><br />Addtional text tail'

# Replaces every ASCII control character (code points 0..31) with a space.
replace_below_32_by_space = get_string_replaces_function(
    {chr(code): ' ' for code in range(32)}
)
class MyParserTarget:
    """lxml parser target that collects the text content of a document.

    Text found inside any of the BLOCKED_TAG elements (and a ``<br>``
    encountered there) is dropped; a ``<br>`` elsewhere contributes a
    single space.  The collected text is available via :attr:`texts`.
    """

    # Elements whose text content must not appear in the output.
    BLOCKED_TAG = ('script', 'style', 'textarea', 'form', 'iframe', 'object')

    def __init__(self):
        # Collected text fragments, joined lazily in `texts`.
        self._texts: List[str] = []
        # Nesting depth of blocked elements; > 0 means "currently suppressed".
        self.blocked_tag_count = 0

    def start(self, tag, attrib, nsmap) -> None:
        if tag in MyParserTarget.BLOCKED_TAG:
            self.blocked_tag_count += 1
        elif tag == 'br' and self.blocked_tag_count == 0:
            # Fix: a <br> inside a blocked element must not add a space,
            # for consistency with data() which suppresses such text.
            self._texts.append(' ')

    def end(self, tag) -> None:
        if tag in MyParserTarget.BLOCKED_TAG:
            self.blocked_tag_count -= 1

    def data(self, data) -> None:
        # Keep character data only when outside every blocked element.
        if self.blocked_tag_count == 0:
            self._texts.append(data)

    def close(self):
        # lxml convention: the value returned here becomes the result of
        # parser.close().  Returning the text is backward compatible with
        # callers that ignore the (previously None) return value.
        return self.texts

    @property
    def texts(self):
        """The text collected so far, as a single string."""
        return ''.join(self._texts)
def html_to_text_2(html_str):
    """Extract plain text from *html_str* with a streaming lxml parser target.

    Text inside blocked elements (script, style, ...) is skipped by the
    target; control characters are mapped to spaces and whitespace is
    collapsed in the returned string.  On a parse error, whatever text was
    collected before the failure is still returned.
    """
    target = MyParserTarget()
    pull_parser = lxml.etree.HTMLPullParser(
        target=target,
        events=('start', 'end'),
        no_network=True,
        collect_ids=False,
        remove_comments=False,
        remove_pis=False,
        strip_cdata=False,
        compact=False,
        huge_tree=False,
    )
    try:
        pull_parser.feed(html_str)
        pull_parser.close()
    except lxml.etree.ParseError:
        logger.debug("html_to_text: invalid HTML\n%s", html_str)
    extracted = target.texts
    # Drop the parser/target references eagerly (the script below probes RSS).
    del pull_parser
    del target
    return ' '.join(replace_below_32_by_space(extracted).split())
def get_memory_rss():
    """Return the resident set size (bytes) of the current process via psutil."""
    current_process = psutil.Process(os.getpid())
    return current_process.memory_info().rss
# Sanity check: show the output of both implementations on the sample input.
print(html_to_text_2(html_string))
print(html_to_text(html_string))

import timeit
import psutil
import os
import gc

# Rough leak probe: run the converter many times and compare RSS before/after.
m_before = get_memory_rss()
iter_count = 1_000
for _ in range(iter_count):
    html_to_text_2(html_string)
gc.collect()
m_diff = get_memory_rss() - m_before
print(m_diff, m_diff / iter_count)
# 1_000 iterations --> 253952 bytes
# 5_000_000 iterations --> 647_168 bytes
# if it is a leak, it doesn't grow linearly

# Timing comparison: lxml-target version vs. the existing searx helper.
print(timeit.timeit('html_to_text_2(html_string)', globals=globals(), number=100000))
print(timeit.timeit('html_to_text(html_string)', globals=globals(), number=100000))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment