Skip to content

Instantly share code, notes, and snippets.

@leemurus
Last active January 10, 2023 00:10
Show Gist options
  • Save leemurus/4ae757c615057836c68f132dc73977e1 to your computer and use it in GitHub Desktop.
Save leemurus/4ae757c615057836c68f132dc73977e1 to your computer and use it in GitHub Desktop.
import functools
import re
import time

import requests
from bs4 import BeautifulSoup
from selectolax.parser import HTMLParser
def print_performance(func):
    """Decorate *func* so that every call prints its wall-clock duration.

    The wrapper forwards all positional and keyword arguments to *func*,
    prints the elapsed time in milliseconds together with ``func.__name__``,
    and returns whatever *func* returned. If *func* raises, the exception
    propagates and nothing is printed.
    """

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is the clock meant for measuring short durations;
        # it offers the highest available resolution.
        begin = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - begin
        print(f'Time execution of {func.__name__}: {duration * 1000:.2f} ms')
        return result

    return wrapper
@print_performance
def test_bs4(html_text):
    """Collect the ``href`` values of ``<a>`` tags using BeautifulSoup.

    Parses *html_text* with the ``html.parser`` backend and returns the set
    of ``href`` attribute values of every ``<a>`` tag that carries one.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    # find_all is the current method name; findAll is a deprecated alias.
    return {a_tag['href'] for a_tag in soup.find_all('a', href=True)}
@print_performance
def test_selectolax(html_text):
    """Collect the ``href`` values of ``<a>`` tags using selectolax.

    Parses *html_text* with ``HTMLParser`` and returns the set of ``href``
    attribute values found on the ``a`` tags that have that attribute.
    """
    tree = HTMLParser(html_text)
    # Read each node's attribute mapping once, then keep only those with href.
    anchor_attrs = (node.attributes for node in tree.tags('a'))
    return {attrs['href'] for attrs in anchor_attrs if 'href' in attrs}
@print_performance
def test_regex(html_text):
    """Collect quoted ``href`` values of ``<a>`` tags with a regular expression.

    Scans *html_text* as plain text and returns the set of ``href`` values
    found inside ``<a ...>`` opening tags. Only values wrapped in single or
    double quotes are recognised, and matching is case-sensitive.
    """
    # <a\s              tag name must be exactly "a" (not <abbr>, <area>, ...)
    # (?:[^<>]*?\s)?    optional other attributes, ending in whitespace so
    #                   that names such as data-href are not taken for href
    # href=([\'\"])     opening quote, ' or ", captured as group 1
    # (.*?)             the URL, as few characters as possible (group 2)
    # \1                the same quote character that opened the value
    anchor_href_regex = re.compile(r'<a\s(?:[^<>]*?\s)?href=([\'\"])(.*?)\1')
    # findall yields (quote, url) tuples because the pattern has two groups.
    return {url for _quote, url in anchor_href_regex.findall(html_text)}
def main():
    """Download one Wikipedia page and run the three link extractors on it.

    Calls ``test_bs4``, ``test_selectolax`` and ``test_regex`` on the same
    HTML text and prints ``True`` if all three returned equal sets of URLs,
    ``False`` otherwise.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://en.wikipedia.org/wiki/HTML'
    # A timeout keeps the script from hanging on a stalled connection, and
    # raise_for_status avoids silently benchmarking an HTTP error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # NOTE(review): presumably '&amp;' is unescaped up front so the raw-text
    # regex sees the same '&' the HTML parsers produce - confirm.
    html_text = response.text.replace('&amp;', '&')

    soup_urls = test_bs4(html_text)
    selectolax_urls = test_selectolax(html_text)
    regex_urls = test_regex(html_text)

    print(soup_urls == selectolax_urls == regex_urls)


if __name__ == '__main__':
    main()
@leemurus
Copy link
Author

leemurus commented Jan 9, 2023

Time execution of test_bs4: 418.98 ms
Time execution of test_selectolax: 13.22 ms
Time execution of test_regex: 3.83 ms
True

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment