leemurus/scrapper.py

## scrapper.py
import functools
import re
import time

import requests
from bs4 import BeautifulSoup
from selectolax.parser import HTMLParser


def print_performance(func):
    """Decorate *func* so that every call prints how long it took.

    Arguments and the return value pass through untouched. After each
    successful call one line of the form
    ``Time execution of <name>: <milliseconds> ms`` is printed to stdout.
    If *func* raises, the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that accepts the same arguments as *func* and carries its
        metadata (``__name__``, ``__doc__``, ``__wrapped__``).
    """
    # functools.wraps copies the decorated function's metadata, so
    # introspection does not report every decorated function as "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is the clock documented for measuring short durations;
        # it has the highest available resolution.
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        print(f'Time execution of {func.__name__}: {elapsed_ms:.2f} ms')
        return result

    return wrapper


@print_performance
def test_bs4(html_text):
    """Collect the distinct ``href`` values of ``<a>`` tags with BeautifulSoup.

    Args:
        html_text: The HTML document as a string.

    Returns:
        A set with the ``href`` attribute value of every anchor tag that
        has one.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    # find_all is the current bs4 spelling; findAll is a deprecated alias
    # kept from BeautifulSoup 3. href=True selects only anchors that carry
    # an href attribute, so the subscript below cannot raise KeyError.
    return {anchor['href'] for anchor in soup.find_all('a', href=True)}


@print_performance
def test_selectolax(html_text):
    """Collect the distinct ``href`` values of ``<a>`` tags with selectolax.

    Args:
        html_text: The HTML document as a string.

    Returns:
        A set with the ``href`` attribute value of every anchor tag that
        has one.
    """
    tree = HTMLParser(html_text)
    # Read each node's attribute mapping once, then keep only the anchors
    # that actually define an href.
    anchor_attributes = (node.attributes for node in tree.tags('a'))
    return {
        attributes['href']
        for attributes in anchor_attributes
        if 'href' in attributes
    }


@print_performance
def test_regex(html_text):
    """Collect the distinct quoted ``href`` values of ``<a>`` tags with a regex.

    This is a heuristic, not an HTML parser: only single- or double-quoted
    values on one line are recognised, character references inside the value
    are returned undecoded, and when a tag contains several ``href``
    attributes the first one is taken.

    Args:
        html_text: The HTML document as a string.

    Returns:
        A set of the matched ``href`` values.
    """
    # <a\s            the tag name must be exactly "a"; the previous pattern
    #                 "<a[^<>]+" also accepted <area>, <abbr>, <article>, ...
    # (?:[^<>]*?\s)?  optional earlier attributes, ending in whitespace so
    #                 that "data-href=" is not mistaken for "href="
    # href\s*=\s*     the attribute name, tolerating spaces around "="
    # (["\'])         the opening quote, ' or "
    # (.*?)           the value, as short as possible
    # \1              the closing quote, identical to the opening one
    anchor_href = re.compile(
        r'<a\s(?:[^<>]*?\s)?href\s*=\s*(["\'])(.*?)\1',
        re.IGNORECASE,  # tag and attribute names are case-insensitive in HTML
    )
    # findall yields (quote, value) tuples; only the value is of interest.
    return {value for _quote, value in anchor_href.findall(html_text)}


def main():
    """Fetch one Wikipedia page and run the three link extractors on it.

    Each extractor prints its own timing (via ``print_performance``); the
    final line printed is ``True`` when all three returned the same set of
    links and ``False`` otherwise.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://en.wikipedia.org/wiki/HTML'
    # Without an explicit timeout requests may wait indefinitely on a
    # stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of benchmarking the body of an HTTP error page.
    response.raise_for_status()
    # test_regex returns the raw attribute text, so "&amp;" is normalised
    # before parsing; this appears intended to keep its output comparable
    # with what the two parsers return.
    html_text = response.text.replace('&amp;', '&')

    soup_urls = test_bs4(html_text)
    selectolax_urls = test_selectolax(html_text)
    regex_urls = test_regex(html_text)
    print(soup_urls == selectolax_urls == regex_urls)


# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
	import re
	import requests
	import time

	from bs4 import BeautifulSoup
	from selectolax.parser import HTMLParser


	def print_performance(func):
	    """Decorate *func* so that every call prints how long it took.

	    Arguments and the return value pass through untouched. After each
	    successful call one line of the form
	    ``Time execution of <name>: <milliseconds> ms`` is printed to stdout.
	    If *func* raises, the exception propagates and nothing is printed.

	    Args:
	        func: The callable to time.

	    Returns:
	        A wrapper that accepts the same arguments as *func* and carries
	        its metadata (``__name__``, ``__doc__``, ``__wrapped__``).
	    """
	    # functools.wraps copies the decorated function's metadata, so
	    # introspection does not report every decorated function as "wrapper".
	    @functools.wraps(func)
	    def wrapper(*args, **kwargs):
	        # The signature must be (*args, **kwargs) so positional and
	        # keyword arguments are forwarded exactly as received.
	        # perf_counter is the clock documented for measuring short
	        # durations; it has the highest available resolution.
	        started = time.perf_counter()
	        result = func(*args, **kwargs)
	        elapsed_ms = (time.perf_counter() - started) * 1000
	        print(f'Time execution of {func.__name__}: {elapsed_ms:.2f} ms')
	        return result

	    return wrapper


	@print_performance
	def test_bs4(html_text):
	    """Collect the distinct ``href`` values of ``<a>`` tags with BeautifulSoup.

	    Args:
	        html_text: The HTML document as a string.

	    Returns:
	        A set with the ``href`` attribute value of every anchor tag that
	        has one.
	    """
	    soup = BeautifulSoup(html_text, 'html.parser')
	    # find_all is the current bs4 spelling; findAll is a deprecated alias
	    # kept from BeautifulSoup 3. href=True selects only anchors that carry
	    # an href attribute, so the subscript below cannot raise KeyError.
	    return {anchor['href'] for anchor in soup.find_all('a', href=True)}


	@print_performance
	def test_selectolax(html_text):
	    """Collect the distinct ``href`` values of ``<a>`` tags with selectolax.

	    Args:
	        html_text: The HTML document as a string.

	    Returns:
	        A set with the ``href`` attribute value of every anchor tag that
	        has one.
	    """
	    tree = HTMLParser(html_text)
	    # Read each node's attribute mapping once, then keep only the anchors
	    # that actually define an href.
	    anchor_attributes = (node.attributes for node in tree.tags('a'))
	    return {
	        attributes['href']
	        for attributes in anchor_attributes
	        if 'href' in attributes
	    }


	@print_performance
	def test_regex(html_text):
	    """Collect the distinct quoted ``href`` values of ``<a>`` tags with a regex.

	    This is a heuristic, not an HTML parser: only single- or double-quoted
	    values on one line are recognised, character references inside the
	    value are returned undecoded, and when a tag contains several ``href``
	    attributes the first one is taken.

	    Args:
	        html_text: The HTML document as a string.

	    Returns:
	        A set of the matched ``href`` values.
	    """
	    # <a\s            the tag name must be exactly "a"; the previous pattern
	    #                 "<a[^<>]+" also accepted <area>, <abbr>, <article>, ...
	    # (?:[^<>]*?\s)?  optional earlier attributes, ending in whitespace so
	    #                 that "data-href=" is not mistaken for "href="
	    # href\s*=\s*     the attribute name, tolerating spaces around "="
	    # (["\'])         the opening quote, ' or "
	    # (.*?)           the value, as short as possible
	    # \1              the closing quote, identical to the opening one
	    anchor_href = re.compile(
	        r'<a\s(?:[^<>]*?\s)?href\s*=\s*(["\'])(.*?)\1',
	        re.IGNORECASE,  # tag and attribute names are case-insensitive in HTML
	    )
	    # findall yields (quote, value) tuples; only the value is of interest.
	    return {value for _quote, value in anchor_href.findall(html_text)}


	def main():
	    """Fetch one Wikipedia page and run the three link extractors on it.

	    Each extractor prints its own timing (via ``print_performance``); the
	    final line printed is ``True`` when all three returned the same set of
	    links and ``False`` otherwise.

	    Raises:
	        requests.RequestException: If the download fails, times out, or
	            the server answers with an HTTP error status.
	    """
	    url = 'https://en.wikipedia.org/wiki/HTML'
	    # Without an explicit timeout requests may wait indefinitely on a
	    # stalled connection.
	    response = requests.get(url, timeout=30)
	    # Fail loudly instead of benchmarking the body of an HTTP error page.
	    response.raise_for_status()
	    # The search string must be the entity "&amp;": replacing "&" with
	    # "&" is a no-op. test_regex returns the raw attribute text, so the
	    # entity is normalised before parsing; this appears intended to keep
	    # its output comparable with what the two parsers return.
	    html_text = response.text.replace('&amp;', '&')

	    soup_urls = test_bs4(html_text)
	    selectolax_urls = test_selectolax(html_text)
	    regex_urls = test_regex(html_text)
	    print(soup_urls == selectolax_urls == regex_urls)


	# Script entry point: run the benchmark only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == '__main__':
	    main()