Last active
March 13, 2024 02:22
-
-
Save dmitriiweb/9ee9c2438231e74e5bc6b02802afd22f to your computer and use it in GitHub Desktop.
BeautifulSoup vs lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from datetime import datetime | |
import requests | |
from bs4 import BeautifulSoup as BSoup | |
from lxml import html | |
def get_html():
    """Download the Wikipedia list of US states and return its raw HTML.

    Returns:
        str: The page's HTML source.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    # A timeout keeps the benchmark from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors instead of
    # silently benchmarking against an error page's markup.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text
def bs_scraping(page_source, parser):
    """Scrape state rows from the first table using BeautifulSoup.

    Args:
        page_source: Raw HTML of the Wikipedia page.
        parser: Backend name for BeautifulSoup ('lxml', 'html.parser',
            or 'html5lib').

    Returns:
        A list of [name, abbr, reps, water_km, land_km, total_km,
        population] lists, one per table row.
    """
    soup = BSoup(page_source, parser)
    table_rows = soup.find_all('table')[0].find_all('tr')
    records = []
    # The first two rows are headers, so scraping starts at the third.
    for table_row in table_rows[2:]:
        tds = table_row.find_all('td')
        records.append([
            table_row.find('th').get_text(),  # state name sits in the <th>
            tds[0].get_text(),                # abbreviation
            tds[-1].get_text(),               # representatives
            tds[-2].get_text(),               # water area (km)
            tds[-4].get_text(),               # land area (km)
            tds[-6].get_text(),               # total area (km)
            tds[-8].get_text(),               # population
        ])
    return records
def lxml_scraping(page_source):
    """Scrape state rows from the first content table using lxml directly.

    Args:
        page_source: Raw HTML of the Wikipedia page.

    Returns:
        A list of [name, abbr, reps, water_km, land_km, total_km,
        population] lists, one per table row.
    """
    document = html.fromstring(page_source)
    content_table = document.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
    records = []
    # Skip the two header rows at the top of the table.
    for table_row in content_table.findall('tr')[2:]:
        state_name = table_row.xpath('./th')[0].text_content()
        tds = table_row.xpath('./td')
        records.append([
            state_name,
            tds[0].text_content(),   # abbreviation
            tds[-1].text_content(),  # representatives
            tds[-2].text_content(),  # water area (km)
            tds[-4].text_content(),  # land area (km)
            tds[-6].text_content(),  # total area (km)
            tds[-8].text_content(),  # population
        ])
    return records
if __name__ == '__main__':
    # Run each scraper many times so per-call timing noise averages out.
    repeats = 100
    page_source = get_html()

    # Time BeautifulSoup once per supported parser backend.
    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        bs_start = datetime.now()
        for _ in range(repeats):
            bs_scraping(page_source, parser)
        bs_finish = datetime.now() - bs_start
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    # Time the raw-lxml implementation for comparison, formatted the same
    # way as the BeautifulSoup results above.
    lxml_start = datetime.now()
    for _ in range(repeats):
        lxml_scraping(page_source)
    lxml_finish = datetime.now() - lxml_start
    print('lxml time: {}'.format(lxml_finish))
# BeautifulSoup lxml time: 0:00:12.774159 | |
# BeautifulSoup html.parser time: 0:00:20.097766 | |
# BeautifulSoup html5lib time: 0:00:50.156767 | |
# lxml time: 0:00:02.027748 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment