BeautifulSoup vs lxml
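A quick benchmark: download the Wikipedia list of US states and territories once, then parse its first table 100 times with BeautifulSoup (using the lxml, html.parser, and html5lib backends) and 100 times with lxml directly, timing each run.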
from datetime import datetime
import requests
from bs4 import BeautifulSoup as BSoup
from lxml import html


def get_html():
    """Download the Wikipedia page listing US states and territories."""
    url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    r = requests.get(url)
    return r.text


def bs_scraping(page_source, parser):
    """Extract the states table with BeautifulSoup using the given parser backend."""
    bs_obj = BSoup(page_source, parser)
    rows = bs_obj.find_all('table')[0].find_all('tr')
    data = []
    for row in rows[2:]:  # skip the two header rows
        cells = row.find_all('td')
        name = row.find('th').get_text()
        abbr = cells[0].get_text()
        reps = cells[-1].get_text()
        water_km = cells[-2].get_text()
        land_km = cells[-4].get_text()
        total_km = cells[-6].get_text()
        population = cells[-8].get_text()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


def lxml_scraping(page_source):
    """Extract the same table with lxml and XPath."""
    tree = html.fromstring(page_source)
    table = tree.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
    rows = table.findall('tr')
    data = []
    for row in rows[2:]:  # skip the two header rows
        name = row.xpath('./th')[0].text_content()
        cells = row.xpath('./td')
        abbr = cells[0].text_content()
        reps = cells[-1].text_content()
        water_km = cells[-2].text_content()
        land_km = cells[-4].text_content()
        total_km = cells[-6].text_content()
        population = cells[-8].text_content()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


if __name__ == '__main__':
    repeats = 100
    page_source = get_html()  # fetch the page once, parse it repeatedly

    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        bs_start = datetime.now()
        for _ in range(repeats):
            bs_result = bs_scraping(page_source, parser)
        bs_finish = datetime.now() - bs_start
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    lxml_start = datetime.now()
    for _ in range(repeats):
        lxml_result = lxml_scraping(page_source)
    lxml_finish = datetime.now() - lxml_start
    print('lxml time:', lxml_finish)
# BeautifulSoup lxml time: 0:00:12.774159
# BeautifulSoup html.parser time: 0:00:20.097766
# BeautifulSoup html5lib time: 0:00:50.156767
# lxml time: 0:00:02.027748
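# For scale, lxml alone is roughly 6x faster here than BeautifulSoup with the
# lxml backend, ~10x faster than with html.parser, and ~25x faster than with
# html5lib.
#
# A minimal sketch of the same measurement using the standard-library timeit
# module instead of datetime arithmetic (assumes this script has run as-is, so
# page_source, repeats, and the scraping functions above are in scope):
#
#   import timeit
#   print(timeit.timeit(lambda: lxml_scraping(page_source), number=repeats))
#   print(timeit.timeit(lambda: bs_scraping(page_source, 'lxml'), number=repeats))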