Skip to content

Instantly share code, notes, and snippets.

@MercuryRising
Created November 12, 2012 19:29
Show Gist options
  • Save MercuryRising/4061368 to your computer and use it in GitHub Desktop.
Save MercuryRising/4061368 to your computer and use it in GitHub Desktop.
PyQuery, lxml, and BeautifulSoup performance comparison
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq
from lxml.html import fromstring
import re
import requests
import time
def Timer():
    """Generator that yields the seconds elapsed since the previous lap.

    The first ``next()`` call starts the clock and yields a value close to
    zero; every later ``next()`` call yields the time elapsed since the
    preceding ``next()`` call returned.

    Yields:
        float: elapsed seconds for the lap that just ended.
    """
    # Prefer a monotonic, high-resolution clock for benchmarking; fall back
    # to time.time() on interpreters that lack perf_counter (Python 2).
    clock = getattr(time, 'perf_counter', time.time)
    lap_start = clock()
    while True:
        # Read the clock exactly once per lap so the value reported and the
        # start of the next lap are the same instant (no double counting).
        now = clock()
        yield now - lap_start
        lap_start = now
# Benchmark driver: repeatedly look up every <p> element of one HTML page
# with several libraries and report the wall-clock time each one takes.
timer = Timer()

url = "http://www.python.org/"
# Bound the download and fail loudly on an HTTP error so the benchmark
# neither hangs forever nor silently measures an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
num = 100000

# A parenthesised single-argument print is valid both as a Python 2 print
# statement and as a Python 3 function call.
print('\n==== Total trials: %s =====' % num)

# Prime the timer; the value yielded here is discarded, and each later
# next(timer) reports the time since the previous next(timer) call.
next(timer)

# BeautifulSoup (lxml backend). The one-off parse is inside the timed lap.
soup = bs(html, 'lxml')
for _ in range(num):
    paragraphs = soup.find_all('p')
t = next(timer)
print('bs4 total time: %.1f' % t)

# PyQuery. The one-off parse is inside the timed lap.
d = pq(html)
for _ in range(num):
    paragraphs = d('p')
t = next(timer)
print('pq total time: %.1f' % t)

# lxml with a CSS selector. The one-off parse is inside the timed lap.
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.cssselect('p')
t = next(timer)
print('lxml (cssselect) total time: %.1f' % t)

# lxml with XPath; the document is parsed again so this lap also includes
# one parse, like the laps above.
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.xpath('.//p')
t = next(timer)
print('lxml (xpath) total time: %.1f' % t)

# Plain regex over the raw text. The pattern only matches an opening tag
# written exactly as "<p>" (or "< >") and "." does not cross newlines, so
# it misses paragraphs with attributes or spanning lines -- hence the
# caveat in the output. Kept unchanged so results stay comparable.
for _ in range(num):
    paragraphs = re.findall('<[p ]>.*?</p>', html)
t = next(timer)
print('regex total time: %.1f (doesn\'t find all p)\n' % t)
@p3nj
Copy link

p3nj commented Apr 30, 2023

Python 3.11.2

==== Total trials: 100000 =====
bs4 total time: 18.1
pq total time: 2.2
lxml (cssselect) total time: 2.2
lxml (xpath) total time: 1.7
regex total time: 5.2 (doesn't find all p)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment