Skip to content

Instantly share code, notes, and snippets.

@ifduyue
Created January 11, 2012 15:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ifduyue/1595083 to your computer and use it in GitHub Desktop.
Save ifduyue/1595083 to your computer and use it in GitHub Desktop.
Extract the content between <p> and </p> tags.
def txt_wrap_by(begin, end, html):
    """Return the stripped text between the first `begin` and the next `end`.

    Only the first occurrence of `begin` is considered; `end` is searched
    for after it.

    Returns:
        '' when `html` is empty or otherwise falsy, None when either
        delimiter cannot be found, and the whitespace-stripped enclosed
        text otherwise.
    """
    if not html:
        return ''

    open_at = html.find(begin)
    if open_at < 0:
        return None

    # The payload starts right after the opening delimiter.
    content_from = open_at + len(begin)
    content_to = html.find(end, content_from)
    if content_to < 0:
        return None

    return html[content_from:content_to].strip()
def txt_wrap_by_all(begin, end, html):
    """Return every stripped text span enclosed by `begin` ... `end`.

    `html` is scanned left to right; after each match the search resumes
    just past the closing delimiter, so matches never overlap.

    Args:
        begin: Opening delimiter, e.g. '<p>'.
        end: Closing delimiter, e.g. '</p>'.
        html: Text to scan. A falsy value (None or '') yields no matches.

    Returns:
        A list of the whitespace-stripped spans, in order of appearance.
        The list is empty when `html` is falsy or nothing matches.

    Raises:
        ValueError: If `begin` and `end` are both empty. The scan position
            could then never advance and the loop would not terminate.
    """
    if not html:
        # Always hand back a list so callers get one consistent type.
        return []
    if not begin and not end:
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            # An opening delimiter without a closing one ends the scan.
            break
        result.append(html[start:endpos].strip())
        from_pos = endpos + len(end)
    return result
# Example usage: print each paragraph body found in `html`, which must
# already hold the page text. The single-argument print(...) form behaves
# the same on Python 2 and Python 3.
for p in txt_wrap_by_all('<p>', '</p>', html):
    print(p)
$ python test.py
1.34135603905 txt_wrap_by_all
2.01093912125 re_findall
82 82
import re
import timeit
def txt_wrap_by_all(begin, end, html):
    """Return every raw text span enclosed by `begin` ... `end`.

    `html` is scanned left to right; after each match the search resumes
    just past the closing delimiter, so matches never overlap. Spans are
    returned exactly as found, with no whitespace stripping.

    Args:
        begin: Opening delimiter, e.g. '<p>'.
        end: Closing delimiter, e.g. '</p>'.
        html: Text to scan. A falsy value (None or '') yields no matches.

    Returns:
        A list of the enclosed spans, in order of appearance. The list is
        empty when `html` is falsy or nothing matches.

    Raises:
        ValueError: If `begin` and `end` are both empty. The scan position
            could then never advance and the loop would not terminate.
    """
    if not html:
        # Always hand back a list so callers get one consistent type.
        return []
    if not begin and not end:
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            # An opening delimiter without a closing one ends the scan.
            break
        result.append(html[start:endpos])
        from_pos = endpos + len(end)
    return result
def re_findall(html):
    """Return the text of every <p>...</p> pair in `html` using a regex.

    The match is non-greedy, and DOTALL lets a paragraph span several
    lines.
    """
    paragraph = re.compile(r'''<p>(.*?)</p>''', re.DOTALL)
    return paragraph.findall(html)
if __name__ == '__main__':
    # Benchmark the str.find scanner against the regex version on a sample
    # page. `html` must remain a module-level name because the timeit setup
    # strings below import it from __main__.
    with open('index.html') as f:
        # The context manager closes the file handle once it has been read.
        html = f.read()

    t1 = timeit.Timer("txt_wrap_by_all('<p>', '</p>', html)", "from __main__ import txt_wrap_by_all,html")
    t2 = timeit.Timer("re_findall(html)", "from __main__ import re_findall,html")

    # Single-argument print(...) with %-formatting gives the same output on
    # Python 2 as the original print statements and also runs on Python 3.
    print('%s txt_wrap_by_all' % t1.timeit(1000))
    print('%s re_findall' % t2.timeit(1000))

    # Sanity check: both approaches should find the same number of matches.
    print('%d %d' % (len(txt_wrap_by_all('<p>', '</p>', html)), len(re_findall(html))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment