Created
January 11, 2012 15:04
-
-
Save ifduyue/1595083 to your computer and use it in GitHub Desktop.
Extract the text content between <p> and </p> tags.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def txt_wrap_by(begin, end, html):
    """Return the text between the first ``begin`` and the next ``end``.

    The search for ``end`` starts right after the first occurrence of
    ``begin``. The extracted text has surrounding whitespace stripped.

    Returns ``''`` when ``html`` is empty (or otherwise falsy), and
    ``None`` when either delimiter cannot be found.
    """
    if not html:
        return ''

    open_at = html.find(begin)
    if open_at < 0:
        return None

    # Content begins immediately after the opening delimiter.
    content_from = open_at + len(begin)
    close_at = html.find(end, content_from)
    if close_at < 0:
        return None

    return html[content_from:close_at].strip()
def txt_wrap_by_all(begin, end, html):
    """Return every stripped substring of ``html`` found between ``begin`` and ``end``.

    The text is scanned left to right. After each match the search resumes
    just past the closing delimiter, so matches never overlap. Scanning
    stops at the first ``begin`` that has no following ``end``.

    Args:
        begin: Opening delimiter, e.g. ``'<p>'``.
        end: Closing delimiter, e.g. ``'</p>'``.
        html: Text to search.

    Returns:
        A list of the enclosed substrings with surrounding whitespace
        stripped. The list is empty when ``html`` is empty or nothing
        matches.

    Raises:
        ValueError: If ``begin`` and ``end`` are both empty while ``html``
            is not, because the scan position could never advance.
    """
    if not html:
        # Always hand back a list so callers get one consistent type.
        return []
    if not begin and not end:
        # Two empty delimiters match at the same offset forever.
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            break
        result.append(html[start:endpos].strip())
        # Resume after the closing delimiter.
        from_pos = endpos + len(end)
    return result
# Print the text of every <p>...</p> element, one per line.
# NOTE(review): ``html`` is not defined in this snippet; it must already
# hold the page source when this loop runs.
for p in txt_wrap_by_all('<p>', '</p>', html):
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(p)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python test.py | |
1.34135603905 txt_wrap_by_all | |
2.01093912125 re_findall | |
82 82 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import timeit | |
def txt_wrap_by_all(begin, end, html):
    """Return every substring of ``html`` found between ``begin`` and ``end``.

    The text is scanned left to right. After each match the search resumes
    just past the closing delimiter, so matches never overlap. Scanning
    stops at the first ``begin`` that has no following ``end``. The
    enclosed text is returned as-is, without stripping whitespace.

    Args:
        begin: Opening delimiter, e.g. ``'<p>'``.
        end: Closing delimiter, e.g. ``'</p>'``.
        html: Text to search.

    Returns:
        A list of the enclosed substrings. The list is empty when ``html``
        is empty or nothing matches.

    Raises:
        ValueError: If ``begin`` and ``end`` are both empty while ``html``
            is not, because the scan position could never advance.
    """
    if not html:
        # Always hand back a list so callers get one consistent type.
        return []
    if not begin and not end:
        # Two empty delimiters match at the same offset forever.
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            break
        result.append(html[start:endpos])
        # Resume after the closing delimiter.
        from_pos = endpos + len(end)
    return result
def re_findall(html):
    """Return the contents of every ``<p>...</p>`` pair in ``html`` via regex.

    The match is non-greedy, and DOTALL lets ``.`` match newlines so a
    paragraph may span several lines.
    """
    paragraph = re.compile(r'''<p>(.*?)</p>''', re.DOTALL)
    return paragraph.findall(html)
if __name__ == '__main__':
    # Benchmark the str.find scanner against the regex version on index.html.
    # ``html`` must stay a module-level name: the timeit setup strings below
    # import it from ``__main__``.
    with open('index.html') as page:  # close the file once it has been read
        html = page.read()

    t1 = timeit.Timer("txt_wrap_by_all('<p>', '</p>', html)", "from __main__ import txt_wrap_by_all,html")
    t2 = timeit.Timer("re_findall(html)", "from __main__ import re_findall,html")

    # Single-argument print(...) with %-formatting gives the same output on
    # Python 2 and Python 3.
    print('%s %s' % (t1.timeit(1000), 'txt_wrap_by_all'))
    print('%s %s' % (t2.timeit(1000), 're_findall'))

    # Sanity check: both approaches should find the same number of matches.
    print('%s %s' % (len(txt_wrap_by_all('<p>', '</p>', html)), len(re_findall(html))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment