ifduyue/p_in_html.py

## p_in_html.py
def txt_wrap_by(begin, end, html):
    """Return the text between the first ``begin`` and the next ``end``.

    The extracted text has leading and trailing whitespace stripped.

    Returns:
        ``''`` when ``html`` is empty or otherwise falsy; ``None`` when
        ``begin`` is not found, or when ``end`` does not occur after it;
        the stripped text in between otherwise.
    """
    if not html:
        return ''

    open_at = html.find(begin)
    if open_at < 0:
        return None

    # The content starts right after the opening delimiter.
    content_from = open_at + len(begin)
    content_to = html.find(end, content_from)
    if content_to < 0:
        return None

    return html[content_from:content_to].strip()

def txt_wrap_by_all(begin, end, html):
    """Return every text span enclosed by ``begin`` ... ``end`` in ``html``.

    The string is scanned left to right; after each match the search
    resumes right after the closing delimiter, so matches never overlap.
    Each extracted span has surrounding whitespace stripped. A ``begin``
    with no following ``end`` stops the scan.

    Returns:
        A list of stripped strings, empty when ``html`` is empty/falsy or
        nothing matches.

    Raises:
        ValueError: if ``begin`` and ``end`` are both empty.
    """
    if not html:
        # Always return a list so callers can rely on a single return type.
        return []
    if not begin and not end:
        # Two empty delimiters match at the same position forever, so the
        # scan position would never advance.
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            break
        result.append(html[start:endpos].strip())
        from_pos = endpos + len(end)
    return result

# Usage example: print the stripped text of every <p>...</p> found in `html`.
# `html` is expected to already hold the page source as a string.
for p in txt_wrap_by_all('<p>', '</p>', html):
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(p)

## result (seconds for 1000 calls of each function, then the number of matches each returned)
$ python test.py
1.34135603905 txt_wrap_by_all
2.01093912125 re_findall
82 82

## test.py
import re
import timeit

def txt_wrap_by_all(begin, end, html):
    """Return every text span enclosed by ``begin`` ... ``end`` in ``html``.

    Benchmark variant: spans are returned exactly as they appear, with no
    whitespace stripping. The string is scanned left to right and the
    search resumes right after each closing delimiter, so matches never
    overlap. A ``begin`` with no following ``end`` stops the scan.

    Returns:
        A list of strings, empty when ``html`` is empty/falsy or nothing
        matches.

    Raises:
        ValueError: if ``begin`` and ``end`` are both empty.
    """
    if not html:
        # Always return a list so callers can rely on a single return type.
        return []
    if not begin and not end:
        # Two empty delimiters match at the same position forever, so the
        # scan position would never advance.
        raise ValueError('begin and end must not both be empty')

    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start < 0:
            break
        start += len(begin)
        endpos = html.find(end, start)
        if endpos < 0:
            break
        result.append(html[start:endpos])
        from_pos = endpos + len(end)
    return result

def re_findall(html):
    """Return the inner text of every ``<p>...</p>`` pair in ``html``.

    Regex counterpart of ``txt_wrap_by_all('<p>', '</p>', html)``. The
    match is non-greedy, and DOTALL lets ``.`` span newlines so that
    multi-line paragraphs are captured.
    """
    paragraph_pattern = r'''<p>(.*?)</p>'''
    return re.findall(paragraph_pattern, html, flags=re.DOTALL)


if __name__ == '__main__':
    # Benchmark str.find-based extraction against the regex version on the
    # contents of index.html.
    # `html` must stay a module-level name: the timeit setup strings below
    # import it from __main__.
    with open('index.html') as index_file:
        html = index_file.read()
    t1 = timeit.Timer("txt_wrap_by_all('<p>', '</p>', html)", "from __main__ import txt_wrap_by_all,html")
    t2 = timeit.Timer("re_findall(html)", "from __main__ import re_findall,html")

    # Total seconds for 1000 calls of each approach. %-formatting keeps the
    # "value label" output shape on both Python 2 and Python 3.
    print('%s %s' % (t1.timeit(1000), 'txt_wrap_by_all'))
    print('%s %s' % (t2.timeit(1000), 're_findall'))

    # Sanity check: both approaches should report the same number of matches.
    print('%s %s' % (len(txt_wrap_by_all('<p>', '</p>', html)), len(re_findall(html))))
	def txt_wrap_by(begin, end, html):
	    """Return the text between the first ``begin`` and the next ``end``.

	    The extracted text has leading and trailing whitespace stripped.

	    Returns:
	        ``''`` when ``html`` is empty or otherwise falsy; ``None`` when
	        ``begin`` is not found, or when ``end`` does not occur after it;
	        the stripped text in between otherwise.
	    """
	    if not html:
	        return ''
	    start = html.find(begin)
	    if start < 0:
	        return None
	    start += len(begin)
	    stop = html.find(end, start)
	    if stop < 0:
	        return None
	    return html[start:stop].strip()

	def txt_wrap_by_all(begin, end, html):
	    """Return every text span enclosed by ``begin`` ... ``end`` in ``html``.

	    The string is scanned left to right; after each match the search
	    resumes right after the closing delimiter, so matches never overlap.
	    Each extracted span has surrounding whitespace stripped. A ``begin``
	    with no following ``end`` stops the scan.

	    Returns:
	        A list of stripped strings, empty when ``html`` is empty/falsy or
	        nothing matches.

	    Raises:
	        ValueError: if ``begin`` and ``end`` are both empty.
	    """
	    if not html:
	        # Always return a list so callers can rely on a single return type.
	        return []
	    if not begin and not end:
	        # Two empty delimiters match at the same position forever, so the
	        # scan position would never advance.
	        raise ValueError('begin and end must not both be empty')

	    result = []
	    from_pos = 0
	    while True:
	        start = html.find(begin, from_pos)
	        if start < 0:
	            break
	        start += len(begin)
	        endpos = html.find(end, start)
	        if endpos < 0:
	            break
	        result.append(html[start:endpos].strip())
	        from_pos = endpos + len(end)
	    return result

	# Usage example: print the stripped text of every <p>...</p> found in `html`.
	# `html` is expected to already hold the page source as a string.
	for p in txt_wrap_by_all('<p>', '</p>', html):
	    # Single-argument print() behaves the same on Python 2 and Python 3.
	    print(p)
	$ python test.py
	1.34135603905 txt_wrap_by_all
	2.01093912125 re_findall
	82 82
	import re
	import timeit

	def txt_wrap_by_all(begin, end, html):
	    """Return every text span enclosed by ``begin`` ... ``end`` in ``html``.

	    Benchmark variant: spans are returned exactly as they appear, with no
	    whitespace stripping. The string is scanned left to right and the
	    search resumes right after each closing delimiter, so matches never
	    overlap. A ``begin`` with no following ``end`` stops the scan.

	    Returns:
	        A list of strings, empty when ``html`` is empty/falsy or nothing
	        matches.

	    Raises:
	        ValueError: if ``begin`` and ``end`` are both empty.
	    """
	    if not html:
	        # Always return a list so callers can rely on a single return type.
	        return []
	    if not begin and not end:
	        # Two empty delimiters match at the same position forever, so the
	        # scan position would never advance.
	        raise ValueError('begin and end must not both be empty')

	    result = []
	    from_pos = 0
	    while True:
	        start = html.find(begin, from_pos)
	        if start < 0:
	            break
	        start += len(begin)
	        endpos = html.find(end, start)
	        if endpos < 0:
	            break
	        result.append(html[start:endpos])
	        from_pos = endpos + len(end)
	    return result

	def re_findall(html):
	    """Return the inner text of every ``<p>...</p>`` pair in ``html``.

	    The match is non-greedy, and ``re.S`` lets ``.`` span newlines so
	    that multi-line paragraphs are captured.
	    """
	    return re.findall(r'''<p>(.*?)</p>''', html, re.S)


	if __name__ == '__main__':
	    # Benchmark str.find-based extraction against the regex version on the
	    # contents of index.html.
	    # `html` must stay a module-level name: the timeit setup strings below
	    # import it from __main__.
	    with open('index.html') as index_file:
	        html = index_file.read()
	    t1 = timeit.Timer("txt_wrap_by_all('<p>', '</p>', html)", "from __main__ import txt_wrap_by_all,html")
	    t2 = timeit.Timer("re_findall(html)", "from __main__ import re_findall,html")

	    # Total seconds for 1000 calls of each approach. %-formatting keeps the
	    # "value label" output shape on both Python 2 and Python 3.
	    print('%s %s' % (t1.timeit(1000), 'txt_wrap_by_all'))
	    print('%s %s' % (t2.timeit(1000), 're_findall'))

	    # Sanity check: both approaches should report the same number of matches.
	    print('%s %s' % (len(txt_wrap_by_all('<p>', '</p>', html)), len(re_findall(html))))