public
Created

Truncate an HTML string without breaking its markup

  • Download Gist
gistfile1.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13
adylab:script adyliu$ python3 htmlsubstring.py
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span></div>
 
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div>
 
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div>
 
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div>
htmlsubstring.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# Truncate an HTML string to a given text length without breaking its markup
#author: Ady Liu(http://github.com/adyliu)
#date: 2012-12-28
 
 
 
# Void elements: tags that never take a closing tag. The truncation helpers
# in this file skip them when tracking which elements are still open, so no
# bogus "</br>"-style closer is ever appended. The original three entries are
# kept first; the rest complete the HTML void-element set (plus legacy "param").
notags = (
    'img', 'br', 'hr',
    'area', 'base', 'col', 'embed', 'input', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
)
 
class Tag:
    """Record of one opening tag, linked back to a previously seen Tag."""

    def __init__(self, label, pre=None):
        # Earlier Tag this one links back to, or None when there is none.
        self.pre = pre
        # Tag name as written in the markup, e.g. 'div'.
        self.label = label
        # Starts False; flipped to True once a matching closing tag is seen.
        self.end = False

    def __str__(self):
        # Rendered as "<label>:<end>", e.g. "div:False".
        return self.label + ':' + str(self.end)

    def __repr__(self):
        return self.__str__()
 
def substring2(html, size):
    """Cut *html* after *size* text characters and close the tags left open.

    Characters between '<' and '>' are markup and are not counted; every
    other character counts as one, so an entity such as "&amp;" counts as
    five characters and may be cut in the middle. Open elements are kept on
    a simple stack: a closing tag pops the most recent entry without
    comparing names, so the result is only well-formed when the input is
    properly nested.

    Args:
        html: The HTML source to shorten.
        size: Number of text characters to keep.

    Returns:
        *html* unchanged when its total length (markup included) is at most
        *size*; otherwise the prefix ending at the *size*-th text character
        followed by closing tags for every element still open.
    """
    if len(html) <= size:
        return html

    open_tags = []    # names of elements opened and not yet closed
    tag = ''          # raw text collected between '<' and '>'
    intag = False     # True while scanning inside '<' ... '>'
    count = 0         # text characters seen so far
    end = len(html)   # slice end; stays at len(html) if the limit is never hit
    for pos, c in enumerate(html):
        if c == '<':
            intag = True
        elif c == '>' and intag:
            intag = False
            parts = tag.split()
            tag = ''
            if not parts:
                continue  # "<>" carries no tag name
            name = parts[0]
            if name[0] == '/':
                name = name.replace('/', '')
                # A stray closing tag must not pop from an empty stack.
                if name not in notags and open_tags:
                    open_tags.pop()
            elif name[-1] != '/' and name not in notags:
                # Self-closed ("<x/>") and void elements are never tracked.
                open_tags.append(name)
        elif intag:
            tag += c
        else:
            # Plain text, including a literal '>' that appears outside a tag.
            count += 1
            if count >= size:
                end = pos + 1
                break

    closing = ''.join('</{0}>'.format(name) for name in reversed(open_tags))
    return html[:end] + closing
 
 
def substring(html, size):
    """Cut *html* after *size* text characters and close the tags left open.

    Characters between '<' and '>' are markup and are not counted; every
    other character counts as one, so an entity such as "&amp;" counts as
    five characters and may be cut in the middle. Each tracked opening tag
    becomes a Tag linked to the previously current one; a closing tag marks
    the nearest earlier Tag with the same name that is still open. Tags
    listed in ``notags`` are never tracked.

    Args:
        html: The HTML source to shorten.
        size: Number of text characters to keep.

    Returns:
        *html* unchanged when its total length (markup included) is at most
        *size*; otherwise the prefix ending at the *size*-th text character
        followed by closing tags, innermost first, for every tracked tag
        that was never closed.
    """
    if len(html) <= size:
        return html

    tags = []         # every tracked opening tag, in document order
    cur = None        # Tag most recently opened or closed; closing search starts here
    tag = ''          # raw text collected between '<' and '>'
    intag = False     # True while scanning inside '<' ... '>'
    count = 0         # text characters seen so far
    end = len(html)   # slice end; stays at len(html) if the limit is never hit
    for pos, c in enumerate(html):
        if c == '<':
            intag = True
        elif c == '>' and intag:
            intag = False
            parts = tag.split()
            tag = ''
            name = parts[0] if parts else ''
            if name.endswith('/'):
                name = name[:-1]  # "<br/>" -> "br"
            if not name:
                continue  # "<>" or "</>" carries no tag name
            if name[0] == '/':
                name = name[1:]
                if name not in notags:
                    # Walk with a separate cursor so that an unmatched
                    # closing tag leaves `cur` untouched instead of
                    # resetting it to None and losing the chain.
                    node = cur
                    while node is not None:
                        if node.label == name and not node.end:
                            node.end = True
                            cur = node
                            break
                        node = node.pre
            elif name not in notags:
                cur = Tag(name, cur)
                tags.append(cur)
        elif intag:
            tag += c
        else:
            # Plain text, including a literal '>' that appears outside a tag.
            count += 1
            if count >= size:
                end = pos + 1
                break

    closing = ''.join(
        '</{0}>'.format(t.label) for t in reversed(tags) if not t.end)
    return html[:end] + closing
 
if __name__ == '__main__':
    # Demo: for each limit, print the source, then the output of both
    # truncation functions, then a blank separator line.
    sample = '<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div>'
    limit = 18
    while limit < 30:
        print(sample)
        print(substring(sample, limit))
        print(substring2(sample, limit))
        print()
        limit += 1

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.