Skip to content

Instantly share code, notes, and snippets.

@josephmosby
Created October 1, 2012 03:30
Show Gist options
  • Save josephmosby/3809331 to your computer and use it in GitHub Desktop.
Save josephmosby/3809331 to your computer and use it in GitHub Desktop.
Strip all tags from an HTML string and return the remaining text content as a list of words
def strip_out_tags(html):
    """Return the words of *html* that lie outside of tags.

    A tag is everything from a ``<`` up to and including the next ``>``.
    The text between tags is split on any run of whitespace (spaces, tabs,
    newlines), so the result never contains empty strings or words with
    embedded whitespace.

    This is a plain character scan, not an HTML parser: entities are not
    decoded, the contents of ``<script>``/``<style>`` elements are kept as
    text, and a ``>`` inside an attribute value ends the tag early.

    Args:
        html: The markup to process. May be empty.

    Returns:
        A list of words, in document order. Empty if *html* is empty or
        contains no text outside tags.
    """
    words = []
    pos = 0
    length = len(html)

    while pos < length:
        tag_start = html.find('<', pos)
        if tag_start == -1:
            # No more tags: everything that is left is text.
            words.extend(html[pos:].split())
            break

        # Text between the previous tag (or the start) and this tag.
        words.extend(html[pos:tag_start].split())

        tag_end = html.find('>', tag_start)
        if tag_end == -1:
            # Unterminated '<' (e.g. "a < b"): there is no tag to strip, so
            # keep the remainder as text rather than silently dropping it.
            words.extend(html[tag_start:].split())
            break

        # Resume scanning right after the closing '>'.
        pos = tag_end + 1

    return words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment