Skip to content

Instantly share code, notes, and snippets.

@Codercise
Created May 29, 2012 11:26
Show Gist options
  • Save Codercise/2827919 to your computer and use it in GitHub Desktop.
Save Codercise/2827919 to your computer and use it in GitHub Desktop.
#################################
# FIRST VARIATION (only removes the first tag)
#################################
def remove_tags(html):
    """Return the words of *html* with every tag stripped out.

    Anything between a '<' and the next '>' is treated as a tag and acts as
    a word separator, so text on either side of a tag never fuses into one
    word. The remaining text is split on whitespace.

    Args:
        html: String of markup to process.

    Returns:
        A list of words in document order; empty when the input holds no
        text outside of tags.

    Note:
        This is a plain scan, not an HTML parser: a '>' inside a quoted
        attribute value ends the tag early. A '<' with no closing '>' is
        kept as ordinary text.
    """
    words = []
    pos = 0
    while True:
        tag_open = html.find('<', pos)
        if tag_open == -1:
            # No more tags: whatever is left is plain text.
            words.extend(html[pos:].split())
            break
        words.extend(html[pos:tag_open].split())
        tag_close = html.find('>', tag_open)
        if tag_close == -1:
            # Unterminated '<': keep the remainder rather than dropping it.
            words.extend(html[tag_open:].split())
            break
        pos = tag_close + 1
    return words
#################################
# SECOND VARIATION (still need to separate the capitalised words)
#################################
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that ignores markup and collects only the text content.

    Pass markup to ``feed()``; each piece of text the parser reports is
    appended to ``self.fed``. ``get_data()`` returns those pieces
    concatenated.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling reset() alone:
        # on Python 3 the base __init__ also sets state that feed() needs.
        # The explicit form is used instead of super() because HTMLParser
        # is an old-style class on Python 2.
        HTMLParser.__init__(self)
        self.fed = []  # text chunks, in the order the parser reported them

    def handle_data(self, d):
        """Record one chunk of text found outside of any tag."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks joined with no separator."""
        return ''.join(self.fed)
def remove_tags(html):
    """Return the words of *html* with all markup removed.

    The markup is run through ``MLStripper`` and the collected text is
    split on whitespace.

    Args:
        html: String of markup to process.

    Returns:
        A list of words; empty when the markup contains no text.
    """
    s = MLStripper()
    s.feed(html)
    # Split each text chunk separately so a tag acts as a word boundary.
    # Joining the chunks first would fuse text across a tag, e.g.
    # "link</a>." would become "link." instead of "link", ".".
    return [word for chunk in s.fed for word in chunk.split()]
##### TEST CASES
# Each call prints the word list extracted from the sample markup; the
# `#>>>` comment underneath shows the expected result.
print(remove_tags('''<h1>Title</h1><p>This is a
<a href="http://www.udacity.com">link</a>.<p>'''))
#>>> ['Title','This','is','a','link','.']

print(remove_tags('''<table cellpadding='3'>
<tr><td>Hello</td><td>World!</td></tr>
</table>'''))
#>>> ['Hello','World!']

print(remove_tags("<hello><goodbye>"))
#>>> []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment