Created
May 29, 2012 11:26
-
-
Save Codercise/2827919 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################
# FIRST VARIATION (original note: only removes the first tag)
#################################
def remove_tags(html):
    """Return the words of *html* with every ``<...>`` tag removed.

    A tag is the text from a ``<`` up to and including the next ``>``.
    Each tag acts as a word separator, so ``'link</a>.'`` produces the two
    words ``'link'`` and ``'.'``. A ``<`` that is never closed by a ``>``
    is treated as a tag running to the end of the input.

    Args:
        html: the markup string to strip.

    Returns:
        A list of whitespace-separated words in document order. The list
        is empty when the input holds no text outside of tags.
    """
    words = []
    pos = 0
    while pos < len(html):
        tag_open = html.find('<', pos)
        if tag_open == -1:
            # No more tags: everything that is left is plain text.
            words.extend(html[pos:].split())
            break
        # Text sitting between the previous tag and this one.
        words.extend(html[pos:tag_open].split())
        tag_close = html.find('>', tag_open + 1)
        if tag_close == -1:
            # Unterminated tag: nothing after it can be trusted as text.
            break
        pos = tag_close + 1
    return words
#################################
# SECOND VARIATION (original note: still need to separate the capitalised words)
#################################
from HTMLParser import HTMLParser | |
class MLStripper(HTMLParser):
    """HTML parser that records the text found between tags.

    Feed markup with ``feed()``; every text chunk the parser reports is
    appended to ``self.fed`` in document order.
    """

    def __init__(self):
        # Run the base initializer instead of only reset(): on Python 3 it
        # also sets parser options that feed() relies on. The explicit
        # base-class call works on both Python 2 and Python 3.
        HTMLParser.__init__(self)
        # Text chunks collected so far, in the order they were reported.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text concatenated with no separator."""
        return ''.join(self.fed)
def remove_tags(html):
    """Return the words of *html* with the markup stripped by MLStripper.

    Each text chunk collected by the parser is split on its own, so words
    from neighbouring elements (``'<h1>Title</h1><p>This'``) stay separate
    instead of being fused into one token.

    Args:
        html: the markup string to strip.

    Returns:
        A list of whitespace-separated words in document order.
    """
    s = MLStripper()
    s.feed(html)
    # Flush any trailing text the parser may still be buffering.
    s.close()
    return [word for chunk in s.fed for word in chunk.split()]
##### TEST CASES
# Each call prints the word list; the expected output follows as a comment.
# print is written with parentheses around a single argument so the script
# parses on Python 3 and still prints the same text on Python 2.
print(remove_tags('''<h1>Title</h1><p>This is a
<a href="http://www.udacity.com">link</a>.<p>'''))
#>>> ['Title','This','is','a','link','.']
print(remove_tags('''<table cellpadding='3'>
<tr><td>Hello</td><td>World!</td></tr>
</table>'''))
#>>> ['Hello','World!']
print(remove_tags("<hello><goodbye>"))
#>>> []
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment