Skip to content

Instantly share code, notes, and snippets.

@RobinDavid
Created February 24, 2014 20:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RobinDavid/9196709 to your computer and use it in GitHub Desktop.
Save RobinDavid/9196709 to your computer and use it in GitHub Desktop.
HTML parser in Python that extracts the text of <h1> headings
from html.parser import HTMLParser #Import the parser
class HeadingParser(HTMLParser):
    """Report the text found inside ``<h1>`` elements of an HTML document.

    Every time an ``<h1>`` start tag is seen the parser prints
    ``"Found a Heading 1"``, and every piece of text delivered while inside
    that heading is printed as it arrives.  In addition, the complete text
    of each closed heading is appended to ``self.headings`` so callers can
    use the result programmatically instead of scraping stdout.

    Attributes:
        inHeading: True while the parser is between ``<h1>`` and ``</h1>``.
        headings: Text of every ``<h1>`` closed so far, in document order.
            Text split across several ``handle_data`` calls (for example by
            nested tags) is joined into a single entry.
    """

    # Class-level default so the flag is readable before any tag is seen.
    inHeading = False

    def __init__(self, *args, **kwargs):
        """Create the parser; arguments are forwarded to ``HTMLParser``."""
        super().__init__(*args, **kwargs)
        self.headings = []
        # Text pieces of the heading currently being read.
        self._fragments = []

    def handle_starttag(self, tag, attrs):
        """Enter heading mode when an ``<h1>`` start tag is encountered."""
        if tag == "h1":
            self.inHeading = True
            self._fragments = []
            print("Found a Heading 1")

    def handle_data(self, data):
        """Print and remember text, but only while inside an ``<h1>``."""
        if self.inHeading:
            self._fragments.append(data)
            print(data)

    def handle_endtag(self, tag):
        """Leave heading mode on ``</h1>`` and record the heading text."""
        if tag == "h1":
            # A stray </h1> without a matching <h1> must not add an entry.
            if self.inHeading:
                self.headings.append("".join(self._fragments))
            self.inHeading = False
            self._fragments = []
# Parse "file.html" and report every <h1> heading it contains.
hParser = HeadingParser()  # Create the parser object
# The context manager closes the file even if reading it raises.
with open("file.html", "r") as file:
    html = file.read()  # Read the entire file into memory
hParser.feed(html)  # Parse the document text held in "html"
hParser.close()  # Tell the parser the input is complete so buffered data is processed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment