Skip to content

Instantly share code, notes, and snippets.

@kakwa
Created August 12, 2015 19:48
Show Gist options
  • Save kakwa/e0c5a462ca9d321041c3 to your computer and use it in GitHub Desktop.
Save kakwa/e0c5a462ca9d321041c3 to your computer and use it in GitHub Desktop.
from HTMLParser import HTMLParser
import urllib2
import re
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Print the text content of ``<pre>`` elements whose class starts with ``source-code``.

    Text is collected while the parser is inside a matching ``<pre>`` and is
    printed as a single string when the closing ``</pre>`` is seen, so nested
    markup and character references do not split the output across lines.
    Text of a ``<pre>`` that is never closed stays buffered until a later
    ``</pre>`` arrives.
    """

    def __init__(self):
        """Initialize the base parser and the capture state."""
        # True while the parser is inside a matching <pre> element.
        self.flag = False
        # Text fragments collected for the <pre> currently being captured.
        self._chunks = []
        # Explicit base-class call (instead of super()) because HTMLParser is
        # an old-style class on Python 2; the base initializer calls reset().
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start capturing when a ``<pre class="source-code...">`` tag opens.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by HTMLParser.
        """
        if tag != 'pre':
            return
        for name, value in attrs:
            # A bare attribute such as ``<pre class>`` is reported with a
            # value of None, which must not be treated as a string.
            if name == 'class' and value is not None and value.startswith('source-code'):
                self.flag = True
                break

    def handle_endtag(self, tag):
        """Print the captured text and stop capturing when ``</pre>`` closes."""
        if tag != 'pre':
            return
        if self._chunks:
            print("".join(self._chunks))
        self.flag = False
        self._chunks = []

    def handle_data(self, data):
        """Collect plain text found inside a captured ``<pre>`` element."""
        if self.flag:
            self._chunks.append(data)

    def handle_entityref(self, name):
        """Collect the character for a named reference such as ``&lt;``.

        On Python 2 the parser reports references through this hook rather
        than through handle_data, so without it they would be dropped.
        """
        if self.flag:
            self._chunks.append(self.unescape('&%s;' % name))

    def handle_charref(self, name):
        """Collect the character for a numeric reference such as ``&#60;``."""
        if self.flag:
            self._chunks.append(self.unescape('&#%s;' % name))
# Instantiate the parser and feed it each page in turn.
parser = MyHTMLParser()
counter = 1
# Request /content/1/, /content/2/, ... and stop at the first HTTP error.
while True:
    # Progress marker: the id of the page about to be fetched.
    print(str(counter))
    try:
        response = urllib2.urlopen('http://codecrap.com/content/' + str(counter) + '/')
    except urllib2.HTTPError:
        break
    # Always release the connection, even if reading or decoding fails.
    try:
        page = response.read().decode('utf-8')
    finally:
        response.close()
    parser.feed(page)
    counter = counter + 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment