kakwa/get_codecrap.py

## get_codecrap.py
from HTMLParser import HTMLParser
import urllib2
import re

# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Print the text of every ``<pre>`` whose class starts with "source-code".

    Text seen while inside such an element is collected and printed as one
    block when the closing ``</pre>`` arrives, so a listing that contains
    character references or inline tags is not broken up across lines.
    """

    def __init__(self):
        """Initialize and reset this instance."""
        # Call the base initializer rather than copying its body, so that
        # whatever state it sets up is present on this instance.
        HTMLParser.__init__(self)
        self.flag = False    # True while inside a matching <pre>
        self._pieces = []    # text fragments of the block being captured

    def handle_starttag(self, tag, attrs):
        """Start capturing when a ``<pre class="source-code...">`` opens.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser.
        """
        if tag != 'pre':
            return
        for name, value in attrs:
            # An attribute written without a value is reported as None;
            # skip it instead of failing on the prefix check.
            if name == 'class' and value is not None \
                    and value.startswith("source-code"):
                self.flag = True

    def handle_endtag(self, tag):
        """Stop capturing at ``</pre>`` and print what was collected."""
        if tag != 'pre':
            return
        if self.flag and self._pieces:
            print(''.join(self._pieces))
        self.flag = False
        self._pieces = []

    def handle_data(self, data):
        """Collect ``data`` while inside a matching ``<pre>``."""
        if self.flag:
            self._pieces.append(data)

    def handle_entityref(self, name):
        """Collect a named reference (``&name;``) as the text it stands for.

        Only invoked when the parser hands references to the handlers
        instead of folding them into the data itself.
        """
        if self.flag:
            self._pieces.append(self.unescape('&%s;' % name))

    def handle_charref(self, name):
        """Collect a numeric reference (``&#name;``) as its character.

        Only invoked when the parser hands references to the handlers
        instead of folding them into the data itself.
        """
        if self.flag:
            self._pieces.append(self.unescape('&#%s;' % name))

# Fetch the numbered content pages one after another and feed each one to
# a parser; stop at the first page the server answers with an HTTP error.
counter = 1
while True:
    print(str(counter))
    try:
        response = urllib2.urlopen('http://codecrap.com/content/' + str(counter) +  '/')
    except urllib2.HTTPError:
        break
    try:
        # 'replace' keeps a single badly encoded byte from aborting the
        # whole run with a UnicodeDecodeError.
        page = response.read().decode('utf-8', 'replace')
    finally:
        # Release the connection even if reading fails.
        response.close()
    # A fresh parser per page: an unclosed <pre> or partially buffered
    # markup in one document must not affect the next one.
    parser = MyHTMLParser()
    parser.feed(page)
    counter = counter + 1
	from HTMLParser import HTMLParser
	import urllib2
	import re

	# create a subclass and override the handler methods
	class MyHTMLParser(HTMLParser):
	    """Print the text of every ``<pre>`` whose class starts with "source-code".

	    Text seen while inside such an element is collected and printed as one
	    block when the closing ``</pre>`` arrives, so a listing that contains
	    character references or inline tags is not broken up across lines.
	    """

	    def __init__(self):
	        """Initialize and reset this instance."""
	        # Call the base initializer rather than copying its body, so that
	        # whatever state it sets up is present on this instance.
	        HTMLParser.__init__(self)
	        self.flag = False    # True while inside a matching <pre>
	        self._pieces = []    # text fragments of the block being captured

	    def handle_starttag(self, tag, attrs):
	        """Start capturing when a ``<pre class="source-code...">`` opens.

	        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
	        parser.
	        """
	        if tag != 'pre':
	            return
	        for name, value in attrs:
	            # An attribute written without a value is reported as None;
	            # skip it instead of failing on the prefix check.
	            if name == 'class' and value is not None \
	                    and value.startswith("source-code"):
	                self.flag = True

	    def handle_endtag(self, tag):
	        """Stop capturing at ``</pre>`` and print what was collected."""
	        if tag != 'pre':
	            return
	        if self.flag and self._pieces:
	            print(''.join(self._pieces))
	        self.flag = False
	        self._pieces = []

	    def handle_data(self, data):
	        """Collect ``data`` while inside a matching ``<pre>``."""
	        if self.flag:
	            self._pieces.append(data)

	    def handle_entityref(self, name):
	        """Collect a named reference (``&name;``) as the text it stands for.

	        Only invoked when the parser hands references to the handlers
	        instead of folding them into the data itself.
	        """
	        if self.flag:
	            self._pieces.append(self.unescape('&%s;' % name))

	    def handle_charref(self, name):
	        """Collect a numeric reference (``&#name;``) as its character.

	        Only invoked when the parser hands references to the handlers
	        instead of folding them into the data itself.
	        """
	        if self.flag:
	            self._pieces.append(self.unescape('&#%s;' % name))

	# Fetch the numbered content pages one after another and feed each one to
	# a parser; stop at the first page the server answers with an HTTP error.
	counter = 1
	while True:
	    print(str(counter))
	    try:
	        response = urllib2.urlopen('http://codecrap.com/content/' + str(counter) + '/')
	    except urllib2.HTTPError:
	        break
	    try:
	        # 'replace' keeps a single badly encoded byte from aborting the
	        # whole run with a UnicodeDecodeError.
	        page = response.read().decode('utf-8', 'replace')
	    finally:
	        # Release the connection even if reading fails.
	        response.close()
	    # A fresh parser per page: an unclosed <pre> or partially buffered
	    # markup in one document must not affect the next one.
	    parser = MyHTMLParser()
	    parser.feed(page)
	    counter = counter + 1