@sturgle
Last active August 29, 2015 13:58
A crawler that collects book pages on douban, but it does not work for long: douban blocks it after a while.
#!/usr/bin/env python
# coding=utf-8
# Python 2 crawler that walks book.douban.com and prints book page titles.
from sys import argv
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve, urlopen
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
import re


class Retriever(object):
    """Downloads one page and exposes the links found on it."""

    def __init__(self, url):
        self.url = url
        self.htmlContent = ''

    def filename(self, url, deffile="index.htm"):
        # Currently unused: maps a URL to a host + path string.
        parsedurl = urlparse(url, "http:", 0)
        path = parsedurl[1] + parsedurl[2]
        return path

    def retrieve(self):
        try:
            sock = urlopen(self.url)
            self.htmlContent = sock.read()
            sock.close()
            retval = self.htmlContent
        except IOError:
            retval = '*** Error: invalid URL: "%s"' % self.url
        return retval

    def parse(self):
        # Throw away the formatted output; only the anchor list is needed.
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(self.htmlContent)
        self.parser.close()

    def getLinks(self):
        return self.parser.anchorlist


class Crawler(object):
    # Only book subject pages are treated as results and re-queued.
    pattern = r'http://book.douban.com/subject/\d+/$'

    def __init__(self, url):
        self.q = [url]                 # URLs waiting to be crawled
        self.seen = []                 # URLs already fetched
        self.dom = urlparse(url)[1]    # stay inside this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.retrieve()
        if retval[0] == '*':           # download failed
            print retval, '--- skipping parse'
            return
        print 'URL:', url
        self.seen.append(url)
        r.parse()
        if re.match(Crawler.pattern, url):
            print r.parser.title       # this is a book page; show its title
        links = r.getLinks()
        for eachLink in links:
            # Resolve relative links against the current page.
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            if eachLink not in self.seen:
                if find(eachLink, self.dom) != -1:
                    if eachLink not in self.q:
                        # Queue only links that look like book subject pages.
                        if re.match(Crawler.pattern, eachLink):
                            self.q.append(eachLink)

    def go(self):
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    '''
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input("Enter starting URL:")
        except (KeyboardInterrupt, EOFError):
            url = ""
    '''
    url = 'http://book.douban.com/'
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == "__main__":
    main()
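
The blocking mentioned in the description is most likely triggered by request rate and by the default urllib User-Agent. A minimal mitigation sketch, assuming that is the cause, is to send a browser-like User-Agent header and pause between fetches. The polite_fetch helper, the header value, and the 2-second delay below are illustrative assumptions, not part of the original script.

# Hypothetical mitigation sketch (Python 2), not from the original gist.
import time
import urllib2

HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; douban-book-crawler)'}

def polite_fetch(url, delay=2.0):
    # Fetch with a custom User-Agent, then wait before the next request.
    req = urllib2.Request(url, headers=HEADERS)
    html = urllib2.urlopen(req).read()
    time.sleep(delay)   # throttle so douban is less likely to block the crawler
    return html

Swapping this in for the bare urlopen() call inside Retriever.retrieve() would leave the rest of the crawler unchanged.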