Last active
August 29, 2015 13:58
-
-
Save sturgle/9954216 to your computer and use it in GitHub Desktop.
A crawler to fetch book pages on douban. Note that it stops working after a while because douban blocks it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding=utf-8 | |
from sys import argv | |
from string import replace,find,lower | |
from htmllib import HTMLParser | |
from urllib import urlretrieve, urlopen | |
from urlparse import urlparse,urljoin | |
from formatter import DumbWriter,AbstractFormatter | |
from cStringIO import StringIO | |
import re | |
class Retriever(object):
    """Download a single page and expose the anchor links it contains.

    Usage: r = Retriever(url); r.retrieve(); r.parse(); links = r.getLinks()
    """

    def __init__(self, url):
        self.url = url
        self.htmlContent = ''

    def filename(self, url, deffile="index.htm"):
        # NOTE(review): despite the name, this returns netloc+path
        # (e.g. "book.douban.com/subject/123/"), not a local file name,
        # and `deffile` is never used. Kept as-is for compatibility.
        parsedurl = urlparse(url, "http:", 0)
        path = parsedurl[1] + parsedurl[2]
        return path

    def retrieve(self):
        """Fetch self.url and return its body as a string.

        On failure, returns an error *string* starting with "***" so the
        caller's `retval[0] == "*"` check works.  (Bug fix: this used to
        return a 1-tuple, which defeated that check.)
        """
        try:
            sock = urlopen(self.url)
            try:
                self.htmlContent = sock.read()
            finally:
                # Bug fix: was `sock.close` — a bare attribute access that
                # never called close() and leaked the connection.
                sock.close()
            retval = self.htmlContent
        except IOError:
            retval = '***Error: invalid URL: "%s"' % self.url
        return retval

    def parse(self):
        """Run the downloaded HTML through htmllib's parser; afterwards
        self.parser.anchorlist holds every <a href=...> target."""
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(self.htmlContent)
        self.parser.close()
        return

    def getLinks(self):
        # Only valid after parse() has been called.
        return self.parser.anchorlist
class Crawler(object): | |
pattern = 'http://book.douban.com/subject/\d+/$' | |
def __init__ (self,url): | |
self.q = [url] | |
self.seen = [] | |
self.dom = urlparse(url)[1] | |
def getPage(self,url): | |
r = Retriever(url) | |
retval = r.retrieve() | |
if retval[0] == "*": # error | |
print retval,"--- skipping parse" | |
return | |
print "URL:",url | |
self.seen.append(url) | |
r.parse() | |
m = re.match(Crawler.pattern, url) | |
if (m != None): | |
print r.parser.title | |
links = r.getLinks() | |
for eachLink in links: | |
if eachLink[:4] != "http" and find(eachLink,"://") == -1: | |
eachLink = urljoin(url,eachLink) | |
if eachLink not in self.seen: | |
if find(eachLink,self.dom) != -1: | |
if eachLink not in self.q: | |
m = re.match(Crawler.pattern, eachLink) | |
if (m != None): | |
self.q.append(eachLink) | |
def go(self): | |
while self.q: | |
url=self.q.pop() | |
self.getPage(url) | |
def main():
    """Entry point: crawl starting from the URL given as the first
    command-line argument, defaulting to the douban books front page.

    Generalized: the previous version hard-coded the URL and carried the
    argv handling as commented-out dead code; passing no argument still
    behaves exactly as before.
    """
    url = argv[1] if len(argv) > 1 else 'http://book.douban.com/'
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment