Created
October 20, 2015 08:09
-
-
Save swshan/87dbbb05dade36f0a9c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import sys | |
import re | |
import requests | |
import gevent | |
from bs4 import BeautifulSoup | |
import urlparse | |
import time | |
# HTTP headers sent with the request; presents a desktop Chrome User-Agent.
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
}
# Page fetched by crawler().
root_url = 'http://wap.douban.com'
def crawler():
    """Fetch ``root_url`` and save the raw response body to ``text.html``.

    Issues a GET request using the module-level ``header_info`` headers and
    ``timeout=4``. On a 200 response the status code is printed and the
    response bytes are written to ``text.html`` in the working directory.

    Returns:
        An empty list when the request raises a ``requests`` error or the
        status code is not 200; ``None`` otherwise (the original return
        contract is kept as-is).
    """
    try:
        r = requests.get(root_url, timeout=4, headers=header_info)
    except requests.RequestException as e:
        # Treat a timeout/connection failure like a non-200 reply instead
        # of letting the exception propagate out of crawler().
        print("Oops, request error: %s" % e)
        return []
    if r.status_code != 200:
        return []
    # Function-call form prints the same on Python 2 and matches the
    # print(...) call used in the IOError handler below.
    print(r.status_code)
    # Link extraction is currently disabled:
    # soup = BeautifulSoup(r.text, 'html.parser')
    # selects = [a.attrs.get('href') for a in soup.select('a')]
    try:
        with open("text.html", "wb") as f:
            f.write(r.content)
    except IOError:
        print("Oops, file error...")
# Spawn a single greenlet running crawler() and block until it finishes.
_crawl_job = gevent.spawn(crawler)
gevent.joinall([_crawl_job])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment