@swshan
Created October 20, 2015 08:09
#-*- coding:utf-8 -*-
import sys
import re
import requests
import gevent
from bs4 import BeautifulSoup
import urlparse
import time
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
}
root_url = 'http://wap.douban.com'

def crawler():
    '''Fetch the root page of wap.douban.com and save its HTML to disk.'''
    r = requests.get(root_url, timeout=4, headers=header_info)
    if r.status_code != 200:
        return []
    print r.status_code
    '''
    soup = BeautifulSoup(r.text, 'html.parser')
    selects = [a.attrs.get('href') for a in soup.select('a')]
    '''
    try:
        with open("text.html", "wb") as f:
            f.write(r.content)
    except IOError:
        print("Oops, file error...")

# Run the crawler in a gevent greenlet and wait for it to finish.
gevent.joinall([
    gevent.spawn(crawler)
])
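
A possible next step, hinted at by the commented-out BeautifulSoup lines and the otherwise unused urlparse import, is extracting the page's links as absolute URLs. Below is a minimal sketch under that assumption; extract_links is a hypothetical helper, not part of the original gist.

#-*- coding:utf-8 -*-
# Sketch: collect absolute links from a fetched page (Python 2).
import urlparse
import requests
from bs4 import BeautifulSoup

def extract_links(base_url, html):
    '''Return absolute URLs for every <a href> found in html.'''
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = [a.attrs.get('href') for a in soup.select('a')]
    # urljoin resolves relative paths such as "/movie/" against base_url.
    return [urlparse.urljoin(base_url, h) for h in hrefs if h]

if __name__ == '__main__':
    r = requests.get('http://wap.douban.com', timeout=4)
    for link in extract_links('http://wap.douban.com', r.text):
        print link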