Created
October 4, 2012 19:33
-
-
Save young001/3835876 to your computer and use it in GitHub Desktop.
tianya3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
from BeautifulSoup import BeautifulSoup,SoupStrainer | |
import urllib | |
import urllib2 | |
import re | |
import time | |
import os | |
import random | |
import cookielib | |
import json | |
import redis | |
from log_code import logger | |
# Shared Redis connection: stores scraped post ids ('post:ids'),
# visited listing urls ('visited_urls') and a page 'counter'.
r = redis.Redis()
# Install a cookie-aware opener globally so every urllib2.urlopen call
# in this module carries the session cookies Tianya sets.
cookie = cookielib.CookieJar()
cookie_support= urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Desktop Firefox User-Agent sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0'
main_url='http://m.tianya.cn/bbs/'
vu = '48886414191' # Tianya's mobile site issues a "vu" token; it differs per visit
#links_of_leiid = SoupStrainer('a', href=re.compile('leiid'))
#soup = BeautifulSoup(response,parseOnlyThese=links_of_leiid)
def response_soup_content1(url, referer):
    """Fetch *url* with a Referer header and return (soup, raw_html).

    Retries forever on any network/parse error, sleeping a random
    sub-second interval between attempts — this only returns on success.

    Fixes vs. original: Python-2-only ``except Exception, e`` syntax
    replaced with the 2.6+/3 compatible ``as`` form; dead commented-out
    code after the loop removed.
    """
    req = urllib2.Request(url, headers={'User-Agent': user_agent,
                                        'Referer': referer,
                                        'Host': 'm.tianya.cn'})
    while True:
        try:
            response = urllib2.urlopen(req, timeout=10).read()
            soup = BeautifulSoup(response)
            return soup, response
        except Exception as e:
            # Deliberate best-effort retry: report the error and try again.
            print(e)
            time.sleep(random.random())
def response_soup_content(url):
    """Fetch *url* (no Referer) and return (soup, raw_html).

    Retries forever on any network/parse error, sleeping a random
    sub-second interval between attempts — this only returns on success.

    Fix vs. original: Python-2-only ``except Exception, e`` syntax
    replaced with the 2.6+/3 compatible ``as`` form.
    """
    req = urllib2.Request(url, headers={'User-Agent': user_agent,
                                        'Host': 'm.tianya.cn'})
    while True:
        try:
            response = urllib2.urlopen(req, timeout=10).read()
            soup = BeautifulSoup(response)
            return soup, response
        except Exception as e:
            # Deliberate best-effort retry: report the error and try again.
            print(e)
            time.sleep(random.random())
def get_post_ids(url_content):
    """Return every post id (as a string) found in a listing page's HTML.

    Ids appear in hrefs as ``...;id=12345``; the capture group picks the
    digits only, in document order.
    """
    return re.findall(';id=(\d+)', url_content)
def get_next_page_url(url_content):
    """Extract the "next page" (下一页) link from a listing page's HTML.

    Returns the absolute url (main_url + relative href), or None when the
    page has no next-page link (i.e. the last page).

    Bug fix: the original called ``.group(1)`` on the ``re.search`` result
    before testing for None, so a page without a next-page link raised
    AttributeError and the ``return None`` branch was unreachable.
    """
    match = re.search('href="([^<]*?)">下一页', url_content)
    if match is None:
        return None
    return main_url + match.group(1)
def get_post_url(id):
    """Return the mobile-site url of the post whose id is *id*."""
    template = 'http://m.tianya.cn/bbs/art.jsp?item=free&id=%s'
    return template % id
def get_total_pages_number(post_id):
    """Return the total page count of post *post_id* (as a string), or
    None when no "current/total" marker is found in the post page.

    Fetches the post page over the network via response_soup_content.

    Bug fixes vs. original:
    - the result variable was misspelled (``total_pages__number`` assigned,
      ``total_pages_number`` returned), so every call raised NameError;
    - ``.group(0)`` returned the whole "current/total" fragment (e.g.
      ``1/23``); the total alone is capture group 1;
    - a page without the marker made ``.group`` blow up on None — now
      guarded.
    """
    post_url = get_post_url(post_id)
    post_content = response_soup_content(post_url)[1]
    match = re.search('\d\/(\d+)', post_content)
    if match is None:
        return None
    return match.group(1)
def to_json(data):
    """Serialize *data* to a JSON string suitable for storing in redis."""
    return json.dumps(data)
def sadd_ids_to_redis(redis_db, key, ids):
    """Add each id in *ids* to the redis set *key* (one SADD per id)."""
    for post_id in ids:
        redis_db.sadd(key, post_id)
def get_all_post_ids(redis_db,vu):
    """Crawl every listing page of Tianya's "free" board and collect all
    post ids into the redis set 'post:ids'.

    Side effects: SADDs ids into 'post:ids', RPUSHes each fetched listing
    url onto 'visited_urls', maintains a page 'counter' in redis, and
    writes every fetched page to a local file for offline inspection.
    Loops page-by-page until no "next page" link remains.
    """
    start_url = 'http://m.tianya.cn/bbs/list.jsp?item=free&idwriter=0&key=0&chk=&vu=%s' % vu
    print 'start_url is', start_url
    start_url_content = response_soup_content(start_url)[1]
    # dump the first page to disk for debugging
    open('start.html','w').write(start_url_content)
    start_ids = get_post_ids(start_url_content)
    logger.debug(start_ids)
    sadd_ids_to_redis(redis_db,'post:ids',start_ids)
    redis_db.rpush('visited_urls',start_url)
    # undo the html escaping ('&amp;' -> '&') left in the extracted href
    next_url = get_next_page_url(start_url_content).replace('amp;','')
    logger.debug('the next url is '+next_url)
    redis_db.set('counter',1)
    count = 1
    while next_url is not None:
        # throttle: random sub-second pause between page fetches
        time.sleep(random.random())
        # use the most recently visited url as the Referer for this request
        referer = r.lrange('visited_urls',-1,-1)[0]
        next_url_content = response_soup_content1(next_url,referer)[1]
        open('url_content_%s' %count,'w').write(next_url_content)
        ids = get_post_ids(next_url_content)
        logger.debug(ids)
        #print ids
        sadd_ids_to_redis(redis_db,'post:ids',ids)
        redis_db.rpush('visited_urls',next_url)
        # NOTE(review): on the last page get_next_page_url yields no match,
        # so chaining .replace here raises AttributeError on None — the loop
        # appears to terminate by exception rather than the while-condition;
        # confirm whether that is intended.
        next_url = get_next_page_url(next_url_content).replace('amp;','')
        logger.debug('next_url is'+next_url)
        redis_db.incr('counter')
        count = count+1
if __name__ == '__main__':
    # Start the full crawl using the module-level redis connection and vu token.
    get_all_post_ids(r,vu)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment