Skip to content

Instantly share code, notes, and snippets.

@young001
Created October 4, 2012 16:45
Show Gist options
  • Save young001/3834860 to your computer and use it in GitHub Desktop.
Save young001/3834860 to your computer and use it in GitHub Desktop.
Tianya forum crawler — collects post ids from the mobile site m.tianya.cn (Python 2)
# -*- coding:utf-8 -*-
from BeautifulSoup import BeautifulSoup,SoupStrainer
import urllib
import urllib2
import re
import time
import os
import json
import redis
from log_code import logger
r = redis.Redis()
user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0'
main_url='http://m.tianya.cn/bbs/'
vu = '48886414191' #在天涯的手机版本会有一个vu,每次访问不同
#links_of_leiid = SoupStrainer('a', href=re.compile('leiid'))
#soup = BeautifulSoup(response,parseOnlyThese=links_of_leiid)
def response_soup_content(url):
req = urllib2.Request(url, headers = { 'User-Agent' : user_agent })
while True:
#try:
#response = urllib2.urlopen(req).read()
#soup = BeautifulSoup(response)
#return soup,response
#except Exception, e:
#print e
#time.sleep(5)
response = urllib2.urlopen(req).read()
soup = BeautifulSoup(response)
return soup,response
def get_post_ids(url):
    """Return the list of post ids found on the list page at `url`."""
    raw_html = response_soup_content(url)[1]
    return re.findall(r';id=(\d+)', raw_html)
def get_next_page_url(url):
    """Return the absolute URL of the page following `url`, or None.

    The mobile list page carries an anchor whose text is the
    next-page marker; on the last page that anchor is absent.  The
    original called `.group(1)` directly, so a missing match raised
    AttributeError and the `is not None` guard below it could never
    fire — test the match object instead.
    """
    url_content = response_soup_content(url)[1]
    match = re.search('href="([^<]*?)">下一页', url_content)
    if match is None:
        return None  # last page: no next-page link present
    return main_url + match.group(1)
def get_post_url(id):
    """Map a post id to the mobile-site URL of that post."""
    base = 'http://m.tianya.cn/bbs/art.jsp?item=free&id=%s'
    return base % id
def get_total_pages_number(post_id):
    """Return (as a string) the total page count of post `post_id`.

    The post page shows a "current/total" pager; the total is the
    digit run after the slash.  The original assigned the misspelled
    `total_pages__number` but returned `total_pages_number`
    (guaranteed NameError) and used `.group(0)`, which includes the
    "current/" prefix — group(1) is the total alone.
    """
    post_url = get_post_url(post_id)
    post_content = response_soup_content(post_url)[1]
    match = re.search(r'\d\/(\d+)', post_content)
    total_pages_number = match.group(1)
    return total_pages_number
def to_json(data):
    """Serialize `data` to a JSON string (for storage in redis)."""
    return json.dumps(data)
def sadd_ids_to_redis(redis_db,key,ids):
for i in ids:
redis_db.sadd(key,i)
def get_all_post_ids(redis_db,vu):
'获取天涯杂谈这个板块所有帖子的id'
start_url = 'http://m.tianya.cn/bbs/list.jsp?item=free&idwriter=0&key=0&chk=&vu=%s' % vu
print 'start_url is', start_url
start_ids = get_post_ids(start_url)
logger.debug(start_ids)
sadd_ids_to_redis(redis_db,'post:ids',start_ids)
redis_db.rpush('visited_urls',start_url)
next_url = get_next_page_url(start_url).replace('amp;','')
logger.debug('the next url is '+next_url)
redis_db.set('counter',1)
while next_url is not None:
ids = get_post_ids(next_url)
logger.debug(ids)
#print ids
sadd_ids_to_redis(redis_db,'post:ids',ids)
redis_db.rpush('visited_urls',next_url)
next_url = get_next_page_url(next_url).replace('amp;','')
logger.debug('next_url is'+next_url)
redis_db.incr('counter')
if __name__ == '__main__':
get_all_post_ids(r,vu)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment