Created
October 26, 2012 03:40
-
-
Save anonymous/3956734 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
from BeautifulSoup import BeautifulSoup,SoupStrainer | |
import time | |
import urllib | |
import urllib2 | |
import re | |
import time as ti | |
import os | |
import sys | |
import json | |
import redis | |
import chardet | |
import MySQLdb | |
# Python 2 only: reload(sys) restores sys.setdefaultencoding (site.py deletes
# it at startup) so implicit str <-> unicode conversions use UTF-8, not ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# Redis client with default connection settings; the script reads the set of
# post ids to process from it.
r = redis.Redis()

# Browser-like User-Agent sent with every HTTP request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0'
main_url = 'http://m.tianya.cn/bbs/'
vu = '48886414191'  # Tianya's mobile site carries a "vu" value that differs on every visit
#links_of_leiid = SoupStrainer('a', href=re.compile('leiid'))
#soup = BeautifulSoup(response,parseOnlyThese=links_of_leiid)
def get_encoding(string):
    """Return the name of the encoding that chardet detects for *string*."""
    return chardet.detect(string)['encoding']
def response_soup_content(url, max_retries=None, retry_delay=5):
    """Fetch *url* and return ``(soup, response)``, retrying on failure.

    The request is sent with the module-level ``user_agent`` header and a
    10-second timeout.

    @param url: address to download.
    @param max_retries: maximum number of retries after a failed attempt;
        ``None`` (the default) keeps retrying forever, as before.
    @param retry_delay: seconds to sleep between attempts (default 5).
    @return: tuple of the parsed BeautifulSoup document and the raw body.
    @raise Exception: the last download/parse error, only when
        ``max_retries`` is set and has been exhausted.
    """
    req = urllib2.Request(url, headers={'User-Agent': user_agent})
    failures = 0
    while True:
        try:
            response = urllib2.urlopen(req, timeout=10).read()
            soup = BeautifulSoup(response)
            return soup, response
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop the retry loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)
def get_post_url(id):
    """Build the mobile-site URL of the post identified by *id*."""
    url_template = 'http://m.tianya.cn/bbs/art.jsp?item=free&id=%s'
    return url_template % id
def get_total_pages_number(post_id):
    """Return the total number of pages of the post *post_id* as an int.

    Downloads the post's first page and searches the raw response for a
    pager of the form ``<digit>/<total>页``. When no pager is found the
    post is treated as having a single page.

    @param post_id: id of the post, as accepted by get_post_url().
    @return: total page count as an ``int`` (1 when no pager is present).
    """
    post_url = get_post_url(post_id)
    print('post_url is %s' % post_url)
    post_content = response_soup_content(post_url)[1]
    total_pages_search = re.search(r'\d\/(\d+)页', post_content)
    if total_pages_search is None:
        return 1
    # Convert the captured digits so both branches return the same type.
    return int(total_pages_search.group(1))
def strtosecs(str_time):
    """Convert a ``'YYYY-MM-DD HH:MM'`` string to integer epoch seconds.

    The timestamp is interpreted as local time, because the conversion goes
    through time.mktime().
    """
    parsed = time.strptime(str_time, '%Y-%m-%d %H:%M')
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds)
def insert_into_mysql(conn, cur, pid, user, time, content):
    """Insert one row into the ``post`` table and commit it.

    MySQL errors are reported on stdout and not re-raised, so one failed
    insert does not abort the caller.

    @param conn: open MySQL connection, used to commit.
    @param cur: cursor on *conn*, used to run the insert.
    @param pid: id of the post.
    @param user: name of the posting user.
    @param time: time the post was published.
    @param content: text of the post.
    @return: None
    """
    try:
        value = [pid, user, time, content]
        # Parameterized query: the driver handles quoting of the values.
        cur.execute('insert into post(id,user,time,content) values(%s,%s,%s,%s)', value)
        conn.commit()
    except MySQLdb.Error as e:
        # Some MySQLdb errors carry fewer than two args; avoid an IndexError
        # inside the handler hiding the real failure.
        if len(e.args) >= 2:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print("Mysql Error: %s" % (e,))
def download_content_test(id):
    """Test worker: print the page count of post *id*, then sleep 10 seconds."""
    page_count = get_total_pages_number(id)
    print('total_pages_number is %s' % page_count)
    print('I am sleeping')
    time.sleep(10)
if __name__ == '__main__':
    # Script entry point: spawn one download_content_test job per post id
    # stored in the Redis set 'post:timeline_test:ids', then wait for all jobs.
    #
    # NOTE(review): Pool is not imported anywhere in the visible file. The
    # spawn()/join() calls look like gevent.pool.Pool -- confirm and add the
    # matching import at the top of the file.
    pool = Pool(20)
    ids_set = r.smembers('post:timeline_test:ids')
    #ids_set = {'945453','1909279'}
    #conn=MySQLdb.connect(host='localhost',user='root',passwd='young001',db='tianya',charset="utf8")
    #cur=conn.cursor()
    # smembers() returns a set, never None: loop until it is empty instead of
    # letting pop() raise KeyError on the exhausted set.
    while ids_set:
        post_id = ids_set.pop()
        print('now id is %s' % post_id)
        pool.spawn(download_content_test, int(post_id))
        #r.sadd('visited_ids',post_id)
    # Wait once, after everything has been queued, for the jobs to finish.
    pool.join()
    # The MySQL connection above is commented out, so there is nothing to
    # close here. If it is re-enabled, close the cursor before the connection:
    #cur.close()
    #conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment