public
anonymous / gist:3956734
Created

  • Download Gist
gistfile1.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
# -*- coding:utf-8 -*-
from BeautifulSoup import BeautifulSoup,SoupStrainer
import time
import urllib
import urllib2
import re
import time as ti
import os
import sys
import json
import redis
import chardet
import MySQLdb
 
# Python 2 idiom: reloading sys makes setdefaultencoding callable again so the
# interpreter-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
 
# Redis client built with the library's default connection settings; the
# __main__ block reads the set of post ids through it.
r = redis.Redis()
 
# User-Agent header sent with every page request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0'
# Base address of the mobile BBS (not referenced by the code visible in this file).
main_url='http://m.tianya.cn/bbs/'
vu = '48886414191' # The mobile version of Tianya carries a "vu" value that differs on every visit
 
#links_of_leiid = SoupStrainer('a', href=re.compile('leiid'))
#soup = BeautifulSoup(response,parseOnlyThese=links_of_leiid)
 
def get_encoding(string):
    """Return the encoding name that chardet reports for ``string``.

    @param string: the data handed to ``chardet.detect``
    @return: the value stored under the ``'encoding'`` key of the detection result
    """
    detection = chardet.detect(string)
    return detection['encoding']
 
def response_soup_content(url):
    """Download ``url`` and return both the parsed tree and the raw body.

    The request carries the module-level ``user_agent`` header and uses a
    10 second timeout. When an attempt fails, the function sleeps 5 seconds
    and tries again; there is no upper bound on the number of attempts.

    @param url: address of the page to download
    @return: tuple ``(soup, response)`` - the BeautifulSoup tree and the raw page body
    """
    req = urllib2.Request(url, headers={'User-Agent': user_agent})
    while True:
        try:
            page = urllib2.urlopen(req, timeout=10)
            try:
                response = page.read()
            finally:
                # Release the connection even when read() fails.
                page.close()
            soup = BeautifulSoup(response)
            return soup, response
        except Exception:
            # Retry ordinary failures only. A bare "except:" would also trap
            # KeyboardInterrupt and SystemExit, which made this endless loop
            # impossible to stop from the keyboard.
            time.sleep(5)
def get_post_url(id):
    """Build the mobile-site URL of the post identified by ``id``.

    @param id: id of the post; it is substituted into the ``id`` query parameter
    @return: the URL string of that post
    """
    url_template = 'http://m.tianya.cn/bbs/art.jsp?item=free&id=%s'
    return url_template % id
 
def get_total_pages_number(post_id):
    """Return the total number of pages of a post as an ``int``.

    The post's URL is built with ``get_post_url``, the page is downloaded with
    ``response_soup_content`` and its raw body is searched for pager text of
    the form ``<digit>/<total>页``.

    @param post_id: id of the post
    @return: the page count taken from the pager text, or 1 when no pager text is found
    """
    post_url = get_post_url(post_id)
    print('%s %s' % ('post_url is', post_url))
    # Index 1 is the raw page body; index 0 would be the parsed soup.
    post_content = response_soup_content(post_url)[1]
    total_pages_search = re.search(r'\d\/(\d+)页', post_content)
    if total_pages_search is None:
        return 1
    # Convert the captured digits so both branches return an int; previously
    # this branch returned a str while the fallback returned the int 1.
    return int(total_pages_search.group(1))
 
def strtosecs(str_time):
    """Convert a ``'YYYY-MM-DD HH:MM'`` timestamp string to whole epoch seconds.

    The string is parsed with ``time.strptime`` and converted with
    ``time.mktime``, which interprets the value as local time.

    @param str_time: timestamp text in the format ``%Y-%m-%d %H:%M``
    @return: seconds since the epoch, truncated to ``int``
    """
    parsed = time.strptime(str_time, '%Y-%m-%d %H:%M')
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds)
 
def insert_into_mysql(conn,cur,pid,user,time,content):
    """Store one row in the ``post`` table and commit it.

    MySQLdb errors raised by the insert or the commit are printed instead of
    being propagated to the caller.

    @param conn: the MySQL connection; ``commit()`` is called on it after the insert
    @param cur: the MySQL cursor used to execute the INSERT
    @param pid: id of the post
    @param user: user name
    @param time: time the post was published (this name hides the ``time``
        module inside this function only)
    @param content: content of the post
    @return: no return value
    """
    value = [pid, user, time, content]
    try:
        # Values are passed separately so the driver does the quoting.
        cur.execute('insert into post(id,user,time,content) values(%s,%s,%s,%s)', value)
        conn.commit()
    except MySQLdb.Error as e:
        if len(e.args) >= 2:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            # Some MySQLdb errors carry only a message; indexing args[1]
            # unconditionally would raise IndexError inside this handler.
            print("Mysql Error: %s" % (e,))
 
def download_content_test(id):
    """Print the page count of post ``id``, then pause for 10 seconds.

    @param id: id of the post whose page count is looked up with
        ``get_total_pages_number``
    @return: no return value
    """
    page_count = get_total_pages_number(id)
    print('%s %s' % ('total_pages_number is', page_count))
    print('I am sleeping')
    time.sleep(10)
 
if __name__ == '__main__':
    # Script entry point: read the post ids stored in the redis set
    # 'post:timeline_test:ids' and run download_content_test for each of them
    # through a pool of size 20.
    # NOTE(review): Pool is not imported anywhere in the visible source; the
    # spawn()/join() calls look like gevent.pool.Pool - confirm and add the import.
    pool = Pool(20)
    ids_set = r.smembers('post:timeline_test:ids')
    #ids_set = {'945453','1909279'}
    #conn=MySQLdb.connect(host='localhost',user='root',passwd='young001',db='tianya',charset="utf8")
    #cur=conn.cursor()
    # Loop until the set is drained. The previous "is not None" test stayed
    # true for an empty set, so pop() eventually raised KeyError.
    while ids_set:
        post_id = ids_set.pop()
        print('%s %s' % ('now id is', post_id))
        pool.spawn(download_content_test, int(post_id))
        #r.sadd('visited_ids',post_id)
    # Wait once, after everything has been queued, so the jobs can overlap.
    # NOTE(review): the original nesting of this call was lost - confirm it
    # was not meant to run inside the loop.
    pool.join()
    # conn and cur are only created by the commented-out lines above, so
    # closing them here raised NameError. Re-enable these together with the
    # connect() lines, closing the cursor before the connection.
    #cur.close()
    #conn.close()

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.