Skip to content

Instantly share code, notes, and snippets.

Created October 26, 2012 03:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/3956734 to your computer and use it in GitHub Desktop.
Save anonymous/3956734 to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*-
from BeautifulSoup import BeautifulSoup,SoupStrainer
import time
import urllib
import urllib2
import re
import time as ti
import os
import sys
import json
import redis
import chardet
import MySQLdb
# Python 2 idiom: reload sys so that setdefaultencoding can be called, then
# make UTF-8 the process-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')
# Redis client built with the library's default connection settings; the
# script entry point reads the set of post ids from it.
r = redis.Redis()
# User-Agent header sent with every HTTP request made by response_soup_content.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0'
# Base URL of the mobile BBS; not referenced by the code visible in this file.
main_url='http://m.tianya.cn/bbs/'
vu = '48886414191' # Tianya's mobile site uses a `vu` value that differs on every visit
#links_of_leiid = SoupStrainer('a', href=re.compile('leiid'))
#soup = BeautifulSoup(response,parseOnlyThese=links_of_leiid)
def get_encoding(string):
    """Return the name of the encoding that chardet detects for ``string``.

    The value is whatever chardet reports under the ``'encoding'`` key of
    its detection result.
    """
    detection = chardet.detect(string)
    return detection['encoding']
def response_soup_content(url, timeout=10, retry_delay=5, max_retries=None):
    """Download ``url`` and return ``(soup, raw_response)``.

    The request is sent with the module-level ``user_agent`` header. On
    failure the download is retried after ``retry_delay`` seconds.

    Args:
        url: address to fetch.
        timeout: per-request timeout in seconds passed to ``urlopen``.
        retry_delay: seconds to sleep between attempts.
        max_retries: maximum number of retries after the first attempt;
            ``None`` (the default) retries forever, as the original code did.

    Returns:
        A tuple of the parsed BeautifulSoup document and the raw response body.

    Raises:
        The last download/parse exception once ``max_retries`` is exhausted.
    """
    req = urllib2.Request(url, headers={'User-Agent': user_agent})
    failures = 0
    while True:
        try:
            response = urllib2.urlopen(req, timeout=timeout).read()
            soup = BeautifulSoup(response)
            return soup, response
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the crawler instead of being retried.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)
def get_post_url(id):
    """Return the mobile-site URL of the post identified by ``id``."""
    url_template = 'http://m.tianya.cn/bbs/art.jsp?item=free&id=%s'
    return url_template % id
def get_total_pages_number(post_id):
    """Return the total number of pages of a post, always as an int.

    Fetches the post's first page and searches the raw response for pager
    text of the form ``<digit>/<total>页``.

    Args:
        post_id: id of the post, as accepted by ``get_post_url``.

    Returns:
        The page total found in the pager, or 1 when no pager text is found.
    """
    post_url = get_post_url(post_id)
    print('post_url is %s' % post_url)
    post_content = response_soup_content(post_url)[1]
    total_pages_search = re.search(r'\d/(\d+)页', post_content)
    if total_pages_search is None:
        return 1
    # Convert the captured digits so both branches return the same type.
    return int(total_pages_search.group(1))
def strtosecs(str_time):
    """Convert a ``'YYYY-MM-DD HH:MM'`` local-time string to integer epoch seconds."""
    parsed = time.strptime(str_time, '%Y-%m-%d %H:%M')
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds)
def insert_into_mysql(conn,cur,pid,user,time,content):
    """Insert one row into the ``post`` table and commit it.

    Args:
        conn: database connection; used to commit, or to roll back on failure.
        cur: cursor used to execute the insert.
        pid: id of the post.
        user: user name.
        time: time the post was published (shadows the ``time`` module
            inside this function).
        content: text of the post.

    Returns:
        None. A ``MySQLdb.Error`` is reported on stdout rather than raised.
    """
    row = [pid, user, time, content]
    try:
        cur.execute('insert into post(id,user,time,content) values(%s,%s,%s,%s)', row)
        conn.commit()
    except MySQLdb.Error as e:
        # Discard the failed statement so it does not linger in the
        # connection's open transaction. Best effort: the connection itself
        # may be what failed, and the original error is the one worth reporting.
        try:
            conn.rollback()
        except MySQLdb.Error:
            pass
        if len(e.args) >= 2:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            # Some errors carry only a message; avoid an IndexError here.
            print("Mysql Error: %s" % (e,))
def download_content_test(id, pause_seconds=10):
    """Look up and print a post's page count, then pause.

    Args:
        id: id of the post, passed to ``get_total_pages_number``.
        pause_seconds: how long to sleep after the lookup; defaults to the
            10 seconds that were previously hard-coded.
    """
    total_pages_number = get_total_pages_number(id)
    print('total_pages_number is %s' % total_pages_number)
    print('I am sleeping')
    time.sleep(pause_seconds)
if __name__ == '__main__':
    # Script entry point: read the post ids from Redis and process each one
    # with download_content_test on a pool of 20 workers.
    #
    # `Pool` was never imported in this file; the standard library's thread
    # pool is used so the script runs without an extra dependency.
    from multiprocessing.dummy import Pool

    pool = Pool(20)
    ids_set = r.smembers('post:timeline_test:ids')
    #ids_set = {'945453','1909279'}
    # NOTE(review): the disabled connection below hard-codes credentials;
    # load them from configuration if it is re-enabled.
    #conn=MySQLdb.connect(host='localhost',user='root',passwd='young001',db='tianya',charset="utf8")
    #cur=conn.cursor()
    pending = []
    # Iterate the set instead of popping until it raises KeyError when empty.
    for post_id in ids_set:
        print('now id is %s' % post_id)
        pending.append(pool.apply_async(download_content_test, (int(post_id),)))
        #r.sadd('visited_ids',post_id)
    pool.close()
    pool.join()
    # Surface any exception raised inside a worker instead of losing it.
    for task in pending:
        task.get()
    # The database connection above is disabled, so there is nothing to
    # close; re-enable these (cursor first) together with the connect lines.
    #cur.close()
    #conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment