Created
February 11, 2009 09:14
-
-
Save kindy61/61926 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
版权:www.lalfa.com ealpha@gmail.com | |
转载请不要删除 | |
todo :设置一个字段表示,是否成功更新,一旦成功则记录ID,系统运行结束,进行update。否则发送短信。保证只发送一次 | |
""" | |
import os,urllib2,re,MySQLdb,datetime,time,smtplib | |
from BeautifulSoup import BeautifulSoup | |
from StringIO import StringIO | |
from email.mime.text import MIMEText | |
# HTTP settings used when requesting the forecast pages.
USER_AGENT = ' Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1'
BASE_URL_BEGIN = 'http://www.weather.com.cn/html/weather/'
BASE_URL_END = '.shtml'

# Database connection shared by the whole script; opened when the module loads.
conn = MySQLdb.connect(
    host="localhost",
    user="fun",
    passwd="fun",
    db="fun",
    use_unicode=1,
    charset='utf8'
)

# Mail: addresses that receive the monitoring mail.
mailto_list = ["XXXXX@XXXXX.com"]

# SMTP server, user name, password and the domain suffix of the mailbox.
mail_host = "XXXX.com"
mail_user = "XXXX"
mail_pass = "XXXX"
mail_postfix = " XXXX.com"

# WIDs that failed; recorded to decide whether a mail has to be sent.
faultwid = []

# Number of retries after a failure.
dotime = 0
def send_mail(to_list, sub, content):
    """Send a plain-text mail through the configured SMTP server.

    to_list: recipient address, or a list of recipient addresses
    sub:     subject line
    content: body text

    Returns True when the message was handed to the server and False on
    any failure (the error text is printed).

    Example: send_mail(["aaa@126.com"], "sub", "content")
    """
    # A bare address string would otherwise be joined character by
    # character when the To header is built.
    if isinstance(to_list, (str, type(u""))):
        to_list = [to_list]
    # Stray whitespace in the configured domain would give an invalid address.
    me = mail_user + "<" + mail_user + "@" + mail_postfix.strip() + ">"
    msg = MIMEText(content)
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are separated by commas, not semicolons.
    msg['To'] = ", ".join(to_list)
    s = None
    try:
        s = smtplib.SMTP()
        s.connect(mail_host)
        s.login(mail_user, mail_pass)
        s.sendmail(me, to_list, msg.as_string())
        return True
    except Exception as e:
        print(str(e))
        return False
    finally:
        # Release the socket even when connect/login/sendmail failed.
        if s is not None:
            s.close()
def getFiveDayWeather(wid, pageid, agent=USER_AGENT, timeout=30):
    """Download one forecast page and hand its forecast section on for parsing.

    wid:     identifier passed through unchanged to getWeatherList
    pageid:  page id inserted between BASE_URL_BEGIN and BASE_URL_END
    agent:   value sent as the User-Agent header
    timeout: socket timeout in seconds for the HTTP request

    Returns the child nodes of the element that contains the 'dd_0' div.
    Raises ValueError when the page has no such div; network errors from
    urllib2 propagate to the caller.
    """
    url = BASE_URL_BEGIN + pageid + BASE_URL_END
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    # Without a timeout a stalled server would block the whole run.
    response = urllib2.build_opener().open(request, timeout=timeout)
    try:
        raw = response.read()
    finally:
        # Always release the connection, also when reading fails.
        response.close()
    soup = BeautifulSoup(raw, fromEncoding="utf-8")
    anchor = soup.find('div', id='dd_0')
    if anchor is None:
        raise ValueError("forecast block 'dd_0' not found in %s" % url)
    html = anchor.parent.contents
    getWeatherList(wid, html)
    return html
def getWeatherList(wid, html):
    """Read the publish time and dispatch each daily forecast box.

    The last child node of the first <h2> is used as the update time.
    Every <div class="fut_weatherbox"> is passed to getOneDayWeather
    together with its 1-based position.
    """
    forecast = BeautifulSoup(str(html))
    heading = forecast.find('h2')
    update_time = ''
    # Keep the last child of the heading; stays '' when it has no children.
    for node in heading:
        update_time = node
    day_boxes = forecast.findAll('div', {"class": "fut_weatherbox"})
    for position, box in enumerate(day_boxes):
        getOneDayWeather(wid, position + 1, update_time, box)
def _first_content(soup, name, css_class=None):
    """Return the rendered inner markup of the first matching tag in soup."""
    if css_class is None:
        tag = soup.find(name)
    else:
        tag = soup.find(name, {"class": css_class})
    if tag is None:
        raise ValueError("missing <%s> (class %r) in day forecast" % (name, css_class))
    return tag.renderContents()


def _pic_code(img):
    """Return the second-to-last dot-separated part of the image file name."""
    return img['src'].split('/')[-1].split('.')[-2]


def getOneDayWeather(wid, dayid, update_time, html):
    """Parse the forecast markup of one day and store it via insertDB.

    wid:         identifier passed through to insertDB
    dayid:       position of the day within the forecast
    update_time: publish time passed through to insertDB
    html:        markup (or parsed tag) of one daily forecast box

    Raises ValueError when an expected element is missing.
    """
    # Parse once and query the tree directly; the fields do not need to be
    # serialised and parsed again one by one.
    soup = BeautifulSoup(str(html), fromEncoding="UTF-8")
    day_value = _first_content(soup, 'h3')
    imgs = soup.findAll('img')
    # The first two images are used as the day and the night icon.
    if len(imgs) < 2:
        raise ValueError("expected two forecast icons, found %d" % len(imgs))
    d_pic_value = _pic_code(imgs[0])
    n_pic_value = _pic_code(imgs[1])
    weather_value = _first_content(soup, 'h4', 'temp00_dn')
    max_temp = _first_content(soup, 'h4', 'temp01_dn')
    min_temp = _first_content(soup, 'h4', 'temp02_dn')
    wind = _first_content(soup, 'h4', 'temp03_dn')
    insertDB(wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)
def insertDB(wid, dayid, update_time, day_value, d_pic_value, n_pic_value, weather_value, max_temp, min_temp, wind):
    """Insert one row into weatherdetail and commit it.

    The arguments map, in order, to the columns wid, dayid, lastupdate,
    currdate, dpic, npic, weather, maxtemp, mintemp and wind.

    On failure the transaction is rolled back and the error is re-raised.
    TODO: commit all days of one location together instead of row by row.
    """
    sql = "INSERT INTO weatherdetail( wid, dayid, lastupdate, currdate, dpic, npic,weather, maxtemp, mintemp, wind) VALUES( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    param = (wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)
    cursor = conn.cursor()
    try:
        cursor.execute(sql, param)
        conn.commit()
    except Exception:
        # Do not leave a half-finished transaction on the shared connection.
        conn.rollback()
        raise
    finally:
        cursor.close()
def sendMonitor():
    """Send an alert mail listing the failed WIDs, if there are any.

    Does nothing when faultwid is empty. Otherwise mails str(faultwid) to
    mailto_list and prints whether sending succeeded.
    """
    if not faultwid:
        return
    subject = "Error: Get Weather Error " + str(datetime.datetime.now())
    delivered = send_mail(mailto_list, subject, str(faultwid))
    if delivered:
        print("监控邮件发送成功.")
    else:
        print("监控邮件发送失败.")
def doworking(dotime, wid, pageid):
    """Fetch and store the forecast of one location, retrying on errors.

    dotime: number of retries already used; callers pass 0 for a new request
    wid:    identifier of the location, recorded in faultwid on final failure
    pageid: page id passed to getFiveDayWeather

    The fetch is retried until dotime reaches 3, waiting five seconds
    between attempts. When the last attempt fails, wid is appended to
    faultwid. Errors are printed, never raised.
    """
    # NOTE(review): each retry re-runs the whole page; rows stored by an
    # earlier, partially successful attempt may be inserted again - verify.
    while True:
        try:
            getFiveDayWeather(wid, pageid)
            return
        except Exception as e:
            # Show the cause; it was previously discarded.
            print(str(e))
            if dotime >= 3:
                # Out of retries: record the failure without a pointless wait.
                print("giving up on %s %s after repeated errors." % (wid, pageid))
                faultwid.append(wid)
                return
            print("has one error on %s %s , then do it again , waiting five secs." % (wid, pageid))
            time.sleep(5)
            dotime += 1
if __name__ == "__main__":
    # Script entry point: process every row of the weather table, then
    # send the monitoring mail and print the elapsed seconds.
    starttime = datetime.datetime.now()
    print("Start." + str(starttime))
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT id,weather_com_cn_pageid FROM weather")
            result = cursor.fetchall()
        finally:
            cursor.close()
        for record in result:
            # Retry count 0 marks the first attempt for this request.
            doworking(0, str(record[0]), record[1])
            # NOTE(review): the original indentation was lost; this separator
            # is assumed to belong to the loop body - confirm.
            print('\r')
        endtime = datetime.datetime.now()
        print("End." + str(endtime))
        print("-------------------------------------------------")
        sendMonitor()
        elapsed = endtime - starttime
        # timedelta.seconds alone drops whole days; add them back.
        print(elapsed.days * 86400 + elapsed.seconds)
    finally:
        # Close the shared connection once all work is done.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment