Skip to content

Instantly share code, notes, and snippets.

@Wizmann
Created February 25, 2013 14:05
Show Gist options
  • Save Wizmann/5029979 to your computer and use it in GitHub Desktop.
Save Wizmann/5029979 to your computer and use it in GitHub Desktop.
一个很挫很挫的抓取BeijingAir的程序
#coding=utf-8
import urllib
import urllib2
import cookielib
import httplib
import json
import re
import time
import datetime
import os
import sys
import socket
import sqlite3
import logging
import renren
import HTMLParser
import MySQLdb
# Python 2-only hack: reload(sys) makes sys.setdefaultencoding available
# again (the interpreter removes it from the module during startup), and the
# call below then makes UTF-8 the process-wide codec used for implicit
# str <-> unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
class AirParser(HTMLParser.HTMLParser):
    """Pull tweet texts and timestamps out of a timeline HTML page.

    Each hit is reported through ``callback(value, kind)``:

    * ``kind == 'context'``: the first non-blank text chunk following a
      ``<p>`` tag whose only attribute is ``class="js-tweet-text"``.
    * ``kind == 'time'``: the value of a ``data-time`` attribute found on a
      ``<span>`` tag.

    Exceptions raised by ``callback`` propagate to the caller of ``feed()``.
    """

    def __init__(self, callback):
        # Explicit base-class call (rather than super()) keeps this working
        # with the Python 2 ``HTMLParser`` module imported by this file.
        HTMLParser.HTMLParser.__init__(self)
        # True between a matching <p> tag and its first non-blank text chunk.
        self.ready = False
        self.callback = callback

    def handle_starttag(self, tag, attrs):
        """Arm text capture on tweet paragraphs; report span timestamps."""
        if tag == 'p' and attrs == [('class', 'js-tweet-text')]:
            self.ready = True
        if tag != 'span':
            return
        # Iterating the (possibly empty) attribute list directly needs no
        # exception handling, so callback failures are no longer swallowed.
        for key, value in attrs:
            # A valueless attribute (value is None) carries no timestamp.
            if key == 'data-time' and value is not None:
                self.callback(value, 'time')

    def handle_data(self, data):
        """Report the first non-blank text chunk after an armed <p> tag."""
        text = data.strip()
        if self.ready and text:
            self.callback(text, 'context')
            # Only the first chunk is taken; disarm until the next match.
            self.ready = False
def get_log():
    """Return the shared 'weibo' logger that appends to ``beijing_air.log``.

    The file handler is attached only the first time; later calls reuse the
    already configured logger, so calling this repeatedly no longer opens
    extra file handles or writes each message several times.
    """
    log = logging.getLogger('weibo')
    if not log.handlers:
        handler = logging.FileHandler('beijing_air.log', 'a')
        fmt = logging.Formatter("%(levelname)-8s %(asctime)-15s [%(filename)s,%(lineno)d] %(message)s")
        handler.setFormatter(fmt)
        log.addHandler(handler)
    log.setLevel(logging.DEBUG)
    return log
def check_updated(air):
    """Record unseen entries in ``beijing_air.sqlite`` and return the newest.

    ``air`` is an iterable of ``(text, addtime)`` pairs. A pair is new when
    no row of table ``air`` has the same ``addtime``; new pairs are inserted.
    The table must already exist.

    Returns the texts of the new pairs, in input order, limited to the last
    three.
    """
    # Placeholders let sqlite3 do the quoting. The previous string-built SQL
    # used MySQL escaping rules, which SQLite does not understand.
    sql_query = 'SELECT `id` FROM `air` WHERE `addtime`=?;'
    sql_insert = 'INSERT INTO `air` (`addtime`,`context`) VALUES (?,?);'
    fresh = []
    conn = sqlite3.connect('beijing_air.sqlite')
    try:
        cursor = conn.cursor()
        for item in air:
            text = item[0]
            addtime = int(item[1])
            cursor.execute(sql_query, (addtime,))
            if cursor.fetchone() is not None:
                continue
            fresh.append(text)
            # Bind text rather than raw bytes: Python 2 sqlite3 rejects
            # non-ASCII byte strings passed as parameters.
            if isinstance(text, bytes):
                stored = text.decode('utf-8', 'replace')
            else:
                stored = text
            cursor.execute(sql_insert, (addtime, stored))
        cursor.close()
        conn.commit()
    finally:
        # Always release the database file, even when a statement fails.
        conn.close()
    return fresh[-3:]
def show(addr):
    """Fetch ``addr``, extract its tweets and post the unseen ones.

    The page is downloaded with a browser User-Agent, dumped to a
    timestamped file under ``/tmp``, and parsed with ``AirParser``. Texts are
    paired with timestamps, passed oldest-page-position-last to
    ``check_updated``, and every text it returns is printed, logged and sent
    through ``renren.show``, with a 30 second pause after each post.

    Network, parsing and database errors propagate to the caller.
    """
    logger = get_log()
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11'
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    req = urllib2.Request(addr)
    req.add_header('User-Agent', user_agent)
    logger.info("try to fetch page...")
    response = urllib2.urlopen(req, timeout=60)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # Keep a copy of the raw page for debugging.
    dump_name = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M") + '.txt'
    with open(os.path.join('/tmp', dump_name), 'w') as pagefile:
        pagefile.write(content)

    texts = []
    timestamps = []

    def air_callback(item, ttype):
        # The parser reports either a tweet text or its data-time value.
        if ttype == 'context':
            texts.append(item)
        else:
            timestamps.append(int(item))

    air_parser = AirParser(air_callback)
    air_parser.feed(content)

    if len(texts) != len(timestamps):
        # zip() below truncates to the shorter list, so pairs may be shifted.
        logger.warning("parsed %d texts but %d timestamps; pairing may be off",
                       len(texts), len(timestamps))
    pairs = list(zip(texts, timestamps))

    for item in check_updated(pairs[::-1]):
        print(item)
        logger.info(item)
        renren.show(item + '(@BeijingAir)')
        time.sleep(30)
if __name__ == '__main__':
    # Run the https-then-http fetch cycle, retrying the whole cycle on any
    # error, for at most 30 attempts; stop after the first clean run.
    attempt = 0
    while attempt < 30:
        attempt += 1
        print('attempted %d...' % attempt)
        try:
            show('https://twitter.com/beijingair')
            time.sleep(30)
            show('http://twitter.com/beijingair')
        except Exception as e:
            print(e)
        else:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment