Skip to content

Instantly share code, notes, and snippets.

@yszou
Created December 9, 2013 11:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yszou/7870934 to your computer and use it in GitHub Desktop.
Save yszou/7870934 to your computer and use it in GitHub Desktop.
获取内容的文件
# -*- coding: utf-8 -*-
'获取邮件'
import pickle
from time import time
from uuid import uuid4
import os
import sys
import re
import datetime
from os.path import join
from lxml import etree
from functools import partial
import logging
from email.mime.text import MIMEText
from email.header import make_header
import email.utils
import tornado.ioloop
import tornado.gen
import tornado.httpclient
import tornado.escape
from service.pop_client import POP
from lib.eml_parse import parse
from timer.log import LocalLog
from config import ACCOUNT, RECORD, MAILBOX, LOCAL_LOG_DIR, DEBUG, MAX_ONCE
# Global Tornado IOLoop instance; the classes below schedule their timers on
# it via ``IL.add_timeout`` and the ``__main__`` block starts it.
IL = tornado.ioloop.IOLoop.instance()

# Dedicated logger for this script. Propagation is disabled so records only
# reach the handlers attached here.
logger = logging.getLogger('AceFetch')
logger.propagate = False
logger.setLevel(logging.INFO)
fmt = logging.Formatter('--%(name)s|%(asctime)s|%(filename)s|%(lineno)s|%(levelname)s|%(message)s', '%m-%d %H:%M:%S')

# Console handler; it is only attached to the logger when DEBUG is set.
stream_hd = logging.StreamHandler()
stream_hd.setFormatter(fmt)

#if sys.platform.startswith('linux'):
#    syslog_hd = SysLogHandler(address='/dev/log', facility=SysLogHandler.LOG_LOCAL6)
#    syslog_hd.setFormatter(fmt)
#    syslog_hd.setLevel(logging.INFO)
#    logger.addHandler(syslog_hd)

# Handler that writes through LocalLog. The stream is passed positionally
# because the keyword was named ``strm`` before Python 2.7 and ``stream``
# afterwards; a positional argument is accepted by every version, whereas
# testing for the literal version prefix '2.7' picked the wrong keyword on
# any other interpreter that uses ``stream``.
local_hd = logging.StreamHandler(LocalLog)
local_hd.setFormatter(fmt)
local_hd.setLevel(logging.INFO)
logger.addHandler(local_hd)

if DEBUG:
    stream_hd.setLevel(logging.INFO)
    logger.addHandler(stream_hd)
def notify(count):
    """Pop up a desktop notification reporting ``count`` new mails.

    Runs the ``notify-send`` command through ``os.system``; the command's
    exit status is not inspected and nothing is returned.
    """
    command = 'notify-send -i /usr/share/libindicator/icons/hicolor/32x32/categories/applications-email-panel.png -t 1000 "有 %s 封新邮件"' % count
    os.system(command)
def get_box(msg, self):
    """Choose the mailbox folder name a parsed message should be filed in.

    ``msg`` is indexed by ``'to'``, ``'list'`` and ``'from'``; for each
    non-empty value only element ``[0][1]`` is looked at (presumably the
    address part of the first entry -- confirm against ``parse``).
    ``self`` supplies ``host`` and ``user`` for the per-account fallback.

    Rules are tried in this order and the first hit wins: recipient
    address, list id, sender address, (host, user) of the account.
    Returns ``'0-inbox'`` when nothing matches.
    """
    by_list = {
        'python-cn.googlegroups.com': '3-CPyUG',
        'python-tornado.googlegroups.com': '4-Tornado',
        'angular.googlegroups.com': '5-AngularJS',
    }
    by_recipient = {
        'python-cn@googlegroups.com': '3-CPyUG',
        'python-tornado@googlegroups.com': '4-Tornado',
        'angular@googlegroups.com': '5-AngularJS',
    }
    by_sender = {
        'jira@vip.sohu.com': '6-JIRA',
    }
    by_account = {
        ('mail.xxx.com', 'xxx'): '2-SOHU-INC',
    }

    # Header based rules, highest priority first.
    header_rules = (
        ('to', by_recipient),
        ('list', by_list),
        ('from', by_sender),
    )
    for field, table in header_rules:
        entries = msg[field]
        if not entries:
            continue
        folder = table.get(entries[0][1])
        if folder is not None:
            return folder

    # No header matched: fall back to the account the mail was fetched from.
    folder = by_account.get((self.host, self.user))
    return folder or '0-inbox'
class DiscuzFetch(object):
    """Poll one board of a Discuz forum and store new posts as mail files.

    Subclasses configure a concrete forum by overriding the class
    constants below. ``run`` scans the board listing, ``check_thread``
    collects the unseen posts of one thread and ``to_send`` writes each
    post as an HTML message file under ``FOLDER/new``.

    NOTE: this class relies on Python 2 string semantics
    (``str.encode('hex')``, encoding text before building headers).
    """

    # Path of the JSON file the progress marker is saved to; when empty the
    # marker is kept in memory only.
    LAST_FILE = ''
    # Progress marker: (forum id, thread id, id of the newest post handled).
    LAST = (0, 0, 0)
    # URL of the forum's ``forum.php`` entry point.
    HOST = 'http://example.com/forum.php'
    # Directory whose ``new`` sub-directory receives the generated files.
    FOLDER = '/home/zys/Mail/0-Example/'
    # Short tag used as subject prefix and inside the Message-ID.
    ID = 'Example'
    # Mailbox placed in the From header of generated messages.
    FROM = 'no-reply@cloudbbs.org'
    # Seconds to wait before re-requesting a thread page that failed.
    RETRY_DELAY = 5

    def __init__(self):
        """Copy the class-level configuration onto the instance and create
        the asynchronous HTTP client."""
        self.last = self.__class__.LAST
        self.last_file = self.__class__.LAST_FILE
        self.host = self.__class__.HOST
        self.folder = self.__class__.FOLDER
        self.id = self.__class__.ID
        self.from_mbox = self.__class__.FROM
        # Highest marker seen during the current scan; becomes ``last``
        # once the scan finishes.
        self.max = self.last[:]
        self.client = tornado.httpclient.AsyncHTTPClient()

    def to_send(self, to_do_obj):
        """Write one message file per collected post and log it.

        ``to_do_obj`` is an iterable of dicts as built by ``check_thread``
        (keys ``id``, ``thread``, ``user``, ``create``, ``html``,
        ``title``, ``page``, ``is_p``). The dicts are modified in place:
        the title gets the ``[ID]`` prefix and title/user are encoded to
        UTF-8.
        """
        # Local import: the module only imports ``time.time``.
        from time import mktime

        for obj in to_do_obj:
            obj['title'] = ('[%s] ' % self.id) + obj['title']
            obj['title'] = obj['title'].encode('utf8')
            obj['user'] = obj['user'].encode('utf8')

            msg = MIMEText(obj['html'], _subtype='html', _charset='utf-8')
            msg['Subject'] = make_header([(obj['title'], 'utf-8')])
            msg['From'] = make_header([(obj['user'], 'utf-8'), ('<%s>' % self.from_mbox, None)])
            msg['To'] = make_header([('邹业盛', 'utf-8'), ('<xxx@gmail.com>', None)])

            host_hex = self.host.encode('hex')
            # The post flagged ``is_p`` uses post id 0 in its Message-ID;
            # every other post of the thread references that id so mail
            # clients can group the thread.
            thread_root = '<Discuz-%s-%s-0-%s-%s@BJ5544>' % (self.last[0], obj['thread'], self.id, host_hex)
            if not obj['is_p']:
                msg['Message-ID'] = '<Discuz-%s-%s-%s-%s-%s@BJ5544>' % (self.last[0], obj['thread'], obj['id'], self.id, host_hex)
                msg['In-Reply-To'] = thread_root
                msg['References'] = msg['In-Reply-To']
            else:
                msg['Message-ID'] = thread_root

            msg['X-URL'] = self.host + '?' + ('mod=viewthread&tid=%s&page=%s#pid%s' % (obj['thread'], obj['page'], obj['id']))

            # The scraped timestamp is naive and treated as local time.
            # ``mktime`` replaces the former ``strftime('%s')``, which is a
            # platform-specific extension.
            created = datetime.datetime.strptime(obj['create'], '%Y-%m-%d %H:%M:%S')
            msg['Date'] = email.utils.formatdate(int(mktime(created.timetuple())), localtime=True)

            filename = join(self.folder, 'new', msg['Message-ID'].split('@', 1)[0])
            with open(filename, 'w') as f:
                f.write(msg.as_string())
            logger.info('[Discuz]%s|%s|%s' % (obj['title'], obj['user'], obj['id']))

    @tornado.gen.engine
    def check_thread(self, thread, callback):
        """Collect and deliver the posts of ``thread`` not handled yet.

        Requests the thread's ``goto=lastpost`` redirect, takes the page
        count from the effective URL, then walks the pages from the last
        one down to the first, visiting the posts of each page in reverse
        document order. The walk stops at the first post whose id is not
        greater than ``self.last[2]``. Collected posts are handed to
        ``to_send`` in the reverse of the collection order.

        ``callback`` is called with ``is_continue``: ``False`` when the
        first post examined on the last page was already handled (nothing
        new in this thread), ``True`` otherwise. If the initial request
        fails the whole call is rescheduled after 5 seconds and the
        callback is not invoked by this attempt.
        """
        to_do_html = []
        url = self.host + '?' + ('mod=redirect&tid=%s&goto=lastpost' % thread)
        res = yield tornado.gen.Task(self.client.fetch, url)
        if res.error:
            IL.add_timeout(int(time()) + 5, partial(self.check_thread, thread, callback))
            return

        root = etree.HTML(res.body)
        title = root.xpath('.//div[@id="pt"]/div/a')[-1].text
        page_numbers = re.findall(r'page=(\d+)', res.effective_url)
        sum_page = int(page_numbers[0]) if page_numbers else 1

        page = sum_page
        is_continue = True
        while page > 0:
            url = self.host + '?' + ('mod=viewthread&tid=%s&page=%s' % (thread, page))
            while 1:
                res = yield tornado.gen.Task(self.client.fetch, url)
                if not res.error:
                    break
                # Wait before retrying instead of re-requesting in a tight
                # loop while the server is failing.
                yield tornado.gen.Task(IL.add_timeout, time() + self.RETRY_DELAY)
            root = etree.HTML(res.body)

            post_list = root.xpath('.//div[@id="postlist"]/div')
            post_list.reverse()
            for i, p in enumerate(post_list):
                post_id = p.get('id', '')
                if not post_id.startswith('post_'):
                    continue
                post_id = int(post_id.split('_', 1)[1])

                if post_id <= self.last[2]:
                    # Last page, last post (post_list contains one element
                    # that is not a post, hence index 1).
                    if page == sum_page and i == 1:
                        is_continue = False
                    # Force the outer loop to end after this page.
                    page = 1
                    break

                user = p.xpath('.//a[@class="xw1"]')[0].text
                create = p.xpath('.//em[@id="authorposton%s"]/span' % post_id)
                if create:
                    create = create[0].get('title')
                else:
                    create = p.xpath('.//em[@id="authorposton%s"]' % post_id)[0].text.split(' ', 1)[1]

                # Turn the message cell into a plain <div> before
                # serialising it as the mail body.
                html = p.xpath('.//td[@id="postmessage_%s"]' % post_id)[0]
                html.tag = 'div'
                del html.attrib['id']
                del html.attrib['class']
                html = etree.tostring(html, method='html', encoding='utf8')

                is_p = bool(p.xpath('.//div[@id="p_btn"]'))
                to_do_html.append({
                    'id': post_id,
                    'thread': thread,
                    'user': user,
                    'create': create,
                    'html': html,
                    'title': title,
                    'page': page,
                    'is_p': is_p,
                })
                if post_id > self.max[-1]:
                    self.max = (self.last[0], thread, post_id)
            page -= 1

        to_do_html.reverse()
        self.to_send(to_do_html)
        callback(is_continue)

    @tornado.gen.engine
    def run(self):
        """Scan the board listing and process threads until caught up.

        Requests the listing of forum ``self.last[0]`` ordered by last
        post, page after page, and calls ``check_thread`` for every
        ``normalthread`` row. When a thread reports nothing new the
        progress marker is advanced to ``self.max``, saved to
        ``last_file`` (if one is configured) and the next scan is
        scheduled 30 seconds later. A failed request or a page without
        thread rows also reschedules the scan after 30 seconds.
        """
        page = 0
        while 1:
            page += 1
            url = self.host + '?' + ('mod=forumdisplay&filter=lastpost&orderby=lastpost&fid=%s&page=%s' % (self.last[0], page))
            res = yield tornado.gen.Task(self.client.fetch, url)
            if res.error:
                IL.add_timeout(int(time()) + 30, self.run)
                return

            root = etree.HTML(res.body)
            tb_list = root.xpath('.//form[@id="moderate"]/table/tbody')
            if not tb_list:
                IL.add_timeout(int(time()) + 30, self.run)
                return

            for tb in tb_list:
                row_id = tb.get('id', '')
                if not row_id.startswith('normalthread'):
                    continue
                thread = int(row_id.split('_', 1)[1])
                is_continue = yield tornado.gen.Task(self.check_thread, thread)
                if is_continue:
                    continue

                self.last = self.max[:]
                # LAST_FILE defaults to '' for fetchers without a state
                # file; opening '' would raise, so only persist when set.
                if self.last_file:
                    with open(self.last_file, 'w') as f:
                        f.write(tornado.escape.json_encode(self.last).encode('utf8'))
                IL.add_timeout(int(time()) + 30, self.run)
                return
class CloudBBS(DiscuzFetch):
    """DiscuzFetch configuration for cloudbbs.org, with the progress marker
    stored in ``LAST_FILE``.

    The marker file is read while the class body executes, i.e. when this
    module is imported; a missing or malformed file raises at that point.
    """
    LAST_FILE = '/home/zys/Dropbox/last/cloudbbs.org'
    # Load the saved marker (a JSON array) at class-definition time.
    # ``f`` and ``data`` remain as class attributes as a side effect.
    with open(LAST_FILE, 'r') as f:
        data = f.read().strip()
    data = tornado.escape.json_decode(data)
    LAST = tuple(data)
    HOST = 'http://cloudbbs.org/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    ID = 'CloudBBS'
    FROM = 'no-reply@cloudbbs.org'
class CloudBBS_41(DiscuzFetch):
    """Second DiscuzFetch configuration for cloudbbs.org that keeps its own
    progress marker in a separate ``LAST_FILE``.

    HOST, FOLDER, ID and FROM are the same values as in ``CloudBBS``; the
    forum id polled comes from the first element of the stored marker.
    The marker file is read while the class body executes, i.e. when this
    module is imported; a missing or malformed file raises at that point.
    """
    LAST_FILE = '/home/zys/Dropbox/last/cloudbbs-41.org'
    # Load the saved marker (a JSON array) at class-definition time.
    # ``f`` and ``data`` remain as class attributes as a side effect.
    with open(LAST_FILE, 'r') as f:
        data = f.read().strip()
    data = tornado.escape.json_decode(data)
    LAST = tuple(data)
    HOST = 'http://cloudbbs.org/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    ID = 'CloudBBS'
    FROM = 'no-reply@cloudbbs.org'
class Anshida(DiscuzFetch):
    """DiscuzFetch configuration for the anshida.net forum.

    Uses a hard-coded starting marker and does not override ``LAST_FILE``.
    Its ``run()`` call in the ``__main__`` block is commented out.
    """
    # (forum id, thread id, last handled post id). The ``-1`` presumably
    # makes post 1714985 itself count as new -- confirm intent.
    LAST = (74, 495699, 1714985-1)
    HOST = 'http://www.anshida.net/BBS/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    ID = 'Anshida'
    FROM = 'no-reply@anshida.net'
class Fetch(object):
    """Periodic fetcher for one POP account.

    ``run`` opens a ``POP`` session that calls back into ``fetch``;
    ``fetch`` retrieves messages whose UIDL is not yet in the pickled
    RECORD file, stores them through ``parse_body`` and re-arms the timer.
    """

    def __init__(self, host, port, user, password, ssl=False, name=None, *args, **kargs):
        """Remember the connection settings.

        The polling interval (seconds) is read from
        ``kargs['all']['interval']``, so the ``all`` keyword is required.
        """
        self.host, self.port = host, port
        self.user, self.password = user, password
        self.ssl = ssl
        self.name = name
        self.interval = kargs['all']['interval']
        # Handle of the pending IOLoop timeout, set at the end of fetch().
        self.timer = None

    def run(self):
        """Log the check and open a new POP session with ``fetch`` as its
        callback."""
        logger.info('%s|CHECK' % self.name)
        self.pop = POP(self.host, self.port, self.user, self.password, self.fetch, self.ssl)

    def parse_body(self, body):
        """Parse a RETR response and store the message in its mailbox.

        The first line of ``body`` is dropped before parsing. The message
        is written to ``MAILBOX/<box>/new/<timestamp>.<uuid hex>`` unless
        its subject contains ``'corp_bizweb'``.

        Returns the tuple ``(msg, box)``.
        """
        remainder = body.split('\n', 1)[1]
        msg = parse(remainder.lstrip())
        box = get_box(msg, self)
        unique_name = '%s.%s' % (int(time()), uuid4().hex)
        target = join(MAILBOX, box, 'new', unique_name)
        if 'corp_bizweb' in msg['subject']:
            # Such messages are parsed and reported but never stored.
            return msg, box
        with open(target, 'wb') as out:
            out.write(msg['source'].read())
        return msg, box

    @tornado.gen.engine
    def fetch(self, pop):
        """Download unseen messages over ``pop`` and schedule the next run.

        The UIDL listing is processed in reverse order. Entries already
        recorded for this account are skipped, and the loop stops once the
        position of an unseen entry exceeds MAX_ONCE (when MAX_ONCE > 0).
        Afterwards the RECORD file is re-read, this account's UIDL set is
        replaced and the file is written back.
        """
        listing = yield tornado.gen.Task(pop.uidl, '')
        # Drop the first and last elements of the split response.
        mail_list = [line.split(' ', 1) for line in listing.split('\r\n')][1:-1]
        mail_list.reverse()

        if not os.access(RECORD, os.F_OK):
            in_db = set([])
        else:
            with open(RECORD, 'rb') as f:
                in_db = pickle.load(f).get(self.name, set([]))

        new = 0
        for count, (number, uidl) in enumerate(mail_list, 1):
            if uidl in in_db:
                continue
            if MAX_ONCE > 0 and count > MAX_ONCE:
                break
            body = yield tornado.gen.Task(pop.retr, number)
            msg, box = self.parse_body(body)
            new += 1
            in_db.add(uidl)
            logger.info('%s|%s -> %s, %s/%s' % (self.name, msg['subject'].encode('utf8'), box, count, len(mail_list)))

        if new:
            notify(new)

        # Re-read the record file before writing so that only this
        # account's entry is replaced.
        if not os.access(RECORD, os.F_OK):
            records = {self.name: in_db}
        else:
            with open(RECORD, 'rb') as f:
                records = pickle.load(f)
            records[self.name] = in_db
        with open(RECORD, 'wb') as f:
            pickle.dump(records, f)

        reply = yield tornado.gen.Task(pop.quit)
        logger.info('%s|%s|COMPLETE' % (self.name, reply))
        self.timer = IL.add_timeout(int(time()) + self.interval, self.run)
if __name__ == '__main__':
    # Start one POP fetcher per configured account; the account's config
    # dict is also passed whole as ``all`` (Fetch reads its interval there).
    for name, conf in ACCOUNT.items():
        fetcher = Fetch(conf['host'], conf['port'], conf['user'], conf['pass'], conf['ssl'], name, all=conf)
        fetcher.run()

    # Start the forum pollers.
    CloudBBS().run()
    CloudBBS_41().run()
    #Anshida().run()

    LocalLog(1000 * 10, LOCAL_LOG_DIR).start()
    # Blocks here and drives everything scheduled above.
    IL.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment