Skip to content

Instantly share code, notes, and snippets.

@SmartHypercube
Last active July 7, 2018 07:44
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SmartHypercube/842adf2197a4c8d511da620a1daf4d4d to your computer and use it in GitHub Desktop.
Save SmartHypercube/842adf2197a4c8d511da620a1daf4d4d to your computer and use it in GitHub Desktop.
从电子邮件源码中提取文本，依赖于 BeautifulSoup（bs4）和 lxml 解析器
#!/usr/bin/python3
from codecs import getdecoder
from email import message_from_bytes
from bs4 import BeautifulSoup
# Placeholder returned by msg2text for parts that are neither text/* nor
# multipart/*; the %s is filled with the part's content type.
ATTACHMENT = '<attachment with type %s>'
# Placeholder returned when a part cannot be decoded or converted.
GARBAGE = '<cannot decode>'
def safe(func):
    """Wrap *func* so that a failure yields ``GARBAGE`` instead of raising.

    Intended as a decorator for the per-MIME-type handlers in this module.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper carrying *func*'s name and docstring that returns
        ``func(*args, **kwargs)``, or ``GARBAGE`` if that call raises an
        ``Exception``.
    """
    # Imported locally so the decorator does not rely on a module-level
    # import being present.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberately broad: any decoding or parsing failure degrades
            # to the placeholder. BaseException subclasses such as
            # KeyboardInterrupt and SystemExit still propagate.
            return GARBAGE

    return wrapper
def payload(m):
    """Return the decoded body of message part *m* as text.

    The charset declared by the part is used when Python can decode text
    with it; otherwise UTF-8 is used. Undecodable bytes are dropped.

    Args:
        m: A message part offering ``get_content_charset()`` and
            ``get_payload(decode=True)`` (e.g. ``email.message.Message``).

    Returns:
        The body as ``str``, or ``GARBAGE`` when no byte payload can be
        obtained (``get_payload`` fails or returns ``None``).
    """
    encoding = m.get_content_charset() or 'utf8'
    try:
        getdecoder(encoding)
    except LookupError:
        # Charset name unknown to Python: fall back to UTF-8.
        encoding = 'utf8'
    try:
        raw = m.get_payload(decode=True)
    except Exception:
        return GARBAGE
    if raw is None:
        # get_payload(decode=True) returns None for multipart containers;
        # there are no bytes to decode.
        return GARBAGE
    try:
        return raw.decode(encoding, errors='ignore')
    except LookupError:
        # Codecs such as 'hex' pass the getdecoder() check above but are
        # not text encodings, so bytes.decode() rejects them.
        return raw.decode('utf8', errors='ignore')
@safe
def text_plain(m):
    """Return the body of a ``text/plain`` part unchanged."""
    body = payload(m)
    return body
@safe
def text_html(m):
    """Return the text content of a ``text/html`` part.

    The body is parsed with BeautifulSoup's lxml parser and only the text
    of the resulting document is kept.
    """
    markup = payload(m)
    return BeautifulSoup(markup, 'lxml').get_text()
@safe
def text_enriched(m):
    """Return the text content of a ``text/enriched`` part.

    The body is parsed with BeautifulSoup's lxml parser and only the text
    of the resulting document is kept.
    """
    document = BeautifulSoup(payload(m), 'lxml')
    extracted = document.text
    return extracted
@safe
def multipart_alternative(m):
    """Convert a ``multipart/alternative`` message via its last sub-part.

    NOTE(review): RFC 2046 lists alternatives in increasing order of
    preference, which is presumably why the last one is chosen.
    """
    alternatives = m.get_payload()
    chosen = alternatives[-1]
    return msg2text(chosen)
@safe
def multipart_mixed(m):
    """Convert each sub-part of a ``multipart/mixed`` message.

    The converted sub-parts are joined with newlines, in message order.
    """
    converted = [msg2text(part) for part in m.get_payload()]
    return '\n'.join(converted)
@safe
def multipart_related(m):
    """Convert each sub-part of a ``multipart/related`` message.

    The converted sub-parts are joined with newlines, in message order.
    """
    return '\n'.join(msg2text(part) for part in m.get_payload())
@safe
def multipart_signed(m):
    """Convert a ``multipart/signed`` message via its first sub-part.

    NOTE(review): presumably the first sub-part is the signed content and
    the signature follows it (RFC 1847) -- confirm.
    """
    parts = m.get_payload()
    return msg2text(parts[0])
@safe
def multipart_report(m):
    """Convert a ``multipart/report`` message via its first sub-part.

    NOTE(review): presumably the first sub-part is the human-readable
    description of the report (RFC 6522) -- confirm.
    """
    first_part = m.get_payload()[0]
    return msg2text(first_part)
def msg2text(m):
    """Convert a message (or one of its parts) to plain text.

    The part's content type selects the handler:

    * ``text/plain``, ``text/html`` and ``text/enriched`` are converted by
      their dedicated handlers;
    * ``multipart/alternative``, ``/mixed``, ``/related``, ``/signed`` and
      ``/report`` are converted by their dedicated handlers;
    * anything that is neither ``text/*`` nor ``multipart/*`` becomes the
      ``ATTACHMENT`` placeholder naming its type.

    Args:
        m: A message part offering ``get_content_type()``.

    Returns:
        The extracted text, or a placeholder string.

    Raises:
        ValueError: If the type is ``text/*`` or ``multipart/*`` but has
            no dedicated handler.
    """
    mime = m.get_content_type()

    if mime.startswith('text/'):
        if mime == 'text/plain':
            return text_plain(m)
        if mime == 'text/html':
            return text_html(m)
        if mime == 'text/enriched':
            return text_enriched(m)
        raise ValueError('Unknown MIME type: %s' % mime)

    if mime.startswith('multipart/'):
        if mime == 'multipart/alternative':
            return multipart_alternative(m)
        if mime == 'multipart/mixed':
            return multipart_mixed(m)
        if mime == 'multipart/related':
            return multipart_related(m)
        if mime == 'multipart/signed':
            return multipart_signed(m)
        if mime == 'multipart/report':
            return multipart_report(m)
        raise ValueError('Unknown MIME type: %s' % mime)

    # Neither text nor a container: report it as an attachment.
    return ATTACHMENT % mime
def email2text(f):
    """Read a whole e-mail from file object *f* and return its text.

    *f* must be opened in binary mode (``'rb'``): its full contents are
    read and parsed with ``email.message_from_bytes`` before conversion.
    """
    raw = f.read()
    message = message_from_bytes(raw)
    return msg2text(message)
if __name__ == '__main__':
    # Command-line entry point: print the extracted text of every file
    # named on the command line.
    import sys

    if len(sys.argv) == 1:
        print('Usage: %s <file>' % sys.argv[0], file=sys.stderr)
        print('You can also import this and use `email2text` function.',
              file=sys.stderr)
        # sys.exit() instead of the site-provided exit(), which is meant
        # for interactive use and is missing under `python -S`.
        sys.exit(1)

    # With more than one file, each result is preceded by its path and
    # followed by a blank line.
    several = len(sys.argv) > 2
    for path in sys.argv[1:]:
        if several:
            print(path)
        with open(path, 'rb') as f:
            print(email2text(f))
        if several:
            print()
@kingzevin
Copy link

么么馒头

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment