# xtao/decode_email.py

## decode_email.py
import imaplib, email, email.parser, email.policy
import html2text
import pprint, sys

#
# Utilities for Debug
#
# Shared pretty-printer for debug dumps (4-space indent, 80-column width).
pp = pprint.PrettyPrinter(indent=4, width=80)

def myprint(str):
    """Print *str* after round-tripping it through the cp932 codec.

    Encoding uses the ``replace`` error handler, so every character that
    cp932 cannot represent is printed as ``?`` instead of raising.
    (Presumably this keeps output safe on a cp932 console -- confirm.)
    """
    cp932_bytes = str.encode("cp932", "replace")
    printable = cp932_bytes.decode("cp932")
    print(printable)


#
# search mails in IMAP
#
def search_imap(host="imap.mail.yahoo.co.jp", user="sdkn104@yahoo.co.jp",
                password="xxxxxx", mailbox="Inbox", criteria="(ALL)"):
    """Fetch every message matching *criteria*, save it and print its text.

    For each matching message the raw RFC 822 bytes are written to
    ``mail_<num>.txt`` in the current directory, then the decoded subject,
    date, sender and body are printed with ``myprint``.

    Args:
        host: IMAP-over-SSL server name.
        user: Login name.
        password: Login password.
        mailbox: Mailbox to select.
        criteria: IMAP SEARCH criteria, e.g. ``"UNSEEN"``.

    Every parameter defaults to the value that used to be hard-coded, so a
    bare ``search_imap()`` call keeps working.
    """
    # NOTE(review): credentials should not live in source code; pass them in
    # (or read them from the environment) rather than relying on the defaults.
    mail = imaplib.IMAP4_SSL(host)
    selected = False
    try:
        mail.login(user, password)

        typ, _ = mail.select(mailbox)
        if typ != "OK":
            print("cannot select mailbox " + mailbox)
            return
        selected = True

        typ, data = mail.search(None, criteria)
        print("searched")
        print(typ)
        if typ != "OK":
            return

        # The search result is one space-separated bytes string of numbers.
        nums = data[0].split() if data and data[0] else []
        pp.pprint(nums)

        # An empty result list is the only reliable "nothing found" signal;
        # comparing the bytes payload with '' is always unequal in Python 3.
        if nums:
            print("New Mail")
        else:
            print("Non")

        for num in nums:
            # Message numbers arrive as bytes; decode them so that file
            # names and log lines read "1" instead of "b'1'".
            msg_id = num.decode("ascii")
            print("START " + msg_id + "--------------------------------------------------")

            # Fetch the whole message in RFC 822 format.
            result, d = mail.fetch(num, "(RFC822)")
            if result != "OK" or not d or not isinstance(d[0], tuple):
                print("fetch failed for " + msg_id)
                continue
            raw = d[0][1]

            # Keep a verbatim copy of the message on disk.
            with open("mail_" + msg_id + ".txt", "wb") as f:
                f.write(raw)

            print("####################################")

            msg = email2Text(raw)

            print("------------------------------------")
            for key in ("subject", "date", "from", "body"):
                myprint(msg[key])
    finally:
        # Always release the connection, even when something above failed.
        # CLOSE is only valid once a mailbox has been selected.
        try:
            if selected:
                mail.close()
        finally:
            mail.logout()

#
# Get subject, date, from and body as text from email RFC822 style string
#
def email2Text(rfc822mail):
    """Extract sender, date, subject and text body from a raw message.

    Args:
        rfc822mail: The complete RFC 822 message as ``bytes``.

    Returns:
        A dict with the keys ``"from"``, ``"date"``, ``"subject"`` and
        ``"body"``. All values are ``str``: a missing header yields ``""``
        and a message without any text part yields an empty body.
    """
    msg_data = email.message_from_bytes(rfc822mail, policy=email.policy.default)

    # Default to "" so an absent header never reaches header_decode as None.
    mail_value = {
        "from": header_decode(msg_data.get("From", "")),
        "date": header_decode(msg_data.get("Date", "")),
        "subject": header_decode(msg_data.get("Subject", "")),
    }

    # walk() yields only the message itself when it is not multipart and
    # the container plus every sub-part otherwise, so one loop covers both
    # cases. Non-text parts come back as None and are skipped.
    texts = (msg2bodyText(part) for part in msg_data.walk())
    mail_value["body"] = "".join(text for text in texts if text is not None)

    return mail_value

#
# get body text from a message (EmailMessage instance)
#
def msg2bodyText(msg):
    """Return the text of one message (or message part), or ``None``.

    Args:
        msg: An ``email.message.EmailMessage`` -- a whole message or one
            of its parts.

    Returns:
        ``None`` when the main content type is not ``text``. For
        ``text/html`` the result of ``html2text.html2text`` (the raw HTML if
        that conversion fails); for any other text subtype the decoded
        content as ``str``.
    """
    ct = msg.get_content_type()
    cc = msg.get_content_charset()  # charset in Content-Type header
    cte = msg.get("Content-Transfer-Encoding")
    print("part: " + str(ct) + " " + str(cc) + " : " + str(cte))

    # Skip non-text parts/messages.
    if msg.get_content_maintype() != "text":
        return None

    try:
        ddd = msg.get_content()
    except LookupError:
        # The declared charset has no Python codec (or no content handler
        # exists): fall back to a lossy UTF-8 decode of the raw payload
        # rather than aborting the whole run on one odd message.
        raw = msg.get_payload(decode=True)
        ddd = raw.decode("utf-8", "replace") if raw else ""

    # Convert HTML to plain text.
    if msg.get_content_subtype() == "html":
        try:
            ddd = html2text.html2text(ddd)
        except Exception as exc:
            # Best effort: keep the raw HTML, but report what went wrong and
            # do not swallow KeyboardInterrupt/SystemExit.
            print("error in html2text: " + str(exc))

    return ddd


def header_decode(header):
    """Decode an RFC 2047 encoded header value into a plain ``str``.

    Args:
        header: The header value (``str`` or ``email.header.Header``), or
            ``None`` when the header is absent.

    Returns:
        The decoded text, or ``""`` when *header* is ``None``. Bytes that
        are invalid for the declared charset become U+FFFD, and a chunk
        with an unknown charset label is decoded as UTF-8 with replacement.
    """
    # Imported locally so this function does not rely on email.header being
    # loaded as a side effect of some other email.* import.
    from email.header import decode_header

    if header is None:
        return ""

    pieces = []
    for text, encoding in decode_header(header):
        if isinstance(text, bytes):
            try:
                text = text.decode(encoding or "us-ascii", "replace")
            except LookupError:
                # Charset label Python has no codec for.
                text = text.decode("utf-8", "replace")
        pieces.append(text)
    return "".join(pieces)


# Run the IMAP search only when this file is executed as a script.
if __name__ == "__main__":
    search_imap()
	import imaplib, email, email.parser, email.policy
	import html2text
	import pprint, sys

	#
	# Utilities for Debug
	#
	# Shared pretty-printer for debug dumps (4-space indent, 80-column width).
	pp = pprint.PrettyPrinter(indent=4, width=80)

	def myprint(str):
	    """Print *str* after round-tripping it through the cp932 codec.

	    Encoding uses the ``replace`` error handler, so every character that
	    cp932 cannot represent is printed as ``?`` instead of raising.
	    """
	    print(str.encode('cp932', 'replace').decode('cp932'))


	#
	# search mails in IMAP
	#
	def search_imap(host="imap.mail.yahoo.co.jp", user="sdkn104@yahoo.co.jp",
	                password="xxxxxx", mailbox="Inbox", criteria="(ALL)"):
	    """Fetch every message matching *criteria*, save it and print its text.

	    For each matching message the raw RFC 822 bytes are written to
	    ``mail_<num>.txt`` in the current directory, then the decoded subject,
	    date, sender and body are printed with ``myprint``.

	    Args:
	        host: IMAP-over-SSL server name.
	        user: Login name.
	        password: Login password.
	        mailbox: Mailbox to select.
	        criteria: IMAP SEARCH criteria, e.g. ``"UNSEEN"``.

	    Every parameter defaults to the value that used to be hard-coded, so
	    a bare ``search_imap()`` call keeps working.
	    """
	    # NOTE(review): credentials should not live in source code; pass them
	    # in (or read them from the environment) instead of using the defaults.
	    mail = imaplib.IMAP4_SSL(host)
	    selected = False
	    try:
	        mail.login(user, password)

	        typ, _ = mail.select(mailbox)
	        if typ != "OK":
	            print("cannot select mailbox " + mailbox)
	            return
	        selected = True

	        typ, data = mail.search(None, criteria)
	        print("searched")
	        print(typ)
	        if typ != "OK":
	            return

	        # The search result is one space-separated bytes string of numbers.
	        nums = data[0].split() if data and data[0] else []
	        pp.pprint(nums)

	        # An empty result list is the only reliable "nothing found" signal;
	        # comparing the bytes payload with '' is always unequal in Python 3.
	        if nums:
	            print("New Mail")
	        else:
	            print("Non")

	        for num in nums:
	            # Message numbers arrive as bytes; decode them so that file
	            # names and log lines read "1" instead of "b'1'".
	            msg_id = num.decode("ascii")
	            print("START " + msg_id + "--------------------------------------------------")

	            # Fetch the whole message in RFC 822 format.
	            result, d = mail.fetch(num, "(RFC822)")
	            if result != "OK" or not d or not isinstance(d[0], tuple):
	                print("fetch failed for " + msg_id)
	                continue
	            raw = d[0][1]

	            # Keep a verbatim copy of the message on disk.
	            with open("mail_" + msg_id + ".txt", "wb") as f:
	                f.write(raw)

	            print("####################################")

	            msg = email2Text(raw)

	            print("------------------------------------")
	            for key in ("subject", "date", "from", "body"):
	                myprint(msg[key])
	    finally:
	        # Always release the connection, even when something above failed.
	        # CLOSE is only valid once a mailbox has been selected.
	        try:
	            if selected:
	                mail.close()
	        finally:
	            mail.logout()

	#
	# Get subject, date, from and body as text from email RFC822 style string
	#
	def email2Text(rfc822mail):
	    """Extract sender, date, subject and text body from a raw message.

	    Args:
	        rfc822mail: The complete RFC 822 message as ``bytes``.

	    Returns:
	        A dict with the keys ``"from"``, ``"date"``, ``"subject"`` and
	        ``"body"``. All values are ``str``: a missing header yields ``""``
	        and a message without any text part yields an empty body.
	    """
	    msg_data = email.message_from_bytes(rfc822mail, policy=email.policy.default)

	    # Default to "" so an absent header never reaches header_decode as None.
	    mail_value = {
	        "from": header_decode(msg_data.get("From", "")),
	        "date": header_decode(msg_data.get("Date", "")),
	        "subject": header_decode(msg_data.get("Subject", "")),
	    }

	    # walk() yields only the message itself when it is not multipart and
	    # the container plus every sub-part otherwise, so one loop covers both
	    # cases. Non-text parts come back as None and are skipped.
	    texts = (msg2bodyText(part) for part in msg_data.walk())
	    mail_value["body"] = "".join(text for text in texts if text is not None)

	    return mail_value

	#
	# get body text from a message (EmailMessage instance)
	#
	def msg2bodyText(msg):
	    """Return the text of one message (or message part), or ``None``.

	    Args:
	        msg: An ``email.message.EmailMessage`` -- a whole message or one
	            of its parts.

	    Returns:
	        ``None`` when the main content type is not ``text``. For
	        ``text/html`` the result of ``html2text.html2text`` (the raw HTML
	        if that conversion fails); for any other text subtype the decoded
	        content as ``str``.
	    """
	    ct = msg.get_content_type()
	    cc = msg.get_content_charset()  # charset in Content-Type header
	    cte = msg.get("Content-Transfer-Encoding")
	    print("part: " + str(ct) + " " + str(cc) + " : " + str(cte))

	    # Skip non-text parts/messages.
	    if msg.get_content_maintype() != "text":
	        return None

	    try:
	        ddd = msg.get_content()
	    except LookupError:
	        # The declared charset has no Python codec (or no content handler
	        # exists): fall back to a lossy UTF-8 decode of the raw payload
	        # rather than aborting the whole run on one odd message.
	        raw = msg.get_payload(decode=True)
	        ddd = raw.decode("utf-8", "replace") if raw else ""

	    # Convert HTML to plain text.
	    if msg.get_content_subtype() == "html":
	        try:
	            ddd = html2text.html2text(ddd)
	        except Exception as exc:
	            # Best effort: keep the raw HTML, but report what went wrong
	            # and do not swallow KeyboardInterrupt/SystemExit.
	            print("error in html2text: " + str(exc))

	    return ddd


	def header_decode(header):
	    """Decode an RFC 2047 encoded header value into a plain ``str``.

	    Args:
	        header: The header value (``str`` or ``email.header.Header``), or
	            ``None`` when the header is absent.

	    Returns:
	        The decoded text, or ``""`` when *header* is ``None``. Bytes that
	        are invalid for the declared charset become U+FFFD, and a chunk
	        with an unknown charset label is decoded as UTF-8 with replacement.
	    """
	    # Imported locally so this function does not rely on email.header
	    # being loaded as a side effect of some other email.* import.
	    from email.header import decode_header

	    if header is None:
	        return ""

	    pieces = []
	    for text, encoding in decode_header(header):
	        if isinstance(text, bytes):
	            try:
	                text = text.decode(encoding or "us-ascii", "replace")
	            except LookupError:
	                # Charset label Python has no codec for.
	                text = text.decode("utf-8", "replace")
	        pieces.append(text)
	    return "".join(pieces)


	# Run the IMAP search only when this file is executed as a script.
	if __name__ == "__main__":
	    search_imap()