Skip to content

Instantly share code, notes, and snippets.

@afm-sayem
Created September 9, 2018 11:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afm-sayem/b3904def820a09edb255e2efcb1c2a5a to your computer and use it in GitHub Desktop.
Save afm-sayem/b3904def820a09edb255e2efcb1c2a5a to your computer and use it in GitHub Desktop.
Parse emails from text files and separate the key components (date, sender, recipient, subject, body) into a SQLite database.
import os, re, email, sqlite3
# Open (or create) the SQLite database file that receives the parsed mails.
conn = sqlite3.connect('mails.db')
c = conn.cursor()
# IF NOT EXISTS lets the script be re-run against an existing mails.db;
# a plain CREATE TABLE raises sqlite3.OperationalError on the second run.
c.execute('''CREATE TABLE IF NOT EXISTS emails
(date text, frm text, recipient text, subject text, body text)''')
def cleanup(message):
    """Strip HTML tags from *message* and turn ``&nbsp;`` entities into spaces.

    Args:
        message: Text of a mail body, possibly containing HTML markup.

    Returns:
        The text with every ``<...>`` tag removed and each ``&nbsp;``
        replaced by a plain space. Other HTML entities are left untouched.
    """
    # Work on the argument itself rather than on a global that the caller
    # might (or might not) have bound under the name `data`.
    without_tags = re.sub(r'<[^<]+?>', '', message)
    return without_tags.replace('&nbsp;', ' ')
def writedata(mail, body, table):
    """Insert one parsed mail into the ``emails`` table and commit it.

    Args:
        mail: Parsed message object; its headers are read with ``.get()``,
            so a missing header is stored as NULL.
        body: Cleaned body text stored in the ``body`` column.
        table: sqlite3 cursor on the database that holds the ``emails`` table.
    """
    row = (
        mail.get('Date'),
        mail.get('From'),
        # NOTE(review): the ``recipient`` column is filled from In-Reply-To,
        # which normally carries a message id rather than an address --
        # confirm whether 'To' was intended.
        mail.get('In-Reply-To'),
        mail.get('Subject'),
        body,
    )
    table.execute('INSERT INTO emails VALUES (?,?,?,?,?)', row)
    # Commit through the cursor's own connection instead of a module-level
    # global, so the function works with whatever cursor it is handed.
    table.connection.commit()
# Import every file in the current directory whose name ends in "txt": split
# it on the separator line into individual messages, parse each one and store
# every decodable part as one row via writedata().
try:
    for filename in os.listdir():
        if not filename.endswith("txt"):
            continue
        with open(filename, 'r', encoding="ISO-8859-1") as f:
            content = f.read()
        msgs = content.split('=========================================================================')
        for m in msgs:
            m = m.strip()
            if not m:
                # Text before the first / after the last separator is often
                # empty; parsing it would only insert a row of NULLs.
                continue
            mail = email.message_from_string(m)
            # walk() yields the message itself for a non-multipart mail and
            # every sub-part for a multipart one, so one loop covers both.
            for part in mail.walk():
                body = part.get_payload(decode=True)
                if body is None:
                    # Multipart containers carry no payload of their own.
                    continue
                # Bound at module level under the name `data` before calling
                # cleanup(), matching how this script has always invoked it.
                data = body.decode("ISO-8859-1")
                data = cleanup(data)
                writedata(mail, data, c)
finally:
    # Release the database even when a file fails to read or parse.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment