Skip to content

Instantly share code, notes, and snippets.

@Dvboi
Created July 1, 2022 05:20
Show Gist options
  • Save Dvboi/2ce7d02d9ee8440ef78831c48aa7beb8 to your computer and use it in GitHub Desktop.
Save Dvboi/2ce7d02d9ee8440ef78831c48aa7beb8 to your computer and use it in GitHub Desktop.
def transform_body(text: str) -> list:
    """Split an email thread into individual mails and clean each of them.

    The thread is cut at every "Forwarded by" / "Original Message" separator
    line. Each resulting piece is then stripped of header lines, URLs, phone
    numbers, e-mail addresses, digits, bracketed text and other noise.

    Args:
        text: Raw body of an e-mail, possibly containing a whole thread.

    Returns:
        A list of cleaned strings. When separators are present the list is
        ordered bottom-up: the text after the last separator comes first and
        the text above the first separator comes last (it may be an empty
        string when the thread starts with a separator). When no separator is
        found the list holds the single cleaned body.
    """
    # Separator lines that mark the border between two mails of a thread.
    remove_forwarded = r'-{3,}.*Forwarded by.*'
    remove_orginal = r'-{3,}.*Original Message.*'
    boundary = re.compile('(%s|%s)' % (remove_orginal, remove_forwarded))

    # Patterns applied first, in this order, to every single mail.
    noise_patterns = [
        re.compile(remove_forwarded),
        re.compile(remove_orginal),
        # Header lines. One alternation anchored on a word boundary so that
        # "Cc:" no longer fires inside "Bcc:" (which used to leave a stray
        # "B") and "To:" no longer fires inside words such as "photo:".
        # Everything from the header name to the end of the line is dropped.
        re.compile(r'\b(?:From|Sent|To|Cc|Bcc|Subject):.*', re.IGNORECASE),
        # URLs -- https://stackoverflow.com/a/40823105
        re.compile(r'(www|http)\S+'),
        # US phone numbers only -- https://stackoverflow.com/a/16699507
        # (raw string: the former plain literal held invalid escapes).
        re.compile(r'(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'),
        # Any e-mail address -- https://stackoverflow.com/a/64036475
        re.compile(r'\S+@\S+'),
    ]

    # Patterns applied after digits are gone, in this order.
    markup_patterns = [
        # Non-greedy so that only the bracketed part is removed and the text
        # between two separate <...> or (...) groups on one line survives.
        re.compile(r'<.*?>'),
        re.compile(r'\(.*?\)'),
        re.compile(r'\\|-'),       # raw backslash or '-'
        re.compile(r'\b[\w]+:'),   # 'something:'
    ]

    def clean(mail):
        """Return one mail of the thread with all noise removed."""
        for pattern in noise_patterns:
            mail = pattern.sub('', mail)
        # Helper defined elsewhere in the file; per the original comment it
        # removes attachment names.
        mail = remove_extensions(mail)
        # Drop every word containing a digit. This also removes every bare
        # digit, so no separate digit-only pass is needed afterwards.
        mail = re.sub(r'\w*\d\w*', '', mail)
        for pattern in markup_patterns:
            mail = pattern.sub('', mail)
        # Anything that is neither a letter nor an apostrophe becomes a space
        # (a space, not '', so neighbouring words do not get glued together).
        mail = re.sub(r'[^A-Za-z\']', ' ', mail)
        # Helpers defined elsewhere in the file; per the original comments
        # they drop AM/PM timestamp leftovers and personal names (NER).
        mail = remove_timestamps(mail)
        mail = remove_personal_name(mail)
        # Collapse tabs, newlines and repeated blanks into single spaces.
        mail = re.sub(r'\s+', ' ', mail)
        # Helper defined elsewhere; per the original comment it takes care of
        # apostrophes (presumably expands contractions -- confirm).
        mail = decontracted(mail)
        # Other recurring junk tokens.
        mail = mail.replace("IMAGE", '')
        mail = re.sub(r"\bth\b", '', mail)
        return mail.strip()

    # Walk the separators from the bottom of the thread upwards. Each segment
    # runs from the end of its separator line to the start of the next one.
    segments = []
    upper = None  # None == "up to the end of the text" for the last mail
    spans = [match.span() for match in boundary.finditer(text)]
    for start, end in reversed(spans):
        segments.append(text[end:upper])
        upper = start
    if spans:
        # There may be text above the first separator; keep it as well.
        segments.append(text[:upper])
    else:
        # No separator at all: the body is a single mail.
        segments.append(text)

    return [clean(segment) for segment in segments]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment