Created
July 1, 2022 05:20
-
-
Save Dvboi/2ce7d02d9ee8440ef78831c48aa7beb8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def transform_body(text):
    """Split an email thread into individual mails and clean each one.

    The thread is cut at "Forwarded by" / "Original Message" header lines,
    then every piece is cleaned independently.

    Args:
        text: Raw body of an email, possibly containing a whole thread.

    Returns:
        A list of cleaned strings. The text after the last border header
        comes first and the text above the first border header comes last.
        When no border is found the list holds the single cleaned body.
    """
    return [_clean_email(segment) for segment in _split_email_thread(text)]


def _split_email_thread(text):
    """Cut `text` at thread border headers, last segment first."""
    forwarded_header = r'-{3,}.*Forwarded by.*'
    original_header = r'-{3,}.*Original Message.*'
    # "Forwarded by" and "Original Message" header lines separate one mail
    # of the thread from the next.
    border = re.compile('(%s|%s)' % (original_header, forwarded_header))
    spans = [match.span() for match in border.finditer(text)]
    if not spans:
        return [text]

    segments = []
    segment_end = len(text)
    # Walk the borders bottom-up so the segments come out in reverse order
    # of appearance.
    for start, end in reversed(spans):
        segments.append(text[end:segment_end])
        # Stop the next (earlier) segment where this header begins, so the
        # header line itself is not carried along.
        segment_end = start
    # There may be text above the first header; keep it as its own entry
    # (possibly an empty string).
    segments.append(text[:segment_end])
    return segments


def _clean_email(text):
    """Strip headers, contact details, markup and other noise from one mail."""
    # Order matters: header lines are dropped whole before the narrower
    # URL / phone / e-mail patterns run on what is left.
    header_and_contact_patterns = (
        r'-{3,}.*Forwarded by.*',
        r'-{3,}.*Original Message.*',
        re.compile(r'From:.*', re.IGNORECASE),
        re.compile(r'Sent:.*', re.IGNORECASE),
        re.compile(r'To:.*', re.IGNORECASE),
        re.compile(r'Cc:.*', re.IGNORECASE),
        re.compile(r'Bcc:.*', re.IGNORECASE),
        re.compile(r'Subject:.*', re.IGNORECASE),
        r'(www|http)\S+',  # URLs, https://stackoverflow.com/a/40823105
        # US phone numbers only, https://stackoverflow.com/a/16699507
        r'(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}',
        r'\S+@\S+',  # any e-mail address, https://stackoverflow.com/a/64036475
    )
    for pattern in header_and_contact_patterns:
        text = re.sub(pattern, '', text)

    # Remove attachment names.
    text = remove_extensions(text)

    # Remove every word containing a digit; this also removes bare digits,
    # so no separate digit pass is needed.
    text = re.sub(r'\w*\d\w*', '', text)

    markup_patterns = (
        # Non-greedy so that two separate tags / groups on one line do not
        # swallow the ordinary text between them.
        r'<.*?>',
        r'\(.*?\)',
        r'\\|-',      # raw backslash or '-'
        r'\b[\w]+:',  # 'something:'
    )
    for pattern in markup_patterns:
        text = re.sub(pattern, '', text)

    # Anything that is not a letter or an apostrophe becomes a space
    # (a space, not '', so neighbouring words do not get glued together).
    text = re.sub(r'[^A-Za-z\']', ' ', text)

    # Remove AM/PM markers, since mails contain many timestamps.
    text = remove_timestamps(text)
    # Remove personal names using named entity recognition.
    text = remove_personal_name(text)

    # Collapse tabs, newlines and runs of spaces into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Expand contractions.
    text = decontracted(text)

    # Remove other junk.
    text = text.replace("IMAGE", '')
    text = re.sub(r"\bth\b", '', text)

    # The removals above can leave double spaces behind; normalise again.
    return re.sub(r'\s+', ' ', text).strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment