# Dvboi/cleaning_body.py

## cleaning_body.py
def transform_body(text):
    '''
    Split an email thread into its individual mails and clean each of them.

    Overview :-
    * "Forwarded by" / "Original Message" separator lines are treated as the
      borders between the mails of a thread
    * the text after each border, and the text above the first border, is
      cleaned and appended to a list, walking the thread from bottom to top
    * if no border is found, the list holds just the cleaned `text`

    :param text: raw email body, possibly containing a whole thread
    :return: list of cleaned strings, one per extracted mail
    '''
    # separator lines ; they are also the borders from one email to the other
    remove_forwarded = r'-{3,}.*Forwarded by.*'
    remove_orginal = r'-{3,}.*Original Message.*'

    # header lines, dropped up to the end of the line ; the leading \b keeps
    # 'Cc:' from eating the tail of 'Bcc:' (which left a stray 'B' behind) and
    # 'To:' from matching inside words such as 'Photo:'
    remove_headers = [
        re.compile(r'\b%s:.*' % field, re.IGNORECASE)
        for field in ('From', 'Sent', 'To', 'Cc', 'Bcc', 'Subject')
    ]

    remove_url = r'(www|http)\S+'     # https://stackoverflow.com/a/40823105
    # raw string, so \+ \d \s reach the regex engine instead of being invalid string escapes
    remove_phone = r'(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'   # ONLY US numbers for now --> https://stackoverflow.com/a/16699507
    remove_email = r'\S+@\S+'  # https://stackoverflow.com/a/64036475

    # applied in this order, before any character-level cleaning
    pattern_list_1 = (
        [remove_forwarded, remove_orginal]
        + remove_headers
        + [remove_url, remove_phone, remove_email]
    )

    # applied once words containing digits are gone
    pattern_list_2 = [
        r'<.*?>',     # text between <> ; non-greedy so two tags on a line do not swallow the words between them
        r'\(.*?\)',   # text between () ; non-greedy for the same reason
        r'\\|-',      # raw backslash or '-'
        r'\b[\w]+:',  # 'something:'
    ]

    def helper(text):
        '''Clean one mail and return it as a single stripped string.'''
        for pattern in pattern_list_1:
            text = re.sub(pattern, '', text)

        # remove attachment_names (helper defined outside this function)
        text = remove_extensions(text)

        # remove any word with a digit ; this also removes every bare digit,
        # so no separate digit pass is needed afterwards
        text = re.sub(r'\w*\d\w*', '', text)

        for pattern in pattern_list_2:
            text = re.sub(pattern, '', text)

        # anything which is not a letter or an apostrophe becomes a space
        text = re.sub(r'[^A-Za-z\']', ' ', text)

        # remove AM/PM as we have a lot of timestamps in emails (helper defined outside this function)
        text = remove_timestamps(text)

        # remove personal names using named entity recognition (helper defined outside this function)
        text = remove_personal_name(text)

        # takes care of \t & \n ; every whitespace run becomes one space
        text = re.sub(r'\s+', ' ', text)

        # take care of apostrophes (helper defined outside this function)
        text = decontracted(text)

        # remove other junk
        text = text.replace("IMAGE", '')
        text = re.sub(r"\bth\b", '', text)

        # the junk removal above can leave two spaces side by side
        text = re.sub(r' {2,}', ' ', text)

        return text.strip()

    # Before any cleaning, extract the mails, as we have 'email threads' not individual emails.
    border_pattern = '(%s|%s)' % (remove_orginal, remove_forwarded)
    border_ends = [m.end() for m in re.finditer(border_pattern, text)]

    if not border_ends:
        # just store that single mail in the array
        return [helper(text)]

    # Walk the borders from the bottom of the thread upwards, so the mails come
    # out in chronological order. Each slice still carries the separator line
    # of the mail below it ; helper() strips that line again.
    mails = []
    upper = None
    for end in reversed(border_ends):
        mails.append(text[end:upper])
        upper = end
    mails.append(text[:upper])  # sometimes there is some text even above the first header so storing that too

    return [helper(mail) for mail in mails]
	def transform_body(text):
		'''
		Split an email thread into its individual mails and clean each of them.

		Overview :-
		* "Forwarded by" / "Original Message" separator lines are treated as the
		  borders between the mails of a thread
		* the text after each border, and the text above the first border, is
		  cleaned and appended to a list, walking the thread from bottom to top
		* if no border is found, the list holds just the cleaned `text`

		:param text: raw email body, possibly containing a whole thread
		:return: list of cleaned strings, one per extracted mail
		'''
		# NOTE(review): a function with this same name is defined earlier in the
		# file ; confirm that the duplicate definition is intended.

		# separator lines ; they are also the borders from one email to the other
		remove_forwarded = r'-{3,}.*Forwarded by.*'
		remove_orginal = r'-{3,}.*Original Message.*'

		# header lines, dropped up to the end of the line ; the leading \b keeps
		# 'Cc:' from eating the tail of 'Bcc:' (which left a stray 'B' behind) and
		# 'To:' from matching inside words such as 'Photo:'
		remove_headers = [
			re.compile(r'\b%s:.*' % field, re.IGNORECASE)
			for field in ('From', 'Sent', 'To', 'Cc', 'Bcc', 'Subject')
		]

		# '|' must stay unescaped here: it is the alternation between www and http
		remove_url = r'(www|http)\S+' # https://stackoverflow.com/a/40823105
		# raw string, so \+ \d \s reach the regex engine instead of being invalid string escapes
		remove_phone = r'(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}' # ONLY US numbers for now --> https://stackoverflow.com/a/16699507
		remove_email = r'\S+@\S+' # https://stackoverflow.com/a/64036475

		# applied in this order, before any character-level cleaning
		pattern_list_1 = (
			[remove_forwarded, remove_orginal]
			+ remove_headers
			+ [remove_url, remove_phone, remove_email]
		)

		# applied once words containing digits are gone
		pattern_list_2 = [
			r'<.*?>',     # text between <> ; non-greedy so two tags on a line do not swallow the words between them
			r'\(.*?\)',   # text between () ; non-greedy for the same reason
			r'\\|-',      # raw backslash or '-'
			r'\b[\w]+:',  # 'something:'
		]

		def helper(text):
			'''Clean one mail and return it as a single stripped string.'''
			for pattern in pattern_list_1:
				text = re.sub(pattern, '', text)

			# remove attachment_names (helper defined outside this function)
			text = remove_extensions(text)

			# remove any word with a digit ; this also removes every bare digit,
			# so no separate digit pass is needed afterwards
			text = re.sub(r'\w*\d\w*', '', text)

			for pattern in pattern_list_2:
				text = re.sub(pattern, '', text)

			# anything which is not a letter or an apostrophe becomes a space
			text = re.sub(r'[^A-Za-z\']', ' ', text)

			# remove AM/PM as we have a lot of timestamps in emails (helper defined outside this function)
			text = remove_timestamps(text)

			# remove personal names using named entity recognition (helper defined outside this function)
			text = remove_personal_name(text)

			# takes care of \t & \n ; every whitespace run becomes one space
			text = re.sub(r'\s+', ' ', text)

			# take care of apostrophes (helper defined outside this function)
			text = decontracted(text)

			# remove other junk
			text = text.replace("IMAGE", '')
			text = re.sub(r"\bth\b", '', text)

			# the junk removal above can leave two spaces side by side
			text = re.sub(r' {2,}', ' ', text)

			return text.strip()

		# Before any cleaning, extract the mails, as we have 'email threads' not individual emails.
		border_pattern = '(%s|%s)' % (remove_orginal, remove_forwarded)
		border_ends = [m.end() for m in re.finditer(border_pattern, text)]

		if not border_ends:
			# just store that single mail in the array
			return [helper(text)]

		# Walk the borders from the bottom of the thread upwards, so the mails come
		# out in chronological order. Each slice still carries the separator line
		# of the mail below it ; helper() strips that line again.
		mails = []
		upper = None
		for end in reversed(border_ends):
			mails.append(text[end:upper])
			upper = end
		mails.append(text[:upper]) # sometimes there is some text even above the first header so storing that too

		return [helper(mail) for mail in mails]