gauravjuvekar/transform_remove_safelinks.py

## transform_remove_safelinks.py
#!/bin/env python
import email
import email.message
import email.parser
import email.policy
import html
import re
import sys
import urllib
import urllib.parse


def normalize_url(match):
    """Return the target URL wrapped inside a matched Safe Links URL.

    The matched text is HTML-unescaped and parsed as a URL; the first value
    of its ``url`` query parameter is the wrapped target.

    Args:
        match: An ``re.Match`` whose group 0 is the rewritten link.

    Returns:
        The target URL as ``str``, percent-encoded where needed. If the
        link carries no ``url`` query parameter, the HTML-unescaped matched
        text is returned instead of raising ``KeyError``.
    """
    text = html.unescape(match.group(0))
    query = urllib.parse.urlparse(text).query
    targets = urllib.parse.parse_qs(query).get('url')
    if not targets:
        # Nothing to unwrap; hand the link back rather than crash.
        return text
    # parse_qs() has already percent-decoded the value once, so only
    # characters that may not appear in a URL at all (spaces, quotes,
    # non-ASCII, ...) are encoded here. URL delimiters and '%' are kept
    # as-is; the default safe='/' would turn 'https://' into 'https%3A//'
    # and double-encode existing escapes.
    return urllib.parse.quote(targets[0], safe=":/?#[]@!$&'()*+,;=%~")


def replace_urls(text, htmlescape=False):
    def htmlescape_wrap(match):
        return html.escape(normalize_url(match))

    text = re.sub(
        r'(?<!href=["\'])(?<!href=)https?:\/\/.*?\.safelinks\.protection\.outlook\.com\/.*?reserved=\d+',
        htmlescape_wrap if htmlescape else normalize_url,
        text)
    return text


def transform_html(data):
    """Rewrite Safe Links URLs in the body of an HTML part.

    Args:
        data: The body as ``bytes``.

    Returns:
        The body as ``bytes``, with URLs rewritten by ``replace_urls`` and
        the replacements HTML-escaped.
    """
    # Bodies are not guaranteed to be ASCII; a strict ASCII decode raises
    # UnicodeDecodeError on the first accented character. surrogateescape
    # lets bytes that are not valid UTF-8 survive the round trip unchanged.
    text = data.decode('utf-8', errors='surrogateescape')
    # TODO: There's probably a better way of applying this to only text
    # contents (not in a <a href=) than just negative-lookbehinds in the regex
    # using BeautifulSoup, but this works well enough for now.
    text = replace_urls(text, htmlescape=True)
    return text.encode('utf-8', errors='surrogateescape')


def transfrom_text(data):
    """Rewrite Safe Links URLs in the body of a plain-text part.

    The function name is misspelled ("transfrom"); it is kept as-is because
    ``main`` calls it under this name.

    Args:
        data: The body as ``bytes``.

    Returns:
        The body as ``bytes``, with URLs rewritten by ``replace_urls``.
    """
    # Bodies are not guaranteed to be ASCII; a strict ASCII decode raises
    # UnicodeDecodeError on the first accented character. surrogateescape
    # lets bytes that are not valid UTF-8 survive the round trip unchanged.
    text = data.decode('utf-8', errors='surrogateescape')
    text = replace_urls(text)
    return text.encode('utf-8', errors='surrogateescape')


def main(data):
    """Rewrite Safe Links URLs in every text part of a raw email message.

    Args:
        data: The complete message as ``bytes``.

    Returns:
        The serialized message as ``bytes``. ``text/plain`` parts are run
        through ``transfrom_text`` and ``text/html`` parts through
        ``transform_html``; all other parts are left untouched.
    """
    parser = email.parser.BytesParser(policy=email.policy.SMTP)
    em = parser.parsebytes(data)
    for part in em.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            transform = transfrom_text
        elif content_type == 'text/html':
            transform = transform_html
        else:
            continue
        try:
            # Decodes the body using the charset declared on the part
            # (undecodable bytes are replaced rather than raising).
            body = part.get_content()
        except LookupError:
            # Unknown charset label: leave the part exactly as received.
            continue
        rewritten = transform(body.encode('utf-8')).decode('utf-8')
        # set_content() drops every Content-* header first, so capture the
        # ones that must survive. The filename is only forwarded together
        # with an explicit disposition, because set_content() would
        # otherwise turn the part into an attachment.
        disposition = part.get_content_disposition()
        filename = part.get_filename() if disposition else None
        content_id = part['Content-ID']
        # Passing str (not bytes) makes the content manager record the
        # charset and pick a suitable transfer encoding; bytes content is
        # stored as base64 with no charset parameter.
        part.set_content(rewritten,
                         subtype=part.get_content_subtype(),
                         disposition=disposition,
                         filename=filename,
                         cid=content_id)
    return em.as_bytes()


if __name__ == '__main__':
    # Filter mode: raw message bytes in on stdin, rewritten message out on
    # stdout.
    raw_message = sys.stdin.buffer.read()
    sys.stdout.buffer.write(main(raw_message))
	#!/bin/env python
	import email
	import email.message
	import email.parser
	import email.policy
	import html
	import re
	import sys
	import urllib
	import urllib.parse


	def normalize_url(match):
	    """Return the target URL wrapped inside a matched Safe Links URL.

	    The matched text is HTML-unescaped and parsed as a URL; the first
	    value of its ``url`` query parameter is the wrapped target.

	    Args:
	        match: An ``re.Match`` whose group 0 is the rewritten link.

	    Returns:
	        The target URL as ``str``, percent-encoded where needed. If the
	        link carries no ``url`` query parameter, the HTML-unescaped
	        matched text is returned instead of raising ``KeyError``.
	    """
	    text = html.unescape(match.group(0))
	    query = urllib.parse.urlparse(text).query
	    targets = urllib.parse.parse_qs(query).get('url')
	    if not targets:
	        # Nothing to unwrap; hand the link back rather than crash.
	        return text
	    # parse_qs() has already percent-decoded the value once, so only
	    # characters that may not appear in a URL at all are encoded here.
	    # URL delimiters and '%' are kept as-is; the default safe='/' would
	    # turn 'https://' into 'https%3A//'.
	    return urllib.parse.quote(targets[0], safe=":/?#[]@!$&'()*+,;=%~")


	def replace_urls(text, htmlescape=False):
	def htmlescape_wrap(match):
	return html.escape(normalize_url(match))

	text = re.sub(
	r'(?<!href=["\'])(?<!href=)https?:\/\/.?\.safelinks\.protection\.outlook\.com\/.?reserved=\d+',
	htmlescape_wrap if htmlescape else normalize_url,
	text)
	return text


	def transform_html(data):
	    """Rewrite Safe Links URLs in the body of an HTML part.

	    Args:
	        data: The body as ``bytes``.

	    Returns:
	        The body as ``bytes``, with URLs rewritten by ``replace_urls``
	        and the replacements HTML-escaped.
	    """
	    # A strict ASCII decode raises UnicodeDecodeError on any non-ASCII
	    # body. surrogateescape lets bytes that are not valid UTF-8 survive
	    # the round trip unchanged.
	    text = data.decode('utf-8', errors='surrogateescape')
	    # TODO: There's probably a better way of applying this to only text
	    # contents (not in a <a href=) than just negative-lookbehinds in the
	    # regex using BeautifulSoup, but this works well enough for now.
	    text = replace_urls(text, htmlescape=True)
	    return text.encode('utf-8', errors='surrogateescape')


	def transfrom_text(data):
	    """Rewrite Safe Links URLs in the body of a plain-text part.

	    The function name is misspelled ("transfrom"); it is kept as-is
	    because ``main`` calls it under this name.

	    Args:
	        data: The body as ``bytes``.

	    Returns:
	        The body as ``bytes``, with URLs rewritten by ``replace_urls``.
	    """
	    # A strict ASCII decode raises UnicodeDecodeError on any non-ASCII
	    # body. surrogateescape lets bytes that are not valid UTF-8 survive
	    # the round trip unchanged.
	    text = data.decode('utf-8', errors='surrogateescape')
	    text = replace_urls(text)
	    return text.encode('utf-8', errors='surrogateescape')


	def main(data):
	    """Rewrite Safe Links URLs in every text part of a raw email message.

	    Args:
	        data: The complete message as ``bytes``.

	    Returns:
	        The serialized message as ``bytes``. ``text/plain`` parts are run
	        through ``transfrom_text`` and ``text/html`` parts through
	        ``transform_html``; all other parts are left untouched.
	    """
	    parser = email.parser.BytesParser(policy=email.policy.SMTP)
	    em = parser.parsebytes(data)
	    for part in em.walk():
	        content_type = part.get_content_type()
	        if content_type == 'text/plain':
	            transform = transfrom_text
	        elif content_type == 'text/html':
	            transform = transform_html
	        else:
	            continue
	        try:
	            # Decodes the body using the charset declared on the part.
	            body = part.get_content()
	        except LookupError:
	            # Unknown charset label: leave the part exactly as received.
	            continue
	        rewritten = transform(body.encode('utf-8')).decode('utf-8')
	        # set_content() drops every Content-* header first, so capture
	        # the ones that must survive. The filename is only forwarded
	        # together with an explicit disposition, because set_content()
	        # would otherwise turn the part into an attachment.
	        disposition = part.get_content_disposition()
	        filename = part.get_filename() if disposition else None
	        content_id = part['Content-ID']
	        # Passing str (not bytes) makes the content manager record the
	        # charset and pick a suitable transfer encoding.
	        part.set_content(rewritten,
	                         subtype=part.get_content_subtype(),
	                         disposition=disposition,
	                         filename=filename,
	                         cid=content_id)
	    return em.as_bytes()


	if __name__ == '__main__':
	    # Filter mode: raw message bytes in on stdin, rewritten message out
	    # on stdout.
	    sys.stdout.buffer.write(main(sys.stdin.buffer.read()))