stringertheory/lenient_url_scrub.py

## lenient_url_scrub.py
import re
import scrubadub


class UrlFilth(scrubadub.filth.url.UrlFilth):
    """URL filth whose pattern also accepts URLs written without a scheme.

    Every alternative inside the ``protocol`` group is optional, so a bare
    host such as ``example.com`` matches as well as ``https://example.com``
    or ``www.example.com``.  ``@`` is part of the domain character class,
    which means e-mail addresses match this pattern too.

    Matching is case-insensitive: URI schemes and host names are
    case-insensitive (RFC 3986, sections 3.1 and 3.2.2), so
    ``HTTP://EXAMPLE.COM`` has to be caught just like its lowercase form.
    """

    # Named groups:
    #   protocol -- optional ``http://`` / ``https://`` (optionally followed
    #               by ``www.``) or a bare ``www.``; may be empty
    #   domain   -- host characters ending in a dot plus 2-6 letters, with an
    #               optional trailing slash
    #   path     -- any further path / query / fragment characters
    regex = re.compile(r'''
        (?P<protocol>
            (https?:\/\/(www\.)?|www\.)?         # protocol http://, etc
        )(?P<domain>
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
            /?                                   # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
        )
    ''', re.VERBOSE | re.IGNORECASE)


class UrlDetector(scrubadub.detectors.base.RegexDetector):
    """Regex-based detector bound to the lenient ``UrlFilth`` of this module."""

    # The filth class is the only customisation.  Presumably the
    # RegexDetector base scans text with ``filth_cls.regex`` -- confirm
    # against the scrubadub version in use.
    filth_cls = UrlFilth


# Build a scrubber and swap its registered 'url' detector for the lenient one.
SCRUBBER = scrubadub.Scrubber()
# NOTE(review): the reason for dropping the 'name' detector is not visible
# in this file -- confirm it is intentional.
SCRUBBER.remove_detector('name')
# The registered 'url' detector is removed before the replacement is added;
# presumably two detectors for the same filth type cannot coexist -- verify
# against scrubadub's Scrubber.add_detector.
SCRUBBER.remove_detector('url')
SCRUBBER.add_detector(UrlDetector)

# Example run.  The result of clean() is neither bound to a name nor printed
# here; the expected output is recorded in the comment that follows.
SCRUBBER.clean(u'''
Link 1: https://example.com
Link 2: example.com
Email: alice@example.com
''')
# Gives:
# Link 1: {{URL}}
# Link 2: {{URL}}
# Email: {{URL+EMAIL}}
#
# The e-mail line carries a combined label because the lenient URL pattern
# allows '@' in its domain part, so the address matches it as well
# (presumably alongside scrubadub's own e-mail detection -- confirm).
	import re
	import scrubadub


	class UrlFilth(scrubadub.filth.url.UrlFilth):
	    """URL filth whose pattern also accepts URLs written without a scheme.

	    Every alternative inside the ``protocol`` group is optional, so a bare
	    host such as ``example.com`` matches as well as ``https://example.com``
	    or ``www.example.com``.  ``@`` is part of the domain character class,
	    which means e-mail addresses match this pattern too.

	    Matching is case-insensitive: URI schemes and host names are
	    case-insensitive (RFC 3986, sections 3.1 and 3.2.2), so
	    ``HTTP://EXAMPLE.COM`` has to be caught just like its lowercase form.
	    """

	    # The separator between the scheme form and the bare ``www.`` form must
	    # be an unescaped ``|``: written as ``\|`` it is a literal pipe, the
	    # protocol group can then never match a real URL, and ``https://`` is
	    # left behind in the scrubbed text.
	    regex = re.compile(r'''
	        (?P<protocol>
	            (https?:\/\/(www\.)?|www\.)?         # protocol http://, etc
	        )(?P<domain>
	            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
	            /?                                   # can have a trailing slash
	        )(?P<path>
	            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
	        )
	    ''', re.VERBOSE | re.IGNORECASE)


	class UrlDetector(scrubadub.detectors.base.RegexDetector):
	    """Regex-based detector bound to the lenient ``UrlFilth`` of this module."""

	    # The attribute has to be indented under the class header to be part
	    # of the class body.  Presumably the RegexDetector base scans text
	    # with ``filth_cls.regex`` -- confirm against the scrubadub version
	    # in use.
	    filth_cls = UrlFilth


	# Build a scrubber and swap its registered 'url' detector for the lenient one.
	SCRUBBER = scrubadub.Scrubber()
	# NOTE(review): the reason for dropping the 'name' detector is not visible
	# in this file -- confirm it is intentional.
	SCRUBBER.remove_detector('name')
	# The registered 'url' detector is removed before the replacement is added;
	# presumably two detectors for the same filth type cannot coexist -- verify
	# against scrubadub's Scrubber.add_detector.
	SCRUBBER.remove_detector('url')
	SCRUBBER.add_detector(UrlDetector)

	# Example run.  The result of clean() is neither bound to a name nor printed
	# here; the expected output is recorded in the comment that follows.
	SCRUBBER.clean(u'''
	Link 1: https://example.com
	Link 2: example.com
	Email: alice@example.com
	''')
	# Gives:
	# Link 1: {{URL}}
	# Link 2: {{URL}}
	# Email: {{URL+EMAIL}}
	#
	# The e-mail line carries a combined label because the lenient URL pattern
	# allows '@' in its domain part, so the address matches it as well
	# (presumably alongside scrubadub's own e-mail detection -- confirm).