Skip to content

Instantly share code, notes, and snippets.

@stringertheory
Created September 29, 2018 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stringertheory/e8867f7620d8b606c1c675b60c0b173f to your computer and use it in GitHub Desktop.
Save stringertheory/e8867f7620d8b606c1c675b60c0b173f to your computer and use it in GitHub Desktop.
Replacing scrubadub's built-in URL regex filter (Python) with one that matches URLs even when the protocol is omitted
import re
import scrubadub
class UrlFilth(scrubadub.filth.url.UrlFilth):
    """URL filth whose pattern also matches URLs written without a protocol.

    Overrides the ``regex`` attribute of ``scrubadub.filth.url.UrlFilth`` so
    that bare domains such as ``example.com`` are matched in addition to
    ``https://example.com`` and ``www.example.com``.

    The domain character class contains ``@``, so e-mail addresses such as
    ``alice@example.com`` are matched by this pattern as well.
    """

    # re.IGNORECASE: URL schemes and host names are case-insensitive, so
    # ``EXAMPLE.COM`` and ``HTTP://Example.Com`` must match too. Without the
    # flag the lowercase-only TLD class lets such text through unscrubbed.
    # Leading whitespace inside the pattern is ignored under re.VERBOSE.
    regex = re.compile(r'''
        (?P<protocol>
            (https?:\/\/(www\.)?|www\.)?          # protocol http://, etc
        )(?P<domain>
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6}  # domain name
            /?                                    # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                 # rest of path, query, & hash
        )
    ''', re.VERBOSE | re.IGNORECASE)
class UrlDetector(scrubadub.detectors.base.RegexDetector):
    """Detector pairing scrubadub's ``RegexDetector`` with ``UrlFilth``.

    ``UrlFilth`` is the filth class defined in this file; its ``regex``
    matches URLs with or without a protocol.
    """

    filth_cls = UrlFilth
# Build a scrubber, drop its 'name' and 'url' detectors, then register the
# protocol-optional UrlDetector defined in this file.
SCRUBBER = scrubadub.Scrubber()
for _detector_name in ('name', 'url'):
    SCRUBBER.remove_detector(_detector_name)
SCRUBBER.add_detector(UrlDetector)

# Demo input: a URL with a protocol, a bare domain, and an e-mail address.
SCRUBBER.clean(
    u'''
Link 1: https://example.com
Link 2: example.com
Email: alice@example.com
'''
)
# Gives:
# Link 1: {{URL}}
# Link 2: {{URL}}
# Email: {{URL+EMAIL}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment