Skip to content

Instantly share code, notes, and snippets.

@josephlewis42
Created September 12, 2015 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josephlewis42/14b882e4d094c0cead54 to your computer and use it in GitHub Desktop.
Save josephlewis42/14b882e4d094c0cead54 to your computer and use it in GitHub Desktop.
A first pass redaction script for basic information
#!/usr/bin/env python3
'''redact.py
A very dumb redaction script that removes basic versions of the following:
- Email addresses
- Links (http:// and https://)
- US Social Security numbers
- US city/state/zip combinations
- US phone numbers
- Bare www.blah.com style sites
- US currency
- Dates and times
- IPv4 addresses
- Times of day
- Names
- Numbers
- Various email headers and quoted text

This should only be used as a first pass over data you want to redact.
It was originally written for troves of email feeding a spam/ham filter.

Usage:
    redact.py inputfile    # prints the redacted text to stdout
'''
from unidecode import unidecode
import re
import sys
import codecs
# Ordered (pattern, token) pairs applied to the message text.
#
# Order matters: every substitution runs on the output of the previous one,
# so specific patterns have to come before looser ones that would otherwise
# swallow part of the same text.
replacement_regex = [
    (re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"), "EMAIL_ADDRESS"),
    (re.compile(r"(<)?http(s)?://[^\s]+"), "HTTP_URI"),
    # IPv4 runs before SSN/phone: the phone pattern would otherwise consume
    # the first three octets of an address such as 192.168.1.1.
    (re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), "IPV4_ADDRESS"),
    (re.compile(r"\d\d\d-\d\d-\d\d\d\d"), "SSN"),
    (re.compile(r"[^\s]+,\s+[A-Z]{2}\s+[\d]{5}(-[\d]+)?"), "CITY_STATE_ZIP"),
    (re.compile(r"(\()?\d\d\d(\))?[\.\s-][\d]+[\.\s-][\d]+"), "US_PHONE_NUMBER"),
    # The dot is escaped so only a literal "www." prefix counts as a site.
    (re.compile(r"www\.[^\s]+"), "HTTP_URI"),
    (re.compile(r"\$[\d,\.]+"), "US_CURRENCY"),
    # DD/MM/20YY at the end of a line, or DD/MM/20YY HH:MM:SS at the start of
    # a line. MULTILINE makes the anchors apply per line, because the pattern
    # is run against the whole file rather than one line at a time.
    (
        re.compile(
            r"(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)",
            re.MULTILINE,
        ),
        "DATETIME",
    ),
    # H:MM with optional seconds, fraction, am/pm marker and +/-HHMM offset.
    (re.compile(r"[\d]+:\d\d(:\d\d(\.\d+)?)?(\s*[aApP]\.?([mM]\.?)?)?(\s*[+-]\d{4})?"), "TIME"),
    # Two to five consecutive capitalised words (with a few prefix forms).
    (re.compile(r"\b([A-Z]{1}[a-z]{1,30}[- ]{0,1}|[A-Z]{1}[- \']{1}[A-Z]{0,1} [a-z]{1,30}[- ]{0,1}|[a-z]{1,2}[ -\']{1}[A-Z]{1}[a-z]{1,30}){2,5}"), "NAME"),
    # Catch-all for any remaining run of two or more digits; must stay last.
    (re.compile(r"\d[\d]+"), "NUMBER"),
]
# Ordered (pattern, token) pairs for whole-line email artefacts.
#
# Every pattern is anchored with ^...$ and compiled with MULTILINE, because
# the rules are applied to the full file contents in one pass; without the
# flag the anchors would only match at the very start/end of the file.
replacement_email = [
    (re.compile(r"^To:.+$", re.MULTILINE), "EMAIL_TO"),
    # Was labelled EMAIL_TO, which made senders and recipients
    # indistinguishable in the redacted output.
    (re.compile(r"^From:.+$", re.MULTILINE), "EMAIL_FROM"),
    (re.compile(r"^Date:.+$", re.MULTILINE), "EMAIL_DATE"),
    (re.compile(r"^Url:.+$", re.MULTILINE), "EMAIL_MESSAGE_URL"),
    (re.compile(r"^>.*$", re.MULTILINE), " "),  # quoted reply lines
]
def redact(text, rules):
    """Return *text* after applying every (pattern, token) pair in *rules*.

    Rules are applied in order, each one to the output of the previous one,
    so earlier rules take precedence over later ones.
    """
    for pattern, token in rules:
        text = pattern.sub(token, text)
    return text


def main(argv):
    """Redact the file named by ``argv[1]`` and print the result to stdout.

    Returns a process exit status: 0 on success, 2 on a usage error.
    """
    if len(argv) != 2:
        sys.stderr.write("usage: redact.py inputfile\n")
        return 2

    # Decode to str so the str-based patterns can be applied. Undecodable
    # bytes are replaced rather than raising: mail corpora mix encodings and
    # this is only a best-effort first pass.
    with open(argv[1], encoding="utf-8", errors="replace") as fd:
        text = fd.read()

    # Header/quote rules go first, while the lines are still intact.
    text = redact(text, replacement_email)
    text = redact(text, replacement_regex)
    print(text)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment