Created
September 12, 2015 18:08
-
-
Save josephlewis42/14b882e4d094c0cead54 to your computer and use it in GitHub Desktop.
A first pass redaction script for basic information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
'''redact.py

A very dumb redaction script that will remove basic versions of the following:

- Email addresses
- Links
- US Social Security numbers
- US city/state/zip combinations
- US phone numbers
- Bare www.example.com style site addresses (no scheme)
- US currency
- Dates and date-times
- IPv4 addresses
- Times
- Names
- Numbers
- Various email headers and quoted text

This should only be used as a first pass over data you want to redact.
In this case it was for troves of email for a spam/ham filter.

Usage:
    redact.py inputfile    # prints the redacted text to stdout
'''
import codecs
import re
import sys

from unidecode import unidecode
# Ordered (compiled pattern, replacement label) pairs. They are applied one
# after another to the whole text, so ORDER MATTERS: an earlier pattern can
# consume text a later one would otherwise match. Specific patterns therefore
# come before generic ones, and the catch-all NUMBER pattern comes last.
replacement_regex = [
    # Email addresses such as user.name+tag@example.co.uk.
    (re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"), "EMAIL_ADDRESS"),
    # http:// and https:// links, optionally preceded by "<".
    (re.compile(r"(<)?http(s)?://[^\s]+"), "HTTP_URI"),
    # US Social Security numbers in ddd-dd-dddd form.
    (re.compile(r"\d\d\d-\d\d-\d\d\d\d"), "SSN"),
    # "City, ST 12345" / "City, ST 12345-6789" (only the last word of the city).
    (re.compile(r"[^\s]+,\s+[A-Z]{2}\s+[\d]{5}(-[\d]+)?"), "CITY_STATE_ZIP"),
    # IPv4 addresses. Must run BEFORE the phone pattern: "192.168.1.1" also
    # looks like "ddd.ddd.d" to the phone regex, which would otherwise leave
    # "US_PHONE_NUMBER.1" behind and the address would never be labelled.
    (re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), "IPV4_ADDRESS"),
    # US phone numbers: three digits (optionally parenthesised) followed by
    # two more digit groups separated by ".", "-" or whitespace.
    (re.compile(r"(\()?\d\d\d(\))?[\.\s-][\d]+[\.\s-][\d]+"), "US_PHONE_NUMBER"),
    # Bare "www.example.com" sites. The dot is escaped so that words merely
    # starting with "www" (e.g. "wwwabc") are not redacted.
    (re.compile(r"www\.[^\s]+"), "HTTP_URI"),
    # Dollar amounts such as $1,234.56.
    (re.compile(r"\$[\d,\.]+"), "US_CURRENCY"),
    # dd/mm/20yy dates, optionally followed by " HH:MM:SS". The pattern is
    # deliberately unanchored so dates are redacted anywhere in the text, and
    # the time is part of the same match so a date-time yields ONE label.
    (re.compile(r"(?:[0-2]\d|3[01])/(?:0\d|1[0-2])/20\d{2}(?:\s(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d)?"), "DATETIME"),
    # Clock times: H:MM[:SS[.fff]] with optional am/pm marker and +hhmm offset.
    (re.compile(r"[\d]+:\d\d(:\d\d(\.\d+)?)?(\s*[aApP]\.?([mM]\.?)?)?(\s*[+-]\d{4})?"), "TIME"),
    # Runs of 2-5 capitalised name-like words (with simple particle and
    # initial forms). The separator class in the last alternative lists the
    # three literal characters "-", " " and "'"; it must not be written as
    # "[ -']", which is a range that also admits !"#$%&.
    (re.compile(r"\b([A-Z]{1}[a-z]{1,30}[- ]{0,1}|[A-Z]{1}[- \']{1}[A-Z]{0,1} [a-z]{1,30}[- ]{0,1}|[a-z]{1,2}[- \']{1}[A-Z]{1}[a-z]{1,30}){2,5}"), "NAME"),
    # Any remaining run of two or more digits.
    (re.compile(r"\d[\d]+"), "NUMBER"),
]
# Ordered (compiled pattern, replacement label) pairs for whole-line email
# artifacts: header lines and ">"-quoted reply text. Every pattern is anchored
# with ^...$ and compiled with re.MULTILINE so the anchors match at each line
# of the message; without the flag they only match the start/end of the
# entire text and nothing is redacted in a multi-line file.
replacement_email = [
    (re.compile(r"^To:.+$", re.MULTILINE), "EMAIL_TO"),
    # The sender line gets its own label (it was previously labelled EMAIL_TO).
    (re.compile(r"^From:.+$", re.MULTILINE), "EMAIL_FROM"),
    (re.compile(r"^Date:.+$", re.MULTILINE), "EMAIL_DATE"),
    (re.compile(r"^Url:.+$", re.MULTILINE), "EMAIL_MESSAGE_URL"),
    # Quoted reply lines are blanked to a single space rather than labelled.
    (re.compile(r"^>.*$", re.MULTILINE), " "),
]
def redact(text: str) -> str:
    """Return *text* with every known pattern replaced by its label.

    The whole-line email rules (``replacement_email``) run first, then the
    general rules (``replacement_regex``), each list in its declared order.
    """
    for pattern, label in replacement_email:
        text = pattern.sub(label, text)
    for pattern, label in replacement_regex:
        text = pattern.sub(label, text)
    return text


def main(argv=None) -> int:
    """Redact the file named by ``argv[1]`` and print the result to stdout.

    Args:
        argv: Argument vector in ``sys.argv`` form; defaults to ``sys.argv``.

    Returns:
        0 on success, 2 when no input file was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("usage: redact.py inputfile", file=sys.stderr)
        return 2

    # Decode to str explicitly: the patterns are str patterns, and a file
    # opened without an encoding via codecs.open yields bytes on Python 3.
    # Mail dumps often mix encodings, so undecodable bytes are replaced
    # instead of aborting the whole run.
    with open(argv[1], encoding="utf-8", errors="replace") as fd:
        text = fd.read()

    print(redact(text))
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment