#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File : redaction.py
@Time : 2023/02/09 20:57:22
@Author : Shanto Roy
@Version : 1.0
@Contact : sroy10@uh.edu
@License : (C)Copyright 2020-2021, Shanto Roy
@Desc : Class that replaces real information with fake, believable values.
'''
from faker import Faker
import re
from anonymization import Anonymization, AnonymizerChain
from anonymization import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer
# from datefinder import find_dates
import datetime
import spacy
import usaddress
from pyap import parse
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import random
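# Note: the NLTK helpers used below (sent_tokenize, pos_tag, ne_chunk) and the
# spaCy model require one-time downloads before this script will run, e.g.:
#   import nltk
#   nltk.download("punkt")
#   nltk.download("averaged_perceptron_tagger")
#   nltk.download("maxent_ne_chunker")
#   nltk.download("words")
# and from a shell:
#   python -m spacy download en_core_web_sm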
fake = Faker()
# this class moderates our contents
class DataRedaction:
    def __init__(self, text):
        self.text = text
        # Load the small English spaCy model
        self.nlp = spacy.load("en_core_web_sm")
    @staticmethod
    def find_dates(text):
        # regex patterns for the date formats handled by this class
        date_formats = [
            r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",   # dd/mm/yyyy or dd-mm-yyyy
            r"\b\d{1,2} \b\w{3} \d{2,4}\b",         # dd MMM yy or dd MMM yyyy
            r"\b\d{1,2} \b\w{3} \d{4}\b",           # dd MMM yyyy
            r"\b\w{3} \d{1,2}, \d{4}\b",            # MMM dd, yyyy
        ]
        dates = []
        for pattern in date_formats:
            dates.extend(re.findall(pattern, text, re.IGNORECASE))
        return dates
    # change date within text
    def changeDate(self, given_start_date=datetime.datetime(1980, 1, 1), given_end_date=datetime.datetime(2023, 12, 31)):
        # Find all dates in the text using datefinder
        # dates = list(find_dates(self.text))
        # # Replace each date with a new date generated by Faker
        # for date_obj in dates:
        #     old_date = date_obj.strftime('%Y-%m-%d')
        #     new_date = fake.date_between(start_date=given_start_date, end_date=given_end_date).strftime('%Y-%m-%d')
        #     self.text = self.text.replace(old_date, new_date)
        # Find all dates in the text using regular expressions
        # dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\b|\b\d{4}-\d{2}-\d{2}\b', self.text)
        dates = self.find_dates(self.text)
        # Replace each date with a new date generated by Faker
        for date_string in dates:
            old_date = date_string
            new_date = fake.date_between(start_date=given_start_date, end_date=given_end_date).strftime('%Y-%m-%d')
            if 'T' in old_date:
                new_date = new_date + 'T12:00:00'
            self.text = self.text.replace(old_date, new_date)
        return self.text
    # change named entities -> person names, company names
    def changeName(self):
        # terms that should not be mistaken for person or company names
        excludewordlist = ["SSN"]
        # Process the text with spaCy
        doc = self.nlp(self.text)
        # Anonymize people, organizations, and places
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE", "FAC"]:
                if ent.label_ == "PERSON":
                    anonymized_name = fake.name()
                elif ent.label_ in ["ORG", "FAC"]:
                    anonymized_name = fake.company() + " Inc."
                else:
                    # GPE (cities, states, countries): substitute a fake city
                    anonymized_name = fake.city()
                if ent.text.upper() not in excludewordlist:
                    self.text = self.text.replace(ent.text, anonymized_name)
        return self.text
    # change address
    def changeAddress(self):
        # # Process the text with spaCy
        # doc = self.nlp(self.text)
        # # Iterate over the entities in the document
        # for ent in doc.ents:
        #     # Check if the entity is a location/address
        #     if ent.label_ == "GPE" or ent.label_ == "LOC" or ent.label_ == "FAC" or ent.label_ == "ORG":
        #         anonymized_name = fake.company()
        #         self.text = self.text.replace(ent.text, anonymized_name)
        addresses = parse(self.text, country='US')
        # print(addresses)
        for address in addresses:
            # Faker addresses contain a newline; keep the replacement on one line
            anonymized_address = fake.address().replace("\n", ", ")
            # pyap returns Address objects, so convert to a string before replacing
            self.text = self.text.replace(str(address), anonymized_address)
        return self.text
"""
# change email
def changeEmail(self):
# # Use a regex pattern to find email addresses in the text
# email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
# emails = email_regex.findall(self.text)
# # Anonymize each email
# for email in emails:
# anonymized_email = fake.email()
# self.text = self.text.replace(email, anonymized_email)
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
name_pattern = re.compile(r"(?i)(?!my|his|her)\b[A-Z][a-z]+\b")
# # Process the text with spacy
# doc = self.nlp(self.text)
# # Anonymize people and company names
# for ent in doc.ents:
# if ent.label_ == "PERSON":
# Split the text into sentences
sentences = sent_tokenize(self.text)
names = []
new_sentence_list = []
for sentence in sentences:
sentence_names = re.findall(name_pattern, sentence)
if sentence_names:
names.extend(sentence_names)
sentence_emails = re.findall(email_pattern, sentence)
print(names,sentence_emails)
if sentence_emails:
for email in sentence_emails:
if names:
print(names)
if not "USA" in names[-1]:
if " " in names[-1]:
print(names[-1])
firstname = names[-1].split()[0]
any_num = random.randint(0, 1000)
fullPrefix = firstname+str(any_num)
fake_email = f"{fullPrefix}.{fake.free_email_domain()}"
else:
if " " in names[-2]:
print(names[-2])
firstname = names[-2].split()[0]
any_num = random.randint(0, 1000)
fullPrefix = firstname+str(any_num)
fake_email = f"{fullPrefix}.{fake.free_email_domain()}"
sentence = sentence.replace(email, fake_email)
names.pop()
else:
anonymized_email = fake.email()
sentence = sentence.replace(email, anonymized_email)
new_sentence_list.append(sentence)
self.text = " ".join(new_sentence_list)
return self.text
"""
    # revised version of changeEmail: uses NLTK NER to find person names and
    # builds believable fake addresses from the most recently seen full name
    def changeEmail2(self):
        # Split the text into sentences
        sentences = sent_tokenize(self.text)
        names = []
        new_sentence_list = []
        # regex for email addresses
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        for sentence in sentences:
            sentence_names = []
            # Tokenize the sentence into words
            tokens = nltk.word_tokenize(sentence)
            # Tag the tokens with their part-of-speech
            tagged = nltk.pos_tag(tokens)
            # Use the named entity recognizer to extract entities from the tagged tokens
            entities = ne_chunk(tagged)
            # Iterate through the entities and extract the person names
            for entity in entities:
                if hasattr(entity, 'label') and entity.label() == 'PERSON':
                    name = ' '.join(c[0] for c in entity.leaves())
                    sentence_names.append(name)
            if sentence_names:
                names.extend(sentence_names)
            sentence_emails = re.findall(email_pattern, sentence)
            # print(names, sentence_emails)
            if sentence_emails:
                for email in sentence_emails:
                    fake_email = None
                    if names and "USA" not in names[-1]:
                        if " " in names[-1]:
                            # build a believable address from the latest full name
                            firstname = names[-1].split()[0]
                            any_num = random.randint(0, 1000)
                            full_prefix = firstname + str(any_num)
                            fake_email = f"{full_prefix}@{fake.free_email_domain()}"
                            names.pop()
                        elif len(names) >= 2 and " " in names[-2]:
                            firstname = names[-2].split()[0]
                            any_num = random.randint(0, 1000)
                            full_prefix = firstname + str(any_num)
                            fake_email = f"{full_prefix}@{fake.free_email_domain()}"
                            names.pop()
                            names.pop()
                    if fake_email is None:
                        # fall back to a fully random address when no usable name is available
                        fake_email = fake.email()
                    sentence = sentence.replace(email, fake_email)
            new_sentence_list.append(sentence)
        self.text = " ".join(new_sentence_list)
        return self.text
    # change mobile numbers
    def changeMobileNumber(self):
        # Search for phone number patterns in the text
        phone_number_pattern = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
        phone_numbers = re.findall(phone_number_pattern, self.text)
        # Replace the phone numbers with fake ones generated by Faker
        for phone_number in phone_numbers:
            fake_phone_number = fake.phone_number()
            self.text = self.text.replace(phone_number, fake_phone_number)
        return self.text
    # change SSN
    def changeSSN(self):
        ssn_regex = re.compile(r"\d{3}-\d{2}-\d{4}")
        self.text = ssn_regex.sub(lambda x: fake.ssn(), self.text)
        return self.text
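
# Example usage: a minimal sketch, not part of the original class; the sample
# text and method ordering below are illustrative only. Emails and addresses
# are replaced first so they are derived from, and parsed against, the
# original names and street text; names, dates, phone numbers, and SSNs follow.
if __name__ == "__main__":
    sample = (
        "John Smith joined Acme Corp on 12/05/2019. "
        "Reach him at john.smith@example.com or (713) 555-0142. "
        "His SSN is 123-45-6789 and he lives at 225 E 95th St, New York, NY 10128."
    )
    redactor = DataRedaction(sample)
    redactor.changeEmail2()
    redactor.changeAddress()
    redactor.changeName()
    redactor.changeDate()
    redactor.changeMobileNumber()
    redactor.changeSSN()
    print(redactor.text)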