# Created March 2, 2023 19:42
#!/usr/bin/env python
# -*-coding:utf-8 -*-
"""
@File :
@Time : 2023/02/09 20:57:22
@Author : Shanto Roy
@Version : 1.0
@Contact :
@License : (C)Copyright 2020-2021, Shanto Roy
@Desc : Class that replaces real information with believable fake values.
"""
from faker import Faker
import re
from dateutil.parser import parse
from anonymization import Anonymization, AnonymizerChain
from anonymization import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer
# from datefinder import find_dates
import datetime
import spacy
import usaddress
from pyap import parse
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import random
# Module-level Faker instance; the DataRedaction methods call it to generate
# the replacement values (dates, addresses, phone numbers, SSNs, ...).
fake = Faker()
# This class moderates our content.
class DataRedaction:
    """Replace sensitive details in a text with believable fake values.

    Every ``change*`` method rewrites ``self.text`` and returns the updated
    text, so the methods can be applied one after another on one instance.
    Replacement values come from the module-level ``fake`` (Faker) instance.
    """

    # ISO 8601 date-time, e.g. 2023-02-09T20:57:22.
    _ISO_DATETIME_PATTERN = r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\b"

    # Date layouts recognised by find_dates/changeDate. Order matters: the
    # alternatives are tried left to right at each position, so the ISO
    # date-time must come before the plain ISO date.
    _DATE_PATTERNS = (
        _ISO_DATETIME_PATTERN,
        r"\b\d{4}-\d{2}-\d{2}\b",              # yyyy-mm-dd
        r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",  # dd/mm/yyyy or dd-mm-yyyy
        r"\b\d{1,2} \b\w{3} \d{2,4}\b",        # dd MMM yyyy
        r"\b\w{3} \d{1,2}, \d{4}\b",           # MMM dd, yyyy
    )
    _DATE_RE = re.compile("|".join(_DATE_PATTERNS), re.IGNORECASE)
    _ISO_DATETIME_RE = re.compile(_ISO_DATETIME_PATTERN, re.IGNORECASE)

    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Capitalised words, skipping the pronouns "my", "his" and "her" in any
    # casing. The case-insensitive flag is scoped to the look-ahead only so
    # that the capitalisation test on the word itself stays effective.
    _NAME_RE = re.compile(r"(?!(?i:my|his|her)\b)\b[A-Z][a-z]+\b")

    _PHONE_RE = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
    _SSN_RE = re.compile(r"\d{3}-\d{2}-\d{4}")

    def __init__(self, text):
        """Store the text to redact and load the spaCy English pipeline.

        Args:
            text: The text whose sensitive details will be replaced.
        """
        self.text = text
        # Load the small English model
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _replace_consistently(pattern, text, make_replacement):
        """Substitute every match of *pattern* in a single pass.

        Identical matched strings receive the same replacement. Working in a
        single pass means freshly inserted fake values are never re-scanned
        and a match that is a substring of another match cannot corrupt it.

        Args:
            pattern: Compiled regular expression to search for.
            text: The text to rewrite.
            make_replacement: Callable taking the matched string and
                returning its replacement.

        Returns:
            The rewritten text.
        """
        cache = {}

        def _swap(match):
            original = match.group(0)
            if original not in cache:
                cache[original] = make_replacement(original)
            return cache[original]

        return pattern.sub(_swap, text)

    @staticmethod
    def find_dates(text):
        """Return the date-like substrings of *text* in order of appearance.

        Declared static so that both ``DataRedaction.find_dates(text)`` and
        ``self.find_dates(text)`` work.

        Args:
            text: The text to scan.

        Returns:
            A list of the matched date strings (empty when none are found).
        """
        return [match.group(0) for match in DataRedaction._DATE_RE.finditer(text)]

    # change date within text
    def changeDate(self, given_start_date=datetime.datetime(1980, 1, 1), given_end_date=datetime.datetime(2023, 12, 31)):
        """Replace every recognised date in ``self.text`` with a fake date.

        Fake dates are written as ``YYYY-MM-DD``; an ISO date-time keeps a
        time component (``T12:00:00``).

        Args:
            given_start_date: Earliest fake date that may be generated.
            given_end_date: Latest fake date that may be generated.

        Returns:
            The updated text.
        """
        def _new_date(old_date):
            new_date = fake.date_between(
                start_date=given_start_date, end_date=given_end_date
            ).strftime('%Y-%m-%d')
            # Only a real ISO date-time gets a time part; a bare 'T' test
            # would also fire on month abbreviations such as "OCT".
            if self._ISO_DATETIME_RE.fullmatch(old_date):
                new_date = new_date + 'T12:00:00'
            return new_date

        self.text = self._replace_consistently(self._DATE_RE, self.text, _new_date)
        return self.text

    # change name entities -> person name, company name
    def changeName(self):
        """Replace person and organisation names found by spaCy.

        PERSON entities become a fake person name; ORG and FAC entities
        become a fake company name followed by " Inc.". Other entity types
        (including GPE) are left untouched because no replacement is defined
        for them.

        Returns:
            The updated text.
        """
        # to not mistake important terms as person or company names
        excludewordlist = ["SSN"]
        # Process the text with spacy
        doc = self.nlp(self.text)
        for ent in doc.ents:
            if ent.text.upper() in excludewordlist:
                continue
            if ent.label_ == "PERSON":
                anonymized_name = fake.name()
            elif ent.label_ in ("ORG", "FAC"):
                anonymized_name = fake.company() + " Inc."
            else:
                continue
            self.text = self.text.replace(ent.text, anonymized_name)
        return self.text

    # change address
    def changeAddress(self):
        """Replace US postal addresses detected by pyap with fake addresses.

        Returns:
            The updated text.
        """
        addresses = parse(self.text, country='US')
        for address in addresses:
            # pyap yields address objects, while str.replace needs a string.
            # NOTE(review): assumes str(address) reproduces the matched text;
            # confirm against the installed pyap version.
            self.text = self.text.replace(str(address), fake.address())
        return self.text

    @staticmethod
    def _fake_email_for(names):
        """Build a fake email address from the most recently seen name.

        Uses the last collected name unless it contains "USA", in which case
        the one before it is used. Falls back to a fully random address when
        no usable name is available.

        Args:
            names: Names collected so far, oldest first.

        Returns:
            A fake email address string.
        """
        candidate = None
        if names:
            if "USA" not in names[-1]:
                candidate = names[-1]
            elif len(names) > 1:
                candidate = names[-2]
        parts = candidate.split() if candidate else []
        if not parts:
            return fake.email()
        firstname = parts[0]
        any_num = random.randint(0, 1000)
        fullPrefix = firstname + str(any_num)
        return f"{fullPrefix}@{fake.free_email_domain()}"

    def _redact_sentence_emails(self, sentence, names):
        """Return *sentence* with every email address replaced by a fake one."""
        return self._replace_consistently(
            self._EMAIL_RE, sentence, lambda _email: self._fake_email_for(names)
        )

    # change email
    def changeEmail(self):
        """Replace email addresses, naming them after nearby capitalised words.

        The text is split into sentences; capitalised words are collected as
        candidate names and the most recent one seeds each fake address. The
        sentences are re-joined with single spaces, so the original spacing
        between sentences is not preserved.

        Returns:
            The updated text.
        """
        names = []
        new_sentence_list = []
        for sentence in sent_tokenize(self.text):
            names.extend(self._NAME_RE.findall(sentence))
            new_sentence_list.append(self._redact_sentence_emails(sentence, names))
        self.text = " ".join(new_sentence_list)
        return self.text

    # revised version of change emails
    def changeEmail2(self):
        """Replace email addresses, naming them after NLTK PERSON entities.

        Works like ``changeEmail`` but collects candidate names with NLTK's
        named-entity chunker instead of a capitalised-word pattern. The
        sentences are re-joined with single spaces.

        Returns:
            The updated text.
        """
        names = []
        new_sentence_list = []
        for sentence in sent_tokenize(self.text):
            # Tokenize, tag with part-of-speech, then chunk named entities
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            for entity in ne_chunk(tagged):
                if hasattr(entity, 'label') and entity.label() == 'PERSON':
                    names.append(' '.join(c[0] for c in entity.leaves()))
            new_sentence_list.append(self._redact_sentence_emails(sentence, names))
        self.text = " ".join(new_sentence_list)
        return self.text

    # change mobile numbers
    def changeMobileNumber(self):
        """Replace phone numbers in ``self.text`` with fake ones.

        Returns:
            The updated text.
        """
        self.text = self._replace_consistently(
            self._PHONE_RE, self.text, lambda _number: fake.phone_number()
        )
        return self.text

    # change SSN
    def changeSSN(self):
        """Replace social security numbers (``ddd-dd-dddd``) with fake ones.

        Returns:
            The updated text.
        """
        self.text = self._replace_consistently(
            self._SSN_RE, self.text, lambda _ssn: fake.ssn()
        )
        return self.text
