Created
March 2, 2023 19:42
-
-
Save shantoroy/d99825be17e667fafd542dee10999725 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*-coding:utf-8 -*- | |
''' | |
@File : redaction.py | |
@Time : 2023/02/09 20:57:22 | |
@Author : Shanto Roy | |
@Version : 1.0 | |
@Contact : sroy10@uh.edu | |
@License : (C)Copyright 2020-2021, Shanto Roy | |
@Desc : Class that replaces real information with believable fake values.
''' | |
from faker import Faker | |
import re | |
from dateutil.parser import parse | |
from anonymization import Anonymization, AnonymizerChain | |
from anonymization import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer | |
# from datefinder import find_dates | |
import datetime | |
import spacy | |
import usaddress | |
from pyap import parse | |
import nltk | |
from nltk import sent_tokenize | |
from nltk import word_tokenize, pos_tag, ne_chunk | |
import random | |
fake = Faker() | |
# This class redacts sensitive content by swapping it for believable fake values.
class DataRedaction:
    """Replace sensitive details in free text with believable fake values.

    Every ``change*`` method rewrites ``self.text`` in place and returns the
    updated text, so the methods can be called one after another on the same
    instance. Replacement values come from the module-level ``fake`` (Faker)
    instance.

    An earlier ``changeEmail`` implementation that was only kept as a disabled
    string literal has been dropped; ``changeEmail2`` is the maintained one.
    """

    # Month names are spelled out so that arbitrary three-character words
    # ("10 for 20", "12 345 6789") are no longer mistaken for dates.
    _MONTH = (
        r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
        r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?"
        r"|Dec(?:ember)?)"
    )

    # One alternation, tried left to right at each position, so a piece of
    # text is matched (and later replaced) by at most one date format.
    _DATE_RE = re.compile(
        "|".join(
            (
                # ISO timestamp; named so changeDate can keep a time component.
                r"(?P<timestamp>\b\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}(?!\d))",
                r"\b\d{4}-\d{1,2}-\d{1,2}\b",  # yyyy-mm-dd
                r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",  # dd/mm/yyyy or dd-mm-yyyy
                rf"\b\d{{1,2}} {_MONTH} \d{{2,4}}\b",  # dd MMM yyyy
                rf"\b{_MONTH} \d{{1,2}}, \d{{4}}\b",  # MMM dd, yyyy
            )
        ),
        re.IGNORECASE,
    )

    # The top-level domain class is letters only (the original class
    # "[A-Z|a-z]" also accepted a literal "|").
    _EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

    # Digit guards on both sides keep the phone and SSN patterns from rewriting
    # a slice of a longer digit run (card numbers, order ids, ...). An optional
    # "+1"/"1" country prefix is consumed so it is not left dangling.
    _PHONE_RE = re.compile(
        r"(?<!\d)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)"
    )
    _SSN_RE = re.compile(r"(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)")

    # spaCy pipeline shared by all instances; loaded on first construction.
    _nlp_model = None

    def __init__(self, text):
        """Store the text to redact and attach the small English spaCy model.

        Args:
            text: The text whose sensitive details will be replaced.
        """
        self.text = text
        # Loading the model is slow, so it is done once and then reused.
        if DataRedaction._nlp_model is None:
            DataRedaction._nlp_model = spacy.load("en_core_web_sm")
        self.nlp = DataRedaction._nlp_model

    @staticmethod
    def _substitute(pattern, text, make_fake):
        """Replace every match of ``pattern`` in ``text`` in a single pass.

        Identical matched strings receive the same replacement, and text that
        has already been replaced is never scanned again.

        Args:
            pattern: Compiled regular expression to search for.
            text: Text to rewrite.
            make_fake: Callable taking the match object and returning the
                replacement string; called once per distinct matched string.

        Returns:
            The rewritten text.
        """
        replacements = {}

        def _swap(match):
            original = match.group(0)
            if original not in replacements:
                replacements[original] = make_fake(match)
            return replacements[original]

        return pattern.sub(_swap, text)

    @staticmethod
    def find_dates(text):
        """Return the date strings found in ``text``, in order of appearance.

        Recognised formats: ISO timestamps (``yyyy-mm-ddThh:mm:ss``), ISO
        dates (``yyyy-mm-dd``), ``dd/mm/yyyy`` / ``dd-mm-yyyy``,
        ``dd MMM yyyy`` and ``MMM dd, yyyy`` (abbreviated or full English
        month names, case-insensitive).

        Args:
            text: Text to scan.

        Returns:
            A list with one entry per non-overlapping occurrence.
        """
        return [match.group(0) for match in DataRedaction._DATE_RE.finditer(text)]

    # change date within text
    def changeDate(
        self,
        given_start_date=datetime.datetime(1980, 1, 1),
        given_end_date=datetime.datetime(2023, 12, 31),
    ):
        """Replace every date in the text with a random ``YYYY-MM-DD`` date.

        ISO timestamps keep a time component (fixed to ``T12:00:00``).

        Args:
            given_start_date: Earliest date that may be generated.
            given_end_date: Latest date that may be generated.

        Returns:
            The updated text.
        """

        def _fake_date(match):
            new_date = fake.date_between(
                start_date=given_start_date, end_date=given_end_date
            ).strftime("%Y-%m-%d")
            # Decide from the matched format, not from a "T" in the string:
            # upper-case month names such as "OCT" also contain a "T".
            if match.group("timestamp"):
                new_date += "T12:00:00"
            return new_date

        self.text = self._substitute(self._DATE_RE, self.text, _fake_date)
        return self.text

    # change name entities -> person name, company name, place name
    def changeName(self):
        """Replace person, organisation, facility and place names.

        Entities are detected with spaCy. ``PERSON`` becomes a fake full name,
        ``ORG`` and ``FAC`` become a fake company name followed by " Inc.",
        and ``GPE`` becomes a fake city name.

        Returns:
            The updated text.
        """
        # Terms that must survive even if the model tags them as a name.
        excluded_terms = {"SSN"}

        def _fake_company():
            return fake.company() + " Inc."

        # Every accepted label has its own generator, so no entity can pick up
        # a value left over from the previous one.
        generators = {
            "PERSON": fake.name,
            "ORG": _fake_company,
            "FAC": _fake_company,
            "GPE": fake.city,
        }

        doc = self.nlp(self.text)
        already_replaced = set()
        for ent in doc.ents:
            make_fake = generators.get(ent.label_)
            original = ent.text
            if make_fake is None or not original:
                continue
            if original.upper() in excluded_terms or original in already_replaced:
                continue
            already_replaced.add(original)
            self.text = self.text.replace(original, make_fake())
        return self.text

    # change address
    def changeAddress(self):
        """Replace US postal addresses with fake ones.

        Returns:
            The updated text.
        """
        # Imported here under an unambiguous name: at module level ``parse``
        # is imported from both dateutil and pyap, so which one the bare name
        # refers to depends on import order.
        from pyap import parse as parse_addresses

        for address in parse_addresses(self.text, country="US"):
            # pyap yields Address objects; str() gives the matched address
            # text, which is what str.replace needs.
            # NOTE(review): pyap may normalise whitespace in that text, in
            # which case an oddly spaced address is left unreplaced - confirm.
            matched_text = str(address)
            if not matched_text:
                continue
            # Faker addresses span two lines; keep the replacement inline.
            fake_address = fake.address().replace("\n", ", ")
            self.text = self.text.replace(matched_text, fake_address)
        return self.text

    @staticmethod
    def _person_names(sentence):
        """Return the PERSON entities NLTK finds in ``sentence``, in order."""
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        return [
            " ".join(token for token, _tag in chunk.leaves())
            for chunk in ne_chunk(tagged_tokens)
            if hasattr(chunk, "label") and chunk.label() == "PERSON"
        ]

    @staticmethod
    def _fake_email(recent_names):
        """Build a fake email address, preferring the latest person name.

        Names are consumed from the end of ``recent_names`` (the list is
        mutated). Entries containing "USA" are discarded because NLTK tends
        to tag the country as a person. Without a usable name a fully random
        address is returned.
        """
        while recent_names:
            name_parts = recent_names.pop().split()
            if not name_parts or any("USA" in part for part in name_parts):
                continue
            prefix = f"{name_parts[0]}{random.randint(0, 1000)}"
            return f"{prefix}@{fake.free_email_domain()}"
        return fake.email()

    # revised version of change emails
    def changeEmail2(self):
        """Replace email addresses, tying them to nearby person names.

        The text is processed sentence by sentence. Person names seen so far
        are remembered, and each email address is replaced with
        ``<first name><number>@<free mail domain>`` built from the most recent
        unused name, or with a random address when no name is available.

        Sentences are re-joined with single spaces, so the original line
        breaks between sentences are not preserved.

        Returns:
            The updated text.
        """
        recent_names = []
        rewritten_sentences = []
        for sentence in sent_tokenize(self.text):
            recent_names.extend(self._person_names(sentence))
            rewritten_sentences.append(
                self._substitute(
                    self._EMAIL_RE,
                    sentence,
                    lambda _match: self._fake_email(recent_names),
                )
            )
        self.text = " ".join(rewritten_sentences)
        return self.text

    # change mobile numbers
    def changeMobileNumber(self):
        """Replace US-style phone numbers with fake ones.

        Returns:
            The updated text.
        """
        self.text = self._substitute(
            self._PHONE_RE, self.text, lambda _match: fake.phone_number()
        )
        return self.text

    # change SSN
    def changeSSN(self):
        """Replace social security numbers (``ddd-dd-dddd``) with fake ones.

        Returns:
            The updated text.
        """
        self.text = self._substitute(
            self._SSN_RE, self.text, lambda _match: fake.ssn()
        )
        return self.text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment