Last active
December 22, 2022 13:43
-
-
Save iKunalChhabra/fccbac749aa33904d7df5adb4a51c737 to your computer and use it in GitHub Desktop.
simple pii identifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import country_list # pip install country_list | |
import phonenumbers # pip install phonenumbers | |
import pycountry # pip install pycountry | |
import langdetect # pip install langdetect | |
import whois # pip install python-whois | |
import re | |
import datetime | |
langdetect.DetectorFactory.seed = 0 | |
class PhoneNumber:
    """Analyse a string as a phone number and record the findings in ``info``.

    Keys written to ``self.info``:

    * ``phone_number`` - the stripped input string.
    * ``is_valid`` - result of ``phonenumbers.is_valid_number``; ``False``
      when the input could not be parsed at all.
    * ``error`` - text of the exception raised while parsing (only on failure).
    * ``country_number`` / ``national_number`` - ``country_code`` and
      ``national_number`` of the parsed number (valid numbers only).
    * ``country_code`` - region code from
      ``phonenumbers.region_code_for_number`` (valid numbers only).
    * ``country_name`` - ``pycountry`` name for that region code, or
      ``'Unknown'`` when ``pycountry`` has no entry for it.
    * ``inference_time_in_seconds`` - elapsed time of the whole analysis.
    """

    def __init__(self, phone_number):
        """Strip *phone_number* and run the full analysis immediately."""
        self.phone_number = phone_number.strip()
        self.info = {'phone_number': self.phone_number}
        self.__parse()

    def __parse_phone_number(self):
        """Parse the input and store its validity (or the parse error)."""
        try:
            # No default region is passed to parse().
            # NOTE(review): presumably only international-format input
            # ("+<country code> ...") can succeed here - confirm.
            self.__parsed_phone_number = phonenumbers.parse(self.phone_number)
            self.info['is_valid'] = phonenumbers.is_valid_number(self.__parsed_phone_number)
        except Exception as e:
            # Deliberately broad: the analysis is best-effort and reports
            # failures through ``info`` instead of raising to the caller.
            self.info['error'] = str(e).strip()
            # Always publish a verdict, like the other detectors do.
            self.info['is_valid'] = False

    def __split_number(self):
        """Record calling code and national number of a valid number."""
        if self.info.get('is_valid'):
            self.info['country_number'] = self.__parsed_phone_number.country_code
            self.info['national_number'] = self.__parsed_phone_number.national_number

    def __get_region_code(self):
        """Record the region code of a valid number."""
        if self.info.get('is_valid'):
            self.info['country_code'] = phonenumbers.region_code_for_number(self.__parsed_phone_number)

    def __get_country_name(self):
        """Translate the stored region code into a country name."""
        region_code = self.info.get('country_code')
        if not region_code:
            return
        try:
            country = pycountry.countries.get(alpha_2=region_code)
        except LookupError:
            # Same guard the Email class uses for this pycountry call.
            country = None
        # The lookup can come back empty; fall back instead of crashing on
        # ``None.name``.
        self.info['country_name'] = country.name if country is not None else 'Unknown'

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run every analysis step in order and time the whole run."""
        start_time = datetime.datetime.now()
        self.__parse_phone_number()
        self.__split_number()
        self.__get_region_code()
        self.__get_country_name()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class Language:
    """Detect the language of a text and record the findings in ``info``.

    Keys written to ``self.info``:

    * ``text`` - the stripped input string.
    * ``language_code`` - value returned by ``langdetect.detect``.
    * ``is_valid`` - ``True`` when detection succeeded, ``False`` otherwise.
    * ``error`` - text of the exception raised by detection (only on failure).
    * ``language_name`` - ``pycountry`` name for the first two characters of
      the language code, or ``'Unknown'`` when there is no such entry.
    * ``inference_time_in_seconds`` - elapsed time of the whole analysis.
    """

    def __init__(self, text):
        """Strip *text* and run the full analysis immediately."""
        self.text = text.strip()
        self.info = {'text': self.text}
        self.__parse()

    def __parse_language(self):
        """Run language detection and store the code or the failure."""
        try:
            self.info['language_code'] = langdetect.detect(self.text)
            self.info['is_valid'] = True
        except Exception as e:
            # Deliberately broad: the analysis is best-effort and reports
            # failures through ``info`` instead of raising to the caller.
            self.info['error'] = str(e).strip()
            self.info['is_valid'] = False

    def __get_language_name(self):
        """Translate the detected language code into a language name."""
        language_code = self.info.get('language_code')
        if not language_code:
            return
        try:
            # Only the first two characters are used as the alpha-2 key, so
            # any longer code is looked up by its two-letter prefix.
            self.info['language_name'] = pycountry.languages.get(alpha_2=language_code[:2]).name
        except (AttributeError, LookupError):
            # AttributeError: the lookup returned nothing (``None.name``).
            # LookupError: same guard the Email class uses for pycountry.
            self.info['language_name'] = 'Unknown'

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run every analysis step in order and time the whole run."""
        start_time = datetime.datetime.now()
        self.__parse_language()
        self.__get_language_name()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class Email:
    """Check whether a string looks like an e-mail address and enrich it.

    Keys written to ``self.info``:

    * ``email`` - the stripped input string.
    * ``is_valid`` - ``True`` when the whole string has the shape
      ``<no '@'>@<no '@'>.<no '@'>``, ``False`` otherwise.
    * ``parsed_domain`` - the part after the ``@`` (valid input only).
    * every key of the WHOIS record for ``parsed_domain`` (valid input only;
      merged in with ``dict.update``).
    * ``country_name`` - ``pycountry`` name for the record's ``country``
      value, or ``'Unknown'`` when it cannot be resolved.
    * ``error`` - text of the exception if the WHOIS lookup failed.
    * ``inference_time_in_seconds`` - elapsed time of the whole analysis.

    Constructing an instance with a valid-looking address performs a WHOIS
    lookup via ``whois.whois``.
    """

    def __init__(self, email):
        """Strip *email* and run the full analysis immediately."""
        self.email = email.strip()
        self.info = {'email': self.email}
        self.__parse()

    def __parse_email(self):
        """Validate the address shape and extract the domain."""
        # fullmatch, not match: match() only anchors at the start, so input
        # with a second '@' (e.g. "a@b.c@d") used to pass on its prefix and
        # yield a truncated domain.
        if re.fullmatch(r"[^@]+@[^@]+\.[^@]+", self.email):
            self.info['parsed_domain'] = self.email.split('@')[1]
            self.info['is_valid'] = True
        else:
            self.info['is_valid'] = False

    def __get_domain_info(self):
        """Merge the WHOIS record of the parsed domain into ``info``."""
        domain = self.info.get('parsed_domain')
        if not domain:
            return
        try:
            self.info.update(whois.whois(domain))
        except (whois.parser.PywhoisError, OSError) as e:
            # OSError covers socket-level failures (DNS, timeout, reset) so
            # that an unreachable WHOIS server is reported, not raised.
            self.info['error'] = str(e).strip()
            return
        try:
            self.info['country_name'] = pycountry.countries.get(alpha_2=self.info['country']).name
        except (AttributeError, LookupError):
            # LookupError includes KeyError (no 'country' key in the record);
            # AttributeError covers a lookup that produced no country object.
            self.info['country_name'] = 'Unknown'

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run every analysis step in order and time the whole run."""
        start_time = datetime.datetime.now()
        self.__parse_email()
        self.__get_domain_info()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class Address: | |
def __init__(self, address): | |
self.address = address.strip() | |
self.info = {'address': self.address} | |
self.__parse() | |
def __parse_address(self): | |
address = self.address.lower() | |
address_word_list = address.split() | |
for language in country_list.available_languages(): | |
for country_code, country_name in country_list.countries_for_language(language): | |
for word in address_word_list: | |
if (word == country_name.lower()) or (word == country_code.lower()): | |
self.info['country_code'] = country_code | |
self.info['country_name'] = pycountry.countries.get(alpha_2=country_code).name | |
self.info['country_detected'] = country_name | |
self.info['is_valid'] = True | |
return | |
else: | |
self.info['is_valid'] = False | |
def __str__(self): | |
return str(self.info) | |
def __repr__(self): | |
return self.__str__() | |
def __parse(self): | |
start_time = datetime.datetime.now() | |
self.__parse_address() | |
end_time = datetime.datetime.now() | |
self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds() | |
class PII:
    """Classify a text by running it through the detectors in a fixed order.

    The detectors are tried as e-mail, phone number, address, then language;
    the first one whose ``info`` reports ``is_valid`` wins. Its ``info`` dict
    becomes ``self.info`` and is extended with:

    * ``type`` - ``'email'``, ``'phone_number'``, ``'address'``,
      ``'language'`` or ``'unknown'``.
    * ``is_pii`` - ``True`` when any detector accepted the text, ``False``
      for ``'unknown'``.
    * ``total_inference_time_in_seconds`` - elapsed time across all
      detectors that were tried.
    * ``input`` - the original, unmodified text.

    NOTE(review): a text that is only recognised by the language detector is
    still reported with ``is_pii = True`` - confirm this is intended.
    """

    def __init__(self, text):
        """Store *text* and classify it immediately."""
        self.text = text
        self.info = {}
        self.__parse()
        self.info['input'] = self.text

    def __adopt_if_valid(self, detector, label):
        """Run *detector* on the text; take over its result when valid.

        Returns ``True`` when the detector accepted the text, in which case
        ``self.info`` is the detector's own ``info`` dict tagged with *label*.
        """
        outcome = detector(self.text).info
        if not outcome.get('is_valid'):
            return False
        self.info = outcome
        self.info['type'] = label
        self.info['is_pii'] = True
        return True

    def __is_email(self):
        return self.__adopt_if_valid(Email, 'email')

    def __is_phone_number(self):
        return self.__adopt_if_valid(PhoneNumber, 'phone_number')

    def __is_address(self):
        return self.__adopt_if_valid(Address, 'address')

    def __detect_language(self):
        return self.__adopt_if_valid(Language, 'language')

    def __parse_pii(self):
        """Try each detector in order; fall back to ``'unknown'``."""
        checks = (
            self.__is_email,
            self.__is_phone_number,
            self.__is_address,
            self.__detect_language,
        )
        # any() stops at the first detector that accepts the text, so later
        # detectors are not run at all.
        if any(check() for check in checks):
            return
        self.info['type'] = 'unknown'
        self.info['is_pii'] = False

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run the classification and time it."""
        started = datetime.datetime.now()
        self.__parse_pii()
        finished = datetime.datetime.now()
        self.info['total_inference_time_in_seconds'] = (finished - started).total_seconds()
if __name__ == '__main__':
    # Demo: classify a handful of sample inputs and print each result dict.
    samples = (
        '中国',
        'test@gmail.com',
        '+91 9999999999',
        'Hello, How are you?',
        '33',
    )
    for sample in samples:
        print(PII(sample).info)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment