Skip to content

Instantly share code, notes, and snippets.

@iKunalChhabra
Last active December 22, 2022 13:43
Show Gist options
  • Save iKunalChhabra/fccbac749aa33904d7df5adb4a51c737 to your computer and use it in GitHub Desktop.
Save iKunalChhabra/fccbac749aa33904d7df5adb4a51c737 to your computer and use it in GitHub Desktop.
simple pii identifier
import country_list # pip install country_list
import phonenumbers # pip install phonenumbers
import pycountry # pip install pycountry
import langdetect # pip install langdetect
import whois # pip install python-whois
import re
import datetime
langdetect.DetectorFactory.seed = 0
class PhoneNumber:
    """Parse a phone number string and collect details about it in ``info``.

    ``info`` is a plain dict that is filled in step by step:

    * ``phone_number`` -- the input with surrounding whitespace removed.
    * ``is_valid`` -- result of ``phonenumbers.is_valid_number``; ``False``
      when parsing raised.
    * ``error`` -- message of the exception raised while parsing (only
      present on failure).
    * ``country_number`` / ``national_number`` -- ``country_code`` and
      ``national_number`` of the parsed number (valid numbers only).
    * ``country_code`` -- value of ``phonenumbers.region_code_for_number``
      (valid numbers only).
    * ``country_name`` -- pycountry name for ``country_code``, or
      ``'Unknown'`` when pycountry has no entry for it.
    * ``inference_time_in_seconds`` -- wall-clock duration of the analysis.

    The number is parsed without a default region.
    """

    def __init__(self, phone_number):
        self.phone_number = phone_number.strip()
        self.info = {'phone_number': self.phone_number}
        self.__parse()

    def __parse_phone_number(self):
        """Parse the number and record whether it is valid."""
        try:
            self.__parsed_phone_number = phonenumbers.parse(self.phone_number)
            self.info['is_valid'] = phonenumbers.is_valid_number(self.__parsed_phone_number)
        except Exception as e:
            # Deliberately broad: this class is a best-effort classifier and
            # must not raise on arbitrary text.
            self.info['error'] = str(e).strip()
            # Always publish is_valid so callers see the same keys as the
            # other detectors in this file expose on failure.
            self.info['is_valid'] = False

    def __split_number(self):
        """Record the country calling code and national number."""
        if self.info.get('is_valid'):
            self.info['country_number'] = self.__parsed_phone_number.country_code
            self.info['national_number'] = self.__parsed_phone_number.national_number

    def __get_region_code(self):
        """Record the region code reported by phonenumbers."""
        if self.info.get('is_valid'):
            self.info['country_code'] = phonenumbers.region_code_for_number(self.__parsed_phone_number)

    def __get_country_name(self):
        """Translate the region code into a country name via pycountry."""
        region_code = self.info.get('country_code')
        if not region_code:
            return
        try:
            self.info['country_name'] = pycountry.countries.get(alpha_2=region_code).name
        except (AttributeError, LookupError):
            # The lookup produced no usable country object (None result or a
            # lookup error); fall back instead of crashing the constructor.
            self.info['country_name'] = 'Unknown'

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run every analysis step in order and record how long it took."""
        start_time = datetime.datetime.now()
        self.__parse_phone_number()
        self.__split_number()
        self.__get_region_code()
        self.__get_country_name()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class Language:
    """Detect the language of a text and store the findings in ``info``.

    ``info`` always holds ``text`` (the stripped input), ``is_valid`` and
    ``inference_time_in_seconds``. When detection succeeds it also holds
    ``language_code`` and ``language_name``; when detection raises it holds
    ``error`` with the exception message instead.
    """

    def __init__(self, text):
        self.text = text.strip()
        self.info = {'text': self.text}
        self.__parse()

    def __parse_language(self):
        """Ask langdetect for the language code; any failure marks the text invalid."""
        try:
            detected_code = langdetect.detect(self.text)
        except Exception as exc:
            self.info['error'] = str(exc).strip()
            self.info['is_valid'] = False
        else:
            self.info['language_code'] = detected_code
            self.info['is_valid'] = True

    def __get_language_name(self):
        """Resolve the detected code to a language name through pycountry."""
        detected_code = self.info.get('language_code')
        if not detected_code:
            return
        try:
            # Only the first two characters of the detected code are looked up.
            language_name = pycountry.languages.get(alpha_2=detected_code[:2]).name
        except AttributeError:
            language_name = 'Unknown'
        self.info['language_name'] = language_name

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run detection followed by name lookup, timing the whole thing."""
        began = datetime.datetime.now()
        self.__parse_language()
        self.__get_language_name()
        elapsed = datetime.datetime.now() - began
        self.info['inference_time_in_seconds'] = elapsed.total_seconds()
class Email:
    """Check whether a string looks like an email address and look up its domain.

    ``info`` always holds ``email`` (the stripped input), ``is_valid`` and
    ``inference_time_in_seconds``. For a syntactically valid address it also
    holds ``parsed_domain``, every key returned by the WHOIS lookup for that
    domain, and ``country_name`` (``'Unknown'`` when the WHOIS ``country``
    value cannot be resolved through pycountry). If the WHOIS lookup itself
    fails, ``error`` holds the failure message and the address is still
    reported as valid, because validity here is purely syntactic.
    """

    def __init__(self, email):
        self.email = email.strip()
        self.info = {'email': self.email}
        self.__parse()

    def __parse_email(self):
        """Validate the address shape and extract the domain part."""
        # fullmatch (not match) so trailing junk such as a second '@' is
        # rejected, and whitespace is excluded so that a sentence that merely
        # contains '@' and '.' is not mistaken for an address.
        if re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", self.email):
            self.info['parsed_domain'] = self.email.split('@')[1]
            self.info['is_valid'] = True
        else:
            self.info['is_valid'] = False

    def __get_domain_info(self):
        """Merge WHOIS data for the domain into ``info`` and name its country."""
        domain = self.info.get('parsed_domain')
        if not domain:
            return
        try:
            self.info.update(whois.whois(domain))
        except (whois.parser.PywhoisError, OSError) as e:
            # OSError covers socket-level failures (no network, timeout, DNS)
            # so an unreachable WHOIS server cannot crash the constructor.
            self.info['error'] = str(e).strip()
            return
        try:
            self.info['country_name'] = pycountry.countries.get(alpha_2=self.info['country']).name
        except (AttributeError, LookupError):
            # Missing 'country' key (KeyError is a LookupError), a lookup
            # error from pycountry, or a None result all mean "not known".
            self.info['country_name'] = 'Unknown'

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run validation and the domain lookup, recording the elapsed time."""
        start_time = datetime.datetime.now()
        self.__parse_email()
        self.__get_domain_info()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class Address:
    """Look for a country name or country code among the words of an address.

    ``info`` always holds ``address`` (the stripped input), ``is_valid`` and
    ``inference_time_in_seconds``. When a word of the address equals a
    country name or country code from ``country_list`` (in any of its
    available languages, compared case-insensitively) it also holds
    ``country_code``, ``country_detected`` (the name as listed for the
    language that matched) and ``country_name`` (the pycountry name for the
    code, or ``'Unknown'`` when pycountry has no entry for it).

    NOTE(review): matching is done on whitespace-separated words, so a
    country name that itself contains whitespace cannot match, and any word
    equal to a two-letter country code counts as a hit -- confirm this is
    the intended behaviour.
    """

    def __init__(self, address):
        self.address = address.strip()
        self.info = {'address': self.address}
        self.__parse()

    def __parse_address(self):
        """Scan the country lists for the first entry that matches a word."""
        # A set gives constant-time membership tests, replacing the original
        # per-country loop over every word.
        words = set(self.address.lower().split())
        if not words:
            # Nothing to match; skip walking the country data entirely.
            self.info['is_valid'] = False
            return
        for language in country_list.available_languages():
            for country_code, country_name in country_list.countries_for_language(language):
                if country_name.lower() in words or country_code.lower() in words:
                    self.info['country_code'] = country_code
                    try:
                        self.info['country_name'] = pycountry.countries.get(alpha_2=country_code).name
                    except (AttributeError, LookupError):
                        # country_list produced a code pycountry cannot
                        # resolve; report it rather than crash.
                        self.info['country_name'] = 'Unknown'
                    self.info['country_detected'] = country_name
                    self.info['is_valid'] = True
                    return
        self.info['is_valid'] = False

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run the country scan and record the elapsed time."""
        start_time = datetime.datetime.now()
        self.__parse_address()
        end_time = datetime.datetime.now()
        self.info['inference_time_in_seconds'] = (end_time - start_time).total_seconds()
class PII:
    """Classify a text by trying each detector in a fixed order.

    The text is offered to ``Email``, ``PhoneNumber``, ``Address`` and
    ``Language`` in that order. The first detector whose ``info`` reports
    ``is_valid`` wins: its ``info`` dict becomes this object's ``info``,
    extended with ``type`` (``'email'``, ``'phone_number'``, ``'address'``
    or ``'language'``) and ``is_pii`` set to ``True``. If none accepts the
    text, ``type`` is ``'unknown'`` and ``is_pii`` is ``False``.
    ``total_inference_time_in_seconds`` and ``input`` are always added.

    NOTE(review): a text that only passes language detection is still
    reported with ``is_pii`` set to ``True`` -- confirm that is intended.
    """

    def __init__(self, text):
        self.text = text
        self.info = {}
        self.__parse()
        self.info['input'] = self.text

    def __adopt_if_valid(self, detector, label):
        """Run ``detector`` on the text; take over its result if it is valid."""
        findings = detector(self.text).info
        if not findings.get('is_valid'):
            return False
        self.info = findings
        self.info['type'] = label
        self.info['is_pii'] = True
        return True

    def __is_email(self):
        return self.__adopt_if_valid(Email, 'email')

    def __is_phone_number(self):
        return self.__adopt_if_valid(PhoneNumber, 'phone_number')

    def __is_address(self):
        return self.__adopt_if_valid(Address, 'address')

    def __detect_language(self):
        return self.__adopt_if_valid(Language, 'language')

    def __parse_pii(self):
        """Try the detectors in priority order, stopping at the first hit."""
        ordered_checks = (
            self.__is_email,
            self.__is_phone_number,
            self.__is_address,
            self.__detect_language,
        )
        # any() stops at the first check that returns True, so later
        # detectors are not run once one has accepted the text.
        if any(check() for check in ordered_checks):
            return
        self.info['type'] = 'unknown'
        self.info['is_pii'] = False

    def __str__(self):
        return str(self.info)

    def __repr__(self):
        return self.__str__()

    def __parse(self):
        """Run the classification and record the total elapsed time."""
        began = datetime.datetime.now()
        self.__parse_pii()
        elapsed = datetime.datetime.now() - began
        self.info['total_inference_time_in_seconds'] = elapsed.total_seconds()
if __name__ == '__main__':
    # Demo run: classify a handful of sample inputs and print what was found.
    for sample in ('中国', 'test@gmail.com', '+91 9999999999', 'Hello, How are you?', '33'):
        p = PII(sample)
        print(p.info)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment