Skip to content

Instantly share code, notes, and snippets.

@sebastianknopf
Created May 11, 2022 21:03
Show Gist options
  • Save sebastianknopf/e9e0381efab5fc872d726c28eca1b19f to your computer and use it in GitHub Desktop.
Save sebastianknopf/e9e0381efab5fc872d726c28eca1b19f to your computer and use it in GitHub Desktop.
Unifies objects into a comparable (hashable) string regardless of how they are spelled. Use case: checking persons and addresses against a blacklist without permanently storing personal data in a database.
import hashlib
import re
import unittest
from abc import abstractmethod
class ObjectUnifier:
    """Reduce an object to a normalized, comparable string and its hash.

    Subclasses implement ``_generate_token_list`` and pass each relevant
    attribute to ``_add_token``. The normalized tokens are joined with ``#``
    to form the unified string; its SHA-256 hex digest is available through
    ``get_unified_object_hash``.
    """

    # Immutable class-level fallback so that an instance without any tokens
    # still yields an empty unified string. Tokens themselves are always kept
    # in a per-instance list; a mutable list at class level would be shared
    # by every instance that does not rebind it.
    _token_list = ()

    # Pattern subclasses can use to split a value into parts: a run of
    # whitespace, "+" or "." characters.
    _token_delimiter = r"[\s+\.]+"

    def __init__(self):
        """Give the instance its own, initially empty token list."""
        self._token_list = []

    def _add_token(self, token):
        """Normalize ``token`` and append it to this instance's token list.

        The value is converted to ``str`` first, so non-string values such as
        an integer are accepted. It is then lower-cased and every run of
        characters other than ``a-z``, ``0-9`` and ``-`` is replaced by a
        single underscore.
        """
        normalized = re.sub("[^a-z0-9-]+", "_", str(token).lower())
        # setdefault on the instance dict creates a private list for
        # subclasses whose __init__ neither calls super().__init__() nor
        # assigns _token_list, so the class-level default is never mutated.
        self.__dict__.setdefault("_token_list", []).append(normalized)

    @abstractmethod
    def _generate_token_list(self):
        """Fill the token list; meant to be overridden by subclasses.

        NOTE: ``abstractmethod`` is only enforced for classes whose metaclass
        derives from ``ABCMeta``. This class does not use that metaclass, so
        the decorator is declarative only and the class stays instantiable.
        """

    def get_unified_object_string(self):
        """Return all tokens added so far, joined with ``#``."""
        return "#".join(self._token_list)

    def get_unified_object_hash(self):
        """Return the SHA-256 hex digest of the UTF-8 encoded unified string."""
        unified = self.get_unified_object_string()
        return hashlib.sha256(unified.encode("utf-8")).hexdigest()

    def __str__(self):
        """Return the unified string."""
        return self.get_unified_object_string()
class PersonUnifier(ObjectUnifier):
    """Unify a person into ``gender#last_name#first_name#birthday``.

    Each value is normalized by ``_add_token``. Of the first name only the
    first part (up to the first run of whitespace, "+" or ".") is used, so
    additional given names do not influence the result.
    """

    _gender = None
    _last_name = None
    _first_name = None
    _birthday = None

    def __init__(self, gender, last_name, first_name, birthday):
        """Store the person's attributes and build the token list.

        Args:
            gender: Gender designation, e.g. ``"male"``.
            last_name: Family name; used in full.
            first_name: Given name(s); only the first one is used.
            birthday: Date of birth as text, e.g. ``"01.01.1990"``.
        """
        super().__init__()
        # Assigned explicitly so the instance owns its list regardless of
        # what the base class initializer does.
        self._token_list = []
        self._gender = gender
        self._last_name = last_name
        self._first_name = first_name
        self._birthday = birthday
        self._generate_token_list()

    def _generate_token_list(self):
        """Add the gender, last name, first given name and birthday tokens."""
        self._add_token(self._gender)
        self._add_token(self._last_name)

        # re.split yields a leading "" when the value starts with a delimiter
        # (e.g. " Max"); skipping empty fragments keeps the real first name
        # instead of producing an empty token.
        name_parts = [
            part
            for part in re.split(self._token_delimiter, self._first_name)
            if part
        ]
        self._add_token(name_parts[0] if name_parts else "")

        self._add_token(self._birthday)
class AddressUnifier(ObjectUnifier):
    """Unify a postal address into ``street#house_number#postal_prefix#city``.

    * Street: split on whitespace, dots and hyphens. Every part is padded
      with ``_`` to at least three characters; all parts except the last are
      cut to three characters; the last part loses ``aße`` / ``asse`` so
      that "Straße", "Strasse" and "Str." end up identical.
    * House number: normalized by ``_add_token`` only.
    * Postal code: only the first two characters are used.
    * City: parenthesised additions are removed, the rest is split on
      whitespace and dots and every part is padded to three characters.
    """

    _street = None
    _house_number = None
    _postal_code = None
    _city = None

    def __init__(self, street, house_number, postal_code, city):
        """Store the address attributes and build the token list.

        Args:
            street: Street name without the house number.
            house_number: House number, e.g. ``"5C"``.
            postal_code: Postal code as a string (it is sliced).
            city: City name, optionally with a parenthesised addition.
        """
        super().__init__()
        # Assigned explicitly so the instance owns its list regardless of
        # what the base class initializer does.
        self._token_list = []
        self._street = street
        self._house_number = house_number
        self._postal_code = postal_code
        self._city = city
        self._generate_token_list()

    def _generate_token_list(self):
        """Add the street, house number, postal prefix and city tokens."""
        self._add_token(self._street_value())
        self._add_token(self._house_number)
        self._add_token(self._postal_code[:2])
        self._add_token(self._city_value())

    def _street_value(self):
        """Return the abbreviated street parts joined with ``-``."""
        parts = [p for p in re.split(r"[\s.-]+", self._street) if p]
        last_index = len(parts) - 1

        unified = []
        for index, part in enumerate(parts):
            part = part.ljust(3, "_")
            if index < last_index:
                # Leading parts are reduced to a three-character abbreviation.
                part = part[:3]
            else:
                # Lower-case before stripping the suffix: str.replace is
                # case-sensitive, so "STRASSE" or "STRAßE" would otherwise
                # not be reduced the same way as "Straße".
                part = part.lower().replace("aße", "").replace("asse", "")
            unified.append(part)
        return "-".join(unified)

    def _city_value(self):
        """Return the city parts, without parenthesised text, joined with ``-``."""
        city = re.sub(r"(\s?\([^()]*\))+", "", self._city)
        parts = [p for p in re.split(r"[\s.]+", city) if p]
        return "-".join(part.ljust(3, "_") for part in parts)
class PersonUnifierTest(unittest.TestCase):
    """Checks the unified string produced by ``PersonUnifier``."""

    def test_person_unifier(self):
        # Each entry: (expected unified string, constructor arguments).
        cases = [
            (
                "male#mustermann#max#01_01_1990",
                ("male", "Mustermann", "Max", "01.01.1990"),
            ),
            (
                "male#mustermann-tester#max#01_01_1990",
                ("male", "Mustermann-Tester", "Max", "01.01.1990"),
            ),
            (
                "male#mustermann#m_x#01_01_1990",
                ("male", "mustermann", "mäx", "01.01.1990"),
            ),
        ]
        for expected, arguments in cases:
            self.assertEqual(expected, str(PersonUnifier(*arguments)))
class AddressUnifierTest(unittest.TestCase):
    """Checks the unified string produced by ``AddressUnifier``."""

    def test_address_unifier(self):
        # Each entry: (expected unified string, constructor arguments).
        # NOTE(review): two cases below appear twice with identical input;
        # they may originally have differed in whitespace - confirm.
        cases = [
            ("heimweg#5#77#musterstadt", ("Heimweg", "5", "77777", "Musterstadt")),
            ("teststr#5#77#test", ("Teststraße", "5", "77777", "Test")),
            ("teststr#5#77#test", ("Teststr.", "5", "77777", "Test")),
            ("tes-str#5#77#test", ("Test Straße", "5", "77777", "Test")),
            ("tes-str#5#77#test", ("Test Str.", "5", "77777", "Test")),
            ("tes-str#5#77#test", ("Test-Straße", "5", "77777", "Test")),
            ("tes-str#5#77#test", ("Test-Str", "5", "77777", "Test")),
            ("tes-str#5#77#test", ("Test-Str.", "5", "77777", "Test")),
            ("st_-tes-weg#5c#68#musterstadt", ("St.Tester Weg", "5C", "68390", "Musterstadt")),
            ("st_-tes-weg#5c#68#musterstadt", ("St. Tester Weg", "5C", "68390", "Musterstadt")),
            ("st_-tes-weg#5c#68#musterstadt", ("St. Tester-Weg", "5C", "68390", "Musterstadt")),
            ("am_-testweg#5c#68#musterstadt", ("Am Testweg", "5C", "68390", "Musterstadt")),
            ("am_-testweg#5c#68#musterstadt", ("Am Testweg", "5C", "68390", "Musterstadt")),
            ("unt-testweg#222#75#daheim", ("Unterer Testweg", "222", "75175", "Daheim")),
            ("unt-testweg#222#75#daheim", ("Unt. Testweg", "222", "75175", "Daheim")),
            ("unt-testweg#222#75#daheim", ("Unt. Testweg", "222", "75175", "Daheim")),
            ("unt-tes-weg#222#75#daheim", ("Unterer Test-Weg", "222", "75175", "Daheim")),
            ("unt-tes-weg#222#75#daheim", ("Unt. Test-Weg", "222", "75175", "Daheim")),
            ("unt-tes-weg#222#75#daheim", ("unt test weg", "222", "75175", "Daheim")),
            ("demohaus#10#88#daheim", ("Demohaus", "10", "88999", "Daheim (Test)")),
            ("demohaus#10#88#daheim", ("Demohaus", "10", "88999", "Daheim ( Test)")),
            ("demohaus#10#88#daheim", ("Demohaus", "10", "88999", "Daheim (test.test)")),
        ]
        for expected, arguments in cases:
            self.assertEqual(expected, str(AddressUnifier(*arguments)))
# Run the unit tests above when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment