Skip to content

Instantly share code, notes, and snippets.

View talfco's full-sized avatar

Felix Kuestahler talfco

View GitHub Profile
def __calculate_name_matching(self, row):
    """Add the three name-matching columns to a single member record.

    The full name is assembled as "<lastName> <firstName> <middleName>",
    passed through ``normalize_unicode_to_ascii`` and ``sort_words``, and
    the result is then encoded with ``double_metaphone``.

    Args:
        row: Record providing ``lastName``, ``firstName`` and ``middleName``.

    Returns:
        The same ``row`` with ``col_match1`` (the normalised, word-sorted
        name) and ``col_match2`` / ``col_match3`` (elements 0 and 1 of the
        ``double_metaphone`` result) filled in.
    """
    name_parts = (row['lastName'], row['firstName'], row['middleName'])
    normalized = sort_words(normalize_unicode_to_ascii(' '.join(name_parts)))
    codes = double_metaphone(normalized)

    # Assignment order is kept: it determines the order of the new columns.
    row['col_match1'] = normalized
    row['col_match2'] = codes[0]
    row['col_match3'] = codes[1]
    return row
def create_politican_from_govapi_table(self):
    """Build the politician table from the loaded government members.

    Calls ``load_government_members`` first, then converts
    ``self._members`` into a ``DataFrame`` and runs
    ``__calculate_name_matching`` on every row.

    Returns:
        The ``DataFrame`` produced by the row-wise ``apply``.
    """
    self.load_government_members()
    members_table = DataFrame.from_records(self._members)
    # axis=1 -> the callback receives one row (one member) at a time.
    return members_table.apply(self.__calculate_name_matching, axis=1)
# -*- coding: utf-8 -*-
import re
import unicodedata
# Fold a unicode string to lower-case text made of ASCII letters, digits and spaces.
# NOTE(review): this snippet is cut off here - the rest of the function
# (including its return statement) is not visible in this excerpt.
def normalize_unicode_to_ascii(data):
# NFKD-decompose, then encode to ASCII with 'ignore' so every code point
# that has no ASCII representation is dropped instead of raising
normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
# bytes -> str again (the bytes are pure ASCII after the step above)
val = normal.decode("utf-8")
val = val.lower()
# replace each run of characters other than letters, digits and spaces with one space
val = re.sub('[^A-Za-z0-9 ]+', ' ', val)
# Name Tuple 1 Tuple 2 DP Match Remark
A Liliane Maury Pasquier LLNMRPSK LLNMRPSKR
A1 LilianeMauryPasquier LLNMRPSK LLNMRPSKR Strong Capital/Small + Missing Spaces
A2 L. Maury Pasquier LLMRPSK LLMRPSKR Failed Abbreviation
B Marta Flückiger-Bäni MRTFLKJRRPN MRTFLKKRRPN
B1 Marta Fluckiger-Bani MRTFLKJRRPN MRTFLKKRRPN Strong ü/u
B2 Marta Flükger-Bäni MRTFLKKRRPN MRTFLKJRRPN Normal spelling mistakes
B3 Marta Flückiger Bäni MRTFLKJRPN MRTFLKKRPN Failed missing -
# -*- coding: utf-8 -*-
import pytest
from name_matching import double_metaphone,double_metaphone_compare, Threshold
class TestClass(object):
    """Checks for the double-metaphone name-matching helpers."""

    def test_double_metaphone(self):
        """A name written without spaces matches its spaced form at STRONG."""
        joined_codes = double_metaphone(u'LilianeMauryPasquier')
        spaced_codes = double_metaphone(u'liliane Maury Pasquier')
        is_match = double_metaphone_compare(
            joined_codes, spaced_codes, Threshold.STRONG
        )
        assert True == is_match
# -*- coding: utf-8 -*-
from metaphone import doublemetaphone
from enum import Enum
class Threshold(Enum):
    """Named match levels: WEAK (0), NORMAL (1) and STRONG (2)."""

    # Unpacking range(3) assigns 0, 1, 2 to the members in declaration order.
    WEAK, NORMAL, STRONG = range(3)
def double_metaphone(value):
@talfco
talfco / name-challenge.csv
Last active June 21, 2020 15:38
tut01-lesson5-name-challenge
Challenge Examples Strategy
Phonetic Similarity Klaus <-> Claus double_metaphone
Missing Spaces&Hyphens PeterLow, Anne Meier-King normalize_unicode_to_ascii
Missing Components Peter James Low <-> Peter Low <no solution - manual check>
Titles & Honorifics Dr. Peter Low, Ms. Anne Meier __calculate_name_matching
Out of Order Components Peter James Low <-> Low Peter James sort_words
Nicknames William <-> Will <-> Bill <no solution - manual check>
Multiple Languages José Müller <-> Jose Müller normalize_unicode_to_ascii
Initials James Earl Smith <-> J.E. Smith __calculate_name_matching
Typos Jamse Earl Smith double_metaphone
# Clean up the name stored in row['Name'] using the 'twitterNameCleaner' and
# 'twitterNamesExpander' lists from self.__cfg.
# NOTE(review): this snippet is cut off here - whatever the method does with
# the cleaned name afterwards (and what it returns) is not visible in this excerpt.
def __calculate_name_matching(self, row):
name = row['Name']
# remove every configured fragment from the name
for clean in self.__cfg['twitterNameCleaner']:
name = name.replace(clean ,'')
# replace each configured abbreviation with its full name
for expand in self.__cfg['twitterNamesExpander']:
name = name.replace(expand.get('abbreviation'), expand.get('name'))
twitterNameCleaner: ["Dr.","Zürich"]
twitterNamesExpander:
- abbreviation: "Chr."
name: "Christophe"
Twitter Account Name Remark Resolution
Dr. Bastien Girod Title prefix Remove Title
Jean Chr. Schwaab Middle Name Abbreviation Expand Middle Name
Kathy Riklin Zürich City postfix Remove City