Created
July 22, 2019 20:12
-
-
Save risicle/91fa7de74466058b15a59f69ad145296 to your computer and use it in GitHub Desktop.
Nasty all-in-one bundling of Metaphone 0.6 module, suitable for dumping into QGIS in a hurry without battling with imports
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals

import unicodedata
class Word(object):
    """Normalised, padded view of one input string.

    The constructor keeps the raw input in ``original`` and derives:

    * ``decoded``    - text form of the input (byte strings are decoded as
      UTF-8, undecodable bytes are dropped) with c-cedilla replaced by "s";
    * ``normalized`` - ``decoded`` with combining marks (accents) removed;
    * ``upper``      - upper-cased ``normalized``;
    * ``length``     - number of characters in ``upper``;
    * ``buffer``     - ``upper`` surrounded by ``prepad`` / ``postpad``
      filler so callers can look a few characters before the first letter
      and after the last one without range checks;
    * ``start_index`` / ``end_index`` - positions of the first and last
      real character inside ``buffer``.
    """

    def __init__(self, input):
        self.original = input
        if isinstance(input, bytes):
            self.decoded = input.decode('utf-8', 'ignore')
        else:
            self.decoded = input
        # Map upper/lower c-cedilla to "s" before the accent stripping
        # below would otherwise reduce it to a plain "c".
        self.decoded = self.decoded.replace('\xc7', "s")
        self.decoded = self.decoded.replace('\xe7', "s")
        # NFD splits accented letters into base letter + combining mark;
        # dropping category 'Mn' keeps only the base letters.
        self.normalized = ''.join(
            c for c in unicodedata.normalize('NFD', self.decoded)
            if unicodedata.category(c) != 'Mn')
        self.upper = self.normalized.upper()
        self.length = len(self.upper)
        self.prepad = "--"
        self.start_index = len(self.prepad)
        self.end_index = self.start_index + self.length - 1
        self.postpad = "------"
        # so we can index beyond the beginning and end of the input string
        self.buffer = self.prepad + self.upper + self.postpad

    @property
    def is_slavo_germanic(self):
        """True when the upper-cased word contains 'W', 'K', 'CZ' or 'WITZ'."""
        return any(
            marker in self.upper for marker in ('W', 'K', 'CZ', 'WITZ'))

    def get_letters(self, start=0, end=None):
        """Return the letters ``start`` (inclusive) to ``end`` (exclusive).

        Offsets are relative to the first real character of the word, not
        to the padded buffer.  A falsy ``end`` (``None`` or ``0``) selects
        the single letter at ``start``.
        """
        if not end:
            end = start + 1
        start = self.start_index + start
        end = self.start_index + end
        return self.buffer[start:end]
# Letters the encoder treats as vowels (note that 'Y' is included).
VOWELS = ['A', 'E', 'I', 'O', 'U', 'Y']
# Two-letter word beginnings whose first letter is skipped.
SILENT_STARTERS = ["GN", "KN", "PN", "WR", "PS"]
class DoubleMetaphone(object):
    """Double Metaphone encoder.

    ``parse`` walks the padded buffer of a ``Word`` one position at a time.
    For each letter a ``process_<letter>`` method inspects the neighbouring
    characters and stores its decision in ``self.next``; ``parse`` then
    appends the chosen code(s) to ``primary_phone`` / ``secondary_phone``
    and advances ``position``.
    """

    # Consonants that have a dedicated ``process_<letter>`` method.
    _CONSONANTS = 'BCDFGHJKLMNPQRSTVWXZ'

    def __init__(self):
        self.position = 0
        self.primary_phone = ""
        self.secondary_phone = ""
        # ``next`` is the action for the current letter, either
        #   (code, advance)               - same code for both phones, or
        #   (primary, secondary, advance) - different codes per phone.
        # A falsy code (None or "") adds nothing.  The default action is to
        # add nothing and move to the next character.
        self.next = (None, 1)

    def check_word_start(self):
        """Handle silent first letters and an initial 'X'."""
        # skip these silent letters when at start of word
        if self.word.get_letters(0, 2) in SILENT_STARTERS:
            self.position += 1
        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
        if self.word.get_letters(0) == 'X':
            # 'Z' maps to 'S'
            self.primary_phone = self.secondary_phone = 'S'
            self.position += 1

    def process_initial_vowels(self):
        """Encode a vowel: 'A' at the start of the word, nothing elsewhere."""
        self.next = (None, 1)
        # all init vowels now map to 'A'
        if self.position == self.word.start_index:
            self.next = ('A', 1)

    def process_b(self):
        """Encode 'B' as 'P', consuming a doubled 'BB'."""
        # "-mb", e.g., "dumb", already skipped over... see process_m
        if self.word.buffer[self.position + 1] == 'B':
            self.next = ('P', 2)
        else:
            self.next = ('P', 1)

    def process_c(self):
        """Encode 'C' and the digraphs/trigraphs that start with it."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # various germanic
        if (position > start_index + 1
                and buffer[position - 2] not in VOWELS
                and buffer[position - 1:position + 2] == 'ACH'
                and buffer[position + 2] != 'I'
                and (buffer[position + 2] != 'E'
                     or buffer[position - 2:position + 4] in (
                         'BACHER', 'MACHER'))):
            self.next = ('K', 2)
        # special case 'CAESAR'
        elif (position == start_index
              and buffer[start_index:start_index + 6] == 'CAESAR'):
            self.next = ('S', 2)
        # italian 'chianti'
        elif buffer[position:position + 4] == 'CHIA':
            self.next = ('K', 2)
        elif buffer[position:position + 2] == 'CH':
            # find 'michael'
            if (position > start_index
                    and buffer[position:position + 4] == 'CHAE'):
                self.next = ('K', 'X', 2)
            elif (position == start_index
                  and (buffer[position + 1:position + 6] in (
                      'HARAC', 'HARIS')
                       or buffer[position + 1:position + 4] in (
                           "HOR", "HYM", "HIA", "HEM"))
                  and buffer[start_index:start_index + 5] != 'CHORE'):
                self.next = ('K', 2)
            # germanic, greek, or otherwise 'ch' for 'kh' sound
            elif (buffer[start_index:start_index + 4] in ('VAN ', 'VON ')
                  or buffer[start_index:start_index + 3] == 'SCH'
                  or buffer[position - 2:position + 4] in (
                      "ORCHES", "ARCHIT", "ORCHID")
                  or buffer[position + 2] in ('T', 'S')
                  or ((buffer[position - 1] in ("A", "O", "U", "E")
                       or position == start_index)
                      and buffer[position + 2] in (
                          "L", "R", "N", "M", "B", "H", "F", "V", "W"))):
                self.next = ('K', 2)
            elif position > start_index:
                if buffer[start_index:start_index + 2] == 'MC':
                    self.next = ('K', 2)
                else:
                    self.next = ('X', 'K', 2)
            else:
                self.next = ('X', 2)
        # e.g, 'czerny'
        elif (buffer[position:position + 2] == 'CZ'
              and buffer[position - 2:position + 2] != 'WICZ'):
            self.next = ('S', 'X', 2)
        # e.g., 'focaccia'
        elif buffer[position + 1:position + 4] == 'CIA':
            self.next = ('X', 3)
        # double 'C', but not if e.g. 'McClellan'
        elif (buffer[position:position + 2] == 'CC'
              and not (position == (start_index + 1)
                       and buffer[start_index] == 'M')):
            # 'bellocchio' but not 'bacchus'
            if (buffer[position + 2] in ("I", "E", "H")
                    and buffer[position + 2:position + 4] != 'HU'):
                # 'accident', 'accede' 'succeed'
                if ((position == (start_index + 1)
                     and buffer[start_index] == 'A')
                        or buffer[position - 1:position + 4] in (
                            'UCCEE', 'UCCES')):
                    self.next = ('KS', 3)
                # 'bacci', 'bertucci', other italian
                else:
                    self.next = ('X', 3)
            else:
                self.next = ('K', 2)
        elif buffer[position:position + 2] in ("CK", "CG", "CQ"):
            self.next = ('K', 2)
        elif buffer[position:position + 2] in ("CI", "CE", "CY"):
            # italian vs. english
            if buffer[position:position + 3] in ("CIO", "CIE", "CIA"):
                self.next = ('S', 'X', 2)
            else:
                self.next = ('S', 2)
        # name sent in 'mac caffrey', 'mac gregor'
        elif buffer[position + 1:position + 3] in (" C", " Q", " G"):
            self.next = ('K', 3)
        elif (buffer[position + 1] in ("C", "K", "Q")
              and buffer[position + 1:position + 3] not in ("CE", "CI")):
            self.next = ('K', 2)
        # default for 'C'
        else:
            self.next = ('K', 1)

    def process_d(self):
        """Encode 'D', including 'DG', 'DT' and 'DD'."""
        buffer = self.word.buffer
        position = self.position
        if buffer[position:position + 2] == 'DG':
            # e.g. 'edge'
            if buffer[position + 2] in ('I', 'E', 'Y'):
                self.next = ('J', 3)
            else:
                self.next = ('TK', 2)
        elif buffer[position:position + 2] in ('DT', 'DD'):
            self.next = ('T', 2)
        else:
            self.next = ('T', 1)

    def process_f(self):
        """Encode 'F', consuming a doubled 'FF'."""
        if self.word.buffer[self.position + 1] == 'F':
            self.next = ('F', 2)
        else:
            self.next = ('F', 1)

    def process_g(self):
        """Encode 'G' and the combinations 'GH', 'GN', 'GLI', 'GG', ..."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        if buffer[position + 1] == 'H':
            if (position > start_index
                    and buffer[position - 1] not in VOWELS):
                self.next = ('K', 2)
            # 'ghislane', ghiradelli
            elif position == start_index:
                if buffer[position + 2] == 'I':
                    self.next = ('J', 2)
                else:
                    self.next = ('K', 2)
            # Parker's rule (with some further refinements) - e.g., 'hugh'
            # A 'GH' near the start of the word that is not word-initial
            # must reach this rule too; previously it matched no branch and
            # re-applied the action of the preceding letter.
            elif ((position > (start_index + 1)
                   and buffer[position - 2] in ('B', 'H', 'D'))
                  or (position > (start_index + 2)
                      and buffer[position - 3] in ('B', 'H', 'D'))
                  or (position > (start_index + 3)
                      and buffer[position - 4] in ('B', 'H'))):
                self.next = (None, 2)
            # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough',
            # 'tough'
            elif (position > (start_index + 2)
                  and buffer[position - 1] == 'U'
                  and buffer[position - 3] in ("C", "G", "L", "R", "T")):
                self.next = ('F', 2)
            elif (position > start_index
                  and buffer[position - 1] != 'I'):
                self.next = ('K', 2)
            else:
                # silent 'GH' after 'I' (e.g. 'night'): skip both letters
                self.next = (None, 2)
        elif buffer[position + 1] == 'N':
            if (position == (start_index + 1)
                    and buffer[start_index] in VOWELS
                    and not self.word.is_slavo_germanic):
                self.next = ('KN', 'N', 2)
            # not e.g. 'cagney'
            elif (buffer[position + 2:position + 4] != 'EY'
                  and buffer[position + 1] != 'Y'
                  and not self.word.is_slavo_germanic):
                self.next = ('N', 'KN', 2)
            else:
                self.next = ('KN', 2)
        # 'tagliaro'
        elif (buffer[position + 1:position + 3] == 'LI'
              and not self.word.is_slavo_germanic):
            self.next = ('KL', 'L', 2)
        # -ges-,-gep-,-gel-, -gie- at beginning
        elif (position == start_index
              and (buffer[position + 1] == 'Y'
                   or buffer[position + 1:position + 3] in (
                       "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN",
                       "IE", "EI", "ER"))):
            self.next = ('K', 'J', 2)
        # -ger-, -gy-
        elif ((buffer[position + 1:position + 3] == 'ER'
               or buffer[position + 1] == 'Y')
              and buffer[start_index:start_index + 6] not in (
                  "DANGER", "RANGER", "MANGER")
              and buffer[position - 1] not in ('E', 'I')
              and buffer[position - 1:position + 2] not in ('RGY', 'OGY')):
            self.next = ('K', 'J', 2)
        # italian e.g, 'biaggi'
        elif (buffer[position + 1] in ('E', 'I', 'Y')
              or buffer[position - 1:position + 3] in ("AGGI", "OGGI")):
            # obvious germanic
            if (buffer[start_index:start_index + 4] in ('VON ', 'VAN ')
                    or buffer[start_index:start_index + 3] == 'SCH'
                    or buffer[position + 1:position + 3] == 'ET'):
                self.next = ('K', 2)
            # always soft if french ending
            elif buffer[position + 1:position + 5] == 'IER ':
                self.next = ('J', 2)
            else:
                self.next = ('J', 'K', 2)
        elif buffer[position + 1] == 'G':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_h(self):
        """Encode 'H' only when it is audible."""
        buffer = self.word.buffer
        position = self.position
        # only keep if at start & before vowel or btw. 2 vowels
        if ((position == self.word.start_index
             or buffer[position - 1] in VOWELS)
                and buffer[position + 1] in VOWELS):
            self.next = ('H', 2)
        # (also takes care of 'HH')
        else:
            self.next = (None, 1)

    def process_j(self):
        """Encode 'J', with Spanish and Slavic special cases."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # obvious spanish, 'jose', 'san jacinto'
        if (buffer[position:position + 4] == 'JOSE'
                or buffer[start_index:start_index + 4] == 'SAN '):
            if ((position == start_index and buffer[position + 4] == ' ')
                    or buffer[start_index:start_index + 4] == 'SAN '):
                codes = ('H',)
            else:
                codes = ('J', 'H')
        # Yankelovich/Jankelowicz
        elif (position == start_index
              and buffer[position:position + 4] != 'JOSE'):
            codes = ('J', 'A')
        # spanish pron. of e.g. 'bajador'
        elif (buffer[position - 1] in VOWELS
              and not self.word.is_slavo_germanic
              and buffer[position + 1] in ('A', 'O')):
            codes = ('J', 'H')
        elif position == self.word.end_index:
            # Word-final 'J': primary only.  The reference algorithm marks
            # this with a " " placeholder meaning "add nothing"; an empty
            # string is used here so that no literal space ends up in the
            # secondary code.
            codes = ('J', '')
        elif (buffer[position + 1] not in (
                "L", "T", "K", "S", "N", "M", "B", "Z")
              and buffer[position - 1] not in ("S", "K", "L")):
            codes = ('J',)
        else:
            codes = (None,)
        # a doubled 'JJ' is consumed as one
        advance = 2 if buffer[position + 1] == 'J' else 1
        self.next = codes + (advance,)

    def process_k(self):
        """Encode 'K', consuming a doubled 'KK'."""
        if self.word.buffer[self.position + 1] == 'K':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_l(self):
        """Encode 'L'; a Spanish-style 'LL' is dropped from the secondary."""
        buffer = self.word.buffer
        position = self.position
        end_index = self.word.end_index
        if buffer[position + 1] == 'L':
            # spanish e.g. 'cabrillo', 'gallegos'
            if ((position == (end_index - 2)
                 and buffer[position - 1:position + 3] in (
                     "ILLO", "ILLA", "ALLE"))
                    or ((buffer[end_index - 1:end_index + 1] in ("AS", "OS")
                         or buffer[end_index] in ("A", "O"))
                        and buffer[position - 1:position + 3] == 'ALLE')):
                self.next = ('L', '', 2)
            else:
                self.next = ('L', 2)
        else:
            self.next = ('L', 1)

    def process_m(self):
        """Encode 'M', swallowing the silent 'B' of '-umb' / '-umber'."""
        buffer = self.word.buffer
        position = self.position
        # 'dumb', 'thumb', 'plumber': the 'UMB' window starts one character
        # BEFORE the 'M' (the slice previously started after it and could
        # therefore never match).
        if ((buffer[position - 1:position + 2] == 'UMB'
             and (position + 1 == self.word.end_index
                  or buffer[position + 2:position + 4] == 'ER'))
                or buffer[position + 1] == 'M'):
            self.next = ('M', 2)
        else:
            self.next = ('M', 1)

    def process_n(self):
        """Encode 'N', consuming a doubled 'NN'."""
        if self.word.buffer[self.position + 1] == 'N':
            self.next = ('N', 2)
        else:
            self.next = ('N', 1)

    def process_p(self):
        """Encode 'P' ('PH' becomes 'F'; 'PP' and 'PB' become one 'P')."""
        following = self.word.buffer[self.position + 1]
        if following == 'H':
            self.next = ('F', 2)
        # also account for "campbell", "raspberry"
        elif following in ('P', 'B'):
            self.next = ('P', 2)
        else:
            self.next = ('P', 1)

    def process_q(self):
        """Encode 'Q' as 'K', consuming a doubled 'QQ'."""
        if self.word.buffer[self.position + 1] == 'Q':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_r(self):
        """Encode 'R'; a French word-final '-ier' is secondary only."""
        buffer = self.word.buffer
        position = self.position
        # french e.g. 'rogier', but exclude 'hochmeier'
        if (position == self.word.end_index
                and not self.word.is_slavo_germanic
                and buffer[position - 2:position] == 'IE'
                and buffer[position - 4:position - 2] not in ('ME', 'MA')):
            codes = ('', 'R')
        else:
            codes = ('R',)
        advance = 2 if buffer[position + 1] == 'R' else 1
        self.next = codes + (advance,)

    def process_s(self):
        """Encode 'S' and the combinations 'SH', 'SC', 'SCH', 'SZ', ..."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        end_index = self.word.end_index
        # special cases 'island', 'isle', 'carlisle', 'carlysle'
        if buffer[position - 1:position + 2] in ('ISL', 'YSL'):
            self.next = (None, 1)
        # special case 'sugar-'
        elif (position == start_index
              and buffer[start_index:start_index + 5] == 'SUGAR'):
            self.next = ('X', 'S', 1)
        elif buffer[position:position + 2] == 'SH':
            # germanic
            if buffer[position + 1:position + 5] in (
                    "HEIM", "HOEK", "HOLM", "HOLZ"):
                self.next = ('S', 2)
            else:
                self.next = ('X', 2)
        # italian & armenian
        elif (buffer[position:position + 3] in ("SIO", "SIA")
              or buffer[position:position + 4] == 'SIAN'):
            if not self.word.is_slavo_germanic:
                self.next = ('S', 'X', 3)
            else:
                self.next = ('S', 3)
        # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider'
        # match 'schneider' also, -sz- in slavic language altho in
        # hungarian it is pronounced 's'
        elif ((position == start_index
               and buffer[position + 1] in ("M", "N", "L", "W"))
              or buffer[position + 1] == 'Z'):
            advance = 2 if buffer[position + 1] == 'Z' else 1
            self.next = ('S', 'X', advance)
        elif buffer[position:position + 2] == 'SC':
            # Schlesinger's rule
            if buffer[position + 2] == 'H':
                # dutch origin, e.g. 'school', 'schooner'
                if buffer[position + 3:position + 5] in (
                        "OO", "ER", "EN", "UY", "ED", "EM"):
                    # 'schermerhorn', 'schenker'
                    if buffer[position + 3:position + 5] in ('ER', 'EN'):
                        self.next = ('X', 'SK', 3)
                    else:
                        self.next = ('SK', 3)
                elif (position == start_index
                      and buffer[start_index + 3] not in VOWELS
                      and buffer[start_index + 3] != 'W'):
                    self.next = ('X', 'S', 3)
                else:
                    self.next = ('X', 3)
            elif buffer[position + 2] in ('I', 'E', 'Y'):
                self.next = ('S', 3)
            else:
                self.next = ('SK', 3)
        # french e.g. 'resnais', 'artois'
        elif (position == end_index
              and buffer[position - 2:position] in ('AI', 'OI')):
            self.next = ('', 'S', 1)
        else:
            advance = 2 if buffer[position + 1] in ('S', 'Z') else 1
            self.next = ('S', advance)

    def process_t(self):
        """Encode 'T', including 'TION', 'TIA', 'TCH', 'TH' and 'TT'."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        if buffer[position:position + 4] == 'TION':
            self.next = ('X', 3)
        elif buffer[position:position + 3] in ('TIA', 'TCH'):
            self.next = ('X', 3)
        elif (buffer[position:position + 2] == 'TH'
              or buffer[position:position + 3] == 'TTH'):
            # special case 'thomas', 'thames' or germanic
            if (buffer[position + 2:position + 4] in ('OM', 'AM')
                    or buffer[start_index:start_index + 4] in (
                        'VON ', 'VAN ')
                    or buffer[start_index:start_index + 3] == 'SCH'):
                self.next = ('T', 2)
            else:
                # '0' is the metaphone symbol for the 'th' sound
                self.next = ('0', 'T', 2)
        elif buffer[position + 1] in ('T', 'D'):
            self.next = ('T', 2)
        else:
            self.next = ('T', 1)

    def process_v(self):
        """Encode 'V' as 'F', consuming a doubled 'VV'."""
        if self.word.buffer[self.position + 1] == 'V':
            self.next = ('F', 2)
        else:
            self.next = ('F', 1)

    def process_w(self):
        """Encode 'W', which is silent unless one of the rules applies."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # can also be in middle of word
        if buffer[position:position + 2] == 'WR':
            self.next = ('R', 2)
        elif (position == start_index
              and (buffer[position + 1] in VOWELS
                   or buffer[position:position + 2] == 'WH')):
            # Wasserman should match Vasserman
            if buffer[position + 1] in VOWELS:
                self.next = ('A', 'F', 1)
            else:
                self.next = ('A', 1)
        # Arnow should match Arnoff
        elif ((position == self.word.end_index
               and buffer[position - 1] in VOWELS)
              or buffer[position - 1:position + 4] in (
                  "EWSKI", "EWSKY", "OWSKI", "OWSKY")
              or buffer[start_index:start_index + 3] == 'SCH'):
            self.next = ('', 'F', 1)
        # polish e.g. 'filipowicz'
        elif buffer[position:position + 4] in ("WICZ", "WITZ"):
            self.next = ('TS', 'FX', 4)
        else:  # default is to skip it
            self.next = (None, 1)

    def process_x(self):
        """Encode 'X' as 'KS', except for a silent French word-final 'X'."""
        buffer = self.word.buffer
        position = self.position
        # french e.g. breaux
        silent = (
            position == self.word.end_index
            and (buffer[position - 3:position] in ("IAU", "EAU")
                 or buffer[position - 2:position] in ('AU', 'OU')))
        code = None if silent else 'KS'
        advance = 2 if buffer[position + 1] in ('C', 'X') else 1
        self.next = (code, advance)

    def process_z(self):
        """Encode 'Z', including 'ZH' and the 'TS' secondary reading."""
        buffer = self.word.buffer
        position = self.position
        following = buffer[position + 1]
        # chinese pinyin e.g. 'zhao'
        if following == 'H':
            codes = ('J',)
        elif (buffer[position + 1:position + 3] in ("ZO", "ZI", "ZA")
              or (self.word.is_slavo_germanic
                  and position > self.word.start_index
                  and buffer[position - 1] != 'T')):
            codes = ('S', 'TS')
        else:
            codes = ('S',)
        advance = 2 if following in ('Z', 'H') else 1
        self.next = codes + (advance,)

    def parse(self, input):
        """Encode ``input`` and return ``(primary, secondary)``.

        ``input`` is handed to ``Word`` unchanged.  ``secondary`` is the
        empty string when it would equal ``primary``.  All encoder state is
        reset first, so one instance can be used for several words.
        """
        self.word = Word(input)
        self.position = self.word.start_index
        # start from a clean slate; codes must not accumulate across calls
        self.primary_phone = ""
        self.secondary_phone = ""
        self.check_word_start()
        # loop through chars in word.buffer
        while self.position <= self.word.end_index:
            character = self.word.buffer[self.position]
            # Default action: emit nothing, advance one character.  This
            # covers spaces and any character without a handler (digits,
            # punctuation, ...), which must not re-apply the action left
            # over from the previous letter.
            self.next = (None, 1)
            if character in VOWELS:
                self.process_initial_vowels()
            elif character in self._CONSONANTS:
                getattr(self, 'process_' + character.lower())()
            if len(self.next) == 2:
                if self.next[0]:
                    self.primary_phone += self.next[0]
                    self.secondary_phone += self.next[0]
                self.position += self.next[1]
            elif len(self.next) == 3:
                if self.next[0]:
                    self.primary_phone += self.next[0]
                if self.next[1]:
                    self.secondary_phone += self.next[1]
                self.position += self.next[2]
        if self.primary_phone == self.secondary_phone:
            self.secondary_phone = ""
        return (self.primary_phone, self.secondary_phone)
# backwards compatibility for the pre-OO implementation
def _doublemetaphone(input):
    """
    Given an input string, return a 2-tuple of the double metaphone codes for
    the provided string. The second element of the tuple will be an empty
    string if it is identical to the first element.

    A fresh ``DoubleMetaphone`` instance is created for every call.
    """
    return DoubleMetaphone().parse(input)
from qgis.core import *
from qgis.gui import *


@qgsfunction(args='auto', group='Custom')
def doublemetaphone0(text, feature, parent):
    """Return the primary Double Metaphone code of ``text``.

    ``feature`` and ``parent`` are accepted as part of the expression
    function signature and are not used.
    """
    return _doublemetaphone(text)[0]
@qgsfunction(args='auto', group='Custom')
def doublemetaphone1(text, feature, parent):
    """Return the secondary Double Metaphone code of ``text``.

    The result is an empty string when the secondary code equals the
    primary one.  ``feature`` and ``parent`` are accepted as part of the
    expression function signature and are not used.
    """
    return _doublemetaphone(text)[1]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment