Skip to content

Instantly share code, notes, and snippets.

@risicle
Created July 22, 2019 20:12
Show Gist options
  • Save risicle/91fa7de74466058b15a59f69ad145296 to your computer and use it in GitHub Desktop.
Save risicle/91fa7de74466058b15a59f69ad145296 to your computer and use it in GitHub Desktop.
Nasty all-in-one bundling of Metaphone 0.6 module, suitable for dumping into QGIS in a hurry without battling with imports
from __future__ import unicode_literals
import unicodedata
class Word(object):
    """
    Normalised, padded view of a single input string.

    The constructor keeps every intermediate form as an attribute
    (``original``, ``decoded``, ``normalized``, ``upper``) and builds
    ``buffer``: the upper-cased text wrapped in '-' padding so that callers
    can look a few characters before the start or past the end without
    raising ``IndexError``.  ``start_index`` and ``end_index`` are the buffer
    positions of the first and last real characters.
    """

    def __init__(self, input):
        self.original = input

        # Byte strings are decoded as UTF-8, silently dropping invalid bytes.
        text = input
        if isinstance(input, bytes):
            text = input.decode('utf-8', 'ignore')

        # Swap both cases of c-cedilla for "s" before accents are stripped
        # below (otherwise it would collapse to a plain "c").
        for cedilla in ('\xc7', '\xe7'):
            text = text.replace(cedilla, "s")
        self.decoded = text

        # Decompose accented characters, then drop the combining marks.
        decomposed = unicodedata.normalize('NFD', text)
        self.normalized = ''.join(
            [char for char in decomposed
             if unicodedata.category(char) != 'Mn'])

        self.upper = self.normalized.upper()
        self.length = len(self.upper)

        # Padding so we can index beyond the beginning and end of the input
        # string.
        self.prepad = "--"
        self.postpad = "------"
        self.start_index = len(self.prepad)
        self.end_index = self.start_index + self.length - 1
        self.buffer = self.prepad + self.upper + self.postpad

    @property
    def is_slavo_germanic(self):
        """True when the upper-cased word contains 'W', 'K', 'CZ' or 'WITZ'."""
        return any(
            marker in self.upper for marker in ('W', 'K', 'CZ', 'WITZ'))

    def get_letters(self, start=0, end=None):
        """
        Return the characters between word-relative offsets ``start`` and
        ``end`` (exclusive).  A falsy ``end`` (``None`` or ``0``) selects the
        single character at ``start``.  Offsets are relative to the first
        real character, so the padding is reachable with out-of-range values.
        """
        stop = end if end else start + 1
        offset = self.start_index
        return self.buffer[offset + start:offset + stop]
# Letters treated as vowels by the encoder (note that 'Y' is included).
VOWELS = [
    "A",
    "E",
    "I",
    "O",
    "U",
    "Y",
]

# Two-letter word openings whose first letter is skipped as silent.
SILENT_STARTERS = [
    'GN',
    'KN',
    'PN',
    'WR',
    'PS',
]
class DoubleMetaphone(object):
    """
    Double Metaphone phonetic encoder.

    ``parse`` wraps its input in a ``Word`` and walks the padded, upper-cased
    buffer one position at a time, dispatching each character to a
    ``process_<letter>`` method.  Every handler reports its decision through
    ``self.next``, which is one of:

    * ``(code, advance)`` -- append ``code`` to both phone strings;
    * ``(primary, secondary, advance)`` -- append a different code to each.

    A falsy code (``None`` or ``''``) appends nothing.  ``advance`` is the
    number of buffer characters consumed.
    """

    # Consonants that have a dedicated ``process_<letter>`` handler.
    _HANDLED_CONSONANTS = 'BCDFGHJKLMNPQRSTVWXZ'

    def __init__(self):
        self.position = 0
        self.primary_phone = ""
        self.secondary_phone = ""
        # ``next`` is set to a tuple holding the next characters of the
        # primary and secondary codes plus how many characters to move
        # forward in the string.  The secondary code letter is given only
        # when it differs from the primary.  The default action is to add
        # nothing and move to the next char.
        self.next = (None, 1)

    def check_word_start(self):
        """Handle silent leading letters and an initial 'X'."""
        # skip these silent letters when at start of word
        if self.word.get_letters(0, 2) in SILENT_STARTERS:
            self.position += 1
        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
        if self.word.get_letters(0) == 'X':
            # 'Z' maps to 'S'
            self.primary_phone = self.secondary_phone = 'S'
            self.position += 1

    def process_initial_vowels(self):
        """Encode a vowel: 'A' at the start of the word, nothing elsewhere."""
        self.next = (None, 1)
        # all init vowels now map to 'A'
        if self.position == self.word.start_index:
            self.next = ('A', 1)

    def process_b(self):
        """Encode 'B' as 'P', collapsing a doubled 'BB'."""
        # "-mb", e.g., "dumb", already skipped over... see process_m
        if self.word.buffer[self.position + 1] == 'B':
            self.next = ('P', 2)
        else:
            self.next = ('P', 1)

    def process_c(self):
        """Encode 'C' and its digraphs ('CH', 'CZ', 'CC', 'CK', ...)."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # various germanic
        if (position > start_index + 1
                and buffer[position - 2] not in VOWELS
                and buffer[position - 1:self.position + 2] == 'ACH'
                and buffer[position + 2] not in ['I']
                and (buffer[position + 2] not in ['E']
                     or buffer[position - 2:position + 4] in [
                         'BACHER', 'MACHER'])):
            self.next = ('K', 2)
        # special case 'CAESAR'
        elif (position == start_index
              and buffer[start_index:start_index + 6] == 'CAESAR'):
            self.next = ('S', 2)
        # italian 'chianti'
        elif buffer[position:position + 4] == 'CHIA':
            self.next = ('K', 2)
        elif buffer[position:position + 2] == 'CH':
            # find 'michael'
            if (position > start_index
                    and buffer[position:position + 4] == 'CHAE'):
                self.next = ('K', 'X', 2)
            elif (position == start_index
                  and (buffer[position + 1:position + 6] in [
                      'HARAC', 'HARIS']
                       or buffer[position + 1:position + 4] in [
                           "HOR", "HYM", "HIA", "HEM"])
                  and buffer[start_index:start_index + 5] != 'CHORE'):
                self.next = ('K', 2)
            # germanic, greek, or otherwise 'ch' for 'kh' sound
            elif (
                    buffer[start_index:start_index + 4] in ['VAN ', 'VON ']
                    or buffer[start_index:start_index + 3] == 'SCH'
                    or buffer[position - 2:position + 4] in [
                        "ORCHES", "ARCHIT", "ORCHID"]
                    or buffer[position + 2] in ['T', 'S']
                    or (
                        (buffer[position - 1] in ["A", "O", "U", "E"]
                         or position == start_index)
                        and (buffer[position + 2] in [
                            "L", "R", "N", "M", "B", "H", "F", "V", "W"]))):
                self.next = ('K', 2)
            else:
                if position > start_index:
                    if buffer[start_index:start_index + 2] == 'MC':
                        self.next = ('K', 2)
                    else:
                        self.next = ('X', 'K', 2)
                else:
                    self.next = ('X', 2)
        # e.g, 'czerny'
        elif (buffer[position:position + 2] == 'CZ'
              and buffer[position - 2:position + 2] != 'WICZ'):
            self.next = ('S', 'X', 2)
        # e.g., 'focaccia'
        elif buffer[position + 1:position + 4] == 'CIA':
            self.next = ('X', 3)
        # double 'C', but not if e.g. 'McClellan'
        elif (
                buffer[position:position + 2] == 'CC'
                and not (position == (start_index + 1)
                         and buffer[start_index] == 'M')):
            # 'bellocchio' but not 'bacchus'
            if (buffer[position + 2] in ["I", "E", "H"]
                    and buffer[position + 2:position + 4] != 'HU'):
                # 'accident', 'accede' 'succeed'
                if (
                        (position == (start_index + 1)
                         and buffer[start_index] == 'A')
                        or buffer[position - 1:position + 4] in [
                            'UCCEE', 'UCCES']):
                    self.next = ('KS', 3)
                # 'bacci', 'bertucci', other italian
                else:
                    self.next = ('X', 3)
            else:
                self.next = ('K', 2)
        elif buffer[position:position + 2] in ["CK", "CG", "CQ"]:
            self.next = ('K', 2)
        elif buffer[position:position + 2] in ["CI", "CE", "CY"]:
            # italian vs. english
            if buffer[position:position + 3] in ["CIO", "CIE", "CIA"]:
                self.next = ('S', 'X', 2)
            else:
                self.next = ('S', 2)
        else:
            # name sent in 'mac caffrey', 'mac gregor'
            if buffer[position + 1:position + 3] in [" C", " Q", " G"]:
                self.next = ('K', 3)
            else:
                if (buffer[position + 1] in ["C", "K", "Q"]
                        and buffer[position + 1:position + 3] not in [
                            "CE", "CI"]):
                    self.next = ('K', 2)
                # default for 'C'
                else:
                    self.next = ('K', 1)

    def process_d(self):
        """Encode 'D', including the 'DG', 'DT' and 'DD' combinations."""
        if self.word.buffer[self.position:self.position + 2] == 'DG':
            # e.g. 'edge'
            if self.word.buffer[self.position + 2] in ['I', 'E', 'Y']:
                self.next = ('J', 3)
            else:
                self.next = ('TK', 2)
        elif self.word.buffer[self.position:self.position + 2] in [
                'DT', 'DD']:
            self.next = ('T', 2)
        else:
            self.next = ('T', 1)

    def process_f(self):
        """Encode 'F', collapsing a doubled 'FF'."""
        if self.word.buffer[self.position + 1] == 'F':
            self.next = ('F', 2)
        else:
            self.next = ('F', 1)

    def process_g(self):
        """
        Encode 'G' and its combinations ('GH', 'GN', 'GLI', 'GG', ...).

        Some 'GH' paths deliberately leave ``self.next`` untouched; they rely
        on ``parse`` having reset it to the default "add nothing, advance by
        one" before this handler runs.
        """
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        if buffer[position + 1] == 'H':
            if (position > start_index
                    and buffer[position - 1] not in VOWELS):
                self.next = ('K', 2)
            elif position < (start_index + 3):
                # 'ghislane', ghiradelli
                if position == start_index:
                    if buffer[position + 2] == 'I':
                        self.next = ('J', 2)
                    else:
                        self.next = ('K', 2)
            # Parker's rule (with some further refinements) - e.g., 'hugh'
            elif (
                    (position > (start_index + 1)
                     and buffer[position - 2] in ['B', 'H', 'D'])
                    or (position > (start_index + 2)
                        and buffer[position - 3] in ['B', 'H', 'D'])
                    or (position > (start_index + 3)
                        and buffer[position - 4] in ['B', 'H'])):
                self.next = (None, 2)
            else:
                # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough',
                # 'tough'
                if (position > (start_index + 2)
                        and buffer[position - 1] == 'U'
                        and buffer[position - 3] in [
                            "C", "G", "L", "R", "T"]):
                    self.next = ('F', 2)
                else:
                    if (position > start_index
                            and buffer[position - 1] != 'I'):
                        self.next = ('K', 2)
        elif buffer[position + 1] == 'N':
            if (position == (start_index + 1)
                    and buffer[start_index] in VOWELS
                    and not self.word.is_slavo_germanic):
                self.next = ('KN', 'N', 2)
            else:
                # not e.g. 'cagney'
                if (buffer[position + 2:position + 4] != 'EY'
                        and buffer[position + 1] != 'Y'
                        and not self.word.is_slavo_germanic):
                    self.next = ('N', 'KN', 2)
                else:
                    self.next = ('KN', 2)
        # 'tagliaro'
        elif (buffer[position + 1:position + 3] == 'LI'
              and not self.word.is_slavo_germanic):
            self.next = ('KL', 'L', 2)
        # -ges-,-gep-,-gel-, -gie- at beginning
        elif (position == start_index
              and (buffer[position + 1] == 'Y'
                   or buffer[position + 1:position + 3] in [
                       "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE",
                       "EI", "ER"])):
            self.next = ('K', 'J', 2)
        # -ger-, -gy-
        elif (
                (buffer[position + 1:position + 3] == 'ER'
                 or buffer[position + 1] == 'Y')
                and buffer[start_index:start_index + 6] not in [
                    "DANGER", "RANGER", "MANGER"]
                and buffer[position - 1] not in ['E', 'I']
                and buffer[position - 1:position + 2] not in [
                    'RGY', 'OGY']):
            self.next = ('K', 'J', 2)
        # italian e.g, 'biaggi'
        elif (
                buffer[position + 1] in ['E', 'I', 'Y']
                or buffer[position - 1:position + 3] in [
                    "AGGI", "OGGI"]):
            # obvious germanic
            if (buffer[start_index:start_index + 4] in ['VON ', 'VAN ']
                    or buffer[start_index:start_index + 3] == 'SCH'
                    or buffer[position + 1:position + 3] == 'ET'):
                self.next = ('K', 2)
            else:
                # always soft if french ending
                if buffer[position + 1:position + 5] == 'IER ':
                    self.next = ('J', 2)
                else:
                    self.next = ('J', 'K', 2)
        elif buffer[position + 1] == 'G':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_h(self):
        """Keep 'H' only at word start before a vowel, or between vowels."""
        if ((self.position == self.word.start_index
             or self.word.buffer[self.position - 1] in VOWELS)
                and self.word.buffer[self.position + 1] in VOWELS):
            self.next = ('H', 2)
        # (also takes care of 'HH')
        else:
            self.next = (None, 1)

    def process_j(self):
        """Encode 'J', with Spanish and Slavic special cases."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # obvious spanish, 'jose', 'san jacinto'
        if (buffer[self.position:self.position + 4] == 'JOSE'
                or buffer[start_index:start_index + 4] == 'SAN '):
            if (
                    (position == start_index
                     and buffer[position + 4] == ' ')
                    or buffer[start_index:start_index + 4] == 'SAN '):
                self.next = ('H', )
            else:
                self.next = ('J', 'H')
        # Yankelovich/Jankelowicz
        elif (position == start_index
              and buffer[self.position:self.position + 4] != 'JOSE'):
            self.next = ('J', 'A')
        else:
            # spanish pron. of e.g. 'bajador'
            if (buffer[position - 1] in VOWELS
                    and not self.word.is_slavo_germanic
                    and buffer[position + 1] in ['A', 'O']):
                self.next = ('J', 'H')
            else:
                if position == self.word.end_index:
                    self.next = ('J', ' ')
                else:
                    if (buffer[position + 1] not in [
                            "L", "T", "K", "S", "N", "M", "B", "Z"]
                            and buffer[position - 1] not in [
                                "S", "K", "L"]):
                        self.next = ('J',)
                    else:
                        self.next = (None, )
        # The advance is appended last: a doubled 'JJ' is consumed together.
        if buffer[position + 1] == 'J':
            self.next = self.next + (2,)
        else:
            self.next = self.next + (1,)

    def process_k(self):
        """Encode 'K', collapsing a doubled 'KK'."""
        if self.word.buffer[self.position + 1] == 'K':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_l(self):
        """Encode 'L'; some Spanish 'LL' forms get an empty secondary code."""
        buffer = self.word.buffer
        position = self.position
        end_index = self.word.end_index
        if buffer[position + 1] == 'L':
            # spanish e.g. 'cabrillo', 'gallegos'
            if ((position == (end_index - 2)
                 and buffer[position - 1:position + 3] in [
                     "ILLO", "ILLA", "ALLE"])
                    or ((buffer[end_index - 1:end_index + 1] in [
                        "AS", "OS"]
                         or buffer[end_index] in ["A", "O"])
                        and buffer[position - 1:position + 3] == 'ALLE')):
                self.next = ('L', '', 2)
            else:
                self.next = ('L', 2)
        else:
            self.next = ('L', 1)

    def process_m(self):
        """
        Encode 'M', swallowing a following silent 'B' ('dumb', 'plumber')
        or a doubled 'MM'.
        """
        buffer = self.word.buffer
        position = self.position
        # The 'UMB' window is centred on the current 'M' (one character back,
        # one forward); looking only forward could never match because the
        # current character is itself the 'M'.
        if ((buffer[position - 1:position + 2] == 'UMB'
             and (position + 1 == self.word.end_index
                  or buffer[position + 2:position + 4] == 'ER'))
                or buffer[position + 1] == 'M'):
            self.next = ('M', 2)
        else:
            self.next = ('M', 1)

    def process_n(self):
        """Encode 'N', collapsing a doubled 'NN'."""
        if self.word.buffer[self.position + 1] == 'N':
            self.next = ('N', 2)
        else:
            self.next = ('N', 1)

    def process_p(self):
        """Encode 'P' ('PH' becomes 'F'; 'PP' and 'PB' collapse to 'P')."""
        if self.word.buffer[self.position + 1] == 'H':
            self.next = ('F', 2)
        # also account for "campbell", "raspberry"
        elif self.word.buffer[self.position + 1] in ['P', 'B']:
            self.next = ('P', 2)
        else:
            self.next = ('P', 1)

    def process_q(self):
        """Encode 'Q' as 'K', collapsing a doubled 'QQ'."""
        if self.word.buffer[self.position + 1] == 'Q':
            self.next = ('K', 2)
        else:
            self.next = ('K', 1)

    def process_r(self):
        """Encode 'R'; a final French '-IER' 'R' goes to the secondary only."""
        buffer = self.word.buffer
        position = self.position
        end_index = self.word.end_index
        # french e.g. 'rogier', but exclude 'hochmeier'
        if (position == end_index
                and not self.word.is_slavo_germanic
                and buffer[position - 2:position] == 'IE'
                and buffer[position - 4:position - 2] not in ['ME', 'MA']):
            self.next = ('', 'R')
        else:
            self.next = ('R',)
        if buffer[position + 1] == 'R':
            self.next = self.next + (2,)
        else:
            self.next = self.next + (1,)

    def process_s(self):
        """Encode 'S' and its combinations ('SH', 'SC', 'SCH', 'SZ', ...)."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        end_index = self.word.end_index
        # special cases 'island', 'isle', 'carlisle', 'carlysle'
        if buffer[position - 1:position + 2] in ['ISL', 'YSL']:
            self.next = (None, 1)
        # special case 'sugar-'
        elif (position == start_index
              and buffer[start_index:start_index + 5] == 'SUGAR'):
            self.next = ('X', 'S', 1)
        elif buffer[position:position + 2] == 'SH':
            # germanic
            if buffer[position + 1:position + 5] in [
                    "HEIM", "HOEK", "HOLM", "HOLZ"]:
                self.next = ('S', 2)
            else:
                self.next = ('X', 2)
        # italian & armenian
        elif (buffer[position:position + 3] in ["SIO", "SIA"]
              or buffer[position:position + 4] == 'SIAN'):
            if not self.word.is_slavo_germanic:
                self.next = ('S', 'X', 3)
            else:
                self.next = ('S', 3)
        # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider'
        # match 'schneider' also, -sz- in slavic language altho in
        # hungarian it is pronounced 's'
        elif ((position == start_index
               and buffer[position + 1] in ["M", "N", "L", "W"])
              or buffer[position + 1] == 'Z'):
            self.next = ('S', 'X')
            if buffer[position + 1] == 'Z':
                self.next = self.next + (2,)
            else:
                self.next = self.next + (1,)
        elif buffer[position:position + 2] == 'SC':
            # Schlesinger's rule
            if buffer[position + 2] == 'H':
                # dutch origin, e.g. 'school', 'schooner'
                if buffer[position + 3:position + 5] in [
                        "OO", "ER", "EN", "UY", "ED", "EM"]:
                    # 'schermerhorn', 'schenker'
                    if buffer[position + 3:position + 5] in ['ER', 'EN']:
                        self.next = ('X', 'SK', 3)
                    else:
                        self.next = ('SK', 3)
                else:
                    if (position == start_index
                            and buffer[start_index + 3] not in VOWELS
                            and buffer[start_index + 3] != 'W'):
                        self.next = ('X', 'S', 3)
                    else:
                        self.next = ('X', 3)
            elif buffer[position + 2] in ['I', 'E', 'Y']:
                self.next = ('S', 3)
            else:
                self.next = ('SK', 3)
        # french e.g. 'resnais', 'artois'
        elif (position == end_index
              and buffer[position - 2:position] in ['AI', 'OI']):
            self.next = ('', 'S', 1)
        else:
            self.next = ('S', )
            if buffer[position + 1] in ['S', 'Z']:
                self.next = self.next + (2,)
            else:
                self.next = self.next + (1,)

    def process_t(self):
        """Encode 'T' and its combinations ('TION', 'TIA', 'TCH', 'TH', ...)."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        if buffer[position:position + 4] == 'TION':
            self.next = ('X', 3)
        elif buffer[position:position + 3] in ['TIA', 'TCH']:
            self.next = ('X', 3)
        elif (buffer[position:position + 2] == 'TH'
              or buffer[position:position + 3] == 'TTH'):
            # special case 'thomas', 'thames' or germanic
            if (buffer[position + 2:position + 4] in ['OM', 'AM']
                    or buffer[start_index:start_index + 4] in [
                        'VON ', 'VAN ']
                    or buffer[start_index:start_index + 3] == 'SCH'):
                self.next = ('T', 2)
            else:
                # '0' (zero) is the code emitted for the 'th' sound.
                self.next = ('0', 'T', 2)
        elif buffer[position + 1] in ['T', 'D']:
            self.next = ('T', 2)
        else:
            self.next = ('T', 1)

    def process_v(self):
        """Encode 'V' as 'F', collapsing a doubled 'VV'."""
        if self.word.buffer[self.position + 1] == 'V':
            self.next = ('F', 2)
        else:
            self.next = ('F', 1)

    def process_w(self):
        """Encode 'W'; it is skipped unless one of the special cases applies."""
        buffer = self.word.buffer
        position = self.position
        start_index = self.word.start_index
        # can also be in middle of word
        if buffer[position:position + 2] == 'WR':
            self.next = ('R', 2)
        elif (position == start_index
              and (buffer[position + 1] in VOWELS
                   or buffer[position:position + 2] == 'WH')):
            # Wasserman should match Vasserman
            if buffer[position + 1] in VOWELS:
                self.next = ('A', 'F', 1)
            else:
                self.next = ('A', 1)
        # Arnow should match Arnoff
        elif ((position == self.word.end_index
               and buffer[position - 1] in VOWELS)
              or buffer[position - 1:position + 4] in [
                  "EWSKI", "EWSKY", "OWSKI", "OWSKY"]
              or buffer[start_index:start_index + 3] == 'SCH'):
            self.next = ('', 'F', 1)
        # polish e.g. 'filipowicz'
        elif buffer[position:position + 4] in ["WICZ", "WITZ"]:
            self.next = ('TS', 'FX', 4)
        else:  # default is to skip it
            self.next = (None, 1)

    def process_x(self):
        """Encode 'X' as 'KS', except for a silent final French 'X'."""
        buffer = self.word.buffer
        position = self.position
        # french e.g. breaux
        self.next = (None, )
        if not (
                position == self.word.end_index
                and (buffer[position - 3:position] in ["IAU", "EAU"]
                     or buffer[position - 2:position] in ['AU', 'OU'])):
            self.next = ('KS',)
        if buffer[position + 1] in ['C', 'X']:
            self.next = self.next + (2,)
        else:
            self.next = self.next + (1,)

    def process_z(self):
        """Encode 'Z' ('ZH' becomes 'J'; some forms add a 'TS' secondary)."""
        # chinese pinyin e.g. 'zhao'
        if self.word.buffer[self.position + 1] == 'H':
            self.next = ('J', )
        elif (
                self.word.buffer[self.position + 1:self.position + 3] in [
                    "ZO", "ZI", "ZA"]
                or (self.word.is_slavo_germanic
                    and self.position > self.word.start_index
                    and self.word.buffer[self.position - 1] != 'T')):
            self.next = ('S', 'TS')
        else:
            self.next = ('S', )
        if (self.word.buffer[self.position + 1] == 'Z'
                or self.word.buffer[self.position + 1] == 'H'):
            self.next = self.next + (2,)
        else:
            self.next = self.next + (1,)

    def parse(self, input):
        """
        Encode ``input`` and return ``(primary, secondary)``.

        ``secondary`` is the empty string when it would be identical to
        ``primary``.  All per-word state is reset first, so one instance can
        be reused for any number of words.
        """
        self.word = Word(input)
        self.position = self.word.start_index
        # Start from empty codes; otherwise a second call on the same
        # instance would append to the previous word's result.
        self.primary_phone = ""
        self.secondary_phone = ""
        self.check_word_start()
        # loop through chars in word.buffer
        while self.position <= self.word.end_index:
            character = self.word.buffer[self.position]
            # Default action for this character: add nothing, advance by one.
            # Resetting here keeps characters without a handler (digits,
            # punctuation) and handler paths that set nothing from replaying
            # the previous character's result.
            self.next = (None, 1)
            if character in VOWELS:
                self.process_initial_vowels()
            elif character == ' ':
                self.position += 1
                continue
            elif character in self._HANDLED_CONSONANTS:
                getattr(self, 'process_' + character.lower())()
            if len(self.next) == 2:
                if self.next[0]:
                    self.primary_phone += self.next[0]
                    self.secondary_phone += self.next[0]
                self.position += self.next[1]
            elif len(self.next) == 3:
                if self.next[0]:
                    self.primary_phone += self.next[0]
                if self.next[1]:
                    self.secondary_phone += self.next[1]
                self.position += self.next[2]
        if self.primary_phone == self.secondary_phone:
            self.secondary_phone = ""
        return (self.primary_phone, self.secondary_phone)
# backwards compatibility for the pre-OO implementation
def _doublemetaphone(input):
    """
    Function-style entry point kept for callers of the pre-OO API.

    Encodes ``input`` with a fresh ``DoubleMetaphone`` instance and returns
    the resulting 2-tuple of double metaphone codes.  The second element is
    an empty string when it is identical to the first.
    """
    encoder = DoubleMetaphone()
    codes = encoder.parse(input)
    return codes
from qgis.core import *
from qgis.gui import *
@qgsfunction(args='auto', group='Custom')
def doublemetaphone0(text, feature, parent):
    """
    Return the first (primary) double metaphone code of ``text``.

    ``feature`` and ``parent`` are accepted but not used by this function.
    """
    primary, _secondary = _doublemetaphone(text)
    return primary
@qgsfunction(args='auto', group='Custom')
def doublemetaphone1(text, feature, parent):
    """
    Return the second (secondary) double metaphone code of ``text``.

    The result is an empty string when the secondary code is identical to
    the primary one.  ``feature`` and ``parent`` are accepted but not used
    by this function.
    """
    _primary, secondary = _doublemetaphone(text)
    return secondary
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment