|
from pyclts.transcriptionsystem import TranscriptionSystem |
|
from itertools import combinations |
|
|
|
def score_sounds(
        a,
        b,
        features=None,
        classes=None,
        bipa=None
):
    """
    Score two sounds by the share of feature values they have in common.

    Both symbols are parsed together by ``bipa``. Sounds of different types
    get a flat score of -10. Otherwise the score is the number of agreeing
    features, divided by the number of compared features and multiplied by
    the base score of the sound type (Hamming-style comparison).

    :param a: symbol of the first sound.
    :param b: symbol of the second sound.
    :param features: mapping from sound type to the feature names that are
        compared; defaults to the feature names of BIPA ``t`` (consonant),
        ``a`` (vowel) and ``⁵⁵`` (tone).
    :param classes: mapping from sound type to its base score; defaults to
        1 for consonants, vowels and tones.
    :param bipa: transcription system used to parse the symbols; a BIPA
        ``TranscriptionSystem`` is loaded when omitted.
    :return: -10 if the sound types differ, otherwise the normalized score.
    """
    # fall back to the BIPA transcription system
    if not bipa:
        bipa = TranscriptionSystem('bipa')

    # default feature inventory: one reference sound per sound type
    if not features:
        features = {
            kind: list(bipa[symbol].featuredict)
            for kind, symbol in (
                ("consonant", 't'),
                ("vowel", 'a'),
                ("tone", '⁵⁵'),
            )
        }

    # default base score per sound type
    if not classes:
        classes = dict.fromkeys(("consonant", "vowel", "tone"), 1)

    # parse both symbols in a single call
    first, second = bipa(a + ' ' + b)

    # sounds exposing ``from_sound`` (diphthongs or clusters, per the
    # original comment) are represented by that component sound
    first = getattr(first, 'from_sound', first)
    second = getattr(second, 'from_sound', second)

    # sounds of different types are not compared feature by feature
    if first.type != second.type:
        return -10

    compared = features[first.type]
    total = len(compared)

    # scale so that a full match equals the base score of the type
    weight = classes[first.type] / total

    # identical symbols need no feature comparison
    if a == b:
        return total * weight

    # every disagreeing feature lowers the score by one step
    differing = sum(
        1 for name in compared
        if first.featuredict[name] != second.featuredict[name]
    )
    return (total - differing) * weight
|
|
|
|
|
def get_scorer(
        letters,
        bipa=None,
        classes=None,
        features=None
):
    """
    Retrieve a scoring dictionary for alignment algorithms.

    Every letter is scored against itself and every unordered pair of
    letters is scored once with ``score_sounds``; pair scores are stored
    under both key orders.

    :param letters: iterable of sound symbols to score against each other.
    :param bipa: transcription system used to parse the symbols; a BIPA
        ``TranscriptionSystem`` is loaded when omitted.
    :param classes: mapping from sound type to its base score, forwarded to
        ``score_sounds``; defaults to 1 for consonants, vowels and tones.
    :param features: mapping from sound type to the feature names that are
        compared, forwarded to ``score_sounds``; defaults to the feature
        names of BIPA ``t`` (consonant), ``a`` (vowel) and ``⁵⁵`` (tone).
    :return: dictionary mapping ``(a, b)`` tuples to scores, containing
        ``(a, a)`` for every letter and both ``(a, b)`` and ``(b, a)`` for
        every pair.
    """
    # load the transcription system once and share it across all calls
    bipa = bipa or TranscriptionSystem('bipa')

    # define the features
    features = features or {
        "consonant": list(bipa['t'].featuredict),
        "vowel": list(bipa['a'].featuredict),
        "tone": list(bipa['⁵⁵'].featuredict)
    }
    # define base score for the classes
    classes = classes or {
        "consonant": 1,
        "vowel": 1,
        "tone": 1
    }

    def score(a, b):
        # forward the resolved settings so that caller-supplied features
        # and classes are honoured instead of being silently dropped
        return score_sounds(
            a, b, features=features, classes=classes, bipa=bipa)

    # materialize so that one-shot iterables survive both passes below
    letters = list(letters)

    scorer = {}
    # identity scores, computed once per letter; this also covers an
    # inventory with a single letter, which yields no pairs
    for letter in letters:
        scorer[letter, letter] = score(letter, letter)

    # pairwise scores are symmetric, so store them under both key orders
    for a, b in combinations(letters, r=2):
        scorer[a, b] = scorer[b, a] = score(a, b)

    return scorer
|
|
|
|
|
from tabulate import tabulate

# demo: score a small consonant and vowel inventory and print the pairwise
# scores of each inventory as a pipe-formatted table
cons = ['p', 't', 'b', 'd', 'pʰ', 'tʰ']
vows = ['a', 'e', 'i', 'o', 'u']
scorer = get_scorer(cons+vows)

for sounds in (cons, vows):
    # start from a square matrix with 1 on the diagonal (and everywhere
    # else until the pairwise scores are filled in)
    matrix = [[1] * len(sounds) for _ in sounds]

    # fill both triangles with the rounded pairwise score
    for (i, a), (j, b) in combinations(enumerate(sounds), r=2):
        matrix[i][j] = matrix[j][i] = round(scorer[a, b], 2)

    # prepend the sound itself as row label
    matrix = [[label] + row for label, row in zip(sounds, matrix)]
    print(tabulate(matrix, headers=sounds, tablefmt='pipe'))