Skip to content

Instantly share code, notes, and snippets.

@LinguList
Created August 17, 2019 15:19
Show Gist options
  • Save LinguList/7fac44813572f65259c872ef89fa64ad to your computer and use it in GitHub Desktop.
Save LinguList/7fac44813572f65259c872ef89fa64ad to your computer and use it in GitHub Desktop.
Feature-Based Alignment Analyses with LingPy and CLTS (1)

Feature-Based Alignment Analyses with LingPy and CLTS (1)

Requirements (both can be installed with pip)

  • pyclts
  • tabulate

Run code

python features.py

More information

from pyclts.transcriptionsystem import TranscriptionSystem
from itertools import combinations
def score_sounds(
        a,
        b,
        features=None,
        classes=None,
        bipa=None
        ):
    """
    Score the similarity of two sounds by counting shared feature values.

    The score is a Hamming-style similarity: the number of features on which
    both sounds agree, divided by the number of features of their sound type
    and multiplied by the base score of that type.

    :param a: First sound, as a string understood by ``bipa``.
    :param b: Second sound, as a string understood by ``bipa``.
    :param features: Mapping from sound type to the list of feature names
        compared for that type. When empty or ``None``, the feature names of
        ``bipa['t']``, ``bipa['a']`` and ``bipa['⁵⁵']`` are used for
        "consonant", "vowel" and "tone".
    :param classes: Mapping from sound type to its base (maximum) score.
        When empty or ``None``, every type gets a base score of 1.
    :param bipa: Transcription system used to parse the sounds. When falsy,
        ``TranscriptionSystem('bipa')`` is loaded.
    :return: ``-10`` if the two sounds are of different types, otherwise the
        share of agreeing features scaled by the base score of the type.
    """
    # fall back to the default transcription system
    if not bipa:
        bipa = TranscriptionSystem('bipa')

    # fall back to the feature names of one reference sound per type
    if not features:
        features = {
            kind: list(bipa[symbol].featuredict)
            for kind, symbol in (
                ("consonant", 't'),
                ("vowel", 'a'),
                ("tone", '⁵⁵'),
            )
        }

    # fall back to a base score of 1 for every type
    if not classes:
        classes = dict.fromkeys(("consonant", "vowel", "tone"), 1)

    # parse both sounds in one call
    first, second = bipa(a + ' ' + b)

    # sounds exposing `from_sound` (diphthongs, clusters) are compared
    # through that sound instead
    first = getattr(first, 'from_sound', first)
    second = getattr(second, 'from_sound', second)

    # sounds of different types get a fixed penalty
    if first.type != second.type:
        return -10

    compared = features[first.type]
    total = len(compared)
    # scales the raw feature count to the base score of the type
    weight = classes[first.type] / total

    # identical input needs no feature comparison
    if a == b:
        return total * weight

    mismatches = sum(
        1 for name in compared
        if first.featuredict[name] != second.featuredict[name]
    )
    return (total - mismatches) * weight
def get_scorer(
        letters,
        bipa=None,
        classes=None,
        features=None
        ):
    """
    Retrieve a scoring dictionary for alignment algorithms.

    :param letters: Iterable of sound strings to score against each other.
    :param bipa: Transcription system used to parse the sounds. When falsy,
        ``TranscriptionSystem('bipa')`` is loaded once and shared by all
        comparisons.
    :param classes: Mapping from sound type to its base (maximum) score.
        When empty or ``None``, every type gets a base score of 1.
    :param features: Mapping from sound type to the list of feature names
        compared for that type. When empty or ``None``, the feature names of
        ``bipa['t']``, ``bipa['a']`` and ``bipa['⁵⁵']`` are used.
    :return: Dictionary mapping every ordered pair ``(x, y)`` of letters,
        including ``(x, x)``, to the value of ``score_sounds`` for the pair.
    """
    # load bipa object
    bipa = bipa or TranscriptionSystem('bipa')
    # define the features
    features = features or {
        "consonant": list(bipa['t'].featuredict),
        "vowel": list(bipa['a'].featuredict),
        "tone": list(bipa['⁵⁵'].featuredict),
    }
    # define base score for the classes
    classes = classes or {
        "consonant": 1,
        "vowel": 1,
        "tone": 1,
    }

    # materialize so the letters can be traversed twice
    letters = list(letters)
    scorer = {}

    # identity scores, computed once per letter so that a single-letter
    # inventory is covered as well
    for letter in letters:
        scorer[letter, letter] = score_sounds(
            letter, letter, features=features, classes=classes, bipa=bipa)

    # the scores are symmetric, so every unordered pair is computed once;
    # features and classes are forwarded so caller-supplied settings apply
    for a, b in combinations(letters, r=2):
        scorer[a, b] = scorer[b, a] = score_sounds(
            a, b, features=features, classes=classes, bipa=bipa)
    return scorer
from tabulate import tabulate
# demo inventories: a few stops and the five cardinal vowels
cons = ['p', 't', 'b', 'd', 'pʰ', 'tʰ']
vows = ['a', 'e', 'i', 'o', 'u']
scorer = get_scorer(cons+vows)

# print one pairwise similarity table per inventory, consonants first
for sounds in (cons, vows):
    size = len(sounds)
    # the diagonal keeps the initial value of 1
    matrix = [[1] * size for _ in range(size)]
    # fill both triangles from a single lookup per pair
    for (i, a), (j, b) in combinations(enumerate(sounds), r=2):
        value = round(scorer[a, b], 2)
        matrix[i][j] = value
        matrix[j][i] = value
    # prepend the row label to every row
    matrix = [[label] + row for label, row in zip(sounds, matrix)]
    print(tabulate(matrix, headers=sounds, tablefmt='pipe'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment