Skip to content

Instantly share code, notes, and snippets.

@macleginn
Last active June 25, 2017 09:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save macleginn/4ef36f18052d5c8199447870a1aa16e0 to your computer and use it in GitHub Desktop.
Save macleginn/4ef36f18052d5c8199447870a1aa16e0 to your computer and use it in GitHub Desktop.
Convert the data table of the phonologies of the languages of Kurdistan into a normalised format
import numpy as np
import pandas as pd
import re
from functools import reduce
# Single-character rewrites applied to every phoneme token in one pass.
# A value of None deletes the character.
_PHONEME_TABLE = str.maketrans({
    ':': 'ː',        # ASCII colon -> IPA length mark
    '\u0361': None,  # combining double inverted breve (tie bar)
    'ˠ': 'ˤ',        # velarisation mark -> pharyngealisation mark
    '\u033b': None,  # combining square below
    "'": 'ʰ',        # apostrophe -> superscript h
    '\u032a': None,  # combining bridge below
})


def process_phoneme(p):
    """Normalise phonetic symbols and enforce pharyngealised treatment of emphatics.

    Only the part of the token before the first '/' is kept. The symbol
    rewrites in ``_PHONEME_TABLE`` are then applied, and a combining tilde
    overlay (U+0334) is rewritten to 'ˤ' unless the token contains 'l'
    (presumably so that a velarised lateral keeps its overlay -- confirm
    against the data conventions).

    Args:
        p: A single phoneme token, optionally wrapped in '(...)' or '<...>'.

    Returns:
        A ``(symbol, status)`` tuple. ``status`` is 'borrowed' if the token
        contains '(', otherwise 'marginal' if it contains '<', otherwise
        'normal'. ``symbol`` is the normalised token with any leading and
        trailing '(', ')', '<' and '>' characters stripped.
    """
    symbol = p.split('/')[0].translate(_PHONEME_TABLE)
    if 'l' not in symbol and '\u0334' in symbol:
        symbol = symbol.replace('\u0334', 'ˤ')

    # The bracket check must precede the strip below, which removes the
    # very characters that encode the status.
    if '(' in symbol:
        status = 'borrowed'
    elif '<' in symbol:
        status = 'marginal'
    else:
        status = 'normal'
    return symbol.strip('()<>'), status
def process_row(r):
    """Normalise a description of a language.

    The whitespace-separated 'Cons' field is split into phoneme tokens,
    each token is passed through ``process_phoneme``, and the result is
    returned in long format: one output row per token.

    Args:
        r: An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``.
            The row must provide 'Name', 'Group', 'Lat', 'Lon' and a string
            'Cons' value.

    Returns:
        A ``pandas.DataFrame`` with columns 'Name', 'Group', 'Lat', 'Lon',
        'Cons' and 'Status'. The first four repeat the language's values
        on every row; 'Cons' and 'Status' hold the normalised phoneme and
        its status.
    """
    record = r[1]
    tokens = re.split(r'\s+', record['Cons'].strip())

    phons = []
    status = []
    for el in tokens:
        p, s = process_phoneme(el)
        # For debugging: report tokens that normalise to an empty string.
        # They are still appended below, so the output keeps one row per
        # input token.
        if not p:
            print(el)
            print(record['Cons'])
            print(record['Name'])
            print()
        phons.append(p)
        status.append(s)

    count = len(phons)
    return pd.DataFrame({
        'Name': np.repeat(record['Name'], count),
        'Group': np.repeat(record['Group'], count),
        'Lat': np.repeat(record['Lat'], count),
        'Lon': np.repeat(record['Lon'], count),
        'Cons': phons,
        'Status': status,
    })
data = pd.read_csv('june_6_data.tsv', sep='\t')
# Normalise every language description, then stack the per-language
# frames in a single concatenation. ``DataFrame.append`` was removed in
# pandas 2.0, and folding with it re-copied the accumulated frame for
# every input row.
frames = [process_row(row) for row in data.iterrows()]
# ``pd.concat`` raises on an empty list, so fall back to an empty frame
# when the input table has no rows.
dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
dataframe.to_csv('Kurdistan_normalised_data.csv', sep=',', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment