Skip to content

Instantly share code, notes, and snippets.

@rma2015
Last active April 6, 2017 22:12
Show Gist options
  • Save rma2015/feee2ec1cc4450d91d50 to your computer and use it in GitHub Desktop.
Save rma2015/feee2ec1cc4450d91d50 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from enum import Enum
from itertools import chain
import re
import pprint
# Path of the input corpus; the __main__ block opens it as UTF-16 text.
filepath = "chukchi_texts.txt"
#pprint.pprint(words)
def suffix(word, affix_recessive, affix_dominant = "\0"):
    """Return True if *word* ends with either of the two affix forms.

    ``affix_dominant`` defaults to a NUL character, presumably a placeholder
    that never matches real text when only one form is supplied.
    """
    variants = (affix_recessive, affix_dominant)
    return any(word.endswith(variant) for variant in variants)
def prefix(word, affix_recessive, affix_dominant = "\0"):
    """Return True if *word* starts with either of the two affix forms.

    ``affix_dominant`` defaults to a NUL character, presumably a placeholder
    that never matches real text when only one form is supplied.
    """
    variants = (affix_recessive, affix_dominant)
    return any(word.startswith(variant) for variant in variants)
def in_affix(word, affix_recessive, affix_dominant = "\0"):
    """Return True if either of the two affix forms occurs anywhere in *word*.

    ``affix_dominant`` defaults to a NUL character, presumably a placeholder
    that never matches real text when only one form is supplied.
    """
    variants = (affix_recessive, affix_dominant)
    return any(variant in word for variant in variants)
# Verb prefixes grouped by slot: segment_word() walks this list in order and
# strips at most one prefix per set. is_verb() only consults slot 0.
verb_prefixes = [
# slot 0 -- glossed in verb_prefix_glosses[0] with labels such as '1SG.S/A.IND'.
# NOTE(review): 'ɣe' and 'ɣa' have no entry in that gloss dict -- confirm.
{'mət',
'mən',
'mənʔ',
'ʔən',
'nʔ',
'mʔ',
'ne',
'na',
'ɣe',
'ɣa',
't',
'm',
'n',
'q',},
# slot 1 -- glossed 'FUT' in verb_prefix_glosses[1]
{'re', 'ra', 'r'},
# slot 2 -- glossed '1SG.O' in verb_prefix_glosses[2]
{'ine',
'ena',
'in',
'en'},
# slot 3 -- glossed 'CS' in verb_prefix_glosses[3]
{'r', 'n'}
]
# Verb suffixes grouped by slot, listed left to right. segment_word() strips
# suffixes from the end of the word, so the last set here is tried first.
verb_suffixes = [{'et', 'at', 'ew', 'aw'},
{'tku', 'tko'},
{'ŋŋo', 'pɬətku'},
{'ɣe','ɣi', 'nin', 'nen', 'ɬin', 'ɬen', 'qin', 'qen', 'jɣəm',
'qenat', 'ninet', 'ɬinet', 'ɬenat', 'nenat', 'qinet',
'rkəni', 'rkəne', 'ɣʔe', 'ɣʔa', 'ɣʔi', 'rk'},
{'n', 't', 'ɣəm','ɣət','muri','more','turi','tore', 'k',
'rkən'}]
# Word-final endings that is_verb() treats as verbal. The value says whether
# the word must additionally start with one of verb_prefixes[0] (True) or
# whether the ending alone is enough (False).
# NOTE(review): some verb_suffixes entries (e.g. 'ɣʔi', 'rk', 'n') are not
# listed here -- confirm that this is intentional.
verb_endings_require_prefix = {'k': True,
'nin': False,
'nen': False,
'ɬinet': False,
'ɬenat': False,
'ɬin': False,
'ɬen': False,
'qin': True,
'qen': True,
'qinet': True,
'qenat': True,
'jɣəm': True,
'ɣe': False,
'ɣi': False,
'ninet': False,
'nenat': False,
't': True,
'ɣəm': True,
'ɣət': True,
'muri': True,
'more': True,
'turi': True,
'tore': True,
'ɣʔe': False,
'ɣʔa': False,
'rkən': False,
'rkəne': False,
'rkəni': False}
# Noun prefixes and suffixes, grouped by slot in the same way as the verb
# tables; break_up_word() passes them to segment_word() for nouns.
noun_prefixes = [{'ɣe', 'ɣa'},
{'taŋ', 'mejŋ'}]
# NOTE(review): 'jekwe', 'cəku' and 'cəko' appear in both of the last two
# suffix slots -- confirm that this is intentional.
noun_suffixes = [{'ɬʔ', 'ɣərɣ'},
{'cɣ'},
{'cəku', 'cəko','jekwe', 'curm', 'corm'},
{'n', 't', 'ŋə', 'te', 'ta', 'e', 'a','etə',
'ɣtə', 'jpə', 'ɣəpə', 'epə', 'ɣjit', 'ɣjet',
'nu', 'no', 'k', 'jikwi', 'jekwe', 'cəku', 'cəko'}]
# Flatten the per-slot tables into one set per word class and affix side.
all_verb_prefixes = set().union(*verb_prefixes)
all_verb_suffixes = set().union(*verb_suffixes)
all_noun_prefixes = set().union(*noun_prefixes)
all_noun_suffixes = set().union(*noun_suffixes)
# Affixes listed for nouns but not for verbs; part_of_speech() uses these
# to recognise nouns.
only_noun_prefixes = all_noun_prefixes - all_verb_prefixes
only_noun_suffixes = all_noun_suffixes - all_verb_suffixes
# Every known affix regardless of word class.
all_prefixes = all_verb_prefixes | all_noun_prefixes
all_suffixes = all_verb_suffixes | all_noun_suffixes
# Gloss labels for affixes. find_glosses_affixes() looks an affix up in the
# dict whose list index equals the affix's position in the segmented list;
# a missing dict yields '*' and a missing key yields '???'.
verb_prefix_glosses = [{'t': '1SG.S/A.IND',
'mət': '1PL.S/A.IND',
'm': '1SG.S/A.INT',
'n': '3.S/A.INT',
'q': '2.S/A.INT',
'mən': '1PL.S/A.INT',
'mʔ': '1SG.S/A.COND',
'mənʔ': '1PL.S/A.COND',
'nʔ': 'N1.S/A.COND',
'ne': '3A',
'na': '3A',
'ʔən': '3A.INT'},
{'re': 'FUT', 'ra': 'FUT', 'r': 'FUT'},
{'ine': '1SG.O',
'ena': '1SG.O',
'in': '1SG.O',
'en': '1SG.O'},
{'r': 'CS', 'n': 'CS'}]
# NOTE(review): only 2 dicts here versus 5 slots in verb_suffixes, and the
# second dict mixes endings from the last two suffix slots; 'tək' and 'tkə'
# do not occur in verb_suffixes -- confirm the intended alignment.
verb_suffix_glosses = [{'et': 'TH', 'at': 'TH', 'ew': 'TH', 'aw': 'TH'},
{'k': '1SG.S/A.IND//INF', 'nin': '3SG.S>3SG.O', 'nen':'3SG.S>3SG.O',
'ɣʔe': 'TH', 'ɣʔa': 'TH', 'tək': '2PL', 'tkə': '2PL.A>3.O',
'ɣəm': '1SG.S/A','ɣət': '2SG.S/A','muri': '1PL.S/A','more': '1PL.S/A',
'turi': '2PL.S/A','tore': '2PL.S/A'}]
noun_prefix_glosses = [{'ɣe':'COM', 'ɣa':'COM'},
{'taŋ':'INTS'}]
# NOTE(review): a single dict versus 4 slots in noun_suffixes; 'jtə' and 'ɣpə'
# are glossed here but noun_suffixes lists 'etə' and 'epə' -- possible typos.
noun_suffix_glosses = [{'ŋə':'ABS', 'te': 'ERG', 'ta':'ERG', 'e':'ERG', 'a':'ERG',
'jtə':'ALL', 'ɣtə':'ALL', 'jpə':'ABL', 'ɣəpə':'ABL',
'ɣpə':'ABL', 'ɣjit':'ORIENT', 'ɣjet':'ORIENT', 'nu':'EQU', 'no':'EQU'}]
# only_noun_prefixes / only_noun_suffixes (computed above) hold the affixes
# that are listed for nouns but not for verbs; part_of_speech() uses them to
# decide whether a word that is not a verb should be treated as a noun.
class WordType(Enum):
    """Word classes returned by part_of_speech()."""

    # Fallback when a word is recognised as neither a verb nor a noun.
    other = 0
    noun = 1
    verb = 2
    # Defined for completeness; nothing visible in this file returns it.
    adjective = 3
def prefix_in_word(word, prefixes, must_start=False):
    """Report whether any of *prefixes* occurs at the start of *word*.

    Args:
        word: The word to inspect.
        prefixes: Iterable of candidate prefix strings.
        must_start: If true, a prefix must sit at the very beginning of the
            word. If false, it may occur anywhere inside the leading window
            whose width is the length of the longest prefix (capped at the
            length of the word).

    Returns:
        True if a prefix was found, otherwise False. An empty collection of
        prefixes yields False (it used to raise ValueError from ``max``).
    """
    candidates = tuple(prefixes)
    if not candidates:
        return False
    if must_start:
        # str.startswith accepts a tuple and tests every candidate at once.
        return word.startswith(candidates)
    window = min(len(word), max(len(candidate) for candidate in candidates))
    head = word[:window]
    return any(candidate in head for candidate in candidates)
def suffix_in_word(word, suffixes, must_end=False):
    """Report whether any of *suffixes* occurs at the end of *word*.

    Args:
        word: The word to inspect.
        suffixes: Iterable of candidate suffix strings.
        must_end: If true, a suffix must sit at the very end of the word.
            If false, it may occur anywhere inside the trailing window whose
            width is the length of the longest suffix (capped at the length
            of the word).

    Returns:
        True if a suffix was found, otherwise False. An empty collection of
        suffixes yields False (it used to raise ValueError from ``max``).
    """
    candidates = tuple(suffixes)
    if not candidates:
        return False
    if must_end:
        # str.endswith accepts a tuple and tests every candidate at once.
        return word.endswith(candidates)
    window = min(len(word), max(len(candidate) for candidate in candidates))
    # Slice from an explicit start index: word[-0:] would be the whole word.
    tail = word[len(word) - window:]
    return any(candidate in tail for candidate in candidates)
def is_verb(word):
    """Decide whether *word* looks like a verb form.

    The word must end with one of the keys of ``verb_endings_require_prefix``.
    If that ending is flagged as requiring a prefix, the word must also start
    with one of the prefixes in ``verb_prefixes[0]``; otherwise the ending
    alone is sufficient.

    Endings are tried longest first so that the verdict never depends on the
    iteration order of the dict: a short ending such as 't' (prefix required)
    can no longer shadow a longer one such as 'ɬinet' (no prefix required).

    Returns:
        True if the word is classified as a verb, otherwise False.
    """
    first_slot_prefixes = tuple(verb_prefixes[0])
    endings_longest_first = sorted(verb_endings_require_prefix,
                                   key=len, reverse=True)
    for ending in endings_longest_first:
        if not word.endswith(ending):
            continue
        if verb_endings_require_prefix[ending]:
            # The first matching ending decides; no further endings are tried.
            return word.startswith(first_slot_prefixes)
        # A suffix is sufficient to identify this as a verb.
        return True
    # The word does not end with any verb ending, so it is not a verb.
    return False
def part_of_speech(word):
    """Classify *word* as a verb, a noun or neither.

    Verbs are checked first via is_verb(). A non-verb counts as a noun when
    it contains a noun-only prefix near its start or a noun-only suffix near
    its end; everything else is WordType.other.
    """
    if is_verb(word):
        return WordType.verb
    looks_nominal = (prefix_in_word(word, only_noun_prefixes)
                     or suffix_in_word(word, only_noun_suffixes))
    return WordType.noun if looks_nominal else WordType.other
def _longest_first(affixes):
    """Return the non-empty affixes of one slot, longest first (ties sorted)."""
    return sorted((affix for affix in affixes if affix),
                  key=lambda affix: (-len(affix), affix))


def segment_word(word, prefixes, suffixes, epenthetic='ə'):
    """Split *word* into prefixes, a root and suffixes.

    Args:
        word: The word to segment.
        prefixes: Sequence of prefix slots (each an iterable of strings),
            ordered left to right. At most one prefix per slot is stripped.
        suffixes: Sequence of suffix slots, ordered left to right. They are
            stripped from the end of the word, last slot first.
        epenthetic: Vowel that may separate an affix from the rest of the
            word; when found next to a stripped affix it is recorded as an
            item of its own.

    Returns:
        A tuple ``(prefix_list, [root], suffix_list)``, both affix lists in
        left-to-right order. A slot without a match contributes ``''``.
        Because an epenthetic vowel is stored as an extra item, positions in
        the lists do not always equal slot numbers.

    Within a slot the longest affix is tried first. The slots are sets, whose
    iteration order for strings is not stable between runs, so without this
    ordering 'm' could be stripped from a word that really starts with 'mət'.
    Empty affix strings are ignored.
    """
    my_prefixes, my_suffixes = [], []
    vowel_length = len(epenthetic)
    for prefix_slot in prefixes:
        for candidate in _longest_first(prefix_slot):
            if word.startswith(candidate):
                my_prefixes.append(candidate)
                word = word[len(candidate):]
                # check for an epenthetic vowel
                if epenthetic and word.startswith(epenthetic):
                    my_prefixes.append(epenthetic)
                    word = word[vowel_length:]
                # no other prefixes can occur in this slot
                break
        else:
            my_prefixes.append('')
    # do the suffixes in reverse order
    for suffix_slot in reversed(suffixes):
        for candidate in _longest_first(suffix_slot):
            if word.endswith(candidate):
                my_suffixes.append(candidate)
                word = word[:-len(candidate)]
                if epenthetic and word.endswith(epenthetic):
                    my_suffixes.append(epenthetic)
                    word = word[:-vowel_length]
                # no other suffixes can occur in this slot
                break
        else:
            my_suffixes.append('')
    # flip the order of suffixes back to left-to-right
    my_suffixes.reverse()
    # the remaining word is probably the root
    return (my_prefixes, [word], my_suffixes)
def break_up_word(word):
    """Segment *word* according to its part of speech.

    Returns:
        None when the word is neither a noun nor a verb; otherwise a tuple
        ``(morphemes, pos, dashed_word)`` where ``morphemes`` is the result
        of segment_word() and ``dashed_word`` joins its non-empty pieces
        with dashes.
    """
    pos = part_of_speech(word)
    # Affix tables to use for each word class that can be segmented.
    affix_tables = {
        WordType.noun: (noun_prefixes, noun_suffixes),
        WordType.verb: (verb_prefixes, verb_suffixes),
    }
    if pos not in affix_tables:
        # this is neither a noun nor a verb
        return None
    slot_prefixes, slot_suffixes = affix_tables[pos]
    morphemes = segment_word(word, slot_prefixes, slot_suffixes)
    # Drop the '' placeholders of unmatched slots before joining.
    pieces = [piece for group in morphemes for piece in group if piece]
    return morphemes, pos, '-'.join(pieces)
def find_glosses_affixes(affixes, glosses):
    """Look up a gloss for every non-empty affix.

    Args:
        affixes: Sequence of affix strings; empty strings are skipped.
        glosses: Sequence of dicts. The affix at position ``i`` is looked up
            in ``glosses[i]``.

    Returns:
        A list with one gloss per non-empty affix: the dict value if found,
        ``'???'`` if the affix is missing from its dict, and ``'*'`` if
        ``glosses`` has no entry at that position.
    """
    found = []
    for position, affix in enumerate(affixes):
        if not affix:
            continue
        try:
            label = glosses[position][affix]
        except IndexError:
            # there is no gloss table for this position
            label = '*'
        except KeyError:
            # this affix is not in the glosses
            label = '???'
        found.append(label)
    return found
def gloss_word(word_morphemes, pos):
    """Build the list of glosses for a segmented word.

    Args:
        word_morphemes: Triple ``(prefixes, root, suffixes)`` as produced by
            segment_word().
        pos: WordType.noun or WordType.verb; selects the gloss tables.

    Returns:
        ``prefix glosses + [root] + suffix glosses``. The root is passed
        through unglossed.
        NOTE(review): segment_word() returns the root as a one-element list,
        so the root item of the result is itself a list -- confirm whether a
        flat list was intended.

    Raises:
        ValueError: If *pos* is neither noun nor verb (this used to surface
            as an UnboundLocalError).
    """
    prefixes, root, suffixes = word_morphemes
    if pos is WordType.noun:
        prefix_table, suffix_table = noun_prefix_glosses, noun_suffix_glosses
    elif pos is WordType.verb:
        prefix_table, suffix_table = verb_prefix_glosses, verb_suffix_glosses
    else:
        raise ValueError(
            'cannot gloss a word of type {!r}; expected noun or verb'.format(pos))
    glossed_prefixes = find_glosses_affixes(prefixes, prefix_table)
    root_gloss = root  # replace with something else (like ???) if you want to
    glossed_suffixes = find_glosses_affixes(suffixes, suffix_table)
    return glossed_prefixes + [root_gloss] + glossed_suffixes
def dash_insertion_reduplicate(word, length):
    """Insert a dash in front of the last *length* characters of *word*.

    A *length* of 0 puts the dash at the very end. Previously it ended up at
    the front, because ``word[-0:]`` is the whole word rather than an empty
    tail. Other values behave exactly as plain slicing does; in particular a
    *length* larger than the word yields ``'-' + word``.
    """
    if length == 0:
        return word + "-"
    return word[:-length] + "-" + word[-length:]
def format_epenthetic(word, epenthetic='ə'):
    """Set epenthetic vowels next to a dash apart as morphemes of their own.

    Args:
        word: A dash-separated word such as ``'tə-abc'``.
        epenthetic: The epenthetic vowel.

    Returns:
        None for an empty *word*. Otherwise the word, in which every
        epenthetic vowel that touches a morpheme boundary is surrounded by
        dashes (``'tə-abc'`` -> ``'t-ə-abc'``). The word is returned
        unchanged when no vowel touches a dash or when it already contains
        an isolated ``-ə-``.

    Word-initial and word-final vowels are left attached, and a segment that
    consists only of the vowel is kept as is. Previously the segment index
    was compared with ``len(word)`` instead of the number of segments, so
    word-final vowels were split off and a lone vowel segment was duplicated.
    """
    if not word:
        # this is not a word
        return None
    touches_dash = '-' + epenthetic in word or epenthetic + '-' in word
    already_isolated = '-{}-'.format(epenthetic) in word
    if not touches_dash or already_isolated:
        return word
    segments = word.split('-')
    last = len(segments) - 1
    width = len(epenthetic)
    pieces = []
    for n, segment in enumerate(segments):
        # A segment that is nothing but the vowel has nothing to split off.
        splittable = len(segment) > width
        # The vowel only borders a dash on the left if this is not the first
        # segment, and on the right if this is not the last segment.
        leading = splittable and n > 0 and segment.startswith(epenthetic)
        trailing = splittable and n < last and segment.endswith(epenthetic)
        if leading and trailing and len(segment) > 2 * width:
            pieces += [epenthetic, segment[width:-width], epenthetic]
        elif leading:
            pieces += [epenthetic, segment[width:]]
        elif trailing:
            pieces += [segment[:-width], epenthetic]
        else:
            pieces.append(segment)
    return '-'.join(pieces)
if __name__ == '__main__':
    # The input corpus is read as UTF-16 text.
    with open(filepath, encoding='utf-16') as text_file:
        original_text = text_file.read()
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    word_expression = r'\w+'
    words = re.findall(word_expression, original_text)
    # break_up_word() returns None for words that are neither noun nor verb.
    morphed_words = [break_up_word(word) for word in words if word]
    to_gloss = ((word[0], word[1]) for word in morphed_words if word)
    glossed_words = [gloss_word(morphemes, pos) for morphemes, pos in to_gloss]
    # Explicit encoding: the segmented words contain non-ASCII characters
    # that the platform default encoding may not be able to represent.
    with open('output.txt', 'w', encoding='utf-8') as output_file:
        pprint.pprint(morphed_words, stream=output_file)
        # Writing the glosses is currently disabled:
        # print('*****GLOSSES******', file=output_file)
        # pprint.pprint(glossed_words, stream=output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment