rma2015/chkparser2

## chkparser2
#!/usr/bin/env python3

from enum import Enum
from itertools import chain
import re
import pprint

# Path of the input text; the __main__ block opens it with UTF-16 encoding.
filepath = "chukchi_texts.txt"

#pprint.pprint(words)

def suffix(word, affix_recessive, affix_dominant="\0"):
    """Return True if ``word`` ends with either variant of an affix.

    ``affix_dominant`` defaults to a NUL character, so when only one variant
    is supplied the second check can only succeed for a word ending in NUL.
    """
    variants = (affix_recessive, affix_dominant)
    return any(word.endswith(variant) for variant in variants)

def prefix(word, affix_recessive, affix_dominant="\0"):
    """Return True if ``word`` starts with either variant of an affix.

    ``affix_dominant`` defaults to a NUL character, so when only one variant
    is supplied the second check can only succeed for a word starting with NUL.
    """
    variants = (affix_recessive, affix_dominant)
    return any(word.startswith(variant) for variant in variants)

def in_affix(word, affix_recessive, affix_dominant="\0"):
    """Return True if either variant of an affix occurs anywhere in ``word``.

    ``affix_dominant`` defaults to a NUL character, so when only one variant
    is supplied the second check can only succeed for a word containing NUL.
    """
    variants = (affix_recessive, affix_dominant)
    return any(variant in word for variant in variants)


# Verb prefixes grouped by slot: one set per slot, listed left to right.
# segment_word() walks the slots in this order and strips at most one prefix
# per slot.  is_verb() only consults slot 0.
# 'ɣe'/'ɣa' also appear in noun_prefixes, so they are excluded from
# only_noun_prefixes; they have no entry in verb_prefix_glosses.
verb_prefixes = [
                {'mət',
                'mən',
                'mənʔ',
                'ʔən',
                'nʔ',
                'mʔ',
                'ne',
                'na',
                'ɣe',
                'ɣa',
                't',
                'm',
                'n',
                'q',},
                {'re', 'ra', 'r'},
                {'ine',
                'ena',
                'in',
                'en'},
                {'r', 'n'}
            ]

# Verb suffixes grouped by slot: one set per slot, listed left to right.
# segment_word() strips them from the end of the word, starting with the
# last slot and working backwards.
# NOTE(review): there are five slots here but verb_suffix_glosses only has
# two dicts, so the two tables do not line up slot-for-slot -- confirm.
verb_suffixes = [{'et', 'at', 'ew', 'aw'},
                 {'tku', 'tko'},
                 {'ŋŋo', 'pɬətku'},
                 {'ɣe','ɣi', 'nin', 'nen', 'ɬin', 'ɬen', 'qin', 'qen', 'jɣəm',
                 'qenat', 'ninet', 'ɬinet', 'ɬenat', 'nenat', 'qinet',
                 'rkəni', 'rkəne', 'ɣʔe', 'ɣʔa', 'ɣʔi', 'rk'},
                 {'n', 't', 'ɣəm','ɣət','muri','more','turi','tore', 'k',
                  'rkən'}]

# Word-final verb endings used by is_verb().
# Value True: the ending only counts as verbal when the word also starts
# with one of the slot-0 verb prefixes.  Value False: the ending alone is
# enough.  is_verb() scans the entries in insertion order and the first
# ending the word ends with decides the result, so the order matters for
# endings that are tails of one another (e.g. 'ninet' is listed before 't').
verb_endings_require_prefix = {'k': True,
                                'nin': False,
                                'nen': False,
                                'ɬinet': False,
                                'ɬenat': False,
                                'ɬin': False,
                                'ɬen': False,
                                'qin': True,
                                'qen': True,
                                'qinet': True,
                                'qenat': True,
                                'jɣəm': True,
                                'ɣe': False,
                                'ɣi': False,
                                'ninet': False,
                                'nenat': False,
                                't': True,
                                'ɣəm': True,
                                'ɣət': True,
                                'muri': True,
                                'more': True,
                                'turi': True,
                                'tore': True,
                                'ɣʔe': False,
                                'ɣʔa': False,
                                'rkən': False,
                                'rkəne': False,
                                'rkəni': False}

# Noun prefixes grouped by slot (one set per slot, left to right).
# Slot 0 ('ɣe'/'ɣa') is shared with verb_prefixes, so only slot 1 ends up
# in only_noun_prefixes.
noun_prefixes = [{'ɣe', 'ɣa'},
                 {'taŋ', 'mejŋ'}]

# Noun suffixes grouped by slot (one set per slot, left to right);
# segment_word() strips them starting from the last slot.
# 'cəku', 'cəko' and 'jekwe' are listed in both slot 2 and slot 3.
noun_suffixes = [{'ɬʔ', 'ɣərɣ'},
                 {'cɣ'},
                 {'cəku', 'cəko','jekwe', 'curm', 'corm'},
                 {'n', 't', 'ŋə', 'te', 'ta', 'e', 'a','etə',
                  'ɣtə', 'jpə', 'ɣəpə', 'epə', 'ɣjit', 'ɣjet',
                  'nu', 'no', 'k', 'jikwi', 'jekwe', 'cəku', 'cəko'}]

# Flatten the per-slot tables into one set per word class.
all_verb_prefixes = set().union(*verb_prefixes)
all_verb_suffixes = set().union(*verb_suffixes)

all_noun_prefixes = set().union(*noun_prefixes)
all_noun_suffixes = set().union(*noun_suffixes)

# Affixes that occur in the noun tables but in none of the verb tables;
# part_of_speech() uses them to recognise nouns.
only_noun_prefixes = all_noun_prefixes - all_verb_prefixes
only_noun_suffixes = all_noun_suffixes - all_verb_suffixes

# Every affix known to the parser, regardless of word class.
all_prefixes = all_verb_prefixes | all_noun_prefixes
all_suffixes = all_verb_suffixes | all_noun_suffixes


# Gloss labels for verb prefixes: one dict per slot, in the same order as
# the slots of verb_prefixes.  find_glosses_affixes() returns '???' for an
# affix that has no key in its slot's dict ('ɣe'/'ɣa' have none here).
verb_prefix_glosses = [{'t': '1SG.S/A.IND',
                'mət': '1PL.S/A.IND',
                'm': '1SG.S/A.INT',
                'n': '3.S/A.INT',
                'q': '2.S/A.INT',
                'mən': '1PL.S/A.INT',
                'mʔ': '1SG.S/A.COND',
                'mənʔ': '1PL.S/A.COND',
                'nʔ': 'N1.S/A.COND',
                'ne': '3A',
                'na': '3A',
                'ʔən': '3A.INT'},
                {'re': 'FUT', 'ra': 'FUT', 'r': 'FUT'},
                {'ine': '1SG.O',
                'ena': '1SG.O',
                'in': '1SG.O',
                'en': '1SG.O'},
                {'r': 'CS', 'n': 'CS'}]

# Gloss labels for verb suffixes, indexed by position like the prefix table.
# NOTE(review): only two dicts are given while verb_suffixes has five slots,
# and the keys 'tək'/'tkə' do not occur in verb_suffixes at all -- the table
# looks incomplete; confirm the intended slot alignment.
verb_suffix_glosses = [{'et': 'TH', 'at': 'TH', 'ew': 'TH', 'aw': 'TH'},
                 {'k': '1SG.S/A.IND//INF', 'nin': '3SG.S>3SG.O', 'nen':'3SG.S>3SG.O',
                 'ɣʔe': 'TH', 'ɣʔa': 'TH', 'tək': '2PL', 'tkə': '2PL.A>3.O',
                 'ɣəm': '1SG.S/A','ɣət': '2SG.S/A','muri': '1PL.S/A','more': '1PL.S/A',
                 'turi': '2PL.S/A','tore': '2PL.S/A'}]

# Gloss labels for noun prefixes: one dict per slot of noun_prefixes
# ('mejŋ' has no entry, so it is glossed '???').
noun_prefix_glosses = [{'ɣe':'COM', 'ɣa':'COM'},
                 {'taŋ':'INTS'}]
# Gloss labels for noun suffixes.
# NOTE(review): a single dict is given while noun_suffixes has four slots,
# and its keys belong to the last slot, not the first; the keys 'jtə'/'ɣpə'
# do not occur in noun_suffixes (which has 'etə'/'epə') -- confirm.
noun_suffix_glosses = [{'ŋə':'ABS', 'te': 'ERG', 'ta':'ERG', 'e':'ERG', 'a':'ERG',
                    'jtə':'ALL', 'ɣtə':'ALL', 'jpə':'ABL', 'ɣəpə':'ABL',
                    'ɣpə':'ABL', 'ɣjit':'ORIENT', 'ɣjet':'ORIENT', 'nu':'EQU', 'no':'EQU'}]

# only_noun_prefixes / only_noun_suffixes (computed above) hold the affixes
# that occur in nouns but never in verbs; part_of_speech() uses them below to
# decide whether a word that is not a verb should be treated as a noun.


class WordType(Enum):
    """Part-of-speech label assigned to a word by part_of_speech()."""

    # neither a verb nor a noun according to the affix heuristics
    other = 0
    noun = 1
    verb = 2
    # declared, but never returned by part_of_speech() in this file
    adjective = 3


def prefix_in_word(word, prefixes, must_start=False):
    """Report whether any of ``prefixes`` occurs at the start of ``word``.

    With ``must_start=True`` the word has to begin with one of the prefixes.
    Otherwise the check is looser: a prefix only has to occur somewhere
    inside the first ``k`` characters of the word, where ``k`` is the length
    of the longest prefix.

    An empty ``prefixes`` collection yields False in both modes (previously
    the loose mode raised ValueError from ``max()``).
    """
    candidates = tuple(prefixes)

    if must_start:
        # str.startswith accepts a tuple and returns False for an empty one
        return word.startswith(candidates)

    if not candidates:
        return False

    # slicing clamps to the word length, so short words need no special case
    window = word[:max(map(len, candidates))]
    return any(candidate in window for candidate in candidates)

def suffix_in_word(word, suffixes, must_end=False):
    """Report whether any of ``suffixes`` occurs at the end of ``word``.

    With ``must_end=True`` the word has to end with one of the suffixes.
    Otherwise the check is looser: a suffix only has to occur somewhere
    inside the last ``k`` characters of the word, where ``k`` is the length
    of the longest suffix.

    An empty ``suffixes`` collection yields False in both modes (previously
    the loose mode raised ValueError from ``max()``).
    """
    candidates = tuple(suffixes)

    if must_end:
        # str.endswith accepts a tuple and returns False for an empty one
        return word.endswith(candidates)

    if not candidates:
        return False

    longest = max(map(len, candidates))
    # word[-0:] would be the whole word, so handle a zero-length window apart
    window = word[-longest:] if longest else ''
    return any(candidate in window for candidate in candidates)

def is_verb(word):
    """Decide from its affixes whether ``word`` looks like a verb.

    The endings in ``verb_endings_require_prefix`` are tried in dict order
    and the first one the word ends with settles the answer: either the
    ending is sufficient by itself, or the word must additionally start
    with one of the slot-0 verb prefixes.  A word with none of the endings
    is not a verb.
    """
    for ending, needs_prefix in verb_endings_require_prefix.items():
        if not word.endswith(ending):
            continue

        if not needs_prefix:
            # the ending alone identifies a verb
            return True

        # the first matching ending decides; later endings are not consulted
        return any(word.startswith(lead) for lead in verb_prefixes[0])

    # no verb ending matched
    return False


def part_of_speech(word):
    """Classify ``word`` as verb, noun or other using the affix heuristics.

    Verbs are tested first; a non-verb counts as a noun when a noun-only
    prefix or suffix is found near the matching edge of the word.
    """
    if is_verb(word):
        return WordType.verb

    looks_nominal = (prefix_in_word(word, only_noun_prefixes)
                     or suffix_in_word(word, only_noun_suffixes))
    return WordType.noun if looks_nominal else WordType.other

def segment_word(word, prefixes, suffixes, epenthetic='ə'):
    """Split ``word`` into prefixes, a root and suffixes.

    ``prefixes`` and ``suffixes`` are sequences of slots, each slot being a
    collection of affix strings; at most one affix is taken per slot.
    Prefix slots are processed left to right, suffix slots from the last
    slot backwards.  When an epenthetic vowel sits between a stripped affix
    and the rest of the word it is split off as its own entry.

    Returns ``(prefix_list, [root], suffix_list)``.  Each list holds one
    entry per slot (``''`` when nothing matched) in left-to-right order,
    plus an epenthetic entry after a prefix / before a suffix where one was
    found.

    Within a slot the longest affix is tried first.  The slots are sets, so
    iterating them directly gave an arbitrary, run-dependent order in which
    a short affix such as 'm' could win over 'mət'.
    """
    def longest_first(slot):
        # deterministic order: longer affixes first, ties alphabetically
        return sorted(slot, key=lambda affix: (-len(affix), affix))

    my_prefixes, my_suffixes = [], []

    for prefix_slot in prefixes:
        for prefix in longest_first(prefix_slot):
            if word.startswith(prefix):
                my_prefixes.append(prefix)
                word = word[len(prefix):]

                # check for an epenthetic vowel
                if word.startswith(epenthetic):
                    my_prefixes.append(epenthetic)
                    word = word[len(epenthetic):]

                # no other prefixes can occur in this slot
                break
        else:
            my_prefixes.append('')

    # do the suffixes in reverse order
    for suffix_slot in reversed(suffixes):
        for suffix in longest_first(suffix_slot):
            # an empty suffix never counts as a match (word[:-0] would
            # otherwise wipe the whole word)
            if suffix and word.endswith(suffix):
                my_suffixes.append(suffix)
                word = word[:-len(suffix)]

                if word.endswith(epenthetic):
                    my_suffixes.append(epenthetic)
                    word = word[:-len(epenthetic)]

                # no other suffixes can occur in this slot
                break
        else:
            my_suffixes.append('')

    # flip the order of suffixes back to left-to-right
    my_suffixes.reverse()

    # the remaining word is probably the root
    return (my_prefixes, [word], my_suffixes)


def break_up_word(word):
    """Segment ``word`` according to its guessed part of speech.

    Returns ``(morphemes, pos, dashed_word)`` for nouns and verbs, where
    ``morphemes`` is the tuple produced by segment_word() and
    ``dashed_word`` joins all non-empty morphemes with '-'.  Returns None
    for any other word type.
    """
    pos = part_of_speech(word)

    # affix tables to use for each word class that can be segmented
    affix_tables = {
        WordType.noun: (noun_prefixes, noun_suffixes),
        WordType.verb: (verb_prefixes, verb_suffixes),
    }
    if pos not in affix_tables:
        # this is neither a noun nor a verb
        return None

    slot_prefixes, slot_suffixes = affix_tables[pos]
    morphemes = segment_word(word, slot_prefixes, slot_suffixes)

    # drop the empty slot fillers and join what is left with dashes
    dashed_word = '-'.join(filter(None, chain.from_iterable(morphemes)))

    return morphemes, pos, dashed_word


def find_glosses_affixes(affixes, glosses, epenthetic='ə',
                         epenthetic_gloss='???'):
    """Look up a gloss for every non-empty entry of ``affixes``.

    ``affixes`` is a prefix or suffix list as built by segment_word(): one
    entry per slot (``''`` for an empty slot), possibly interleaved with
    epenthetic-vowel entries.  ``glosses`` is a list of dicts, one per slot.

    Epenthetic entries do not occupy a slot, so they are glossed with
    ``epenthetic_gloss`` and do not advance the slot counter.  Previously
    the raw list index was used as the slot number, which shifted every
    affix after an epenthetic vowel into the wrong gloss table.

    Returns the glosses in order: '???' for an affix missing from its
    slot's dict, '*' when there is no gloss dict for the slot at all.
    """
    my_glosses = []
    slot = 0

    for affix in affixes:
        if epenthetic and affix == epenthetic:
            my_glosses.append(epenthetic_gloss)
            continue

        if affix:
            try:
                gloss = glosses[slot][affix]
            except KeyError:
                # this affix is not in the glosses
                gloss = '???'
            except IndexError:
                # no gloss table exists for this slot
                gloss = '*'
            my_glosses.append(gloss)

        # both filled and empty slots consume one slot position
        slot += 1

    return my_glosses


def gloss_word(word_morphemes, pos):
    """Build the gloss line for one segmented word.

    ``word_morphemes`` is a ``(prefixes, root, suffixes)`` triple and
    ``pos`` selects the noun or verb gloss tables.  The result is the list
    of prefix glosses, then the root element as given, then the suffix
    glosses.  The root is not translated; note that segment_word() passes
    it as a one-element list, which is inserted unchanged.

    Raises ValueError for any ``pos`` other than WordType.noun or
    WordType.verb (previously this failed with UnboundLocalError).
    """
    prefixes, root, suffixes = word_morphemes

    if pos is WordType.noun:
        prefix_table, suffix_table = noun_prefix_glosses, noun_suffix_glosses
    elif pos is WordType.verb:
        prefix_table, suffix_table = verb_prefix_glosses, verb_suffix_glosses
    else:
        raise ValueError(
            'cannot gloss a word of type {!r}; '
            'expected WordType.noun or WordType.verb'.format(pos))

    prefix_glosses = find_glosses_affixes(prefixes, prefix_table)
    root_gloss = root # replace with something else (like ???) if you want to
    suffix_glosses = find_glosses_affixes(suffixes, suffix_table)

    return prefix_glosses + [root_gloss] + suffix_glosses


def dash_insertion_reduplicate(word, length):
    """Insert a dash in front of the last ``length`` characters of ``word``.

    The two halves are plain slices (``word[:-length]`` and
    ``word[-length:]``), so a ``length`` of 0 or one exceeding the word
    length puts the dash at the very front.
    """
    head = word[:-length]
    tail = word[-length:]
    return "-".join((head, tail))

def format_epenthetic(word, epenthetic='ə'):
    """Separate epenthetic vowels that touch a morpheme boundary.

    ``word`` is a dash-segmented word.  If an epenthetic vowel is attached
    to a segment right next to a dash (and the word has no already isolated
    ``-ə-``), that vowel is split off into its own dash-delimited segment.
    A vowel at the very start or very end of the whole word is left
    attached.

    Returns None for an empty/falsy ``word``, otherwise the (possibly
    unchanged) word.

    Fixes over the earlier version: the word-final guard compared the
    segment index with the character count and therefore never fired, and a
    segment consisting only of the vowel produced an empty split
    ("abc-ə" became "abc-ə--ə").
    """
    if not word:
        # this is not a word
        return None

    touches_dash = ('-' + epenthetic in word) or (epenthetic + '-' in word)
    already_split = '-{}-'.format(epenthetic) in word
    if not touches_dash or already_split:
        return word

    segments = word.split('-')
    last = len(segments) - 1
    width = len(epenthetic)
    pieces = []

    for n, segment in enumerate(segments):
        starts = segment.startswith(epenthetic)
        ends = segment.endswith(epenthetic)

        if segment == epenthetic:
            # already a segment of its own; nothing to split
            pieces.append(segment)
            continue

        if starts and ends and 0 < n < last:
            parts = [epenthetic, segment[width:-width], epenthetic]
        elif starts and n > 0:
            # leading vowel follows a dash, so it is not word-initial
            parts = [epenthetic, segment[width:]]
        elif ends and n < last:
            # trailing vowel precedes a dash, so it is not word-final
            parts = [segment[:-width], epenthetic]
        else:
            pieces.append(segment)
            continue

        # never emit an empty fragment (it would show up as a double dash)
        pieces.extend(part for part in parts if part)

    return '-'.join(pieces)


if __name__ == '__main__':
    # Read the corpus, segment every word and dump the result to output.txt.

    with open(filepath, encoding='utf-16') as text_file:
        original_text = text_file.read()

    # raw string: '\w' in a normal literal is an invalid escape sequence
    word_expression = r'\w+'
    words = re.findall(word_expression, original_text)

    # break_up_word() returns None for words that are neither noun nor verb
    morphed_words = [break_up_word(word) for word in words]

    to_gloss = ((word[0], word[1]) for word in morphed_words if word)
    glossed_words = [gloss_word(morphemes, pos) for morphemes, pos in to_gloss]

    # explicit encoding: the output contains IPA characters that the
    # platform default encoding may not be able to represent
    with open('output.txt', 'w', encoding='utf-8') as output_file:
        pprint.pprint(morphed_words, stream=output_file)

        # print('*****GLOSSES******', file=output_file)
        # pprint.pprint(glossed_words, stream=output_file)
	#!/usr/bin/env python3

	from enum import Enum
	from itertools import chain
	import re
	import pprint

	filepath = "chukchi_texts.txt"

	#pprint.pprint(words)

	def suffix(word, affix_recessive, affix_dominant="\0"):
	    """Return True if ``word`` ends with either variant of an affix.

	    ``affix_dominant`` defaults to a NUL character, so when only one
	    variant is supplied the second check can only succeed for a word
	    ending in NUL.
	    """
	    # body indentation restored; it was lost in this copy of the code
	    variants = (affix_recessive, affix_dominant)
	    return any(word.endswith(variant) for variant in variants)

	def prefix(word, affix_recessive, affix_dominant="\0"):
	    """Return True if ``word`` starts with either variant of an affix.

	    ``affix_dominant`` defaults to a NUL character, so when only one
	    variant is supplied the second check can only succeed for a word
	    starting with NUL.
	    """
	    # body indentation restored; it was lost in this copy of the code
	    variants = (affix_recessive, affix_dominant)
	    return any(word.startswith(variant) for variant in variants)

	def in_affix(word, affix_recessive, affix_dominant="\0"):
	    """Return True if either variant of an affix occurs anywhere in ``word``.

	    ``affix_dominant`` defaults to a NUL character, so when only one
	    variant is supplied the second check can only succeed for a word
	    containing NUL.
	    """
	    # body indentation restored; it was lost in this copy of the code
	    variants = (affix_recessive, affix_dominant)
	    return any(variant in word for variant in variants)


	verb_prefixes = [
	{'mət',
	'mən',
	'mənʔ',
	'ʔən',
	'nʔ',
	'mʔ',
	'ne',
	'na',
	'ɣe',
	'ɣa',
	't',
	'm',
	'n',
	'q',},
	{'re', 'ra', 'r'},
	{'ine',
	'ena',
	'in',
	'en'},
	{'r', 'n'}
	]

	verb_suffixes = [{'et', 'at', 'ew', 'aw'},
	{'tku', 'tko'},
	{'ŋŋo', 'pɬətku'},
	{'ɣe','ɣi', 'nin', 'nen', 'ɬin', 'ɬen', 'qin', 'qen', 'jɣəm',
	'qenat', 'ninet', 'ɬinet', 'ɬenat', 'nenat', 'qinet',
	'rkəni', 'rkəne', 'ɣʔe', 'ɣʔa', 'ɣʔi', 'rk'},
	{'n', 't', 'ɣəm','ɣət','muri','more','turi','tore', 'k',
	'rkən'}]

	verb_endings_require_prefix = {'k': True,
	'nin': False,
	'nen': False,
	'ɬinet': False,
	'ɬenat': False,
	'ɬin': False,
	'ɬen': False,
	'qin': True,
	'qen': True,
	'qinet': True,
	'qenat': True,
	'jɣəm': True,
	'ɣe': False,
	'ɣi': False,
	'ninet': False,
	'nenat': False,
	't': True,
	'ɣəm': True,
	'ɣət': True,
	'muri': True,
	'more': True,
	'turi': True,
	'tore': True,
	'ɣʔe': False,
	'ɣʔa': False,
	'rkən': False,
	'rkəne': False,
	'rkəni': False}

	noun_prefixes = [{'ɣe', 'ɣa'},
	{'taŋ', 'mejŋ'}]

	noun_suffixes = [{'ɬʔ', 'ɣərɣ'},
	{'cɣ'},
	{'cəku', 'cəko','jekwe', 'curm', 'corm'},
	{'n', 't', 'ŋə', 'te', 'ta', 'e', 'a','etə',
	'ɣtə', 'jpə', 'ɣəpə', 'epə', 'ɣjit', 'ɣjet',
	'nu', 'no', 'k', 'jikwi', 'jekwe', 'cəku', 'cəko'}]

	all_verb_prefixes = set(chain.from_iterable(verb_prefixes))
	all_verb_suffixes = set(chain.from_iterable(verb_suffixes))

	all_noun_prefixes = set(chain.from_iterable(noun_prefixes))
	all_noun_suffixes = set(chain.from_iterable(noun_suffixes))

	only_noun_prefixes = all_noun_prefixes.difference(all_verb_prefixes)
	only_noun_suffixes = all_noun_suffixes.difference(all_verb_suffixes)

	all_prefixes = set(chain(all_verb_prefixes, all_noun_prefixes))
	all_suffixes = set(chain(all_verb_suffixes, all_noun_suffixes))



	verb_prefix_glosses = [{'t': '1SG.S/A.IND',
	'mət': '1PL.S/A.IND',
	'm': '1SG.S/A.INT',
	'n': '3.S/A.INT',
	'q': '2.S/A.INT',
	'mən': '1PL.S/A.INT',
	'mʔ': '1SG.S/A.COND',
	'mənʔ': '1PL.S/A.COND',
	'nʔ': 'N1.S/A.COND',
	'ne': '3A',
	'na': '3A',
	'ʔən': '3A.INT'},
	{'re': 'FUT', 'ra': 'FUT', 'r': 'FUT'},
	{'ine': '1SG.O',
	'ena': '1SG.O',
	'in': '1SG.O',
	'en': '1SG.O'},
	{'r': 'CS', 'n': 'CS'}]

	verb_suffix_glosses = [{'et': 'TH', 'at': 'TH', 'ew': 'TH', 'aw': 'TH'},
	{'k': '1SG.S/A.IND//INF', 'nin': '3SG.S>3SG.O', 'nen':'3SG.S>3SG.O',
	'ɣʔe': 'TH', 'ɣʔa': 'TH', 'tək': '2PL', 'tkə': '2PL.A>3.O',
	'ɣəm': '1SG.S/A','ɣət': '2SG.S/A','muri': '1PL.S/A','more': '1PL.S/A',
	'turi': '2PL.S/A','tore': '2PL.S/A'}]

	noun_prefix_glosses = [{'ɣe':'COM', 'ɣa':'COM'},
	{'taŋ':'INTS'}]
	noun_suffix_glosses = [{'ŋə':'ABS', 'te': 'ERG', 'ta':'ERG', 'e':'ERG', 'a':'ERG',
	'jtə':'ALL', 'ɣtə':'ALL', 'jpə':'ABL', 'ɣəpə':'ABL',
	'ɣpə':'ABL', 'ɣjit':'ORIENT', 'ɣjet':'ORIENT', 'nu':'EQU', 'no':'EQU'}]

	# figure out which affixes only occur in verbs and which only in nouns
	# we will use this later to determine whether the word is a verb or a noun




	class WordType(Enum):
	    """Part-of-speech label assigned to a word by part_of_speech()."""

	    # members re-indented under the class; the indentation was lost in
	    # this copy of the code
	    other = 0
	    noun = 1
	    verb = 2
	    adjective = 3


	def prefix_in_word(word, prefixes, must_start=False):
	    """Report whether any of ``prefixes`` occurs at the start of ``word``.

	    With ``must_start=True`` the word has to begin with one of the
	    prefixes.  Otherwise the check is looser: a prefix only has to occur
	    somewhere inside the first ``k`` characters of the word, where ``k``
	    is the length of the longest prefix.

	    An empty ``prefixes`` collection yields False in both modes (the
	    loose mode used to raise ValueError from ``max()``).  Block
	    indentation, lost in this copy of the code, is restored.
	    """
	    candidates = tuple(prefixes)

	    if must_start:
	        # str.startswith accepts a tuple and returns False for an empty one
	        return word.startswith(candidates)

	    if not candidates:
	        return False

	    # slicing clamps to the word length, so short words need no special case
	    window = word[:max(map(len, candidates))]
	    return any(candidate in window for candidate in candidates)

	def suffix_in_word(word, suffixes, must_end=False):
	    """Report whether any of ``suffixes`` occurs at the end of ``word``.

	    With ``must_end=True`` the word has to end with one of the suffixes.
	    Otherwise the check is looser: a suffix only has to occur somewhere
	    inside the last ``k`` characters of the word, where ``k`` is the
	    length of the longest suffix.

	    An empty ``suffixes`` collection yields False in both modes (the
	    loose mode used to raise ValueError from ``max()``).  Block
	    indentation, lost in this copy of the code, is restored.
	    """
	    candidates = tuple(suffixes)

	    if must_end:
	        # str.endswith accepts a tuple and returns False for an empty one
	        return word.endswith(candidates)

	    if not candidates:
	        return False

	    longest = max(map(len, candidates))
	    # word[-0:] would be the whole word, so handle a zero-length window apart
	    window = word[-longest:] if longest else ''
	    return any(candidate in window for candidate in candidates)

	def is_verb(word):
	    """Decide from its affixes whether ``word`` looks like a verb.

	    The endings in ``verb_endings_require_prefix`` are tried in dict
	    order and the first one the word ends with settles the answer:
	    either the ending is sufficient by itself, or the word must
	    additionally start with one of the slot-0 verb prefixes.  A word with
	    none of the endings is not a verb.

	    Block indentation, lost in this copy of the code, is restored.
	    """
	    for ending, needs_prefix in verb_endings_require_prefix.items():
	        if not word.endswith(ending):
	            continue

	        if not needs_prefix:
	            # the ending alone identifies a verb
	            return True

	        # the first matching ending decides; later endings are not consulted
	        return any(word.startswith(lead) for lead in verb_prefixes[0])

	    # no verb ending matched
	    return False




	def part_of_speech(word):
	    """Classify ``word`` as verb, noun or other using the affix heuristics.

	    Verbs are tested first; a non-verb counts as a noun when a noun-only
	    prefix or suffix is found near the matching edge of the word.

	    Block indentation, lost in this copy of the code, is restored.
	    """
	    if is_verb(word):
	        return WordType.verb

	    looks_nominal = (prefix_in_word(word, only_noun_prefixes)
	                     or suffix_in_word(word, only_noun_suffixes))
	    return WordType.noun if looks_nominal else WordType.other

	def segment_word(word, prefixes, suffixes, epenthetic='ə'):
	    """Split ``word`` into prefixes, a root and suffixes.

	    ``prefixes`` and ``suffixes`` are sequences of slots, each slot being
	    a collection of affix strings; at most one affix is taken per slot.
	    Prefix slots are processed left to right, suffix slots from the last
	    slot backwards.  When an epenthetic vowel sits between a stripped
	    affix and the rest of the word it is split off as its own entry.

	    Returns ``(prefix_list, [root], suffix_list)``.  Each list holds one
	    entry per slot (``''`` when nothing matched) in left-to-right order,
	    plus an epenthetic entry after a prefix / before a suffix where one
	    was found.

	    Within a slot the longest affix is tried first.  The slots are sets,
	    so iterating them directly gave an arbitrary, run-dependent order in
	    which a short affix such as 'm' could win over 'mət'.  Block
	    indentation, lost in this copy of the code, is restored.
	    """
	    def longest_first(slot):
	        # deterministic order: longer affixes first, ties alphabetically
	        return sorted(slot, key=lambda affix: (-len(affix), affix))

	    my_prefixes, my_suffixes = [], []

	    for prefix_slot in prefixes:
	        for prefix in longest_first(prefix_slot):
	            if word.startswith(prefix):
	                my_prefixes.append(prefix)
	                word = word[len(prefix):]

	                # check for an epenthetic vowel
	                if word.startswith(epenthetic):
	                    my_prefixes.append(epenthetic)
	                    word = word[len(epenthetic):]

	                # no other prefixes can occur in this slot
	                break
	        else:
	            my_prefixes.append('')

	    # do the suffixes in reverse order
	    for suffix_slot in reversed(suffixes):
	        for suffix in longest_first(suffix_slot):
	            # an empty suffix never counts as a match (word[:-0] would
	            # otherwise wipe the whole word)
	            if suffix and word.endswith(suffix):
	                my_suffixes.append(suffix)
	                word = word[:-len(suffix)]

	                if word.endswith(epenthetic):
	                    my_suffixes.append(epenthetic)
	                    word = word[:-len(epenthetic)]

	                # no other suffixes can occur in this slot
	                break
	        else:
	            my_suffixes.append('')

	    # flip the order of suffixes back to left-to-right
	    my_suffixes.reverse()

	    # the remaining word is probably the root
	    return (my_prefixes, [word], my_suffixes)


	def break_up_word(word):
	    """Segment ``word`` according to its guessed part of speech.

	    Returns ``(morphemes, pos, dashed_word)`` for nouns and verbs, where
	    ``morphemes`` is the tuple produced by segment_word() and
	    ``dashed_word`` joins all non-empty morphemes with '-'.  Returns None
	    for any other word type.

	    Block indentation, lost in this copy of the code, is restored.
	    """
	    pos = part_of_speech(word)

	    # affix tables to use for each word class that can be segmented
	    affix_tables = {
	        WordType.noun: (noun_prefixes, noun_suffixes),
	        WordType.verb: (verb_prefixes, verb_suffixes),
	    }
	    if pos not in affix_tables:
	        # this is neither a noun nor a verb
	        return None

	    slot_prefixes, slot_suffixes = affix_tables[pos]
	    morphemes = segment_word(word, slot_prefixes, slot_suffixes)

	    # drop the empty slot fillers and join what is left with dashes
	    dashed_word = '-'.join(filter(None, chain.from_iterable(morphemes)))

	    return morphemes, pos, dashed_word


	def find_glosses_affixes(affixes, glosses, epenthetic='ə',
	                         epenthetic_gloss='???'):
	    """Look up a gloss for every non-empty entry of ``affixes``.

	    ``affixes`` is a prefix or suffix list as built by segment_word():
	    one entry per slot (``''`` for an empty slot), possibly interleaved
	    with epenthetic-vowel entries.  ``glosses`` is a list of dicts, one
	    per slot.

	    Epenthetic entries do not occupy a slot, so they are glossed with
	    ``epenthetic_gloss`` and do not advance the slot counter.  Using the
	    raw list index as the slot number shifted every affix after an
	    epenthetic vowel into the wrong gloss table.

	    Returns the glosses in order: '???' for an affix missing from its
	    slot's dict, '*' when there is no gloss dict for the slot at all.
	    Block indentation, lost in this copy of the code, is restored.
	    """
	    my_glosses = []
	    slot = 0

	    for affix in affixes:
	        if epenthetic and affix == epenthetic:
	            my_glosses.append(epenthetic_gloss)
	            continue

	        if affix:
	            try:
	                gloss = glosses[slot][affix]
	            except KeyError:
	                # this affix is not in the glosses
	                gloss = '???'
	            except IndexError:
	                # no gloss table exists for this slot
	                gloss = '*'
	            my_glosses.append(gloss)

	        # both filled and empty slots consume one slot position
	        slot += 1

	    return my_glosses


	def gloss_word(word_morphemes, pos):
	    """Build the gloss line for one segmented word.

	    ``word_morphemes`` is a ``(prefixes, root, suffixes)`` triple and
	    ``pos`` selects the noun or verb gloss tables.  The result is the
	    list of prefix glosses, then the root element as given, then the
	    suffix glosses.  The root is not translated; note that segment_word()
	    passes it as a one-element list, which is inserted unchanged.

	    Raises ValueError for any ``pos`` other than WordType.noun or
	    WordType.verb (without an else branch this failed with
	    UnboundLocalError).  Block indentation, lost in this copy of the
	    code, is restored.
	    """
	    prefixes, root, suffixes = word_morphemes

	    if pos is WordType.noun:
	        prefix_table, suffix_table = noun_prefix_glosses, noun_suffix_glosses
	    elif pos is WordType.verb:
	        prefix_table, suffix_table = verb_prefix_glosses, verb_suffix_glosses
	    else:
	        raise ValueError(
	            'cannot gloss a word of type {!r}; '
	            'expected WordType.noun or WordType.verb'.format(pos))

	    prefix_glosses = find_glosses_affixes(prefixes, prefix_table)
	    root_gloss = root # replace with something else (like ???) if you want to
	    suffix_glosses = find_glosses_affixes(suffixes, suffix_table)

	    return prefix_glosses + [root_gloss] + suffix_glosses



	def dash_insertion_reduplicate(word, length):
	    """Insert a dash in front of the last ``length`` characters of ``word``.

	    The two halves are plain slices (``word[:-length]`` and
	    ``word[-length:]``), so a ``length`` of 0 or one exceeding the word
	    length puts the dash at the very front.
	    """
	    # body indentation restored; it was lost in this copy of the code
	    head = word[:-length]
	    tail = word[-length:]
	    return "-".join((head, tail))

	def format_epenthetic(word, epenthetic='ə'):
	    """Separate epenthetic vowels that touch a morpheme boundary.

	    ``word`` is a dash-segmented word.  If an epenthetic vowel is attached
	    to a segment right next to a dash (and the word has no already
	    isolated ``-ə-``), that vowel is split off into its own
	    dash-delimited segment.  A vowel at the very start or very end of the
	    whole word is left attached.

	    Returns None for an empty/falsy ``word``, otherwise the (possibly
	    unchanged) word.

	    Fixes: the word-final guard compared the segment index with the
	    character count and therefore never fired, and a segment consisting
	    only of the vowel produced an empty split ("abc-ə" became
	    "abc-ə--ə").  Block indentation, lost in this copy of the code, is
	    restored.
	    """
	    if not word:
	        # this is not a word
	        return None

	    touches_dash = ('-' + epenthetic in word) or (epenthetic + '-' in word)
	    already_split = '-{}-'.format(epenthetic) in word
	    if not touches_dash or already_split:
	        return word

	    segments = word.split('-')
	    last = len(segments) - 1
	    width = len(epenthetic)
	    pieces = []

	    for n, segment in enumerate(segments):
	        starts = segment.startswith(epenthetic)
	        ends = segment.endswith(epenthetic)

	        if segment == epenthetic:
	            # already a segment of its own; nothing to split
	            pieces.append(segment)
	            continue

	        if starts and ends and 0 < n < last:
	            parts = [epenthetic, segment[width:-width], epenthetic]
	        elif starts and n > 0:
	            # leading vowel follows a dash, so it is not word-initial
	            parts = [epenthetic, segment[width:]]
	        elif ends and n < last:
	            # trailing vowel precedes a dash, so it is not word-final
	            parts = [segment[:-width], epenthetic]
	        else:
	            pieces.append(segment)
	            continue

	        # never emit an empty fragment (it would show up as a double dash)
	        pieces.extend(part for part in parts if part)

	    return '-'.join(pieces)


	if __name__ == '__main__':
	    # Read the corpus, segment every word and dump the result to
	    # output.txt.  Block indentation, lost in this copy of the code, is
	    # restored.

	    with open(filepath, encoding='utf-16') as text_file:
	        original_text = text_file.read()

	    # raw string: '\w' in a normal literal is an invalid escape sequence
	    word_expression = r'\w+'
	    words = re.findall(word_expression, original_text)

	    # break_up_word() returns None for words that are neither noun nor verb
	    morphed_words = [break_up_word(word) for word in words]

	    to_gloss = ((word[0], word[1]) for word in morphed_words if word)
	    glossed_words = [gloss_word(morphemes, pos) for morphemes, pos in to_gloss]

	    # explicit encoding: the output contains IPA characters that the
	    # platform default encoding may not be able to represent
	    with open('output.txt', 'w', encoding='utf-8') as output_file:
	        pprint.pprint(morphed_words, stream=output_file)

	        # print('***GLOSSES****', file=output_file)
	        # pprint.pprint(glossed_words, stream=output_file)