eddieantonio/pipe.py

## pipe.py
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import re
from dataclasses import dataclass
from functools import partial
from typing import TypeVar, Generic, Callable
from unicodedata import normalize

# Generic type variables: the type of a value going into a pipeline stage and
# the type of the value coming out of it.
A = TypeVar('A')
B = TypeVar('B')

# Code point tables for str.translate(): macron-marked vowels map to their
# circumflex counterparts; the apostrophe and U+2019 both map to <i>.
MACRON_TO_CIRCUMFLEX = str.maketrans('ēīōā', 'êîôâ')
QUOTE_TO_SHORT_I = str.maketrans("'’", 'ii')

# Both tables merged, so a single translate() pass applies every substitution.
TRANSLATIONS = dict(MACRON_TO_CIRCUMFLEX)
TRANSLATIONS.update(QUOTE_TO_SHORT_I)

# A run of one or more whitespace characters.
WHITESPACE = re.compile(r'\s+')
# One parenthesized <i>, <o>, or <a>, e.g. "(o)"; group 1 captures the vowel.
PARENTHESIZED_ELISION = re.compile(r'[(]([ioa])[)]')


@dataclass
class Pipe(Generic[A]):
    """
    Wraps a value so that functions can be chained onto it with ``|``.

    ``Pipe(x) | f | g`` applies ``f`` and then ``g``; call ``done()`` at the
    end of the chain to get the plain result back.
    """

    # The value currently held by this pipe.
    _value: A

    def __or__(self, fn: Callable[[A], B]) -> 'Pipe[B]':
        """Apply ``fn`` to the held value and wrap the result in a new pipe."""
        pipe_type = type(self)
        transformed = fn(self._value)
        return pipe_type(transformed)

    def done(self) -> A:
        """Return the held value, unwrapped."""
        return self._value


class Arguments:
    """
    Extra call arguments that are attached to a function with ``/``.

    ``fn / Arguments(*args, **kwargs)`` evaluates to a one-argument callable
    that invokes ``fn(x, *args, **kwargs)``.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Kept exactly as received: a tuple of positionals, a dict of keywords.
        self.kwargs = kwargs
        self.args = args

    def __rtruediv__(self, fn):
        """
        Handle ``fn / self``.

        Returns a callable taking one argument ``x``; the stored arguments
        are passed to ``fn`` after ``x``.
        """
        def call_with_stored_arguments(x):
            return fn(x, *self.args, **self.kwargs)

        return call_with_stored_arguments


def normalize_sro(utterance: str) -> str:
    """
    Normalizes Plains Cree utterances written in the standard Roman orthography.

    Takes the utterance as a string and returns the normalized string.

    The following are the normalizations applied:

    Lower-cased:

    >>> normalize_sro('Maskêkosihk')
    'maskêkosihk'

    No extraneous whitespace on either edge of the string:

    >>> normalize_sro('  maskêkosihk ')
    'maskêkosihk'

    Exactly one U+0020 SPACE character between words:

    >>> normalize_sro('nisto  nêwo  kapakihtikta    nipiy')
    'nisto nêwo kapakihtikta nipiy'

    All <ê> are long:

    >>> normalize_sro('kecikwasakew')
    'kêcikwasakêw'

    Undo short-i elision (with apostrophe or quotes):

    >>> normalize_sro("tân’si/tân'si")
    'tânisi/tânisi'

    Undo vowel elision using parentheses:

    >>> normalize_sro("mostos(o)wiyâs/nin(i)s(i)tohtên")
    'mostosowiyâs/ninisitohtên'

    Canonically composed (NFC), so a vowel followed by a combining
    circumflex or macron is treated like the precomposed letter:

    >>> normalize_sro('maske' + chr(0x0302) + 'kosihk')
    'maskêkosihk'
    >>> normalize_sro('ta' + chr(0x0304) + 'nisi')
    'tânisi'
    """

    return (Pipe(utterance) |
            # Compose first: the single-character substitutions below must see
            # whole letters, not a base letter plus a combining mark (otherwise
            # 'e' + U+0302 would become 'ê' + U+0302).
            partial(normalize, 'NFC') |
            str.strip |
            str.lower |
            str.replace / Arguments('e', 'ê') |
            # Macrons -> circumflexes, apostrophes -> <i>, in a single pass.
            str.translate / Arguments(TRANSLATIONS) |
            partial(WHITESPACE.sub, ' ') |
            # "(o)" -> "o": keep the vowel, drop the parentheses.
            partial(PARENTHESIZED_ELISION.sub, r'\1')).done()
	#!/usr/bin/env python3
	# -*- coding: UTF-8 -*-

	import re
	from dataclasses import dataclass
	from functools import partial
	from typing import TypeVar, Generic, Callable
	from unicodedata import normalize

	# Generic type variables: the type of a value going into a pipeline stage
	# and the type of the value coming out of it.
	A = TypeVar('A')
	B = TypeVar('B')

	# Code point tables for str.translate(): macron-marked vowels map to their
	# circumflex counterparts; the apostrophe and U+2019 both map to <i>.
	MACRON_TO_CIRCUMFLEX = str.maketrans('ēīōā', 'êîôâ')
	QUOTE_TO_SHORT_I = str.maketrans("'’", 'ii')
	# Merge the two tables into one dict. (A bare `{a, b}` here would be a set
	# literal of dicts, which raises TypeError because dicts are unhashable.)
	TRANSLATIONS = {**MACRON_TO_CIRCUMFLEX, **QUOTE_TO_SHORT_I}

	# A run of one or more whitespace characters.
	WHITESPACE = re.compile(r'\s+')
	# One parenthesized <i>, <o>, or <a>, e.g. "(o)"; group 1 captures the vowel.
	PARENTHESIZED_ELISION = re.compile(r'[(]([ioa])[)]')


	@dataclass
	class Pipe(Generic[A]):
	    """
	    Wraps a value so that functions can be chained onto it with ``|``.

	    ``Pipe(x) | f | g`` applies ``f`` and then ``g``; call ``done()`` at
	    the end of the chain to get the plain result back.
	    """

	    # The value currently held by this pipe.
	    _value: A

	    def done(self) -> A:
	        """Return the held value, unwrapped."""
	        return self._value

	    def __or__(self, fn: Callable[[A], B]) -> 'Pipe[B]':
	        """Apply ``fn`` to the held value and wrap the result in a new pipe."""
	        return type(self)(fn(self._value))


	class Arguments:
	    """
	    Extra call arguments that are attached to a function with ``/``.

	    ``fn / Arguments(*args, **kwargs)`` evaluates to a one-argument
	    callable that invokes ``fn(x, *args, **kwargs)``.
	    """

	    def __init__(self, *args, **kwargs) -> None:
	        # Variadic on purpose: any number of positionals and keywords is
	        # captured here and replayed later.
	        self.args = args
	        self.kwargs = kwargs

	    def __rtruediv__(self, fn):
	        """
	        Handle ``fn / self``.

	        Returns a callable taking one argument ``x``; the stored arguments
	        are unpacked into the call to ``fn`` after ``x``.
	        """
	        return lambda x: fn(x, *self.args, **self.kwargs)


	def normalize_sro(utterance: str) -> str:
	    """
	    Normalizes Plains Cree utterances written in the standard Roman orthography.

	    Takes the utterance as a string and returns the normalized string.

	    The following are the normalizations applied:

	    Lower-cased:

	    >>> normalize_sro('Maskêkosihk')
	    'maskêkosihk'

	    No extraneous whitespace on either edge of the string:

	    >>> normalize_sro('  maskêkosihk ')
	    'maskêkosihk'

	    Exactly one U+0020 SPACE character between words:

	    >>> normalize_sro('nisto  nêwo  kapakihtikta    nipiy')
	    'nisto nêwo kapakihtikta nipiy'

	    All <ê> are long:

	    >>> normalize_sro('kecikwasakew')
	    'kêcikwasakêw'

	    Undo short-i elision (with apostrophe or quotes):

	    >>> normalize_sro("tân’si/tân'si")
	    'tânisi/tânisi'

	    Undo vowel elision using parentheses:

	    >>> normalize_sro("mostos(o)wiyâs/nin(i)s(i)tohtên")
	    'mostosowiyâs/ninisitohtên'

	    Canonically composed (NFC), so a vowel followed by a combining
	    circumflex or macron is treated like the precomposed letter:

	    >>> normalize_sro('maske' + chr(0x0302) + 'kosihk')
	    'maskêkosihk'
	    >>> normalize_sro('ta' + chr(0x0304) + 'nisi')
	    'tânisi'
	    """

	    return (Pipe(utterance) |
	            # Compose first: the single-character substitutions below must
	            # see whole letters, not a base letter plus a combining mark.
	            partial(normalize, 'NFC') |
	            str.strip |
	            str.lower |
	            str.replace / Arguments('e', 'ê') |
	            # Macrons -> circumflexes, apostrophes -> <i>, in a single pass.
	            str.translate / Arguments(TRANSLATIONS) |
	            partial(WHITESPACE.sub, ' ') |
	            # "(o)" -> "o": keep the vowel, drop the parentheses.
	            partial(PARENTHESIZED_ELISION.sub, r'\1')).done()