# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @eddieantonio eddieantonio/
# Last active Dec 10, 2018
#
# What would you like to do?
# ceci n'est pas une pipe
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
from dataclasses import dataclass
from functools import partial
from typing import TypeVar, Generic, Callable
from unicodedata import normalize
# Type variables: the value type held by a pipe before (A) and after (B)
# a function is applied to it.
A = TypeVar('A')
B = TypeVar('B')

# Translation table (code point -> code point) turning macron vowels into
# their circumflex counterparts.
MACRON_TO_CIRCUMFLEX = {
    ord(macron): ord(circumflex)
    for macron, circumflex in zip('ēīōā', 'êîôâ')
}

# Translation table turning an ASCII apostrophe or a right single quotation
# mark into the letter <i>.
QUOTE_TO_SHORT_I = {
    ord(quote): ord(short_i)
    for quote, short_i in zip("'’", 'ii')
}

# One or more consecutive whitespace characters.
WHITESPACE = re.compile(r'\s+')

# A single <i>, <o> or <a> wrapped in parentheses; group 1 is the vowel.
PARENTHESIZED_ELISION = re.compile(r'[(]([ioa])[)]')
@dataclass
class Pipe(Generic[A]):
    """
    Wraps a value so that one-argument functions can be chained with ``|``.

    Each ``pipe | fn`` applies ``fn`` to the wrapped value and wraps the
    result in a new pipe; ``done()`` unwraps the final value.

    >>> (Pipe(' Hello ') | str.strip | str.lower).done()
    'hello'
    """

    # The wrapped value.  @dataclass generates the one-argument constructor
    # that both callers and __or__ rely on.
    _value: A

    def done(self) -> A:
        """Return the value currently held by the pipe."""
        return self._value

    def __or__(self, fn: Callable[[A], B]) -> 'Pipe[B]':
        """Apply ``fn`` to the held value and return the result in a new pipe."""
        # type(self) rather than Pipe, so subclasses keep their own type.
        return type(self)(fn(self._value))
class Arguments:
    """
    Captures extra call arguments to be supplied to a function later.

    ``fn / Arguments(*args, **kwargs)`` evaluates to a one-argument callable
    that invokes ``fn(x, *args, **kwargs)``.

    >>> swap = str.replace / Arguments('a', 'b')
    >>> swap('banana')
    'bbnbnb'
    """

    def __init__(self, *args, **kwargs) -> None:
        """Store the positional and keyword arguments for later use."""
        self.args = args
        self.kwargs = kwargs

    def __rtruediv__(self, fn):
        """
        Implement ``fn / self``: bind the stored arguments after the first.

        Returns a callable taking the first argument ``x``, or
        ``NotImplemented`` when ``fn`` is not callable, so that Python raises
        ``TypeError`` at the ``/`` itself.
        """
        # Reject non-callables up front instead of handing back a lambda
        # that would only fail once it is eventually invoked.
        if not callable(fn):
            return NotImplemented
        return lambda x: fn(x, *self.args, **self.kwargs)
def normalize_sro(utterance: str) -> str:
    """
    Normalizes Plains Cree utterances written in the standard Roman
    orthography.

    The input is first converted to Unicode NFC so that vowels written with
    combining diacritics are handled like their precomposed forms.

    The following are the normalizations applied:

    Everything is lowercase:

    >>> normalize_sro('Maskêkosihk')
    'maskêkosihk'

    No extraneous whitespace on either edge of the string:

    >>> normalize_sro(' maskêkosihk ')
    'maskêkosihk'

    Exactly one U+0020 SPACE character between words:

    >>> normalize_sro('nisto  nêwo   kapakihtikta nipiy')
    'nisto nêwo kapakihtikta nipiy'

    All <ê> are long:

    >>> normalize_sro('kecikwasakew')
    'kêcikwasakêw'

    Long vowels are written with a circumflex rather than a macron:

    >>> normalize_sro('mīna')
    'mîna'

    Undo short-i elision (with apostrophe or quotes):

    >>> normalize_sro("tân’si/tân'si")
    'tânisi/tânisi'

    Undo vowel elision using parentheses:

    >>> normalize_sro("mostos(o)wiyâs/nin(i)s(i)tohtên")
    'mostosowiyâs/ninisitohtên'
    """
    # `/` binds tighter than `|`, so each `fn / Arguments(...)` becomes a
    # one-argument callable before it is piped.
    return (Pipe(utterance) |
            # Compose combining sequences first; otherwise e.g. <e> followed
            # by U+0304 would not match the single-code-point tables below.
            partial(normalize, 'NFC') |
            str.strip |
            str.lower |
            str.replace / Arguments('e', 'ê') |
            str.translate / Arguments(MACRON_TO_CIRCUMFLEX) |
            str.translate / Arguments(QUOTE_TO_SHORT_I) |
            partial(WHITESPACE.sub, ' ') |
            partial(PARENTHESIZED_ELISION.sub, r'\1')).done()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
# You can’t perform that action at this time.