Skip to content

Instantly share code, notes, and snippets.

@eddieantonio
Last active December 10, 2018 18:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eddieantonio/59beb8c5def8e332b49f6442297a01aa to your computer and use it in GitHub Desktop.
Save eddieantonio/59beb8c5def8e332b49f6442297a01aa to your computer and use it in GitHub Desktop.
ceci n'est pas une pipe
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
from dataclasses import dataclass
from functools import partial
from typing import TypeVar, Generic, Callable
from unicodedata import normalize
# Type variables for the input (A) and output (B) of a function applied
# through Pipe.
A = TypeVar('A')
B = TypeVar('B')

# Translation tables for str.translate().
# Long vowels written with a macron are rewritten with a circumflex.
MACRON_TO_CIRCUMFLEX = str.maketrans('ēīōā', 'êîôâ')
# A straight or curly apostrophe stands in for an elided short <i>.
QUOTE_TO_SHORT_I = str.maketrans("'’", 'ii')
# Both tables combined, so one translate() pass applies them together.
TRANSLATIONS = dict(MACRON_TO_CIRCUMFLEX)
TRANSLATIONS.update(QUOTE_TO_SHORT_I)

# A run of one or more whitespace characters.
WHITESPACE = re.compile(r'\s+')
# A single <i>, <o> or <a> wrapped in parentheses; group 1 is the vowel.
PARENTHESIZED_ELISION = re.compile(r'[(]([ioa])[)]')
@dataclass
class Pipe(Generic[A]):
    """
    Wraps a value so that functions can be applied to it with ``|``.

    Each ``pipe | fn`` yields a new pipe holding ``fn(value)``; call
    ``done()`` at the end of the chain to get the wrapped value back.
    """

    # The value currently held by this pipe.
    _value: A

    def __or__(self, fn: Callable[[A], B]) -> 'Pipe[B]':
        """Apply ``fn`` to the held value and wrap the result in a new pipe."""
        # Re-wrap with the instance's own class so subclasses stay subclasses.
        wrap = type(self)
        return wrap(fn(self._value))

    def done(self) -> A:
        """Return the held value."""
        return self._value
class Arguments:
    """
    Holds extra call arguments to attach to a function with ``/``.

    ``fn / Arguments(*args, **kwargs)`` produces a one-argument function
    that calls ``fn(x, *args, **kwargs)``.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Store the positional and keyword arguments for later use."""
        self.args, self.kwargs = args, kwargs

    def __rtruediv__(self, fn):
        """Return a one-argument function calling ``fn`` with the stored arguments."""
        def call_with_stored_arguments(x):
            # The stored arguments are read at call time, not captured here.
            return fn(x, *self.args, **self.kwargs)

        return call_with_stored_arguments
def normalize_sro(utterance: str) -> str:
    """
    Normalizes Plains Cree utterances written in the standard Roman orthography.

    Takes the utterance as a string and returns the normalized string.

    The following are the normalizations applied:

    Unicode NFC (an accented vowel is one precomposed character, however
    it was typed):

    >>> decomposed = normalize('NFD', 'kêcikwasakêw')
    >>> normalize_sro(decomposed) == 'kêcikwasakêw'
    True

    Lower-cased:

    >>> normalize_sro('Maskêkosihk')
    'maskêkosihk'

    No extraneous whitespace on either edge of the string:

    >>> normalize_sro(' maskêkosihk ')
    'maskêkosihk'

    Exactly one U+0020 SPACE character between words:

    >>> normalize_sro('nisto  nêwo   kapakihtikta nipiy')
    'nisto nêwo kapakihtikta nipiy'

    All <ê> are long:

    >>> normalize_sro('kecikwasakew')
    'kêcikwasakêw'

    Undo short-i elision (with apostrophe or quotes):

    >>> normalize_sro("tân’si/tân'si")
    'tânisi/tânisi'

    Undo vowel elision using parentheses:

    >>> normalize_sro("mostos(o)wiyâs/nin(i)s(i)tohtên")
    'mostosowiyâs/ninisitohtên'
    """
    return (Pipe(utterance) |
            # Compose first: a decomposed 'ê' (e + combining circumflex)
            # would otherwise gain a second accent from the 'e' -> 'ê'
            # replacement, and a decomposed macron vowel would not match
            # the precomposed keys in TRANSLATIONS.
            partial(normalize, 'NFC') |
            str.strip |
            # Lower-case before the steps below, which only look for
            # lower-case letters.
            str.lower |
            str.replace / Arguments('e', 'ê') |
            str.translate / Arguments(TRANSLATIONS) |
            partial(WHITESPACE.sub, ' ') |
            partial(PARENTHESIZED_ELISION.sub, r'\1')).done()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment