Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Singularize function from patterns codebase
#### SINGULARIZE #########################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
_singular_rules = [
(r'(?i)(.)ae$', '\\1a'),
(r'(?i)(.)itis$', '\\1itis'),
(r'(?i)(.)eaux$', '\\1eau'),
(r'(?i)(quiz)zes$', '\\1'),
(r'(?i)(matr)ices$', '\\1ix'),
(r'(?i)(ap|vert|ind)ices$', '\\1ex'),
(r'(?i)^(ox)en', '\\1'),
(r'(?i)(alias|status)es$', '\\1'),
(r'(?i)([octop|vir])i$', '\\1us'),
(r'(?i)(cris|ax|test)es$', '\\1is'),
(r'(?i)(shoe)s$', '\\1'),
(r'(?i)(o)es$', '\\1'),
(r'(?i)(bus)es$', '\\1'),
(r'(?i)([m|l])ice$', '\\1ouse'),
(r'(?i)(x|ch|ss|sh)es$', '\\1'),
(r'(?i)(m)ovies$', '\\1ovie'),
(r'(?i)(.)ombies$', '\\1ombie'),
(r'(?i)(s)eries$', '\\1eries'),
(r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
# -f, -fe sometimes take -ves in the plural
# (e.g., lives, wolves).
(r"([aeo]l)ves$", "\\1f"),
(r"([^d]ea)ves$", "\\1f"),
(r"arves$", "arf"),
(r"erves$", "erve"),
(r"([nlw]i)ves$", "\\1fe"),
(r'(?i)([lr])ves$', '\\1f'),
(r"([aeo])ves$", "\\1ve"),
(r'(?i)(sive)s$', '\\1'),
(r'(?i)(tive)s$', '\\1'),
(r'(?i)(hive)s$', '\\1'),
(r'(?i)([^f])ves$', '\\1fe'),
# -ses suffixes.
(r'(?i)(^analy)ses$', '\\1sis'),
(r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$',
'\\1\\2sis'),
(r'(?i)(.)opses$', '\\1opsis'),
(r'(?i)(.)yses$', '\\1ysis'),
(r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
(r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$',
'\\1ose'),
(r'(?i)(.)oses$', '\\1osis'),
# -a
(r'(?i)([ti])a$', '\\1um'),
(r'(?i)(n)ews$', '\\1ews'),
(r'(?i)([^s])s$', '\\1'), # don't make ss singularize to s.
]
# For performance, compile the regular expressions only once:
_singular_rules = [(re.compile(r[0]), r[1]) for r in _singular_rules]
_singular_uninflected = set((
"bison", "debris", "headquarters", "pincers", "trout",
"bream", "diabetes", "herpes", "pliers", "tuna",
"breeches", "djinn", "high-jinks", "proceedings", "whiting",
"britches", "eland", "homework", "rabies", "wildebeest"
"carp", "elk", "innings", "salmon",
"chassis", "flounder", "jackanapes", "scissors",
"christmas", "gallows", "mackerel", "series",
"clippers", "georgia", "measles", "shears",
"cod", "graffiti", "mews", "species",
"contretemps", "mumps", "swine",
"corps", "news", "swiss",
# Custom added from MD&A corpus
"api", "mae", "sae", "basis", "india", "media",
))
_singular_uncountable = set((
"advice", "equipment", "happiness", "luggage", "news", "software",
"bread", "fruit", "information", "mathematics", "progress", "understanding",
"butter", "furniture", "ketchup", "mayonnaise", "research", "water"
"cheese", "garbage", "knowledge", "meat", "rice",
"electricity", "gravel", "love", "mustard", "sand",
))
_singular_ie = set((
"alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
"auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
"beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
"birdie", "freebie", "junkie", "^pie", "sweetie", "zombie"
"bogie", "goonie", "laddie", "pixie", "techie",
"bombie", "groupie", "laramie", "quickie", "^tie",
"collie", "hankie", "lingerie", "reverie", "toughie",
"cookie", "hippie", "meanie", "rookie", "valkyrie",
))
_singular_irregular = {
"abuses": "abuse",
"ads": "ad",
"atlantes": "atlas",
"atlases": "atlas",
"analysis": "analysis",
"axes": "axe",
"beeves": "beef",
"brethren": "brother",
"children": "child",
"children": "child",
"corpora": "corpus",
"corpuses": "corpus",
"ephemerides": "ephemeris",
"feet": "foot",
"ganglia": "ganglion",
"geese": "goose",
"genera": "genus",
"genii": "genie",
"graffiti": "graffito",
"helves": "helve",
"kine": "cow",
"leaves": "leaf",
"loaves": "loaf",
"men": "man",
"mongooses": "mongoose",
"monies": "money",
"moves": "move",
"mythoi": "mythos",
"numena": "numen",
"occipita": "occiput",
"octopodes": "octopus",
"opera": "opus",
"opuses": "opus",
"our": "my",
"oxen": "ox",
"penes": "penis",
"penises": "penis",
"people": "person",
"sexes": "sex",
"soliloquies": "soliloquy",
"teeth": "tooth",
"testes": "testis",
"trilbys": "trilby",
"turves": "turf",
"zoa": "zoon",
}
_plural_prepositions = set((
"about", "before", "during", "of", "till",
"above", "behind", "except", "off", "to",
"across", "below", "for", "on", "under",
"after", "beneath", "from", "onto", "until",
"among", "beside", "in", "out", "unto",
"around", "besides", "into", "over", "upon",
"at", "between", "near", "since", "with",
"athwart", "betwixt", "beyond", "but", "by"
))
def singularize(word, custom=None):
    """Return the singular of a given word.

    The checks run in this order and the first one that applies wins:

    1. ``custom``: exact-match overrides supplied by the caller.
    2. Hyphenated compounds whose second part is a preposition
       (mothers-in-law): only the first part is singularized.
    3. Words ending in an apostrophe (dogs' => dog's).
    4. Words found at the end of an entry of ``_singular_uninflected`` or
       ``_singular_uncountable`` are returned unchanged.
    5. Plurals of the nouns in ``_singular_ie`` lose their final "s".
    6. Irregular plurals from ``_singular_irregular``, matched on the end
       of the word.
    7. The first matching regular expression in ``_singular_rules``.

    Args:
        word: The (presumably plural) word to singularize.
        custom: Optional mapping of word -> singular that takes precedence
            over every built-in rule.

    Returns:
        The singular form, or ``word`` itself when nothing applies.
    """
    if custom is not None and word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law).
    if "-" in word:
        parts = word.split("-")
        if len(parts) > 1 and parts[1] in _plural_prepositions:
            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1], custom) + "'s"
    lower = word.lower()
    # The test is "entry ends with word" (not the other way round), so a word
    # is left alone when it equals an entry or is the tail of one.
    if any(entry.endswith(lower) for entry in _singular_uninflected):
        return word
    if any(entry.endswith(lower) for entry in _singular_uncountable):
        return word
    for entry in _singular_ie:
        if entry.startswith("^"):
            # A leading caret anchors the entry to the whole word, so "^tie"
            # covers "ties" but not "parties".
            is_plural = lower == entry[1:] + "s"
        else:
            is_plural = lower.endswith(entry + "s")
        if is_plural:
            # Drop only the plural "s"; prefix and letter case are preserved.
            return word[:-1]
    for plural, singular in _singular_irregular.items():
        if lower.endswith(plural):
            return re.sub('(?i)' + plural + '$', singular, word)
    for pattern, replacement in _singular_rules:
        match = pattern.search(word)
        if match is None:
            continue
        # Strip back-references to groups that did not take part in the
        # match, so the substitution never refers to an unmatched group.
        for index, group in enumerate(match.groups(), 1):
            if group is None:
                replacement = replacement.replace('\\' + str(index), '')
        return pattern.sub(replacement, word)
    return word
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment