Skip to content

Instantly share code, notes, and snippets.

@gaulinmp gaulinmp/singularize.py
Last active Aug 29, 2015

Embed
What would you like to do?
Singularize function from patterns codebase
#### SINGULARIZE #########################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
import re

# Ordered (pattern, replacement) pairs. The first pattern that matches the
# word wins, so more specific suffixes must come before more general ones.
_singular_rules = [
    (r'(?i)(.)ae$', '\\1a'),
    (r'(?i)(.)itis$', '\\1itis'),
    (r'(?i)(.)eaux$', '\\1eau'),
    (r'(?i)(quiz)zes$', '\\1'),
    (r'(?i)(matr)ices$', '\\1ix'),
    (r'(?i)(ap|vert|ind)ices$', '\\1ex'),
    (r'(?i)^(ox)en', '\\1'),
    (r'(?i)(alias|status)es$', '\\1'),
    # Was the character class "[octop|vir]", which matched any single one of
    # those letters before a final "i" (spaghetti => spaghettus).
    (r'(?i)(octop|vir|cact|foc|loc|radi)i$', '\\1us'),
    # "ax" is anchored so that taxes/faxes reach the (x|ch|ss|sh)es rule.
    (r'(?i)(cris|^ax|test)es$', '\\1is'),
    (r'(?i)(shoe)s$', '\\1'),
    (r'(?i)(o)es$', '\\1'),
    (r'(?i)(bus)es$', '\\1'),
    # Was "[m|l]ice", which also rewrote slice/police (slice => slouse).
    (r'(?i)(^l|m)ice$', '\\1ouse'),
    (r'(?i)(x|ch|ss|sh)es$', '\\1'),
    (r'(?i)(m)ovies$', '\\1ovie'),
    (r'(?i)(.)ombies$', '\\1ombie'),
    (r'(?i)(s)eries$', '\\1eries'),
    (r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
    # -f, -fe sometimes take -ves in the plural
    # (e.g., lives, wolves).
    (r"([aeo]l)ves$", "\\1f"),
    (r"([^d]ea)ves$", "\\1f"),
    (r"arves$", "arf"),
    (r"erves$", "erve"),
    (r"([nlw]i)ves$", "\\1fe"),
    (r'(?i)([lr])ves$', '\\1f'),
    (r"([aeo])ves$", "\\1ve"),
    (r'(?i)(sive)s$', '\\1'),
    (r'(?i)(tive)s$', '\\1'),
    (r'(?i)(hive)s$', '\\1'),
    (r'(?i)([^f])ves$', '\\1fe'),
    # -ses suffixes.
    (r'(?i)(^analy)ses$', '\\1sis'),
    # A single capturing group: the old nested groups with "\1\2sis"
    # duplicated a letter (reanalyses => reanalyasis).
    (r'(?i)(analy|ba|diagno|parenthe|progno|synop|the)ses$', '\\1sis'),
    (r'(?i)(.)opses$', '\\1opsis'),
    (r'(?i)(.)yses$', '\\1ysis'),
    (r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
    # Sugars: match the plural too, so glucoses => glucose, not glucosis.
    (r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)oses?$',
     '\\1ose'),
    (r'(?i)(.)oses$', '\\1osis'),
    # -a
    (r'(?i)([ti])a$', '\\1um'),
    (r'(?i)(n)ews$', '\\1ews'),
    (r'(?i)([^s])s$', '\\1'),  # don't make ss singularize to s.
]
# For performance, compile the regular expressions only once:
_singular_rules = [(re.compile(r[0]), r[1]) for r in _singular_rules]

# Words whose plural equals their singular; returned unchanged.
_singular_uninflected = set((
    "bison", "debris", "headquarters", "pincers", "trout",
    "bream", "diabetes", "herpes", "pliers", "tuna",
    "breeches", "djinn", "high-jinks", "proceedings", "whiting",
    "britches", "eland", "homework", "rabies", "wildebeest",
    "carp", "elk", "innings", "salmon",
    "chassis", "flounder", "jackanapes", "scissors",
    "christmas", "gallows", "mackerel", "series",
    "clippers", "georgia", "measles", "shears",
    "cod", "graffiti", "mews", "species",
    "contretemps", "mumps", "swine",
    "corps", "news", "swiss",
    # Custom added from MD&A corpus
    "api", "mae", "sae", "basis", "india", "media",
))

# Mass nouns; returned unchanged.
_singular_uncountable = set((
    "advice", "equipment", "happiness", "luggage", "news", "software",
    "bread", "fruit", "information", "mathematics", "progress",
    "understanding",
    "butter", "furniture", "ketchup", "mayonnaise", "research", "water",
    "cheese", "garbage", "knowledge", "meat", "rice",
    "electricity", "gravel", "love", "mustard", "sand",
))

# Singulars ending in -ie, whose plural (-ies) must not be turned into -y.
# An entry is matched as a suffix of the word; a leading "^" means the entry
# must be the whole word (so "ties" => "tie" but "cities" => "city").
_singular_ie = set((
    "alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
    "auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
    "beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
    "birdie", "freebie", "junkie", "^pie", "sweetie", "zombie",
    "bogie", "goonie", "laddie", "pixie", "techie",
    "bombie", "groupie", "laramie", "quickie", "^tie",
    "collie", "hankie", "lingerie", "reverie", "toughie",
    "cookie", "hippie", "meanie", "rookie", "valkyrie",
))

# Irregular plural => singular. Keys are matched against the whole word
# first and then as a suffix (women => woman), except for the keys listed in
# _singular_irregular_whole_word.
_singular_irregular = {
    "abuses": "abuse",
    "ads": "ad",
    "atlantes": "atlas",
    "atlases": "atlas",
    "analysis": "analysis",
    "axes": "axe",
    "beeves": "beef",
    "brethren": "brother",
    "children": "child",
    "corpora": "corpus",
    "corpuses": "corpus",
    "ephemerides": "ephemeris",
    "feet": "foot",
    "ganglia": "ganglion",
    "geese": "goose",
    "genera": "genus",
    "genii": "genie",
    "graffiti": "graffito",
    "helves": "helve",
    "kine": "cow",
    "leaves": "leaf",
    "loaves": "loaf",
    "men": "man",
    "mongooses": "mongoose",
    "monies": "money",
    "moves": "move",
    "mythoi": "mythos",
    "numena": "numen",
    "occipita": "occiput",
    "octopodes": "octopus",
    "opera": "opus",
    "opuses": "opus",
    "our": "my",
    "oxen": "ox",
    "penes": "penis",
    "penises": "penis",
    "people": "person",
    "sexes": "sex",
    "soliloquies": "soliloquy",
    "teeth": "tooth",
    "testes": "testis",
    "trilbys": "trilby",
    "turves": "turf",
    "zoa": "zoon",
}

# Irregular keys that must equal the whole word. As suffixes they corrupt
# ordinary words: hour => hmy, taxes => taxe, shelves => shelve.
_singular_irregular_whole_word = set(("our", "axes", "helves"))

# Prepositions that may follow the head noun of a hyphenated compound.
_plural_prepositions = set((
    "about", "before", "during", "of", "till",
    "above", "behind", "except", "off", "to",
    "across", "below", "for", "on", "under",
    "after", "beneath", "from", "onto", "until",
    "among", "beside", "in", "out", "unto",
    "around", "besides", "into", "over", "upon",
    "at", "between", "near", "since", "with",
    "athwart", "betwixt", "beyond", "but", "by",
))


def singularize(word, custom=None):
    """Returns the singular of a given word.

    Lookup order: the custom mapping, hyphenated compounds, a trailing
    possessive apostrophe, uninflected/uncountable words, the -ie list,
    irregular plurals, and finally the ordered suffix rules.

    word   -- the (plural) word to singularize.
    custom -- optional mapping of word => singular that overrides all other
              processing; it is looked up with the word exactly as given.

    Returns the singular form, or the word unchanged if nothing applies.
    Replacement text taken from the irregular table is lowercase.

    Examples:
        singularize("quarters")        => "quarter"
        singularize("cookies")         => "cookie"
        singularize("mothers-in-law")  => "mother-in-law"
        singularize("dogs'")           => "dog's"
    """
    if custom is not None and word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law): only the head noun
    # before the preposition is singularized.
    if "-" in word:
        parts = word.split("-")
        if parts[1] in _plural_prepositions:
            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1], custom) + "'s"
    w = word.lower()
    # Exact membership: a suffix test in either direction misfires
    # (quarters/headquarters one way, suppliers/pliers the other).
    if w in _singular_uninflected or w in _singular_uncountable:
        return word
    # cookies => cookie (not cooky): drop only the final "s".
    for stem in _singular_ie:
        if stem.startswith("^"):
            matched = (w == stem[1:] + "s")
        else:
            matched = w.endswith(stem + "s")
        if matched:
            return word[:-1]
    # Whole-word irregulars first, so the result does not depend on the
    # iteration order of the table.
    if w in _singular_irregular:
        return _singular_irregular[w]
    for plural, singular in _singular_irregular.items():
        if plural in _singular_irregular_whole_word:
            continue
        if w.endswith(plural):
            return re.sub('(?i)' + re.escape(plural) + '$', singular, word)
    # Every capturing group in _singular_rules takes part in any match, so
    # the replacement templates can be applied as they are.
    for pattern, replacement in _singular_rules:
        if pattern.search(word):
            return pattern.sub(replacement, word)
    return word
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.