# Singularize function from the patterns codebase.
#### SINGULARIZE #########################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
_singular_rules = [ | |
(r'(?i)(.)ae$', '\\1a'), | |
(r'(?i)(.)itis$', '\\1itis'), | |
(r'(?i)(.)eaux$', '\\1eau'), | |
(r'(?i)(quiz)zes$', '\\1'), | |
(r'(?i)(matr)ices$', '\\1ix'), | |
(r'(?i)(ap|vert|ind)ices$', '\\1ex'), | |
(r'(?i)^(ox)en', '\\1'), | |
(r'(?i)(alias|status)es$', '\\1'), | |
(r'(?i)([octop|vir])i$', '\\1us'), | |
(r'(?i)(cris|ax|test)es$', '\\1is'), | |
(r'(?i)(shoe)s$', '\\1'), | |
(r'(?i)(o)es$', '\\1'), | |
(r'(?i)(bus)es$', '\\1'), | |
(r'(?i)([m|l])ice$', '\\1ouse'), | |
(r'(?i)(x|ch|ss|sh)es$', '\\1'), | |
(r'(?i)(m)ovies$', '\\1ovie'), | |
(r'(?i)(.)ombies$', '\\1ombie'), | |
(r'(?i)(s)eries$', '\\1eries'), | |
(r'(?i)([^aeiouy]|qu)ies$', '\\1y'), | |
# -f, -fe sometimes take -ves in the plural | |
# (e.g., lives, wolves). | |
(r"([aeo]l)ves$", "\\1f"), | |
(r"([^d]ea)ves$", "\\1f"), | |
(r"arves$", "arf"), | |
(r"erves$", "erve"), | |
(r"([nlw]i)ves$", "\\1fe"), | |
(r'(?i)([lr])ves$', '\\1f'), | |
(r"([aeo])ves$", "\\1ve"), | |
(r'(?i)(sive)s$', '\\1'), | |
(r'(?i)(tive)s$', '\\1'), | |
(r'(?i)(hive)s$', '\\1'), | |
(r'(?i)([^f])ves$', '\\1fe'), | |
# -ses suffixes. | |
(r'(?i)(^analy)ses$', '\\1sis'), | |
(r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', | |
'\\1\\2sis'), | |
(r'(?i)(.)opses$', '\\1opsis'), | |
(r'(?i)(.)yses$', '\\1ysis'), | |
(r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'), | |
(r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', | |
'\\1ose'), | |
(r'(?i)(.)oses$', '\\1osis'), | |
# -a | |
(r'(?i)([ti])a$', '\\1um'), | |
(r'(?i)(n)ews$', '\\1ews'), | |
(r'(?i)([^s])s$', '\\1'), # don't make ss singularize to s. | |
] | |
# For performance, compile the regular expressions only once: | |
_singular_rules = [(re.compile(r[0]), r[1]) for r in _singular_rules] | |
_singular_uninflected = set(( | |
"bison", "debris", "headquarters", "pincers", "trout", | |
"bream", "diabetes", "herpes", "pliers", "tuna", | |
"breeches", "djinn", "high-jinks", "proceedings", "whiting", | |
"britches", "eland", "homework", "rabies", "wildebeest" | |
"carp", "elk", "innings", "salmon", | |
"chassis", "flounder", "jackanapes", "scissors", | |
"christmas", "gallows", "mackerel", "series", | |
"clippers", "georgia", "measles", "shears", | |
"cod", "graffiti", "mews", "species", | |
"contretemps", "mumps", "swine", | |
"corps", "news", "swiss", | |
# Custom added from MD&A corpus | |
"api", "mae", "sae", "basis", "india", "media", | |
)) | |
_singular_uncountable = set(( | |
"advice", "equipment", "happiness", "luggage", "news", "software", | |
"bread", "fruit", "information", "mathematics", "progress", "understanding", | |
"butter", "furniture", "ketchup", "mayonnaise", "research", "water" | |
"cheese", "garbage", "knowledge", "meat", "rice", | |
"electricity", "gravel", "love", "mustard", "sand", | |
)) | |
_singular_ie = set(( | |
"alergie", "cutie", "hoagie", "newbie", "softie", "veggie", | |
"auntie", "doggie", "hottie", "nightie", "sortie", "weenie", | |
"beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie", | |
"birdie", "freebie", "junkie", "^pie", "sweetie", "zombie" | |
"bogie", "goonie", "laddie", "pixie", "techie", | |
"bombie", "groupie", "laramie", "quickie", "^tie", | |
"collie", "hankie", "lingerie", "reverie", "toughie", | |
"cookie", "hippie", "meanie", "rookie", "valkyrie", | |
)) | |
_singular_irregular = { | |
"abuses": "abuse", | |
"ads": "ad", | |
"atlantes": "atlas", | |
"atlases": "atlas", | |
"analysis": "analysis", | |
"axes": "axe", | |
"beeves": "beef", | |
"brethren": "brother", | |
"children": "child", | |
"children": "child", | |
"corpora": "corpus", | |
"corpuses": "corpus", | |
"ephemerides": "ephemeris", | |
"feet": "foot", | |
"ganglia": "ganglion", | |
"geese": "goose", | |
"genera": "genus", | |
"genii": "genie", | |
"graffiti": "graffito", | |
"helves": "helve", | |
"kine": "cow", | |
"leaves": "leaf", | |
"loaves": "loaf", | |
"men": "man", | |
"mongooses": "mongoose", | |
"monies": "money", | |
"moves": "move", | |
"mythoi": "mythos", | |
"numena": "numen", | |
"occipita": "occiput", | |
"octopodes": "octopus", | |
"opera": "opus", | |
"opuses": "opus", | |
"our": "my", | |
"oxen": "ox", | |
"penes": "penis", | |
"penises": "penis", | |
"people": "person", | |
"sexes": "sex", | |
"soliloquies": "soliloquy", | |
"teeth": "tooth", | |
"testes": "testis", | |
"trilbys": "trilby", | |
"turves": "turf", | |
"zoa": "zoon", | |
} | |
_plural_prepositions = set(( | |
"about", "before", "during", "of", "till", | |
"above", "behind", "except", "off", "to", | |
"across", "below", "for", "on", "under", | |
"after", "beneath", "from", "onto", "until", | |
"among", "beside", "in", "out", "unto", | |
"around", "besides", "into", "over", "upon", | |
"at", "between", "near", "since", "with", | |
"athwart", "betwixt", "beyond", "but", "by" | |
)) | |
def singularize(word, custom=None):
    """Return the singular form of an English noun.

    The lookup runs from the most specific source to the most generic one:

    1. ``custom`` overrides, matched on the exact word.
    2. Hyphenated compounds whose second part is a preposition
       ("mothers-in-law"): only the first part is singularized.
    3. Plural possessives ("dogs'" -> "dog's").
    4. Uninflected and uncountable nouns (case-insensitive, whole word),
       which are returned unchanged.
    5. Nouns whose singular ends in "-ie" ("cookies" -> "cookie").
    6. Irregular plurals, matched as a suffix of the word.
    7. The ordered regular-expression rules in ``_singular_rules``.

    Args:
        word: The (plural) noun to singularize.
        custom: Optional mapping of word -> singular that takes precedence
            over every built-in rule.

    Returns:
        The singular form, or ``word`` unchanged when no rule applies.
    """
    if custom is not None and word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law).
    if "-" in word:
        parts = word.split("-")
        if parts[1] in _plural_prepositions:
            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1], custom) + "'s"
    lowered = word.lower()
    # Whole-word lookup. The previous test, x.endswith(lowered), was inverted:
    # it treated every *suffix* of a listed word as uninflected, so e.g.
    # "quarters" (a suffix of "headquarters") was never singularized.
    if lowered in _singular_uninflected or lowered in _singular_uncountable:
        return word
    for entry in _singular_ie:
        if entry.startswith("^"):
            # "^"-anchored entries only match the whole word, so that "ties"
            # is recognised but "cities" is not.
            matched = lowered == entry[1:] + "s"
        else:
            matched = lowered.endswith(entry + "s")
        if matched:
            # Drop only the plural "s"; this keeps the caller's casing and
            # any prefix in front of the listed "-ie" word.
            return word[:-1]
    for plural, singular in _singular_irregular.items():
        if lowered.endswith(plural):
            return re.sub("(?i)" + re.escape(plural) + "$", singular, word)
    for pattern, template in _singular_rules:
        match = pattern.search(word)
        if match is None:
            continue
        # Strip back-references to groups that did not take part in the
        # match, so they cannot leak into the result.
        for index, group in enumerate(match.groups(), start=1):
            if group is None:
                template = template.replace("\\" + str(index), "")
        return pattern.sub(template, word)
    return word
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment