Last active
December 18, 2015 16:09
-
-
Save seven1m/5809211 to your computer and use it in GitHub Desktop.
A Python function and a String-based class for capitalizing a proper name. There are a handful of tests; save the snippet to a .py file and run it from a terminal.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: iso-8859-1 -*- | |
# The code here is based loosely on John Cardinal's notes found at: | |
# http://www.johncardinal.com/tmgutil/capitalizenames.htm | |
# 2006-03-16 | |
# Thanks to David Kern <kernd@reasonspace.com> for fixing some bugs. | |
# Generational suffixes whose exact casing is forced onto a name; each one is
# listed both bare and wrapped in parentheses.
suffixes = [
    "II", "(II)",
    "III", "(III)",
    "IV", "(IV)",
    "VI", "(VI)",
    "VII", "(VII)",
    "2nd", "(2nd)",
    "3rd", "(3rd)",
    "4th", "(4th)",
    "5th", "(5th)",
]
# The names listed here are included by permission from John Cardinal's TMG Utility. | |
# http://www.johncardinal.com/tmgutil/index.htm | |
# John Cardinal maintains the copyright for this list of names. | |
surnames = "ApShaw|d'Albini|d'Aubigney|d'Aubigné|d'Autry|d'Entremont|d'Hurst|D'ovidio|da Graça|DaSilva|DeAnda|deAnnethe|deAubigne|deAubigny|DeBardelaben|DeBardeleben|DeBaugh|deBeauford|DeBerry|deBethune|DeBetuile|DeBoard|DeBoer|DeBohun|DeBord|DeBose|DeBrouwer|DeBroux|DeBruhl|deBruijn|deBrus|deBruse|deBrusse|DeBruyne|DeBusk|DeCamp|deCastilla|DeCello|deClare|DeClark|DeClerck|DeCoste|deCote|DeCoudres|DeCoursey|DeCredico|deCuire|DeCuyre|DeDominicios|DeDuyster|DeDuytscher|DeDuytser|deFiennes|DeFord|DeForest|DeFrance|DeFriece|DeGarmo|deGraaff|DeGraff|DeGraffenreid|DeGraw|DeGrenier|DeGroats|DeGroft|DeGrote|DeHaan|DeHaas|DeHaddeclive|deHannethe|DeHatclyf|DeHaven|DeHeer|DeJager|DeJarnette|DeJean|DeJong|deJonge|deKemmeter|deKirketon|DeKroon|deKype|del-Rosario|dela Chamotte|DeLa Cuadra|DeLa Force|dela Fountaine|dela Greña|dela Place|DeLa Ward|DeLaci|DeLacy|DeLaet|DeLalonde|DelAmarre|DeLancey|DeLascy|DelAshmutt|DeLassy|DeLattre|DeLaughter|DeLay|deLessine|DelGado|DelGaudio|DeLiberti|DeLoache|DeLoatch|DeLoch|DeLockwood|DeLong|DeLozier|DeLuca|DeLucenay|deLucy|DeMars|DeMartino|deMaule|DeMello|DeMinck|DeMink|DeMoree|DeMoss|DeMott|DeMuynck|deNiet|DeNise|DeNure|DePalma|DePasquale|dePender|dePercy|DePoe|DePriest|DePu|DePui|DePuis|DeReeper|deRochette|deRose|DeRossett|DeRover|deRuggele|deRuggle|DeRuyter|deSaint-Sauveur|DeSantis|desCuirs|DeSentis|DeShane|DeSilva|DesJardins|DesMarest|deSoleure|DeSoto|DeSpain|DeStefano|deSwaert|deSwart|DeVall|DeVane|DeVasher|DeVasier|DeVaughan|DeVaughn|DeVault|DeVeau|DeVeault|deVilleneuve|DeVilliers|DeVinney|DeVito|deVogel|DeVolder|DeVolld|DeVore|deVos|DeVries|deVries|DeWall|DeWaller|DeWalt|deWashington|deWerly|deWessyngton|DeWet|deWinter|DeWitt|DeWolf|DeWolfe|DeWolff|DeWoody|DeYager|DeYarmett|DeYoung|DiCicco|DiCredico|DiFillippi|DiGiacomo|DiMarco|DiMeo|DiMonte|DiNonno|DiPietro|diPilato|DiPrima|DiSalvo|du Bosc|du 
Hurst|DuFort|DuMars|DuPre|DuPue|DuPuy|FitzUryan|kummel|LaBarge|LaBarr|LaBauve|LaBean|LaBelle|LaBerteaux|LaBine|LaBonte|LaBorde|LaBounty|LaBranche|LaBrash|LaCaille|LaCasse|LaChapelle|LaClair|LaComb|LaCoste|LaCount|LaCour|LaCroix|LaFarlett|LaFarlette|LaFerry|LaFlamme|LaFollette|LaForge|LaFortune|LaFoy|LaFramboise|LaFrance|LaFuze|LaGioia|LaGrone|LaLiberte|LaLonde|LaLone|LaMaster|LaMay|LaMere|LaMont|LaMotte|LaPeer|LaPierre|LaPlante|LaPoint|LaPointe|LaPorte|LaPrade|LaRocca|LaRochelle|LaRose|LaRue|LaVallee|LaVaque|LaVeau|LeBleu|LeBoeuf|LeBoiteaux|LeBoyteulx|LeCheminant|LeClair|LeClerc|LeCompte|LeCroy|LeDuc|LeFevbre|LeFever|LeFevre|LeFlore|LeGette|LeGrand|LeGrave|LeGro|LeGros|LeJeune|LeMaistre|LeMaitre|LeMaster|LeMesurier|LeMieux|LeMoe|LeMoigne|LeMoine|LeNeve|LePage|LeQuire|LeQuyer|LeRou|LeRoy|LeSuer|LeSueur|LeTardif|LeVally|LeVert|LoMonaco|Macabe|Macaluso|MacaTasney|Macaulay|Macchitelli|Maccoone|Maccurry|Macdermattroe|Macdiarmada|Macelvaine|Macey|Macgraugh|Machan|Machann|Machum|Maciejewski|Maciel|Mackaben|Mackall|Mackartee|Mackay|Macken|Mackert|Mackey|Mackie|Mackin|Mackins|Macklin|Macko|Macksey|Mackwilliams|Maclean|Maclinden|Macomb|Macomber|Macon|Macoombs|Macraw|Macumber|Macurdy|Macwilliams|MaGuinness|MakCubyn|MakCumby|Mcelvany|Mcsherry|Op den Dyck|Op den Graeff|regory|Schweißguth|StElmo|StGelais|StJacques|te Boveldt|VanAernam|VanAken|VanAlstine|VanAmersfoort|VanAntwerp|VanArlem|VanArnam|VanArnem|VanArnhem|VanArnon|VanArsdale|VanArsdalen|VanArsdol|vanAssema|vanAsten|VanAuken|VanAwman|VanBaucom|VanBebber|VanBeber|VanBenschoten|VanBibber|VanBilliard|vanBlare|vanBlaricom|VanBuren|VanBuskirk|VanCamp|VanCampen|VanCleave|VanCleef|VanCleve|VanCouwenhoven|VanCovenhoven|VanCowenhoven|VanCuren|VanDalsem|VanDam|VanDe Poel|vanden Dijkgraaf|vanden Kommer|VanDer Aar|vander Gouwe|VanDer Honing|VanDer Hooning|vander Horst|vander Kroft|vander Krogt|VanDer Meer|vander Meulen|vander Putte|vander Schooren|VanDer Veen|VanDer Ven|VanDer Wal|VanDer Weide|VanDer Willigen|vander Wulp|vander 
Zanden|vander Zwan|VanDer Zweep|VanDeren|VanDerlaan|VanDerveer|VanderWoude|VanDeursen|VanDeusen|vanDijk|VanDoren|VanDorn|VanDort|VanDruff|VanDryer|VanDusen|VanDuzee|VanDuzen|VanDuzer|VanDyck|VanDyke|VanEman|VanEmmen|vanEmmerik|VanEngen|vanErp|vanEssen|VanFleet|VanGalder|VanGelder|vanGerrevink|VanGog|vanGogh|VanGorder|VanGordon|VanGroningen|VanGuilder|VanGundy|VanHaaften|VanHaute|VanHees|vanHeugten|VanHise|VanHoeck|VanHoek|VanHook|vanHoorn|VanHoornbeeck|VanHoose|VanHooser|VanHorn|VanHorne|VanHouten|VanHoye|VanHuijstee|VanHuss|VanImmon|VanKersschaever|VanKeuren|VanKleeck|VanKoughnet|VanKouwenhoven|VanKuykendaal|vanLeeuwen|vanLent|vanLet|VanLeuven|vanLingen|VanLoozen|VanLopik|VanLuven|vanMaasdijk|VanMele|VanMeter|vanMoorsel|VanMoorst|VanMossevelde|VanNaarden|VanNamen|VanNemon|VanNess|VanNest|VanNimmen|vanNobelen|VanNorman|VanNormon|VanNostrunt|VanNote|VanOker|vanOosten|VanOrden|VanOrder|VanOrma|VanOrman|VanOrnum|VanOstrander|VanOvermeire|VanPelt|VanPool|VanPoole|VanPoorvliet|VanPutten|vanRee|VanRhijn|vanRijswijk|VanRotmer|VanSchaick|vanSchelt|VanSchoik|VanSchoonhoven|VanSciver|VanScoy|VanScoyoc|vanSeters|VanSickle|VanSky|VanSnellenberg|vanStaveren|VanStraten|VanSuijdam|VanTassel|VanTassell|VanTessel|VanTexel|VanTuyl|VanValckenburgh|vanValen|VanValkenburg|VanVelsor|VanVelzor|VanVlack|VanVleck|VanVleckeren|VanWaard|VanWart|VanWassenhove|VanWinkle|VanWoggelum|vanWordragen|VanWormer|VanZuidam|VanZuijdam|VonAdenbach|vonAllmen|vonBardeleben|vonBerckefeldt|VonBergen|vonBreyman|VonCannon|vonFreymann|vonHeimburg|VonHuben|vonKramer|vonKruchenburg|vonPostel|VonRohr|VonRohrbach|VonSass|VonSasse|vonSchlotte|VonSchneider|VonSeldern|VonSpringer|VonVeyelmann|VonZweidorff".split('|') | |
#"# fix syntax highlighting in vim | |
import re | |
# Case-insensitive patterns matching a leading "Mc" / "Mac" plus the letter
# right after it, but only when at least one more word character follows.
mc = re.compile(r"^Mc(\w)(?=\w)", re.IGNORECASE)
mac = re.compile(r"^Mac(\w)(?=\w)", re.IGNORECASE)
class Name(str):
    """String subclass whose value is the capitalized form of a name.

    The instance's string value is whatever ``Capitalize`` returns for the
    constructor argument; the argument itself is kept, unchanged, on the
    ``original`` attribute.
    """

    def __new__(cls, value=''):
        instance = str.__new__(cls, Capitalize(value))
        instance.original = value
        return instance
def _capitalize_part(part):
    """Capitalize one hyphen-free word, turning "Mcx..." / "Macx..." into "McX..." / "MacX..."."""
    part = part.capitalize()
    # A callable replacement upper-cases the captured letter only when the
    # pattern actually matches, so short words need no exception handling.
    part = mc.sub(lambda m: "Mc" + m.group(1).upper(), part)
    part = mac.sub(lambda m: "Mac" + m.group(1).upper(), part)
    return part


def _apply_special_casing(name, special):
    """Force the exact casing of *special* onto every whole-word match in *name*."""
    lowered = name.lower()
    target = special.lower()
    size = len(special)
    pos = lowered.find(target)
    while pos > -1:
        end = pos + size
        # A match only counts when it is delimited by spaces or by the ends
        # of the string; a hit inside a longer word is skipped.
        starts_word = pos == 0 or name[pos - 1] == ' '
        ends_word = end == len(name) or name[end] == ' '
        if starts_word and ends_word:
            name = name[:pos] + special + name[end:]
        # Keep scanning: an earlier embedded hit (the "iv" in "Sullivan")
        # must not hide a later stand-alone one ("IV").
        pos = lowered.find(target, pos + 1)
    return name


def Capitalize(name):
    """Does the work of capitalizing a name (can be a full name).

    Steps, in order:

    1. The name is split on whitespace; each word is further split on
       hyphens and every piece is capitalized on its own, with "Mc"/"Mac"
       prefixes getting a capital on the following letter.  Pieces are
       re-joined with hyphens and words with single spaces.
    2. The particles "Dit", "Van" and "De" are lower-cased when they stand
       between two spaces.
    3. Every entry of the module-level ``surnames`` and ``suffixes`` lists
       that occurs as a space-delimited word (or word group) is rewritten
       with the exact casing stored in the list.

    Args:
        name: the name to capitalize, as a string.

    Returns:
        The capitalized name, with runs of whitespace collapsed to single
        spaces and no leading or trailing whitespace.
    """
    words = []
    for word in name.split():
        # Hyphenated pieces are handled inside their own word, so hyphen
        # positions cannot drift when surrounding whitespace is collapsed.
        pieces = [_capitalize_part(piece) for piece in word.split('-')]
        words.append('-'.join(pieces))
    name = ' '.join(words)

    # funky stuff (no capitalization)
    name = name.replace(" Dit ", " dit ")
    name = name.replace(" Van ", " van ")
    name = name.replace(" De ", " de ")

    # special surnames and suffixes
    for special in surnames + suffixes:
        name = _apply_special_casing(name, special)
    return name
import unittest | |
class TestCapitalize(unittest.TestCase):
    """Checks Capitalize() against a handful of representative names."""

    def test_simple(self):
        # Plain lower-case words.
        result = Capitalize("john smith")
        self.assertEqual(result, "John Smith")

    def test_mc(self):
        # Upper-case input with an "Mc" prefix.
        result = Capitalize("BOB MCELROY")
        self.assertEqual(result, "Bob McElroy")

    def test_mac(self):
        # Lower-case input with a "Mac" prefix.
        result = Capitalize("josh macelvany")
        self.assertEqual(result, "Josh MacElvany")

    def test_surname(self):
        # Surname taken from the special-surnames list.
        result = Capitalize("BILL VANWINKLE")
        self.assertEqual(result, "Bill VanWinkle")

    def test_van_space(self):
        # Stand-alone particle between two words.
        result = Capitalize("joe van buren")
        self.assertEqual(result, "Joe van Buren")

    def test_suffix(self):
        # Generational suffix after a comma.
        result = Capitalize("bob jones, iii")
        self.assertEqual(result, "Bob Jones, III")

    def test_hyphenated(self):
        # Both halves of a hyphenated surname.
        result = Capitalize("mary johnson-smith")
        self.assertEqual(result, "Mary Johnson-Smith")

    def test_iv_not_in_suffix(self):
        # "iv" embedded in a word is not treated as a suffix.
        result = Capitalize("DAVID SULLIVAN")
        self.assertEqual(result, "David Sullivan")

    def test_mc_in_middle(self):
        # "mc" in the middle of a word is not treated as a prefix.
        result = Capitalize("jason bumcorn")
        self.assertEqual(result, "Jason Bumcorn")
# Run the unit tests when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment