Skip to content

Instantly share code, notes, and snippets.

@apt142
Created August 22, 2018 16:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apt142/4dd4915a298d354ba84bf942ff5bbd18 to your computer and use it in GitHub Desktop.
Save apt142/4dd4915a298d354ba84bf942ff5bbd18 to your computer and use it in GitHub Desktop.
Python object for taking a full name in a single string and parsing out the relevant name parts.
import re
class NameParser(object):
"""
Parses a name from a given string.
This parser is definitely English/Western specific.
Certain assumptions are made such as
* first/last name order
* prefixes
* suffixes
"""
full_name = ''
prefix = ''
first = ''
nickname = ''
middle = ''
last = ''
suffix = ''
prefixes = ['mr', 'mrs', 'mr & mrs', 'ms', 'dr', 'miss', 'rev', 'prof', 'pres', 'gov', 'sgt',
'maj', 'cpl', 'pvt', 'lt', 'col', 'lt col', 'rev', 'gen', 'sir']
primary_prefixes = ['mr', 'mrs', 'mr & mrs']
suffixes = ['esq','esquire','jr','sr','2','ii','iii','iv', 'v','vi','vi','vii','viii']
last_name_prefixes = ['bar','ben','bin','da','dal','de la', 'de los', 'de', 'del','der','di',
'ibn','la','le','los','san','st','ste','van', 'van der', 'van den', 'vel','von']
# By no means exhaustive, but it should 80/20 it.
double_first_names = ['Anna Grace', 'Anna Lane', 'Ann Claire', 'Bettie Bee', 'Betty Grace',
'Betty Bell', 'Billie Beth', 'Billie Jean', 'Billie Jo', 'Carol Kaye', 'Carol Sue', 'Sadie Mae', 'Sadie Bee',
'Sarah Michelle', 'Emma Lou', 'Emma Grace', 'Emma Dale', 'Georgia Lee', 'Georgia Love', 'Mary Love',
'Mary Grace', 'Mary Jewel', 'Mary Laird', 'Mary Grace', 'Mary Jo', 'Mary Frances', 'Twyla Fay', 'Tressa Gail',
'Peggy Sue', 'Jack David', 'Jack Carl', 'Jessie Joe', 'Jessie Mac', 'Jimmie Dale', 'Jo Allen', 'John Andrew',
'John Mark', 'John Moss', 'John Preston', 'John Mark', 'John Michael', 'Bobby Jack', 'Billy Bob', 'Billie Bob',
'Charles Alan', 'Charlie Clyde', 'David Ryan', 'David Dash', 'Don Allan', 'Don Michael', 'Eddie Wayne',
'Terry Mack', 'Ford Allen', 'Frank Finn', 'Will Brock', 'Zack Alan']
def __init__(self, full_name=None):
self.full_name = full_name
def parse(self, full_name=None):
"""
Parse a supplied full name and return back all the known name parts.
:param full_name: Full name to parse
:return: Dict
"""
if full_name:
self.full_name = full_name
else:
full_name = self.full_name
self.prefix = ''
self.first = ''
self.nickname = ''
self.middle = ''
self.last = ''
self.suffix = ''
full_name = self.preprocess(full_name)
# Each method identifies, stores and slices off a piece of the full string
# Each extraction will remove a known quantity
# This improves certainty in finding remaining pieces
# Extract and remove nickname
full_name = self.extract_nickname(full_name)
# Extract suffixes. eg: "jr", "sr", "viii"
full_name = self.extract_suffix(full_name)
# Extract prefixes. eg: "mr", "rev", etc
full_name = self.extract_prefix(full_name)
# If the name has a comma flip the name around the comma
if ',' in full_name:
full_name = self.flip(full_name)
# Extract Last name
full_name = self.extract_last_name(full_name)
# Extract first names
full_name = self.extract_first_name(full_name)
# The remaining bit is the middle name
self.middle = full_name
return self.get_parts()
@staticmethod
def flip(full_name):
"""
Turns Bond, James to James Bond
:param full_name: Name to flip
:return: String
"""
parts = full_name.split(',')
parts = parts[::-1]
parts = [part.strip() for part in parts]
full_name = ' '.join(parts)
return full_name
def preprocess(self, full_name):
"""
Cleans the full name string into standard terms and usages
:param self:
:param full_name: String to clean up
:return: string
"""
# Standardize "&" notations
for amp in ['/', ' and ', ' or ']:
regex = re.compile(re.escape(amp), re.IGNORECASE)
full_name = regex.sub('&', full_name)
full_name = full_name.replace('&', ' & ')
# Standardize the apostrophes
full_name = full_name.replace('`', '\'')
# Ensure that all commas have spaces after them
full_name = full_name.replace(',', ', ')
# Allow only A-Z(a-z), Space, ", &, ', (, ), -, comma
regex = re.compile("[^A-Za-z0-9 \"&'\(\)\-,]")
full_name = regex.sub(' ', full_name)
# Turn all spaces into single spaces
regex = re.compile('[ \t]+')
full_name = regex.sub(' ', full_name)
# Trim off the white space on both ends
full_name = full_name.strip()
return full_name
def extract_first_name(self, full_name):
"""
Extracts the first name
:param full_name:
:return:
"""
# Identify if we have a couples name eg. Jen & Jeff Turpin
# Extract off the first person's name with the &. We'll reattach later.
couples_first_name = ''
amp_index = full_name.find('&')
if amp_index != -1:
couples_first_name = full_name[0:amp_index + 1].strip()
full_name = full_name[amp_index + 1:].strip()
# Treat the name as if it was one individual and find that first space
index = full_name.find(' ')
if index != -1:
# Do we have a likely double name? If so, use that.
for double_first_name in self.double_first_names:
if full_name.lower().find(double_first_name.lower()) == 0 and len(double_first_name) > index:
index = len(double_first_name)
# Set first name include couples back into it. (likely blank)
self.first = couples_first_name + ' ' + full_name[:index]
self.first = self.first.lower().title().strip()
full_name = full_name[index:]
else:
# If no space we have just a one word string left. We'll assume it's a first name.
self.first = couples_first_name + ' ' + full_name
self.first = self.first.strip()
full_name = ''
return full_name.strip()
def extract_last_name(self, full_name):
"""
Extract the Last Name
:param self:
:param full_name:
:return:
"""
# Get the last space.
# Where we'll split. Set by default to be the last space in the string
index = full_name.rfind(' ')
if index == -1:
# Just one word left. Likely a first name so we'll pass on a last name.
self.last = ''
else:
# Iterate through the prefixes and see if we can find a better point to split the name at.
# Turns out there is a finite number of these last name prefixes
for prefix in self.last_name_prefixes:
find = full_name.find(' ' + prefix + ' ')
# If it is earlier in the string than our index, we'll use that location instead.
if find != -1:
if find < index:
index = find
self.last = self.uclast(full_name[index:]).strip()
full_name = full_name[0: index].strip()
return full_name
def uclast(self, last_name):
"""
Uppercases a full last name keeping in mind prefixes and unusual casing rules
:param self:
:param last:
:return:
"""
last_name_parts = last_name.split(' ')
clean_parts = []
for part in last_name_parts:
# If it is a prefix then don't capitalize it.
if part.lower() in self.last_name_prefixes:
clean_parts.append(part.lower())
else:
# For those with hyphenated last names
compound_name_parts = part.split('-')
compound_name_parts = [self.uclastword(compound_part) for compound_part in compound_name_parts]
compound_name = '-'.join(compound_name_parts)
clean_parts.append(compound_name)
return ' '.join(clean_parts)
"""
* Properly uppercases a last name part
*
* @param string lastNamePart Last name part
*
* @return string
"""
def uclastword(self, last_name_part):
modifiers = ['mc', 'mac', 'o\'']
prefix = ''
for modifier in modifiers:
if last_name_part.lower().find(modifier) == 0:
prefix = modifier.lower()
break
postfix = last_name_part[len(prefix):].lower()
return prefix.title() + postfix.title()
def extract_nickname(self, full_name):
"""
Extracts the nickname from the full_name. The nick name is stored and the remaining name is returned.
eg: James "The Party" Benedict
:param self:
:param full_name:
:return:
"""
regex = re.compile("\"(.+?)\" ")
nicknames = regex.findall(full_name)
if nicknames:
self.nickname = nicknames[0]
full_name = full_name.replace("\"" + self.nickname + "\" ", '')
return full_name
def extract_suffix(self, full_name):
"""
Extracts the suffix from the full_name and stores it. The remaining name parts are returned.
:param self:
:param full_name:
:return:
"""
parts = full_name.split(' ')
potential_suffix = parts[-1]
if potential_suffix.lower() in self.suffixes:
self.suffix = potential_suffix
del parts[-1]
full_name = ' '.join(parts)
return full_name
"""
* Extracts the Prefix
*
* @param string full_name Full Name
*
* @return string
"""
def extract_prefix(self, full_name):
# Sort by biggest first
self.prefixes.sort(key=len, reverse=True)
prefix = ''
for potential_prefix in self.prefixes:
if potential_prefix.lower() in self.primary_prefixes:
potential_prefix = potential_prefix + ' '
if potential_prefix.lower() in full_name.lower():
prefix = potential_prefix.strip()
break
if len(prefix):
self.prefix = full_name[0: len(prefix) + 1].strip()
full_name = full_name[len(prefix) + 1:].strip()
return full_name
def get_parts(self):
"""
Returns all the known parts of the name
:return: Dict
"""
return {
'prefix': self.prefix,
'first': self.first,
'nickname': self.nickname,
'middle': self.middle,
'last': self.last,
'suffix': self.suffix
}
def get_full_name(self):
"""
Builds the parsed name back from the pieces
:return: String
"""
full_name = self.prefix + ' ' + self.first
if self.nickname:
full_name += ' "' + self.nickname + '" '
full_name += ' ' + self.middle + ' ' + self.last + ' ' + self.suffix
return full_name
# Demo: parse a handful of sample names and print the resulting parts.
# Guarded so that importing this module does not produce output.
if __name__ == '__main__':
    parser = NameParser()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(parser.parse('James Bond'))
    print(parser.parse('Bond, James'))
    print(parser.parse('Rev Martin Luther King Jr'))
    print(parser.parse('Billy "Bobcat" Goldthwait'))
    print(parser.parse('Leonardo da Vinci'))
    print(parser.parse('Sir Francis Drake'))
    print(parser.parse('Billie Bob Marion Turpin'))
    print(parser.parse('Mr and Mrs James Turpin'))
    print(parser.parse('Jane and James Turpin'))
    print(parser.parse('Dr McNinja'))
    print(parser.parse('Antonio de la Torre'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment