Skip to content

Instantly share code, notes, and snippets.

@drkane
Created August 29, 2018 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drkane/260c38f9aab6018471449947ebb1a5fb to your computer and use it in GitHub Desktop.
Save drkane/260c38f9aab6018471449947ebb1a5fb to your computer and use it in GitHub Desktop.
Normalise the strings in a pandas Series (built for the names of charities and other organisations).
# takes a pandas series
def normalize_string(s):
    """Normalise organisation-style names held in a pandas Series.

    Each value is converted with ``astype(str)``, lower-cased and then
    cleaned so that differently formatted spellings of the same name
    collapse to one canonical form:

    * possessive ``'s`` loses its apostrophe ("john's" -> "johns");
    * spaced acronyms are joined ("r s p b" -> "rspb");
    * dotted acronyms are joined ("r.s.p.b." -> "rspb");
    * ``&`` becomes the word "and";
    * every other run of non-alphanumeric characters becomes one space;
    * a trailing "ltd" is expanded to "limited";
    * a leading "the" is dropped;
    * whitespace is collapsed and trimmed.

    Args:
        s: pandas Series of names (any dtype; values are stringified).

    Returns:
        A pandas Series of normalised strings with the same index as ``s``.
    """

    def join_dotted_acronym(match):
        # Drop the dots and inner spaces, but keep one trailing space so the
        # acronym does not fuse with the word that follows it.
        matched = match.group(0)
        joined = matched.replace(".", "").replace(" ", "")
        return joined + " " if matched.endswith(" ") else joined

    s = s.astype(str).str.lower()

    # ``regex`` is passed explicitly on every call: the pandas default
    # changed to ``regex=False`` in 2.0, which would treat these patterns as
    # literal text and reject callable replacements.

    # Possessive apostrophes: "john's" -> "johns".
    s = s.str.replace(r"'s\b", "s", regex=True)

    # Acronyms written as spaced single letters: "r s p b" -> "rspb".
    s = s.str.replace(
        r"\b(?:[a-z] )+[a-z]\b",
        lambda match: match.group(0).replace(" ", ""),
        regex=True,
    )

    # Acronyms written with dots: "r.s.p.b." -> "rspb".
    s = s.str.replace(r"\b(?:[a-z]\. ?)+", join_dotted_acronym, regex=True)

    # Must run before the non-alphanumeric strip below, otherwise the
    # ampersand is already gone and "and" is never inserted.
    s = s.str.replace("&", " and ", regex=False)

    # Any remaining punctuation/symbols/whitespace run becomes one space.
    s = s.str.replace(r"[^0-9a-z]+", " ", regex=True)

    # Trim first so the ``^`` / ``$`` anchors below see the real first and
    # last words rather than padding left behind by stripped punctuation.
    s = s.str.strip()
    s = s.str.replace(r"\bltd$", "limited", regex=True)
    s = s.str.replace(r"^the\b", "", regex=True)

    # Collapse whatever spacing the substitutions above left behind.
    s = s.str.replace(r"\s+", " ", regex=True)
    s = s.str.strip()
    return s.astype(str)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment