Created
August 29, 2018 14:00
-
-
Save drkane/260c38f9aab6018471449947ebb1a5fb to your computer and use it in GitHub Desktop.
Normalise the strings in a pandas Series (built for the names of charities and other organisations).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# takes a pandas series | |
def normalize_string(s):
    """Normalise organisation names held in a pandas Series for matching.

    Each value is converted to a string, lower-cased and then cleaned:

    * a possessive ``'s`` loses its apostrophe (``john's`` -> ``johns``);
    * ``&`` becomes the word ``and``;
    * single letters separated by spaces or full stops are joined into one
      token (``r s p b`` and ``r.s.p.b.`` both become ``rspb``);
    * every remaining run of characters outside ``0-9a-zA-Z`` becomes a
      single space;
    * a trailing word ``ltd`` is expanded to ``limited`` and a leading word
      ``the`` is removed.

    Args:
        s: pandas Series holding the names to normalise.

    Returns:
        A new Series containing the normalised strings.
    """

    def join_spaced_letters(match):
        # "r s p b" -> "rspb"; the pattern always ends on a letter, so there
        # is never a trailing space to preserve here.
        return match.group(0).replace(" ", "")

    def join_dotted_letters(match):
        matched = match.group(0)
        joined = matched.replace(".", "").replace(" ", "")
        # The pattern may swallow the space after the last full stop; put it
        # back so the acronym does not fuse with the following word.
        return joined + " " if matched.endswith(" ") else joined

    s = s.astype(str).str.lower()

    # regex=True/False is passed explicitly on every call: the pandas default
    # changed to regex=False in 2.0, which would treat the patterns literally
    # and reject the callable replacements.
    s = s.str.replace(r"'s\b", "s", regex=True)

    # Must happen before punctuation is stripped below, otherwise the
    # ampersand is discarded instead of being spelled out.
    s = s.str.replace("&", " and ", regex=False)

    s = s.str.replace(
        r"\b([A-Za-z] )+[A-Za-z]\b", join_spaced_letters, regex=True
    )
    s = s.str.replace(r"\b((?:[A-Za-z]\. ?)+)", join_dotted_letters, regex=True)

    # Replace non-alphanumeric runs (including whitespace runs) with a single
    # space, then trim so the ^ and $ anchors below see the real first and
    # last words rather than a space left behind by punctuation.
    s = s.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
    s = s.str.strip()

    # \b keeps words that merely end in "ltd" from being rewritten.
    s = s.str.replace(r"\bltd$", "limited", regex=True)
    s = s.str.replace(r"^the\b", "", regex=True)

    # Removing a leading "the" leaves the space that followed it.
    s = s.str.strip()
    return s.astype(str)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment