# drkane/normalise_string.py

## normalise_string.py
# takes a pandas series
def normalize_string(s):
    """Return a lower-cased, normalised copy of a pandas Series of strings.

    Steps, applied in order to every value:

    1. Cast to ``str`` and lower-case.
    2. Drop the apostrophe from possessives (``foo's`` -> ``foos``).
    3. Join acronyms written with spaces (``a b c`` -> ``abc``) or with
       full stops (``u.k.`` -> ``uk``).
    4. Spell out ``&`` as ``and``.
    5. Turn every run of characters outside ``0-9a-zA-Z`` into one space.
    6. Expand a trailing ``ltd`` to ``limited`` and drop a leading ``the``.
    7. Collapse repeated whitespace and strip both ends.

    Args:
        s: pandas Series. Values are cast with ``astype(str)``, so non-string
            values (including missing ones) are normalised as their string
            form.

    Returns:
        A pandas Series of normalised strings with the same index as ``s``.
    """

    def join_spaced_acronym(match):
        # The pattern always ends on a letter, so no trailing space to keep.
        return match.group(0).replace(" ", "")

    def join_dotted_acronym(match):
        matched = match.group(0)
        joined = matched.replace(".", "").replace(" ", "")
        # Keep the separator between the acronym and the word that follows.
        return joined + " " if matched.endswith(" ") else joined

    s = s.astype(str).str.lower()

    # regex=True is explicit everywhere: pandas 2.0 made literal matching the
    # default, and callable replacements are rejected without it.
    s = s.str.replace(r"'s\b", "s", regex=True)
    s = s.str.replace(
        r"\b([A-Za-z] )+[A-Za-z]\b", join_spaced_acronym, regex=True
    )
    s = s.str.replace(r"\b((?:[A-Za-z]\. ?)+)", join_dotted_acronym, regex=True)

    # Must happen before punctuation is stripped, otherwise "&" is already
    # gone by the time we look for it.
    s = s.str.replace("&", " and ", regex=False)

    # replace non-alphanumeric characters with space
    s = s.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)

    # Strip first so the ^ / $ anchors see the real first and last words
    # (e.g. "foo ltd." has become "foo ltd " at this point).
    s = s.str.strip()
    s = s.str.replace(r"\bltd$", "limited", regex=True)
    s = s.str.replace(r"^the\b", "", regex=True)

    # Removing a leading "the" leaves a space behind; tidy up once more.
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()

    return s.astype(str)
	# takes a pandas series
	def normalize_string(s):
		"""Return a lower-cased, normalised copy of a pandas Series of strings.

		Steps, applied in order to every value:

		1. Cast to ``str`` and lower-case.
		2. Drop the apostrophe from possessives (``foo's`` -> ``foos``).
		3. Join acronyms written with spaces (``a b c`` -> ``abc``) or with
		   full stops (``u.k.`` -> ``uk``).
		4. Spell out ``&`` as ``and``.
		5. Turn every run of characters outside ``0-9a-zA-Z`` into one space.
		6. Expand a trailing ``ltd`` to ``limited`` and drop a leading ``the``.
		7. Collapse repeated whitespace and strip both ends.

		Args:
			s: pandas Series. Values are cast with ``astype(str)``, so
				non-string values (including missing ones) are normalised
				as their string form.

		Returns:
			A pandas Series of normalised strings with the same index as ``s``.
		"""

		def join_spaced_acronym(match):
			# The pattern always ends on a letter, so no trailing space to keep.
			return match.group(0).replace(" ", "")

		def join_dotted_acronym(match):
			matched = match.group(0)
			joined = matched.replace(".", "").replace(" ", "")
			# Keep the separator between the acronym and the word that follows.
			return joined + " " if matched.endswith(" ") else joined

		s = s.astype(str).str.lower()

		# regex=True is explicit everywhere: pandas 2.0 made literal matching
		# the default, and callable replacements are rejected without it.
		s = s.str.replace(r"'s\b", "s", regex=True)
		s = s.str.replace(
			r"\b([A-Za-z] )+[A-Za-z]\b", join_spaced_acronym, regex=True
		)
		s = s.str.replace(r"\b((?:[A-Za-z]\. ?)+)", join_dotted_acronym, regex=True)

		# Must happen before punctuation is stripped, otherwise "&" is already
		# gone by the time we look for it.
		s = s.str.replace("&", " and ", regex=False)

		# replace non-alphanumeric characters with space
		s = s.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)

		# Strip first so the ^ / $ anchors see the real first and last words
		# (e.g. "foo ltd." has become "foo ltd " at this point).
		s = s.str.strip()
		s = s.str.replace(r"\bltd$", "limited", regex=True)
		s = s.str.replace(r"^the\b", "", regex=True)

		# Removing a leading "the" leaves a space behind; tidy up once more.
		s = s.str.replace(r"\s+", " ", regex=True).str.strip()

		return s.astype(str)