Skip to content

Instantly share code, notes, and snippets.

@alexsavio
Last active June 10, 2021 10:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexsavio/bcfd5b17e041833ef73f62d90f56951c to your computer and use it in GitHub Desktop.
Save alexsavio/bcfd5b17e041833ef73f62d90f56951c to your computer and use it in GitHub Desktop.
Recapitalize names
"""
Recapitalize a string of words that has gone through pre-processing, word-cutting and case-lowering.
"""
import re
import difflib
from typing import Iterator, List, Tuple
def recapitalize_name(original: str, result: str) -> str:
    """Return `result` with its letters capitalized as they appear in `original`.

    When both strings have the same length, the capital positions found in
    `original` are applied to `result` directly. Otherwise both strings are
    split on whitespace, each word of `result` is paired with its closest
    word in `original`, and the capitals of that word are carried over.
    The recapitalized words are joined back with single spaces.

    Args:
        original: The text as it was before processing, with its capitals.
        result: The processed (typically lower-cased, shortened) text.

    Returns:
        The recapitalized version of `result`.
    """
    if len(result) == len(original):
        return _capitalize(result, _find_capital_letters(original))

    recapitalized_words = []
    for word, original_word in _find_word_matches(result.split(), original.split()):
        if word != original_word:
            capitals = _find_capital_letters(original_word)
            # The matched original word can be longer, shorter or shifted
            # with respect to `word` (e.g. "(IBM)" vs "ibm"), so its capital
            # positions are translated before being applied. Using them
            # as-is could raise IndexError or capitalize the wrong letters.
            aligned = _align_capital_indices(capitals, original_word, word)
            word = _capitalize(word, aligned)
        recapitalized_words.append(word)
    return ' '.join(recapitalized_words)


def _align_capital_indices(indices: List[int], original_word: str, word: str) -> List[int]:
    """Translate positions in `original_word` into the matching positions in `word`.

    Positions whose character has no counterpart in `word` are dropped.
    """
    # Lower-case character by character (not the whole string) so that every
    # element keeps the index it has in the un-lowered word.
    lowered_original = [character.lower() for character in original_word]
    lowered_word = [character.lower() for character in word]
    matcher = difflib.SequenceMatcher(None, lowered_original, lowered_word, autojunk=False)

    positions = {}
    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        same_size = (a_end - a_start) == (b_end - b_start)
        # Equal runs map one-to-one; so do same-sized replacements, which
        # covers substituted characters such as a stripped accent.
        if tag == 'equal' or (tag == 'replace' and same_size):
            for offset in range(a_end - a_start):
                positions[a_start + offset] = b_start + offset
    return [positions[index] for index in indices if index in positions]
def _capitalize(word: str, indices: Iterator[int]) -> str:
    """Return `word` with the characters at the given `indices` upper-cased.

    `word` is returned untouched when either argument is empty. No range
    check is done on the positions: one that falls outside `word` raises
    IndexError.
    """
    if not (indices and word):
        return word

    letters = list(word)
    for position in indices:
        letters[position] = word[position].upper()
    return "".join(letters)
def _find_capital_letters(word: str) -> List[int]:
    """Return the list of indices where capital letters are found in `word`.

    Every character that `str.isupper` reports as upper case counts, so
    accented and other non-ASCII capitals (e.g. "Á", "Ñ") are found as well
    as the plain A-Z letters.

    Returns:
        The positions in ascending order; an empty list for an empty `word`.
    """
    if not word:
        return []
    return [index for index, character in enumerate(word) if character.isupper()]
def _find_word_matches(result_words: Iterator[str], original_words: Iterator[str],) -> List[Tuple[str, str]]:
    """Find the words in `original_words` that match the words in `result_words`.

    Each result word is compared against the lower-cased original words with
    `difflib.get_close_matches`. Words shorter than 4 characters use a
    cutoff of `1 - 1/length`; longer ones use 0.8.
    This is not perfect with words lengths < 3.

    Returns:
        A list of pairs (<result word>, <matching original>), in the order of
        `result_words`. A word with no close enough match, or an empty word,
        is paired with itself.
    """
    def _get_diff_match_cutoff(word_length: int) -> float:
        if word_length < 4:
            return 1 - 1/word_length
        return 0.8

    lowered_source_words = {word.lower(): word for word in original_words}
    # Built once: the candidates are the same for every result word.
    candidates = list(lowered_source_words)

    result_words_matches = []
    for word in result_words:
        if not word:
            # An empty word has nothing to match, and its cutoff would
            # divide by zero.
            result_words_matches.append((word, word))
            continue
        cutoff = _get_diff_match_cutoff(len(word))
        closest_matches = difflib.get_close_matches(word, candidates, n=1, cutoff=cutoff)
        closest_match = lowered_source_words[closest_matches[0]] if closest_matches else word
        result_words_matches.append((word, closest_match))
    return result_words_matches
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment