Skip to content

Instantly share code, notes, and snippets.

@msukmanowsky
Last active July 22, 2019 20:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msukmanowsky/155f7abc79523e197c87d49ebfc45c9c to your computer and use it in GitHub Desktop.
Save msukmanowsky/155f7abc79523e197c87d49ebfc45c9c to your computer and use it in GitHub Desktop.
Guess a delimiter being used to split a string using term frequencies of non-word characters.
import re
from collections import Counter
from typing import Iterable, Optional
def guess_string_separator(
    string: str,
    to_ignore: Iterable[str] = tuple(),
    ignore_empty_strings: bool = True,
) -> Optional[str]:
    """Guess the delimiter used in a string from non-word term frequencies.

    The string is split on runs of word characters; the pieces found between
    (and around) those runs are the delimiter candidates. The most frequent
    candidate is returned, provided it occurs more than once.

    E.g. "mike, john, bob" -> ", "

    Args:
        string: The text to inspect.
        to_ignore: Candidates that must never be returned. Any iterable of
            strings is accepted, including one-shot iterators. A bare ``str``
            is tested with substring containment, as ``in`` does for strings.
        ignore_empty_strings: When true, discard candidates that are empty or
            contain only whitespace. Note that this also rules out a plain
            space as a possible answer.

    Returns:
        The most frequent remaining candidate if it appears at least twice
        (ties go to the candidate seen first), otherwise ``None``.
    """
    candidates = re.split(r"\w+", string)
    if ignore_empty_strings:
        # str.strip() is falsy for both "" and whitespace-only pieces.
        candidates = [piece for piece in candidates if piece.strip()]

    # Materialize the ignore list once: membership-testing a one-shot iterator
    # per candidate would exhaust it on the first test and ignore nothing
    # afterwards. A bare string is left alone so its substring semantics hold.
    ignored = to_ignore if isinstance(to_ignore, str) else frozenset(to_ignore)
    if ignored:
        candidates = [piece for piece in candidates if piece not in ignored]

    # Only the top-ranked candidate matters; a lone occurrence is not evidence
    # of a repeated delimiter.
    top = Counter(candidates).most_common(1)
    if top and top[0][1] > 1:
        return top[0][0]
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment