Skip to content

Instantly share code, notes, and snippets.

@hanayashiki
Created June 17, 2019 06:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hanayashiki/a925eec82def5a63a5741fdeff618cec to your computer and use it in GitHub Desktop.
Save hanayashiki/a925eec82def5a63a5741fdeff618cec to your computer and use it in GitHub Desktop.
import re
# Substitution rules ported one-to-one from the Perl reference quoted in
# normalize_text's docstring.  They are applied in order; each entry is
# (compiled pattern, replacement template).
_NORMALIZE_RULES = (
    # Pad punctuation with spaces.  The explicit list below is the expansion
    # of the Perl ranges {-~  [-`  space-&  (-+  :-@  plus "/".  Note that
    # "_" belongs to the [-` range, while "-" belongs to none of them.
    (re.compile(r"""([{|}~\[\\\]^_` !"#$%&()*+:;<=>?@/])"""), r" \1 "),
    # Split off a period/comma unless it is preceded by a digit.
    (re.compile(r"([^0-9])([.,])"), r"\1 \2 "),
    # Split off a period/comma unless it is followed by a digit.
    (re.compile(r"([.,])([^0-9])"), r" \1 \2"),
    # Split off a dash only when it is preceded by a digit.
    (re.compile(r"([0-9])(-)"), r"\1 \2 "),
)


def normalize_text(t: str) -> str:
    r"""
    Lowercase and tokenize English text following the Perl rules below.

    The docstring is a raw string so the Perl backslash escapes quoted here
    are not interpreted as (invalid) Python escape sequences.

    :param t: English text
    :return: Normalized text: lowercased, with punctuation separated from
        words by single spaces and no leading or trailing whitespace.
        Periods and commas between digits (``3.14``, ``1,000``) are kept
        attached, and a dash is split off only after a digit. The text is
        normalized according to below:

    # language-dependent part (assuming Western languages):
    $norm_text = " $norm_text ";
    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//; # no leading space
    $norm_text =~ s/\s+$//; # no trailing space
    """
    # The surrounding spaces guarantee that a period/comma at either end of
    # the input still has a neighbouring non-digit character to match on.
    # NOTE: str.lower() also lowercases non-ASCII letters, whereas the Perl
    # tr/// only maps A-Z; the two agree for plain English text.
    text = (" " + t + " ").lower()

    for pattern, replacement in _NORMALIZE_RULES:
        text = pattern.sub(replacement, text)

    # One space only between words, then no leading/trailing space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
if __name__ == '__main__':
    # Demo run: normalize one sample sentence containing sentence-final
    # periods, a numeric range and a dotted domain name, then print it.
    sample = "fuck three bitches. 2010 year of 100-500 shit.fuck.baidu.com"
    print(normalize_text(sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment