Skip to content

Instantly share code, notes, and snippets.

@himlohiya
Last active July 6, 2018 21:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save himlohiya/be016466bf3487fc3421ac66e6cd10a0 to your computer and use it in GitHub Desktop.
Save himlohiya/be016466bf3487fc3421ac66e6cd10a0 to your computer and use it in GitHub Desktop.
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document in ``corpus`` and return the cleaned texts.

    Each document is passed through the steps below, in this order. Steps
    marked with a flag are skipped when that flag is false.

    1. ``html_stripping``: ``strip_html_tags(doc)``.
    2. ``bytes`` documents are decoded as UTF-8 (a leading BOM is dropped,
       undecodable bytes become ``"?"``); ``str`` documents are left as-is.
    3. ``accented_char_removal``: ``remove_accented_chars(doc)``.
    4. ``contraction_expansion``: ``expand_contractions(doc)``.
    5. ``text_lower_case``: ``doc.lower()``.
    6. Runs of ``\\r`` / ``\\n`` are collapsed to a single space.
    7. Twitter-style ``@handles`` and ``http(s)://`` / ``www.`` URLs are
       removed.
    8. ``text_lemmatization``: ``lemmatize_text(doc)``.
    9. ``special_char_removal``: ``{ } . ( ) - !`` are padded with spaces and
       then ``remove_special_characters(doc, remove_digits=remove_digits)``
       is applied.
    10. Runs of spaces are collapsed to one space.
    11. ``stopword_removal``:
        ``remove_stopwords(doc, is_lower_case=text_lower_case)``.

    The helper functions are defined outside this block; their exact
    behaviour is not visible here.

    Args:
        corpus: Iterable of documents (``str`` or UTF-8 encoded ``bytes``).
        html_stripping: Strip HTML markup first.
        contraction_expansion: Expand contractions.
        accented_char_removal: Remove accented characters.
        text_lower_case: Lowercase the text.
        text_lemmatization: Lemmatize the text.
        special_char_removal: Remove special characters.
        stopword_removal: Remove stopwords.
        remove_digits: Forwarded to ``remove_special_characters``.

    Returns:
        A list with one normalized string per input document, in input order.
    """
    # Compile once, outside the per-document loop.
    # Only CR/LF belong in the class; a '|' inside [...] is a literal pipe.
    newline_pattern = re.compile(r'[\r\n]+')
    # The dot after "www" is escaped so it only matches a literal '.'.
    handle_or_url_pattern = re.compile(
        r'@[A-Za-z0-9_]+|https?://[^ ]+|www\.[^ ]+', re.IGNORECASE)
    # The hyphen is escaped: an unescaped "(-)" is a range from '(' to ')'
    # and would never match '-'.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # Decode raw bytes (dropping a UTF-8 BOM); text is already decoded
        # and is kept unchanged instead of falling back to an undefined name.
        if isinstance(doc, bytes):
            doc = doc.decode("utf-8-sig", errors="replace")
            doc = doc.replace(u"\ufffd", "?")
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = newline_pattern.sub(' ', doc)
        # Remove @handles and URLs while they are still intact: the
        # special-character step below pads '.' with spaces, which would
        # split "https://t.co/x" so the URL pattern could only match a
        # fragment of it.
        doc = handle_or_url_pattern.sub('', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment