Last active
July 6, 2018 21:06
-
-
Save himlohiya/be016466bf3487fc3421ac66e6cd10a0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Run every document in ``corpus`` through a text-cleaning pipeline.

    The steps run in a fixed order; each flag-controlled step is skipped when
    its flag is false:

    1. ``html_stripping``        -> ``strip_html_tags(doc)``
    2. (always) decode ``bytes`` documents as UTF-8, dropping a leading BOM
    3. ``accented_char_removal`` -> ``remove_accented_chars(doc)``
    4. ``contraction_expansion`` -> ``expand_contractions(doc)``
    5. ``text_lower_case``       -> ``doc.lower()``
    6. (always) collapse runs of CR/LF characters into a single space
    7. ``text_lemmatization``    -> ``lemmatize_text(doc)``
    8. ``special_char_removal``  -> pad ``{ } ( ) . !`` with spaces, then
       ``remove_special_characters(doc, remove_digits=remove_digits)``
    9. (always) delete ``@mentions``, ``http(s)://`` URLs and ``www.`` URLs
    10. (always) collapse runs of spaces into a single space
    11. ``stopword_removal``     -> ``remove_stopwords(doc, is_lower_case=...)``

    The helper functions named above are defined outside this block; their
    exact behaviour is not documented here.

    Args:
        corpus: Iterable of documents (``str``, or UTF-8 encoded ``bytes``).
        html_stripping: Apply step 1.
        contraction_expansion: Apply step 4.
        accented_char_removal: Apply step 3.
        text_lower_case: Apply step 5; also forwarded to ``remove_stopwords``
            as ``is_lower_case``.
        text_lemmatization: Apply step 7.
        special_char_removal: Apply step 8.
        stopword_removal: Apply step 11.
        remove_digits: Forwarded to ``remove_special_characters``.

    Returns:
        A new list with one cleaned document per input document, in the same
        order as ``corpus``.
    """
    # Compile the patterns once instead of once per document.
    # Only CR and LF belong in this class: a '|' inside [...] is a literal
    # pipe, not alternation, and would cause pipes to be replaced too.
    newline_pattern = re.compile(r'[\r\n]+')
    # Inside the class, "(-)" is a range from '(' to ')', i.e. just the two
    # parentheses; a literal hyphen is NOT matched by this pattern.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    mention_or_url_pattern = re.compile(r'@[A-Za-z0-9_]+|https?://[^ ]+')
    # The dot is escaped so that only a literal "www." prefix is matched.
    www_pattern = re.compile(r'www\.[^ ]+')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)

        # Decode raw bytes so the str-based steps below can work on them.
        # "utf-8-sig" drops a leading BOM; undecodable bytes become U+FFFD
        # and are then shown as '?'. Text that is already str is left alone.
        # NOTE(review): the original fell back to an undefined name
        # (`souped`) here, which raised NameError whenever decoding failed
        # or `doc` was already a str.
        if isinstance(doc, bytes):
            doc = doc.decode("utf-8-sig", "replace").replace(u"\ufffd", "?")

        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # replace line breaks with a single space
        doc = newline_pattern.sub(' ', doc)

        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)

        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around selected punctuation to isolate it
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # remove @mentions and URLs
        # NOTE(review): the source this was recovered from had lost its
        # indentation, so running this step regardless of
        # `special_char_removal` is an assumption - confirm. Also, because
        # it runs after the special-character step, '@', ':' and '/' may
        # already be gone by now; consider moving it earlier.
        doc = mention_or_url_pattern.sub('', doc)
        doc = www_pattern.sub('', doc)

        # remove extra whitespace
        doc = multi_space_pattern.sub(' ', doc)

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment