# himlohiya/normalize_corpus.py

## normalize_corpus.py
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document in ``corpus`` and return the cleaned texts.

    Each document goes through the enabled steps in this fixed order:
    HTML stripping, byte decoding, accent removal, contraction expansion,
    lower-casing, newline collapsing, lemmatization, special-character
    handling (which also drops @mentions and URLs), space collapsing and
    stop-word removal.

    Args:
        corpus: Iterable of documents. ``str`` documents are used as-is;
            ``bytes``/``bytearray`` documents are decoded as UTF-8.
        html_stripping: If true, call ``strip_html_tags`` on the document.
        contraction_expansion: If true, call ``expand_contractions``.
        accented_char_removal: If true, call ``remove_accented_chars``.
        text_lower_case: If true, lower-case the document.
        text_lemmatization: If true, call ``lemmatize_text``.
        special_char_removal: If true, strip @mentions and URLs, put
            spaces around the characters ``{ } . ( ) !`` and call
            ``remove_special_characters``.
        stopword_removal: If true, call ``remove_stopwords``.
        remove_digits: Forwarded to ``remove_special_characters``.

    Returns:
        A list with one normalized string per input document, in input
        order.
    """
    # All patterns are loop-invariant, so compile them once up front.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE: inside the class, "(-)" is the range '(' .. ')', so the set of
    # isolated characters is { } . ( ) ! -- a literal hyphen is not included.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    mention_or_url_pattern = re.compile(r'@[A-Za-z0-9_]+|https?://[^ ]+')
    www_pattern = re.compile(r'www\.[^ ]+')
    space_run_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)

        # Only bytes-like documents need decoding; ``str`` has no
        # ``decode`` on Python 3.  Undecodable bytes become "?" instead
        # of aborting the whole run.
        if isinstance(doc, (bytes, bytearray)):
            doc = doc.decode("utf-8-sig", errors="replace")
            doc = doc.replace(u"\ufffd", "?")

        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # collapse runs of CR/LF into a single space
        doc = newline_pattern.sub(' ', doc)

        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)

        # remove special characters and/or digits
        if special_char_removal:
            # Drop @mentions and URLs first: the isolation step below puts
            # spaces around '.', which would otherwise cut every URL short
            # at its first dot.
            doc = mention_or_url_pattern.sub('', doc)
            doc = www_pattern.sub('', doc)
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # remove extra whitespace
        doc = space_run_pattern.sub(' ', doc)

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus