Davisy/clean_method.py

## clean_method.py
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"http\S+", " link ", text)
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)  # remove numbers
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
    # Optionally, remove stop words
    if remove_stop_words:
        # load stopwords
        stop_words = stopwords.words("english")
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    # Return a list of words
    return text
	def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
	# Clean the text, with the option to remove stop_words and to lemmatize word
	# Clean the text
	text = re.sub(r"[^A-Za-z0-9]", " ", text)
	text = re.sub(r"\'s", " ", text)
	text = re.sub(r"http\S+", " link ", text)
	text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text) # remove numbers
	# Remove punctuation from text
	text = "".join([c for c in text if c not in punctuation])
	# Optionally, remove stop words
	if remove_stop_words:
	# load stopwords
	stop_words = stopwords.words("english")
	text = text.split()
	text = [w for w in text if not w in stop_words]
	text = " ".join(text)
	# Optionally, shorten words to their stems
	if lemmatize_words:
	text = text.split()
	lemmatizer = WordNetLemmatizer()
	lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
	text = " ".join(lemmatized_words)
	# Return a list of words
	return text