# hkaraoguz/text_preprocess.py

## text_preprocess.py
import string
import re
import nltk

def preprocess_text(text: str) -> list:
    """Normalize raw text into a list of Porter-stemmed word tokens.

    Steps, in order:

    1. Lowercase the text.
    2. Drop ``@mentions`` and ``http://`` / ``https://`` links (each match
       runs up to the next whitespace character).
    3. Delete every character listed in ``string.punctuation``.
    4. Split the result into words with ``nltk.word_tokenize``.
    5. Stem each word with NLTK's ``PorterStemmer``.

    Args:
        text: The raw input string.

    Returns:
        The stemmed tokens, in the order they appear in ``text``.
    """
    text = text.lower()

    # Mentions and links must be removed before punctuation is stripped:
    # the pattern anchors on "@" and "://", which are themselves punctuation.
    text = re.sub(r"(?:@|https?://)\S+", "", text)

    # Delete all ASCII punctuation characters in a single pass.
    text = text.translate(str.maketrans("", "", string.punctuation))

    # NOTE(review): word_tokenize presumably relies on NLTK's tokenizer data
    # being installed -- confirm for the deployment environment.
    words = nltk.word_tokenize(text)

    porter = nltk.stem.porter.PorterStemmer()
    return [porter.stem(word) for word in words]