# yuyasugano/preprocessing.py

## preprocessing.py
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocessing(tweet):
    """Tokenize, filter and lemmatize a single tweet.

    Steps, in order:

    1. Split the tweet into words with TextBlob, whose ``words`` property
       leaves out hashtags and other punctuation.
    2. Drop the literal token ``user`` and every token that contains
       anything other than letters or underscores.
    3. Drop English stopwords (compared case-insensitively).
    4. Lemmatize each remaining word as a verb.

    Args:
        tweet: Raw tweet text.

    Returns:
        A list of lemmatized word strings, in their original order.
    """
    # Join and re-split so tokens are delimited purely by whitespace,
    # exactly as the downstream filters expect.
    tokens = ' '.join(TextBlob(tweet).words).split()

    # Build the stopword lookup once per call; re-reading the corpus list
    # for every token is needlessly slow and yields the same result.
    english_stopwords = frozenset(stopwords.words('english'))

    # Matches tokens made up solely of word characters that are not digits.
    is_clean = re.compile(r'[^\W\d]*$').match

    kept = [
        token for token in tokens
        if token != 'user'
        and is_clean(token)
        and token.lower() not in english_stopwords
    ]

    # 'v' asks WordNet to treat every word as a verb when lemmatizing.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, 'v') for token in kept]
	import nltk
	nltk.download('punkt')
	nltk.download('wordnet')
	nltk.download('stopwords')

	def preprocessing(tweet):
		"""Tokenize, filter and lemmatize a single tweet.

		The tweet is split into words with TextBlob (whose ``words`` property
		leaves out hashtags and other punctuation), the literal token ``user``
		and any token containing characters other than letters or underscores
		are dropped, English stopwords are removed case-insensitively, and
		the remaining words are lemmatized as verbs.

		Args:
			tweet: Raw tweet text.

		Returns:
			A list of lemmatized word strings, in their original order.
		"""
		# Join and re-split so tokens are delimited purely by whitespace.
		tokens = ' '.join(TextBlob(tweet).words).split()

		# Build the stopword lookup once per call rather than once per token.
		english_stopwords = frozenset(stopwords.words('english'))

		# Matches tokens made up solely of word characters that are not digits.
		is_clean = re.compile(r'[^\W\d]*$').match

		kept = [
			token for token in tokens
			if token != 'user'
			and is_clean(token)
			and token.lower() not in english_stopwords
		]

		# 'v' asks WordNet to treat every word as a verb when lemmatizing.
		lemmatizer = WordNetLemmatizer()
		return [lemmatizer.lemmatize(token, 'v') for token in kept]