Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
class SentimentClassifier(object):
    """Naive Bayes positive/negative tweet classifier built on NLTK.

    Sentences are turned into bag-of-words feature dicts and a
    ``NaiveBayesClassifier`` is trained on NLTK's ``twitter_samples``
    corpus.  The NLTK ``twitter_samples``, ``stopwords`` and ``wordnet``
    resources are expected to be available locally.
    """

    # Fraction of each label's tweets that goes into the training split.
    _TRAIN_FRACTION = 0.8

    # Lazily populated per-instance cache of English stop words.
    _stop_words = None

    def preprocessing(self, sentence):
        """Tokenize, lowercase, strip stop words from and lemmatize *sentence*.

        Args:
            sentence: Raw text to normalise.

        Returns:
            A list of lowercase, lemmatized tokens with English stop words
            removed, in their original order.
        """
        if self._stop_words is None:
            # Load the corpus once and keep it as a set: the original code
            # re-read the stop-word list and scanned it for every token.
            self._stop_words = frozenset(stopwords.words('english'))

        tokens = RegexpTokenizer(r'\w+').tokenize(sentence)
        lemmatizer = WordNetLemmatizer()

        # Lowercase BEFORE filtering: the stop-word list is lowercase, so
        # filtering first would let "The", "I", "And", ... slip through.
        lowered = (token.lower() for token in tokens)
        return [
            lemmatizer.lemmatize(token)
            for token in lowered
            if token not in self._stop_words
        ]

    def format_data(self, sentence):
        """Return the feature dict ``{token: True}`` for *sentence*.

        This is the feature-set shape passed to
        ``NaiveBayesClassifier.train`` by :meth:`train`.
        """
        return {word: True for word in self.preprocessing(sentence)}

    def train_test_split(self, pos_tweets, neg_tweets):
        """Label the tweets and split each label 80/20 into train/test.

        Args:
            pos_tweets: Sequence of positive tweet strings.
            neg_tweets: Sequence of negative tweet strings.

        Returns:
            A ``(training, testing)`` tuple of lists of
            ``(features, label)`` pairs, where label is ``'pos'`` or
            ``'neg'``.  Within each list the positive examples precede the
            negative ones; no shuffling is performed.
        """
        pos_set = [(self.format_data(tweet), 'pos') for tweet in pos_tweets]
        neg_set = [(self.format_data(tweet), 'neg') for tweet in neg_tweets]

        # Split each label separately so both splits keep the same
        # positive/negative proportions as the input.
        pos_cut = int(self._TRAIN_FRACTION * len(pos_set))
        neg_cut = int(self._TRAIN_FRACTION * len(neg_set))

        training = pos_set[:pos_cut] + neg_set[:neg_cut]
        testing = pos_set[pos_cut:] + neg_set[neg_cut:]
        return training, testing

    def train(self):
        """Train a Naive Bayes classifier on the ``twitter_samples`` corpus.

        Returns:
            The ``NaiveBayesClassifier`` trained on the training portion
            of the positive and negative sample tweets.
        """
        pos_tweets = twitter_samples.strings('positive_tweets.json')
        neg_tweets = twitter_samples.strings('negative_tweets.json')
        # The held-out portion is not used here; only the training split
        # feeds the classifier.
        training, _ = self.train_test_split(pos_tweets, neg_tweets)
        return NaiveBayesClassifier.train(training)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment