Skip to content

Instantly share code, notes, and snippets.

@JustinaPetr
Last active February 18, 2019 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JustinaPetr/68a05656c706641cc0b2108e0ab98c35 to your computer and use it in GitHub Desktop.
Save JustinaPetr/68a05656c706641cc0b2108e0ab98c35 to your computer and use it in GitHub Desktop.
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
class SentimentClassifier(object):
    """Naive Bayes sentiment classifier built on NLTK's ``twitter_samples``.

    Tweets are tokenized, lowercased, stripped of English stop words and
    lemmatized; every surviving token becomes a boolean "word present"
    feature for ``NaiveBayesClassifier``.
    """

    # Share of each class's tweets that goes into the training split; the
    # remainder forms the testing split.
    TRAIN_FRACTION = .8

    # Lazily built on first use so that constructing the class does not
    # touch the NLTK corpora, and so the stop-word list is not re-read for
    # every token.
    _stop_words = None
    _lemmatizer = None

    def preprocessing(self, sentence):
        """Turn a raw sentence into a list of normalized tokens.

        Steps: split on runs of ``\\w`` characters, lowercase, drop English
        stop words, lemmatize with WordNet.

        Args:
            sentence: The text to normalize.

        Returns:
            A list of lowercase, lemmatized tokens with stop words removed.
        """
        if self._stop_words is None:
            # A frozenset gives O(1) membership tests instead of scanning
            # the stop-word list once per token.
            self._stop_words = frozenset(stopwords.words('english'))
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()

        stop_words = self._stop_words
        lemmatize = self._lemmatizer.lemmatize

        raw_tokens = RegexpTokenizer(r'\w+').tokenize(sentence)
        # Lowercase BEFORE filtering: the stop-word list is lowercase, so
        # filtering first would let "The", "I", "This", ... slip through.
        lowered = (token.lower() for token in raw_tokens)
        content_tokens = [token for token in lowered if token not in stop_words]
        return [lemmatize(token) for token in content_tokens]

    def format_data(self, sentence):
        """Convert a sentence into a feature dict of ``{token: True}``.

        Args:
            sentence: The text to featurize.

        Returns:
            A dict mapping each preprocessed token to ``True``.
        """
        return {word: True for word in self.preprocessing(sentence)}

    def train_test_split(self, pos_tweets, neg_tweets):
        """Label the tweets and split each class into train/test portions.

        The first ``TRAIN_FRACTION`` of each class (in the given order, no
        shuffling) goes to training and the rest to testing, so both splits
        keep the class balance of the input.

        Args:
            pos_tweets: Sequence of positive tweet strings.
            neg_tweets: Sequence of negative tweet strings.

        Returns:
            A ``(training, testing)`` tuple of lists of
            ``(feature_dict, label)`` pairs, where label is ``'pos'`` or
            ``'neg'``. Within each list the positive examples come first.
        """
        pos_examples = [(self.format_data(tweet), 'pos') for tweet in pos_tweets]
        neg_examples = [(self.format_data(tweet), 'neg') for tweet in neg_tweets]

        pos_cut = int(self.TRAIN_FRACTION * len(pos_examples))
        neg_cut = int(self.TRAIN_FRACTION * len(neg_examples))

        training = pos_examples[:pos_cut] + neg_examples[:neg_cut]
        testing = pos_examples[pos_cut:] + neg_examples[neg_cut:]
        return training, testing

    def train(self):
        """Train a Naive Bayes classifier on the ``twitter_samples`` corpus.

        Reads ``positive_tweets.json`` and ``negative_tweets.json`` through
        ``twitter_samples.strings`` and trains on the training split only.
        NOTE(review): presumably requires the ``twitter_samples``,
        ``stopwords`` and ``wordnet`` NLTK data to be downloaded beforehand.

        Returns:
            The trained ``NaiveBayesClassifier``.
        """
        pos_tweets = twitter_samples.strings('positive_tweets.json')
        neg_tweets = twitter_samples.strings('negative_tweets.json')
        # The held-out testing split is not used here; only the training
        # portion feeds the classifier.
        training, _ = self.train_test_split(pos_tweets, neg_tweets)
        return NaiveBayesClassifier.train(training)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment