Manually train an NLTK NaiveBayes Classifier
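The idea: instead of calling NaiveBayesClassifier.train, construct the two distributions the constructor expects by hand — a label_probdist (the prior over labels) and a feature_probdist mapping (label, word) pairs to probability distributions — here using chi-squared association scores as stand-in feature "probabilities".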
from nltk.probability import DictionaryProbDist
from nltk import NaiveBayesClassifier
from nltk import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
train_samples = {
    'I hate you and you are a bad person': 'neg',
    'I love you and you are a good person': 'pos',
    'I fail at everything and I want to kill people': 'neg',
    'I win at everything and I want to love people': 'pos',
    'sad things are happening. fml': 'neg',
    'good things are happening. gbu': 'pos',
    'I am so poor': 'neg',
    'I am so rich': 'pos',
    'I hate you mommy ! You are my terrible person': 'neg',
    'I love you mommy ! You are my amazing person': 'pos',
    'I want to kill butterflies since they make me sad': 'neg',
    'I want to chase butterflies since they make me happy': 'pos',
    'I want to hurt bunnies': 'neg',
    'I want to hug bunnies': 'pos',
    'You make me frown': 'neg',
    'You make me smile': 'pos',
}
test_samples = [
    'You are a terrible person and everything you do is bad',
    'I love you all and you make me happy',
    'I frown whenever I see you in a poor state of mind',
    'Finally getting rich from my ideas. They make me smile.',
    'My mommy is poor',
    'I love butterflies. Yay for happy',
    'Everything is fail today and I hate stuff',
]
def gen_bow(text):
    """Turn a string into a presence-only bag-of-words feature set."""
    words = text.split()
    bow = {}
    for word in words:
        bow[word.lower()] = True
    return bow
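For example, gen_bow('You make me smile') returns {'you': True, 'make': True, 'me': True, 'smile': True} — presence-only features, no counts.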
def get_feature_probdist(samples):
    """Build the (label, word) -> ProbDist mapping the classifier expects."""
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for words, label in samples.items():
        for word in words.split():
            word_fd[word.lower()] += 1
            label_word_fd[label][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    feature_probdist = {}
    for word, freq in word_fd.items():
        # Score each word's association with each label via chi-squared,
        # then use the raw scores as (unnormalized) feature "probabilities".
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        feature_probdist[('neg', word)] = DictionaryProbDist({True: neg_score})
        feature_probdist[('pos', word)] = DictionaryProbDist({True: pos_score})
    return feature_probdist
feature_probdist = get_feature_probdist(train_samples)
print(feature_probdist)

# Uniform prior over the two labels.
label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

for sample in test_samples:
    dist = classifier.prob_classify(gen_bow(sample))
    print('%s | %s | pos=%.4f neg=%.4f' % (sample, dist.max(),
                                           dist.prob('pos'), dist.prob('neg')))

classifier.show_most_informative_features()
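For comparison, the conventional route is to hand NLTK labeled feature sets and let NaiveBayesClassifier.train estimate both distributions itself. A minimal sketch reusing gen_bow and the samples above:

# Standard training path: NLTK derives label_probdist and
# feature_probdist internally from (featureset, label) pairs.
train_set = [(gen_bow(text), label) for text, label in train_samples.items()]
trained = NaiveBayesClassifier.train(train_set)

for sample in test_samples:
    print('%s -> %s' % (sample, trained.classify(gen_bow(sample))))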