# dcollien/classify_text.py

## classify_text.py
import nltk
from summa.keywords import keywords

def get_features(text):
    """Build the feature dict for a single document.

    Asks summa's ``keywords`` for the top 80% of the phrases in *text*
    together with their relevance scores, and returns the resulting
    (phrase, score) pairs as a dict.
    """
    scored_phrases = keywords(text, ratio=0.8, split=True, scores=True)
    return dict(scored_phrases)

def train_texts(classified_texts):
    """Train a naive Bayes classifier from labelled documents.

    classified_texts: iterable of ``(classification, text)`` pairs.

    Returns whatever ``nltk.NaiveBayesClassifier.train`` produces for the
    extracted training set.
    """
    # The trainer is fed (features, label) pairs, so the order of each input
    # pair is swapped while the features are extracted.
    training_set = [
        (get_features(document), label)
        for label, document in classified_texts
    ]
    return nltk.NaiveBayesClassifier.train(training_set)

def classify(classifier, text):
    """Return the label that *classifier* assigns to the document *text*."""
    document_features = get_features(text)
    return classifier.classify(document_features)

# Example: train on one 'spam' and one 'ham' document, then classify a document.
# NOTE(review): spam_text and ham_text are not defined in the visible part of
# this file -- they must be bound to the training documents before this runs.
classifier = train_texts([
    ('spam', spam_text),
    ('ham', ham_text)
])

# True when the trained classifier labels ham_text as 'spam'.
is_spam = classify(classifier, ham_text) == 'spam'
	import nltk
	from summa.keywords import keywords

	def get_features(text):
	    """Build the feature dict for a single document.

	    Requests the top 80% of the phrases in *text* from summa's
	    ``keywords``, scored by relevance, and returns the resulting
	    (phrase, score) pairs as a dict.
	    """
	    return dict(keywords(text, ratio=0.8, split=True, scores=True))

	def train_texts(classified_texts):
	    """Train a naive Bayes classifier from labelled documents.

	    classified_texts: iterable of ``(classification, text)`` pairs.

	    Returns the result of ``nltk.NaiveBayesClassifier.train`` on the
	    extracted training set.
	    """
	    # Process the training set: the trainer is given (features, label)
	    # pairs, so each input pair is swapped as its features are extracted.
	    features = [
	        (get_features(text), classification)
	        for classification, text in classified_texts
	    ]
	    return nltk.NaiveBayesClassifier.train(features)

	def classify(classifier, text):
	    """Classify a document: return the label *classifier* gives *text*."""
	    return classifier.classify(get_features(text))

	# Example: train on one 'spam' and one 'ham' document, then classify a document.
	# NOTE(review): spam_text and ham_text are not defined in the visible part of
	# this file -- they must be bound to the training documents before this runs.
	classifier = train_texts([
	('spam', spam_text),
	('ham', ham_text)
	])

	# True when the trained classifier labels ham_text as 'spam'.
	is_spam = classify(classifier, ham_text) == 'spam'