# vinovator/nltk_name_classifier.py

## nltk_name_classifier.py
# nltk_name_classifier.py
# Python 2.7.6

"""
Classifier to determine the gender of a name using NLTK library

Classification - task of choosing the correct class label for a given input.

Supervised classifier:
Classifier that is built on training corpora containing the correct label
for each input.

Steps to create a classifier:
1) Decide what features of input are relevant and how to encode those features
    a. A feature extractor function is built to return a dictionary containing
     relevant information. The returned dictionary is known as feature set.
    b. Feature name is a case-sensitive string that provides human readable
     description of a feature (e.g. last_letter)
    c. Feature values are the extracted values of simple types, such as
     Booleans, numbers and strings.
2) Prepare a list of examples and corresponding class labels
3) Use the feature extractor to process the data and divide the resulting list
 of feature sets into a training set and test set
"""


import nltk
# Corpus which consists of male and female names dataset
from nltk.corpus import names
# For shuffling
import random


def gender_features(word):
    """Return the feature set used by the name classifier.

    The only feature extracted is the last letter of the name, stored
    under the feature name "last_letter".

    word -- the name to extract the feature from (a string)

    Returns a dictionary of the form {"last_letter": <final character>}.
    An empty name yields an empty string as the feature value instead of
    raising IndexError.
    """
    # Slicing (rather than indexing with [-1]) tolerates an empty name and
    # gives the same one-character result for every non-empty name.
    return {"last_letter": word[-1:]}  # feature set


if __name__ == "__main__":
    # Script entry point: build, train and evaluate the name classifier.
    #
    # Every print below uses the single-argument, parenthesised form, which
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.

    # Extract the data sets: each name is labelled with the gender of the
    # corpus file it was read from.
    labeled_names = ([(name, "male") for name in names.words("male.txt")] +
                     [(name, "female") for name in names.words("female.txt")])

    print(len(labeled_names))  # 7944 names

    # Shuffle the names in the list; the list is built males-first, so an
    # unshuffled split below would put only male names in the test set.
    random.shuffle(labeled_names)

    # Process the names through feature extractor
    feature_sets = [(gender_features(n), gender)
                    for (n, gender) in labeled_names]

    # Divide the feature sets into training and test sets:
    # the first 500 entries are held out for testing, the rest train.
    train_set, test_set = feature_sets[500:], feature_sets[:500]

    # Train the naiveBayes classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Try the classifier on a few sample names
    print(classifier.classify(gender_features("neo")))  # returns male
    print(classifier.classify(gender_features("trinity")))  # returns female

    # Test the accuracy of the classifier on the test data
    print(nltk.classify.accuracy(classifier, test_set))  # returns 0.78 for now

    # Examine the classifier to determine which feature is most effective
    # for distinguishing the name's gender.  show_most_informative_features()
    # prints its table itself and returns None, so it is called directly;
    # wrapping it in print would emit a stray "None" line.
    classifier.show_most_informative_features(5)
	# nltk_name_classifier.py
	# Python 2.7.6

	"""
	Classifier to determine the gender of a name using NLTK library

	Classification - task of choosing the correct class label for a given input.

	Supervised classifier:
	Classifier that is built on training corpora containing the correct label
	for each input.

	Steps to create a classifier:
	1) Decide what features of input are relevant and how to encode those features
	a. A feature extractor function is built to return a dictionary containing
	relevant information. The returned dictionary is known as feature set.
	b. Feature name is a case-sensitive string that provides human readable
	description of a feature (e.g. last_letter)
	c. Feature values are the extracted values of simple types, such as
	Booleans, numbers and strings.
	2) Prepare a list of examples and corresponding class labels
	3) Use the feature extractor to process the data and divide the resulting list
	of feature sets into a training set and test set
	"""


	import nltk
	# Corpus which consists of male and female names dataset
	from nltk.corpus import names
	# For shuffling
	import random


	def gender_features(word):
	    """Return the feature set used by the name classifier.

	    The only feature extracted is the last letter of the name, stored
	    under the feature name "last_letter".

	    word -- the name to extract the feature from (a string)

	    Returns a dictionary of the form {"last_letter": <final character>}.
	    An empty name yields an empty string as the feature value instead
	    of raising IndexError.
	    """
	    # Slicing (rather than indexing with [-1]) tolerates an empty name
	    # and gives the same one-character result for every non-empty name.
	    return {"last_letter": word[-1:]}  # feature set


	if __name__ == "__main__":
	    # Script entry point: build, train and evaluate the name classifier.
	    #
	    # Every print below uses the single-argument, parenthesised form,
	    # which produces the same output under the Python 2 print statement
	    # and the Python 3 print function.

	    # Extract the data sets: each name is labelled with the gender of
	    # the corpus file it was read from.
	    labeled_names = (
	        [(name, "male") for name in names.words("male.txt")] +
	        [(name, "female") for name in names.words("female.txt")])

	    print(len(labeled_names))  # 7944 names

	    # Shuffle the names in the list; the list is built males-first, so
	    # an unshuffled split below would put only male names in the test set.
	    random.shuffle(labeled_names)

	    # Process the names through feature extractor
	    feature_sets = [(gender_features(n), gender)
	                    for (n, gender) in labeled_names]

	    # Divide the feature sets into training and test sets:
	    # the first 500 entries are held out for testing, the rest train.
	    train_set, test_set = feature_sets[500:], feature_sets[:500]

	    # Train the naiveBayes classifier
	    classifier = nltk.NaiveBayesClassifier.train(train_set)

	    # Try the classifier on a few sample names
	    print(classifier.classify(gender_features("neo")))  # returns male
	    print(classifier.classify(gender_features("trinity")))  # returns female

	    # Test the accuracy of the classifier on the test data
	    print(nltk.classify.accuracy(classifier, test_set))  # returns 0.78 for now

	    # Examine the classifier to determine which feature is most
	    # effective for distinguishing the name's gender.
	    # show_most_informative_features() prints its table itself and
	    # returns None, so it is called directly; wrapping it in print
	    # would emit a stray "None" line.
	    classifier.show_most_informative_features(5)