@syhw
Created July 17, 2014 14:55
Trying dropout with simple off-the-shelf scikit-learn models. Not really working.
from sklearn.datasets import fetch_20newsgroups, load_digits
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
vectors = vectorizer.fit_transform(newsgroups_train.data)
dense_vectors = vectors.todense()
dense_vectors = np.asarray(dense_vectors)
newsgroups_test = fetch_20newsgroups(subset='test')
vectors_test = vectorizer.transform(newsgroups_test.data)
digits = load_digits()
d_train_x, d_test_x, d_train_y, d_test_y = train_test_split(
    digits.data, digits.target, test_size=0.2)
DO_ALL = True
N_TIMES = 20  # number of dropped-out copies of the training set
DROPOUT_RATE = 0.5 # TODO explore 0.0->0.5
# class Dropout(object):
#     def __init__(self, p=0.5):
#         self.p = p
#
#     def fit(self, X, y):
#         return self
#
#     def transform(self, X):
#         return np.random.binomial(n=1, p=1.-self.p, size=X.shape) * X
#
#     def get_params(self, **kwargs):
#         return {"p": self.p}
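# If the class above were uncommented, one way it could be wired up (a rough,
# untested sketch, not used in the experiment below) is as the first step of a
# scikit-learn Pipeline; note that, as written, it would also mask features at
# predict time, whereas dropout should only perturb the training data:
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([("dropout", Dropout(p=DROPOUT_RATE)),
#                  ("clf", LogisticRegression())])
# pipe.fit(d_train_x, d_train_y)
# print metrics.f1_score(pipe.predict(d_test_x), d_test_y)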
for dname, x_train, y_train, x_test, y_test in (
        ('digits', d_train_x, d_train_y, d_test_x, d_test_y),
        ('20newsgroups', dense_vectors, newsgroups_train.target,
         vectors_test, newsgroups_test.target)):
    classifiers = [LogisticRegression(), SGDClassifier()]
    # the default penalty for LogisticRegression and SGDClassifier is L2,
    # and dropout approximately acts as an (adaptive) L2 regularizer
    if dname == '20newsgroups':
        classifiers += [MultinomialNB(alpha=0.01), BernoulliNB(alpha=0.01)]
    print "==> dataset name:", dname
    print "-> without dropout"
    if DO_ALL:
        for clf in classifiers:
            print clf
            clf.fit(x_train, y_train)
            pred = clf.predict(x_test)
            print metrics.f1_score(pred, y_test)
    # build N_TIMES dropped-out copies of the training set and stack them
    tmp_l = [x_train * np.random.binomial(n=1, p=1.-DROPOUT_RATE,
                                          size=x_train.shape)
             for _ in xrange(N_TIMES)]
    X = np.concatenate(tmp_l, axis=0)
    y = np.concatenate([y_train for _ in xrange(N_TIMES)], axis=0)
    print "-> now with", N_TIMES, "dropouts, with rate", DROPOUT_RATE
    classifiers = [LogisticRegression(C=1.E6), SGDClassifier(alpha=1.E-9)]
    # the default penalty for LogisticRegression and SGDClassifier is L2,
    # and dropout approximately acts as an (adaptive) L2 regularizer
    # ==> here we (nearly) remove the explicit L2 penalty
    if dname == '20newsgroups':
        classifiers += [MultinomialNB(alpha=0.01), BernoulliNB(alpha=0.01)]
    for clf in classifiers:
        print clf
        clf.fit(X, y)  # fit on the dropout-augmented training set
        pred = clf.predict(x_test)
        print metrics.f1_score(pred, y_test)
# Three things to keep in mind:
# - dropout is usually applied to the hidden units' activations, not to the
#   input features as done here (see the sketch below)
# - dropout helps most when models are strongly overfitting
# - dropout helps most with lots of data
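# A rough numpy sketch of the first point (illustrative only, not part of the
# run above): in a neural net, "inverted" dropout masks the *hidden*
# activations during training and rescales them, so nothing changes at test
# time; W, b and x_batch below are a hypothetical layer and input batch:
# h = np.maximum(0., x_batch.dot(W) + b)                   # hidden activations
# mask = np.random.binomial(n=1, p=1.-DROPOUT_RATE, size=h.shape)
# h_train = h * mask / (1. - DROPOUT_RATE)                 # training-time pass
# h_test = h                                               # test time: unchanged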
# ==> dataset name: digits
# -> without dropout
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                    intercept_scaling=1, penalty=l2, random_state=None, tol=0.0001)
# 0.96389747273
# SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
#               fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#               loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#               random_state=None, rho=None, shuffle=False, verbose=0,
#               warm_start=False)
# 0.93351689353
# -> now with 20 dropouts, with rate 0.5
# LogisticRegression(C=1000000.0, class_weight=None, dual=False,
#                    fit_intercept=True, intercept_scaling=1, penalty=l2,
#                    random_state=None, tol=0.0001)
# 0.935424946443
# SGDClassifier(alpha=1e-09, class_weight=None, epsilon=0.1, eta0=0.0,
#               fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#               loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#               random_state=None, rho=None, shuffle=False, verbose=0,
#               warm_start=False)
# 0.94890380291
# ==> dataset name: 20newsgroups
# -> without dropout
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                    intercept_scaling=1, penalty=l2, random_state=None, tol=0.0001)
# 0.810228116561
# SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
#               fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#               loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#               random_state=None, rho=None, shuffle=False, verbose=0,
#               warm_start=False)
# 0.813840047475
# MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
# 0.806747433797
# BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
# 0.71331798034
# -> now with 20 dropouts, with rate 0.5
# LogisticRegression(C=1000000.0, class_weight=None, dual=False,
#                    fit_intercept=True, intercept_scaling=1, penalty=l2,
#                    random_state=None, tol=0.0001)
# 0.813512528347
# SGDClassifier(alpha=1e-09, class_weight=None, epsilon=0.1, eta0=0.0,
#               fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#               loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#               random_state=None, rho=None, shuffle=False, verbose=0,
#               warm_start=False)
# 0.764192166602
# MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
# 0.806747433797
# BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
# 0.71331798034