# gwerbin/pandas-sklearn-workflow.py

## pandas-sklearn-workflow.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline

# Fixed seed so that anything drawing from `rng` is repeatable between runs.
seed = 43770
# Module-level generator initialised from `seed`.
rng = np.random.RandomState(seed=seed)


class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    ``variables`` is stored as given and later used as the key in
    ``data[variables]``, so it may be anything the input container accepts
    as an index (for a DataFrame, typically a list of column names).
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, x, y=None):
        """Learn nothing from ``x``/``y``; return the transformer itself."""
        return self

    def transform(self, data):
        """Return ``data`` indexed by ``self.variables``."""
        subset = data[self.variables]
        return subset


class Classifier():
    """Build and fit ``Selector -> estimator`` pipelines for one estimator type.

    Parameters
    ----------
    classifier : callable
        Estimator class (or any factory) called as ``classifier(**kwargs)``.
    **kwargs
        Keyword arguments forwarded to ``classifier`` every time a new
        estimator is created.

    Every pipeline returned by ``make_pipe``/``fit`` owns its *own* estimator.
    Re-using a single estimator object across pipelines would make every
    previously returned pipeline reflect only the most recent ``fit`` call,
    because fitting mutates the estimator in place.
    """

    def __init__(self, classifier, **kwargs):
        # Remember the recipe so each pipeline can get a fresh estimator.
        self._estimator_factory = classifier
        self._estimator_kwargs = kwargs
        # Kept for backward compatibility: an unfitted template instance.
        # It is never placed inside a pipeline.
        self.classifier = classifier(**kwargs)

    def _new_estimator(self):
        """Return a new, unfitted estimator built from the stored recipe."""
        return self._estimator_factory(**self._estimator_kwargs)

    def make_pipe(self, x_vars):
        """Return an unfitted pipeline that selects ``x_vars`` then classifies."""
        return Pipeline([
            ('data', Selector(x_vars)),
            ('cf', self._new_estimator()),
        ])

    def fit(self, x_vars, y_var, data):
        """Fit a new pipeline on ``data[x_vars]`` against ``data[y_var]``.

        Returns the fitted pipeline (the return value of ``Pipeline.fit``).
        """
        pipe = self.make_pipe(x_vars)
        return pipe.fit(data[x_vars], data[y_var])

## Create the dictionary of classifiers
# `seed` is passed as random_state so repeated runs build identical ensembles.
classifiers = {
    'rf': Classifier(
        RandomForestClassifier,
        n_estimators=100,
        oob_score=True,
        bootstrap=True,
        random_state=seed,
    ),
    'ab': Classifier(AdaBoostClassifier, n_estimators=50, random_state=seed),
}

## Load the data
# Load the dataset once and reuse it for values, column names and target.
iris_bunch = load_iris()
iris = pd.DataFrame(
    iris_bunch.data,
    # Strip the unit suffix, e.g. "sepal width (cm)" -> "sepal width".
    columns=[name.replace(" (cm)", "") for name in iris_bunch.feature_names],
)
iris['species'] = iris_bunch.target

## Choose the features to use
features = {
    '0': ['sepal width', 'sepal length'],
    '1': ['sepal width', 'sepal length', 'petal width', 'petal length'],
}

## Loop and fit
# results[feature_set_key][classifier_key] -> fitted pipeline
results = dict()
for k, x_vars in features.items():
    results[k] = dict()
    for m, model in classifiers.items():
        print(len(x_vars))
        results[k][m] = model.fit(x_vars, 'species', iris)

## Check how many features each fitted random forest actually saw
# NOTE: both counts come out equal if the two pipelines share one estimator
# object, because the later fit overwrites the state left by the earlier one.
print(
    'Number of features in X0: %i (should be 2)' % len(results['0']['rf'].named_steps['cf'].feature_importances_),
    'Number of features in X1: %i (should be 4)' % len(results['1']['rf'].named_steps['cf'].feature_importances_),
    sep='\n'
)
	import numpy as np
	import pandas as pd
	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.datasets import load_iris
	from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
	from sklearn.pipeline import Pipeline

	# Fixed seed so that anything drawing from `rng` is repeatable between runs.
	seed = 43770
	# Module-level generator initialised from `seed`.
	rng = np.random.RandomState(seed=seed)


	class Selector(BaseEstimator, TransformerMixin):
		"""Stateless transformer that keeps only the configured columns.

		``variables`` is stored as given and later used as the key in
		``data[variables]``, so it may be anything the input container
		accepts as an index (for a DataFrame, typically a list of column names).
		"""

		def __init__(self, variables):
			self.variables = variables

		def fit(self, x, y=None):
			"""Learn nothing from ``x``/``y``; return the transformer itself."""
			return self

		def transform(self, data):
			"""Return ``data`` indexed by ``self.variables``."""
			return data[self.variables]


	class Classifier():
		"""Build and fit ``Selector -> estimator`` pipelines for one estimator type.

		Parameters
		----------
		classifier : callable
			Estimator class (or any factory) called as ``classifier(**kwargs)``.
		**kwargs
			Keyword arguments forwarded to ``classifier`` every time a new
			estimator is created.

		Every pipeline returned by ``make_pipe``/``fit`` owns its *own*
		estimator. Re-using a single estimator object across pipelines would
		make every previously returned pipeline reflect only the most recent
		``fit`` call, because fitting mutates the estimator in place.
		"""

		def __init__(self, classifier, **kwargs):
			# Remember the recipe so each pipeline can get a fresh estimator.
			self._estimator_factory = classifier
			self._estimator_kwargs = kwargs
			# Kept for backward compatibility: an unfitted template instance.
			# It is never placed inside a pipeline.
			self.classifier = classifier(**kwargs)

		def _new_estimator(self):
			"""Return a new, unfitted estimator built from the stored recipe."""
			return self._estimator_factory(**self._estimator_kwargs)

		def make_pipe(self, x_vars):
			"""Return an unfitted pipeline that selects ``x_vars`` then classifies."""
			return Pipeline([
				('data', Selector(x_vars)),
				('cf', self._new_estimator()),
			])

		def fit(self, x_vars, y_var, data):
			"""Fit a new pipeline on ``data[x_vars]`` against ``data[y_var]``.

			Returns the fitted pipeline (the return value of ``Pipeline.fit``).
			"""
			pipe = self.make_pipe(x_vars)
			return pipe.fit(data[x_vars], data[y_var])

	## Create the dictionary of classifiers
	# `seed` is passed as random_state so repeated runs build identical ensembles.
	classifiers = {
		'rf': Classifier(
			RandomForestClassifier,
			n_estimators=100,
			oob_score=True,
			bootstrap=True,
			random_state=seed,
		),
		'ab': Classifier(AdaBoostClassifier, n_estimators=50, random_state=seed),
	}

	## Load the data
	# Load the dataset once and reuse it for values, column names and target.
	iris_bunch = load_iris()
	iris = pd.DataFrame(
		iris_bunch.data,
		# Strip the unit suffix, e.g. "sepal width (cm)" -> "sepal width".
		columns=[name.replace(" (cm)", "") for name in iris_bunch.feature_names],
	)
	iris['species'] = iris_bunch.target

	## Choose the features to use
	features = {
		'0': ['sepal width', 'sepal length'],
		'1': ['sepal width', 'sepal length', 'petal width', 'petal length'],
	}

	## Loop and fit
	# results[feature_set_key][classifier_key] -> fitted pipeline
	results = dict()
	for k, x_vars in features.items():
		results[k] = dict()
		for m, model in classifiers.items():
			print(len(x_vars))
			results[k][m] = model.fit(x_vars, 'species', iris)

	## Check how many features each fitted random forest actually saw
	# NOTE: both counts come out equal if the two pipelines share one estimator
	# object, because the later fit overwrites the state left by the earlier one.
	print(
		'Number of features in X0: %i (should be 2)' % len(results['0']['rf'].named_steps['cf'].feature_importances_),
		'Number of features in X1: %i (should be 4)' % len(results['1']['rf'].named_steps['cf'].feature_importances_),
		sep='\n'
	)