# oliver-batey/custom_transformers.py

## custom_transformers.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class CountWords(BaseEstimator, TransformerMixin):
    """Append a word-count column to a series of text documents.

    ``transform`` returns a two-column dataframe: the input series next to a
    new column, named ``new_col_name``, holding the number of
    whitespace-separated words in each document.

    Parameters
    ----------
    new_col_name : str, default="n_words"
        Name given to the generated word-count column.
    """

    def __init__(self, new_col_name="n_words"):
        self.new_col_name = new_col_name

    def fit(self, series, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, series):
        """Return ``series`` and its per-document word counts side by side."""
        # ``str.split()`` without an argument splits on any run of whitespace
        # and drops empty tokens, so repeated, leading or trailing spaces do
        # not inflate the count and an empty document counts as zero words.
        n_words_col = series.apply(lambda text: len(text.split())).rename(
            self.new_col_name
        )
        return pd.concat([series, n_words_col], axis=1)


class MeanWordLength(BaseEstimator, TransformerMixin):
    """Add a ``mean_word_length`` column computed from a text column.

    Parameters
    ----------
    text_column : label
        Column of the input dataframe that holds the text documents.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, dataframe, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    @staticmethod
    def _mean_word_length(text):
        """Return the average length of the whitespace-separated words in ``text``."""
        # Split once, on any whitespace run, so empty tokens produced by
        # repeated or surrounding spaces do not drag the mean down.
        words = text.split()
        if not words:
            # A document without words has mean length 0.0 (no division by zero).
            return 0.0
        return sum(map(len, words)) / len(words)

    def transform(self, dataframe):
        """Return a copy of ``dataframe`` with a ``mean_word_length`` column.

        The input dataframe is left untouched; ``DataFrame.assign`` builds a
        new object instead of writing the column into the caller's data.
        """
        mean_lengths = dataframe[self.text_column].apply(self._mean_word_length)
        return dataframe.assign(mean_word_length=mean_lengths)


class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns listed in ``attribute_names``.

    ``transform`` indexes the input dataframe with ``attribute_names`` and
    returns the ``.values`` of that selection.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the dataframe in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, dataframe, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, dataframe):
        """Return the values of the ``attribute_names`` selection."""
        selected = dataframe[self.attribute_names]
        return selected.values
	from sklearn.base import BaseEstimator, TransformerMixin


	class CountWords(BaseEstimator, TransformerMixin):
		"""Append a word-count column to a series of text documents.

		``transform`` returns a two-column dataframe: the input series next to
		a new column, named ``new_col_name``, holding the number of
		whitespace-separated words in each document.

		Parameters
		----------
		new_col_name : str, default="n_words"
			Name given to the generated word-count column.
		"""

		def __init__(self, new_col_name="n_words"):
			self.new_col_name = new_col_name

		def fit(self, series, y=None):
			"""Nothing is learned from the data; return ``self`` unchanged."""
			return self

		def transform(self, series):
			"""Return ``series`` and its per-document word counts side by side."""
			# ``str.split()`` without an argument splits on any run of
			# whitespace and drops empty tokens, so repeated, leading or
			# trailing spaces do not inflate the count and an empty document
			# counts as zero words.
			n_words_col = series.apply(lambda text: len(text.split())).rename(
				self.new_col_name
			)
			return pd.concat([series, n_words_col], axis=1)


	class MeanWordLength(BaseEstimator, TransformerMixin):
		"""Add a ``mean_word_length`` column computed from a text column.

		Parameters
		----------
		text_column : label
			Column of the input dataframe that holds the text documents.
		"""

		def __init__(self, text_column):
			self.text_column = text_column

		def fit(self, dataframe, y=None):
			"""Nothing is learned from the data; return ``self`` unchanged."""
			return self

		@staticmethod
		def _mean_word_length(text):
			"""Return the average length of the whitespace-separated words in ``text``."""
			# Split once, on any whitespace run, so empty tokens produced by
			# repeated or surrounding spaces do not drag the mean down.
			words = text.split()
			if not words:
				# A document without words has mean length 0.0 (no division by zero).
				return 0.0
			return sum(map(len, words)) / len(words)

		def transform(self, dataframe):
			"""Return a copy of ``dataframe`` with a ``mean_word_length`` column.

			The input dataframe is left untouched; ``DataFrame.assign`` builds
			a new object instead of writing the column into the caller's data.
			"""
			mean_lengths = dataframe[self.text_column].apply(self._mean_word_length)
			return dataframe.assign(mean_word_length=mean_lengths)


	class FeatureSelector(BaseEstimator, TransformerMixin):
		"""Keep only the columns listed in ``attribute_names``.

		``transform`` indexes the input dataframe with ``attribute_names`` and
		returns the ``.values`` of that selection.
		"""

		def __init__(self, attribute_names):
			# Column label(s) used to index the dataframe in ``transform``.
			self.attribute_names = attribute_names

		def fit(self, dataframe, y=None):
			"""Nothing is learned from the data; return ``self`` unchanged."""
			return self

		def transform(self, dataframe):
			"""Return the values of the ``attribute_names`` selection."""
			return dataframe[self.attribute_names].values