Skip to content

Instantly share code, notes, and snippets.

@oliver-batey
Last active March 22, 2022 13:17
Show Gist options
  • Save oliver-batey/32cb51619b44b8a4ef380c1c109e0249 to your computer and use it in GitHub Desktop.
Save oliver-batey/32cb51619b44b8a4ef380c1c109e0249 to your computer and use it in GitHub Desktop.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
class CountWords(BaseEstimator, TransformerMixin):
    """Build a dataframe from a series of text documents plus a word count.

    The result holds the original series next to a new column, named by
    ``new_col_name``, that contains the number of words in each document.
    """

    def __init__(self, new_col_name):
        """Remember the name to give the word-count column."""
        self.new_col_name = new_col_name

    def fit(self, series, y=None):
        """Return the transformer unchanged; nothing is learned from the data."""
        return self

    def transform(self, series):
        """Return ``series`` concatenated column-wise with its word counts.

        A word is a maximal run of non-whitespace characters, so repeated
        spaces, tabs, newlines and leading/trailing whitespace do not inflate
        the count, and an empty document counts as 0 words.
        """
        # split() with no separator collapses whitespace runs; split(" ")
        # would yield empty strings for them and report 1 word for "".
        n_words_col = series.apply(lambda text: len(text.split())).rename(
            self.new_col_name
        )
        return pd.concat([series, n_words_col], axis=1)
class MeanWordLength(BaseEstimator, TransformerMixin):
    """Add a ``mean_word_length`` column derived from a text column.

    ``text_column`` names the column of the incoming dataframe that holds the
    text whose words are measured.
    """

    def __init__(self, text_column):
        """Remember which column contains the text."""
        self.text_column = text_column

    def fit(self, dataframe, y=None):
        """Return the transformer unchanged; nothing is learned from the data."""
        return self

    @staticmethod
    def _mean_word_length(text):
        """Return the average length of the whitespace-separated words in ``text``.

        Returns 0.0 when ``text`` contains no words (empty or all whitespace).
        """
        # Split once on any whitespace so empty tokens from repeated spaces
        # do not drag the mean down.
        words = text.split()
        if not words:
            return 0.0
        return sum(map(len, words)) / len(words)

    def transform(self, dataframe):
        """Return a copy of ``dataframe`` with a ``mean_word_length`` column.

        The input dataframe is left unmodified; the new column is added to
        (or, if already present, replaced in) the returned frame.
        """
        mean_lengths = dataframe[self.text_column].apply(self._mean_word_length)
        # assign() builds a new frame instead of writing into the caller's.
        return dataframe.assign(mean_word_length=mean_lengths)
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Reduce a dataframe to the columns listed in ``attribute_names``."""

    def __init__(self, attribute_names):
        """Store the column selection used by ``transform``."""
        self.attribute_names = attribute_names

    def fit(self, dataframe, y=None):
        """No fitting is performed; the instance itself is returned."""
        return self

    def transform(self, dataframe):
        """Return the ``.values`` of ``dataframe`` indexed by ``attribute_names``.

        NOTE(review): presumably a NumPy array when ``dataframe`` is a pandas
        DataFrame and ``attribute_names`` is a list of columns - confirm
        against callers.
        """
        wanted = self.attribute_names
        subset = dataframe[wanted]
        return subset.values
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment