Last active
March 22, 2022 13:17
-
-
Save oliver-batey/32cb51619b44b8a4ef380c1c109e0249 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
class CountWords(BaseEstimator, TransformerMixin):
    """Append a per-document word count to a series of text documents.

    ``transform`` returns a two-column dataframe: the original series next
    to a new column, named ``new_col_name``, holding the number of
    whitespace-separated words in each document.
    """

    def __init__(self, new_col_name):
        # Name given to the word-count column produced by ``transform``.
        self.new_col_name = new_col_name

    def fit(self, series, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, series):
        """Return ``series`` joined column-wise with its word counts.

        Words are separated by runs of any whitespace, so an empty or
        whitespace-only document counts as 0 words, and repeated, leading
        or trailing spaces do not inflate the count.
        """
        # ``str.split()`` with no argument drops empty tokens, unlike
        # ``split(" ")`` which reports one "word" for an empty string.
        n_words_col = series.apply(lambda text: len(text.split())).rename(
            self.new_col_name
        )
        return pd.concat([series, n_words_col], axis=1)
class MeanWordLength(BaseEstimator, TransformerMixin):
    """Add a ``mean_word_length`` column derived from a text column.

    ``text_column`` names the dataframe column whose entries are split into
    words; the mean character length of those words is stored per row.
    """

    def __init__(self, text_column):
        # Label of the column that holds the text to measure.
        self.text_column = text_column

    def fit(self, dataframe, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    @staticmethod
    def _mean_word_length(text):
        """Return the mean length of the whitespace-separated words in ``text``.

        Returns 0.0 when ``text`` contains no words, which avoids a division
        by zero for empty or whitespace-only strings.
        """
        words = text.split()
        if not words:
            return 0.0
        return sum(map(len, words)) / len(words)

    def transform(self, dataframe):
        """Return a copy of ``dataframe`` with ``mean_word_length`` added.

        The input dataframe is left untouched; the new column is attached to
        the returned object only.
        """
        mean_lengths = dataframe[self.text_column].apply(self._mean_word_length)
        # ``assign`` builds a new dataframe instead of writing into the
        # caller's object.
        return dataframe.assign(mean_word_length=mean_lengths)
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns listed in ``attribute_names``.

    ``transform`` indexes the incoming dataframe with ``attribute_names``
    and returns the ``.values`` of that selection.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the dataframe in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, dataframe, y=None):
        """Perform no fitting and hand back this instance."""
        return self

    def transform(self, dataframe):
        """Return the underlying values of the selected columns."""
        wanted = self.attribute_names
        subset = dataframe[wanted]
        return subset.values
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment