Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Remove "n-grams" (multi-word stop phrases) first, before single-word stopwords, with this handy class that extends the functionality of scikit-learn's CountVectorizer. To use another vectorizer type, such as TfidfVectorizer, substitute it as the base class in the class definition at the top.
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer subclass that strips whole phrases ("stop n-grams")
    from each document before tokenization and stop-word removal.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases to delete (by plain substring removal) from every document.
    **opts
        Any CountVectorizer keyword arguments (stop_words, ngram_range, ...).
    """

    def __init__(self, stop_grams=None, **opts):
        # Avoid the mutable-default-argument pitfall; copy so the caller's
        # list is not shared with this instance.
        self.stop_grams = list(stop_grams) if stop_grams else []
        # Bug fix: the original never called the parent constructor, so
        # options such as stop_words="english" were silently ignored and
        # fit_transform crashed on missing attributes.
        super().__init__(**opts)

    def remove_ngrams(self, doc):
        """Return *doc* with every configured stop phrase deleted."""
        for stop_gram in self.stop_grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        # load stop words using CountVectorizer's built-in method
        # (get_stop_words may return None when no stop words are configured)
        stop_words = list(self.get_stop_words() or [])
        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):
            # apply the preprocessing step
            doc_clean = preprocessor(doc.lower())
            # Bug fix: remove phrase stopwords from the *preprocessed* text;
            # the original passed the raw doc and threw doc_clean away.
            doc_clean = remove_ngrams(doc_clean)
            # tokenize using the default tokenizer
            tokens = tokenizer(doc_clean)
            # use CountVectorizer's _word_ngrams built-in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        # Bug fix: the original built the closure but never returned it,
        # so build_analyzer returned None.
        return analyser
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
# Example usage: vectorize three documents, removing the phrase stopwords
# "fox news" and "mother jones" before single-word stopword removal.
text_input = [
    "Never put fox news or CNN in your pocket or bookbag.",
    "I put fox news in my mother jones bookbag",
    "Takimag and fox news are both conservative leaning outlets.",
]  # Bug fix: the original list literal was never closed.
df = pd.DataFrame(text_input, columns=["text"])

stop_grams = ["fox news", "mother jones"]
vectorizer = CustomVectorizer(stop_words="english", stop_grams=stop_grams)
X = vectorizer.fit_transform(df["text"])
# Bug fix: get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is the supported replacement.
text = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.