Skip to content

Instantly share code, notes, and snippets.

@dyerrington
Created July 12, 2018 00:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dyerrington/b9e1179feed36ce266f61c6968779e91 to your computer and use it in GitHub Desktop.
Save dyerrington/b9e1179feed36ce266f61c6968779e91 to your computer and use it in GitHub Desktop.
Remove stop phrases ("n-grams") first, before stopword removal, with this handy class that extends the functionality of scikit-learn's CountVectorizer. To use another vectorizer type instead, substitute the base class in the class definition at the top (e.g. TfidfVectorizer in place of CountVectorizer).
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer that removes configurable multi-word phrases
    ("stop n-grams") from each document *before* tokenization, so that
    phrase removal happens ahead of single-token stopword filtering.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases removed verbatim from each lowercased document
        (e.g. ``["fox news"]``). Defaults to no phrases.
    **opts
        Any keyword arguments accepted by ``CountVectorizer``
        (e.g. ``stop_words="english"``, ``ngram_range=(1, 2)``).
    """

    def __init__(self, stop_grams=None, **opts):
        # Use None as the default instead of a mutable [] default
        # (a shared list default would be mutated across instances).
        self.stop_grams = list(stop_grams) if stop_grams else []
        super().__init__(**opts)

    def remove_ngrams(self, doc):
        """Return *doc* with every configured stop phrase removed."""
        for stop_gram in self.stop_grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        # Load stop words using CountVectorizer's built-in method.
        # get_stop_words() returns None when no stop words are set;
        # guard against that (list(None) would raise TypeError) —
        # _word_ngrams accepts None to mean "no stopword filtering".
        raw_stop_words = self.get_stop_words()
        stop_words = list(raw_stop_words) if raw_stop_words else None
        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):
            # apply the preprocessing step (lowercase + accent stripping etc.)
            doc_clean = preprocessor(doc.lower())
            # remove phrase stopwords.
            # BUG FIX: the original passed the raw `doc` here, which
            # silently discarded the preprocessor's output.
            doc_clean = remove_ngrams(doc_clean)
            # tokenize using the default tokenizer
            tokens = tokenizer(doc_clean)
            # use CountVectorizer's _word_ngrams built-in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        return analyser
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

import pandas as pd  # FIX: pd was used below but never imported

# Demo: vectorize three short documents, stripping the phrases
# "fox news" and "mother jones" before tokenization.
text_input = [
    "Never put fox news or CNN in your pocket or bookbag.",
    "I put fox news in my mother jones bookbag",
    "Takimag and fox news are both conservative leaning outlets.",
]
df = pd.DataFrame(text_input, columns=["text"])

stop_grams = ["fox news", "mother jones"]
vectorizer = CustomVectorizer(stop_words="english", stop_grams=stop_grams)
X = vectorizer.fit_transform(df["text"])

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# modern get_feature_names_out() but fall back for old versions.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
text = pd.DataFrame(X.toarray(), columns=feature_names)
text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment