Created
July 12, 2018 00:32
-
-
Save dyerrington/b9e1179feed36ce266f61c6968779e91 to your computer and use it in GitHub Desktop.
Remove multi-word phrases ("n-grams") before stop-word removal with this handy class that extends the functionality of scikit-learn's CountVectorizer. To use another vectorizer type, such as TfidfVectorizer, substitute it for CountVectorizer in the class definition at the top.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer subclass that deletes multi-word phrases ("stop n-grams")
    from each document *before* tokenization and single-word stop-word removal.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases to strip from documents via literal substring replacement
        (e.g. ``"fox news"``). Defaults to no phrases.
    **opts
        Forwarded unchanged to ``CountVectorizer.__init__``.
    """

    def __init__(self, stop_grams=None, **opts):
        # Avoid the mutable-default-argument pitfall (a shared [] across
        # instances); copy so a caller's list can't be mutated from here.
        self.stop_grams = list(stop_grams) if stop_grams is not None else []
        super().__init__(**opts)

    def remove_ngrams(self, doc):
        """Return *doc* with every stop phrase removed (plain substring replace)."""
        for stop_gram in self.stop_grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        # load stop words using CountVectorizer's built-in method;
        # get_stop_words() returns None when no stop words are configured,
        # and _word_ngrams accepts None, so only listify a real collection
        sw = self.get_stop_words()
        stop_words = list(sw) if sw is not None else None
        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):
            # apply the preprocessing and tokenization steps
            doc_clean = preprocessor(doc.lower())
            # remove phrase stopwords — BUG FIX: operate on the preprocessed
            # text (the original called remove_ngrams on the raw `doc`,
            # throwing away the preprocessor's output)
            doc_clean = remove_ngrams(doc_clean)
            # tokenize using default tokenizer
            tokens = tokenizer(doc_clean)
            # use CountVectorizer's _word_ngrams built-in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        return analyser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas was used below (pd.DataFrame) but never imported — fixed here
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# small demo corpus containing the phrases we want stripped
text_input = [
    "Never put fox news or CNN in your pocket or bookbag.",
    "I put fox news in my mother jones bookbag",
    "Takimag and fox news are both conservative leaning outlets."
]
df = pd.DataFrame(text_input, columns=["text"])

# phrases removed before tokenization, so neither "fox" nor "news"
# (nor "mother"/"jones") survives as an individual feature
stop_grams = ["fox news", "mother jones"]

vectorizer = CustomVectorizer(stop_words="english", stop_grams=stop_grams)
X = vectorizer.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement
text = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment