Created
July 12, 2018 00:32
-
-
Save dyerrington/b9e1179feed36ce266f61c6968779e91 to your computer and use it in GitHub Desktop.
Remove multi-word phrases ("n-grams") before stop-word removal with this handy class that extends the functionality of scikit-learn's CountVectorizer. To use another vectorizer type, such as TfidfVectorizer, substitute it for CountVectorizer in the class definition at the top.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer subclass that deletes multi-word phrases ("stop n-grams")
    from each document *before* tokenization and single-word stop-word removal.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases to strip from documents via literal substring replacement
        (e.g. ``"fox news"``). Defaults to no phrases.
    **opts
        Forwarded unchanged to ``CountVectorizer.__init__``.
    """

    def __init__(self, stop_grams=None, **opts):
        # Avoid the mutable-default-argument pitfall (a shared [] across
        # instances); copy so a caller's list can't be mutated from here.
        self.stop_grams = list(stop_grams) if stop_grams is not None else []
        super().__init__(**opts)

    def remove_ngrams(self, doc):
        """Return *doc* with every stop phrase removed (plain substring replace)."""
        for stop_gram in self.stop_grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        # load stop words using CountVectorizer's built-in method;
        # get_stop_words() returns None when no stop words are configured,
        # and _word_ngrams accepts None, so only listify a real collection
        sw = self.get_stop_words()
        stop_words = list(sw) if sw is not None else None
        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):
            # apply the preprocessing and tokenization steps
            doc_clean = preprocessor(doc.lower())
            # remove phrase stopwords — BUG FIX: operate on the preprocessed
            # text (the original called remove_ngrams on the raw `doc`,
            # throwing away the preprocessor's output)
            doc_clean = remove_ngrams(doc_clean)
            # tokenize using default tokenizer
            tokens = tokenizer(doc_clean)
            # use CountVectorizer's _word_ngrams built-in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        return analyser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas was used below (pd.DataFrame) but never imported — fixed here
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# small demo corpus containing the phrases we want stripped
text_input = [
    "Never put fox news or CNN in your pocket or bookbag.",
    "I put fox news in my mother jones bookbag",
    "Takimag and fox news are both conservative leaning outlets."
]
df = pd.DataFrame(text_input, columns=["text"])

# phrases removed before tokenization, so neither "fox" nor "news"
# (nor "mother"/"jones") survives as an individual feature
stop_grams = ["fox news", "mother jones"]

vectorizer = CustomVectorizer(stop_words="english", stop_grams=stop_grams)
X = vectorizer.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement
text = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment