Skip to content

Instantly share code, notes, and snippets.

@deargle
Last active April 22, 2021 12:32
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deargle/b57738c8ce2b4ed6ca90f86d5422431f to your computer and use it in GitHub Desktop.
Example of TfidfVectorizer with custom tokenizer that does basic stemming
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 24 16:30:42 2018
@author: deargle
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd
import string
# Dataset descriptors, originally derived from the Kaggle "predict future
# sales" competition files. Each entry is:
#   (csv basename without extension, column to tokenize, max tf-idf features)
the_things = [
    ('items-translated', 'item_name_translated', 50),
    ('item_categories-translated', 'item_category_name_translated', 10),
    ('shops-translated', 'shop_name_translated', 10),
]
# Translation table that deletes all punctuation and digit characters.
# str.maketrans with a third argument maps each listed character to None,
# which str.translate treats as "remove".
trans_table = str.maketrans('', '', string.punctuation + string.digits)
# Shared Porter stemmer instance reused by tokenize() for every document.
stemmer = PorterStemmer()
def tokenize(text):
    """Tokenize `text` into Porter-stemmed words.

    Punctuation and digits are stripped first via `trans_table` (each
    matching character is mapped to None, i.e. removed), then the cleaned
    text is word-tokenized with nltk. Single-character tokens are dropped
    before stemming — the author only wanted words of two or more
    characters, noting such short words are typically stopwords anyway.
    """
    cleaned = text.translate(trans_table)
    stems = []
    for word in nltk.word_tokenize(cleaned):
        if len(word) > 1:
            stems.append(stemmer.stem(word))
    return stems
def do_the_thing(filename, name_name, feature_cnt):
    """Read `<filename>.csv`, tf-idf vectorize column `name_name`, and return
    the frame with the top `feature_cnt` tf-idf feature columns joined on.

    Parameters
    ----------
    filename : str
        CSV basename (without the ``.csv`` extension) to load.
    name_name : str
        Name of the text column to vectorize.
    feature_cnt : int
        Maximum number of tf-idf features to retain (``max_features``).

    Returns
    -------
    pandas.DataFrame
        The original frame with ``tfidf_<term>`` columns appended.
    """
    things_to_do_it_to = pd.read_csv('%s.csv' % filename)
    tfidf = TfidfVectorizer(tokenizer=tokenize, binary=True,
                            stop_words='english', use_idf=True,
                            max_features=feature_cnt)
    matrix = tfidf.fit_transform(things_to_do_it_to[name_name])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API when available, fall back for older versions.
    if hasattr(tfidf, 'get_feature_names_out'):
        names = tfidf.get_feature_names_out()
    else:
        names = tfidf.get_feature_names()
    # BUG FIX: the original built columns as 'tfidf_' + name.encode('utf-8'),
    # which raises TypeError on Python 3 (cannot concatenate str and bytes).
    # Feature names are already str, so concatenate them directly.
    features = pd.DataFrame(matrix.toarray(),
                            columns=['tfidf_' + name for name in names])
    things_done_to = things_to_do_it_to.join(features)
    return things_done_to
#%%
# Batch mode: process every configured dataset in one pass.
# for filename, name_name, feature_cnt in the_things:
#     do_the_thing(filename, name_name, feature_cnt)
#%%
# Single-dataset run: vectorize the 'text' column of pizza.csv, keeping
# the 20 highest-scoring tf-idf features, and write the result out.
things_done_to = do_the_thing('pizza', 'text', 20)
things_done_to.to_csv('pizza_features.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment