Skip to content

Instantly share code, notes, and snippets.

@deargle
Last active April 22, 2021 12:32
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deargle/b57738c8ce2b4ed6ca90f86d5422431f to your computer and use it in GitHub Desktop.
Example of TfidfVectorizer with custom tokenizer that does basic stemming
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 24 16:30:42 2018
@author: deargle
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd
import string
# Dataset descriptors, originally derived from the Kaggle "predict future
# sales" competition files. Each entry is:
#   (csv basename without extension, column to tokenize, max tf-idf features)
the_things = [
    ('items-translated', 'item_name_translated', 50),
    ('item_categories-translated', 'item_category_name_translated', 10),
    ('shops-translated', 'shop_name_translated', 10),
]
# Translation table that deletes all punctuation and digit characters.
# str.maketrans with a third argument maps each listed character to None,
# which str.translate treats as "remove".
trans_table = str.maketrans('', '', string.punctuation + string.digits)
# Shared Porter stemmer instance reused by tokenize() for every document.
stemmer = PorterStemmer()
def tokenize(text):
    """Tokenize `text` into Porter-stemmed words.

    Punctuation and digits are stripped first via `trans_table` (each
    matching character is mapped to None, i.e. removed), then the cleaned
    text is word-tokenized with nltk. Single-character tokens are dropped
    before stemming — the author only wanted words of two or more
    characters, noting such short words are typically stopwords anyway.
    """
    cleaned = text.translate(trans_table)
    stems = []
    for word in nltk.word_tokenize(cleaned):
        if len(word) > 1:
            stems.append(stemmer.stem(word))
    return stems
def do_the_thing(filename, name_name, feature_cnt):
    """Read `<filename>.csv`, tf-idf vectorize column `name_name`, and return
    the frame with the top `feature_cnt` tf-idf feature columns joined on.

    Parameters
    ----------
    filename : str
        CSV basename (without the ``.csv`` extension) to load.
    name_name : str
        Name of the text column to vectorize.
    feature_cnt : int
        Maximum number of tf-idf features to retain (``max_features``).

    Returns
    -------
    pandas.DataFrame
        The original frame with ``tfidf_<term>`` columns appended.
    """
    things_to_do_it_to = pd.read_csv('%s.csv' % filename)
    tfidf = TfidfVectorizer(tokenizer=tokenize, binary=True,
                            stop_words='english', use_idf=True,
                            max_features=feature_cnt)
    matrix = tfidf.fit_transform(things_to_do_it_to[name_name])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API when available, fall back for older versions.
    if hasattr(tfidf, 'get_feature_names_out'):
        names = tfidf.get_feature_names_out()
    else:
        names = tfidf.get_feature_names()
    # BUG FIX: the original built columns as 'tfidf_' + name.encode('utf-8'),
    # which raises TypeError on Python 3 (cannot concatenate str and bytes).
    # Feature names are already str, so concatenate them directly.
    features = pd.DataFrame(matrix.toarray(),
                            columns=['tfidf_' + name for name in names])
    things_done_to = things_to_do_it_to.join(features)
    return things_done_to
#%%
# Batch mode: process every configured dataset in one pass.
# for filename, name_name, feature_cnt in the_things:
#     do_the_thing(filename, name_name, feature_cnt)
#%%
# Single-dataset run: vectorize the 'text' column of pizza.csv, keeping
# the 20 highest-scoring tf-idf features, and write the result out.
things_done_to = do_the_thing('pizza', 'text', 20)
things_done_to.to_csv('pizza_features.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment