chelseatroy/tfidf_vectorization_with_pandas.py

## tfidf_vectorization_with_pandas.py
import itertools

import numpy as np
import pandas as pd
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Walk-through: TF-IDF encode a text column, keep the vectors in a pandas
# dataframe, then pull them back out to train a Naive Bayes classifier.

df = pd.read_csv('my_data_with_text.csv')
df.columns  # id, text, category

# Text contents of the dataframe as an array for processing.
texts = np.array(df['text'])

# Concatenate all the texts and tokenize the whole corpus. word_tokenize takes
# a single string, so the documents are joined with spaces; chaining the
# strings together would iterate them character by character instead.
vocab_length = len(word_tokenize(' '.join(texts)))

# TF-IDF over 1- to 3-grams, keeping at most vocab_length features.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
tfidf_encodings = vectorizer.fit_transform(texts)  # encode the text

# Vectorized texts to dense per-row vectors for storage in the dataframe.
df['tfidf'] = list(tfidf_encodings.toarray())

# Get the vectors back out of the dataframe as one 2-D array for use elsewhere.
vectors_for_training = np.array(df['tfidf'].tolist())

# train_test_split returns the train/test pair of each input in turn:
# (X_train, X_test, y_train, y_test).
X_train, X_test, y_train, y_test = train_test_split(
    vectors_for_training, df['category'].tolist()
)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
# NOTE: this predicts for every row, training rows included; score against
# X_test / y_test when a held-out estimate is needed.
nb_predictions = nb_classifier.predict(df.tfidf.tolist())

# DO NOT DO:
# Writing the dataframe to CSV stores each vector as the string form of its
# array; for long vectors that is abbreviated to the first and last 3 items,
# like "[0.0, 0.0, 0.0...0.0, 0.0, 0.0]", so the encoding cannot be read back.
df.to_csv('with_encoding.csv')
	import itertools

	import numpy as np
	import pandas as pd
	from nltk import word_tokenize
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB

	# Walk-through: TF-IDF encode a text column, keep the vectors in a pandas
	# dataframe, then pull them back out to train a Naive Bayes classifier.

	df = pd.read_csv('my_data_with_text.csv')
	df.columns  # id, text, category

	# Text contents of the dataframe as an array for processing.
	texts = np.array(df['text'])

	# Concatenate all the texts and tokenize the whole corpus. word_tokenize takes
	# a single string, so the documents are joined with spaces; chaining the
	# strings together would iterate them character by character instead.
	vocab_length = len(word_tokenize(' '.join(texts)))

	# TF-IDF over 1- to 3-grams, keeping at most vocab_length features.
	vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
	tfidf_encodings = vectorizer.fit_transform(texts)  # encode the text

	# Vectorized texts to dense per-row vectors for storage in the dataframe.
	df['tfidf'] = list(tfidf_encodings.toarray())

	# Get the vectors back out of the dataframe as one 2-D array for use elsewhere.
	vectors_for_training = np.array(df['tfidf'].tolist())

	# train_test_split returns the train/test pair of each input in turn:
	# (X_train, X_test, y_train, y_test).
	X_train, X_test, y_train, y_test = train_test_split(
	    vectors_for_training, df['category'].tolist()
	)

	nb_classifier = MultinomialNB()
	nb_classifier.fit(X_train, y_train)
	# NOTE: this predicts for every row, training rows included; score against
	# X_test / y_test when a held-out estimate is needed.
	nb_predictions = nb_classifier.predict(df.tfidf.tolist())

	# DO NOT DO:
	# Writing the dataframe to CSV stores each vector as the string form of its
	# array; for long vectors that is abbreviated to the first and last 3 items,
	# like "[0.0, 0.0, 0.0...0.0, 0.0, 0.0]", so the encoding cannot be read back.
	df.to_csv('with_encoding.csv')