Created
February 7, 2020 17:55
-
-
Save lazuxd/d852297f2b3bf371fe97599d3fc20982 to your computer and use it in GitHub Desktop.
Building a Sentiment Classifier using Scikit-Learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from joblib import dump, load  # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz  # used for saving and loading sparse matrices
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Create the output directories for the fitted preprocessors and the
# vectorized matrices. os.makedirs is used instead of shelling out to `mkdir`
# so the step is portable and does not fail when the script is re-run and the
# directories already exist.
os.makedirs('data_preprocessors', exist_ok=True)
os.makedirs('vectorized_data', exist_ok=True)
# Unigram Counts
# Learn the unigram vocabulary from imdb_train['text'] and build the count
# matrix in one pass: fit_transform() is equivalent to fit() followed by
# transform() on the same data, but does not tokenize the corpus twice.
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train_unigram = unigram_vectorizer.fit_transform(imdb_train['text'].values)

# Persist the fitted vectorizer and the sparse count matrix to disk.
dump(unigram_vectorizer, 'data_preprocessors/unigram_vectorizer.joblib')
save_npz('vectorized_data/X_train_unigram.npz', X_train_unigram)

# To reuse the saved artifacts instead of recomputing them:
# unigram_vectorizer = load('data_preprocessors/unigram_vectorizer.joblib')
# X_train_unigram = load_npz('vectorized_data/X_train_unigram.npz')
# Unigram Tf-Idf
# Fit the tf-idf weighting on the unigram count matrix (fit() returns the
# transformer itself), then re-weight that same matrix.
unigram_tf_idf_transformer = TfidfTransformer().fit(X_train_unigram)
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

# Persist the fitted transformer and the re-weighted sparse matrix to disk.
dump(
    unigram_tf_idf_transformer,
    'data_preprocessors/unigram_tf_idf_transformer.joblib',
)
save_npz(
    'vectorized_data/X_train_unigram_tf_idf.npz',
    X_train_unigram_tf_idf,
)

# To reuse the saved artifacts instead of recomputing them:
# unigram_tf_idf_transformer = load('data_preprocessors/unigram_tf_idf_transformer.joblib')
# X_train_unigram_tf_idf = load_npz('vectorized_data/X_train_unigram_tf_idf.npz')
# Bigram Counts
# ngram_range=(1, 2) extracts both unigrams and bigrams. The vocabulary is
# learned from imdb_train['text'] and the count matrix is built in one pass:
# fit_transform() is equivalent to fit() followed by transform() on the same
# data, but does not tokenize the corpus twice.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bigram = bigram_vectorizer.fit_transform(imdb_train['text'].values)

# Persist the fitted vectorizer and the sparse count matrix to disk.
dump(bigram_vectorizer, 'data_preprocessors/bigram_vectorizer.joblib')
save_npz('vectorized_data/X_train_bigram.npz', X_train_bigram)

# To reuse the saved artifacts instead of recomputing them:
# bigram_vectorizer = load('data_preprocessors/bigram_vectorizer.joblib')
# X_train_bigram = load_npz('vectorized_data/X_train_bigram.npz')
# Bigram Tf-Idf
# Fit the tf-idf weighting on the bigram count matrix (fit() returns the
# transformer itself), then re-weight that same matrix.
bigram_tf_idf_transformer = TfidfTransformer().fit(X_train_bigram)
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

# Persist the fitted transformer and the re-weighted sparse matrix to disk.
dump(
    bigram_tf_idf_transformer,
    'data_preprocessors/bigram_tf_idf_transformer.joblib',
)
save_npz(
    'vectorized_data/X_train_bigram_tf_idf.npz',
    X_train_bigram_tf_idf,
)

# To reuse the saved artifacts instead of recomputing them:
# bigram_tf_idf_transformer = load('data_preprocessors/bigram_tf_idf_transformer.joblib')
# X_train_bigram_tf_idf = load_npz('vectorized_data/X_train_bigram_tf_idf.npz')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment