Skip to content

Instantly share code, notes, and snippets.

@lazuxd
Created February 7, 2020 17:55
Show Gist options
  • Save lazuxd/d852297f2b3bf371fe97599d3fc20982 to your computer and use it in GitHub Desktop.
Save lazuxd/d852297f2b3bf371fe97599d3fc20982 to your computer and use it in GitHub Desktop.
Building a Sentiment Classifier using Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
import os  # needed for the directory setup just below

# Create the output directories: one for the fitted preprocessors (joblib
# files) and one for the vectorized matrices (.npz files).
# os.makedirs with exist_ok=True avoids spawning a shell, behaves the same on
# every platform, and is a silent no-op when the directory already exists, so
# the script can be re-run safely.
os.makedirs('data_preprocessors', exist_ok=True)
os.makedirs('vectorized_data', exist_ok=True)
# Unigram Counts
# Learn the unigram vocabulary and build the count matrix for the texts in
# imdb_train in a single pass: fit_transform avoids tokenizing every document
# a second time, which a separate fit() followed by transform() would do.
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train_unigram = unigram_vectorizer.fit_transform(imdb_train['text'].values)
# Persist the fitted vectorizer and the resulting matrix to disk.
dump(unigram_vectorizer, 'data_preprocessors/unigram_vectorizer.joblib')
save_npz('vectorized_data/X_train_unigram.npz', X_train_unigram)
# To reuse the saved artifacts instead of refitting, load them back with:
# unigram_vectorizer = load('data_preprocessors/unigram_vectorizer.joblib')
# X_train_unigram = load_npz('vectorized_data/X_train_unigram.npz')
# Unigram Tf-Idf
# Fit the tf-idf transformer on the unigram count matrix (fit() returns the
# transformer itself, so it can be chained onto the constructor), then save it.
unigram_tf_idf_transformer = TfidfTransformer().fit(X_train_unigram)
dump(
    unigram_tf_idf_transformer,
    'data_preprocessors/unigram_tf_idf_transformer.joblib',
)
# Alternative to refitting -- load the saved transformer:
# unigram_tf_idf_transformer = load('data_preprocessors/unigram_tf_idf_transformer.joblib')

# Apply the fitted transformer to the unigram counts and store the result.
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)
save_npz(
    'vectorized_data/X_train_unigram_tf_idf.npz',
    X_train_unigram_tf_idf,
)
# Alternative to recomputing -- load the saved matrix:
# X_train_unigram_tf_idf = load_npz('vectorized_data/X_train_unigram_tf_idf.npz')
# Bigram Counts
# ngram_range=(1, 2) counts both unigrams and bigrams. Learn the vocabulary
# and build the count matrix for the texts in imdb_train in a single pass:
# fit_transform avoids tokenizing every document a second time, which a
# separate fit() followed by transform() would do.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bigram = bigram_vectorizer.fit_transform(imdb_train['text'].values)
# Persist the fitted vectorizer and the resulting matrix to disk.
dump(bigram_vectorizer, 'data_preprocessors/bigram_vectorizer.joblib')
save_npz('vectorized_data/X_train_bigram.npz', X_train_bigram)
# To reuse the saved artifacts instead of refitting, load them back with:
# bigram_vectorizer = load('data_preprocessors/bigram_vectorizer.joblib')
# X_train_bigram = load_npz('vectorized_data/X_train_bigram.npz')
# Bigram Tf-Idf
# Fit the tf-idf transformer on the bigram count matrix (fit() returns the
# transformer itself, so it can be chained onto the constructor), then save it.
bigram_tf_idf_transformer = TfidfTransformer().fit(X_train_bigram)
dump(
    bigram_tf_idf_transformer,
    'data_preprocessors/bigram_tf_idf_transformer.joblib',
)
# Alternative to refitting -- load the saved transformer:
# bigram_tf_idf_transformer = load('data_preprocessors/bigram_tf_idf_transformer.joblib')

# Apply the fitted transformer to the bigram counts and store the result.
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)
save_npz(
    'vectorized_data/X_train_bigram_tf_idf.npz',
    X_train_bigram_tf_idf,
)
# Alternative to recomputing -- load the saved matrix:
# X_train_bigram_tf_idf = load_npz('vectorized_data/X_train_bigram_tf_idf.npz')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment