Created
February 7, 2020 17:57
-
-
Save lazuxd/fa446ebf8ebd025ce1ba1c038a15a031 to your computer and use it in GitHub Desktop.
Building a Sentiment Classifier using Scikit-Learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import SGDClassifier | |
from sklearn.model_selection import train_test_split | |
from scipy.sparse import csr_matrix | |
import numpy as np | |
def train_and_show_scores(
    X: csr_matrix,
    y: np.ndarray,
    title: str,
    *,
    random_state=None,
) -> None:
    """Fit an ``SGDClassifier`` on a train/validation split and print its scores.

    The data is split 75% / 25% with ``train_test_split``, stratified on ``y``.
    A default-configured ``SGDClassifier`` is fitted on the training part, and
    ``clf.score`` is then printed for both parts, rounded to two decimals
    (for scikit-learn classifiers ``score`` is the mean accuracy).

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``; also used for stratification.
        title: Heading printed on the line before the scores.
        random_state: Optional seed forwarded to both ``train_test_split`` and
            ``SGDClassifier``. With the default ``None`` the split and the
            optimiser are unseeded, so scores vary from run to run; pass an
            int to make runs reproducible and comparable across feature sets.

    Returns:
        None. The scores are only printed.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y, random_state=random_state
    )

    clf = SGDClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')
# Labels shared by every experiment below (presumably `imdb_train` is a
# pandas DataFrame with a 'label' column -- defined elsewhere in the file).
y_train = imdb_train['label'].values

# Score each feature representation in turn: counts first, then tf-idf,
# for unigrams and then bigrams.
for _features, _title in (
    (X_train_unigram, 'Unigram Counts'),
    (X_train_unigram_tf_idf, 'Unigram Tf-Idf'),
    (X_train_bigram, 'Bigram Counts'),
    (X_train_bigram_tf_idf, 'Bigram Tf-Idf'),
):
    train_and_show_scores(_features, y_train, _title)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment