Created
August 22, 2019 06:21
-
-
Save shantanuo/d2cbbe4f77e0eba880010ef9bbadf82f to your computer and use it in GitHub Desktop.
simplified version of https://github.com/kavgan/nlp-in-practice/blob/master/text-classification/notebooks/Text%20Classification%20with%20Logistic%20Regression.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Load the news-category dataset; the file is parsed as JSON Lines
# (one JSON record per line).
df = pd.read_json("../data/news_category_dataset.json", lines=True)

# Replace the textual category column with integer codes. `le` is kept at
# module level so `le.inverse_transform` can recover the original names.
le = LabelEncoder()
df["category"] = le.fit_transform(df["category"])

# Binary bag-of-words over the short descriptions: each feature records
# whether a term occurs, and terms present in more than 95% of documents
# are dropped.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# split below, so its vocabulary and document-frequency statistics also see
# the held-out rows -- confirm this is acceptable for the reported metrics.
cv = CountVectorizer(binary=True, max_df=0.95)
feature_set = cv.fit_transform(df["short_description"])

# Fixed seed keeps the train/test partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_set, df["category"], random_state=2000
)

# L2-regularised logistic regression.
# NOTE(review): recent scikit-learn releases appear to deprecate the
# liblinear solver for multiclass targets -- verify against the installed
# version before upgrading.
scikit_log_reg = LogisticRegression(
    verbose=1, solver="liblinear", random_state=0, C=5, penalty="l2", max_iter=1000
)
model = scikit_log_reg.fit(X_train, y_train)
def get_top_k_predictions(model, X_test, k):
    """Return the ``k`` most probable class labels for every sample.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Samples to score; passed to ``model.predict_proba`` unchanged.
        k: Number of labels to return per sample. A value larger than the
            number of classes yields every class; ``0`` yields empty lists.

    Returns:
        A list with one entry per sample; each entry is a list of labels
        taken from ``model.classes_``, ordered from most to least probable.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {!r}".format(k))

    probs = model.predict_proba(X_test)
    # argsort is ascending, so reverse each row to put the best class first.
    # Slicing from the front keeps k == 0 well defined; slicing the ascending
    # order with ``[:, -k:]`` would select every column when k is 0.
    top_idx = np.argsort(probs, axis=1)[:, ::-1][:, :k]
    # Map column indices to labels for all rows in a single indexing step.
    classes = np.asarray(model.classes_)
    return [list(row) for row in classes[top_idx]]
# Top-3 accuracy: share of test rows whose true label is among the three
# most probable predicted labels.
top_preds = get_top_k_predictions(model, X_test, 3)
print(np.mean([actual in pred for actual, pred in zip(y_test, top_preds)]))

# Label-ranking average precision over the full probability ranking.
# The one-hot width is pinned to the model's class count; left to infer it,
# to_categorical sizes the matrix from the largest label present in y_test,
# which mismatches predict_proba's columns if that label is absent from the
# test split.
# NOTE(review): assumes labels are the contiguous codes 0..n-1 and that every
# code occurs in the training data, so column i corresponds to label i.
target = to_categorical(y_test, num_classes=len(model.classes_))
y_pred = model.predict_proba(X_test)
print(label_ranking_average_precision_score(target, y_pred))

# Plain (top-1) accuracy; printed explicitly because a bare expression is
# silently discarded when this runs as a script.
print(model.score(X_test, y_test))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment