Prune Sklearn TF-IDF Logistic Regression model
from copy import deepcopy
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
def prune_sklearn_model(model, log_abs_thresh=-3):
    """Drop every TF-IDF feature whose LR coefficient has log10(|coef|) <= log_abs_thresh."""
    pruned_model = deepcopy(model)
    features = pruned_model.named_steps["tfidf"].get_feature_names_out()
    coef = pruned_model.named_steps["model"].coef_[0]
    # Keep only features whose absolute coefficient exceeds the threshold
    idx = np.log10(np.abs(coef)) > log_abs_thresh
    print(f"{idx.mean()*100:.2f}% of weights will be retained.")
    pruned_vocab_keys = features[idx]
    pruned_vocab_original_ids = [model.named_steps["tfidf"].vocabulary_[k] for k in pruned_vocab_keys]
    # Re-index the surviving vocabulary from 0..n-1
    pruned_vocab = {k: i for i, k in enumerate(pruned_vocab_keys)}
    pruned_vocab_size = len(pruned_vocab)
    pruned_model.named_steps["tfidf"].vocabulary_ = pruned_vocab
    pruned_model.named_steps["tfidf"]._tfidf.n_features_in_ = pruned_vocab_size
    # Slice the idf diagonal and the coefficient matrix down to the surviving ids,
    # in the same order as the new vocabulary
    pruned_model.named_steps["tfidf"]._tfidf._idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag[pruned_vocab_original_ids][:, pruned_vocab_original_ids]
    pruned_model.named_steps["model"].coef_ = model.named_steps["model"].coef_[:, pruned_vocab_original_ids]
    pruned_model.named_steps["model"].n_features_in_ = pruned_model.named_steps["model"].coef_.shape[1]
    print(f"Pruned model with {pruned_vocab_size} weights is: {pruned_model}")
    return pruned_model
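# Sanity-check sketch (not in the original gist): scores drift slightly after
# pruning, both because small weights are dropped and because the l2
# normalization in TfidfVectorizer is now computed over fewer features.
# `sample_texts` is whatever held-out list of strings you have available.
def compare_pruned_scores(model, pruned_model, sample_texts):
    original_scores = model.decision_function(sample_texts)
    pruned_scores = pruned_model.decision_function(sample_texts)
    print(f"score correlation: {np.corrcoef(original_scores, pruned_scores)[0, 1]:.4f}")
    return original_scores, pruned_scores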
def add_extra_weights(model, extra_features):
    """Update or add hand-set coefficients, e.g.:
    extra_features = {
        "cat": [1, 2, 3],
        "dog": [5, 6, 7]
    }
    Each value holds one coefficient per class (a single entry for binary LR).
    """
    if extra_features is None:
        extra_features = dict()
    new_model = deepcopy(model)
    # Work on copies so the input model is not mutated in place
    coef = model.named_steps["model"].coef_.copy()
    idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag
    model_vocab = dict(model.named_steps["tfidf"].vocabulary_)
    max_id = max(model_vocab.values())
    existing_features = []
    new_features = []
    new_coef = []
    for k, v in extra_features.items():
        if k in model_vocab:
            # Overwrite the coefficients of a feature the model already knows
            existing_features.append(k)
            coef[:, model_vocab[k]] = np.array(v)
        else:
            # Append a brand-new feature at the end of the vocabulary
            new_features.append(k)
            new_coef.append(v)
            max_id += 1
            model_vocab[k] = max_id
    num_new_features = len(new_features)
    print(f"{len(existing_features)} existing features will be updated.")
    print(f"{num_new_features} new features will be added.")
    if num_new_features:
        # Stack new coefficients as columns: shape (n_classes, num_new_features)
        new_coef = np.vstack(new_coef).T
        coef = np.hstack([coef, new_coef])
    new_vocab_size = len(model_vocab)
    # Extend the idf diagonal; new features get an idf of 1
    new_idf_diag = sparse.diags(
        np.hstack([idf_diag.diagonal(), np.ones(num_new_features)]),
        format="csr", dtype=np.float64
    )
    # Update tfidf step
    new_model.named_steps["tfidf"].vocabulary_ = model_vocab
    new_model.named_steps["tfidf"]._tfidf.n_features_in_ = new_vocab_size
    new_model.named_steps["tfidf"]._tfidf._idf_diag = new_idf_diag
    # Update classifier step
    new_model.named_steps["model"].coef_ = coef
    new_model.named_steps["model"].n_features_in_ = coef.shape[1]
    print(f"New model with {new_vocab_size} weights is: {new_model}")
    return new_model
# df_train is assumed to be a DataFrame with a `text` column and a binary
# `askic_eligible` label column, prepared elsewhere.
model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(class_weight="balanced")),
])
model.fit(df_train.text, df_train.askic_eligible)
model_save_path = f"./model_tfidf_lr_{time.time()}.joblib"
print(model_save_path)
dump(model, model_save_path)
# Inspect the learned weights; pull the fitted steps out of the pipeline
tfidf_vectorizer = model.named_steps["tfidf"]
lr_model = model.named_steps["model"]
df_features = pd.DataFrame(dict(
    feature=tfidf_vectorizer.get_feature_names_out(),
    coef=lr_model.coef_[0]
)).sort_values(by="coef", ascending=False)
top_class_features = pd.concat(dict(
    true_related=df_features.head(20).reset_index(drop=True),
    false_related=df_features.tail(20).reset_index(drop=True)
), axis=1)
# Distribution of log10(|coef|): density (grey) and cumulative (red) histograms
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=False, color="0.5");
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=True, histtype="step", color="r");
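# A rough way to pick log_abs_thresh from the distribution above (a sketch,
# not in the original gist): tabulate what fraction of weights each candidate
# threshold would keep.
log_abs_coef = np.log10(np.abs(df_features["coef"].values))
for thresh in [-4, -3, -2, -1]:
    print(f"log_abs_thresh={thresh}: {(log_abs_coef > thresh).mean()*100:.2f}% retained")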
pruned_model = prune_sklearn_model(model, log_abs_thresh=-3)
pruned_model_save_path = model_save_path.replace(".joblib", ".pruned.joblib")
print(pruned_model_save_path)
dump(pruned_model, pruned_model_save_path)
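# Quick check of the on-disk savings (a sketch, not in the original gist;
# the paths are the ones saved above).
import os
original_mb = os.path.getsize(model_save_path) / 1e6
pruned_mb = os.path.getsize(pruned_model_save_path) / 1e6
print(f"original: {original_mb:.1f} MB, pruned: {pruned_mb:.1f} MB")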
""" | |
%18.59 weights will be retained. | |
Pruned model with 3746595 weights is: Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))), | |
('model', LogisticRegression(class_weight='balanced'))]) | |
CPU times: user 1min 18s, sys: 2.25 s, total: 1min 20s | |
Wall time: 1min 17s | |
""" | |
pruned_model = load(pruned_model_save_path)
extra_features = {
    "cat": [1,],
    "dog": [-5,]
}
pruned_model_extra = add_extra_weights(pruned_model, extra_features)
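# Sanity check (a sketch; the example texts are made up): the hand-set weights
# should push "cat"-heavy text towards the positive class and "dog"-heavy text
# away from it, modulo the intercept and any other matching ngrams.
print(pruned_model_extra.decision_function(["cat cat cat", "dog dog dog"]))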
""" | |
Some tips on setting the right value for extra weights: | |
Check the model.named_steps["tfidf"]._tfidf._idf_diag value for those weights | |
If setting new ngram feature with n>1 then see what the highest value for the 1grams | |
Your new weight should have more impact that the 1gram weight * idf value | |
""" |