@napsternxg
Last active June 14, 2023 21:41
Prune Sklearn TF-IDF Logistic Regression model
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse
from joblib import dump, load
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
def prune_sklearn_model(model, log_abs_thresh=-3):
    """Drop TF-IDF features whose absolute LR weight is below 10**log_abs_thresh."""
    pruned_model = deepcopy(model)
    features = pruned_model.named_steps["tfidf"].get_feature_names_out()
    coef = pruned_model.named_steps["model"].coef_[0]
    # Mask of features whose absolute weight exceeds the threshold
    idx = np.log10(np.abs(coef)) > log_abs_thresh
    print(f"{idx.mean()*100:.2f}% of weights will be retained.")
    pruned_vocab_keys = features[idx]
    pruned_vocab_original_ids = [model.named_steps["tfidf"].vocabulary_[k] for k in pruned_vocab_keys]
    pruned_vocab = {k: i for i, k in enumerate(pruned_vocab_keys)}
    pruned_vocab_size = len(pruned_vocab)
    # Re-index the vectorizer vocabulary and slice the IDF diagonal to the kept features
    pruned_model.named_steps["tfidf"].vocabulary_ = pruned_vocab
    pruned_model.named_steps["tfidf"]._tfidf.n_features_in_ = pruned_vocab_size
    pruned_model.named_steps["tfidf"]._tfidf._idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag[pruned_vocab_original_ids][:, pruned_vocab_original_ids]
    # Slice the classifier coefficients to the same columns
    pruned_model.named_steps["model"].coef_ = model.named_steps["model"].coef_[:, pruned_vocab_original_ids]
    pruned_model.named_steps["model"].n_features_in_ = pruned_model.named_steps["model"].coef_.shape[1]
    print(f"Pruned model with {pruned_vocab_size} weights is: {pruned_model}")
    return pruned_model
def add_extra_weights(model, extra_features):
    """Update weights for existing features and append brand-new ones, e.g.:

    extra_features = {
        "cat": [1, 2, 3],
        "dog": [5, 6, 7]
    }

    Each value holds one weight per output class of the classifier.
    """
    if extra_features is None:
        extra_features = dict()
    new_model = deepcopy(model)
    # Copy the coefficients and vocabulary so the original model is not mutated in place
    coef = model.named_steps["model"].coef_.copy()
    idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag
    model_vocab = dict(model.named_steps["tfidf"].vocabulary_)
    max_id = max(model_vocab.values())
    existing_features = []
    new_features = []
    new_coef = []
    for k, v in extra_features.items():
        if k in model_vocab:
            # Feature already in the vocabulary: overwrite its weights
            existing_features.append(k)
            coef[:, model_vocab[k]] = np.array(v)
        else:
            # Unseen feature: append it at the end of the vocabulary
            new_features.append(k)
            new_coef.append(v)
            max_id += 1
            model_vocab[k] = max_id
    num_new_features = len(new_features)
    print(f"{len(existing_features)} existing features will be updated.")
    print(f"{num_new_features} new features will be added.")
    if num_new_features:
        # Stack the new weights into shape (n_classes, n_new_features)
        # and append them to the existing coefficient matrix
        new_coef = np.hstack([coef, np.vstack(new_coef).T])
    else:
        new_coef = coef
    new_vocab_size = len(model_vocab)
    # Extend the IDF diagonal with idf=1 for the new features
    new_idf_diag = sparse.diags(
        np.hstack([idf_diag.diagonal(), np.ones(num_new_features)]),
        format="csr", dtype=np.float64
    )
    # Update the tfidf step
    new_model.named_steps["tfidf"].vocabulary_ = model_vocab
    new_model.named_steps["tfidf"]._tfidf.n_features_in_ = new_vocab_size
    new_model.named_steps["tfidf"]._tfidf._idf_diag = new_idf_diag
    # Update the classifier step
    new_model.named_steps["model"].coef_ = new_coef
    new_model.named_steps["model"].n_features_in_ = new_coef.shape[1]
    print(f"New model with {new_vocab_size} weights is: {new_model}")
    return new_model
model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(class_weight="balanced")),
])
# df_train is assumed to exist with a `text` column and a binary `askic_eligible` label
model.fit(df_train.text, df_train.askic_eligible)
model_save_path = f"./model_tfidf_lr_{time.time()}.joblib"
print(model_save_path)
dump(model, model_save_path)
# Pull the fitted steps out of the pipeline for inspection
tfidf_vectorizer = model.named_steps["tfidf"]
lr_model = model.named_steps["model"]
df_features = pd.DataFrame(dict(
    feature=tfidf_vectorizer.get_feature_names_out(),
    coef=lr_model.coef_[0]
)).sort_values(by="coef", ascending=False)
top_class_features = pd.concat(dict(
    true_related=df_features.head(20).reset_index(drop=True),
    false_related=df_features.tail(20).reset_index(drop=True)
), axis=1, ignore_index=False)
# Density and cumulative histograms of log10(|coef|), used to pick the pruning threshold
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=False, color="0.5");
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=True, histtype="step", color="r");
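# Before committing to a threshold, it can help to sweep a few candidate values and
# see what fraction of weights each one would keep. A minimal sketch (not in the
# original gist) that mirrors the mask used inside prune_sklearn_model:
log_coef = np.log10(np.abs(df_features["coef"].values))
for thresh in [-4, -3, -2, -1]:
    retained = (log_coef > thresh).mean() * 100
    print(f"log_abs_thresh={thresh}: {retained:.2f}% of weights retained")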
pruned_model = prune_sklearn_model(model, log_abs_thresh=-3)
pruned_model_save_path = model_save_path.replace(".joblib", ".pruned.joblib")
print(pruned_model_save_path)
dump(pruned_model, pruned_model_save_path)
"""
18.59% of weights will be retained.
Pruned model with 3746595 weights is: Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
('model', LogisticRegression(class_weight='balanced'))])
CPU times: user 1min 18s, sys: 2.25 s, total: 1min 20s
Wall time: 1min 17s
"""
pruned_model = load(pruned_model_save_path)
extra_features = {
    "cat": [1,],
    "dog": [-5,]
}
pruned_model_extra = add_extra_weights(pruned_model, extra_features)
"""
Some tips on setting the right value for extra weights:
Check the model.named_steps["tfidf"]._tfidf._idf_diag value for those weights
If setting new ngram feature with n>1 then see what the highest value for the 1grams
Your new weight should have more impact that the 1gram weight * idf value
"""