Prune Sklearn TF-IDF Logistic Regression model
from copy import deepcopy
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
def prune_sklearn_model(model, log_abs_thresh=-3):
    """Drop every TF-IDF feature whose LR coefficient has log10(|coef|) <= log_abs_thresh."""
    pruned_model = deepcopy(model)
    features = pruned_model.named_steps["tfidf"].get_feature_names_out()
    coef = pruned_model.named_steps["model"].coef_[0]
    # Keep only features whose absolute coefficient exceeds the threshold
    idx = np.log10(np.abs(coef)) > log_abs_thresh
    print(f"{idx.mean()*100:.2f}% of weights will be retained.")
    pruned_vocab_keys = features[idx]
    pruned_vocab_original_ids = [model.named_steps["tfidf"].vocabulary_[k] for k in pruned_vocab_keys]
    # Re-index the surviving vocabulary from 0..n-1
    pruned_vocab = {k: i for i, k in enumerate(pruned_vocab_keys)}
    pruned_vocab_size = len(pruned_vocab)
    pruned_model.named_steps["tfidf"].vocabulary_ = pruned_vocab
    pruned_model.named_steps["tfidf"]._tfidf.n_features_in_ = pruned_vocab_size
    # Slice the idf diagonal and the coefficient matrix down to the surviving ids,
    # in the same order as the new vocabulary
    pruned_model.named_steps["tfidf"]._tfidf._idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag[pruned_vocab_original_ids][:, pruned_vocab_original_ids]
    pruned_model.named_steps["model"].coef_ = model.named_steps["model"].coef_[:, pruned_vocab_original_ids]
    pruned_model.named_steps["model"].n_features_in_ = pruned_model.named_steps["model"].coef_.shape[1]
    print(f"Pruned model with {pruned_vocab_size} weights is: {pruned_model}")
    return pruned_model
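# Sanity-check sketch (not in the original gist): scores drift slightly after
# pruning, both because small weights are dropped and because the l2
# normalization in TfidfVectorizer is now computed over fewer features.
# `sample_texts` is whatever held-out list of strings you have available.
def compare_pruned_scores(model, pruned_model, sample_texts):
    original_scores = model.decision_function(sample_texts)
    pruned_scores = pruned_model.decision_function(sample_texts)
    print(f"score correlation: {np.corrcoef(original_scores, pruned_scores)[0, 1]:.4f}")
    return original_scores, pruned_scores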
def add_extra_weights(model, extra_features):
    """Update or add hand-set coefficients, e.g.:
    extra_features = {
        "cat": [1, 2, 3],
        "dog": [5, 6, 7]
    }
    Each value holds one coefficient per class (a single entry for binary LR).
    """
    if extra_features is None:
        extra_features = dict()
    new_model = deepcopy(model)
    # Work on copies so the input model is not mutated in place
    coef = model.named_steps["model"].coef_.copy()
    idf_diag = model.named_steps["tfidf"]._tfidf._idf_diag
    model_vocab = dict(model.named_steps["tfidf"].vocabulary_)
    max_id = max(model_vocab.values())
    existing_features = []
    new_features = []
    new_coef = []
    for k, v in extra_features.items():
        if k in model_vocab:
            # Overwrite the coefficients of a feature the model already knows
            existing_features.append(k)
            coef[:, model_vocab[k]] = np.array(v)
        else:
            # Append a brand-new feature at the end of the vocabulary
            new_features.append(k)
            new_coef.append(v)
            max_id += 1
            model_vocab[k] = max_id
    num_new_features = len(new_features)
    print(f"{len(existing_features)} existing features will be updated.")
    print(f"{num_new_features} new features will be added.")
    if num_new_features:
        # Stack new coefficients as columns: shape (n_classes, num_new_features)
        new_coef = np.vstack(new_coef).T
        coef = np.hstack([coef, new_coef])
    new_vocab_size = len(model_vocab)
    # Extend the idf diagonal; new features get an idf of 1
    new_idf_diag = sparse.diags(
        np.hstack([idf_diag.diagonal(), np.ones(num_new_features)]),
        format="csr", dtype=np.float64
    )
    # Update tfidf step
    new_model.named_steps["tfidf"].vocabulary_ = model_vocab
    new_model.named_steps["tfidf"]._tfidf.n_features_in_ = new_vocab_size
    new_model.named_steps["tfidf"]._tfidf._idf_diag = new_idf_diag
    # Update classifier step
    new_model.named_steps["model"].coef_ = coef
    new_model.named_steps["model"].n_features_in_ = coef.shape[1]
    print(f"New model with {new_vocab_size} weights is: {new_model}")
    return new_model
# df_train is assumed to be a DataFrame with a `text` column and a binary
# `askic_eligible` label column, prepared elsewhere.
model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(class_weight="balanced")),
])
model.fit(df_train.text, df_train.askic_eligible)
model_save_path = f"./model_tfidf_lr_{time.time()}.joblib"
print(model_save_path)
dump(model, model_save_path)
# Inspect the learned weights; pull the fitted steps out of the pipeline
tfidf_vectorizer = model.named_steps["tfidf"]
lr_model = model.named_steps["model"]
df_features = pd.DataFrame(dict(
    feature=tfidf_vectorizer.get_feature_names_out(),
    coef=lr_model.coef_[0]
)).sort_values(by="coef", ascending=False)
top_class_features = pd.concat(dict(
    true_related=df_features.head(20).reset_index(drop=True),
    false_related=df_features.tail(20).reset_index(drop=True)
), axis=1)
# Distribution of log10(|coef|): density (grey) and cumulative (red) histograms
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=False, color="0.5");
plt.hist(np.log10(np.abs(df_features["coef"].values)), bins=np.arange(-10, 2, 0.1), density=True, cumulative=True, histtype="step", color="r");
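# A rough way to pick log_abs_thresh from the distribution above (a sketch,
# not in the original gist): tabulate what fraction of weights each candidate
# threshold would keep.
log_abs_coef = np.log10(np.abs(df_features["coef"].values))
for thresh in [-4, -3, -2, -1]:
    print(f"log_abs_thresh={thresh}: {(log_abs_coef > thresh).mean()*100:.2f}% retained")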
pruned_model = prune_sklearn_model(model, log_abs_thresh=-3)
pruned_model_save_path = model_save_path.replace(".joblib", ".pruned.joblib")
print(pruned_model_save_path)
dump(pruned_model, pruned_model_save_path)
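# Quick check of the on-disk savings (a sketch, not in the original gist;
# the paths are the ones saved above).
import os
original_mb = os.path.getsize(model_save_path) / 1e6
pruned_mb = os.path.getsize(pruned_model_save_path) / 1e6
print(f"original: {original_mb:.1f} MB, pruned: {pruned_mb:.1f} MB")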
""" | |
%18.59 weights will be retained. | |
Pruned model with 3746595 weights is: Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))), | |
('model', LogisticRegression(class_weight='balanced'))]) | |
CPU times: user 1min 18s, sys: 2.25 s, total: 1min 20s | |
Wall time: 1min 17s | |
""" | |
pruned_model = load(pruned_model_save_path)
extra_features = {
    "cat": [1,],
    "dog": [-5,]
}
pruned_model_extra = add_extra_weights(pruned_model, extra_features)
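# Sanity check (a sketch; the example texts are made up): the hand-set weights
# should push "cat"-heavy text towards the positive class and "dog"-heavy text
# away from it, modulo the intercept and any other matching ngrams.
print(pruned_model_extra.decision_function(["cat cat cat", "dog dog dog"]))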
""" | |
Some tips on setting the right value for extra weights: | |
Check the model.named_steps["tfidf"]._tfidf._idf_diag value for those weights | |
If setting new ngram feature with n>1 then see what the highest value for the 1grams | |
Your new weight should have more impact that the 1gram weight * idf value | |
""" |