Skip to content

Instantly share code, notes, and snippets.

@nsorros
nsorros / load_data.py
Created May 17, 2023 06:27
Load token classification data to Argilla
import os
import argilla as rg
rg.init(
api_url="https://pro.argilla.io",
api_key=os.environ.get("ARGILLA_API_KEY"),
workspace="mantisnlp",
#extra_headers={"X-Argilla-Workspace": "my_connection_headers"}
Y_pred_proba = load_npz(y_pred_path).tocsc()
Y_test = load_npz(y_test_path).tocsc()
if fp(optimal_thresholds_star) > fp(optimal_thresholds):
optimal_thresholds = optimal_thresholds_star
y_pred = y_pred_proba > optimal_thresholds[k]
cmk = confusion_matrix(y_test, y_pred)
mlcm[k,:,:] = cmk
updated = True
updated = False
for k in range(N):
start = time.time()
y_pred_proba = np.array(Y_pred_proba[:,k].todense()).ravel()
y_test = np.array(Y_test[:,k].todense()).ravel()
fp = partial(f, y_pred_proba, y_test, mlcm, k)
optimal_thresholds_star = argmaxf1(y_pred_proba, y_test, optimal_thresholds, mlcm, k, nb_thresholds)
def confusion_matrix(y_test, y_pred):
    """Return the 2x2 confusion matrix for a single pair of label vectors.

    The counts are derived from the dot product and the sums of the two
    inputs, so this assumes both vectors hold 0/1 (or boolean) indicators
    of equal length.

    Args:
        y_test: 1-D array of ground-truth indicators.
        y_pred: 1-D array of predicted indicators.

    Returns:
        ``np.array([[tn, fp], [fn, tp]])``.
    """
    # np.dot on two boolean vectors reduces with logical and/or and yields a
    # single bool instead of a count, so promote one side to integers first.
    if y_test.dtype == np.bool_ and y_pred.dtype == np.bool_:
        y_test = y_test.astype(np.intp)

    tp = y_test.dot(y_pred)
    fp = y_pred.sum() - tp
    fn = y_test.sum() - tp
    # Every sample that is not a tp, fp or fn is a true negative.
    tn = y_test.shape[0] - tp - fp - fn
    return np.array([[tn, fp], [fn, tp]])
def f(Y_pred_proba, Y_test, mlcm, k, thresholds):
y_pred_proba = np.array(Y_pred_proba[:,k].todense()).ravel()
y_test = np.array(Y_test[:,k].todense()).ravel()
def f(Y_pred_proba, Y_test, thresholds):
    """Return the F1 score obtained by thresholding the predicted scores.

    Predictions are ``Y_pred_proba > thresholds``. The per-label confusion
    matrices returned by ``multilabel_confusion_matrix`` are summed over
    axis 0, and F1 is computed from the pooled counts as
    ``tp / (tp + (fp + fn) / 2)``.

    Args:
        Y_pred_proba: predicted scores, compared element-wise with
            ``thresholds``.
        Y_test: ground-truth labels, passed to
            ``multilabel_confusion_matrix`` unchanged.
        thresholds: decision threshold(s) applied to ``Y_pred_proba``.

    Returns:
        The pooled F1 score, or ``0.0`` when there are no true positives,
        false positives or false negatives (the score is undefined there).
    """
    Y_pred = Y_pred_proba > thresholds
    mlcm = multilabel_confusion_matrix(Y_test, Y_pred)
    # Pool the per-label 2x2 matrices into one before scoring; the
    # true-negative count does not enter the F1 formula.
    _, fp, fn, tp = mlcm.sum(axis=0).ravel()
    denominator = tp + (fp + fn) / 2
    # Avoid a 0/0 division (NaN plus a RuntimeWarning) on degenerate input;
    # NaN would also silently lose every ``>`` comparison made by a caller.
    if denominator == 0:
        return 0.0
    return tp / denominator
def multilabel_confusion_matrix(Y_test, Y_pred):
    """Return one 2x2 confusion matrix per label (column).

    Assumes ``Y_test`` and ``Y_pred`` are 2-D 0/1 (or boolean) indicator
    matrices of identical shape with one column per label. Both scipy
    sparse matrices and dense numpy arrays are accepted.

    Args:
        Y_test: ground-truth indicator matrix.
        Y_pred: predicted indicator matrix.

    Returns:
        Array of shape ``(n_labels, 2, 2)`` where entry ``k`` is
        ``[[tn, fp], [fn, tp]]`` for column ``k``.
    """
    # Sparse matrices expose the element-wise product as ``.multiply``;
    # plain ndarrays do not, so fall back to numpy for fully dense input.
    if hasattr(Y_test, "multiply"):
        overlap = Y_test.multiply(Y_pred)
    elif hasattr(Y_pred, "multiply"):
        overlap = Y_pred.multiply(Y_test)
    else:
        overlap = np.multiply(Y_test, Y_pred)

    # Sparse column sums come back as a (1, n_labels) np.matrix; flatten
    # everything to 1-D so the arithmetic and stacking below are uniform.
    tp = np.asarray(overlap.sum(axis=0)).ravel()
    fp = np.asarray(Y_pred.sum(axis=0)).ravel() - tp
    fn = np.asarray(Y_test.sum(axis=0)).ravel() - tp
    tn = Y_test.shape[0] - tp - fp - fn
    # Rows of the stacked array are (tn, fp, fn, tp) for each label.
    return np.stack([tn, fp, fn, tp], axis=1).reshape(-1, 2, 2)
@nsorros
nsorros / optimize_threshold_custom_f1.py
Created February 14, 2022 10:33
Adds a custom F1 score to the threshold-optimisation implementation for multilabel classifiers described in "Threshold optimisation for multi label classifier"
from functools import partial
import time
from sklearn.metrics import multilabel_confusion_matrix
from scipy.sparse import load_npz
import numpy as np
import typer
if "line_profiler" not in dir() and "profile" not in dir():
# no-op profile decorator
@nsorros
nsorros / optimise_threshold_naive.py
Last active February 14, 2022 12:53
Naive implementation of threshold optimisation for multilabel classifiers, as described in "Threshold optimisation for multi label classifier"
from functools import partial
import time
from sklearn.metrics import f1_score
from scipy.sparse import load_npz
import numpy as np
import typer
def f(Y_pred_proba, Y_test, thresholds):
@nsorros
nsorros / train_config.py
Last active April 22, 2021 16:15
Train with config
import configparser
import argparse
def train(data_path, model_path, learning_rate, batch_size):
    """Training entry point; the body is a placeholder in this snippet.

    Args:
        data_path: presumably where the training data is read from -- TODO
            confirm against the full implementation.
        model_path: presumably where the trained model is written -- TODO
            confirm.
        learning_rate: presumably the optimiser learning rate -- TODO confirm.
        batch_size: presumably the training batch size -- TODO confirm.
    """
    ...
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--config", type=str, help="path to config file")
args = argparser.parse_args()