Oliver Borchers oborchers

## SIF_Variant_6.pyx
# Adapted
# Walk over every sentence by position; `i` is also handed to the C loop below
for i in xrange(len(sentences)):
  # Empty sentences are skipped, so the C loop is never called for them
  if len(sentences[i]) > 0:
    # Define a memory view of the sentence indices
    # NOTE(review): `sentence_view` is presumably a typed variable declared outside this excerpt — confirm
    sentence_view = sentences[i]
    sentence_len = len(sentences[i])
    # Pass all arguments to the C-Loop
    sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
# `output` is created outside this excerpt
return output

## SIF_Variant_7.pyx
def sif_embeddings(sentences, model):
    """Prepare the raw buffers used to compute SIF sentence embeddings.

    Parameters
    ----------
    sentences
        Sized collection; ``len(sentences)`` sets the number of output rows.
    model
        Object providing ``vector_size`` and ``wv.sif_vectors``.

    Notes
    -----
    This excerpt ends after the declarations; the code that fills and
    returns ``output`` is not shown here.
    """
    # Embedding width, read once into a C int
    cdef int size = model.vector_size
    # Unsafe access via pointers
    # (raw data pointer of model.wv.sif_vectors; nothing is bounds-checked through it)
    # NOTE(review): assumes sif_vectors stores REAL_t-compatible values — confirm
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))

    # One zero-initialised float32 row of length `size` per sentence
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Raw pointer into `output`: writes through `sv` modify `output` in place
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))

    # Pointer for one sentence's indices; only declared here, not assigned
    cdef INT_t *sentence_view


## LTT_1.py
# Sparse boolean matrix marking which observations each subsample leaves out.
exclusion_matrix = random_sparse(
    m=replications,  # number of subsampled datasets
    n=len(X),  # length of the training data
    density=p_excluded / 100.0,  # p_excluded is a percentage of excluded observations
    format="csr",
    dtype=bool,  # True = excluded
    random_state=random_state,
)

## LTT_2.py
# NOTE(review): nothing is appended to this list within the excerpt — presumably filled later in the loop; confirm
exog_perf = [] # Exogenous performance

# Iterate rows of exclusion matrix and estimate model on reduced data
# NOTE(review): `excl_mat` is not defined in this excerpt; presumably the sparse exclusion matrix — confirm
for row in trange(excl_mat.shape[0]):
    # Get exclusion mask and invert to inclusion mask (True = included)
    # Inversion is necessary for numpy indexing with boolean arrays
    # The selected row is densified with .toarray() before squeeze/invert are applied
    mask = invert(squeeze(excl_mat[row].toarray()))

    # Instantiate an arbitrary estimator (the fit call is not part of this excerpt)
    est = LinearRegression(n_jobs=-1)

## LTT_3.py
topn = 50
# argsort orders ascending, so on the negated coefficients the last `topn`
# positions are the indices of the smallest (most negative) entries of coef_.
# NOTE(review): assumes numpy's argsort — confirm this is the intended selection.
top_idx = argsort(-est.coef_)[-topn:]
# Count how many selected indices also appear in data["noise_index"]
print(len(set(top_idx) & set(data["noise_index"])))

## LTT_cite.bib
@misc{Borchers2020,
abstract = {This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.},
author = {Borchers, Oliver and Ringel, Daniel M.},
booktitle = {Towards Data Science},
title = {{Your Labels and Data are Noisy? LASSO The Traitors!}},
url = {https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc},
year = {2020}
}

## LTT_cite.ris
TY  - ICOMM
T1  - Your Labels and Data are Noisy? LASSO The Traitors!
A1  - Borchers, Oliver
A1  - Ringel, Daniel M.
Y1  - 2020///
JF  - Towards Data Science
UR  - https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc
N2  - This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.
ER  -

## build_onnx.sh
# Refresh the apt package index
apt-get update

# Install ONNX ML: ONNX_ML=1 is exported so the pip build sees it
export ONNX_ML=1
pip install --upgrade onnx

# Clone only v1.7.1 of ONNX Runtime, run its dependency script, enter the checkout
git clone --branch v1.7.1 --single-branch https://github.com/microsoft/onnxruntime.git
/bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
cd onnxruntime/

## convert_graph_to_onnx.py
# Export script for a transformers model: settings first, then pipeline setup
model_name = "sentence-transformers/bert-base-nli-stsb-mean-tokens"
pipeline_name = "feature-extraction"
# model_name contains a "/", so the target sits in a sub-directory of encoder/
model_pth = Path(f"encoder/{model_name}.onnx")

# Model and tokenizer are loaded from the same name; the pipeline runs on device 0
nlp = transformers.pipeline(
    pipeline_name,
    model=model_name,
    tokenizer=model_name,
    device=0,
)
tokenizer = nlp.tokenizer

# Delete whatever already exists at the target path
if model_pth.exists():
    model_pth.unlink()

## load_onnx_model.py
# Execution providers handed to ONNX Runtime: CUDA first, CPU listed after it
ONNX_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider"]
opt = rt.SessionOptions()
sess = rt.InferenceSession(str(model_pth), opt, providers=ONNX_PROVIDERS)

# Tokenize `span` and give every encoded field at least two dimensions
model_input = {
    name: np.atleast_2d(value)
    for name, value in tokenizer.encode_plus(span).items()
}
# No output names are given to run(); the first returned array is inspected below
onnx_result = sess.run(None, model_input)

print(onnx_result[0].shape)
	# Adapted
	# Walk over every sentence by position; `i` is also handed to the C loop below
	for i in xrange(len(sentences)):
		# Empty sentences are skipped, so the C loop is never called for them
		if len(sentences[i]) > 0:
			# Define a memory view of the sentence indices
			# NOTE(review): `sentence_view` is presumably a typed variable declared outside this excerpt — confirm
			sentence_view = sentences[i]
			sentence_len = len(sentences[i])
			# Pass all arguments to the C-Loop
			sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
	# `output` is created outside this excerpt
	return output
	def sif_embeddings(sentences, model):
		"""Prepare the raw buffers used to compute SIF sentence embeddings.

		Parameters
		----------
		sentences
			Sized collection; ``len(sentences)`` sets the number of output rows.
		model
			Object providing ``vector_size`` and ``wv.sif_vectors``.

		Notes
		-----
		This excerpt ends after the declarations; the code that fills and
		returns ``output`` is not shown here.
		"""
		# Embedding width, read once into a C int
		cdef int size = model.vector_size
		# Unsafe access via pointers
		# (raw data pointer of model.wv.sif_vectors; nothing is bounds-checked through it)
		# NOTE(review): assumes sif_vectors stores REAL_t-compatible values — confirm
		cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))

		# One zero-initialised float32 row of length `size` per sentence
		output = np.zeros((len(sentences), size), dtype=np.float32)
		# Raw pointer into `output`: writes through `sv` modify `output` in place
		cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))

		# Pointer for one sentence's indices; only declared here, not assigned
		cdef INT_t *sentence_view
	# Sparse boolean matrix marking which observations each subsample leaves out.
	exclusion_matrix = random_sparse(
		m=replications,  # number of subsampled datasets
		n=len(X),  # length of the training data
		density=p_excluded / 100.0,  # p_excluded is a percentage of excluded observations
		format="csr",
		dtype=bool,  # True = excluded
		random_state=random_state,
	)
	# NOTE(review): nothing is appended to this list within the excerpt — presumably filled later in the loop; confirm
	exog_perf = [] # Exogenous performance

	# Iterate rows of exclusion matrix and estimate model on reduced data
	# NOTE(review): `excl_mat` is not defined in this excerpt; presumably the sparse exclusion matrix — confirm
	for row in trange(excl_mat.shape[0]):
		# Get exclusion mask and invert to inclusion mask (True = included)
		# Inversion is necessary for numpy indexing with boolean arrays
		# The selected row is densified with .toarray() before squeeze/invert are applied
		mask = invert(squeeze(excl_mat[row].toarray()))

		# Instantiate an arbitrary estimator (the fit call is not part of this excerpt)
		est = LinearRegression(n_jobs=-1)
	topn = 50
	# argsort orders ascending, so on the negated coefficients the last `topn`
	# positions are the indices of the smallest (most negative) entries of coef_.
	# NOTE(review): assumes numpy's argsort — confirm this is the intended selection.
	top_idx = argsort(-est.coef_)[-topn:]
	# Count how many selected indices also appear in data["noise_index"]
	print(len(set(top_idx) & set(data["noise_index"])))
	@misc{Borchers2020,
	abstract = {This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.},
	author = {Borchers, Oliver and Ringel, Daniel M.},
	booktitle = {Towards Data Science},
	title = {{Your Labels and Data are Noisy? LASSO The Traitors!}},
	url = {https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc},
	year = {2020}
	}
	TY - ICOMM
	T1 - Your Labels and Data are Noisy? LASSO The Traitors!
	A1 - Borchers, Oliver
	A1 - Ringel, Daniel M.
	Y1 - 2020///
	JF - Towards Data Science
	UR - https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc
	N2 - This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.
	ER -
	# Refresh the apt package index
	apt-get update

	# Install ONNX ML: ONNX_ML=1 is exported so the pip build sees it
	export ONNX_ML=1
	pip install --upgrade onnx

	# Clone only v1.7.1 of ONNX Runtime, run its dependency script, enter the checkout
	git clone --branch v1.7.1 --single-branch https://github.com/microsoft/onnxruntime.git
	/bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
	cd onnxruntime/
	# Export script for a transformers model: settings first, then pipeline setup
	model_name = "sentence-transformers/bert-base-nli-stsb-mean-tokens"
	pipeline_name = "feature-extraction"
	# model_name contains a "/", so the target sits in a sub-directory of encoder/
	model_pth = Path(f"encoder/{model_name}.onnx")

	# Model and tokenizer are loaded from the same name; the pipeline runs on device 0
	nlp = transformers.pipeline(pipeline_name, model=model_name, tokenizer=model_name, device=0)
	tokenizer = nlp.tokenizer

	# Delete whatever already exists at the target path
	if model_pth.exists():
		model_pth.unlink()
	# Execution providers handed to ONNX Runtime: CUDA first, CPU listed after it
	ONNX_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider"]
	opt = rt.SessionOptions()
	sess = rt.InferenceSession(str(model_pth), opt, providers=ONNX_PROVIDERS)

	# Tokenize `span` and give every encoded field at least two dimensions
	model_input = {
		name: np.atleast_2d(value)
		for name, value in tokenizer.encode_plus(span).items()
	}
	# No output names are given to run(); the first returned array is inspected below
	onnx_result = sess.run(None, model_input)

	print(onnx_result[0].shape)