Skip to content

Instantly share code, notes, and snippets.

View oborchers's full-sized avatar

Oliver Borchers oborchers

View GitHub Profile
# Adapted
for i in xrange(len(sentences)):
if len(sentences[i]) > 0:
# Define a memory view of the sentence indices
sentence_view = sentences[i]
sentence_len = len(sentences[i])
# Pass all arguments to the C-Loop
sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
return output
def sif_embeddings(sentences, model):
cdef int size = model.vector_size
# Unsafe access via pointers
cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
output = np.zeros((len(sentences), size), dtype=np.float32)
cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
cdef INT_t *sentence_view
# Random sparse boolean matrix of exclusion flags (True = excluded)
exclusion_matrix = random_sparse(
    m=replications,  # number of subsampled datasets
    n=len(X),  # length of the training data
    density=p_excluded / 100.0,  # p_excluded is a percentage; density is a fraction
    format="csr",
    dtype=bool,
    random_state=random_state,
)
exog_perf = [] # Exogenous performance
# Iterate rows of exclusion matrix and estimate model on reduced data
for row in trange(excl_mat.shape[0]):
# Get exclusion mask and invert to inclusion mask (True = included)
# Inversion is necessary for numpy indexing with boolean arrays
mask = invert(squeeze(excl_mat[row].toarray()))
# Fit arbitrary estimator
est = LinearRegression(n_jobs=-1)
topn = 50
# argsort sorts ascending (assuming NumPy's argsort), so after negating coef_
# the last `topn` positions hold the indices of the most negative coefficients.
top_idx = argsort(-est.coef_)[-topn:]
# Count how many of the selected indices also appear in data["noise_index"]
print(len(set(top_idx) & set(data["noise_index"])))
@misc{Borchers2020,
abstract = {This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.},
author = {Borchers, Oliver and Ringel, Daniel M.},
booktitle = {Towards Data Science},
title = {{Your Labels and Data are Noisy? LASSO The Traitors!}},
url = {https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc},
year = {2020}
}
TY - ICOMM
T1 - Your Labels and Data are Noisy? LASSO The Traitors!
A1 - Borchers, Oliver
A1 - Ringel, Daniel M.
Y1 - 2020///
JF - Towards Data Science
UR - https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc
N2 - This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.
ER -
# Refresh the apt package index
apt-get update
# Install ONNX ML
# ONNX_ML is exported before the install so the pip process can see it
# (presumably read by the onnx build -- confirm against the onnx docs)
export ONNX_ML=1
pip install -U onnx
# Clone ONNX Runtime and build
# Only the v1.7.1 branch is fetched (--single-branch)
git clone https://github.com/microsoft/onnxruntime.git --branch v1.7.1 --single-branch
# Run the dependency install script that ships inside the cloned repository
/bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
# Subsequent commands run from inside the checkout
cd onnxruntime/
# Script to export a transformers model
# Model identifier; also reused below as the tokenizer name and in the output path
model_name = "sentence-transformers/bert-base-nli-stsb-mean-tokens"
pipeline_name = "feature-extraction"
# Target file for the ONNX export; the "/" in model_name adds a sub-directory
model_pth = Path(f"encoder/{model_name}.onnx")
# NOTE(review): device=0 presumably selects the first CUDA device -- confirm
nlp = transformers.pipeline(
    pipeline_name,
    model=model_name,
    tokenizer=model_name,
    device=0,
)
tokenizer = nlp.tokenizer
# Delete any file already present at the target path
if model_pth.exists():
    model_pth.unlink()
# Execution providers handed to ONNX Runtime: CUDA is listed first, CPU second
# (ONNX Runtime treats the list as a priority order).
ONNX_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider"]

# Open an inference session on the model file at model_pth
opt = rt.SessionOptions()
sess = rt.InferenceSession(str(model_pth), opt, providers=ONNX_PROVIDERS)

# Tokenize the span and promote every tokenizer output to an array of at least 2 dimensions
model_input = {
    name: np.atleast_2d(value)
    for name, value in tokenizer.encode_plus(span).items()
}

# Passing None as the first argument requests all model outputs
onnx_result = sess.run(None, model_input)
print(onnx_result[0].shape)