This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted
# Excerpt of a loop: `sentences`, `size`, `vectors`, `sv` and `output` are
# defined outside the lines shown here.
for i in xrange(len(sentences)):
    # Empty sentences are skipped; the C loop is never called for them.
    if len(sentences[i]) > 0:
        # Define a memory view of the sentence indices
        sentence_view = sentences[i]
        sentence_len = len(sentences[i])
        # Pass all arguments to the C-Loop
        sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
return output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model):
    # Width of a single vector, read from the model.
    cdef int size = model.vector_size
    # Unsafe access via pointers: raw data pointer into model.wv.sif_vectors
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
    # One float32 row per sentence, initialised to zero.
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Raw data pointer into `output`.
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sparse boolean matrix: one row per replication, one column per observation.
exclusion_matrix = random_sparse(
    m=replications,  # Number of subsampled datasets
    n=len(X),  # Length of training data
    # p_excluded is given in percent; it is divided by 100 before being
    # passed as the density.
    density=p_excluded / 100.0,
    format="csr",
    dtype=bool,  # True = Excluded
    random_state=random_state,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
exog_perf = []  # Exogenous performance
# Iterate rows of exclusion matrix and estimate model on reduced data
for row in trange(excl_mat.shape[0]):
    # Get exclusion mask and invert to inclusion mask (True = included).
    # Inversion is necessary for numpy indexing with boolean arrays.
    # The sparse row is densified and squeezed before it is inverted.
    mask = invert(squeeze(excl_mat[row].toarray()))
    # Fit arbitrary estimator
    est = LinearRegression(n_jobs=-1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
topn = 50
# argsort(-coef_) orders the indices by descending coefficient, so the last
# `topn` entries are the indices of the `topn` smallest coefficients.
# NOTE(review): assumes est.coef_ is one-dimensional -- confirm.
top_idx = argsort(-est.coef_)[-topn:]
# Count how many of the selected indices also appear in data["noise_index"].
print(len(set(top_idx).intersection(set(data["noise_index"]))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@misc{Borchers2020,
abstract = {This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.},
author = {Borchers, Oliver and Ringel, Daniel M.},
booktitle = {Towards Data Science},
title = {{Your Labels and Data are Noisy? LASSO The Traitors!}},
url = {https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc},
year = {2020}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
TY  - ICOMM
T1  - Your Labels and Data are Noisy? LASSO The Traitors!
A1  - Borchers, Oliver
A1  - Ringel, Daniel M.
Y1  - 2020///
JF  - Towards Data Science
UR  - https://medium.com/@oliverbor/lasso-the-traitors-dd33ea5942bc
N2  - This article develops the LASSO The Traitors (LTT) method. LTT filters out noisy observations from a dataset based on an exogenous performance metric. LTT significantly improves the performance of estimators based on the cleaned dataset. LTT is fast, easily applicable, and task agnostic.
ER  - 
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
apt-get update
# Install ONNX ML
export ONNX_ML=1
pip install -U onnx
# Clone ONNX Runtime and build
# Only the v1.7.1 tag is fetched (--branch together with --single-branch).
git clone https://github.com/microsoft/onnxruntime.git --branch v1.7.1 --single-branch
# Run the dependency installer shipped inside the cloned repository.
/bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
cd onnxruntime/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to export a transformers model
model_name = "sentence-transformers/bert-base-nli-stsb-mean-tokens"
pipeline_name = "feature-extraction"
# model_name contains a "/", so the target path has a sub-directory
# below encoder/.
model_pth = Path(f"encoder/{model_name}.onnx")
nlp = transformers.pipeline(
    pipeline_name,
    model=model_name,
    tokenizer=model_name,
    device=0,
)
tokenizer = nlp.tokenizer
# Remove the file at the target path if one is already there.
if model_pth.exists():
    model_pth.unlink()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Execution providers in order of preference: CUDA first, CPU as fallback.
ONNX_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider"]
opt = rt.SessionOptions()
sess = rt.InferenceSession(str(model_pth), opt, providers=ONNX_PROVIDERS)
model_input = tokenizer.encode_plus(span)
# Promote every tokenizer output to an array with at least two dimensions.
model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
# Passing None as the first argument requests all model outputs.
onnx_result = sess.run(None, model_input)
print(onnx_result[0].shape)