This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of ranked observations to inspect.
topn = 50
# Rank observations by the fitted coefficients. The argsort runs over the
# negated coefficients, so the resulting order goes from the largest
# coefficient to the smallest.
# NOTE(review): assuming `argsort` is NumPy's ascending argsort, the trailing
# slice `[-topn:]` keeps the `topn` SMALLEST coefficients -- confirm that this
# is the intended end of the ranking.
top_idx = argsort(-est.coef_)[-topn:]
# Report how many of the selected indices also appear in data["noise_index"].
print(len(set(top_idx).intersection(data["noise_index"])))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
exog_perf = [] # Exogenous performance | |
# Iterate rows of exclusion matrix and estimate model on reduced data | |
for row in trange(excl_mat.shape[0]): | |
# Get exclusion mask and invert to inclusion mask (True = included) | |
# Inversion is necessary for numpy indexing with boolean arrays | |
mask = invert(squeeze(excl_mat[row].toarray())) | |
# Fit arbitrary estimator | |
est = LinearRegression(n_jobs=-1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sparse boolean matrix with one row per subsampled dataset and one column per
# training observation; a stored True marks an observation as excluded.
# NOTE(review): `random_sparse` is presumably an alias of scipy.sparse.random
# -- confirm against the file's imports.
exclusion_matrix = random_sparse(
    m=replications,  # Number of subsampled datasets
    n=len(X),  # Length of training data
    density=p_excluded / 100.0,  # Percentage of excluded observations, as a fraction
    format="csr",
    dtype=bool,  # True = Excluded
    random_state=random_state,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model): | |
cdef int size = model.vector_size | |
# Unsafe access via pointers | |
cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors)) | |
output = np.zeros((len(sentences), size), dtype=np.float32) | |
cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output)) | |
cdef INT_t *sentence_view | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted | |
for i in xrange(len(sentences)): | |
if len(sentences[i]) > 0: | |
# Define a memory view of the sentence indices | |
sentence_view = sentences[i] | |
sentence_len = len(sentences[i]) | |
# Pass all arguments to the C-Loop | |
sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv) | |
return output |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model): | |
cdef int size = model.vector_size | |
cdef float[:,:] vectors = model.wv.sif_vectors | |
cdef int sentence_index, word_index, d, count = 0 | |
cdef float inv = 1. | |
np_sum = np.sum | |
output = np.zeros((len(sentences), size), dtype=np.float32) | |
cdef float[:,:] sv = output |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model):
    """Average pre-weighted word vectors for sentences given as index lists.

    Uses a pre-computed list of indices per sentence and skips the use of
    strings altogether.

    Parameters
    ----------
    sentences : sequence of integer index sequences
        Each entry holds the row indices into ``model.wv.sif_vectors`` of the
        words of one sentence. Entries may be empty.
    model : object
        Must expose ``vector_size`` and ``wv.sif_vectors`` (a 2-D array that
        is indexed by the sentence indices).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), model.vector_size)`` and dtype
        ``REAL``. Row ``i`` is the mean of the vectors selected by
        ``sentences[i]``; rows of empty sentences stay all-zero.
    """
    vectors = model.wv.sif_vectors
    output = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
    for i, s in enumerate(sentences):
        n_words = len(s)
        if n_words == 0:
            # Nothing to average: keep the zero row. Skipping also avoids
            # indexing with an empty float-dtype array, which NumPy rejects.
            continue
        # 1.0 / n_words keeps true division even under Python 2 semantics.
        output[i] = np.sum(vectors[s], axis=0) * (1.0 / n_words)
    # `output` was allocated with dtype REAL, so no further cast is needed.
    return output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model): | |
""" Precomputes the sif_vectors in a separate matrix | |
""" | |
vlookup = model.wv.vocab | |
vectors = model.wv.sif_vectors | |
# The sif_vectors are pre-computed as: | |
# sif_vectors = (model.wv.vectors * model.wv.sif[:, None]) | |
output = [] | |
for s in sentences: | |
idx = [vlookup[w].index for w in s if w in vlookup] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sif_embeddings(sentences, model, alpha=1e-3): | |
""" Precomputes the indices of the sentences and uses the numpy indexing to directly multiply and sum the vectors | |
""" | |
vlookup = model.wv.vocab | |
vectors = model.wv | |
output = [] | |
for s in sentences: | |
# Pre-compute sentence indices | |
idx = [vlookup[w].index for w in s if w in vlookup] | |
# Note: vectors.sif is a pre-computed numpy array containing the weights for all the word-vectors. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for w in s: | |
if w in vlookup: | |
# The loop over the vector dimensions is completely unnecessary and extremely slow | |
v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w] | |
count += 1 |