Skip to content

Instantly share code, notes, and snippets.

View oborchers's full-sized avatar

Oliver Borchers oborchers

View GitHub Profile
# Number of top-ranked features to inspect
topn = 50
# Negate coef_ so that argsort (which sorts ascending) yields indices in
# DESCENDING coefficient order — the largest coefficients come first.
# BUGFIX: the original took [-topn:] on top of the negation, which selected
# the *smallest* coefficients; with -coef_ the top features are at the front.
top_idx = argsort(-est.coef_)[:topn]
# Count how many of the top-ranked features are known noise features
# (data["noise_index"] presumably holds the indices of injected noise columns
# — TODO confirm against the data-generation code)
print(
    len(
        set(top_idx).intersection(set(data["noise_index"]))
    )
)
exog_perf = []  # Exogenous performance (collected across replications)
# Iterate rows of exclusion matrix and estimate model on reduced data
for row in trange(excl_mat.shape[0]):
    # Get exclusion mask and invert to inclusion mask (True = included)
    # Inversion is necessary for numpy indexing with boolean arrays
    mask = invert(squeeze(excl_mat[row].toarray()))
    # Fit arbitrary estimator; n_jobs=-1 uses all available cores
    est = LinearRegression(n_jobs=-1)
    # NOTE(review): the loop body appears truncated in this excerpt — the
    # actual fit on the masked data and the append to exog_perf are missing;
    # verify against the original source.
# Build a sparse boolean exclusion mask with one row per replication; a True
# entry marks an observation as excluded from that replication's training set.
# NOTE(review): random_sparse is presumably scipy.sparse.random — confirm the
# import alias; density is a fraction, hence the /100. on the percentage.
exclusion_matrix = random_sparse(
    m = replications, # Number of subsampled datasets
    n = len(X), # Length of training data
    density = p_excluded/100., # Percentage of excluded observations
    format = "csr", # CSR allows efficient row slicing downstream
    dtype = bool, # True = Excluded
    random_state=random_state, # Fix seed for reproducible subsampling
)
def sif_embeddings(sentences, model):
    """Compute SIF sentence embeddings by delegating the inner sum to a C loop.

    Each sentence is expected to already be a sequence of word indices into
    model.wv.sif_vectors (the pre-weighted word-vector matrix).
    Returns a (len(sentences), model.vector_size) float32 array.
    NOTE(review): Cython code — REAL_t/INT_t and sif_embeddings_cloop are
    defined elsewhere in the original .pyx file.
    """
    cdef int size = model.vector_size
    # Unsafe access via pointers: raw float pointer into the (assumed
    # C-contiguous float32 — TODO confirm) sif_vectors array
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Raw pointer into the output buffer; the C loop writes results in place
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
    # Adapted (from the pure-Python variant of this function)
    for i in xrange(len(sentences)):
        if len(sentences[i]) > 0:
            # Define a memory view of the sentence indices
            sentence_view = sentences[i]
            sentence_len = len(sentences[i])
            # Pass all arguments to the C-Loop, which accumulates the
            # weighted average for row i directly into sv
            sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
    return output
def sif_embeddings(sentences, model):
    """Compute SIF sentence embeddings using Cython typed memoryviews
    (a safer alternative to the raw-pointer variant).
    """
    cdef int size = model.vector_size
    # Typed memoryview over the pre-weighted word-vector matrix
    cdef float[:,:] vectors = model.wv.sif_vectors
    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    # Bind np.sum to a local name to avoid repeated attribute lookups
    np_sum = np.sum
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Memoryview over the output buffer; results are written in place
    cdef float[:,:] sv = output
    # NOTE(review): the function body appears truncated in this excerpt —
    # the accumulation loop and return statement are missing; verify against
    # the original source.
def sif_embeddings(sentences, model):
    """Uses a pre-computed list of indices and skips the use of strings altogether.

    Parameters
    ----------
    sentences : iterable of sequences of int
        Each sentence is a sequence of row indices into model.wv.sif_vectors.
    model : object
        Must expose vector_size and wv.sif_vectors (the pre-weighted
        word-vector matrix).

    Returns
    -------
    numpy.ndarray of shape (len(sentences), model.vector_size), dtype REAL.
        Row i is the average of the weighted vectors of sentence i; empty
        sentences yield a zero row.
    """
    vectors = model.wv.sif_vectors
    output = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
    for i, s in enumerate(sentences):
        # Empty sentences keep their zero row; for non-empty ones average the
        # selected rows. BUGFIX: use 1.0 (not 1) so the division is a true
        # division under Python 2 as well — 1/len(s) truncated to 0 there.
        if len(s) > 0:
            output[i] = np.sum(vectors[s], axis=0) * (1.0 / len(s))
    # output is already dtype REAL, so the original trailing .astype(REAL)
    # only produced a redundant copy — return the buffer directly.
    return output
def sif_embeddings(sentences, model):
    """Precomputes the sif_vectors in a separate matrix.

    Looks up each word's row index once, then relies on numpy fancy indexing
    into the pre-weighted matrix instead of weighting per word.
    """
    # Vocabulary lookup: maps word string -> vocab object carrying .index
    vlookup = model.wv.vocab
    vectors = model.wv.sif_vectors
    # The sif_vectors are pre-computed as:
    # sif_vectors = (model.wv.vectors * model.wv.sif[:, None])
    output = []
    for s in sentences:
        # Row indices of the in-vocabulary words of this sentence;
        # out-of-vocabulary words are silently skipped
        idx = [vlookup[w].index for w in s if w in vlookup]
        # NOTE(review): this excerpt is truncated — the sum over vectors[idx],
        # the append to output, and the final return are missing; verify
        # against the original source.
def sif_embeddings(sentences, model, alpha=1e-3):
    """Precomputes the indices of the sentences and uses the numpy indexing
    to directly multiply and sum the vectors.

    alpha is the SIF smoothing constant: a word's weight is
    alpha / (alpha + relative frequency).
    """
    # Vocabulary lookup: maps word string -> vocab object (.index, .count)
    vlookup = model.wv.vocab
    vectors = model.wv
    output = []
    for s in sentences:
        # Pre-compute sentence indices
        idx = [vlookup[w].index for w in s if w in vlookup]
        # Note: vectors.sif is a pre-computed numpy array containing the
        # weights for all the word-vectors.
        # NOTE(review): the loop below looks spliced in from an older variant
        # of this function for illustration — `v`, `count`, and `Z` are not
        # defined in this excerpt, and `idx` is never consumed; verify against
        # the original source.
        for w in s:
            if w in vlookup:
                # The loop over the the vector dimensions is completely unecessary and extremely slow
                v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w]
                count += 1