Oliver Borchers oborchers

## LTT_3.py
topn = 50
# Order the feature indices by the negated coefficients and keep the last
# `topn` positions of that ordering.
# NOTE(review): assuming `argsort` is numpy's ascending argsort, the trailing
# positions of argsort(-coef_) hold the *smallest* coefficients - confirm that
# this is the intended selection.
top_idx = argsort(-est.coef_)[-topn:]
# Report how many of the selected indices also appear in data["noise_index"]
print(len(set(top_idx) & set(data["noise_index"])))

## LTT_2.py
exog_perf = []  # Exogenous performance

# One pass per row of the exclusion matrix; each row describes one reduced dataset
for row in trange(excl_mat.shape[0]):
    # Densify the current row and drop its singleton dimension (True = excluded)
    excluded = squeeze(excl_mat[row].toarray())
    # Boolean-array indexing keeps the True entries, so flip the exclusion
    # vector into an inclusion mask (True = included)
    mask = invert(excluded)

    # Fit arbitrary estimator
    est = LinearRegression(n_jobs=-1)

## LTT_1.py
# Random boolean matrix marking which observations each replication leaves out
exclusion_matrix = random_sparse(
    m=replications,              # number of subsampled datasets
    n=len(X),                    # length of the training data
    density=p_excluded / 100.0,  # p_excluded is a percentage; density is a fraction
    format="csr",
    dtype=bool,                  # True = excluded
    random_state=random_state,
)

## SIF_Variant_7.pyx
def sif_embeddings(sentences, model):
    # Set-up of the pointer-based variant: caches the vector size, takes raw
    # data pointers to the SIF vectors and to a freshly allocated output
    # matrix, and declares the per-sentence index pointer. Nothing is computed
    # or returned within this snippet.
    cdef int size = model.vector_size
    # Unsafe access via pointers: PyArray_DATA hands out the array's data
    # buffer, which is then reinterpreted as REAL_t values.
    # NOTE(review): presumably sif_vectors is C-contiguous with a dtype
    # matching REAL_t - confirm against the caller.
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))

    # Zero-initialised float32 matrix: one row of length `size` per sentence
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Raw pointer to the data buffer of `output`
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))

    # Declared only; not assigned within this snippet
    cdef INT_t *sentence_view


## SIF_Variant_6.pyx
# Adapted
# Hand every non-empty sentence to the C loop, then return `output`.
# `size`, `vectors`, `sv` and `output` are defined outside this snippet.
for i in xrange(len(sentences)):
  # Empty sentences are skipped entirely
  if len(sentences[i]) > 0:
    # Define a memory view of the sentence indices
    sentence_view = sentences[i]
    sentence_len = len(sentences[i])
    # Pass all arguments to the C-Loop; `i` is the position of the sentence
    # within `sentences`
    sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
return output

## SIF_Variant_5.pyx
def sif_embeddings(sentences, model):
    # Set-up of the typed-memoryview variant: declares the C-level locals and
    # allocates the output matrix. Nothing is computed or returned within
    # this snippet.
    cdef int size = model.vector_size
    # 2-D float memoryview over the pre-computed SIF vectors
    cdef float[:,:] vectors = model.wv.sif_vectors

    # C integer locals; only `count` receives an initial value (0) here
    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    # Local alias for np.sum
    np_sum = np.sum

    # Zero-initialised float32 matrix: one row of length `size` per sentence
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # 2-D float memoryview over `output`
    cdef float[:,:] sv = output

## SIF_Variant_4.py
def sif_embeddings(sentences, model):
    """Average the pre-weighted SIF vectors of every sentence.

    Uses a pre-computed list of indices per sentence and skips the use of
    strings altogether.

    Args:
        sentences: Sized iterable of index sequences, one per sentence. Each
            sequence indexes the rows of ``model.wv.sif_vectors``.
        model: Object exposing ``vector_size`` and ``wv.sif_vectors``.

    Returns:
        Array of shape ``(len(sentences), model.vector_size)`` and dtype
        ``REAL``. Rows that belong to empty sentences stay all-zero.
    """
    vectors = model.wv.sif_vectors
    output = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
    for i, s in enumerate(sentences):
        n_words = len(s)
        # Use len() rather than truthiness: `s` may be a numpy array.
        if n_words == 0:
            # Nothing to average, the row is already zero. Skipping also
            # avoids fancy-indexing with an empty array of non-integer dtype
            # (e.g. np.array([])), which raises IndexError.
            continue
        output[i] = np.sum(vectors[s], axis=0) * (1 / n_words)
    # `output` was allocated as REAL, so no extra astype() copy is needed.
    return output

## SIF_Variant_3.py
def sif_embeddings(sentences, model):
	"""Look up the vocabulary index of every in-vocabulary word per sentence.

	Variant that precomputes the sif_vectors in a separate matrix. Only the
	index lookup is visible in this snippet: nothing is appended to
	``output`` and nothing is returned here.
	"""
	vlookup = model.wv.vocab
	vectors = model.wv.sif_vectors
	# The sif_vectors are pre-computed as:
	# sif_vectors = (model.wv.vectors * model.wv.sif[:, None])
	output = []
	for s in sentences:
	    # Words that are not in the vocabulary are dropped
	    idx = [vlookup[w].index for w in s if w in vlookup]

## SIF_Variant_2.py
def sif_embeddings(sentences, model, alpha=1e-3):
	"""Look up the vocabulary index of every in-vocabulary word per sentence.

	Variant that precomputes the indices of the sentences so that numpy
	indexing can be used to directly multiply and sum the vectors. Only the
	index lookup is visible in this snippet: ``alpha`` is not used, nothing
	is appended to ``output`` and nothing is returned here.
	"""
	vlookup = model.wv.vocab
	# Note: despite its name, `vectors` is bound to the whole `model.wv` object
	vectors = model.wv
	output = []
	for s in sentences:
	    # Pre-compute sentence indices; words not in the vocabulary are dropped
	    idx = [vlookup[w].index for w in s if w in vlookup]
	    # Note: vectors.sif is a pre-computed numpy array containing the weights for all the word-vectors.

## SIF_Variant_1.py
# Accumulate the weighted vectors of all in-vocabulary words of sentence `s`.
# `s`, `v`, `count`, `alpha`, `Z`, `vlookup` and `vectors` are defined outside
# this snippet.
for w in s:
  if w in vlookup:
    # The loop over the vector dimensions is completely unnecessary and extremely slow
    # Word weight: alpha / (alpha + count(w) / Z)
    # NOTE(review): presumably Z is the total word count of the corpus - confirm.
    v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w]
    # Number of words that contributed to `v`
    count += 1
	topn = 50
	# Order the feature indices by the negated coefficients and keep the last
	# `topn` positions of that ordering.
	# NOTE(review): assuming `argsort` is numpy's ascending argsort, the trailing
	# positions of argsort(-coef_) hold the *smallest* coefficients - confirm that
	# this is the intended selection.
	top_idx = argsort(-est.coef_)[-topn:]
	# Report how many of the selected indices also appear in data["noise_index"]
	print(len(set(top_idx) & set(data["noise_index"])))
	exog_perf = []  # Exogenous performance

	# Iterate rows of exclusion matrix and estimate model on reduced data
	for row in trange(excl_mat.shape[0]):
		# Get exclusion mask and invert to inclusion mask (True = included)
		# Inversion is necessary for numpy indexing with boolean arrays
		mask = invert(squeeze(excl_mat[row].toarray()))

		# Fit arbitrary estimator
		est = LinearRegression(n_jobs=-1)
	# Random boolean matrix marking which observations each replication leaves out
	exclusion_matrix = random_sparse(
		m=replications,  # number of subsampled datasets
		n=len(X),  # length of the training data
		density=p_excluded / 100.0,  # p_excluded is a percentage; density is a fraction
		format="csr",
		dtype=bool,  # True = excluded
		random_state=random_state,
	)
	def sif_embeddings(sentences, model):
		# Set-up of the pointer-based variant: caches the vector size, takes raw
		# data pointers to the SIF vectors and to a freshly allocated output
		# matrix, and declares the per-sentence index pointer. Nothing is
		# computed or returned within this snippet.
		cdef int size = model.vector_size
		# Unsafe access via pointers: PyArray_DATA hands out the array's data
		# buffer, so both the declaration and the cast must be pointer types.
		# NOTE(review): presumably sif_vectors is C-contiguous with a dtype
		# matching REAL_t - confirm against the caller.
		cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))

		# Zero-initialised float32 matrix: one row of length `size` per sentence
		output = np.zeros((len(sentences), size), dtype=np.float32)
		# Raw pointer to the data buffer of `output`
		cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))

		# Declared only; not assigned within this snippet
		cdef INT_t *sentence_view
	# Adapted
	# Hand every non-empty sentence to the C loop, then return `output`.
	# `size`, `vectors`, `sv` and `output` are defined outside this snippet.
	for i in xrange(len(sentences)):
		# Empty sentences are skipped entirely
		if len(sentences[i]) > 0:
			# Define a memory view of the sentence indices
			sentence_view = sentences[i]
			sentence_len = len(sentences[i])
			# Pass all arguments to the C-Loop; `i` is the position of the
			# sentence within `sentences`
			sif_embeddings_cloop(size, sentence_view, sentence_len, i, vectors, sv)
	return output
	def sif_embeddings(sentences, model):
		# Set-up of the typed-memoryview variant: declares the C-level locals
		# and allocates the output matrix. Nothing is computed or returned
		# within this snippet.
		cdef int size = model.vector_size
		# 2-D float memoryview over the pre-computed SIF vectors
		cdef float[:,:] vectors = model.wv.sif_vectors

		# C integer locals; only `count` receives an initial value (0) here
		cdef int sentence_index, word_index, d, count = 0
		cdef float inv = 1.
		# Local alias for np.sum
		np_sum = np.sum

		# Zero-initialised float32 matrix: one row of length `size` per sentence
		output = np.zeros((len(sentences), size), dtype=np.float32)
		# 2-D float memoryview over `output`
		cdef float[:,:] sv = output
	def sif_embeddings(sentences, model):
		"""Average the pre-weighted SIF vectors of every sentence.

		Uses a pre-computed list of indices per sentence and skips the use of
		strings altogether.

		Args:
			sentences: Sized iterable of index sequences, one per sentence.
				Each sequence indexes the rows of ``model.wv.sif_vectors``.
			model: Object exposing ``vector_size`` and ``wv.sif_vectors``.

		Returns:
			Array of shape ``(len(sentences), model.vector_size)`` and dtype
			``REAL``. Rows that belong to empty sentences stay all-zero.
		"""
		vectors = model.wv.sif_vectors
		output = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
		for i, s in enumerate(sentences):
			n_words = len(s)
			# Use len() rather than truthiness: `s` may be a numpy array.
			if n_words == 0:
				# Nothing to average, the row is already zero. Skipping also
				# avoids fancy-indexing with an empty array of non-integer
				# dtype (e.g. np.array([])), which raises IndexError.
				continue
			output[i] = np.sum(vectors[s], axis=0) * (1 / n_words)
		# `output` was allocated as REAL, so no extra astype() copy is needed.
		return output
	def sif_embeddings(sentences, model):
		"""Look up the vocabulary index of every in-vocabulary word per sentence.

		Variant that precomputes the sif_vectors in a separate matrix. Only
		the index lookup is visible in this snippet: nothing is appended to
		``output`` and nothing is returned here.
		"""
		vlookup = model.wv.vocab
		vectors = model.wv.sif_vectors
		# The sif_vectors are pre-computed as:
		# sif_vectors = (model.wv.vectors * model.wv.sif[:, None])
		output = []
		for s in sentences:
			# Words that are not in the vocabulary are dropped
			idx = [vlookup[w].index for w in s if w in vlookup]
	def sif_embeddings(sentences, model, alpha=1e-3):
		"""Look up the vocabulary index of every in-vocabulary word per sentence.

		Variant that precomputes the indices of the sentences so that numpy
		indexing can be used to directly multiply and sum the vectors. Only
		the index lookup is visible in this snippet: ``alpha`` is not used,
		nothing is appended to ``output`` and nothing is returned here.
		"""
		vlookup = model.wv.vocab
		# Note: despite its name, `vectors` is bound to the whole `model.wv` object
		vectors = model.wv
		output = []
		for s in sentences:
			# Pre-compute sentence indices; words not in the vocabulary are dropped
			idx = [vlookup[w].index for w in s if w in vlookup]
			# Note: vectors.sif is a pre-computed numpy array containing the weights for all the word-vectors.
	# Accumulate the weighted vectors of all in-vocabulary words of sentence `s`.
	# `s`, `v`, `count`, `alpha`, `Z`, `vlookup` and `vectors` are defined
	# outside this snippet.
	for w in s:
		if w in vlookup:
			# The loop over the vector dimensions is completely unnecessary and extremely slow
			# Word weight: alpha / (alpha + count(w) / Z)
			# NOTE(review): presumably Z is the total word count of the corpus - confirm.
			v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w]
			# Number of words that contributed to `v`
			count += 1