mbednarski/pairs.py

## pairs.py
window_size = 2
idx_pairs = []
# Process one sentence at a time; a window is always bounded by its sentence.
for sentence in tokenized_corpus:
    # Look up every word of the sentence in word2idx.
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Each position in the sentence takes a turn as the center word.
    for center_pos, center_idx in enumerate(indices):
        # Clamp the window to the sentence so no out-of-range position is visited.
        first = max(0, center_pos - window_size)
        last = min(sentence_len, center_pos + window_size + 1)
        for context_pos in range(first, last):
            # The center word is never paired with itself.
            if context_pos != center_pos:
                idx_pairs.append((center_idx, indices[context_pos]))

# Keep the (center, context) pairs as a numpy array for later use.
idx_pairs = np.array(idx_pairs)
	window_size = 2  # positions considered on each side of the center word
	idx_pairs = []  # collected (center index, context index) tuples
	# for each sentence
	for sentence in tokenized_corpus:
		# look up every word of the sentence in word2idx
		indices = [word2idx[word] for word in sentence]
		# for each word, treated as the center word
		for center_word_pos in range(len(indices)):
			# for each window position (offsets -window_size .. +window_size)
			for w in range(-window_size, window_size + 1):
				context_word_pos = center_word_pos + w
				# make sure we do not step outside the sentence, and skip the
				# center word itself (w == 0)
				if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
					continue
				context_word_idx = indices[context_word_pos]
				idx_pairs.append((indices[center_word_pos], context_word_idx))

	idx_pairs = np.array(idx_pairs)  # it will be useful to have this as a numpy array