# konverner/word2id.py

## word2id.py
def get_vocab(texts):
  """Build id<->word lookup tables from tokenized texts.

  Ids 0 and 1 are reserved for the special tokens "<pad>" and "<unk>".
  Every other distinct lower-cased word receives the next free id, in
  order of first appearance.

  Args:
    texts: iterable of texts; each text is itself an iterable of words
      (objects providing ``.lower()``).

  Returns:
    A tuple ``(id2word, word2id)`` of dicts that are inverse mappings of
    each other.
  """
  id2word = {0: "<pad>", 1: "<unk>"}
  word2id = {"<pad>": 0, "<unk>": 1}
  for text in texts:
    for word in text:
      word = word.lower()
      if word not in word2id:
        # len(word2id) is always the next unused id, so real words start
        # at 2 and never overwrite the reserved special tokens.
        index = len(word2id)
        word2id[word] = index
        id2word[index] = word
  return id2word, word2id

def encode(text, word2id, size=64):
  """Convert a tokenized text into a tensor of word ids, padded to `size`.

  Words are lower-cased before lookup. Words missing from `word2id` map
  to the "<unk>" id instead of raising KeyError. Sequences shorter than
  `size` are right-padded with the "<pad>" id; longer sequences are
  returned as-is (no truncation).

  Args:
    text: iterable of words (objects providing ``.lower()``).
    word2id: dict mapping word -> integer id.
    size: minimum length of the returned sequence.

  Returns:
    A tensor created by ``torch.tensor`` from the list of ids.
  """
  # Imported locally because the module has no top-level torch import.
  import torch

  # Fall back to the conventional reserved ids if the vocabulary does not
  # list the special tokens explicitly.
  pad_id = word2id.get("<pad>", 0)
  unk_id = word2id.get("<unk>", 1)

  encoded = [word2id.get(word.lower(), unk_id) for word in text]
  # A non-positive multiplier yields an empty list, so long inputs are
  # left untouched.
  encoded.extend([pad_id] * (size - len(encoded)))
  return torch.tensor(encoded)
	def get_vocab(texts):
		"""Build id<->word lookup tables from tokenized texts.

		Ids 0 and 1 are reserved for the special tokens "<pad>" and "<unk>".
		Every other distinct lower-cased word receives the next free id, in
		order of first appearance.

		Args:
			texts: iterable of texts; each text is itself an iterable of
				words (objects providing ``.lower()``).

		Returns:
			A tuple ``(id2word, word2id)`` of dicts that are inverse
			mappings of each other.
		"""
		id2word = {0: "<pad>", 1: "<unk>"}
		word2id = {"<pad>": 0, "<unk>": 1}
		for text in texts:
			for word in text:
				word = word.lower()
				if word not in word2id:
					# len(word2id) is always the next unused id, so real
					# words start at 2 and never overwrite the reserved
					# special tokens.
					index = len(word2id)
					word2id[word] = index
					id2word[index] = word
		return id2word, word2id

	def encode(text, word2id, size=64):
		"""Convert a tokenized text into a tensor of word ids, padded to `size`.

		Words are lower-cased before lookup. Words missing from `word2id`
		map to the "<unk>" id instead of raising KeyError. Sequences shorter
		than `size` are right-padded with the "<pad>" id; longer sequences
		are returned as-is (no truncation).

		Args:
			text: iterable of words (objects providing ``.lower()``).
			word2id: dict mapping word -> integer id.
			size: minimum length of the returned sequence.

		Returns:
			A tensor created by ``torch.tensor`` from the list of ids.
		"""
		# Imported locally because the module has no top-level torch import.
		import torch

		# Fall back to the conventional reserved ids if the vocabulary does
		# not list the special tokens explicitly.
		pad_id = word2id.get("<pad>", 0)
		unk_id = word2id.get("<unk>", 1)

		encoded = [word2id.get(word.lower(), unk_id) for word in text]
		# A non-positive multiplier yields an empty list, so long inputs
		# are left untouched.
		encoded.extend([pad_id] * (size - len(encoded)))
		return torch.tensor(encoded)