Skip to content

Instantly share code, notes, and snippets.

@henry16lin
Last active February 13, 2020 15:47
Show Gist options
  • Save henry16lin/ba26abbcf169d33c575c929f3b955233 to your computer and use it in GitHub Desktop.
Save henry16lin/ba26abbcf169d33c575c929f3b955233 to your computer and use it in GitHub Desktop.
imdb_preprocess2
### Build the two-way lookup between vocabulary words and integer indices ###
# Vocabulary words are numbered from 1; index 0 is assigned to '<unk>'.
word_to_idx = {token: position for position, token in enumerate(vocab, start=1)}
word_to_idx['<unk>'] = 0
idx_to_word = dict(enumerate(vocab, start=1))
idx_to_word[0] = '<unk>'
def encode_samples(tokenized_samples):
    """Map every token of every sample to its integer index.

    Tokens are looked up in the module-level ``word_to_idx`` mapping;
    tokens that are not present are encoded as 0.

    Args:
        tokenized_samples: Iterable of samples, each an iterable of tokens.

    Returns:
        A list with one list of integer indices per sample.
    """
    return [
        [word_to_idx.get(token, 0) for token in sample]
        for sample in tokenized_samples
    ]
def pad_samples(features, maxlen=350, PAD=0):
    """Truncate or right-pad every encoded sample to a common length.

    Samples longer than ``maxlen`` are cut to their first ``maxlen`` items;
    shorter samples have ``PAD`` appended until they reach ``maxlen``.

    Args:
        features: Iterable of sequences of token indices, one per sample.
        maxlen: Target length of every returned sample (default 350).
        PAD: Value used to fill samples shorter than ``maxlen``.

    Returns:
        A new list of new lists. The input samples are never modified.
    """
    padded_features = []
    for feature in features:
        # Slicing copies the data, so padding below cannot append to the
        # caller's list (assigning ``feature`` directly would alias it).
        padded_feature = list(feature[:maxlen])
        # A non-positive repeat count produces an empty list, so samples
        # that already have maxlen items receive no padding.
        padded_feature.extend([PAD] * (maxlen - len(padded_feature)))
        padded_features.append(padded_feature)
    return padded_features
### Encode tokens as indices, pad them, and convert to PyTorch tensors ###
train_features = torch.tensor(
    pad_samples(encode_samples(train_tokenized))
)
train_labels = torch.tensor([label for _, label in train_data])
test_features = torch.tensor(
    pad_samples(encode_samples(test_tokenized))
)
test_labels = torch.tensor([label for _, label in test_data])
### Carve a validation set out of the training tensors ###
# The first 500 and last 500 training rows become the validation set.
# NOTE(review): the original comment says this yields 500 negative and 500
# positive samples, which assumes train_data is ordered by class -- confirm.
val_features = torch.cat([train_features[:500], train_features[-500:]], dim=0)
val_labels = torch.cat([train_labels[:500], train_labels[-500:]], dim=0)
# Keep only the middle rows for training so the two sets do not overlap.
train_features = train_features[500:-500]
train_labels = train_labels[500:-500]
### Wrap the tensors in datasets and batch them with PyTorch data loaders ###
batch_size = 36

# Pair each feature tensor with its label tensor.
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
val_set = torch.utils.data.TensorDataset(val_features, val_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

# Only the training loader shuffles; validation and test keep their order.
train_iter = torch.utils.data.DataLoader(
    dataset=train_set, batch_size=batch_size, shuffle=True
)
val_iter = torch.utils.data.DataLoader(
    dataset=val_set, batch_size=batch_size, shuffle=False
)
test_iter = torch.utils.data.DataLoader(
    dataset=test_set, batch_size=batch_size, shuffle=False
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment