Last active: February 13, 2020 15:47
-
-
Save henry16lin/ba26abbcf169d33c575c929f3b955233 to your computer and use it in GitHub Desktop.
imdb_preprocess2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### define mapping between word and index ###
# Known vocabulary words get indices starting at 1; index 0 is reserved
# for out-of-vocabulary tokens ('<unk>').
word_to_idx = {}
idx_to_word = {}
for pos, token in enumerate(vocab, start=1):
    word_to_idx[token] = pos
    idx_to_word[pos] = token
word_to_idx['<unk>'] = 0
idx_to_word[0] = '<unk>'
def encode_samples(tokenized_samples, mapping=None):
    """Encode tokenized samples into lists of word indices.

    Args:
        tokenized_samples: iterable of token lists (one list per document).
        mapping: optional dict of token -> index. Defaults to the
            module-level ``word_to_idx`` built from the training vocabulary.
            Unknown tokens encode to 0 (the '<unk>' index).

    Returns:
        A list of lists of int indices, one inner list per sample.
    """
    if mapping is None:
        mapping = word_to_idx  # fall back to the module-level vocabulary
    # dict.get with a default replaces the manual `in`/else membership check
    return [[mapping.get(token, 0) for token in sample]
            for sample in tokenized_samples]
def pad_samples(features, maxlen=350, PAD=0):
    """Truncate or right-pad each sample so every one has length ``maxlen``.

    Args:
        features: list of index lists (output of ``encode_samples``).
        maxlen: fixed target length for every sample (default 350).
        PAD: padding value appended to short samples (default 0).

    Returns:
        A new list of new lists, each exactly ``maxlen`` long.

    Note:
        The previous implementation aliased short input lists
        (``padded_feature = feature``) and appended PAD in place,
        mutating the caller's data. This version never modifies
        its input.
    """
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_features.append(feature[:maxlen])  # slice copies
        else:
            # build a fresh list instead of appending to the caller's
            padded_features.append(list(feature) + [PAD] * (maxlen - len(feature)))
    return padded_features
### encode tokens as indices and wrap them in PyTorch tensors ###
train_features = torch.tensor(pad_samples(encode_samples(train_tokenized)))
train_labels = torch.tensor([label for _, label in train_data])
test_features = torch.tensor(pad_samples(encode_samples(test_tokenized)))
test_labels = torch.tensor([label for _, label in test_data])

### carve a validation set out of the training data ###
# take 500 samples from each end of the (label-sorted) training set:
# 500 negative + 500 positive examples become the validation split
val_features = torch.cat((train_features[:500], train_features[-500:]), dim=0)
val_labels = torch.cat((train_labels[:500], train_labels[-500:]), dim=0)
train_features = train_features[500:-500]
train_labels = train_labels[500:-500]
### build PyTorch DataLoaders ###
batch_size = 36

# pair feature tensors with their label tensors
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
val_set = torch.utils.data.TensorDataset(val_features, val_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

# only the training loader shuffles; evaluation order stays fixed
train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_iter = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment