This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import confusion_matrix | |
true=[] | |
predictions=[] | |
with torch.no_grad(): | |
model.eval() | |
for data in testloader: | |
tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data] | |
val_outputs = model(input_ids=tokens_tensors, | |
token_type_ids=segments_tensors, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
from sklearn.metrics import accuracy_score | |
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") | |
print("device:",device) | |
model = model.to(device) | |
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) | |
EPOCHS = 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import BertForSequenceClassification

# Binary sentiment classification: a 2-way classification head on top of
# the pretrained BERT encoder named by PRETRAINED_MODEL_NAME (defined earlier).
NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
print(""" | |
name module |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import DataLoader | |
from torch.nn.utils.rnn import pad_sequence | |
"""" | |
create_mini_batch(samples)吃上面定義的mydataset | |
回傳訓練 BERT 時會需要的 4 個 tensors: | |
- tokens_tensors : (batch_size, max_seq_len_in_batch) | |
- segments_tensors: (batch_size, max_seq_len_in_batch) | |
- masks_tensors : (batch_size, max_seq_len_in_batch) | |
- label_ids : (batch_size) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pick an arbitrary sample to sanity-check the dataset conversion.
sample_idx = 10

# Use the Dataset built earlier to fetch the converted id tensors
# (token ids, segment ids, label, and the original raw text).
tokens_tensor, segments_tensor, label_tensor, origin_text = trainset[sample_idx]

# Decode the token ids back into readable tokens for inspection.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
print('token:\n', tokens, '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import Dataset,random_split | |
TAG_RE = re.compile(r'<[^>]+>') | |
def preprocess_text(sen): | |
# Removing html tags | |
sentence = TAG_RE.sub('', sen) | |
# Remove punctuations and numbers | |
sentence = re.sub('[^a-zA-Z]', ' ', sentence) | |
# Single character removal | |
sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the pretrained tokenizer matching PRETRAINED_MODEL_NAME (defined earlier).
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# Spot-check the vocabulary: sample a few tokens and look up their ids.
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Move the RNN onto the compute device and set up loss/optimizer.
net.to(device)
# CrossEntropyLoss combines LogSoftmax + NLLLoss, so the net should emit raw logits.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
def train(net,num_epochs,loss_function,optimizer,train_iter,val_iter): | |
for epoch in range(num_epochs): | |
start = time.time() | |
train_loss, val_losses = 0, 0 | |
train_acc, val_acc = 0, 0 | |
n, m = 0, 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### build model ### | |
class RNN(nn.Module): | |
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, | |
bidirectional, weight, labels, **kwargs): | |
super(RNN, self).__init__(**kwargs) | |
self.num_hiddens = num_hiddens | |
self.num_layers = num_layers | |
self.bidirectional = bidirectional | |
self.embedding = nn.Embedding.from_pretrained(weight) | |
self.embedding.weight.requires_grad = False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### load word2vec model ###
# Pretrained GloVe vectors: https://github.com/stanfordnlp/GloVe
# Converted to word2vec text format per:
# https://stackoverflow.com/questions/51323344/cant-load-glove-6b-300d-txt
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.w2vformat.txt', binary=False, encoding='utf-8')

## Map the GloVe pretrained weights into a PyTorch embedding weight matrix.
embed_size = 100
# Initialised to all zeros, so any word missing from GloVe keeps a zero vector.
# NOTE(review): assumes vocab_size (defined earlier) counts words excluding the
# padding/OOV row, hence the +1 — confirm against the vocabulary construction.
weight = torch.zeros(vocab_size + 1, embed_size)
for i in range(len(wvmodel.index2word)): | |
try: |
Newer / Older