
from sklearn.metrics import confusion_matrix

true = []
predictions = []
with torch.no_grad():
    model.eval()
    for data in testloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]
        # forward pass; transformers 2.x returns a tuple whose first element is the logits
        val_outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)
        logits = val_outputs[0]
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        true.extend(labels.cpu().tolist())

print(confusion_matrix(true, predictions))
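Not shown in the gist, but the same two lists also feed scikit-learn's other classification metrics; a short follow-up sketch:

from sklearn.metrics import classification_report, accuracy_score

print("accuracy:", accuracy_score(true, predictions))
print(classification_report(true, predictions, digits=3))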
@henry16lin
henry16lin / fine_tune_BERT.py
Created February 15, 2020 15:10
fine_tune_BERT
%%time
from sklearn.metrics import accuracy_score
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("device:",device)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 10
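The preview stops right after the optimizer setup. A minimal sketch of how the fine-tuning loop might continue, assuming the four-tensor batches from the trainloader built in dataloader.py below; the loop body here is an illustration, not the gist's exact code:

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()
        # transformers 2.x: passing labels makes the model return (loss, logits, ...)
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print("epoch %d, loss: %.4f" % (epoch + 1, running_loss))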
@henry16lin
henry16lin / load_bert.py
Created February 15, 2020 15:04
load_bert
from transformers import BertForSequenceClassification

NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# list the model's top-level submodules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    print("{:15} {}".format(name, module.__class__.__name__))
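Not in the gist, but a quick sanity check after loading: count the trainable parameters to confirm the classification head was attached on top of the pretrained encoder.

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("trainable parameters: {:,}".format(n_params))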
@henry16lin
henry16lin / dataloader.py
Created February 15, 2020 15:02
dataloader
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

"""
create_mini_batch(samples) takes the mydataset defined above and
returns the 4 tensors needed to train BERT:
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""
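The preview cuts off before the function body. A minimal sketch of a collate function matching that docstring, assuming each mydataset item begins with (tokens_tensor, segments_tensor, label_tensor):

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    label_ids = torch.stack([s[2] for s in samples])

    # zero-pad every sequence to the longest one in the batch
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    # attention mask: 1 wherever there is a real token, 0 on padding
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

trainloader = DataLoader(trainset, batch_size=64,  # batch size is illustrative
                         collate_fn=create_mini_batch)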
@henry16lin
henry16lin / prepare_result.py
Created February 15, 2020 14:53
prepare_result
# pick an arbitrary sample
sample_idx = 10

# use the Dataset built above to fetch the converted id tensors
tokens_tensor, segments_tensor, label_tensor, origin_text = trainset[sample_idx]

# map the ids in tokens_tensor back to tokens
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
print('token:\n', tokens, '\n')
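To undo the WordPiece split and compare against the original string, the tokenizer can join the tokens back together; a small follow-up sketch:

# merge WordPiece tokens (e.g. "play", "##ing") back into plain text
print('restored:\n', tokenizer.convert_tokens_to_string(tokens), '\n')
print('original:\n', origin_text)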
@henry16lin
henry16lin / prepare_for_BERT.py
Created February 15, 2020 14:41
prepare_for_BERT
import re
from torch.utils.data import Dataset, random_split

TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    # Remove html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuation and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Remove single characters
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Collapse multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence
@henry16lin
henry16lin / create_token.py
Created February 15, 2020 13:57
create_token
from transformers import BertTokenizer

# get the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size:", len(vocab))

# inspect a few token-to-index mappings
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
print(list(zip(random_tokens, random_ids)))
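For a concrete example of what the tokenizer produces on raw text, a short sketch (the sentence is made up):

text = "I really enjoyed this movie!"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)  # WordPiece tokens; rare words are split into "##" pieces
print(ids)     # their vocabulary indices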
@henry16lin
henry16lin / RNN_training.py
Created February 13, 2020 16:39
RNN_training
net.to(device)
loss_function = nn.CrossEntropyLoss()  # ~ nn.LogSoftmax() + nn.NLLLoss()
optimizer = optim.Adam(net.parameters())

def train(net, num_epochs, loss_function, optimizer, train_iter, val_iter):
    for epoch in range(num_epochs):
        start = time.time()
        train_loss, val_losses = 0, 0
        train_acc, val_acc = 0, 0
        n, m = 0, 0
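The preview ends at the per-epoch counters. A minimal sketch of how the epoch body might continue under those names, assuming train_iter yields (features, labels) batches:

        net.train()
        for features, labels in train_iter:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            scores = net(features)
            loss = loss_function(scores, labels)
            loss.backward()
            optimizer.step()
            # n counts examples; the loss average below is approximate (per-batch means)
            train_loss += loss.item()
            train_acc += (scores.argmax(dim=1) == labels).sum().item()
            n += labels.shape[0]
        print('epoch %d: train loss %.4f, train acc %.3f, %.1fs'
              % (epoch + 1, train_loss / n, train_acc / n, time.time() - start))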
@henry16lin
henry16lin / LSTM_NET.py
Created February 13, 2020 16:34
LSTM_NET
### build model ###
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, **kwargs):
        super(RNN, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        # pretrained GloVe embeddings, kept frozen during training
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
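The preview ends inside __init__. A minimal sketch of how the encoder, decoder, and forward pass could be completed for this bidirectional-LSTM classifier, consistent with the constructor arguments but not necessarily the gist's exact code:

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=bidirectional)
        num_directions = 2 if bidirectional else 1
        # concatenate first and last time steps -> 2 * directions * hidden features
        self.decoder = nn.Linear(2 * num_directions * num_hiddens, labels)

    def forward(self, inputs):
        # inputs: (batch, seq_len) of token ids
        embeddings = self.embedding(inputs.permute(1, 0))  # (seq_len, batch, embed)
        states, _ = self.encoder(embeddings)
        encoding = torch.cat([states[0], states[-1]], dim=1)
        return self.decoder(encoding)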
@henry16lin
henry16lin / load_glove.py
Created February 13, 2020 16:27
load_glove
### load GloVe vectors in word2vec format ###
# pretrained model download: https://github.com/stanfordnlp/GloVe
# preprocessing: https://stackoverflow.com/questions/51323344/cant-load-glove-6b-300d-txt
wvmodel = gensim.models.KeyedVectors.load_word2vec_format(
    'glove.6B.100d.w2vformat.txt', binary=False, encoding='utf-8')

## map GloVe pretrained weights to a pytorch embedding weight matrix
embed_size = 100
weight = torch.zeros(vocab_size + 1, embed_size)  # row stays 0 if the word is not in GloVe
for i in range(len(wvmodel.index2word)):
    try:
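        # --- continuation sketch: the preview ends at the try: above ---
        # assumes word_to_idx maps each corpus word to its row in `weight`;
        # words missing from either vocabulary are simply skipped
        index = word_to_idx[wvmodel.index2word[i]]
    except KeyError:
        continue
    weight[index, :] = torch.from_numpy(
        wvmodel.get_vector(wvmodel.index2word[i]))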