This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import confusion_matrix | |
true=[] | |
predictions=[] | |
with torch.no_grad(): | |
model.eval() | |
for data in testloader: | |
tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data] | |
val_outputs = model(input_ids=tokens_tensors, | |
token_type_ids=segments_tensors, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
from sklearn.metrics import accuracy_score | |
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") | |
print("device:",device) | |
model = model.to(device) | |
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) | |
EPOCHS = 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import BertForSequenceClassification

# Binary sentiment classification: a 2-way classification head on top of
# the pretrained BERT encoder named by PRETRAINED_MODEL_NAME (defined earlier).
NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
print(""" | |
name module |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import DataLoader | |
from torch.nn.utils.rnn import pad_sequence | |
"""" | |
create_mini_batch(samples)吃上面定義的mydataset | |
回傳訓練 BERT 時會需要的 4 個 tensors: | |
- tokens_tensors : (batch_size, max_seq_len_in_batch) | |
- segments_tensors: (batch_size, max_seq_len_in_batch) | |
- masks_tensors : (batch_size, max_seq_len_in_batch) | |
- label_ids : (batch_size) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pick an arbitrary sample to sanity-check the dataset conversion.
sample_idx = 10

# Use the Dataset built earlier to fetch the converted id tensors
# (token ids, segment ids, label, and the original raw text).
tokens_tensor, segments_tensor, label_tensor, origin_text = trainset[sample_idx]

# Decode the token ids back into readable tokens for inspection.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
print('token:\n', tokens, '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import Dataset,random_split | |
TAG_RE = re.compile(r'<[^>]+>') | |
def preprocess_text(sen): | |
# Removing html tags | |
sentence = TAG_RE.sub('', sen) | |
# Remove punctuations and numbers | |
sentence = re.sub('[^a-zA-Z]', ' ', sentence) | |
# Single character removal | |
sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the pretrained tokenizer matching PRETRAINED_MODEL_NAME (defined earlier).
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# Spot-check the vocabulary: sample a few tokens and look up their ids.
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Move the RNN onto the compute device and set up loss/optimizer.
net.to(device)
# CrossEntropyLoss combines LogSoftmax + NLLLoss, so the net should emit raw logits.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
def train(net,num_epochs,loss_function,optimizer,train_iter,val_iter): | |
for epoch in range(num_epochs): | |
start = time.time() | |
train_loss, val_losses = 0, 0 | |
train_acc, val_acc = 0, 0 | |
n, m = 0, 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### build model ### | |
class RNN(nn.Module): | |
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, | |
bidirectional, weight, labels, **kwargs): | |
super(RNN, self).__init__(**kwargs) | |
self.num_hiddens = num_hiddens | |
self.num_layers = num_layers | |
self.bidirectional = bidirectional | |
self.embedding = nn.Embedding.from_pretrained(weight) | |
self.embedding.weight.requires_grad = False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### load word2vec model ###
# Pretrained GloVe vectors: https://github.com/stanfordnlp/GloVe
# Converted to word2vec text format per:
# https://stackoverflow.com/questions/51323344/cant-load-glove-6b-300d-txt
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.w2vformat.txt', binary=False, encoding='utf-8')

## Map the GloVe pretrained weights into a PyTorch embedding weight matrix.
embed_size = 100
# Initialised to all zeros, so any word missing from GloVe keeps a zero vector.
# NOTE(review): assumes vocab_size (defined earlier) counts words excluding the
# padding/OOV row, hence the +1 — confirm against the vocabulary construction.
weight = torch.zeros(vocab_size + 1, embed_size)
for i in range(len(wvmodel.index2word)): | |
try: |
Newer / Older