This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def speeker(texts, lang='zh-tw'):
    """Synthesize `texts` with gTTS and play it through pygame's mixer.

    Parameters
    ----------
    texts : str
        Text to speak.
    lang : str
        gTTS language code (default 'zh-tw').

    Blocks until playback finishes so the temporary mp3 can be removed.
    """
    import os
    import time

    mixer.init()
    # Bug fix: the original used NamedTemporaryFile(delete=True) but saved to
    # "<fp.name>.mp3" — a *different* path the context manager never deletes,
    # leaking one mp3 per call. Create the mp3 path explicitly and clean it up.
    fd, mp3_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)  # gTTS re-opens the path itself; keep no handle of our own
    try:
        tts = gTTS(text=texts, lang=lang)
        tts.save(mp3_path)
        mixer.music.load(mp3_path)
        mixer.music.play()
        # play() is asynchronous; wait so we don't delete the file mid-playback
        while mixer.music.get_busy():
            time.sleep(0.1)
        # release pygame's handle before removal (unload exists in pygame >= 2.0)
        if hasattr(mixer.music, 'unload'):
            mixer.music.unload()
    finally:
        try:
            os.remove(mp3_path)
        except OSError:
            pass  # best-effort cleanup; never mask a playback error
def listener(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from itertools import chain | |
# Matches one HTML/XML tag, e.g. "<br />"; non-greedy on the tag body.
TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    """Normalize one raw text sample for tokenization.

    Strips HTML tags, then replaces every character that is not an ASCII
    letter (digits, punctuation, non-English text) with a space.
    NOTE(review): the definition is truncated in this view; further
    cleanup steps likely follow — confirm against the full file.
    """
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### define mapping between word and index ###
# Index 0 is reserved for out-of-vocabulary tokens; known words start at 1.
word_to_idx = {w: i for i, w in enumerate(vocab, start=1)}
word_to_idx['<unk>'] = 0
idx_to_word = {i: w for i, w in enumerate(vocab, start=1)}
idx_to_word[0] = '<unk>'
def encode_samples(tokenized_samples): #use word index mapping to encode token
    """Convert tokenized samples into lists of integer ids via `word_to_idx`.

    NOTE(review): the function body is truncated in this view; the
    per-token lookup presumably continues below — confirm in the full file.
    """
    features = []
    for sample in tokenized_samples:
        feature = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### load word2vec model ###
#pre-train model download from: https://github.com/stanfordnlp/GloVe
#preprocess:https://stackoverflow.com/questions/51323344/cant-load-glove-6b-300d-txt
# Load 100-d GloVe vectors that were converted to word2vec text format.
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.w2vformat.txt',binary=False, encoding='utf-8')
## map GloVe pretrained weights onto a pytorch embedding weight matrix
embed_size = 100  # must match the dimensionality of the GloVe file above
# One row per vocab word plus row 0; rows stay all-zero for words not in GloVe.
weight = torch.zeros(vocab_size+1, embed_size) #given 0 if the word is not in glove
# NOTE(review): `index2word` is the gensim < 4.0 attribute (renamed
# `index_to_key` in gensim 4) — confirm the installed gensim version.
# The loop body is truncated in this view.
for i in range(len(wvmodel.index2word)):
    try:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### build model ###
class RNN(nn.Module):
    """Text classifier built on a frozen pretrained embedding.

    NOTE(review): the class body is truncated in this view; only the
    beginning of __init__ is visible.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, **kwargs):
        """Store hyperparameters and build the (frozen) embedding layer.

        `weight` is the pretrained embedding matrix (e.g. GloVe);
        gradients are disabled so it is not fine-tuned during training.
        """
        super(RNN, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding.from_pretrained(weight)
        # freeze the pretrained vectors — they are not updated by the optimizer
        self.embedding.weight.requires_grad = False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
net.to(device)
loss_function = nn.CrossEntropyLoss() # ~ nn.LogSoftmax()+nn.NLLLoss()
# all net parameters with requires_grad=True are optimized (frozen embedding excluded)
optimizer = optim.Adam(net.parameters())

def train(net,num_epochs,loss_function,optimizer,train_iter,val_iter):
    """Run the training loop for `num_epochs` epochs over `train_iter`,
    evaluating on `val_iter`.

    NOTE(review): the function body is truncated in this view; only the
    per-epoch accumulator initialization is visible.
    """
    for epoch in range(num_epochs):
        start = time.time()  # per-epoch wall-clock timing
        # running sums of loss / accuracy, and sample counters n (train), m (val)
        train_loss, val_losses = 0, 0
        train_acc, val_acc = 0, 0
        n, m = 0, 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get pre-train tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab  # dict mapping wordpiece token -> integer id
print("dict size", len(vocab))

# see some token and index mapping: draw 10 random tokens and show their ids
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import Dataset,random_split

# Matches one HTML/XML tag, e.g. "<br />".
TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    """Normalize one raw text sample: drop HTML tags, keep only ASCII
    letters, and remove stranded single characters.

    NOTE(review): truncated in this view; later steps (e.g. whitespace
    collapsing) are not visible — confirm against the full file.
    """
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pick an arbitrary sample
sample_idx = 10
# use the Dataset built above to fetch the converted id tensors
tokens_tensor, segments_tensor, label_tensor,origin_text = trainset[sample_idx]
# convert tokens_tensor back into readable wordpiece tokens
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
print('token:\n',tokens,'\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import DataLoader | |
from torch.nn.utils.rnn import pad_sequence | |
"""" | |
create_mini_batch(samples)吃上面定義的mydataset | |
回傳訓練 BERT 時會需要的 4 個 tensors: | |
- tokens_tensors : (batch_size, max_seq_len_in_batch) | |
- segments_tensors: (batch_size, max_seq_len_in_batch) | |
- masks_tensors : (batch_size, max_seq_len_in_batch) | |
- label_ids : (batch_size) |
OlderNewer