This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def speeker(texts, lang='zh-tw'):
    """Synthesize `texts` with gTTS and play it through pygame's mixer.

    Parameters
    ----------
    texts : str
        Text to speak.
    lang : str
        gTTS language code (default 'zh-tw').

    Blocks until playback finishes so the temporary mp3 can be removed.
    """
    import os
    import time

    mixer.init()
    # Bug fix: the original used NamedTemporaryFile(delete=True) but saved to
    # "<fp.name>.mp3" — a *different* path the context manager never deletes,
    # leaking one mp3 per call. Create the mp3 path explicitly and clean it up.
    fd, mp3_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)  # gTTS re-opens the path itself; keep no handle of our own
    try:
        tts = gTTS(text=texts, lang=lang)
        tts.save(mp3_path)
        mixer.music.load(mp3_path)
        mixer.music.play()
        # play() is asynchronous; wait so we don't delete the file mid-playback
        while mixer.music.get_busy():
            time.sleep(0.1)
        # release pygame's handle before removal (unload exists in pygame >= 2.0)
        if hasattr(mixer.music, 'unload'):
            mixer.music.unload()
    finally:
        try:
            os.remove(mp3_path)
        except OSError:
            pass  # best-effort cleanup; never mask a playback error
def listener(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from itertools import chain | |
# Matches one HTML/XML tag, e.g. "<br />"; non-greedy on the tag body.
TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    """Normalize one raw text sample for tokenization.

    Strips HTML tags, then replaces every character that is not an ASCII
    letter (digits, punctuation, non-English text) with a space.
    NOTE(review): the definition is truncated in this view; further
    cleanup steps likely follow — confirm against the full file.
    """
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### define mapping between word and index ###
# Index 0 is reserved for out-of-vocabulary tokens; known words start at 1.
word_to_idx = {w: i for i, w in enumerate(vocab, start=1)}
word_to_idx['<unk>'] = 0
idx_to_word = {i: w for i, w in enumerate(vocab, start=1)}
idx_to_word[0] = '<unk>'
def encode_samples(tokenized_samples): #use word index mapping to encode token
    """Convert tokenized samples into lists of integer ids via `word_to_idx`.

    NOTE(review): the function body is truncated in this view; the
    per-token lookup presumably continues below — confirm in the full file.
    """
    features = []
    for sample in tokenized_samples:
        feature = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### load word2vec model ###
#pre-train model download from: https://github.com/stanfordnlp/GloVe
#preprocess:https://stackoverflow.com/questions/51323344/cant-load-glove-6b-300d-txt
# Load 100-d GloVe vectors that were converted to word2vec text format.
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.w2vformat.txt',binary=False, encoding='utf-8')
## map GloVe pretrained weights onto a pytorch embedding weight matrix
embed_size = 100  # must match the dimensionality of the GloVe file above
# One row per vocab word plus row 0; rows stay all-zero for words not in GloVe.
weight = torch.zeros(vocab_size+1, embed_size) #given 0 if the word is not in glove
# NOTE(review): `index2word` is the gensim < 4.0 attribute (renamed
# `index_to_key` in gensim 4) — confirm the installed gensim version.
# The loop body is truncated in this view.
for i in range(len(wvmodel.index2word)):
    try:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### build model ###
class RNN(nn.Module):
    """Text classifier built on a frozen pretrained embedding.

    NOTE(review): the class body is truncated in this view; only the
    beginning of __init__ is visible.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, **kwargs):
        """Store hyperparameters and build the (frozen) embedding layer.

        `weight` is the pretrained embedding matrix (e.g. GloVe);
        gradients are disabled so it is not fine-tuned during training.
        """
        super(RNN, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding.from_pretrained(weight)
        # freeze the pretrained vectors — they are not updated by the optimizer
        self.embedding.weight.requires_grad = False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
net.to(device)
loss_function = nn.CrossEntropyLoss() # ~ nn.LogSoftmax()+nn.NLLLoss()
# all net parameters with requires_grad=True are optimized (frozen embedding excluded)
optimizer = optim.Adam(net.parameters())

def train(net,num_epochs,loss_function,optimizer,train_iter,val_iter):
    """Run the training loop for `num_epochs` epochs over `train_iter`,
    evaluating on `val_iter`.

    NOTE(review): the function body is truncated in this view; only the
    per-epoch accumulator initialization is visible.
    """
    for epoch in range(num_epochs):
        start = time.time()  # per-epoch wall-clock timing
        # running sums of loss / accuracy, and sample counters n (train), m (val)
        train_loss, val_losses = 0, 0
        train_acc, val_acc = 0, 0
        n, m = 0, 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get pre-train tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab  # dict mapping wordpiece token -> integer id
print("dict size", len(vocab))

# see some token and index mapping: draw 10 random tokens and show their ids
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import Dataset,random_split

# Matches one HTML/XML tag, e.g. "<br />".
TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    """Normalize one raw text sample: drop HTML tags, keep only ASCII
    letters, and remove stranded single characters.

    NOTE(review): truncated in this view; later steps (e.g. whitespace
    collapsing) are not visible — confirm against the full file.
    """
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pick an arbitrary sample
sample_idx = 10
# use the Dataset built above to fetch the converted id tensors
tokens_tensor, segments_tensor, label_tensor,origin_text = trainset[sample_idx]
# convert tokens_tensor back into readable wordpiece tokens
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
print('token:\n',tokens,'\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import DataLoader | |
from torch.nn.utils.rnn import pad_sequence | |
"""" | |
create_mini_batch(samples)吃上面定義的mydataset | |
回傳訓練 BERT 時會需要的 4 個 tensors: | |
- tokens_tensors : (batch_size, max_seq_len_in_batch) | |
- segments_tensors: (batch_size, max_seq_len_in_batch) | |
- masks_tensors : (batch_size, max_seq_len_in_batch) | |
- label_ids : (batch_size) |
OlderNewer