prepare_for_BERT
import os
import re

import torch
from torch.utils.data import Dataset, random_split
TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    # Remove HTML tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuation and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Remove single characters
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Collapse multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence
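# A worked example (not from the original gist) of the cleaning pipeline:
#   preprocess_text("A <b>great</b> movie, 10/10!")  ->  "A great movie "
# The tag is stripped, punctuation and digits become spaces, and runs of
# whitespace collapse to a single space.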
def readIMDB(path, seg):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data
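# readIMDB expects the standard layout of the Large Movie Review Dataset
# (https://ai.stanford.edu/~amaas/data/sentiment/):
#   aclImdb/train/pos/*.txt   aclImdb/train/neg/*.txt
#   aclImdb/test/pos/*.txt    aclImdb/test/neg/*.txt
# e.g. readIMDB('aclImdb', 'train') returns 25,000 [text, label] pairs.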
label_map = {0: 'neg', 1: 'pos'}

# create the Dataset
class MyDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        self.df = readIMDB('aclImdb', mode)  # a list: [['text1', label], ['text2', label], ...]
        self.len = len(self.df)
        self.maxlen = 300  # cap the review length (optional if you have enough memory)
        self.tokenizer = tokenizer  # we will use a BERT tokenizer

    # return one training / test example
    def __getitem__(self, idx):
        origin_text = self.df[idx][0]
        text_a = self.df[idx][0]
        text_b = None  # reserved for sentence-pair tasks such as natural language inference
        # for unlabeled test data you could set label_tensor = None;
        # here the IMDB test set is labeled, so both modes build a label tensor
        label_id = self.df[idx][1]
        label_tensor = torch.tensor(label_id)
        # build the BERT tokens for the first sentence
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)

        if text_b is not None:
            tokens_b = self.tokenizer.tokenize(text_b)
            word_pieces += tokens_b + ["[SEP]"]
            len_b = len(word_pieces) - len_a

        # convert the token sequence into a sequence of vocabulary indices
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        # segment ids: 0 for the first sentence (including its [SEP]),
        # 1 for the second sentence, following the BERT convention
        if text_b is None:
            segments_tensor = torch.tensor([0] * len_a, dtype=torch.long)
        else:
            segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor, origin_text)
    def __len__(self):
        return self.len
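# The gist says a BERT tokenizer is used but never constructs it in this
# file; a typical choice (the checkpoint name is an assumption, not
# confirmed by the original) is the Hugging Face tokenizer:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')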
# initialize the Datasets
trainset = MyDataset("train", tokenizer=tokenizer)
testset = MyDataset("test", tokenizer=tokenizer)

# split a validation set off the training set
val_size = int(len(trainset) * 0.04)  # 1,000 examples, to match the LSTM baseline's validation split
trainset, valset = random_split(trainset, [len(trainset) - val_size, val_size])
print('trainset size:', len(trainset))
print('valset size:', len(valset))
print('testset size:', len(testset))
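Since __getitem__ returns variable-length token tensors plus the raw text, batching these datasets needs a custom collate function that pads within each mini-batch. Below is a minimal sketch, not part of the original gist: the helper name create_mini_batch and the batch size are assumptions. It pads tokens and segment ids to the longest sequence in the batch, derives an attention mask, and drops the raw text.

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def create_mini_batch(samples):
    # each sample is (tokens_tensor, segments_tensor, label_tensor, origin_text);
    # the raw text is ignored here
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    label_ids = torch.stack([s[2] for s in samples])
    # zero-pad every sequence to the longest one in this batch
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    # attention mask: 1 for real tokens, 0 for padding (BERT's [PAD] id is 0)
    masks_tensors = (tokens_tensors != 0).long()
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=create_mini_batch)
valloader = DataLoader(valset, batch_size=32, collate_fn=create_mini_batch)
testloader = DataLoader(testset, batch_size=32, collate_fn=create_mini_batch)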