@henry16lin
Created February 15, 2020 14:41
prepare_for_BERT
import os
import re

import torch
from torch.utils.data import Dataset, random_split

TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    # Remove HTML tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuation and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Remove stray single characters left over from the previous step
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Collapse multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence
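
# Quick sanity check of the cleaning pipeline (the sample review below is a
# made-up string used only for illustration):
sample_review = "<br />This movie is great! 10/10, a must-see."
print(preprocess_text(sample_review))  # roughly: "This movie is great must see"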

def readIMDB(path, seg):
    # Walk aclImdb/<seg>/{pos,neg} and return a list of [cleaned_text, label] pairs
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data

label_map = {0: 'neg', 1: 'pos'}
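
# readIMDB expects the standard aclImdb folder layout, i.e.
#   aclImdb/train/pos/*.txt, aclImdb/train/neg/*.txt
#   aclImdb/test/pos/*.txt,  aclImdb/test/neg/*.txt
# Illustrative call (assuming the dataset has already been downloaded and extracted):
# train_data = readIMDB('aclImdb', 'train')   # e.g. [['A wonderful film ...', 1], ...]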

# create Dataset
class MyDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        # a list like [['text1', label], ['text2', label], ...]
        self.df = readIMDB('aclImdb', mode)
        self.len = len(self.df)
        self.maxlen = 300  # cap the review length (can be lifted if you have enough memory)
        self.tokenizer = tokenizer  # we will use the BERT tokenizer

    # return one training / testing example
    def __getitem__(self, idx):
        origin_text = self.df[idx][0]
        text_a = self.df[idx][0]
        text_b = None  # reserved for sentence-pair tasks such as natural language inference
        # label_tensor = None  # only needed for unlabeled inference; here every example has a label
        label_id = self.df[idx][1]
        label_tensor = torch.tensor(label_id)

        # build the BERT tokens of the first sentence
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)

        if text_b is not None:
            tokens_b = self.tokenizer.tokenize(text_b)
            word_pieces += tokens_b + ["[SEP]"]
            len_b = len(word_pieces) - len_a

        # convert the whole token sequence into token ids
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # segment ids: 0 for the first sentence (including its [SEP]), 1 for the second sentence
        if text_b is None:
            segments_tensor = torch.tensor([0] * len_a, dtype=torch.long)
        else:
            segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor, origin_text)

    def __len__(self):
        return self.len
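
# __getitem__ returns variable-length tensors, so a DataLoader needs a collate
# function that pads each mini-batch. The helper below is a minimal sketch (its
# name, create_mini_batch, is our own choice and not part of the original gist):
# it zero-pads tokens and segments and builds the attention mask BERT expects,
# dropping the raw text field.
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    label_ids = torch.stack([s[2] for s in samples])

    # zero-pad every sequence in the batch to the same length
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    # attention mask: 1 for real tokens, 0 for padding
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids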

# initialize the Datasets
# (`tokenizer` is assumed to have been created earlier in the tutorial,
#  e.g. with BertTokenizer.from_pretrained)
trainset = MyDataset("train", tokenizer=tokenizer)
testset = MyDataset("test", tokenizer=tokenizer)

# split a validation set off the training set
val_size = int(len(trainset) * 0.04)  # to match the LSTM baseline, hold out ~1000 reviews for validation
trainset, valset = random_split(trainset, [len(trainset) - val_size, val_size])

print('trainset size:', len(trainset))
print('valset size:', len(valset))
print('testset size:', len(testset))
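
# With the datasets ready, DataLoaders can be built on top of them. A minimal
# sketch, assuming the create_mini_batch collate function defined above; the
# batch size of 32 is an illustrative choice, not taken from the original gist.
from torch.utils.data import DataLoader

BATCH_SIZE = 32
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=create_mini_batch)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)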